diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,19 +2,19 @@ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, - "eval_steps": 100000, - "global_step": 11868, + "eval_steps": 100, + "global_step": 12465, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "learning_rate": 4.212299915754001e-10, - "logits/chosen": -3.166908025741577, - "logits/rejected": -3.3487741947174072, - "logps/chosen": -546.013916015625, - "logps/rejected": -472.92132568359375, + "learning_rate": 4.009623095429029e-10, + "logits/chosen": -3.064915418624878, + "logits/rejected": -3.046143054962158, + "logps/chosen": -238.21163940429688, + "logps/rejected": -135.75088500976562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -24,16623 +24,19447 @@ }, { "epoch": 0.0, - "learning_rate": 4.212299915754001e-09, - "logits/chosen": -2.9773247241973877, - "logits/rejected": -2.852890968322754, - "logps/chosen": -277.7378234863281, - "logps/rejected": -267.5666198730469, + "learning_rate": 4.0096230954290295e-09, + "logits/chosen": -2.9954445362091064, + "logits/rejected": -3.1511900424957275, + "logps/chosen": -257.3526611328125, + "logps/rejected": -236.0702362060547, "loss": 0.6964, - "rewards/accuracies": 0.4444444477558136, - "rewards/chosen": 0.0006573781720362604, - "rewards/margins": -0.007902717217803001, - "rewards/rejected": 0.008560094982385635, + "rewards/accuracies": 0.2777777910232544, + "rewards/chosen": -0.012606658972799778, + "rewards/margins": -0.008060736581683159, + "rewards/rejected": -0.004545920994132757, "step": 10 }, { - "epoch": 0.01, - "learning_rate": 8.424599831508002e-09, - "logits/chosen": -2.807126998901367, - "logits/rejected": -2.7919721603393555, - "logps/chosen": -245.955078125, - "logps/rejected": -225.7766571044922, - "loss": 0.692, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.01841505989432335, - "rewards/margins": 0.023902256041765213, - "rewards/rejected": -0.0054871938191354275, + "epoch": 0.0, + "learning_rate": 8.019246190858059e-09, + "logits/chosen": -3.035249948501587, + "logits/rejected": -2.9968161582946777, + "logps/chosen": -252.831298828125, + "logps/rejected": -123.2061996459961, + "loss": 0.6912, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.005537747871130705, + "rewards/margins": -0.01774672046303749, + "rewards/rejected": 0.012208972126245499, "step": 20 }, { "epoch": 0.01, - "learning_rate": 1.2636899747262005e-08, - "logits/chosen": -2.989205837249756, - "logits/rejected": -2.9802839756011963, - "logps/chosen": -299.1269836425781, - "logps/rejected": -249.7318572998047, - "loss": 0.6929, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.002239107619971037, - "rewards/margins": 0.006709246896207333, - "rewards/rejected": -0.004470138344913721, + "learning_rate": 1.2028869286287089e-08, + "logits/chosen": -3.160801410675049, + "logits/rejected": -3.1754937171936035, + "logps/chosen": -330.74969482421875, + "logps/rejected": -294.14288330078125, + "loss": 0.6907, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005278147757053375, + "rewards/margins": 0.018668215721845627, + "rewards/rejected": -0.0133900698274374, "step": 30 }, { "epoch": 0.01, - "learning_rate": 1.6849199663016004e-08, - "logits/chosen": -2.9071033000946045, - "logits/rejected": -2.8376030921936035, - "logps/chosen": -300.695556640625, - "logps/rejected": -234.8794708251953, - "loss": 0.6956, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.004403972998261452, - "rewards/margins": 0.006278342567384243, - "rewards/rejected": -0.0018743708496913314, + "learning_rate": 1.6038492381716118e-08, + "logits/chosen": -3.138197422027588, + "logits/rejected": -3.1155025959014893, + "logps/chosen": -255.21621704101562, + "logps/rejected": -246.06802368164062, + "loss": 0.6974, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.004621773026883602, + "rewards/margins": -0.015276918187737465, + "rewards/rejected": 0.010655145160853863, "step": 40 }, { "epoch": 0.01, - "learning_rate": 2.106149957877001e-08, - "logits/chosen": -2.7081246376037598, - "logits/rejected": -2.683647871017456, - "logps/chosen": -249.2618408203125, - "logps/rejected": -249.1317901611328, - "loss": 0.6885, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.019329214468598366, - "rewards/margins": 0.01521158218383789, - "rewards/rejected": 0.0041176313534379005, + "learning_rate": 2.0048115477145146e-08, + "logits/chosen": -3.072852373123169, + "logits/rejected": -3.123746395111084, + "logps/chosen": -299.31695556640625, + "logps/rejected": -286.1008605957031, + "loss": 0.6931, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.01078118197619915, + "rewards/margins": 0.0025832075625658035, + "rewards/rejected": 0.008197975344955921, "step": 50 }, { - "epoch": 0.02, - "learning_rate": 2.527379949452401e-08, - "logits/chosen": -2.8151774406433105, - "logits/rejected": -2.8212380409240723, - "logps/chosen": -186.51699829101562, - "logps/rejected": -238.936279296875, - "loss": 0.6903, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.0006278610089793801, - "rewards/margins": 0.010208692401647568, - "rewards/rejected": -0.010836553759872913, + "epoch": 0.01, + "learning_rate": 2.4057738572574177e-08, + "logits/chosen": -3.096876859664917, + "logits/rejected": -3.027501344680786, + "logps/chosen": -239.8458251953125, + "logps/rejected": -281.56610107421875, + "loss": 0.6945, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.001797366188839078, + "rewards/margins": -0.020706120878458023, + "rewards/rejected": 0.022503485903143883, "step": 60 }, { "epoch": 0.02, - "learning_rate": 2.948609941027801e-08, - "logits/chosen": -2.926729440689087, - "logits/rejected": -2.8498129844665527, - "logps/chosen": -298.0478210449219, - "logps/rejected": -214.31356811523438, - "loss": 0.6899, - "rewards/accuracies": 0.3499999940395355, - "rewards/chosen": 0.015186784788966179, - "rewards/margins": 0.006457159761339426, - "rewards/rejected": 0.008729624561965466, + "learning_rate": 2.8067361668003205e-08, + "logits/chosen": -3.0402631759643555, + "logits/rejected": -3.0895168781280518, + "logps/chosen": -291.98260498046875, + "logps/rejected": -212.9949951171875, + "loss": 0.6867, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.022620830684900284, + "rewards/margins": 0.023156987503170967, + "rewards/rejected": -0.0005361553630791605, "step": 70 }, { "epoch": 0.02, - "learning_rate": 3.369839932603201e-08, - "logits/chosen": -3.0162465572357178, - "logits/rejected": -2.957993507385254, - "logps/chosen": -293.937255859375, - "logps/rejected": -307.1500549316406, - "loss": 0.6906, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.015594196505844593, - "rewards/margins": 0.021771308034658432, - "rewards/rejected": -0.006177111063152552, + "learning_rate": 3.2076984763432236e-08, + "logits/chosen": -3.006594181060791, + "logits/rejected": -2.9405314922332764, + "logps/chosen": -149.168212890625, + "logps/rejected": -167.89016723632812, + "loss": 0.6876, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.001428012503311038, + "rewards/margins": -0.004226250108331442, + "rewards/rejected": 0.005654263310134411, "step": 80 }, { "epoch": 0.02, - "learning_rate": 3.791069924178601e-08, - "logits/chosen": -2.7287724018096924, - "logits/rejected": -2.6703319549560547, - "logps/chosen": -355.0783386230469, - "logps/rejected": -228.819091796875, - "loss": 0.687, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.013224788010120392, - "rewards/margins": 0.025851404294371605, - "rewards/rejected": -0.012626620009541512, + "learning_rate": 3.608660785886127e-08, + "logits/chosen": -3.099435329437256, + "logits/rejected": -3.0696494579315186, + "logps/chosen": -204.51351928710938, + "logps/rejected": -225.5569610595703, + "loss": 0.6852, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.02792467176914215, + "rewards/margins": 0.028380069881677628, + "rewards/rejected": -0.00045539866550825536, "step": 90 }, { - "epoch": 0.03, - "learning_rate": 4.212299915754002e-08, - "logits/chosen": -2.8378939628601074, - "logits/rejected": -2.860485792160034, - "logps/chosen": -337.00531005859375, - "logps/rejected": -187.0914306640625, - "loss": 0.6873, - "rewards/accuracies": 0.4000000059604645, - "rewards/chosen": 0.012190933339297771, - "rewards/margins": 0.007645601872354746, - "rewards/rejected": 0.004545331001281738, + "epoch": 0.02, + "learning_rate": 4.009623095429029e-08, + "logits/chosen": -2.948312759399414, + "logits/rejected": -3.02382493019104, + "logps/chosen": -283.7791442871094, + "logps/rejected": -193.44203186035156, + "loss": 0.6817, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.00888245739042759, + "rewards/margins": 0.027316834777593613, + "rewards/rejected": -0.01843438111245632, + "step": 100 + }, + { + "epoch": 0.02, + "eval_logits/chosen": -3.155965805053711, + "eval_logits/rejected": -3.160491943359375, + "eval_logps/chosen": -195.81240844726562, + "eval_logps/rejected": -184.39120483398438, + "eval_loss": 0.6873495578765869, + "eval_rewards/accuracies": 0.5149999856948853, + "eval_rewards/chosen": 0.01486087404191494, + "eval_rewards/margins": 0.014673066325485706, + "eval_rewards/rejected": 0.00018780909886118025, + "eval_runtime": 131.9307, + "eval_samples_per_second": 23.922, + "eval_steps_per_second": 0.379, "step": 100 }, { "epoch": 0.03, - "learning_rate": 4.6335299073294016e-08, - "logits/chosen": -2.791350841522217, - "logits/rejected": -2.7435081005096436, - "logps/chosen": -152.36270141601562, - "logps/rejected": -189.67120361328125, - "loss": 0.6811, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.0046439082361757755, - "rewards/margins": 0.01751074194908142, - "rewards/rejected": -0.012866830453276634, + "learning_rate": 4.410585404971932e-08, + "logits/chosen": -3.0717997550964355, + "logits/rejected": -3.0305328369140625, + "logps/chosen": -255.3395538330078, + "logps/rejected": -272.1784362792969, + "loss": 0.6873, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.020307859405875206, + "rewards/margins": 0.026331651955842972, + "rewards/rejected": -0.006023784633725882, "step": 110 }, { "epoch": 0.03, - "learning_rate": 5.054759898904802e-08, - "logits/chosen": -2.9045522212982178, - "logits/rejected": -2.8898544311523438, - "logps/chosen": -241.04635620117188, - "logps/rejected": -231.09036254882812, - "loss": 0.681, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.030125772580504417, - "rewards/margins": 0.03214115649461746, - "rewards/rejected": -0.0020153801888227463, + "learning_rate": 4.8115477145148354e-08, + "logits/chosen": -3.053476333618164, + "logits/rejected": -3.0541749000549316, + "logps/chosen": -267.1380310058594, + "logps/rejected": -266.35064697265625, + "loss": 0.6843, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.017900725826621056, + "rewards/margins": 0.024556264281272888, + "rewards/rejected": -0.006655541248619556, "step": 120 }, { "epoch": 0.03, - "learning_rate": 5.475989890480202e-08, - "logits/chosen": -2.863396167755127, - "logits/rejected": -2.875047206878662, - "logps/chosen": -295.2554626464844, - "logps/rejected": -316.55908203125, - "loss": 0.6819, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.06428538262844086, - "rewards/margins": 0.02827184833586216, - "rewards/rejected": 0.03601354733109474, + "learning_rate": 5.2125100240577385e-08, + "logits/chosen": -3.1513009071350098, + "logits/rejected": -3.1172022819519043, + "logps/chosen": -257.81011962890625, + "logps/rejected": -253.70675659179688, + "loss": 0.6877, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.053882915526628494, + "rewards/margins": 0.04688756912946701, + "rewards/rejected": 0.006995342671871185, "step": 130 }, { - "epoch": 0.04, - "learning_rate": 5.897219882055602e-08, - "logits/chosen": -2.800774097442627, - "logits/rejected": -2.7655835151672363, - "logps/chosen": -227.52932739257812, - "logps/rejected": -209.88265991210938, - "loss": 0.6768, + "epoch": 0.03, + "learning_rate": 5.613472333600641e-08, + "logits/chosen": -3.1228244304656982, + "logits/rejected": -3.114727735519409, + "logps/chosen": -149.54647827148438, + "logps/rejected": -203.30245971679688, + "loss": 0.6789, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.055633824318647385, - "rewards/margins": 0.0319707877933979, - "rewards/rejected": 0.023663034662604332, + "rewards/chosen": 0.03608744591474533, + "rewards/margins": 0.019435148686170578, + "rewards/rejected": 0.016652299091219902, "step": 140 }, { "epoch": 0.04, - "learning_rate": 6.318449873631002e-08, - "logits/chosen": -2.7800867557525635, - "logits/rejected": -2.8257360458374023, - "logps/chosen": -201.40185546875, - "logps/rejected": -236.130859375, - "loss": 0.6778, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.06176251918077469, - "rewards/margins": 0.037913981825113297, - "rewards/rejected": 0.023848531767725945, + "learning_rate": 6.014434643143545e-08, + "logits/chosen": -3.123277425765991, + "logits/rejected": -3.159675121307373, + "logps/chosen": -198.58377075195312, + "logps/rejected": -202.8527374267578, + "loss": 0.6724, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07267072796821594, + "rewards/margins": 0.03419844061136246, + "rewards/rejected": 0.03847228363156319, "step": 150 }, { "epoch": 0.04, - "learning_rate": 6.739679865206401e-08, - "logits/chosen": -2.9548959732055664, - "logits/rejected": -2.835465669631958, - "logps/chosen": -235.51101684570312, - "logps/rejected": -188.86111450195312, - "loss": 0.6604, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.05679447576403618, - "rewards/margins": 0.043836116790771484, - "rewards/rejected": 0.012958364561200142, + "learning_rate": 6.415396952686447e-08, + "logits/chosen": -3.1125125885009766, + "logits/rejected": -3.0934643745422363, + "logps/chosen": -329.90679931640625, + "logps/rejected": -235.77151489257812, + "loss": 0.6684, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.09809212386608124, + "rewards/margins": 0.014599055051803589, + "rewards/rejected": 0.08349306136369705, "step": 160 }, { "epoch": 0.04, - "learning_rate": 7.160909856781803e-08, - "logits/chosen": -2.813509464263916, - "logits/rejected": -2.895786762237549, - "logps/chosen": -317.48004150390625, - "logps/rejected": -302.57806396484375, - "loss": 0.6769, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.09258848428726196, - "rewards/margins": 0.06396313011646271, - "rewards/rejected": 0.02862536534667015, + "learning_rate": 6.81635926222935e-08, + "logits/chosen": -3.142857313156128, + "logits/rejected": -3.1772620677948, + "logps/chosen": -248.07559204101562, + "logps/rejected": -237.1481170654297, + "loss": 0.6518, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.14629176259040833, + "rewards/margins": 0.13037565350532532, + "rewards/rejected": 0.0159161277115345, "step": 170 }, { - "epoch": 0.05, - "learning_rate": 7.582139848357203e-08, - "logits/chosen": -2.7334518432617188, - "logits/rejected": -2.749671697616577, - "logps/chosen": -183.47982788085938, - "logps/rejected": -292.4325866699219, - "loss": 0.6528, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.019605696201324463, - "rewards/margins": 0.04809585586190224, - "rewards/rejected": -0.028490161523222923, + "epoch": 0.04, + "learning_rate": 7.217321571772253e-08, + "logits/chosen": -3.1290981769561768, + "logits/rejected": -3.210472583770752, + "logps/chosen": -204.18447875976562, + "logps/rejected": -152.8319549560547, + "loss": 0.6598, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.06993807852268219, + "rewards/margins": 0.06618108600378036, + "rewards/rejected": 0.003756991820409894, "step": 180 }, { "epoch": 0.05, - "learning_rate": 8.003369839932602e-08, - "logits/chosen": -2.8567605018615723, - "logits/rejected": -2.8261303901672363, - "logps/chosen": -310.6612854003906, - "logps/rejected": -254.41552734375, - "loss": 0.6434, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.11722777038812637, - "rewards/margins": 0.0635734349489212, - "rewards/rejected": 0.05365434288978577, + "learning_rate": 7.618283881315156e-08, + "logits/chosen": -3.1935317516326904, + "logits/rejected": -3.1765670776367188, + "logps/chosen": -318.52032470703125, + "logps/rejected": -354.9051513671875, + "loss": 0.6575, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13720566034317017, + "rewards/margins": 0.08860117942094803, + "rewards/rejected": 0.048604488372802734, "step": 190 }, { "epoch": 0.05, - "learning_rate": 8.424599831508004e-08, - "logits/chosen": -2.7087759971618652, - "logits/rejected": -2.7274348735809326, - "logps/chosen": -306.12469482421875, - "logps/rejected": -233.0144500732422, - "loss": 0.6536, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.042596496641635895, - "rewards/margins": 0.04861391335725784, - "rewards/rejected": -0.00601741811260581, + "learning_rate": 8.019246190858058e-08, + "logits/chosen": -3.0937659740448, + "logits/rejected": -3.0600972175598145, + "logps/chosen": -196.54562377929688, + "logps/rejected": -250.0481719970703, + "loss": 0.6767, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0702294260263443, + "rewards/margins": -0.009940843097865582, + "rewards/rejected": 0.08017027378082275, "step": 200 }, { "epoch": 0.05, - "learning_rate": 8.845829823083403e-08, - "logits/chosen": -2.6913976669311523, - "logits/rejected": -2.7625555992126465, - "logps/chosen": -195.1536865234375, - "logps/rejected": -288.43914794921875, - "loss": 0.6492, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.08051207661628723, - "rewards/margins": 0.15957090258598328, - "rewards/rejected": -0.07905881106853485, + "eval_logits/chosen": -3.160501718521118, + "eval_logits/rejected": -3.1654369831085205, + "eval_logps/chosen": -195.13624572753906, + "eval_logps/rejected": -184.2245635986328, + "eval_loss": 0.661376953125, + "eval_rewards/accuracies": 0.5575000047683716, + "eval_rewards/chosen": 0.08247680962085724, + "eval_rewards/margins": 0.06562582403421402, + "eval_rewards/rejected": 0.01685098186135292, + "eval_runtime": 131.9707, + "eval_samples_per_second": 23.914, + "eval_steps_per_second": 0.379, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 8.420208500400962e-08, + "logits/chosen": -3.0224945545196533, + "logits/rejected": -3.0139596462249756, + "logps/chosen": -192.14273071289062, + "logps/rejected": -222.08645629882812, + "loss": 0.649, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.15767574310302734, + "rewards/margins": 0.06203259155154228, + "rewards/rejected": 0.09564316272735596, "step": 210 }, { - "epoch": 0.06, - "learning_rate": 9.267059814658803e-08, - "logits/chosen": -2.8155956268310547, - "logits/rejected": -2.858513355255127, - "logps/chosen": -269.7468566894531, - "logps/rejected": -209.1760711669922, - "loss": 0.6394, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.09490491449832916, - "rewards/margins": 0.09294173866510391, - "rewards/rejected": 0.0019631728064268827, + "epoch": 0.05, + "learning_rate": 8.821170809943865e-08, + "logits/chosen": -3.130227565765381, + "logits/rejected": -3.1300535202026367, + "logps/chosen": -188.0099639892578, + "logps/rejected": -218.964111328125, + "loss": 0.6377, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": 0.1188805103302002, + "rewards/margins": 0.02827729657292366, + "rewards/rejected": 0.09060321748256683, "step": 220 }, { "epoch": 0.06, - "learning_rate": 9.688289806234204e-08, - "logits/chosen": -2.7522032260894775, - "logits/rejected": -2.6138253211975098, - "logps/chosen": -223.89501953125, - "logps/rejected": -219.81045532226562, - "loss": 0.6445, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.024391641840338707, - "rewards/margins": 0.02039915695786476, - "rewards/rejected": -0.04479080066084862, + "learning_rate": 9.222133119486767e-08, + "logits/chosen": -3.119661808013916, + "logits/rejected": -3.1264634132385254, + "logps/chosen": -318.0464782714844, + "logps/rejected": -249.5142059326172, + "loss": 0.618, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.3172004818916321, + "rewards/margins": 0.2074754238128662, + "rewards/rejected": 0.10972510278224945, "step": 230 }, { "epoch": 0.06, - "learning_rate": 1.0109519797809604e-07, - "logits/chosen": -2.892430543899536, - "logits/rejected": -2.9211509227752686, - "logps/chosen": -262.0943908691406, - "logps/rejected": -300.74822998046875, - "loss": 0.64, + "learning_rate": 9.623095429029671e-08, + "logits/chosen": -3.119455099105835, + "logits/rejected": -3.1523375511169434, + "logps/chosen": -250.0281219482422, + "logps/rejected": -182.06683349609375, + "loss": 0.6517, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.1830977201461792, - "rewards/margins": 0.11756487190723419, - "rewards/rejected": 0.0655328631401062, + "rewards/chosen": 0.20795579254627228, + "rewards/margins": 0.1413995325565338, + "rewards/rejected": 0.06655625998973846, "step": 240 }, { "epoch": 0.06, - "learning_rate": 1.0530749789385003e-07, - "logits/chosen": -2.816969633102417, - "logits/rejected": -2.772429943084717, - "logps/chosen": -212.73501586914062, - "logps/rejected": -219.41799926757812, - "loss": 0.6396, + "learning_rate": 1.0024057738572573e-07, + "logits/chosen": -3.002434730529785, + "logits/rejected": -3.0305192470550537, + "logps/chosen": -248.87380981445312, + "logps/rejected": -194.0207061767578, + "loss": 0.6234, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.05148952081799507, - "rewards/margins": 0.03601338341832161, - "rewards/rejected": 0.015476112253963947, + "rewards/chosen": 0.25454169511795044, + "rewards/margins": 0.15869472920894623, + "rewards/rejected": 0.0958469957113266, "step": 250 }, { - "epoch": 0.07, - "learning_rate": 1.0951979780960404e-07, - "logits/chosen": -2.801922082901001, - "logits/rejected": -2.7320210933685303, - "logps/chosen": -311.57659912109375, - "logps/rejected": -213.0792694091797, - "loss": 0.6479, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.11519031226634979, - "rewards/margins": 0.20116600394248962, - "rewards/rejected": -0.08597570657730103, + "epoch": 0.06, + "learning_rate": 1.0425020048115477e-07, + "logits/chosen": -3.1732025146484375, + "logits/rejected": -3.164207935333252, + "logps/chosen": -187.8583984375, + "logps/rejected": -183.4801788330078, + "loss": 0.6354, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.21577689051628113, + "rewards/margins": 0.15044564008712769, + "rewards/rejected": 0.06533125042915344, "step": 260 }, { - "epoch": 0.07, - "learning_rate": 1.1373209772535804e-07, - "logits/chosen": -2.9025025367736816, - "logits/rejected": -2.8095669746398926, - "logps/chosen": -204.12356567382812, - "logps/rejected": -184.1841278076172, - "loss": 0.6558, + "epoch": 0.06, + "learning_rate": 1.082598235765838e-07, + "logits/chosen": -3.137756824493408, + "logits/rejected": -3.0722391605377197, + "logps/chosen": -245.33425903320312, + "logps/rejected": -235.2985382080078, + "loss": 0.6571, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.18039628863334656, - "rewards/margins": 0.17073126137256622, - "rewards/rejected": 0.009665054269134998, + "rewards/chosen": 0.04129766300320625, + "rewards/margins": 0.1584317684173584, + "rewards/rejected": -0.11713409423828125, "step": 270 }, { "epoch": 0.07, - "learning_rate": 1.1794439764111204e-07, - "logits/chosen": -2.873542308807373, - "logits/rejected": -2.6858971118927, - "logps/chosen": -302.70599365234375, - "logps/rejected": -284.6993103027344, - "loss": 0.6326, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.10259075462818146, - "rewards/margins": 0.242101788520813, - "rewards/rejected": -0.13951101899147034, + "learning_rate": 1.1226944667201282e-07, + "logits/chosen": -3.0344252586364746, + "logits/rejected": -2.9903206825256348, + "logps/chosen": -152.6925506591797, + "logps/rejected": -237.93588256835938, + "loss": 0.6339, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10848043113946915, + "rewards/margins": 0.07681788504123688, + "rewards/rejected": 0.03166256099939346, "step": 280 }, { "epoch": 0.07, - "learning_rate": 1.2215669755686605e-07, - "logits/chosen": -2.8171753883361816, - "logits/rejected": -2.789837598800659, - "logps/chosen": -176.04864501953125, - "logps/rejected": -190.3517303466797, - "loss": 0.6455, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.07305292040109634, - "rewards/margins": 0.2632436156272888, - "rewards/rejected": -0.19019068777561188, + "learning_rate": 1.1627906976744186e-07, + "logits/chosen": -3.100961685180664, + "logits/rejected": -3.1213812828063965, + "logps/chosen": -274.2236022949219, + "logps/rejected": -230.8939666748047, + "loss": 0.6191, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06249620392918587, + "rewards/margins": 0.1558597832918167, + "rewards/rejected": -0.09336356818675995, "step": 290 }, { - "epoch": 0.08, - "learning_rate": 1.2636899747262003e-07, - "logits/chosen": -2.90800404548645, - "logits/rejected": -2.73028302192688, - "logps/chosen": -283.43798828125, - "logps/rejected": -231.88259887695312, - "loss": 0.5735, + "epoch": 0.07, + "learning_rate": 1.202886928628709e-07, + "logits/chosen": -3.0173017978668213, + "logits/rejected": -3.0081381797790527, + "logps/chosen": -297.2784729003906, + "logps/rejected": -404.6129455566406, + "loss": 0.6328, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.24098046123981476, - "rewards/margins": 0.5552220344543457, - "rewards/rejected": -0.31424158811569214, + "rewards/chosen": 0.21068871021270752, + "rewards/margins": 0.30474939942359924, + "rewards/rejected": -0.0940607339143753, "step": 300 }, { - "epoch": 0.08, - "learning_rate": 1.3058129738837404e-07, - "logits/chosen": -2.6579442024230957, - "logits/rejected": -2.7264251708984375, - "logps/chosen": -171.2255096435547, - "logps/rejected": -202.13211059570312, - "loss": 0.5957, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.09882710874080658, - "rewards/margins": 0.27224260568618774, - "rewards/rejected": -0.17341549694538116, + "epoch": 0.07, + "eval_logits/chosen": -3.15289306640625, + "eval_logits/rejected": -3.157904624938965, + "eval_logps/chosen": -196.3348846435547, + "eval_logps/rejected": -186.50473022460938, + "eval_loss": 0.6246495842933655, + "eval_rewards/accuracies": 0.5874999761581421, + "eval_rewards/chosen": -0.037386391311883926, + "eval_rewards/margins": 0.17377792298793793, + "eval_rewards/rejected": -0.21116434037685394, + "eval_runtime": 132.2126, + "eval_samples_per_second": 23.871, + "eval_steps_per_second": 0.378, + "step": 300 + }, + { + "epoch": 0.07, + "learning_rate": 1.242983159582999e-07, + "logits/chosen": -3.041438102722168, + "logits/rejected": -3.0385303497314453, + "logps/chosen": -219.4403076171875, + "logps/rejected": -203.63101196289062, + "loss": 0.6228, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.23164334893226624, + "rewards/margins": 0.005232417490333319, + "rewards/rejected": -0.2368757277727127, "step": 310 }, { "epoch": 0.08, - "learning_rate": 1.3479359730412803e-07, - "logits/chosen": -2.829624891281128, - "logits/rejected": -2.885951519012451, - "logps/chosen": -294.4091796875, - "logps/rejected": -299.72235107421875, - "loss": 0.6045, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.20352359116077423, - "rewards/margins": 0.23111791908740997, - "rewards/rejected": -0.027594303712248802, + "learning_rate": 1.2830793905372894e-07, + "logits/chosen": -3.1399011611938477, + "logits/rejected": -3.166691303253174, + "logps/chosen": -286.4032897949219, + "logps/rejected": -232.0251007080078, + "loss": 0.6262, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1417991816997528, + "rewards/margins": 0.2163160741329193, + "rewards/rejected": -0.0745168924331665, "step": 320 }, { "epoch": 0.08, - "learning_rate": 1.3900589721988204e-07, - "logits/chosen": -2.950706958770752, - "logits/rejected": -2.893571138381958, - "logps/chosen": -417.59442138671875, - "logps/rejected": -337.65692138671875, - "loss": 0.6005, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.16790476441383362, - "rewards/margins": 0.23245680332183838, - "rewards/rejected": -0.06455201655626297, + "learning_rate": 1.3231756214915798e-07, + "logits/chosen": -3.0583558082580566, + "logits/rejected": -3.1892318725585938, + "logps/chosen": -366.7509765625, + "logps/rejected": -294.9625549316406, + "loss": 0.579, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.20990796387195587, + "rewards/margins": 0.5760718584060669, + "rewards/rejected": -0.3661639094352722, "step": 330 }, { - "epoch": 0.09, - "learning_rate": 1.4321819713563605e-07, - "logits/chosen": -2.7175984382629395, - "logits/rejected": -2.662710189819336, - "logps/chosen": -268.44659423828125, - "logps/rejected": -274.83990478515625, - "loss": 0.6299, + "epoch": 0.08, + "learning_rate": 1.36327185244587e-07, + "logits/chosen": -2.9049999713897705, + "logits/rejected": -2.84378719329834, + "logps/chosen": -208.74185180664062, + "logps/rejected": -172.14341735839844, + "loss": 0.5724, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.08806786686182022, - "rewards/margins": 0.3283998966217041, - "rewards/rejected": -0.24033205211162567, + "rewards/chosen": -0.10089479386806488, + "rewards/margins": 0.3428480327129364, + "rewards/rejected": -0.4437428116798401, "step": 340 }, { - "epoch": 0.09, - "learning_rate": 1.4743049705139004e-07, - "logits/chosen": -2.8463797569274902, - "logits/rejected": -2.8426125049591064, - "logps/chosen": -237.9515838623047, - "logps/rejected": -267.5751037597656, - "loss": 0.6076, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.33398309350013733, - "rewards/margins": 0.43801507353782654, - "rewards/rejected": -0.10403194278478622, + "epoch": 0.08, + "learning_rate": 1.4033680834001603e-07, + "logits/chosen": -2.7481255531311035, + "logits/rejected": -2.8643033504486084, + "logps/chosen": -232.1314697265625, + "logps/rejected": -163.34121704101562, + "loss": 0.5603, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.011425286531448364, + "rewards/margins": 0.2193736582994461, + "rewards/rejected": -0.23079895973205566, "step": 350 }, { "epoch": 0.09, - "learning_rate": 1.5164279696714405e-07, - "logits/chosen": -2.76839542388916, - "logits/rejected": -2.7172908782958984, - "logps/chosen": -237.4521942138672, - "logps/rejected": -197.22958374023438, - "loss": 0.6356, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.15721561014652252, - "rewards/margins": 0.23134712874889374, - "rewards/rejected": -0.07413151115179062, + "learning_rate": 1.4434643143544507e-07, + "logits/chosen": -3.1650760173797607, + "logits/rejected": -3.107818603515625, + "logps/chosen": -274.85430908203125, + "logps/rejected": -269.91156005859375, + "loss": 0.5828, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06234749034047127, + "rewards/margins": 0.34251466393470764, + "rewards/rejected": -0.2801671624183655, "step": 360 }, { "epoch": 0.09, - "learning_rate": 1.5585509688289806e-07, - "logits/chosen": -2.804783821105957, - "logits/rejected": -2.7225444316864014, - "logps/chosen": -200.27352905273438, - "logps/rejected": -182.17044067382812, - "loss": 0.6186, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.11609198898077011, - "rewards/margins": 0.4650752544403076, - "rewards/rejected": -0.3489832282066345, + "learning_rate": 1.483560545308741e-07, + "logits/chosen": -3.112389087677002, + "logits/rejected": -3.1251137256622314, + "logps/chosen": -182.7734375, + "logps/rejected": -182.17208862304688, + "loss": 0.5815, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.08121310919523239, + "rewards/margins": 0.3062261939048767, + "rewards/rejected": -0.22501309216022491, "step": 370 }, { - "epoch": 0.1, - "learning_rate": 1.6006739679865205e-07, - "logits/chosen": -2.867514133453369, - "logits/rejected": -2.7255942821502686, - "logps/chosen": -240.0352020263672, - "logps/rejected": -234.1785125732422, - "loss": 0.5953, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.3244686722755432, - "rewards/margins": 0.2539460361003876, - "rewards/rejected": 0.07052260637283325, + "epoch": 0.09, + "learning_rate": 1.5236567762630312e-07, + "logits/chosen": -3.1370511054992676, + "logits/rejected": -3.08475923538208, + "logps/chosen": -256.37689208984375, + "logps/rejected": -252.94436645507812, + "loss": 0.584, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.11396662145853043, + "rewards/margins": 0.3842363953590393, + "rewards/rejected": -0.27026981115341187, "step": 380 }, + { + "epoch": 0.09, + "learning_rate": 1.5637530072173216e-07, + "logits/chosen": -3.172546625137329, + "logits/rejected": -3.1299076080322266, + "logps/chosen": -323.76556396484375, + "logps/rejected": -186.3572998046875, + "loss": 0.5368, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34009110927581787, + "rewards/margins": 0.5798455476760864, + "rewards/rejected": -0.23975440859794617, + "step": 390 + }, { "epoch": 0.1, - "learning_rate": 1.6427969671440606e-07, - "logits/chosen": -2.634826183319092, - "logits/rejected": -2.6665759086608887, - "logps/chosen": -243.5812530517578, - "logps/rejected": -324.9338684082031, - "loss": 0.6195, + "learning_rate": 1.6038492381716117e-07, + "logits/chosen": -3.1632840633392334, + "logits/rejected": -3.1762077808380127, + "logps/chosen": -245.2843017578125, + "logps/rejected": -274.73828125, + "loss": 0.5919, "rewards/accuracies": 0.5, - "rewards/chosen": 0.01270450372248888, - "rewards/margins": 0.18538111448287964, - "rewards/rejected": -0.17267660796642303, - "step": 390 + "rewards/chosen": 0.2282308042049408, + "rewards/margins": 0.057515304535627365, + "rewards/rejected": 0.17071552574634552, + "step": 400 }, { "epoch": 0.1, - "learning_rate": 1.6849199663016007e-07, - "logits/chosen": -2.812776565551758, - "logits/rejected": -2.7242472171783447, - "logps/chosen": -211.7786102294922, - "logps/rejected": -196.15328979492188, - "loss": 0.6083, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.20752961933612823, - "rewards/margins": 0.26783204078674316, - "rewards/rejected": -0.06030241772532463, + "eval_logits/chosen": -3.1243319511413574, + "eval_logits/rejected": -3.1291916370391846, + "eval_logps/chosen": -193.14886474609375, + "eval_logps/rejected": -185.0590057373047, + "eval_loss": 0.5977873802185059, + "eval_rewards/accuracies": 0.612500011920929, + "eval_rewards/chosen": 0.2812157869338989, + "eval_rewards/margins": 0.34780701994895935, + "eval_rewards/rejected": -0.06659123301506042, + "eval_runtime": 132.1302, + "eval_samples_per_second": 23.886, + "eval_steps_per_second": 0.378, "step": 400 }, { "epoch": 0.1, - "learning_rate": 1.7270429654591406e-07, - "logits/chosen": -2.7651867866516113, - "logits/rejected": -2.794961452484131, - "logps/chosen": -200.4475555419922, - "logps/rejected": -193.27154541015625, - "loss": 0.6431, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.17628836631774902, - "rewards/margins": 0.17924582958221436, - "rewards/rejected": -0.0029574513901025057, + "learning_rate": 1.6439454691259023e-07, + "logits/chosen": -3.026432991027832, + "logits/rejected": -2.996279239654541, + "logps/chosen": -304.29425048828125, + "logps/rejected": -310.5896911621094, + "loss": 0.5434, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.5693452954292297, + "rewards/margins": 0.5008207559585571, + "rewards/rejected": 0.06852452456951141, "step": 410 }, { - "epoch": 0.11, - "learning_rate": 1.7691659646166807e-07, - "logits/chosen": -3.0512638092041016, - "logits/rejected": -2.8745832443237305, - "logps/chosen": -388.82977294921875, - "logps/rejected": -230.249755859375, - "loss": 0.5436, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.42595309019088745, - "rewards/margins": 0.3382980525493622, - "rewards/rejected": 0.08765505254268646, + "epoch": 0.1, + "learning_rate": 1.6840417000801924e-07, + "logits/chosen": -3.0537185668945312, + "logits/rejected": -3.080775737762451, + "logps/chosen": -276.4287109375, + "logps/rejected": -234.7353515625, + "loss": 0.5993, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.372450053691864, + "rewards/margins": 0.5930916666984558, + "rewards/rejected": -0.22064165771007538, "step": 420 }, { - "epoch": 0.11, - "learning_rate": 1.8112889637742208e-07, - "logits/chosen": -2.8399643898010254, - "logits/rejected": -2.7200350761413574, - "logps/chosen": -215.1141815185547, - "logps/rejected": -203.30105590820312, - "loss": 0.609, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.2471599280834198, - "rewards/margins": 0.24380986392498016, - "rewards/rejected": 0.0033500641584396362, + "epoch": 0.1, + "learning_rate": 1.7241379310344828e-07, + "logits/chosen": -2.835801362991333, + "logits/rejected": -2.925590991973877, + "logps/chosen": -251.0234375, + "logps/rejected": -314.4089660644531, + "loss": 0.5472, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.020180488005280495, + "rewards/margins": 0.1645812690258026, + "rewards/rejected": -0.18476173281669617, "step": 430 }, { "epoch": 0.11, - "learning_rate": 1.8534119629317606e-07, - "logits/chosen": -2.8783884048461914, - "logits/rejected": -2.8517260551452637, - "logps/chosen": -260.9423522949219, - "logps/rejected": -225.1972198486328, - "loss": 0.6453, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.18088439106941223, - "rewards/margins": 0.19021722674369812, - "rewards/rejected": -0.00933288224041462, + "learning_rate": 1.764234161988773e-07, + "logits/chosen": -3.031013011932373, + "logits/rejected": -2.992793321609497, + "logps/chosen": -231.69900512695312, + "logps/rejected": -229.4535369873047, + "loss": 0.569, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.10661451518535614, + "rewards/margins": 0.8876360058784485, + "rewards/rejected": -0.7810214161872864, "step": 440 }, { "epoch": 0.11, - "learning_rate": 1.8955349620893008e-07, - "logits/chosen": -2.776001214981079, - "logits/rejected": -2.8345043659210205, - "logps/chosen": -240.72265625, - "logps/rejected": -260.09356689453125, - "loss": 0.572, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.009611167944967747, - "rewards/margins": 0.2188769280910492, - "rewards/rejected": -0.20926575362682343, + "learning_rate": 1.8043303929430633e-07, + "logits/chosen": -3.0358529090881348, + "logits/rejected": -3.04333758354187, + "logps/chosen": -175.71751403808594, + "logps/rejected": -170.6195831298828, + "loss": 0.5853, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.42912259697914124, + "rewards/margins": 0.09240008890628815, + "rewards/rejected": -0.5215227007865906, "step": 450 }, { - "epoch": 0.12, - "learning_rate": 1.937657961246841e-07, - "logits/chosen": -2.6703200340270996, - "logits/rejected": -2.78947114944458, - "logps/chosen": -145.38352966308594, - "logps/rejected": -203.1894989013672, - "loss": 0.6171, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.023900752887129784, - "rewards/margins": 0.18232859671115875, - "rewards/rejected": -0.2062292993068695, + "epoch": 0.11, + "learning_rate": 1.8444266238973534e-07, + "logits/chosen": -3.0174341201782227, + "logits/rejected": -3.070925235748291, + "logps/chosen": -250.961181640625, + "logps/rejected": -262.90216064453125, + "loss": 0.6231, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.16643106937408447, + "rewards/margins": 0.5589379072189331, + "rewards/rejected": -0.39250683784484863, "step": 460 }, { - "epoch": 0.12, - "learning_rate": 1.9797809604043807e-07, - "logits/chosen": -2.8545408248901367, - "logits/rejected": -2.7196555137634277, - "logps/chosen": -252.4641571044922, - "logps/rejected": -168.0990753173828, - "loss": 0.5999, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.4809208810329437, - "rewards/margins": 0.6154331564903259, - "rewards/rejected": -0.1345122754573822, + "epoch": 0.11, + "learning_rate": 1.884522854851644e-07, + "logits/chosen": -3.088111400604248, + "logits/rejected": -3.10201358795166, + "logps/chosen": -239.4950408935547, + "logps/rejected": -177.04478454589844, + "loss": 0.6463, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04864073917269707, + "rewards/margins": 0.5137122273445129, + "rewards/rejected": -0.465071439743042, "step": 470 }, { "epoch": 0.12, - "learning_rate": 2.0219039595619208e-07, - "logits/chosen": -2.9020743370056152, - "logits/rejected": -2.805799961090088, - "logps/chosen": -278.7673034667969, - "logps/rejected": -270.74188232421875, - "loss": 0.5801, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.48040515184402466, - "rewards/margins": 0.5795575976371765, - "rewards/rejected": -0.09915249049663544, + "learning_rate": 1.9246190858059342e-07, + "logits/chosen": -3.1442697048187256, + "logits/rejected": -3.1288418769836426, + "logps/chosen": -271.8927917480469, + "logps/rejected": -226.47738647460938, + "loss": 0.5608, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.44616442918777466, + "rewards/margins": 0.7041479349136353, + "rewards/rejected": -0.257983535528183, "step": 480 }, { "epoch": 0.12, - "learning_rate": 2.064026958719461e-07, - "logits/chosen": -2.7485694885253906, - "logits/rejected": -2.668761730194092, - "logps/chosen": -173.67547607421875, - "logps/rejected": -155.75164794921875, - "loss": 0.5928, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.32208365201950073, - "rewards/margins": 0.7304231524467468, - "rewards/rejected": -0.4083394408226013, + "learning_rate": 1.9647153167602245e-07, + "logits/chosen": -3.010608196258545, + "logits/rejected": -3.012549638748169, + "logps/chosen": -152.94723510742188, + "logps/rejected": -265.013671875, + "loss": 0.559, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.020343530923128128, + "rewards/margins": 0.3226454555988312, + "rewards/rejected": -0.3429889976978302, "step": 490 }, { - "epoch": 0.13, - "learning_rate": 2.1061499578770005e-07, - "logits/chosen": -2.8859145641326904, - "logits/rejected": -2.747610569000244, - "logps/chosen": -401.01629638671875, - "logps/rejected": -255.532470703125, - "loss": 0.5404, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.06550431251525879, - "rewards/margins": 0.473673015832901, - "rewards/rejected": -0.40816861391067505, + "epoch": 0.12, + "learning_rate": 2.0048115477145147e-07, + "logits/chosen": -2.9077136516571045, + "logits/rejected": -2.943850040435791, + "logps/chosen": -294.1748962402344, + "logps/rejected": -272.5565185546875, + "loss": 0.5545, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.11357943713665009, + "rewards/margins": 0.5857218503952026, + "rewards/rejected": -0.47214239835739136, "step": 500 }, { - "epoch": 0.13, - "learning_rate": 2.1482729570345407e-07, - "logits/chosen": -2.8993895053863525, - "logits/rejected": -2.767453193664551, - "logps/chosen": -269.9494323730469, - "logps/rejected": -317.01702880859375, - "loss": 0.5515, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.41263261437416077, - "rewards/margins": 0.5958629250526428, - "rewards/rejected": -0.18323031067848206, + "epoch": 0.12, + "eval_logits/chosen": -3.078751564025879, + "eval_logits/rejected": -3.0819265842437744, + "eval_logps/chosen": -194.21905517578125, + "eval_logps/rejected": -187.2035369873047, + "eval_loss": 0.5800355672836304, + "eval_rewards/accuracies": 0.6274999976158142, + "eval_rewards/chosen": 0.17419549822807312, + "eval_rewards/margins": 0.4552420675754547, + "eval_rewards/rejected": -0.2810465693473816, + "eval_runtime": 132.5261, + "eval_samples_per_second": 23.814, + "eval_steps_per_second": 0.377, + "step": 500 + }, + { + "epoch": 0.12, + "learning_rate": 2.044907778668805e-07, + "logits/chosen": -3.0181641578674316, + "logits/rejected": -2.993478298187256, + "logps/chosen": -294.31646728515625, + "logps/rejected": -215.51132202148438, + "loss": 0.6949, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3190132677555084, + "rewards/margins": 0.5404255390167236, + "rewards/rejected": -0.22141222655773163, "step": 510 }, { "epoch": 0.13, - "learning_rate": 2.1903959561920808e-07, - "logits/chosen": -2.8467533588409424, - "logits/rejected": -2.801426410675049, - "logps/chosen": -204.9807586669922, - "logps/rejected": -192.07164001464844, - "loss": 0.5514, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.3908511698246002, - "rewards/margins": 0.37529826164245605, - "rewards/rejected": 0.015552910976111889, + "learning_rate": 2.0850040096230954e-07, + "logits/chosen": -3.1144859790802, + "logits/rejected": -3.109499931335449, + "logps/chosen": -323.41790771484375, + "logps/rejected": -242.5978546142578, + "loss": 0.5961, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.13799306750297546, + "rewards/margins": 0.7062627673149109, + "rewards/rejected": -0.5682697296142578, "step": 520 }, { "epoch": 0.13, - "learning_rate": 2.2325189553496206e-07, - "logits/chosen": -2.7452545166015625, - "logits/rejected": -2.702454090118408, - "logps/chosen": -204.1117401123047, - "logps/rejected": -267.8322448730469, - "loss": 0.5728, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.40074747800827026, - "rewards/margins": 0.31568995118141174, - "rewards/rejected": 0.0850575715303421, + "learning_rate": 2.1251002405773858e-07, + "logits/chosen": -3.0933876037597656, + "logits/rejected": -3.1562013626098633, + "logps/chosen": -324.9996643066406, + "logps/rejected": -264.2984619140625, + "loss": 0.5526, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.4742143750190735, + "rewards/margins": 0.3282214105129242, + "rewards/rejected": 0.1459929496049881, "step": 530 }, { - "epoch": 0.14, - "learning_rate": 2.2746419545071608e-07, - "logits/chosen": -2.7847249507904053, - "logits/rejected": -2.720637083053589, - "logps/chosen": -281.24444580078125, - "logps/rejected": -255.8726043701172, - "loss": 0.6399, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.10813522338867188, - "rewards/margins": 0.3281122148036957, - "rewards/rejected": -0.2199770212173462, + "epoch": 0.13, + "learning_rate": 2.165196471531676e-07, + "logits/chosen": -2.8193135261535645, + "logits/rejected": -2.8608927726745605, + "logps/chosen": -206.00552368164062, + "logps/rejected": -217.15921020507812, + "loss": 0.6029, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20380251109600067, + "rewards/margins": 0.5336498618125916, + "rewards/rejected": -0.32984742522239685, "step": 540 }, { - "epoch": 0.14, - "learning_rate": 2.316764953664701e-07, - "logits/chosen": -2.8527252674102783, - "logits/rejected": -2.836733818054199, - "logps/chosen": -235.00033569335938, - "logps/rejected": -273.7145080566406, - "loss": 0.5523, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.5961872935295105, - "rewards/margins": 0.9351669549942017, - "rewards/rejected": -0.33897966146469116, + "epoch": 0.13, + "learning_rate": 2.2052927024859663e-07, + "logits/chosen": -2.9759271144866943, + "logits/rejected": -2.9960126876831055, + "logps/chosen": -294.0548095703125, + "logps/rejected": -235.9616241455078, + "loss": 0.6839, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.5202827453613281, + "rewards/margins": 0.5485485792160034, + "rewards/rejected": -0.028265809640288353, "step": 550 }, { - "epoch": 0.14, - "learning_rate": 2.3588879528222407e-07, - "logits/chosen": -2.7602150440216064, - "logits/rejected": -2.620371103286743, - "logps/chosen": -293.63043212890625, - "logps/rejected": -320.56707763671875, - "loss": 0.5559, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.47802114486694336, - "rewards/margins": 0.22451654076576233, - "rewards/rejected": 0.2535046637058258, + "epoch": 0.13, + "learning_rate": 2.2453889334402564e-07, + "logits/chosen": -3.057814836502075, + "logits/rejected": -3.041170358657837, + "logps/chosen": -193.94448852539062, + "logps/rejected": -117.39552307128906, + "loss": 0.5322, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5988059639930725, + "rewards/margins": 0.711750864982605, + "rewards/rejected": -0.11294497549533844, "step": 560 }, { "epoch": 0.14, - "learning_rate": 2.4010109519797806e-07, - "logits/chosen": -2.7937815189361572, - "logits/rejected": -2.7583813667297363, - "logps/chosen": -248.607666015625, - "logps/rejected": -213.4592742919922, - "loss": 0.5489, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.4767304062843323, - "rewards/margins": 0.6132498383522034, - "rewards/rejected": -0.13651947677135468, + "learning_rate": 2.285485164394547e-07, + "logits/chosen": -2.9411561489105225, + "logits/rejected": -2.9532182216644287, + "logps/chosen": -197.73008728027344, + "logps/rejected": -266.1523742675781, + "loss": 0.5541, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.37457144260406494, + "rewards/margins": 0.8404830694198608, + "rewards/rejected": -0.4659116268157959, "step": 570 }, { - "epoch": 0.15, - "learning_rate": 2.443133951137321e-07, - "logits/chosen": -2.705596685409546, - "logits/rejected": -2.764143705368042, - "logps/chosen": -183.74258422851562, - "logps/rejected": -181.19253540039062, - "loss": 0.5128, + "epoch": 0.14, + "learning_rate": 2.3255813953488372e-07, + "logits/chosen": -3.0148580074310303, + "logits/rejected": -3.022594451904297, + "logps/chosen": -205.1793670654297, + "logps/rejected": -208.3339385986328, + "loss": 0.5729, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.038012512028217316, - "rewards/margins": 0.4396725594997406, - "rewards/rejected": -0.40166011452674866, + "rewards/chosen": 0.6015924215316772, + "rewards/margins": 0.43703681230545044, + "rewards/rejected": 0.16455568373203278, "step": 580 }, { - "epoch": 0.15, - "learning_rate": 2.485256950294861e-07, - "logits/chosen": -2.7596192359924316, - "logits/rejected": -2.7716755867004395, - "logps/chosen": -170.6914520263672, - "logps/rejected": -194.93226623535156, - "loss": 0.4832, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.023831719532608986, - "rewards/margins": 0.9237872362136841, - "rewards/rejected": -0.8999554514884949, + "epoch": 0.14, + "learning_rate": 2.3656776263031275e-07, + "logits/chosen": -3.1047730445861816, + "logits/rejected": -3.103303909301758, + "logps/chosen": -265.89031982421875, + "logps/rejected": -201.85369873046875, + "loss": 0.5768, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.44284144043922424, + "rewards/margins": 0.6697565317153931, + "rewards/rejected": -0.22691497206687927, "step": 590 }, { - "epoch": 0.15, - "learning_rate": 2.5273799494524007e-07, - "logits/chosen": -2.7988181114196777, - "logits/rejected": -2.784140110015869, - "logps/chosen": -239.1433868408203, - "logps/rejected": -284.5286865234375, - "loss": 0.5607, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.14084307849407196, - "rewards/margins": 0.25364288687705994, - "rewards/rejected": -0.3944859504699707, + "epoch": 0.14, + "learning_rate": 2.405773857257418e-07, + "logits/chosen": -3.1247141361236572, + "logits/rejected": -3.1185617446899414, + "logps/chosen": -291.5391540527344, + "logps/rejected": -242.916748046875, + "loss": 0.5926, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4717758595943451, + "rewards/margins": 0.35362595319747925, + "rewards/rejected": 0.11814995855093002, + "step": 600 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -3.059744358062744, + "eval_logits/rejected": -3.0600857734680176, + "eval_logps/chosen": -193.55068969726562, + "eval_logps/rejected": -187.46932983398438, + "eval_loss": 0.5598892569541931, + "eval_rewards/accuracies": 0.6424999833106995, + "eval_rewards/chosen": 0.24103333055973053, + "eval_rewards/margins": 0.5486571788787842, + "eval_rewards/rejected": -0.30762383341789246, + "eval_runtime": 132.5499, + "eval_samples_per_second": 23.81, + "eval_steps_per_second": 0.377, "step": 600 }, { "epoch": 0.15, - "learning_rate": 2.5695029486099405e-07, - "logits/chosen": -2.7752766609191895, - "logits/rejected": -2.7815215587615967, - "logps/chosen": -264.5174560546875, - "logps/rejected": -266.3208923339844, - "loss": 0.5931, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.06395702064037323, - "rewards/margins": 0.6657644510269165, - "rewards/rejected": -0.6018074154853821, + "learning_rate": 2.445870088211708e-07, + "logits/chosen": -3.0544307231903076, + "logits/rejected": -3.066545009613037, + "logps/chosen": -284.37506103515625, + "logps/rejected": -265.29180908203125, + "loss": 0.6701, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.21369579434394836, + "rewards/margins": 0.05160403251647949, + "rewards/rejected": 0.16209176182746887, "step": 610 }, { - "epoch": 0.16, - "learning_rate": 2.611625947767481e-07, - "logits/chosen": -2.7037911415100098, - "logits/rejected": -2.7605366706848145, - "logps/chosen": -243.78945922851562, - "logps/rejected": -243.4246368408203, - "loss": 0.573, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.20454922318458557, - "rewards/margins": 0.645856499671936, - "rewards/rejected": -0.4413072466850281, + "epoch": 0.15, + "learning_rate": 2.485966319165998e-07, + "logits/chosen": -2.8753466606140137, + "logits/rejected": -2.9049174785614014, + "logps/chosen": -156.206298828125, + "logps/rejected": -183.92886352539062, + "loss": 0.7536, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.27397117018699646, + "rewards/margins": 0.48598161339759827, + "rewards/rejected": -0.2120104283094406, "step": 620 }, { - "epoch": 0.16, - "learning_rate": 2.653748946925021e-07, - "logits/chosen": -2.7574284076690674, - "logits/rejected": -2.7437338829040527, - "logps/chosen": -228.0672607421875, - "logps/rejected": -259.17694091796875, - "loss": 0.5244, + "epoch": 0.15, + "learning_rate": 2.526062550120289e-07, + "logits/chosen": -3.146651029586792, + "logits/rejected": -3.1471495628356934, + "logps/chosen": -212.36257934570312, + "logps/rejected": -189.55467224121094, + "loss": 0.6613, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.3170861601829529, - "rewards/margins": 0.26493725180625916, - "rewards/rejected": 0.05214894562959671, + "rewards/chosen": 0.21897678077220917, + "rewards/margins": 0.49531808495521545, + "rewards/rejected": -0.2763412594795227, "step": 630 }, { - "epoch": 0.16, - "learning_rate": 2.6958719460825606e-07, - "logits/chosen": -2.7019240856170654, - "logits/rejected": -2.7028634548187256, - "logps/chosen": -254.992431640625, - "logps/rejected": -231.47348022460938, - "loss": 0.6187, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.4512414038181305, - "rewards/margins": 0.4618772864341736, - "rewards/rejected": -0.010635855607688427, + "epoch": 0.15, + "learning_rate": 2.566158781074579e-07, + "logits/chosen": -3.024630069732666, + "logits/rejected": -3.013554811477661, + "logps/chosen": -242.3477783203125, + "logps/rejected": -228.82803344726562, + "loss": 0.5859, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03820228576660156, + "rewards/margins": 0.4131718575954437, + "rewards/rejected": -0.3749695420265198, "step": 640 }, { "epoch": 0.16, - "learning_rate": 2.737994945240101e-07, - "logits/chosen": -2.835756301879883, - "logits/rejected": -2.7334399223327637, - "logps/chosen": -396.8888854980469, - "logps/rejected": -331.6492614746094, - "loss": 0.6095, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.5410932302474976, - "rewards/margins": 0.8466060757637024, - "rewards/rejected": -0.305512934923172, + "learning_rate": 2.606255012028869e-07, + "logits/chosen": -2.880851984024048, + "logits/rejected": -2.9244561195373535, + "logps/chosen": -297.00787353515625, + "logps/rejected": -248.0228729248047, + "loss": 0.4965, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.26357099413871765, + "rewards/margins": 0.9044533967971802, + "rewards/rejected": -1.1680243015289307, "step": 650 }, { - "epoch": 0.17, - "learning_rate": 2.780117944397641e-07, - "logits/chosen": -2.6983981132507324, - "logits/rejected": -2.595994234085083, - "logps/chosen": -259.11248779296875, - "logps/rejected": -187.53634643554688, - "loss": 0.6914, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.18554158508777618, - "rewards/margins": 0.41711145639419556, - "rewards/rejected": -0.23156991600990295, + "epoch": 0.16, + "learning_rate": 2.6463512429831596e-07, + "logits/chosen": -2.9929141998291016, + "logits/rejected": -2.917968511581421, + "logps/chosen": -248.5657958984375, + "logps/rejected": -261.52398681640625, + "loss": 0.583, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42559370398521423, + "rewards/margins": 0.770692765712738, + "rewards/rejected": -1.1962864398956299, "step": 660 }, { - "epoch": 0.17, - "learning_rate": 2.8222409435551807e-07, - "logits/chosen": -2.6740572452545166, - "logits/rejected": -2.573707103729248, - "logps/chosen": -223.1436767578125, - "logps/rejected": -239.71603393554688, - "loss": 0.5766, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.2260672152042389, - "rewards/margins": 0.28128767013549805, - "rewards/rejected": -0.05522041767835617, + "epoch": 0.16, + "learning_rate": 2.68644747393745e-07, + "logits/chosen": -2.8122334480285645, + "logits/rejected": -2.849437952041626, + "logps/chosen": -219.21533203125, + "logps/rejected": -173.00262451171875, + "loss": 0.5735, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.467887818813324, + "rewards/margins": 0.44535714387893677, + "rewards/rejected": -0.9132450222969055, "step": 670 }, { - "epoch": 0.17, - "learning_rate": 2.864363942712721e-07, - "logits/chosen": -2.5910723209381104, - "logits/rejected": -2.700523614883423, - "logps/chosen": -147.84005737304688, - "logps/rejected": -224.4444580078125, - "loss": 0.6463, + "epoch": 0.16, + "learning_rate": 2.72654370489174e-07, + "logits/chosen": -2.9684383869171143, + "logits/rejected": -2.949713945388794, + "logps/chosen": -207.4556884765625, + "logps/rejected": -292.7832946777344, + "loss": 0.5441, "rewards/accuracies": 0.75, - "rewards/chosen": -0.4095599055290222, - "rewards/margins": 0.5575370788574219, - "rewards/rejected": -0.9670969843864441, + "rewards/chosen": -0.20602980256080627, + "rewards/margins": 0.847222626209259, + "rewards/rejected": -1.0532524585723877, "step": 680 }, { "epoch": 0.17, - "learning_rate": 2.906486941870261e-07, - "logits/chosen": -2.8283638954162598, - "logits/rejected": -2.7460625171661377, - "logps/chosen": -283.7747802734375, - "logps/rejected": -201.9105224609375, - "loss": 0.5628, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.2565240263938904, - "rewards/margins": 0.5982394814491272, - "rewards/rejected": -0.8547635078430176, + "learning_rate": 2.76663993584603e-07, + "logits/chosen": -3.066805362701416, + "logits/rejected": -3.0770373344421387, + "logps/chosen": -298.5137939453125, + "logps/rejected": -295.9231872558594, + "loss": 0.5347, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05317535251379013, + "rewards/margins": 0.40149515867233276, + "rewards/rejected": -0.4546705186367035, "step": 690 }, { - "epoch": 0.18, - "learning_rate": 2.948609941027801e-07, - "logits/chosen": -2.8705124855041504, - "logits/rejected": -2.8632733821868896, - "logps/chosen": -291.81622314453125, - "logps/rejected": -266.4309997558594, - "loss": 0.5825, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.031624965369701385, - "rewards/margins": 0.8366470336914062, - "rewards/rejected": -0.8682720065116882, + "epoch": 0.17, + "learning_rate": 2.8067361668003206e-07, + "logits/chosen": -2.739448070526123, + "logits/rejected": -2.7039871215820312, + "logps/chosen": -310.03009033203125, + "logps/rejected": -234.18838500976562, + "loss": 0.5326, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.47643113136291504, + "rewards/margins": 0.6174861788749695, + "rewards/rejected": -1.0939172506332397, "step": 700 }, { - "epoch": 0.18, - "learning_rate": 2.990732940185341e-07, - "logits/chosen": -2.5919554233551025, - "logits/rejected": -2.614384174346924, - "logps/chosen": -242.84140014648438, - "logps/rejected": -381.2159423828125, - "loss": 0.5726, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.7084521055221558, - "rewards/margins": 0.08272404968738556, - "rewards/rejected": -0.7911761999130249, + "epoch": 0.17, + "eval_logits/chosen": -2.909024238586426, + "eval_logits/rejected": -2.907602548599243, + "eval_logps/chosen": -198.4624481201172, + "eval_logps/rejected": -194.0914306640625, + "eval_loss": 0.538511335849762, + "eval_rewards/accuracies": 0.6399999856948853, + "eval_rewards/chosen": -0.25014111399650574, + "eval_rewards/margins": 0.7196922302246094, + "eval_rewards/rejected": -0.9698333740234375, + "eval_runtime": 132.3578, + "eval_samples_per_second": 23.844, + "eval_steps_per_second": 0.378, + "step": 700 + }, + { + "epoch": 0.17, + "learning_rate": 2.8468323977546113e-07, + "logits/chosen": -3.0001258850097656, + "logits/rejected": -3.0255441665649414, + "logps/chosen": -325.72698974609375, + "logps/rejected": -301.756591796875, + "loss": 0.5648, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.10249632596969604, + "rewards/margins": 0.6428587436676025, + "rewards/rejected": -0.7453551888465881, "step": 710 }, { - "epoch": 0.18, - "learning_rate": 3.032855939342881e-07, - "logits/chosen": -2.7366249561309814, - "logits/rejected": -2.763061761856079, - "logps/chosen": -290.13043212890625, - "logps/rejected": -257.94073486328125, - "loss": 0.5624, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.22357621788978577, - "rewards/margins": 0.9331549406051636, - "rewards/rejected": -1.156731128692627, + "epoch": 0.17, + "learning_rate": 2.8869286287089014e-07, + "logits/chosen": -2.958035945892334, + "logits/rejected": -2.994882822036743, + "logps/chosen": -305.9093322753906, + "logps/rejected": -243.5149688720703, + "loss": 0.5491, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20739097893238068, + "rewards/margins": 0.5625081658363342, + "rewards/rejected": -0.7698990702629089, "step": 720 }, { "epoch": 0.18, - "learning_rate": 3.074978938500421e-07, - "logits/chosen": -2.858980655670166, - "logits/rejected": -2.879626512527466, - "logps/chosen": -260.36572265625, - "logps/rejected": -331.29986572265625, - "loss": 0.5597, + "learning_rate": 2.9270248596631915e-07, + "logits/chosen": -2.8672804832458496, + "logits/rejected": -2.8485159873962402, + "logps/chosen": -254.518798828125, + "logps/rejected": -263.0560302734375, + "loss": 0.5835, "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.024600837379693985, - "rewards/margins": 0.9509462118148804, - "rewards/rejected": -0.9755471348762512, + "rewards/chosen": -0.3628667891025543, + "rewards/margins": 1.5132993459701538, + "rewards/rejected": -1.8761663436889648, "step": 730 }, { - "epoch": 0.19, - "learning_rate": 3.117101937657961e-07, - "logits/chosen": -2.755232572555542, - "logits/rejected": -2.788558006286621, - "logps/chosen": -271.6325378417969, - "logps/rejected": -268.0082702636719, - "loss": 0.55, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.017580661922693253, - "rewards/margins": 0.9975998997688293, - "rewards/rejected": -1.0151805877685547, + "epoch": 0.18, + "learning_rate": 2.967121090617482e-07, + "logits/chosen": -2.8067781925201416, + "logits/rejected": -2.8000786304473877, + "logps/chosen": -197.89767456054688, + "logps/rejected": -189.97723388671875, + "loss": 0.5719, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7084823250770569, + "rewards/margins": 0.33432266116142273, + "rewards/rejected": -1.0428050756454468, "step": 740 }, { - "epoch": 0.19, - "learning_rate": 3.159224936815501e-07, - "logits/chosen": -2.7869515419006348, - "logits/rejected": -2.7380776405334473, - "logps/chosen": -252.6619110107422, - "logps/rejected": -179.35830688476562, - "loss": 0.5059, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.08400382101535797, - "rewards/margins": 0.8033849596977234, - "rewards/rejected": -0.7193810343742371, + "epoch": 0.18, + "learning_rate": 3.007217321571772e-07, + "logits/chosen": -3.0698084831237793, + "logits/rejected": -3.015273332595825, + "logps/chosen": -367.19573974609375, + "logps/rejected": -332.44403076171875, + "loss": 0.5039, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18584397435188293, + "rewards/margins": 0.9501940608024597, + "rewards/rejected": -1.136038064956665, "step": 750 }, { - "epoch": 0.19, - "learning_rate": 3.201347935973041e-07, - "logits/chosen": -2.7246577739715576, - "logits/rejected": -2.6718568801879883, - "logps/chosen": -303.58880615234375, - "logps/rejected": -240.606201171875, - "loss": 0.6173, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.2395877093076706, - "rewards/margins": 0.45436787605285645, - "rewards/rejected": -0.21478009223937988, - "step": 760 + "epoch": 0.18, + "learning_rate": 3.0473135525260624e-07, + "logits/chosen": -2.9151289463043213, + "logits/rejected": -2.9265260696411133, + "logps/chosen": -248.06369018554688, + "logps/rejected": -267.24005126953125, + "loss": 0.5325, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06655324250459671, + "rewards/margins": 0.7783264517784119, + "rewards/rejected": -0.8448797464370728, + "step": 760 }, { "epoch": 0.19, - "learning_rate": 3.2434709351305813e-07, - "logits/chosen": -2.788994550704956, - "logits/rejected": -2.5727732181549072, - "logps/chosen": -313.975341796875, - "logps/rejected": -212.37728881835938, - "loss": 0.5013, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.17403548955917358, - "rewards/margins": 0.97325199842453, - "rewards/rejected": -0.799216628074646, + "learning_rate": 3.0874097834803525e-07, + "logits/chosen": -2.8861327171325684, + "logits/rejected": -2.9578425884246826, + "logps/chosen": -359.11248779296875, + "logps/rejected": -285.6987609863281, + "loss": 0.5288, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0850412845611572, + "rewards/margins": 0.8727619051933289, + "rewards/rejected": -1.9578031301498413, "step": 770 }, { - "epoch": 0.2, - "learning_rate": 3.285593934288121e-07, - "logits/chosen": -2.840376615524292, - "logits/rejected": -2.8498775959014893, - "logps/chosen": -203.07363891601562, - "logps/rejected": -230.2865753173828, - "loss": 0.6018, + "epoch": 0.19, + "learning_rate": 3.127506014434643e-07, + "logits/chosen": -2.708082914352417, + "logits/rejected": -2.8446242809295654, + "logps/chosen": -286.6722717285156, + "logps/rejected": -282.42083740234375, + "loss": 0.533, "rewards/accuracies": 0.75, - "rewards/chosen": -0.06935430318117142, - "rewards/margins": 0.49717801809310913, - "rewards/rejected": -0.5665322542190552, + "rewards/chosen": 0.020838048309087753, + "rewards/margins": 1.5394041538238525, + "rewards/rejected": -1.5185660123825073, "step": 780 }, { - "epoch": 0.2, - "learning_rate": 3.327716933445661e-07, - "logits/chosen": -2.7698721885681152, - "logits/rejected": -2.7035422325134277, - "logps/chosen": -287.3998718261719, - "logps/rejected": -255.6114044189453, - "loss": 0.5399, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.09833178669214249, - "rewards/margins": 1.1349576711654663, - "rewards/rejected": -1.2332894802093506, + "epoch": 0.19, + "learning_rate": 3.167602245388933e-07, + "logits/chosen": -2.980109930038452, + "logits/rejected": -2.995049238204956, + "logps/chosen": -251.4605255126953, + "logps/rejected": -198.1638946533203, + "loss": 0.5707, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1482660323381424, + "rewards/margins": 0.6824778318405151, + "rewards/rejected": -0.8307439684867859, "step": 790 }, { - "epoch": 0.2, - "learning_rate": 3.3698399326032014e-07, - "logits/chosen": -2.7961173057556152, - "logits/rejected": -2.7884459495544434, - "logps/chosen": -214.2937774658203, - "logps/rejected": -260.4123840332031, - "loss": 0.5682, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.39419320225715637, - "rewards/margins": 1.3177748918533325, - "rewards/rejected": -1.711968183517456, + "epoch": 0.19, + "learning_rate": 3.2076984763432233e-07, + "logits/chosen": -3.074256658554077, + "logits/rejected": -3.0817668437957764, + "logps/chosen": -310.3722839355469, + "logps/rejected": -289.89373779296875, + "loss": 0.5126, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06711219251155853, + "rewards/margins": 0.40633732080459595, + "rewards/rejected": -0.4734494686126709, "step": 800 }, { - "epoch": 0.2, - "learning_rate": 3.411962931760741e-07, - "logits/chosen": -2.7424988746643066, - "logits/rejected": -2.7070367336273193, - "logps/chosen": -309.4488830566406, - "logps/rejected": -298.2324523925781, - "loss": 0.6783, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.3691617548465729, - "rewards/margins": 0.6875399351119995, - "rewards/rejected": -1.0567017793655396, + "epoch": 0.19, + "eval_logits/chosen": -2.9963479042053223, + "eval_logits/rejected": -2.9964656829833984, + "eval_logps/chosen": -199.57691955566406, + "eval_logps/rejected": -196.17636108398438, + "eval_loss": 0.523777425289154, + "eval_rewards/accuracies": 0.6524999737739563, + "eval_rewards/chosen": -0.36158767342567444, + "eval_rewards/margins": 0.8167411684989929, + "eval_rewards/rejected": -1.1783289909362793, + "eval_runtime": 131.8153, + "eval_samples_per_second": 23.943, + "eval_steps_per_second": 0.379, + "step": 800 + }, + { + "epoch": 0.19, + "learning_rate": 3.2477947072975135e-07, + "logits/chosen": -2.973468780517578, + "logits/rejected": -3.0309994220733643, + "logps/chosen": -248.0684051513672, + "logps/rejected": -227.3339080810547, + "loss": 0.6189, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2031877040863037, + "rewards/margins": 0.5858628153800964, + "rewards/rejected": -0.7890505194664001, "step": 810 }, { - "epoch": 0.21, - "learning_rate": 3.454085930918281e-07, - "logits/chosen": -2.8790063858032227, - "logits/rejected": -2.8102855682373047, - "logps/chosen": -267.56610107421875, - "logps/rejected": -255.9617919921875, - "loss": 0.5377, + "epoch": 0.2, + "learning_rate": 3.2878909382518046e-07, + "logits/chosen": -2.944810390472412, + "logits/rejected": -2.9564061164855957, + "logps/chosen": -264.3750305175781, + "logps/rejected": -221.53756713867188, + "loss": 0.5213, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.4485592842102051, - "rewards/margins": 0.6301880478858948, - "rewards/rejected": -1.0787473917007446, + "rewards/chosen": -0.14951172471046448, + "rewards/margins": 0.7098695039749146, + "rewards/rejected": -0.8593811988830566, "step": 820 }, { - "epoch": 0.21, - "learning_rate": 3.4962089300758215e-07, - "logits/chosen": -2.9085679054260254, - "logits/rejected": -2.8483242988586426, - "logps/chosen": -309.87261962890625, - "logps/rejected": -326.3826599121094, - "loss": 0.5412, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.11304453760385513, - "rewards/margins": 0.7235099077224731, - "rewards/rejected": -0.6104652285575867, + "epoch": 0.2, + "learning_rate": 3.327987169206095e-07, + "logits/chosen": -2.8502869606018066, + "logits/rejected": -2.847830295562744, + "logps/chosen": -195.5387420654297, + "logps/rejected": -277.2113342285156, + "loss": 0.5721, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9960781335830688, + "rewards/margins": 0.3432408273220062, + "rewards/rejected": -1.3393189907073975, "step": 830 }, { - "epoch": 0.21, - "learning_rate": 3.5383319292333613e-07, - "logits/chosen": -2.7523531913757324, - "logits/rejected": -2.7195019721984863, - "logps/chosen": -275.505859375, - "logps/rejected": -251.9176483154297, - "loss": 0.5109, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.01079724170267582, - "rewards/margins": 0.6783922910690308, - "rewards/rejected": -0.689189612865448, + "epoch": 0.2, + "learning_rate": 3.368083400160385e-07, + "logits/chosen": -2.9394795894622803, + "logits/rejected": -3.0222933292388916, + "logps/chosen": -205.0727081298828, + "logps/rejected": -226.78466796875, + "loss": 0.5496, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4092499315738678, + "rewards/margins": 0.4020964205265045, + "rewards/rejected": -0.8113464117050171, "step": 840 }, { - "epoch": 0.21, - "learning_rate": 3.580454928390901e-07, - "logits/chosen": -2.5856566429138184, - "logits/rejected": -2.6075503826141357, - "logps/chosen": -180.05770874023438, - "logps/rejected": -246.65780639648438, - "loss": 0.642, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.2767847180366516, - "rewards/margins": -0.03543096035718918, - "rewards/rejected": -0.24135378003120422, + "epoch": 0.2, + "learning_rate": 3.408179631114675e-07, + "logits/chosen": -2.8417601585388184, + "logits/rejected": -2.8362514972686768, + "logps/chosen": -228.3723602294922, + "logps/rejected": -227.6416473388672, + "loss": 0.5983, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.38122066855430603, + "rewards/margins": 0.793207049369812, + "rewards/rejected": -1.1744277477264404, "step": 850 }, { - "epoch": 0.22, - "learning_rate": 3.6225779275484416e-07, - "logits/chosen": -2.8647685050964355, - "logits/rejected": -2.837616205215454, - "logps/chosen": -256.5491638183594, - "logps/rejected": -266.04217529296875, - "loss": 0.5703, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.10952053964138031, - "rewards/margins": 0.4287700653076172, - "rewards/rejected": -0.3192494809627533, + "epoch": 0.21, + "learning_rate": 3.4482758620689656e-07, + "logits/chosen": -2.878786325454712, + "logits/rejected": -2.937821388244629, + "logps/chosen": -210.30258178710938, + "logps/rejected": -325.44903564453125, + "loss": 0.5503, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.22935771942138672, + "rewards/margins": 0.5897646546363831, + "rewards/rejected": -0.36040693521499634, "step": 860 }, { - "epoch": 0.22, - "learning_rate": 3.6647009267059814e-07, - "logits/chosen": -2.8707778453826904, - "logits/rejected": -2.848655939102173, - "logps/chosen": -225.21530151367188, - "logps/rejected": -267.1256408691406, - "loss": 0.566, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.2901657521724701, - "rewards/margins": 0.3968164324760437, - "rewards/rejected": -0.10665065050125122, + "epoch": 0.21, + "learning_rate": 3.4883720930232557e-07, + "logits/chosen": -2.992931604385376, + "logits/rejected": -2.9002814292907715, + "logps/chosen": -297.7613220214844, + "logps/rejected": -245.82638549804688, + "loss": 0.5519, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10210029780864716, + "rewards/margins": 1.153980016708374, + "rewards/rejected": -1.051879644393921, "step": 870 }, { - "epoch": 0.22, - "learning_rate": 3.7068239258635213e-07, - "logits/chosen": -2.695209503173828, - "logits/rejected": -2.670459508895874, - "logps/chosen": -225.5996551513672, - "logps/rejected": -241.8999481201172, - "loss": 0.4506, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.11750245094299316, - "rewards/margins": 1.2355433702468872, - "rewards/rejected": -1.118040919303894, + "epoch": 0.21, + "learning_rate": 3.528468323977546e-07, + "logits/chosen": -3.003692150115967, + "logits/rejected": -3.0055644512176514, + "logps/chosen": -256.29180908203125, + "logps/rejected": -217.17892456054688, + "loss": 0.6633, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5966130495071411, + "rewards/margins": 0.4915493130683899, + "rewards/rejected": -1.0881621837615967, "step": 880 }, { - "epoch": 0.22, - "learning_rate": 3.7489469250210617e-07, - "logits/chosen": -2.76595139503479, - "logits/rejected": -2.7959845066070557, - "logps/chosen": -253.39501953125, - "logps/rejected": -301.0154113769531, - "loss": 0.6257, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.16110540926456451, - "rewards/margins": 0.8840128183364868, - "rewards/rejected": -1.0451180934906006, + "epoch": 0.21, + "learning_rate": 3.568564554931836e-07, + "logits/chosen": -2.9390883445739746, + "logits/rejected": -3.0067508220672607, + "logps/chosen": -284.08087158203125, + "logps/rejected": -261.71307373046875, + "loss": 0.5557, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8059605360031128, + "rewards/margins": 0.6136695742607117, + "rewards/rejected": -1.4196301698684692, "step": 890 }, { - "epoch": 0.23, - "learning_rate": 3.7910699241786015e-07, - "logits/chosen": -2.901207685470581, - "logits/rejected": -2.9004340171813965, - "logps/chosen": -318.0623474121094, - "logps/rejected": -266.1954040527344, - "loss": 0.5089, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.47898969054222107, - "rewards/margins": 1.0601681470870972, - "rewards/rejected": -0.5811785459518433, + "epoch": 0.22, + "learning_rate": 3.6086607858861266e-07, + "logits/chosen": -3.139829397201538, + "logits/rejected": -3.1293606758117676, + "logps/chosen": -298.77294921875, + "logps/rejected": -300.91143798828125, + "loss": 0.5283, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6465376615524292, + "rewards/margins": 0.768233597278595, + "rewards/rejected": -1.414771318435669, "step": 900 }, { - "epoch": 0.23, - "learning_rate": 3.8331929233361414e-07, - "logits/chosen": -2.93418025970459, - "logits/rejected": -2.9847195148468018, - "logps/chosen": -290.5269470214844, - "logps/rejected": -285.1426086425781, - "loss": 0.5668, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.011163473129272461, - "rewards/margins": 0.9022845029830933, - "rewards/rejected": -0.9134479761123657, + "epoch": 0.22, + "eval_logits/chosen": -3.0133914947509766, + "eval_logits/rejected": -3.013317346572876, + "eval_logps/chosen": -200.1031494140625, + "eval_logps/rejected": -196.93482971191406, + "eval_loss": 0.5288864970207214, + "eval_rewards/accuracies": 0.6775000095367432, + "eval_rewards/chosen": -0.41421395540237427, + "eval_rewards/margins": 0.8399606347084045, + "eval_rewards/rejected": -1.2541745901107788, + "eval_runtime": 131.7817, + "eval_samples_per_second": 23.949, + "eval_steps_per_second": 0.379, + "step": 900 + }, + { + "epoch": 0.22, + "learning_rate": 3.6487570168404167e-07, + "logits/chosen": -3.049015998840332, + "logits/rejected": -3.0608251094818115, + "logps/chosen": -313.48077392578125, + "logps/rejected": -276.17852783203125, + "loss": 0.5598, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0582214817404747, + "rewards/margins": 0.8767786026000977, + "rewards/rejected": -0.8185571432113647, "step": 910 }, { - "epoch": 0.23, - "learning_rate": 3.875315922493682e-07, - "logits/chosen": -2.695725679397583, - "logits/rejected": -2.6235289573669434, - "logps/chosen": -239.16494750976562, - "logps/rejected": -298.2274169921875, - "loss": 0.5698, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.03379325941205025, - "rewards/margins": 0.6167046427726746, - "rewards/rejected": -0.6504980325698853, + "epoch": 0.22, + "learning_rate": 3.688853247794707e-07, + "logits/chosen": -2.9787068367004395, + "logits/rejected": -3.0255489349365234, + "logps/chosen": -287.5423583984375, + "logps/rejected": -222.5380096435547, + "loss": 0.7587, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19185402989387512, + "rewards/margins": 0.8485038876533508, + "rewards/rejected": -1.0403578281402588, "step": 920 }, { - "epoch": 0.24, - "learning_rate": 3.9174389216512216e-07, - "logits/chosen": -2.9124255180358887, - "logits/rejected": -2.9122350215911865, - "logps/chosen": -208.6260528564453, - "logps/rejected": -265.9713439941406, - "loss": 0.62, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.343755304813385, - "rewards/margins": 0.6561600565910339, - "rewards/rejected": -0.31240472197532654, + "epoch": 0.22, + "learning_rate": 3.7289494787489975e-07, + "logits/chosen": -2.9751698970794678, + "logits/rejected": -3.0051333904266357, + "logps/chosen": -273.0906982421875, + "logps/rejected": -227.42874145507812, + "loss": 0.5347, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.8002338409423828, + "rewards/margins": 1.3545544147491455, + "rewards/rejected": -0.5543204545974731, "step": 930 }, { - "epoch": 0.24, - "learning_rate": 3.9595619208087615e-07, - "logits/chosen": -2.6123409271240234, - "logits/rejected": -2.6417336463928223, - "logps/chosen": -242.59756469726562, - "logps/rejected": -194.54550170898438, - "loss": 0.612, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.26655206084251404, - "rewards/margins": 0.2533474564552307, - "rewards/rejected": -0.5198994874954224, + "epoch": 0.23, + "learning_rate": 3.769045709703288e-07, + "logits/chosen": -2.93672513961792, + "logits/rejected": -2.909937620162964, + "logps/chosen": -242.1005096435547, + "logps/rejected": -216.00576782226562, + "loss": 0.5331, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.18395619094371796, + "rewards/margins": 0.7320182919502258, + "rewards/rejected": -0.9159743189811707, "step": 940 }, { - "epoch": 0.24, - "learning_rate": 4.001684919966302e-07, - "logits/chosen": -2.965735912322998, - "logits/rejected": -2.874760389328003, - "logps/chosen": -333.0281982421875, - "logps/rejected": -354.705810546875, - "loss": 0.6405, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.08252526819705963, - "rewards/margins": 0.7334955334663391, - "rewards/rejected": -0.6509702205657959, + "epoch": 0.23, + "learning_rate": 3.809141940657578e-07, + "logits/chosen": -3.039015293121338, + "logits/rejected": -2.9209115505218506, + "logps/chosen": -213.7278594970703, + "logps/rejected": -205.055419921875, + "loss": 0.5856, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2937147617340088, + "rewards/margins": 0.9760116338729858, + "rewards/rejected": -1.269726276397705, "step": 950 }, { - "epoch": 0.24, - "learning_rate": 4.0438079191238417e-07, - "logits/chosen": -2.4438750743865967, - "logits/rejected": -2.4380674362182617, - "logps/chosen": -255.1825714111328, - "logps/rejected": -220.0428466796875, - "loss": 0.5256, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.007409202866256237, - "rewards/margins": 0.8924994468688965, - "rewards/rejected": -0.885090172290802, + "epoch": 0.23, + "learning_rate": 3.8492381716118683e-07, + "logits/chosen": -3.0735080242156982, + "logits/rejected": -3.0859174728393555, + "logps/chosen": -244.3859100341797, + "logps/rejected": -263.65313720703125, + "loss": 0.5541, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.12054260820150375, + "rewards/margins": 0.5370070934295654, + "rewards/rejected": -0.6575496196746826, "step": 960 }, { - "epoch": 0.25, - "learning_rate": 4.0859309182813815e-07, - "logits/chosen": -2.8350677490234375, - "logits/rejected": -2.726632595062256, - "logps/chosen": -368.48760986328125, - "logps/rejected": -201.78909301757812, - "loss": 0.5834, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.32845601439476013, - "rewards/margins": 1.1812372207641602, - "rewards/rejected": -0.8527814149856567, + "epoch": 0.23, + "learning_rate": 3.8893344025661585e-07, + "logits/chosen": -2.993734836578369, + "logits/rejected": -3.008204936981201, + "logps/chosen": -232.89096069335938, + "logps/rejected": -258.3067321777344, + "loss": 0.6391, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.3648054003715515, + "rewards/margins": 0.9257339239120483, + "rewards/rejected": -0.5609285235404968, "step": 970 }, { - "epoch": 0.25, - "learning_rate": 4.128053917438922e-07, - "logits/chosen": -2.846832752227783, - "logits/rejected": -2.860927104949951, - "logps/chosen": -332.1506652832031, - "logps/rejected": -294.8549499511719, - "loss": 0.7617, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.3438522219657898, - "rewards/margins": 0.39307039976119995, - "rewards/rejected": -0.7369226813316345, + "epoch": 0.24, + "learning_rate": 3.929430633520449e-07, + "logits/chosen": -2.8218185901641846, + "logits/rejected": -2.8051164150238037, + "logps/chosen": -297.8997497558594, + "logps/rejected": -283.1402587890625, + "loss": 0.5184, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.049232371151447296, + "rewards/margins": 0.786023736000061, + "rewards/rejected": -0.8352560997009277, "step": 980 }, { - "epoch": 0.25, - "learning_rate": 4.170176916596461e-07, - "logits/chosen": -2.7602732181549072, - "logits/rejected": -2.7995636463165283, - "logps/chosen": -201.95748901367188, - "logps/rejected": -303.0880432128906, - "loss": 0.4734, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.04982544854283333, - "rewards/margins": 1.2540963888168335, - "rewards/rejected": -1.3039219379425049, + "epoch": 0.24, + "learning_rate": 3.969526864474739e-07, + "logits/chosen": -2.9782512187957764, + "logits/rejected": -2.9858968257904053, + "logps/chosen": -248.342041015625, + "logps/rejected": -231.19174194335938, + "loss": 0.5368, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03936024755239487, + "rewards/margins": 0.5973891019821167, + "rewards/rejected": -0.6367493271827698, "step": 990 }, { - "epoch": 0.25, - "learning_rate": 4.212299915754001e-07, - "logits/chosen": -2.849196195602417, - "logits/rejected": -2.8943185806274414, - "logps/chosen": -298.24432373046875, - "logps/rejected": -271.255859375, - "loss": 0.5213, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.5941381454467773, - "rewards/margins": 0.5909891128540039, - "rewards/rejected": -1.1851271390914917, + "epoch": 0.24, + "learning_rate": 4.0096230954290293e-07, + "logits/chosen": -2.8390631675720215, + "logits/rejected": -2.8416409492492676, + "logps/chosen": -304.2760314941406, + "logps/rejected": -246.52120971679688, + "loss": 0.5303, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03373967483639717, + "rewards/margins": 0.9764014482498169, + "rewards/rejected": -0.9426616430282593, "step": 1000 }, { - "epoch": 0.26, - "learning_rate": 4.2544229149115415e-07, - "logits/chosen": -2.801237106323242, - "logits/rejected": -2.6629276275634766, - "logps/chosen": -323.99285888671875, - "logps/rejected": -250.1830596923828, - "loss": 0.5358, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.05499999597668648, - "rewards/margins": 1.6413654088974, - "rewards/rejected": -1.6963655948638916, + "epoch": 0.24, + "eval_logits/chosen": -2.9669058322906494, + "eval_logits/rejected": -2.966337203979492, + "eval_logps/chosen": -201.9101104736328, + "eval_logps/rejected": -200.2815399169922, + "eval_loss": 0.5214495062828064, + "eval_rewards/accuracies": 0.6600000262260437, + "eval_rewards/chosen": -0.5949113965034485, + "eval_rewards/margins": 0.9939325451850891, + "eval_rewards/rejected": -1.5888441801071167, + "eval_runtime": 131.7272, + "eval_samples_per_second": 23.959, + "eval_steps_per_second": 0.38, + "step": 1000 + }, + { + "epoch": 0.24, + "learning_rate": 4.0497193263833194e-07, + "logits/chosen": -3.041374921798706, + "logits/rejected": -3.009096384048462, + "logps/chosen": -245.7183380126953, + "logps/rejected": -208.8644256591797, + "loss": 0.5438, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6507007479667664, + "rewards/margins": 0.2612120509147644, + "rewards/rejected": -0.9119127988815308, "step": 1010 }, { - "epoch": 0.26, - "learning_rate": 4.2965459140690813e-07, - "logits/chosen": -2.8145062923431396, - "logits/rejected": -2.7096211910247803, - "logps/chosen": -281.5048828125, - "logps/rejected": -285.79962158203125, - "loss": 0.695, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.3529316186904907, - "rewards/margins": -0.7123409509658813, - "rewards/rejected": -0.6405906677246094, + "epoch": 0.25, + "learning_rate": 4.08981555733761e-07, + "logits/chosen": -2.9632010459899902, + "logits/rejected": -3.024637460708618, + "logps/chosen": -348.48150634765625, + "logps/rejected": -286.70806884765625, + "loss": 0.5293, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17253263294696808, + "rewards/margins": 1.1051769256591797, + "rewards/rejected": -1.2777094841003418, "step": 1020 }, { - "epoch": 0.26, - "learning_rate": 4.338668913226621e-07, - "logits/chosen": -2.8272197246551514, - "logits/rejected": -2.808361530303955, - "logps/chosen": -283.245361328125, - "logps/rejected": -337.6376953125, - "loss": 0.6913, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.703170657157898, - "rewards/margins": 0.26156896352767944, - "rewards/rejected": -0.9647396206855774, + "epoch": 0.25, + "learning_rate": 4.1299117882919007e-07, + "logits/chosen": -2.8312857151031494, + "logits/rejected": -2.904731273651123, + "logps/chosen": -279.64068603515625, + "logps/rejected": -279.8646545410156, + "loss": 0.618, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3922005593776703, + "rewards/margins": 0.6728824377059937, + "rewards/rejected": -1.0650830268859863, "step": 1030 }, { - "epoch": 0.26, - "learning_rate": 4.3807919123841616e-07, - "logits/chosen": -2.8427348136901855, - "logits/rejected": -2.7732455730438232, - "logps/chosen": -212.1287078857422, - "logps/rejected": -149.4747314453125, - "loss": 1.3506, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.0113106369972229, - "rewards/margins": 0.6789531111717224, - "rewards/rejected": -0.6902638077735901, + "epoch": 0.25, + "learning_rate": 4.170008019246191e-07, + "logits/chosen": -2.921680450439453, + "logits/rejected": -2.884779453277588, + "logps/chosen": -240.69668579101562, + "logps/rejected": -301.9366455078125, + "loss": 0.6094, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.21675479412078857, + "rewards/margins": 1.0607211589813232, + "rewards/rejected": -0.8439663648605347, "step": 1040 }, { - "epoch": 0.27, - "learning_rate": 4.4229149115417014e-07, - "logits/chosen": -2.6644446849823, - "logits/rejected": -2.764239549636841, - "logps/chosen": -177.46188354492188, - "logps/rejected": -227.08425903320312, - "loss": 0.5089, + "epoch": 0.25, + "learning_rate": 4.210104250200481e-07, + "logits/chosen": -3.155412197113037, + "logits/rejected": -3.1117303371429443, + "logps/chosen": -254.21627807617188, + "logps/rejected": -218.7623748779297, + "loss": 0.7043, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.004345702938735485, - "rewards/margins": 1.017622947692871, - "rewards/rejected": -1.0132771730422974, + "rewards/chosen": -0.12576763331890106, + "rewards/margins": 0.5899392366409302, + "rewards/rejected": -0.7157068252563477, "step": 1050 }, { - "epoch": 0.27, - "learning_rate": 4.4650379106992413e-07, - "logits/chosen": -2.8153128623962402, - "logits/rejected": -2.73732852935791, - "logps/chosen": -315.84454345703125, - "logps/rejected": -352.3153381347656, - "loss": 0.5678, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.8632125854492188, - "rewards/margins": 0.012102723121643066, - "rewards/rejected": -0.8753153085708618, + "epoch": 0.26, + "learning_rate": 4.2502004811547716e-07, + "logits/chosen": -3.0184712409973145, + "logits/rejected": -2.999357223510742, + "logps/chosen": -206.9328155517578, + "logps/rejected": -167.8073272705078, + "loss": 0.5888, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16964790225028992, + "rewards/margins": 1.5692901611328125, + "rewards/rejected": -1.7389380931854248, "step": 1060 }, { - "epoch": 0.27, - "learning_rate": 4.5071609098567817e-07, - "logits/chosen": -2.832590103149414, - "logits/rejected": -2.742182493209839, - "logps/chosen": -353.1194152832031, - "logps/rejected": -341.9236755371094, - "loss": 0.6138, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.7696177959442139, - "rewards/margins": 0.7982410192489624, - "rewards/rejected": -1.5678590536117554, + "epoch": 0.26, + "learning_rate": 4.2902967121090617e-07, + "logits/chosen": -3.027787923812866, + "logits/rejected": -2.9506077766418457, + "logps/chosen": -145.356201171875, + "logps/rejected": -201.5256805419922, + "loss": 0.682, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1941504031419754, + "rewards/margins": 0.6811510324478149, + "rewards/rejected": -0.8753014802932739, "step": 1070 }, { - "epoch": 0.27, - "learning_rate": 4.5492839090143215e-07, - "logits/chosen": -2.7319958209991455, - "logits/rejected": -2.675238609313965, - "logps/chosen": -230.688232421875, - "logps/rejected": -146.16233825683594, - "loss": 0.5068, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.5361072421073914, - "rewards/margins": 0.8432807922363281, - "rewards/rejected": -1.3793880939483643, + "epoch": 0.26, + "learning_rate": 4.330392943063352e-07, + "logits/chosen": -3.06502103805542, + "logits/rejected": -2.9889209270477295, + "logps/chosen": -270.00299072265625, + "logps/rejected": -320.61669921875, + "loss": 0.6214, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.29140493273735046, + "rewards/margins": 0.8878979682922363, + "rewards/rejected": -0.596493124961853, "step": 1080 }, { - "epoch": 0.28, - "learning_rate": 4.5914069081718614e-07, - "logits/chosen": -2.6153883934020996, - "logits/rejected": -2.7117838859558105, - "logps/chosen": -204.71461486816406, - "logps/rejected": -221.910400390625, - "loss": 0.5345, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.3535842001438141, - "rewards/margins": 0.7887686491012573, - "rewards/rejected": -1.142352819442749, + "epoch": 0.26, + "learning_rate": 4.370489174017642e-07, + "logits/chosen": -2.998223304748535, + "logits/rejected": -2.9898276329040527, + "logps/chosen": -138.95907592773438, + "logps/rejected": -224.95309448242188, + "loss": 0.5818, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06540943682193756, + "rewards/margins": 1.0736910104751587, + "rewards/rejected": -1.1391003131866455, "step": 1090 }, { - "epoch": 0.28, - "learning_rate": 4.633529907329402e-07, - "logits/chosen": -2.7112722396850586, - "logits/rejected": -2.7043750286102295, - "logps/chosen": -218.708984375, - "logps/rejected": -170.6296844482422, - "loss": 0.5437, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.26763540506362915, - "rewards/margins": 1.7196743488311768, - "rewards/rejected": -1.4520387649536133, + "epoch": 0.26, + "learning_rate": 4.4105854049719326e-07, + "logits/chosen": -2.817340850830078, + "logits/rejected": -2.880329132080078, + "logps/chosen": -276.13397216796875, + "logps/rejected": -196.4142608642578, + "loss": 0.5969, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3812592029571533, + "rewards/margins": 0.5779326558113098, + "rewards/rejected": -0.9591917991638184, "step": 1100 }, { - "epoch": 0.28, - "learning_rate": 4.6756529064869416e-07, - "logits/chosen": -2.7523810863494873, - "logits/rejected": -2.484694719314575, - "logps/chosen": -204.053466796875, - "logps/rejected": -223.3732147216797, - "loss": 0.5025, + "epoch": 0.26, + "eval_logits/chosen": -2.946826696395874, + "eval_logits/rejected": -2.9402353763580322, + "eval_logps/chosen": -201.8848114013672, + "eval_logps/rejected": -199.6153564453125, + "eval_loss": 0.5235151052474976, + "eval_rewards/accuracies": 0.6600000262260437, + "eval_rewards/chosen": -0.5923787951469421, + "eval_rewards/margins": 0.9298465847969055, + "eval_rewards/rejected": -1.5222253799438477, + "eval_runtime": 132.252, + "eval_samples_per_second": 23.864, + "eval_steps_per_second": 0.378, + "step": 1100 + }, + { + "epoch": 0.27, + "learning_rate": 4.4506816359262227e-07, + "logits/chosen": -3.040398597717285, + "logits/rejected": -3.0126452445983887, + "logps/chosen": -352.1734924316406, + "logps/rejected": -259.9134216308594, + "loss": 0.508, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.8717359304428101, - "rewards/margins": 1.1432167291641235, - "rewards/rejected": -2.0149526596069336, + "rewards/chosen": -0.06680227816104889, + "rewards/margins": 0.7869914770126343, + "rewards/rejected": -0.8537937998771667, "step": 1110 }, { - "epoch": 0.28, - "learning_rate": 4.7177759056444814e-07, - "logits/chosen": -2.618384838104248, - "logits/rejected": -2.608262062072754, - "logps/chosen": -232.3643341064453, - "logps/rejected": -211.66806030273438, - "loss": 0.5708, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.28576603531837463, - "rewards/margins": 0.973733127117157, - "rewards/rejected": -1.259499192237854, + "epoch": 0.27, + "learning_rate": 4.490777866880513e-07, + "logits/chosen": -3.090517044067383, + "logits/rejected": -2.9955923557281494, + "logps/chosen": -283.9098815917969, + "logps/rejected": -212.3140106201172, + "loss": 0.5354, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.25595977902412415, + "rewards/margins": 0.9857895970344543, + "rewards/rejected": -0.7298299074172974, "step": 1120 }, { - "epoch": 0.29, - "learning_rate": 4.759898904802022e-07, - "logits/chosen": -2.828278064727783, - "logits/rejected": -2.793574810028076, - "logps/chosen": -334.7503967285156, - "logps/rejected": -232.13388061523438, - "loss": 0.5265, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.3884439170360565, - "rewards/margins": 0.7667890787124634, - "rewards/rejected": -1.1552331447601318, + "epoch": 0.27, + "learning_rate": 4.530874097834803e-07, + "logits/chosen": -2.9406232833862305, + "logits/rejected": -2.9164605140686035, + "logps/chosen": -202.14700317382812, + "logps/rejected": -195.4089813232422, + "loss": 0.5657, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10260385274887085, + "rewards/margins": 0.7417261004447937, + "rewards/rejected": -0.8443300127983093, "step": 1130 }, { - "epoch": 0.29, - "learning_rate": 4.802021903959561e-07, - "logits/chosen": -2.667235851287842, - "logits/rejected": -2.727304458618164, - "logps/chosen": -290.23175048828125, - "logps/rejected": -288.10882568359375, - "loss": 0.5996, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.5534285306930542, - "rewards/margins": 0.9525440335273743, - "rewards/rejected": -1.5059726238250732, + "epoch": 0.27, + "learning_rate": 4.570970328789094e-07, + "logits/chosen": -2.967994451522827, + "logits/rejected": -2.9259307384490967, + "logps/chosen": -282.4659118652344, + "logps/rejected": -396.87213134765625, + "loss": 0.6142, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2336424589157104, + "rewards/margins": 3.261566162109375, + "rewards/rejected": -4.495208263397217, "step": 1140 }, { - "epoch": 0.29, - "learning_rate": 4.844144903117102e-07, - "logits/chosen": -2.830899477005005, - "logits/rejected": -2.8354620933532715, - "logps/chosen": -373.57281494140625, - "logps/rejected": -359.60174560546875, - "loss": 0.6887, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.5015013217926025, - "rewards/margins": 0.7062493562698364, - "rewards/rejected": -1.2077505588531494, + "epoch": 0.28, + "learning_rate": 4.611066559743384e-07, + "logits/chosen": -2.9327492713928223, + "logits/rejected": -2.998396873474121, + "logps/chosen": -280.8419189453125, + "logps/rejected": -266.79046630859375, + "loss": 0.5015, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3540969491004944, + "rewards/margins": 0.73493891954422, + "rewards/rejected": -1.0890357494354248, "step": 1150 }, { - "epoch": 0.29, - "learning_rate": 4.886267902274642e-07, - "logits/chosen": -2.749483585357666, - "logits/rejected": -2.6624131202697754, - "logps/chosen": -278.58392333984375, - "logps/rejected": -275.7124938964844, - "loss": 0.5975, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.3455383777618408, - "rewards/margins": 1.2535759210586548, - "rewards/rejected": -0.9080374836921692, + "epoch": 0.28, + "learning_rate": 4.6511627906976743e-07, + "logits/chosen": -2.838588237762451, + "logits/rejected": -2.9173264503479004, + "logps/chosen": -230.4303741455078, + "logps/rejected": -246.2703399658203, + "loss": 0.5554, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.40750521421432495, + "rewards/margins": 0.5711129903793335, + "rewards/rejected": -0.9786182641983032, "step": 1160 }, { - "epoch": 0.3, - "learning_rate": 4.928390901432181e-07, - "logits/chosen": -2.786632776260376, - "logits/rejected": -2.681096076965332, - "logps/chosen": -240.35556030273438, - "logps/rejected": -227.7943878173828, - "loss": 0.5064, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.4853554666042328, - "rewards/margins": 1.8523401021957397, - "rewards/rejected": -2.337695837020874, + "epoch": 0.28, + "learning_rate": 4.6912590216519644e-07, + "logits/chosen": -2.924062967300415, + "logits/rejected": -2.9326508045196533, + "logps/chosen": -253.70132446289062, + "logps/rejected": -270.0424499511719, + "loss": 0.6554, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5961793065071106, + "rewards/margins": 0.3027878403663635, + "rewards/rejected": -0.8989670872688293, "step": 1170 }, { - "epoch": 0.3, - "learning_rate": 4.970513900589722e-07, - "logits/chosen": -2.6930205821990967, - "logits/rejected": -2.7257983684539795, - "logps/chosen": -224.31124877929688, - "logps/rejected": -241.4190216064453, - "loss": 0.5607, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.8809860944747925, - "rewards/margins": 1.269070029258728, - "rewards/rejected": -2.1500561237335205, + "epoch": 0.28, + "learning_rate": 4.731355252606255e-07, + "logits/chosen": -2.850661277770996, + "logits/rejected": -2.852339744567871, + "logps/chosen": -247.0175018310547, + "logps/rejected": -227.436767578125, + "loss": 0.6098, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.24157729744911194, + "rewards/margins": 0.7741583585739136, + "rewards/rejected": -0.5325810313224792, "step": 1180 }, { - "epoch": 0.3, - "learning_rate": 4.99859563711263e-07, - "logits/chosen": -2.6348814964294434, - "logits/rejected": -2.605294704437256, - "logps/chosen": -281.0661926269531, - "logps/rejected": -287.85369873046875, - "loss": 0.5648, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.5088950395584106, - "rewards/margins": 0.9193344116210938, - "rewards/rejected": -1.4282294511795044, + "epoch": 0.29, + "learning_rate": 4.771451483560545e-07, + "logits/chosen": -2.80539870262146, + "logits/rejected": -2.8022513389587402, + "logps/chosen": -264.24407958984375, + "logps/rejected": -263.88433837890625, + "loss": 0.5289, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.03439965099096298, + "rewards/margins": 1.9651371240615845, + "rewards/rejected": -1.9307372570037842, "step": 1190 }, { - "epoch": 0.3, - "learning_rate": 4.993914427488063e-07, - "logits/chosen": -2.661515712738037, - "logits/rejected": -2.533756732940674, - "logps/chosen": -228.9407196044922, - "logps/rejected": -198.20849609375, - "loss": 0.6059, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.789717435836792, - "rewards/margins": 0.8907758593559265, - "rewards/rejected": -1.6804933547973633, + "epoch": 0.29, + "learning_rate": 4.811547714514836e-07, + "logits/chosen": -2.974710702896118, + "logits/rejected": -2.925783634185791, + "logps/chosen": -248.8636016845703, + "logps/rejected": -227.7292938232422, + "loss": 0.581, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.41872087121009827, + "rewards/margins": 0.6917397975921631, + "rewards/rejected": -1.110460638999939, "step": 1200 }, { - "epoch": 0.31, - "learning_rate": 4.989233217863496e-07, - "logits/chosen": -2.6996188163757324, - "logits/rejected": -2.7078824043273926, - "logps/chosen": -216.7144012451172, - "logps/rejected": -271.33245849609375, - "loss": 0.7244, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.5363994836807251, - "rewards/margins": 1.1410717964172363, - "rewards/rejected": -1.677471399307251, - "step": 1210 + "epoch": 0.29, + "eval_logits/chosen": -2.7227060794830322, + "eval_logits/rejected": -2.7064521312713623, + "eval_logps/chosen": -203.50909423828125, + "eval_logps/rejected": -201.46778869628906, + "eval_loss": 0.5886973738670349, + "eval_rewards/accuracies": 0.6399999856948853, + "eval_rewards/chosen": -0.7548065185546875, + "eval_rewards/margins": 0.9526646137237549, + "eval_rewards/rejected": -1.7074711322784424, + "eval_runtime": 132.1113, + "eval_samples_per_second": 23.889, + "eval_steps_per_second": 0.378, + "step": 1200 }, { - "epoch": 0.31, - "learning_rate": 4.984552008238928e-07, - "logits/chosen": -2.9735779762268066, - "logits/rejected": -2.8310341835021973, - "logps/chosen": -349.55645751953125, - "logps/rejected": -314.33392333984375, - "loss": 0.6247, + "epoch": 0.29, + "learning_rate": 4.851643945469126e-07, + "logits/chosen": -2.817868947982788, + "logits/rejected": -2.8497607707977295, + "logps/chosen": -207.07168579101562, + "logps/rejected": -187.9242706298828, + "loss": 0.6706, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.46798020601272583, - "rewards/margins": 1.0479462146759033, - "rewards/rejected": -1.5159263610839844, + "rewards/chosen": -0.029719460755586624, + "rewards/margins": 1.6363286972045898, + "rewards/rejected": -1.666048288345337, + "step": 1210 + }, + { + "epoch": 0.29, + "learning_rate": 4.891740176423416e-07, + "logits/chosen": -2.6656768321990967, + "logits/rejected": -2.792525053024292, + "logps/chosen": -296.31121826171875, + "logps/rejected": -255.60025024414062, + "loss": 0.6607, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0158870220184326, + "rewards/margins": -0.16955016553401947, + "rewards/rejected": -1.8463369607925415, "step": 1220 }, { - "epoch": 0.31, - "learning_rate": 4.979870798614362e-07, - "logits/chosen": -2.696829319000244, - "logits/rejected": -2.7793948650360107, - "logps/chosen": -351.41314697265625, - "logps/rejected": -300.1158447265625, - "loss": 0.5116, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.781160831451416, - "rewards/margins": 0.9367626309394836, - "rewards/rejected": -1.7179237604141235, + "epoch": 0.3, + "learning_rate": 4.931836407377706e-07, + "logits/chosen": -2.8745858669281006, + "logits/rejected": -2.8145947456359863, + "logps/chosen": -287.16766357421875, + "logps/rejected": -200.80148315429688, + "loss": 0.632, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2250034362077713, + "rewards/margins": 1.1734898090362549, + "rewards/rejected": -1.3984934091567993, "step": 1230 }, { - "epoch": 0.31, - "learning_rate": 4.975189588989795e-07, - "logits/chosen": -2.7155284881591797, - "logits/rejected": -2.8466756343841553, - "logps/chosen": -252.1944580078125, - "logps/rejected": -328.84832763671875, - "loss": 0.5823, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.43154963850975037, - "rewards/margins": 1.5160157680511475, - "rewards/rejected": -1.9475654363632202, + "epoch": 0.3, + "learning_rate": 4.971932638331996e-07, + "logits/chosen": -2.902522563934326, + "logits/rejected": -2.8150391578674316, + "logps/chosen": -256.2174377441406, + "logps/rejected": -295.2197265625, + "loss": 0.5589, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6392114162445068, + "rewards/margins": 0.673500657081604, + "rewards/rejected": -1.3127119541168213, "step": 1240 }, { - "epoch": 0.32, - "learning_rate": 4.970508379365228e-07, - "logits/chosen": -2.7954907417297363, - "logits/rejected": -2.793194055557251, - "logps/chosen": -276.27459716796875, - "logps/rejected": -347.00750732421875, - "loss": 0.533, + "epoch": 0.3, + "learning_rate": 4.998662863255482e-07, + "logits/chosen": -2.8771591186523438, + "logits/rejected": -2.815300703048706, + "logps/chosen": -303.8803405761719, + "logps/rejected": -206.31869506835938, + "loss": 0.8572, "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.4929552972316742, - "rewards/margins": 1.3219962120056152, - "rewards/rejected": -1.8149516582489014, + "rewards/chosen": -0.41226935386657715, + "rewards/margins": 1.198472499847412, + "rewards/rejected": -1.6107418537139893, "step": 1250 }, { - "epoch": 0.32, - "learning_rate": 4.965827169740661e-07, - "logits/chosen": -2.824890613555908, - "logits/rejected": -2.8709912300109863, - "logps/chosen": -275.1580810546875, - "logps/rejected": -275.6163635253906, - "loss": 0.5267, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.5243432521820068, - "rewards/margins": 0.1552998125553131, - "rewards/rejected": -1.6796430349349976, + "epoch": 0.3, + "learning_rate": 4.994205740773756e-07, + "logits/chosen": -2.827841281890869, + "logits/rejected": -2.8493478298187256, + "logps/chosen": -231.36044311523438, + "logps/rejected": -240.12484741210938, + "loss": 0.5539, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8475181460380554, + "rewards/margins": 0.8393493890762329, + "rewards/rejected": -1.6868677139282227, "step": 1260 }, { - "epoch": 0.32, - "learning_rate": 4.961145960116093e-07, - "logits/chosen": -2.79732084274292, - "logits/rejected": -2.5819685459136963, - "logps/chosen": -276.87945556640625, - "logps/rejected": -212.22482299804688, - "loss": 0.5344, + "epoch": 0.31, + "learning_rate": 4.989748618292031e-07, + "logits/chosen": -2.8288745880126953, + "logits/rejected": -2.666501522064209, + "logps/chosen": -314.38226318359375, + "logps/rejected": -298.8790588378906, + "loss": 0.5804, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.0666576623916626, - "rewards/margins": 1.3572540283203125, - "rewards/rejected": -2.4239115715026855, + "rewards/chosen": -1.4482263326644897, + "rewards/margins": 1.7762887477874756, + "rewards/rejected": -3.224515199661255, "step": 1270 }, { - "epoch": 0.32, - "learning_rate": 4.956464750491527e-07, - "logits/chosen": -2.5743613243103027, - "logits/rejected": -2.6122655868530273, - "logps/chosen": -293.70367431640625, - "logps/rejected": -337.3021240234375, - "loss": 0.5087, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.438387155532837, - "rewards/margins": 1.1028835773468018, - "rewards/rejected": -2.5412707328796387, + "epoch": 0.31, + "learning_rate": 4.985291495810304e-07, + "logits/chosen": -2.8966383934020996, + "logits/rejected": -2.8969597816467285, + "logps/chosen": -428.73504638671875, + "logps/rejected": -401.6664123535156, + "loss": 0.5144, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.17718151211738586, + "rewards/margins": 1.9835445880889893, + "rewards/rejected": -2.1607260704040527, "step": 1280 }, { - "epoch": 0.33, - "learning_rate": 4.95178354086696e-07, - "logits/chosen": -2.6074695587158203, - "logits/rejected": -2.5442311763763428, - "logps/chosen": -235.07211303710938, - "logps/rejected": -192.266357421875, - "loss": 0.6397, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.3241581916809082, - "rewards/margins": 0.9993916749954224, - "rewards/rejected": -1.3235498666763306, + "epoch": 0.31, + "learning_rate": 4.980834373328579e-07, + "logits/chosen": -2.829662561416626, + "logits/rejected": -2.7973456382751465, + "logps/chosen": -276.19317626953125, + "logps/rejected": -248.78298950195312, + "loss": 0.5442, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.35372194647789, + "rewards/margins": 1.2040369510650635, + "rewards/rejected": -1.5577589273452759, "step": 1290 }, { - "epoch": 0.33, - "learning_rate": 4.947102331242393e-07, - "logits/chosen": -2.6536176204681396, - "logits/rejected": -2.5985217094421387, - "logps/chosen": -342.68084716796875, - "logps/rejected": -351.6435852050781, - "loss": 0.7122, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2262356281280518, - "rewards/margins": 0.3329642415046692, - "rewards/rejected": -1.5591998100280762, + "epoch": 0.31, + "learning_rate": 4.976377250846854e-07, + "logits/chosen": -2.92205548286438, + "logits/rejected": -2.8939080238342285, + "logps/chosen": -267.11468505859375, + "logps/rejected": -314.50091552734375, + "loss": 0.817, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03283994272351265, + "rewards/margins": 1.285849928855896, + "rewards/rejected": -1.3186899423599243, "step": 1300 }, { - "epoch": 0.33, - "learning_rate": 4.942421121617826e-07, - "logits/chosen": -2.7266454696655273, - "logits/rejected": -2.7418060302734375, - "logps/chosen": -314.6419372558594, - "logps/rejected": -273.09930419921875, - "loss": 0.5851, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.7316147089004517, - "rewards/margins": 0.9679932594299316, - "rewards/rejected": -1.6996078491210938, + "epoch": 0.31, + "eval_logits/chosen": -2.78000545501709, + "eval_logits/rejected": -2.7716803550720215, + "eval_logps/chosen": -211.0212860107422, + "eval_logps/rejected": -208.6137237548828, + "eval_loss": 0.6619690656661987, + "eval_rewards/accuracies": 0.6499999761581421, + "eval_rewards/chosen": -1.506027340888977, + "eval_rewards/margins": 0.9160365462303162, + "eval_rewards/rejected": -2.4220638275146484, + "eval_runtime": 131.9926, + "eval_samples_per_second": 23.91, + "eval_steps_per_second": 0.379, + "step": 1300 + }, + { + "epoch": 0.32, + "learning_rate": 4.971920128365127e-07, + "logits/chosen": -2.908979654312134, + "logits/rejected": -2.8940463066101074, + "logps/chosen": -341.14715576171875, + "logps/rejected": -259.18011474609375, + "loss": 0.5222, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8376142382621765, + "rewards/margins": 1.1220935583114624, + "rewards/rejected": -1.9597078561782837, "step": 1310 }, { - "epoch": 0.33, - "learning_rate": 4.937739911993259e-07, - "logits/chosen": -2.7935047149658203, - "logits/rejected": -2.6458325386047363, - "logps/chosen": -278.81951904296875, - "logps/rejected": -263.1274719238281, - "loss": 0.5954, + "epoch": 0.32, + "learning_rate": 4.967463005883402e-07, + "logits/chosen": -2.839299201965332, + "logits/rejected": -2.8338985443115234, + "logps/chosen": -311.40814208984375, + "logps/rejected": -277.2676696777344, + "loss": 0.558, "rewards/accuracies": 0.75, - "rewards/chosen": -1.1032342910766602, - "rewards/margins": 0.8505995869636536, - "rewards/rejected": -1.9538339376449585, + "rewards/chosen": -0.7818495035171509, + "rewards/margins": 0.7565358281135559, + "rewards/rejected": -1.5383851528167725, "step": 1320 }, { - "epoch": 0.34, - "learning_rate": 4.933058702368692e-07, - "logits/chosen": -2.3314476013183594, - "logits/rejected": -2.297711133956909, - "logps/chosen": -214.3764190673828, - "logps/rejected": -262.4116516113281, - "loss": 0.6091, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2805622816085815, - "rewards/margins": 0.8711134195327759, - "rewards/rejected": -2.1516757011413574, + "epoch": 0.32, + "learning_rate": 4.963005883401676e-07, + "logits/chosen": -2.664158344268799, + "logits/rejected": -2.682422161102295, + "logps/chosen": -206.17919921875, + "logps/rejected": -202.81809997558594, + "loss": 0.6761, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8475558757781982, + "rewards/margins": 0.8920677900314331, + "rewards/rejected": -2.739623785018921, "step": 1330 }, { - "epoch": 0.34, - "learning_rate": 4.928377492744124e-07, - "logits/chosen": -2.690143346786499, - "logits/rejected": -2.599189281463623, - "logps/chosen": -229.4182891845703, - "logps/rejected": -258.7178039550781, - "loss": 0.6126, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.7359424829483032, - "rewards/margins": 1.148744821548462, - "rewards/rejected": -1.8846874237060547, + "epoch": 0.32, + "learning_rate": 4.95854876091995e-07, + "logits/chosen": -2.5715408325195312, + "logits/rejected": -2.522519111633301, + "logps/chosen": -309.2889404296875, + "logps/rejected": -289.9693298339844, + "loss": 0.7991, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5233513116836548, + "rewards/margins": 1.0860936641693115, + "rewards/rejected": -1.6094449758529663, "step": 1340 }, { - "epoch": 0.34, - "learning_rate": 4.923696283119558e-07, - "logits/chosen": -2.6452858448028564, - "logits/rejected": -2.6183056831359863, - "logps/chosen": -200.41815185546875, - "logps/rejected": -210.90628051757812, - "loss": 0.5397, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.8483043909072876, - "rewards/margins": 0.6536422371864319, - "rewards/rejected": -1.5019466876983643, + "epoch": 0.32, + "learning_rate": 4.954091638438224e-07, + "logits/chosen": -2.7232768535614014, + "logits/rejected": -2.7459959983825684, + "logps/chosen": -237.03213500976562, + "logps/rejected": -232.98281860351562, + "loss": 0.5024, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2251498699188232, + "rewards/margins": 1.2850638628005981, + "rewards/rejected": -2.510213613510132, "step": 1350 }, { - "epoch": 0.34, - "learning_rate": 4.919015073494991e-07, - "logits/chosen": -2.5736565589904785, - "logits/rejected": -2.5667662620544434, - "logps/chosen": -229.0320281982422, - "logps/rejected": -277.36669921875, - "loss": 0.6221, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.28454262018203735, - "rewards/margins": 1.7313134670257568, - "rewards/rejected": -2.0158562660217285, + "epoch": 0.33, + "learning_rate": 4.949634515956499e-07, + "logits/chosen": -2.6301980018615723, + "logits/rejected": -2.518221378326416, + "logps/chosen": -226.6194610595703, + "logps/rejected": -229.434814453125, + "loss": 0.4457, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3141323328018188, + "rewards/margins": 1.9887878894805908, + "rewards/rejected": -3.30292010307312, "step": 1360 }, { - "epoch": 0.35, - "learning_rate": 4.914333863870424e-07, - "logits/chosen": -2.467679262161255, - "logits/rejected": -2.5415878295898438, - "logps/chosen": -301.71099853515625, - "logps/rejected": -286.52423095703125, - "loss": 0.6835, + "epoch": 0.33, + "learning_rate": 4.945177393474772e-07, + "logits/chosen": -2.6427488327026367, + "logits/rejected": -2.5693862438201904, + "logps/chosen": -341.1999816894531, + "logps/rejected": -279.8642272949219, + "loss": 0.5298, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.8348533511161804, - "rewards/margins": 1.363982915878296, - "rewards/rejected": -2.198835849761963, + "rewards/chosen": -0.7192713022232056, + "rewards/margins": 4.5171966552734375, + "rewards/rejected": -5.2364678382873535, "step": 1370 }, { - "epoch": 0.35, - "learning_rate": 4.909652654245857e-07, - "logits/chosen": -2.5349931716918945, - "logits/rejected": -2.6447384357452393, - "logps/chosen": -316.8460388183594, - "logps/rejected": -313.21990966796875, - "loss": 0.718, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.01732412539422512, - "rewards/margins": 1.913246512413025, - "rewards/rejected": -1.9305706024169922, + "epoch": 0.33, + "learning_rate": 4.940720270993047e-07, + "logits/chosen": -2.6646227836608887, + "logits/rejected": -2.6930480003356934, + "logps/chosen": -225.49874877929688, + "logps/rejected": -266.44903564453125, + "loss": 0.502, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4898685216903687, + "rewards/margins": 0.8386867642402649, + "rewards/rejected": -2.3285553455352783, "step": 1380 }, { - "epoch": 0.35, - "learning_rate": 4.90497144462129e-07, - "logits/chosen": -2.7121050357818604, - "logits/rejected": -2.6427199840545654, - "logps/chosen": -342.8936462402344, - "logps/rejected": -341.6715393066406, - "loss": 0.7374, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.07042744010686874, - "rewards/margins": 1.4891926050186157, - "rewards/rejected": -1.4187650680541992, + "epoch": 0.33, + "learning_rate": 4.936263148511321e-07, + "logits/chosen": -2.575330972671509, + "logits/rejected": -2.7480521202087402, + "logps/chosen": -309.33428955078125, + "logps/rejected": -257.23797607421875, + "loss": 0.6542, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.9829602241516113, + "rewards/margins": 0.4586246609687805, + "rewards/rejected": -3.441584825515747, "step": 1390 }, { - "epoch": 0.35, - "learning_rate": 4.900290234996723e-07, - "logits/chosen": -2.7661263942718506, - "logits/rejected": -2.6738123893737793, - "logps/chosen": -425.2027893066406, - "logps/rejected": -262.91510009765625, - "loss": 0.5545, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.35545283555984497, - "rewards/margins": 1.151031494140625, - "rewards/rejected": -1.5064842700958252, + "epoch": 0.34, + "learning_rate": 4.931806026029595e-07, + "logits/chosen": -2.679490327835083, + "logits/rejected": -2.7307605743408203, + "logps/chosen": -283.4678039550781, + "logps/rejected": -282.97027587890625, + "loss": 0.6039, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7690193057060242, + "rewards/margins": 1.6748729944229126, + "rewards/rejected": -2.443892240524292, "step": 1400 }, { - "epoch": 0.36, - "learning_rate": 4.895609025372156e-07, - "logits/chosen": -2.6749067306518555, - "logits/rejected": -2.6837830543518066, - "logps/chosen": -218.03640747070312, - "logps/rejected": -294.2855529785156, - "loss": 0.5206, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.3428592085838318, - "rewards/margins": 1.5088475942611694, - "rewards/rejected": -1.8517067432403564, + "epoch": 0.34, + "eval_logits/chosen": -2.6917049884796143, + "eval_logits/rejected": -2.68276309967041, + "eval_logps/chosen": -212.78140258789062, + "eval_logps/rejected": -212.8324737548828, + "eval_loss": 0.5320679545402527, + "eval_rewards/accuracies": 0.6424999833106995, + "eval_rewards/chosen": -1.6820369958877563, + "eval_rewards/margins": 1.1619027853012085, + "eval_rewards/rejected": -2.8439395427703857, + "eval_runtime": 132.1208, + "eval_samples_per_second": 23.887, + "eval_steps_per_second": 0.378, + "step": 1400 + }, + { + "epoch": 0.34, + "learning_rate": 4.927348903547869e-07, + "logits/chosen": -2.912747383117676, + "logits/rejected": -2.8937995433807373, + "logps/chosen": -292.30572509765625, + "logps/rejected": -275.7328186035156, + "loss": 0.6151, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1151381731033325, + "rewards/margins": 0.369060218334198, + "rewards/rejected": -1.4841983318328857, "step": 1410 }, { - "epoch": 0.36, - "learning_rate": 4.890927815747589e-07, - "logits/chosen": -2.6067261695861816, - "logits/rejected": -2.5629193782806396, - "logps/chosen": -302.8584289550781, - "logps/rejected": -244.94229125976562, - "loss": 0.7956, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.47615474462509155, - "rewards/margins": 0.5270879864692688, - "rewards/rejected": -1.0032426118850708, + "epoch": 0.34, + "learning_rate": 4.922891781066144e-07, + "logits/chosen": -2.7397265434265137, + "logits/rejected": -2.760406255722046, + "logps/chosen": -214.39376831054688, + "logps/rejected": -223.88827514648438, + "loss": 0.6322, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8861867785453796, + "rewards/margins": 1.3033297061920166, + "rewards/rejected": -2.189516305923462, "step": 1420 }, { - "epoch": 0.36, - "learning_rate": 4.886246606123022e-07, - "logits/chosen": -2.6083011627197266, - "logits/rejected": -2.646594524383545, - "logps/chosen": -341.74346923828125, - "logps/rejected": -243.8942108154297, - "loss": 0.644, + "epoch": 0.34, + "learning_rate": 4.918434658584418e-07, + "logits/chosen": -2.7892801761627197, + "logits/rejected": -2.748758554458618, + "logps/chosen": -395.18719482421875, + "logps/rejected": -281.3499755859375, + "loss": 0.5626, "rewards/accuracies": 0.75, - "rewards/chosen": -0.2975875437259674, - "rewards/margins": 1.4379489421844482, - "rewards/rejected": -1.7355365753173828, + "rewards/chosen": -1.0108449459075928, + "rewards/margins": 1.1944694519042969, + "rewards/rejected": -2.2053146362304688, "step": 1430 }, { - "epoch": 0.36, - "learning_rate": 4.881565396498455e-07, - "logits/chosen": -2.612194538116455, - "logits/rejected": -2.68499755859375, - "logps/chosen": -235.001953125, - "logps/rejected": -252.8855438232422, - "loss": 0.6004, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.43834367394447327, - "rewards/margins": 0.21190723776817322, - "rewards/rejected": -0.6502509117126465, + "epoch": 0.35, + "learning_rate": 4.913977536102692e-07, + "logits/chosen": -2.619915723800659, + "logits/rejected": -2.5783581733703613, + "logps/chosen": -229.4763946533203, + "logps/rejected": -246.9639892578125, + "loss": 0.5822, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9170472025871277, + "rewards/margins": 0.9443166851997375, + "rewards/rejected": -1.8613640069961548, "step": 1440 }, { - "epoch": 0.37, - "learning_rate": 4.876884186873888e-07, - "logits/chosen": -2.730816125869751, - "logits/rejected": -2.696262836456299, - "logps/chosen": -237.52621459960938, - "logps/rejected": -233.2257080078125, - "loss": 0.5765, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.1927516907453537, - "rewards/margins": 0.9156246185302734, - "rewards/rejected": -1.1083762645721436, + "epoch": 0.35, + "learning_rate": 4.909520413620967e-07, + "logits/chosen": -2.601719379425049, + "logits/rejected": -2.6248908042907715, + "logps/chosen": -281.9274597167969, + "logps/rejected": -275.88970947265625, + "loss": 0.7104, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.30259108543396, + "rewards/margins": 0.7970986366271973, + "rewards/rejected": -2.0996899604797363, "step": 1450 }, { - "epoch": 0.37, - "learning_rate": 4.872202977249321e-07, - "logits/chosen": -2.778622627258301, - "logits/rejected": -2.734001636505127, - "logps/chosen": -254.0236053466797, - "logps/rejected": -273.97210693359375, - "loss": 0.5525, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.0060231685638428, - "rewards/margins": 0.8538422584533691, - "rewards/rejected": -1.8598655462265015, + "epoch": 0.35, + "learning_rate": 4.90506329113924e-07, + "logits/chosen": -2.776642084121704, + "logits/rejected": -2.7469542026519775, + "logps/chosen": -325.6810607910156, + "logps/rejected": -309.17584228515625, + "loss": 0.5111, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1769328117370605, + "rewards/margins": 1.475838541984558, + "rewards/rejected": -2.652771472930908, "step": 1460 }, { - "epoch": 0.37, - "learning_rate": 4.867521767624753e-07, - "logits/chosen": -2.767373561859131, - "logits/rejected": -2.6322503089904785, - "logps/chosen": -336.3904113769531, - "logps/rejected": -339.51312255859375, - "loss": 0.5659, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.2834639251232147, - "rewards/margins": 2.169893741607666, - "rewards/rejected": -2.4533581733703613, + "epoch": 0.35, + "learning_rate": 4.900606168657515e-07, + "logits/chosen": -2.5737416744232178, + "logits/rejected": -2.5899055004119873, + "logps/chosen": -283.1946105957031, + "logps/rejected": -322.8107604980469, + "loss": 0.7519, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8150444030761719, + "rewards/margins": 1.5333455801010132, + "rewards/rejected": -3.3483901023864746, "step": 1470 }, { - "epoch": 0.37, - "learning_rate": 4.862840558000187e-07, - "logits/chosen": -2.5647339820861816, - "logits/rejected": -2.552048921585083, - "logps/chosen": -259.8147888183594, - "logps/rejected": -360.00311279296875, - "loss": 0.5895, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.1612476110458374, - "rewards/margins": 0.8322628140449524, - "rewards/rejected": -1.9935102462768555, + "epoch": 0.36, + "learning_rate": 4.896149046175789e-07, + "logits/chosen": -2.859255313873291, + "logits/rejected": -2.762847423553467, + "logps/chosen": -306.9642639160156, + "logps/rejected": -288.6649169921875, + "loss": 0.6339, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6730111241340637, + "rewards/margins": 1.3070439100265503, + "rewards/rejected": -1.9800550937652588, "step": 1480 }, { - "epoch": 0.38, - "learning_rate": 4.85815934837562e-07, - "logits/chosen": -2.696507692337036, - "logits/rejected": -2.722266674041748, - "logps/chosen": -251.97970581054688, - "logps/rejected": -266.2132568359375, - "loss": 0.578, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.7193154692649841, - "rewards/margins": 1.0223782062530518, - "rewards/rejected": -1.7416938543319702, + "epoch": 0.36, + "learning_rate": 4.891691923694063e-07, + "logits/chosen": -2.937251567840576, + "logits/rejected": -2.9676809310913086, + "logps/chosen": -334.50238037109375, + "logps/rejected": -362.9433898925781, + "loss": 0.5368, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9834873080253601, + "rewards/margins": 1.2966114282608032, + "rewards/rejected": -2.2800984382629395, "step": 1490 }, { - "epoch": 0.38, - "learning_rate": 4.853478138751054e-07, - "logits/chosen": -2.6109890937805176, - "logits/rejected": -2.663027286529541, - "logps/chosen": -233.97970581054688, - "logps/rejected": -182.5577392578125, - "loss": 0.5884, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3420624732971191, - "rewards/margins": 0.07519197463989258, - "rewards/rejected": -1.4172544479370117, + "epoch": 0.36, + "learning_rate": 4.887234801212337e-07, + "logits/chosen": -2.7312240600585938, + "logits/rejected": -2.656231164932251, + "logps/chosen": -208.14993286132812, + "logps/rejected": -220.1405029296875, + "loss": 0.6666, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1167850494384766, + "rewards/margins": 1.1534658670425415, + "rewards/rejected": -2.2702507972717285, "step": 1500 }, { - "epoch": 0.38, - "learning_rate": 4.848796929126486e-07, - "logits/chosen": -2.785861015319824, - "logits/rejected": -2.8285059928894043, - "logps/chosen": -283.89300537109375, - "logps/rejected": -269.63909912109375, - "loss": 0.6041, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.7470687627792358, - "rewards/margins": 1.224205732345581, - "rewards/rejected": -1.9712746143341064, + "epoch": 0.36, + "eval_logits/chosen": -2.859431505203247, + "eval_logits/rejected": -2.855691432952881, + "eval_logps/chosen": -209.8364715576172, + "eval_logps/rejected": -210.77725219726562, + "eval_loss": 0.5303400158882141, + "eval_rewards/accuracies": 0.6474999785423279, + "eval_rewards/chosen": -1.3875455856323242, + "eval_rewards/margins": 1.2508691549301147, + "eval_rewards/rejected": -2.6384148597717285, + "eval_runtime": 132.1002, + "eval_samples_per_second": 23.891, + "eval_steps_per_second": 0.379, + "step": 1500 + }, + { + "epoch": 0.36, + "learning_rate": 4.882777678730611e-07, + "logits/chosen": -2.8714451789855957, + "logits/rejected": -2.8805060386657715, + "logps/chosen": -267.64398193359375, + "logps/rejected": -256.2818908691406, + "loss": 0.6269, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2171975076198578, + "rewards/margins": 1.3595329523086548, + "rewards/rejected": -1.5767303705215454, "step": 1510 }, { - "epoch": 0.38, - "learning_rate": 4.844115719501919e-07, - "logits/chosen": -2.7913641929626465, - "logits/rejected": -2.6347858905792236, - "logps/chosen": -304.1687927246094, - "logps/rejected": -333.74029541015625, - "loss": 0.7299, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.37677839398384094, - "rewards/margins": 1.4539364576339722, - "rewards/rejected": -1.8307149410247803, + "epoch": 0.37, + "learning_rate": 4.878320556248885e-07, + "logits/chosen": -2.939527988433838, + "logits/rejected": -2.863615036010742, + "logps/chosen": -232.2190399169922, + "logps/rejected": -227.65771484375, + "loss": 0.5884, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0052870512008667, + "rewards/margins": 0.726232647895813, + "rewards/rejected": -1.7315196990966797, "step": 1520 }, { - "epoch": 0.39, - "learning_rate": 4.839434509877352e-07, - "logits/chosen": -2.791416645050049, - "logits/rejected": -2.765383720397949, - "logps/chosen": -226.19091796875, - "logps/rejected": -252.04275512695312, - "loss": 0.6193, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.5906246304512024, - "rewards/margins": 1.1396105289459229, - "rewards/rejected": -1.7302350997924805, + "epoch": 0.37, + "learning_rate": 4.87386343376716e-07, + "logits/chosen": -2.9457874298095703, + "logits/rejected": -2.8765900135040283, + "logps/chosen": -240.7601318359375, + "logps/rejected": -191.36117553710938, + "loss": 0.5108, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.0548090934753418, + "rewards/margins": 0.7512520551681519, + "rewards/rejected": -1.8060611486434937, "step": 1530 }, { - "epoch": 0.39, - "learning_rate": 4.834753300252785e-07, - "logits/chosen": -2.711320400238037, - "logits/rejected": -2.6810293197631836, - "logps/chosen": -277.157470703125, - "logps/rejected": -259.8028259277344, - "loss": 0.4853, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.8695998191833496, - "rewards/margins": 1.3735220432281494, - "rewards/rejected": -2.24312162399292, + "epoch": 0.37, + "learning_rate": 4.869406311285433e-07, + "logits/chosen": -2.8488717079162598, + "logits/rejected": -2.959240436553955, + "logps/chosen": -230.619140625, + "logps/rejected": -287.23712158203125, + "loss": 0.568, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2126989364624023, + "rewards/margins": 0.685308575630188, + "rewards/rejected": -1.8980076313018799, "step": 1540 }, { - "epoch": 0.39, - "learning_rate": 4.830072090628219e-07, - "logits/chosen": -2.5173277854919434, - "logits/rejected": -2.4909121990203857, - "logps/chosen": -251.896484375, - "logps/rejected": -213.8211669921875, - "loss": 0.4497, - "rewards/accuracies": 0.75, - "rewards/chosen": 1.2197401523590088, - "rewards/margins": 2.002092123031616, - "rewards/rejected": -0.7823519110679626, + "epoch": 0.37, + "learning_rate": 4.864949188803708e-07, + "logits/chosen": -2.9162182807922363, + "logits/rejected": -2.882370710372925, + "logps/chosen": -270.56597900390625, + "logps/rejected": -256.61199951171875, + "loss": 0.6384, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8191057443618774, + "rewards/margins": 1.4249552488327026, + "rewards/rejected": -2.24406099319458, "step": 1550 }, { - "epoch": 0.39, - "learning_rate": 4.825390881003651e-07, - "logits/chosen": -2.7679383754730225, - "logits/rejected": -2.7764766216278076, - "logps/chosen": -310.71722412109375, - "logps/rejected": -282.398681640625, - "loss": 0.7393, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2943713665008545, - "rewards/margins": 1.1048939228057861, - "rewards/rejected": -2.3992652893066406, + "epoch": 0.38, + "learning_rate": 4.860492066321983e-07, + "logits/chosen": -2.7422914505004883, + "logits/rejected": -2.7637314796447754, + "logps/chosen": -206.1497802734375, + "logps/rejected": -210.01394653320312, + "loss": 0.5673, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4817397594451904, + "rewards/margins": 1.0429823398590088, + "rewards/rejected": -2.524722099304199, "step": 1560 }, { - "epoch": 0.4, - "learning_rate": 4.820709671379084e-07, - "logits/chosen": -2.8770172595977783, - "logits/rejected": -2.7384181022644043, - "logps/chosen": -291.2705383300781, - "logps/rejected": -278.2033996582031, - "loss": 0.6648, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.4087117910385132, - "rewards/margins": 0.2625928521156311, - "rewards/rejected": -1.671304702758789, + "epoch": 0.38, + "learning_rate": 4.856034943840256e-07, + "logits/chosen": -2.736290693283081, + "logits/rejected": -2.7692618370056152, + "logps/chosen": -286.99920654296875, + "logps/rejected": -371.7643127441406, + "loss": 0.6459, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4035732746124268, + "rewards/margins": 0.47406449913978577, + "rewards/rejected": -1.8776376247406006, "step": 1570 }, { - "epoch": 0.4, - "learning_rate": 4.816028461754517e-07, - "logits/chosen": -2.6546216011047363, - "logits/rejected": -2.7144055366516113, - "logps/chosen": -252.04006958007812, - "logps/rejected": -205.1694793701172, - "loss": 0.5475, + "epoch": 0.38, + "learning_rate": 4.851577821358531e-07, + "logits/chosen": -2.7426650524139404, + "logits/rejected": -2.6288838386535645, + "logps/chosen": -204.90005493164062, + "logps/rejected": -280.9257507324219, + "loss": 0.5213, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.298349142074585, - "rewards/margins": 0.6605261564254761, - "rewards/rejected": -1.958875060081482, + "rewards/chosen": -1.31305992603302, + "rewards/margins": 2.527942657470703, + "rewards/rejected": -3.841002941131592, "step": 1580 }, { - "epoch": 0.4, - "learning_rate": 4.81134725212995e-07, - "logits/chosen": -2.7433269023895264, - "logits/rejected": -2.7543070316314697, - "logps/chosen": -260.1664733886719, - "logps/rejected": -256.9065856933594, - "loss": 0.6055, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.9419029951095581, - "rewards/margins": 1.7040073871612549, - "rewards/rejected": -2.6459105014801025, + "epoch": 0.38, + "learning_rate": 4.847120698876805e-07, + "logits/chosen": -2.83410906791687, + "logits/rejected": -2.8961398601531982, + "logps/chosen": -252.52200317382812, + "logps/rejected": -230.789794921875, + "loss": 0.5937, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0155789852142334, + "rewards/margins": 1.6747217178344727, + "rewards/rejected": -2.690300703048706, "step": 1590 }, { - "epoch": 0.4, - "learning_rate": 4.806666042505384e-07, - "logits/chosen": -2.68558931350708, - "logits/rejected": -2.5312259197235107, - "logps/chosen": -251.97323608398438, - "logps/rejected": -229.5078582763672, - "loss": 0.5146, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2642735242843628, - "rewards/margins": 1.0432544946670532, - "rewards/rejected": -2.307528257369995, + "epoch": 0.39, + "learning_rate": 4.842663576395079e-07, + "logits/chosen": -2.6591238975524902, + "logits/rejected": -2.651122570037842, + "logps/chosen": -238.4807586669922, + "logps/rejected": -246.7001953125, + "loss": 0.6907, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5470054149627686, + "rewards/margins": 1.2377474308013916, + "rewards/rejected": -2.784752607345581, "step": 1600 }, { - "epoch": 0.41, - "learning_rate": 4.801984832880816e-07, - "logits/chosen": -2.603732109069824, - "logits/rejected": -2.388270854949951, - "logps/chosen": -353.0534362792969, - "logps/rejected": -289.6091003417969, - "loss": 0.5718, + "epoch": 0.39, + "eval_logits/chosen": -2.828794240951538, + "eval_logits/rejected": -2.8226699829101562, + "eval_logps/chosen": -216.61843872070312, + "eval_logps/rejected": -216.60684204101562, + "eval_loss": 0.5409246683120728, + "eval_rewards/accuracies": 0.6650000214576721, + "eval_rewards/chosen": -2.065741777420044, + "eval_rewards/margins": 1.1556315422058105, + "eval_rewards/rejected": -3.2213733196258545, + "eval_runtime": 132.2875, + "eval_samples_per_second": 23.857, + "eval_steps_per_second": 0.378, + "step": 1600 + }, + { + "epoch": 0.39, + "learning_rate": 4.838206453913353e-07, + "logits/chosen": -3.0326790809631348, + "logits/rejected": -2.9398951530456543, + "logps/chosen": -290.9919128417969, + "logps/rejected": -272.7713317871094, + "loss": 0.5751, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.376704216003418, - "rewards/margins": 1.6749976873397827, - "rewards/rejected": -3.051701784133911, + "rewards/chosen": -1.576283574104309, + "rewards/margins": 0.9577566981315613, + "rewards/rejected": -2.5340399742126465, "step": 1610 }, { - "epoch": 0.41, - "learning_rate": 4.797303623256249e-07, - "logits/chosen": -2.7381432056427, - "logits/rejected": -2.661796808242798, - "logps/chosen": -302.10357666015625, - "logps/rejected": -230.2789764404297, - "loss": 0.5316, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9080637693405151, - "rewards/margins": 1.5905338525772095, - "rewards/rejected": -2.4985973834991455, + "epoch": 0.39, + "learning_rate": 4.833749331431628e-07, + "logits/chosen": -2.8972344398498535, + "logits/rejected": -2.8660812377929688, + "logps/chosen": -230.9638214111328, + "logps/rejected": -173.11141967773438, + "loss": 0.5143, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9524039030075073, + "rewards/margins": 1.410833716392517, + "rewards/rejected": -2.3632376194000244, "step": 1620 }, { - "epoch": 0.41, - "learning_rate": 4.792622413631682e-07, - "logits/chosen": -2.6611194610595703, - "logits/rejected": -2.5353498458862305, - "logps/chosen": -378.8614501953125, - "logps/rejected": -439.38787841796875, - "loss": 0.5265, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.44443148374557495, - "rewards/margins": 1.3168675899505615, - "rewards/rejected": -1.7612993717193604, + "epoch": 0.39, + "learning_rate": 4.829292208949901e-07, + "logits/chosen": -2.9414827823638916, + "logits/rejected": -2.971438407897949, + "logps/chosen": -226.3228759765625, + "logps/rejected": -232.98818969726562, + "loss": 0.5422, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3864765167236328, + "rewards/margins": 0.7363510131835938, + "rewards/rejected": -2.1228275299072266, "step": 1630 }, { - "epoch": 0.41, - "learning_rate": 4.787941204007115e-07, - "logits/chosen": -2.631410598754883, - "logits/rejected": -2.5915353298187256, - "logps/chosen": -287.78936767578125, - "logps/rejected": -323.13653564453125, - "loss": 0.5354, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.061402678489685, - "rewards/margins": 1.3255428075790405, - "rewards/rejected": -2.3869454860687256, + "epoch": 0.39, + "learning_rate": 4.824835086468176e-07, + "logits/chosen": -2.8019332885742188, + "logits/rejected": -2.7973155975341797, + "logps/chosen": -262.8872985839844, + "logps/rejected": -231.1161346435547, + "loss": 0.5791, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9670025706291199, + "rewards/margins": 1.215883731842041, + "rewards/rejected": -2.1828863620758057, "step": 1640 }, { - "epoch": 0.42, - "learning_rate": 4.783259994382548e-07, - "logits/chosen": -2.79594349861145, - "logits/rejected": -2.6912193298339844, - "logps/chosen": -304.9742736816406, - "logps/rejected": -246.300048828125, - "loss": 0.6284, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2038127183914185, - "rewards/margins": 1.096224069595337, - "rewards/rejected": -2.300036907196045, + "epoch": 0.4, + "learning_rate": 4.82037796398645e-07, + "logits/chosen": -2.767401695251465, + "logits/rejected": -2.801602602005005, + "logps/chosen": -124.29368591308594, + "logps/rejected": -160.11630249023438, + "loss": 0.499, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7369630932807922, + "rewards/margins": 1.3837449550628662, + "rewards/rejected": -2.1207079887390137, "step": 1650 }, { - "epoch": 0.42, - "learning_rate": 4.778578784757981e-07, - "logits/chosen": -2.738156795501709, - "logits/rejected": -2.596961259841919, - "logps/chosen": -268.0915832519531, - "logps/rejected": -274.79278564453125, - "loss": 0.4787, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.030813455581665, - "rewards/margins": 0.8118458986282349, - "rewards/rejected": -1.8426593542099, + "epoch": 0.4, + "learning_rate": 4.815920841504724e-07, + "logits/chosen": -2.7986817359924316, + "logits/rejected": -2.7415225505828857, + "logps/chosen": -211.67605590820312, + "logps/rejected": -318.15789794921875, + "loss": 0.622, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.8099263310432434, + "rewards/margins": 1.4976965188980103, + "rewards/rejected": -2.3076229095458984, "step": 1660 }, { - "epoch": 0.42, - "learning_rate": 4.773897575133414e-07, - "logits/chosen": -2.63154673576355, - "logits/rejected": -2.627650737762451, - "logps/chosen": -237.63363647460938, - "logps/rejected": -223.29159545898438, - "loss": 0.575, - "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": -1.6023433208465576, - "rewards/margins": 0.5691138505935669, - "rewards/rejected": -2.171457529067993, + "epoch": 0.4, + "learning_rate": 4.811463719022998e-07, + "logits/chosen": -2.759817600250244, + "logits/rejected": -2.718721866607666, + "logps/chosen": -408.4496154785156, + "logps/rejected": -385.46160888671875, + "loss": 0.7775, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -6.907963752746582, + "rewards/margins": -1.081120252609253, + "rewards/rejected": -5.82684326171875, "step": 1670 }, { - "epoch": 0.42, - "learning_rate": 4.769216365508848e-07, - "logits/chosen": -2.695577621459961, - "logits/rejected": -2.5401132106781006, - "logps/chosen": -339.2311096191406, - "logps/rejected": -262.02874755859375, - "loss": 0.7872, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.6823625564575195, - "rewards/margins": 0.8626251220703125, - "rewards/rejected": -2.5449881553649902, + "epoch": 0.4, + "learning_rate": 4.807006596541273e-07, + "logits/chosen": -2.6703763008117676, + "logits/rejected": -2.7096924781799316, + "logps/chosen": -165.2185821533203, + "logps/rejected": -187.33465576171875, + "loss": 0.9972, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6897889375686646, + "rewards/margins": 1.038997769355774, + "rewards/rejected": -1.7287867069244385, "step": 1680 }, { - "epoch": 0.43, - "learning_rate": 4.76453515588428e-07, - "logits/chosen": -2.7896368503570557, - "logits/rejected": -2.6924490928649902, - "logps/chosen": -339.7137756347656, - "logps/rejected": -307.8088073730469, - "loss": 0.5801, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.13714599609375, - "rewards/margins": 1.197256326675415, - "rewards/rejected": -2.334402322769165, + "epoch": 0.41, + "learning_rate": 4.802549474059546e-07, + "logits/chosen": -2.701014995574951, + "logits/rejected": -2.7146852016448975, + "logps/chosen": -180.14768981933594, + "logps/rejected": -219.23886108398438, + "loss": 0.7425, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7074281573295593, + "rewards/margins": 1.2295209169387817, + "rewards/rejected": -1.936949372291565, "step": 1690 }, { - "epoch": 0.43, - "learning_rate": 4.759853946259713e-07, - "logits/chosen": -2.591322183609009, - "logits/rejected": -2.6701719760894775, - "logps/chosen": -301.0086975097656, - "logps/rejected": -269.099609375, - "loss": 0.6423, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.6230440139770508, - "rewards/margins": 0.9465805888175964, - "rewards/rejected": -2.569624662399292, + "epoch": 0.41, + "learning_rate": 4.798092351577821e-07, + "logits/chosen": -2.855579137802124, + "logits/rejected": -2.800286293029785, + "logps/chosen": -309.85546875, + "logps/rejected": -289.11907958984375, + "loss": 0.5772, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6967086791992188, + "rewards/margins": 0.9293675422668457, + "rewards/rejected": -2.6260764598846436, "step": 1700 }, { - "epoch": 0.43, - "learning_rate": 4.7551727366351465e-07, - "logits/chosen": -2.6842517852783203, - "logits/rejected": -2.5960946083068848, - "logps/chosen": -305.3388671875, - "logps/rejected": -275.5049133300781, - "loss": 0.6272, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.0681524276733398, - "rewards/margins": 1.0799347162246704, - "rewards/rejected": -2.1480870246887207, + "epoch": 0.41, + "eval_logits/chosen": -2.6634597778320312, + "eval_logits/rejected": -2.649837017059326, + "eval_logps/chosen": -215.8096923828125, + "eval_logps/rejected": -217.22642517089844, + "eval_loss": 0.5308806896209717, + "eval_rewards/accuracies": 0.6875, + "eval_rewards/chosen": -1.9848674535751343, + "eval_rewards/margins": 1.2984668016433716, + "eval_rewards/rejected": -3.283334255218506, + "eval_runtime": 132.0872, + "eval_samples_per_second": 23.893, + "eval_steps_per_second": 0.379, + "step": 1700 + }, + { + "epoch": 0.41, + "learning_rate": 4.793635229096096e-07, + "logits/chosen": -2.7919347286224365, + "logits/rejected": -2.7726006507873535, + "logps/chosen": -227.92050170898438, + "logps/rejected": -226.15225219726562, + "loss": 0.53, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.045238733291626, + "rewards/margins": 0.6967805027961731, + "rewards/rejected": -1.7420192956924438, "step": 1710 }, { - "epoch": 0.43, - "learning_rate": 4.7504915270105794e-07, - "logits/chosen": -2.475844621658325, - "logits/rejected": -2.477785110473633, - "logps/chosen": -283.61627197265625, - "logps/rejected": -302.0332946777344, - "loss": 0.572, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.9548444747924805, - "rewards/margins": 1.2471121549606323, - "rewards/rejected": -3.2019565105438232, + "epoch": 0.41, + "learning_rate": 4.789178106614369e-07, + "logits/chosen": -2.731544256210327, + "logits/rejected": -2.768850326538086, + "logps/chosen": -219.46530151367188, + "logps/rejected": -216.56143188476562, + "loss": 0.512, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9595749974250793, + "rewards/margins": 1.32124924659729, + "rewards/rejected": -2.2808241844177246, "step": 1720 }, { - "epoch": 0.44, - "learning_rate": 4.745810317386013e-07, - "logits/chosen": -2.798185348510742, - "logits/rejected": -2.6507716178894043, - "logps/chosen": -425.5433044433594, - "logps/rejected": -361.62921142578125, - "loss": 0.6259, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.7486549615859985, - "rewards/margins": 1.0134116411209106, - "rewards/rejected": -2.762066602706909, + "epoch": 0.42, + "learning_rate": 4.784720984132644e-07, + "logits/chosen": -2.769169330596924, + "logits/rejected": -2.793710947036743, + "logps/chosen": -194.6066436767578, + "logps/rejected": -204.08262634277344, + "loss": 0.5696, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7715390920639038, + "rewards/margins": 1.711836576461792, + "rewards/rejected": -2.4833757877349854, "step": 1730 }, { - "epoch": 0.44, - "learning_rate": 4.741129107761445e-07, - "logits/chosen": -2.7440648078918457, - "logits/rejected": -2.7234296798706055, - "logps/chosen": -310.80633544921875, - "logps/rejected": -349.42059326171875, - "loss": 0.4625, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9778580665588379, - "rewards/margins": 1.1600100994110107, - "rewards/rejected": -2.1378684043884277, + "epoch": 0.42, + "learning_rate": 4.780263861650918e-07, + "logits/chosen": -2.8988900184631348, + "logits/rejected": -2.8280751705169678, + "logps/chosen": -244.33297729492188, + "logps/rejected": -242.18295288085938, + "loss": 0.5127, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.7681396007537842, + "rewards/margins": 0.5898653268814087, + "rewards/rejected": -2.3580048084259033, "step": 1740 }, { - "epoch": 0.44, - "learning_rate": 4.736447898136878e-07, - "logits/chosen": -2.5742950439453125, - "logits/rejected": -2.5962517261505127, - "logps/chosen": -284.7892150878906, - "logps/rejected": -281.23394775390625, - "loss": 0.559, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.4265632629394531, - "rewards/margins": 1.247928261756897, - "rewards/rejected": -2.6744914054870605, + "epoch": 0.42, + "learning_rate": 4.775806739169192e-07, + "logits/chosen": -2.819436550140381, + "logits/rejected": -2.8085174560546875, + "logps/chosen": -298.68109130859375, + "logps/rejected": -271.6244812011719, + "loss": 0.4601, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.869078516960144, + "rewards/margins": 2.018928289413452, + "rewards/rejected": -2.8880066871643066, "step": 1750 }, { - "epoch": 0.44, - "learning_rate": 4.7317666885123115e-07, - "logits/chosen": -2.501652240753174, - "logits/rejected": -2.4336256980895996, - "logps/chosen": -199.65084838867188, - "logps/rejected": -295.1380310058594, - "loss": 0.5376, + "epoch": 0.42, + "learning_rate": 4.771349616687466e-07, + "logits/chosen": -2.841352939605713, + "logits/rejected": -2.880388021469116, + "logps/chosen": -361.2449645996094, + "logps/rejected": -343.948974609375, + "loss": 0.4596, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.718796968460083, - "rewards/margins": 0.35511070489883423, - "rewards/rejected": -2.0739076137542725, + "rewards/chosen": -0.8302813768386841, + "rewards/margins": 1.6769945621490479, + "rewards/rejected": -2.5072758197784424, "step": 1760 }, { - "epoch": 0.45, - "learning_rate": 4.7270854788877444e-07, - "logits/chosen": -2.6972432136535645, - "logits/rejected": -2.6885361671447754, - "logps/chosen": -362.0516662597656, - "logps/rejected": -301.98431396484375, - "loss": 0.5787, + "epoch": 0.43, + "learning_rate": 4.7668924942057403e-07, + "logits/chosen": -2.8133256435394287, + "logits/rejected": -2.898975372314453, + "logps/chosen": -315.82958984375, + "logps/rejected": -261.02960205078125, + "loss": 0.5365, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.8265206217765808, - "rewards/margins": 1.7504993677139282, - "rewards/rejected": -2.5770199298858643, + "rewards/chosen": -0.8830242156982422, + "rewards/margins": 1.5805141925811768, + "rewards/rejected": -2.463538408279419, "step": 1770 }, { - "epoch": 0.45, - "learning_rate": 4.7224042692631773e-07, - "logits/chosen": -2.5660178661346436, - "logits/rejected": -2.6825168132781982, - "logps/chosen": -236.5127410888672, - "logps/rejected": -286.50506591796875, - "loss": 0.6535, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2312124967575073, - "rewards/margins": 0.5038989782333374, - "rewards/rejected": -1.7351115942001343, + "epoch": 0.43, + "learning_rate": 4.7624353717240143e-07, + "logits/chosen": -2.86845064163208, + "logits/rejected": -2.883598804473877, + "logps/chosen": -171.30068969726562, + "logps/rejected": -252.77059936523438, + "loss": 0.5437, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0212795734405518, + "rewards/margins": 1.4662119150161743, + "rewards/rejected": -2.4874911308288574, "step": 1780 }, { - "epoch": 0.45, - "learning_rate": 4.71772305963861e-07, - "logits/chosen": -2.6352035999298096, - "logits/rejected": -2.724378824234009, - "logps/chosen": -291.7593688964844, - "logps/rejected": -318.66241455078125, - "loss": 0.6269, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1717381477355957, - "rewards/margins": 0.9231816530227661, - "rewards/rejected": -2.0949196815490723, + "epoch": 0.43, + "learning_rate": 4.757978249242289e-07, + "logits/chosen": -2.8889145851135254, + "logits/rejected": -2.8409581184387207, + "logps/chosen": -419.391357421875, + "logps/rejected": -329.8358154296875, + "loss": 0.6465, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8031169772148132, + "rewards/margins": 0.31203651428222656, + "rewards/rejected": -1.1151535511016846, "step": 1790 }, { - "epoch": 0.46, - "learning_rate": 4.7130418500140436e-07, - "logits/chosen": -2.7564892768859863, - "logits/rejected": -2.6997904777526855, - "logps/chosen": -433.239501953125, - "logps/rejected": -370.81121826171875, - "loss": 0.6336, + "epoch": 0.43, + "learning_rate": 4.753521126760563e-07, + "logits/chosen": -2.8375251293182373, + "logits/rejected": -2.878145456314087, + "logps/chosen": -244.9544219970703, + "logps/rejected": -178.86183166503906, + "loss": 0.5601, "rewards/accuracies": 0.75, - "rewards/chosen": -1.5008946657180786, - "rewards/margins": 1.5569981336593628, - "rewards/rejected": -3.0578930377960205, + "rewards/chosen": -0.329495370388031, + "rewards/margins": 0.966100811958313, + "rewards/rejected": -1.2955963611602783, "step": 1800 }, { - "epoch": 0.46, - "learning_rate": 4.7083606403894765e-07, - "logits/chosen": -2.5221521854400635, - "logits/rejected": -2.511442184448242, - "logps/chosen": -255.2664031982422, - "logps/rejected": -250.4320526123047, - "loss": 0.7713, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.6310237646102905, - "rewards/margins": 0.4643372893333435, - "rewards/rejected": -2.0953612327575684, + "epoch": 0.43, + "eval_logits/chosen": -2.8918044567108154, + "eval_logits/rejected": -2.8890292644500732, + "eval_logps/chosen": -213.32553100585938, + "eval_logps/rejected": -215.03591918945312, + "eval_loss": 0.5280715227127075, + "eval_rewards/accuracies": 0.6575000286102295, + "eval_rewards/chosen": -1.7364517450332642, + "eval_rewards/margins": 1.3278307914733887, + "eval_rewards/rejected": -3.0642824172973633, + "eval_runtime": 131.9602, + "eval_samples_per_second": 23.916, + "eval_steps_per_second": 0.379, + "step": 1800 + }, + { + "epoch": 0.44, + "learning_rate": 4.749064004278837e-07, + "logits/chosen": -2.9717700481414795, + "logits/rejected": -2.97356915473938, + "logps/chosen": -341.64556884765625, + "logps/rejected": -345.1664123535156, + "loss": 0.4978, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2634963989257812, + "rewards/margins": 0.8707951307296753, + "rewards/rejected": -2.134291410446167, "step": 1810 }, { - "epoch": 0.46, - "learning_rate": 4.70367943076491e-07, - "logits/chosen": -2.61252760887146, - "logits/rejected": -2.590552806854248, - "logps/chosen": -184.2164306640625, - "logps/rejected": -193.83358764648438, - "loss": 0.9105, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.8147584795951843, - "rewards/margins": 0.5969401597976685, - "rewards/rejected": -1.4116986989974976, + "epoch": 0.44, + "learning_rate": 4.7446068817971115e-07, + "logits/chosen": -2.6714043617248535, + "logits/rejected": -2.6584677696228027, + "logps/chosen": -267.92669677734375, + "logps/rejected": -301.8381652832031, + "loss": 0.5162, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.538935422897339, + "rewards/margins": 0.3252183794975281, + "rewards/rejected": -2.864154100418091, "step": 1820 }, { - "epoch": 0.46, - "learning_rate": 4.6989982211403423e-07, - "logits/chosen": -2.653203010559082, - "logits/rejected": -2.6058340072631836, - "logps/chosen": -335.33941650390625, - "logps/rejected": -291.458740234375, - "loss": 0.6721, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.1626819372177124, - "rewards/margins": 1.0949734449386597, - "rewards/rejected": -2.257655620574951, + "epoch": 0.44, + "learning_rate": 4.7401497593153855e-07, + "logits/chosen": -2.792344570159912, + "logits/rejected": -2.7671806812286377, + "logps/chosen": -312.053466796875, + "logps/rejected": -266.09051513671875, + "loss": 0.6418, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.0438663959503174, + "rewards/margins": 0.6732957363128662, + "rewards/rejected": -3.7171623706817627, "step": 1830 }, { - "epoch": 0.47, - "learning_rate": 4.694317011515775e-07, - "logits/chosen": -2.737450361251831, - "logits/rejected": -2.6910014152526855, - "logps/chosen": -300.6209411621094, - "logps/rejected": -287.80657958984375, - "loss": 0.6583, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2600080966949463, - "rewards/margins": 1.2313051223754883, - "rewards/rejected": -2.4913132190704346, + "epoch": 0.44, + "learning_rate": 4.7356926368336596e-07, + "logits/chosen": -2.831927537918091, + "logits/rejected": -2.8219683170318604, + "logps/chosen": -186.5095977783203, + "logps/rejected": -202.5377960205078, + "loss": 0.6745, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.1156291961669922, + "rewards/margins": 0.9165793657302856, + "rewards/rejected": -2.032208204269409, "step": 1840 }, { - "epoch": 0.47, - "learning_rate": 4.6896358018912086e-07, - "logits/chosen": -2.5987954139709473, - "logits/rejected": -2.709334373474121, - "logps/chosen": -242.94735717773438, - "logps/rejected": -284.9515075683594, - "loss": 0.6904, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2675464153289795, - "rewards/margins": 0.8861404657363892, - "rewards/rejected": -2.153687000274658, + "epoch": 0.45, + "learning_rate": 4.731235514351934e-07, + "logits/chosen": -2.8895366191864014, + "logits/rejected": -2.8536458015441895, + "logps/chosen": -223.1267547607422, + "logps/rejected": -234.9865264892578, + "loss": 0.6086, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1764342784881592, + "rewards/margins": 0.5081696510314941, + "rewards/rejected": -1.6846036911010742, "step": 1850 }, { - "epoch": 0.47, - "learning_rate": 4.6849545922666415e-07, - "logits/chosen": -2.7179274559020996, - "logits/rejected": -2.660978317260742, - "logps/chosen": -259.5520935058594, - "logps/rejected": -226.71029663085938, - "loss": 0.6684, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.9327359199523926, - "rewards/margins": 1.161287546157837, - "rewards/rejected": -2.0940234661102295, + "epoch": 0.45, + "learning_rate": 4.726778391870208e-07, + "logits/chosen": -2.6941001415252686, + "logits/rejected": -2.6582393646240234, + "logps/chosen": -234.7300262451172, + "logps/rejected": -265.83453369140625, + "loss": 0.6445, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6458721160888672, + "rewards/margins": 1.7076094150543213, + "rewards/rejected": -2.3534815311431885, "step": 1860 }, { - "epoch": 0.47, - "learning_rate": 4.6802733826420744e-07, - "logits/chosen": -2.618227005004883, - "logits/rejected": -2.589632272720337, - "logps/chosen": -308.71923828125, - "logps/rejected": -278.947265625, - "loss": 0.5687, + "epoch": 0.45, + "learning_rate": 4.7223212693884827e-07, + "logits/chosen": -2.834977626800537, + "logits/rejected": -2.8682796955108643, + "logps/chosen": -189.2861785888672, + "logps/rejected": -210.2560272216797, + "loss": 0.5359, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.5463954210281372, - "rewards/margins": 1.3313301801681519, - "rewards/rejected": -2.877725124359131, + "rewards/chosen": -0.7677046656608582, + "rewards/margins": 1.2189629077911377, + "rewards/rejected": -1.9866676330566406, "step": 1870 }, { - "epoch": 0.48, - "learning_rate": 4.6755921730175073e-07, - "logits/chosen": -2.6452157497406006, - "logits/rejected": -2.5336766242980957, - "logps/chosen": -346.36529541015625, - "logps/rejected": -289.3600158691406, - "loss": 0.6538, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5556385517120361, - "rewards/margins": 0.8465816378593445, - "rewards/rejected": -2.4022200107574463, + "epoch": 0.45, + "learning_rate": 4.7178641469067573e-07, + "logits/chosen": -2.799755573272705, + "logits/rejected": -2.82403302192688, + "logps/chosen": -235.48239135742188, + "logps/rejected": -256.05706787109375, + "loss": 0.6002, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6764448881149292, + "rewards/margins": 0.802793025970459, + "rewards/rejected": -1.4792379140853882, "step": 1880 }, { - "epoch": 0.48, - "learning_rate": 4.6709109633929407e-07, - "logits/chosen": -2.7492051124572754, - "logits/rejected": -2.700204610824585, - "logps/chosen": -316.11492919921875, - "logps/rejected": -313.65338134765625, - "loss": 0.6407, + "epoch": 0.45, + "learning_rate": 4.7134070244250313e-07, + "logits/chosen": -2.799698829650879, + "logits/rejected": -2.7578086853027344, + "logps/chosen": -279.86151123046875, + "logps/rejected": -227.9444122314453, + "loss": 0.4615, "rewards/accuracies": 0.75, - "rewards/chosen": -1.3001348972320557, - "rewards/margins": 1.444930911064148, - "rewards/rejected": -2.745065927505493, + "rewards/chosen": -1.2664954662322998, + "rewards/margins": 1.0705196857452393, + "rewards/rejected": -2.337015390396118, "step": 1890 }, { - "epoch": 0.48, - "learning_rate": 4.6662297537683736e-07, - "logits/chosen": -2.871032238006592, - "logits/rejected": -2.772630214691162, - "logps/chosen": -233.3348846435547, - "logps/rejected": -266.38677978515625, - "loss": 0.5614, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.9081518054008484, - "rewards/margins": 0.832781195640564, - "rewards/rejected": -1.7409330606460571, + "epoch": 0.46, + "learning_rate": 4.7089499019433053e-07, + "logits/chosen": -2.860975742340088, + "logits/rejected": -2.745042324066162, + "logps/chosen": -294.7018127441406, + "logps/rejected": -283.7228088378906, + "loss": 0.576, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1074726581573486, + "rewards/margins": 1.6157457828521729, + "rewards/rejected": -2.7232184410095215, "step": 1900 }, { - "epoch": 0.48, - "learning_rate": 4.6615485441438065e-07, - "logits/chosen": -2.6391685009002686, - "logits/rejected": -2.620103120803833, - "logps/chosen": -267.82135009765625, - "logps/rejected": -257.8954162597656, - "loss": 0.5407, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.1627651453018188, - "rewards/margins": 0.6330679059028625, - "rewards/rejected": -1.7958329916000366, + "epoch": 0.46, + "eval_logits/chosen": -2.742706537246704, + "eval_logits/rejected": -2.736884832382202, + "eval_logps/chosen": -210.7831268310547, + "eval_logps/rejected": -213.6872100830078, + "eval_loss": 0.5265913605690002, + "eval_rewards/accuracies": 0.6725000143051147, + "eval_rewards/chosen": -1.4822113513946533, + "eval_rewards/margins": 1.4472006559371948, + "eval_rewards/rejected": -2.9294116497039795, + "eval_runtime": 131.8208, + "eval_samples_per_second": 23.942, + "eval_steps_per_second": 0.379, + "step": 1900 + }, + { + "epoch": 0.46, + "learning_rate": 4.70449277946158e-07, + "logits/chosen": -2.7894601821899414, + "logits/rejected": -2.803238868713379, + "logps/chosen": -259.3186950683594, + "logps/rejected": -261.62384033203125, + "loss": 0.4885, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9366682767868042, + "rewards/margins": 0.8063037991523743, + "rewards/rejected": -1.7429721355438232, "step": 1910 }, { - "epoch": 0.49, - "learning_rate": 4.6568673345192394e-07, - "logits/chosen": -2.872558116912842, - "logits/rejected": -2.7963390350341797, - "logps/chosen": -368.4126892089844, - "logps/rejected": -305.3444519042969, - "loss": 0.6164, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.8419774770736694, - "rewards/margins": 1.7068140506744385, - "rewards/rejected": -2.5487914085388184, + "epoch": 0.46, + "learning_rate": 4.700035656979854e-07, + "logits/chosen": -2.74914813041687, + "logits/rejected": -2.785342216491699, + "logps/chosen": -282.2004699707031, + "logps/rejected": -230.36727905273438, + "loss": 0.7369, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.180564522743225, + "rewards/margins": 0.7879037857055664, + "rewards/rejected": -1.9684680700302124, "step": 1920 }, { - "epoch": 0.49, - "learning_rate": 4.652186124894673e-07, - "logits/chosen": -2.8567965030670166, - "logits/rejected": -2.6726253032684326, - "logps/chosen": -317.45062255859375, - "logps/rejected": -160.01547241210938, - "loss": 0.553, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.01207830011844635, - "rewards/margins": 1.3234484195709229, - "rewards/rejected": -1.335526704788208, + "epoch": 0.46, + "learning_rate": 4.695578534498128e-07, + "logits/chosen": -2.6837284564971924, + "logits/rejected": -2.668029308319092, + "logps/chosen": -269.26422119140625, + "logps/rejected": -305.1382141113281, + "loss": 0.6476, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3110121488571167, + "rewards/margins": 1.739222764968872, + "rewards/rejected": -2.0502350330352783, "step": 1930 }, { - "epoch": 0.49, - "learning_rate": 4.6475049152701057e-07, - "logits/chosen": -2.758852481842041, - "logits/rejected": -2.6877527236938477, - "logps/chosen": -291.3718566894531, - "logps/rejected": -226.0242919921875, - "loss": 0.4435, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.6673688292503357, - "rewards/margins": 1.2510454654693604, - "rewards/rejected": -1.9184144735336304, + "epoch": 0.47, + "learning_rate": 4.691121412016402e-07, + "logits/chosen": -2.88321852684021, + "logits/rejected": -2.7781715393066406, + "logps/chosen": -236.67385864257812, + "logps/rejected": -252.6656036376953, + "loss": 0.5415, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6674965620040894, + "rewards/margins": 0.9925910830497742, + "rewards/rejected": -1.6600875854492188, "step": 1940 }, { - "epoch": 0.49, - "learning_rate": 4.6428237056455386e-07, - "logits/chosen": -2.761021137237549, - "logits/rejected": -2.714482307434082, - "logps/chosen": -228.28213500976562, - "logps/rejected": -216.9440155029297, - "loss": 0.6122, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.011810541152954, - "rewards/margins": 0.710497260093689, - "rewards/rejected": -1.7223079204559326, + "epoch": 0.47, + "learning_rate": 4.6866642895346765e-07, + "logits/chosen": -2.849828004837036, + "logits/rejected": -2.794015407562256, + "logps/chosen": -197.5206298828125, + "logps/rejected": -210.77627563476562, + "loss": 0.5326, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1780354976654053, + "rewards/margins": 1.46303391456604, + "rewards/rejected": -2.6410696506500244, "step": 1950 }, { - "epoch": 0.5, - "learning_rate": 4.6381424960209715e-07, - "logits/chosen": -2.654102325439453, - "logits/rejected": -2.702723264694214, - "logps/chosen": -296.45733642578125, - "logps/rejected": -313.7186279296875, - "loss": 0.6241, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2756218910217285, - "rewards/margins": 0.5113860368728638, - "rewards/rejected": -1.7870079278945923, + "epoch": 0.47, + "learning_rate": 4.6822071670529506e-07, + "logits/chosen": -2.780311107635498, + "logits/rejected": -2.7946696281433105, + "logps/chosen": -272.17462158203125, + "logps/rejected": -201.8812713623047, + "loss": 0.4578, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.500429630279541, + "rewards/margins": 1.5816775560379028, + "rewards/rejected": -3.0821073055267334, "step": 1960 }, { - "epoch": 0.5, - "learning_rate": 4.6334612863964044e-07, - "logits/chosen": -2.782576322555542, - "logits/rejected": -2.6841907501220703, - "logps/chosen": -284.8516540527344, - "logps/rejected": -260.6100769042969, - "loss": 0.6017, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.1669677197933197, - "rewards/margins": 0.5671809911727905, - "rewards/rejected": -0.7341487407684326, + "epoch": 0.47, + "learning_rate": 4.6777500445712246e-07, + "logits/chosen": -2.7558159828186035, + "logits/rejected": -2.7028214931488037, + "logps/chosen": -212.8761749267578, + "logps/rejected": -187.07119750976562, + "loss": 0.5949, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6154773235321045, + "rewards/margins": 0.9239140748977661, + "rewards/rejected": -2.539391279220581, "step": 1970 }, { - "epoch": 0.5, - "learning_rate": 4.628780076771838e-07, - "logits/chosen": -2.5050246715545654, - "logits/rejected": -2.5695414543151855, - "logps/chosen": -236.9279022216797, - "logps/rejected": -214.5349578857422, - "loss": 0.5509, + "epoch": 0.48, + "learning_rate": 4.673292922089499e-07, + "logits/chosen": -2.8809754848480225, + "logits/rejected": -2.93678617477417, + "logps/chosen": -221.31167602539062, + "logps/rejected": -236.80224609375, + "loss": 0.5489, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.2865060567855835, - "rewards/margins": 1.4661260843276978, - "rewards/rejected": -1.7526321411132812, + "rewards/chosen": -0.7277089357376099, + "rewards/margins": 1.3031330108642578, + "rewards/rejected": -2.0308420658111572, "step": 1980 }, { - "epoch": 0.5, - "learning_rate": 4.6240988671472707e-07, - "logits/chosen": -2.651034355163574, - "logits/rejected": -2.722280979156494, - "logps/chosen": -177.27679443359375, - "logps/rejected": -235.3101806640625, - "loss": 0.5908, + "epoch": 0.48, + "learning_rate": 4.668835799607773e-07, + "logits/chosen": -2.600198984146118, + "logits/rejected": -2.7005727291107178, + "logps/chosen": -244.8251190185547, + "logps/rejected": -224.99520874023438, + "loss": 0.5634, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.22419700026512146, - "rewards/margins": 1.5160796642303467, - "rewards/rejected": -1.7402766942977905, + "rewards/chosen": -0.6404024958610535, + "rewards/margins": 1.7106231451034546, + "rewards/rejected": -2.3510258197784424, "step": 1990 }, { - "epoch": 0.51, - "learning_rate": 4.6194176575227036e-07, - "logits/chosen": -2.5649561882019043, - "logits/rejected": -2.4804835319519043, - "logps/chosen": -387.7535095214844, - "logps/rejected": -289.9999694824219, - "loss": 0.6403, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.967598557472229, - "rewards/margins": 0.8338910937309265, - "rewards/rejected": -1.8014894723892212, + "epoch": 0.48, + "learning_rate": 4.664378677126047e-07, + "logits/chosen": -2.6169991493225098, + "logits/rejected": -2.5494322776794434, + "logps/chosen": -409.3736877441406, + "logps/rejected": -323.9679260253906, + "loss": 1.2064, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9442112445831299, + "rewards/margins": 2.7875208854675293, + "rewards/rejected": -4.731732368469238, "step": 2000 }, { - "epoch": 0.51, - "learning_rate": 4.6147364478981365e-07, - "logits/chosen": -2.6053478717803955, - "logits/rejected": -2.6039249897003174, - "logps/chosen": -218.8382110595703, - "logps/rejected": -216.4173126220703, - "loss": 0.5884, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.8255708813667297, - "rewards/margins": 0.8683991432189941, - "rewards/rejected": -1.6939697265625, + "epoch": 0.48, + "eval_logits/chosen": -2.695667028427124, + "eval_logits/rejected": -2.677318811416626, + "eval_logps/chosen": -221.4541778564453, + "eval_logps/rejected": -222.01815795898438, + "eval_loss": 0.5538309812545776, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -2.5493154525756836, + "eval_rewards/margins": 1.2131924629211426, + "eval_rewards/rejected": -3.762507915496826, + "eval_runtime": 131.7535, + "eval_samples_per_second": 23.954, + "eval_steps_per_second": 0.379, + "step": 2000 + }, + { + "epoch": 0.48, + "learning_rate": 4.659921554644322e-07, + "logits/chosen": -2.7399284839630127, + "logits/rejected": -2.78826904296875, + "logps/chosen": -226.072509765625, + "logps/rejected": -145.2623748779297, + "loss": 0.59, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.391592025756836, + "rewards/margins": 1.1602134704589844, + "rewards/rejected": -2.5518057346343994, "step": 2010 }, { - "epoch": 0.51, - "learning_rate": 4.61005523827357e-07, - "logits/chosen": -2.6284866333007812, - "logits/rejected": -2.6124508380889893, - "logps/chosen": -302.99798583984375, - "logps/rejected": -240.9373779296875, - "loss": 0.5455, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.18973544239997864, - "rewards/margins": 1.3795931339263916, - "rewards/rejected": -1.5693285465240479, + "epoch": 0.49, + "learning_rate": 4.655464432162596e-07, + "logits/chosen": -2.913388252258301, + "logits/rejected": -2.7955703735351562, + "logps/chosen": -267.78546142578125, + "logps/rejected": -222.69186401367188, + "loss": 0.532, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4486467838287354, + "rewards/margins": 1.6511014699935913, + "rewards/rejected": -3.099748373031616, "step": 2020 }, { - "epoch": 0.51, - "learning_rate": 4.605374028649003e-07, - "logits/chosen": -2.6844563484191895, - "logits/rejected": -2.695885181427002, - "logps/chosen": -267.65289306640625, - "logps/rejected": -275.90753173828125, - "loss": 0.524, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.9961126446723938, - "rewards/margins": 1.043602705001831, - "rewards/rejected": -2.03971529006958, + "epoch": 0.49, + "learning_rate": 4.65100730968087e-07, + "logits/chosen": -2.7484095096588135, + "logits/rejected": -2.8469882011413574, + "logps/chosen": -256.9844970703125, + "logps/rejected": -247.5992431640625, + "loss": 0.5606, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1596620082855225, + "rewards/margins": 1.5145039558410645, + "rewards/rejected": -3.674165725708008, "step": 2030 }, { - "epoch": 0.52, - "learning_rate": 4.600692819024436e-07, - "logits/chosen": -2.759692907333374, - "logits/rejected": -2.5767171382904053, - "logps/chosen": -397.1188049316406, - "logps/rejected": -295.84710693359375, - "loss": 0.5585, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.6714985966682434, - "rewards/margins": 1.577000379562378, - "rewards/rejected": -2.2484986782073975, + "epoch": 0.49, + "learning_rate": 4.6465501871991444e-07, + "logits/chosen": -2.858558177947998, + "logits/rejected": -2.793433904647827, + "logps/chosen": -282.8448486328125, + "logps/rejected": -230.8045654296875, + "loss": 0.6676, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4471412897109985, + "rewards/margins": 1.071502685546875, + "rewards/rejected": -2.518644094467163, "step": 2040 }, { - "epoch": 0.52, - "learning_rate": 4.5960116093998686e-07, - "logits/chosen": -2.511993885040283, - "logits/rejected": -2.5586748123168945, - "logps/chosen": -247.68722534179688, - "logps/rejected": -325.49945068359375, - "loss": 0.4833, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3887598514556885, - "rewards/margins": 1.375656008720398, - "rewards/rejected": -2.7644155025482178, + "epoch": 0.49, + "learning_rate": 4.6420930647174184e-07, + "logits/chosen": -2.774423599243164, + "logits/rejected": -2.7939794063568115, + "logps/chosen": -274.24981689453125, + "logps/rejected": -230.8719940185547, + "loss": 0.6243, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4280846118927002, + "rewards/margins": 0.4704992175102234, + "rewards/rejected": -1.8985836505889893, "step": 2050 }, { - "epoch": 0.52, - "learning_rate": 4.5913303997753015e-07, - "logits/chosen": -2.638223648071289, - "logits/rejected": -2.7245888710021973, - "logps/chosen": -197.88986206054688, - "logps/rejected": -289.672119140625, - "loss": 0.5621, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2302311658859253, - "rewards/margins": 1.2392613887786865, - "rewards/rejected": -2.4694924354553223, + "epoch": 0.5, + "learning_rate": 4.6376359422356924e-07, + "logits/chosen": -2.8167576789855957, + "logits/rejected": -2.804478168487549, + "logps/chosen": -283.3070068359375, + "logps/rejected": -309.2830505371094, + "loss": 0.6032, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.7114044427871704, + "rewards/margins": 0.5315491557121277, + "rewards/rejected": -2.2429535388946533, "step": 2060 }, { - "epoch": 0.52, - "learning_rate": 4.586649190150735e-07, - "logits/chosen": -2.5059456825256348, - "logits/rejected": -2.5340614318847656, - "logps/chosen": -299.3819885253906, - "logps/rejected": -259.20843505859375, - "loss": 0.5224, + "epoch": 0.5, + "learning_rate": 4.633178819753967e-07, + "logits/chosen": -2.8849430084228516, + "logits/rejected": -2.9031262397766113, + "logps/chosen": -246.83230590820312, + "logps/rejected": -280.3484191894531, + "loss": 0.5387, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.5727068185806274, - "rewards/margins": 1.2742458581924438, - "rewards/rejected": -2.8469526767730713, + "rewards/chosen": -1.2949464321136475, + "rewards/margins": 1.064544439315796, + "rewards/rejected": -2.3594908714294434, "step": 2070 }, { - "epoch": 0.53, - "learning_rate": 4.581967980526168e-07, - "logits/chosen": -2.6917359828948975, - "logits/rejected": -2.831451892852783, - "logps/chosen": -224.42752075195312, - "logps/rejected": -268.67498779296875, - "loss": 0.6184, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5647084712982178, - "rewards/margins": 0.5590938925743103, - "rewards/rejected": -2.123802661895752, + "epoch": 0.5, + "learning_rate": 4.628721697272241e-07, + "logits/chosen": -2.7488393783569336, + "logits/rejected": -2.71333384513855, + "logps/chosen": -247.80874633789062, + "logps/rejected": -228.5020294189453, + "loss": 0.7488, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.928693175315857, + "rewards/margins": 1.8095123767852783, + "rewards/rejected": -3.7382054328918457, "step": 2080 }, { - "epoch": 0.53, - "learning_rate": 4.5772867709016007e-07, - "logits/chosen": -2.5519332885742188, - "logits/rejected": -2.6108298301696777, - "logps/chosen": -205.2392578125, - "logps/rejected": -281.6006774902344, - "loss": 0.5099, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.9697121381759644, - "rewards/margins": 1.5353702306747437, - "rewards/rejected": -2.505082607269287, + "epoch": 0.5, + "learning_rate": 4.624264574790515e-07, + "logits/chosen": -2.849909782409668, + "logits/rejected": -2.7705376148223877, + "logps/chosen": -256.9490966796875, + "logps/rejected": -240.8095703125, + "loss": 0.5628, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.483638048171997, + "rewards/margins": 1.2936012744903564, + "rewards/rejected": -2.7772390842437744, "step": 2090 }, { - "epoch": 0.53, - "learning_rate": 4.5726055612770336e-07, - "logits/chosen": -2.7195792198181152, - "logits/rejected": -2.6307480335235596, - "logps/chosen": -257.200439453125, - "logps/rejected": -234.48812866210938, - "loss": 0.5728, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.9558160901069641, - "rewards/margins": 1.1950876712799072, - "rewards/rejected": -2.1509037017822266, + "epoch": 0.51, + "learning_rate": 4.619807452308789e-07, + "logits/chosen": -2.7179818153381348, + "logits/rejected": -2.7097508907318115, + "logps/chosen": -298.8465881347656, + "logps/rejected": -255.6627655029297, + "loss": 0.5751, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2503429651260376, + "rewards/margins": 0.8806318044662476, + "rewards/rejected": -2.130974769592285, "step": 2100 }, { - "epoch": 0.53, - "learning_rate": 4.567924351652467e-07, - "logits/chosen": -2.538254737854004, - "logits/rejected": -2.5346908569335938, - "logps/chosen": -212.212646484375, - "logps/rejected": -295.8035888671875, - "loss": 0.7148, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.9982824325561523, - "rewards/margins": 0.681343674659729, - "rewards/rejected": -2.679626226425171, + "epoch": 0.51, + "eval_logits/chosen": -2.665705919265747, + "eval_logits/rejected": -2.6489739418029785, + "eval_logps/chosen": -215.20672607421875, + "eval_logps/rejected": -215.87278747558594, + "eval_loss": 0.5464906096458435, + "eval_rewards/accuracies": 0.6424999833106995, + "eval_rewards/chosen": -1.9245682954788208, + "eval_rewards/margins": 1.223402976989746, + "eval_rewards/rejected": -3.1479713916778564, + "eval_runtime": 131.6773, + "eval_samples_per_second": 23.968, + "eval_steps_per_second": 0.38, + "step": 2100 + }, + { + "epoch": 0.51, + "learning_rate": 4.6153503298270636e-07, + "logits/chosen": -2.794595718383789, + "logits/rejected": -2.8312158584594727, + "logps/chosen": -301.3906555175781, + "logps/rejected": -344.0379943847656, + "loss": 0.5545, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.46836644411087036, + "rewards/margins": 1.3530237674713135, + "rewards/rejected": -1.821390151977539, "step": 2110 }, { - "epoch": 0.54, - "learning_rate": 4.5632431420279e-07, - "logits/chosen": -2.4563584327697754, - "logits/rejected": -2.542332649230957, - "logps/chosen": -251.83627319335938, - "logps/rejected": -282.85302734375, - "loss": 0.4758, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.539489507675171, - "rewards/margins": 1.854832410812378, - "rewards/rejected": -3.394321918487549, + "epoch": 0.51, + "learning_rate": 4.6108932073453377e-07, + "logits/chosen": -2.8825814723968506, + "logits/rejected": -2.8447299003601074, + "logps/chosen": -377.0404357910156, + "logps/rejected": -297.96148681640625, + "loss": 0.5454, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5071232318878174, + "rewards/margins": 0.7904442548751831, + "rewards/rejected": -2.297567367553711, "step": 2120 }, { - "epoch": 0.54, - "learning_rate": 4.5585619324033333e-07, - "logits/chosen": -2.710216999053955, - "logits/rejected": -2.6763594150543213, - "logps/chosen": -289.79974365234375, - "logps/rejected": -273.98248291015625, - "loss": 0.5503, + "epoch": 0.51, + "learning_rate": 4.6064360848636117e-07, + "logits/chosen": -2.8405051231384277, + "logits/rejected": -2.741436719894409, + "logps/chosen": -268.03302001953125, + "logps/rejected": -333.5805358886719, + "loss": 0.6575, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3079802989959717, - "rewards/margins": 1.1398521661758423, - "rewards/rejected": -2.4478325843811035, + "rewards/chosen": -1.1949812173843384, + "rewards/margins": 0.4733108878135681, + "rewards/rejected": -1.6682920455932617, "step": 2130 }, { - "epoch": 0.54, - "learning_rate": 4.5538807227787657e-07, - "logits/chosen": -2.601003408432007, - "logits/rejected": -2.473191976547241, - "logps/chosen": -419.03216552734375, - "logps/rejected": -315.4063415527344, - "loss": 0.523, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.6389806866645813, - "rewards/margins": 1.801963210105896, - "rewards/rejected": -2.440943956375122, + "epoch": 0.52, + "learning_rate": 4.601978962381886e-07, + "logits/chosen": -2.8221044540405273, + "logits/rejected": -2.810089349746704, + "logps/chosen": -246.90478515625, + "logps/rejected": -255.42385864257812, + "loss": 0.5297, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9667257070541382, + "rewards/margins": 0.7890064716339111, + "rewards/rejected": -1.7557321786880493, "step": 2140 }, { - "epoch": 0.54, - "learning_rate": 4.5491995131541986e-07, - "logits/chosen": -2.6831939220428467, - "logits/rejected": -2.7121834754943848, - "logps/chosen": -205.4101104736328, - "logps/rejected": -221.7682647705078, - "loss": 0.5012, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.1309032440185547, - "rewards/margins": 1.0398448705673218, - "rewards/rejected": -2.170748233795166, + "epoch": 0.52, + "learning_rate": 4.5975218399001603e-07, + "logits/chosen": -2.725141763687134, + "logits/rejected": -2.7341232299804688, + "logps/chosen": -252.95870971679688, + "logps/rejected": -292.8521423339844, + "loss": 0.8055, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.9205362796783447, + "rewards/margins": 0.22402307391166687, + "rewards/rejected": -2.144559621810913, "step": 2150 }, { - "epoch": 0.55, - "learning_rate": 4.544518303529632e-07, - "logits/chosen": -2.668999671936035, - "logits/rejected": -2.5639638900756836, - "logps/chosen": -318.5812072753906, - "logps/rejected": -237.9522247314453, - "loss": 0.661, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.829673409461975, - "rewards/margins": 1.336219072341919, - "rewards/rejected": -3.1658921241760254, + "epoch": 0.52, + "learning_rate": 4.5930647174184343e-07, + "logits/chosen": -2.7981925010681152, + "logits/rejected": -2.8179993629455566, + "logps/chosen": -263.3072204589844, + "logps/rejected": -263.17120361328125, + "loss": 0.4986, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6311888694763184, + "rewards/margins": 1.5977652072906494, + "rewards/rejected": -2.2289538383483887, "step": 2160 }, { - "epoch": 0.55, - "learning_rate": 4.539837093905065e-07, - "logits/chosen": -2.5789854526519775, - "logits/rejected": -2.602631092071533, - "logps/chosen": -325.195556640625, - "logps/rejected": -290.55926513671875, - "loss": 0.5443, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.8139760494232178, - "rewards/margins": 0.6441632509231567, - "rewards/rejected": -3.458139419555664, + "epoch": 0.52, + "learning_rate": 4.588607594936709e-07, + "logits/chosen": -2.7846240997314453, + "logits/rejected": -2.781731128692627, + "logps/chosen": -234.5958709716797, + "logps/rejected": -270.65582275390625, + "loss": 0.5915, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -1.8767642974853516, + "rewards/margins": 0.9233253598213196, + "rewards/rejected": -2.8000893592834473, "step": 2170 }, { - "epoch": 0.55, - "learning_rate": 4.535155884280498e-07, - "logits/chosen": -2.5557656288146973, - "logits/rejected": -2.5971879959106445, - "logps/chosen": -281.81121826171875, - "logps/rejected": -281.9815673828125, - "loss": 0.5885, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.4063057899475098, - "rewards/margins": 0.7042945623397827, - "rewards/rejected": -3.110599994659424, + "epoch": 0.52, + "learning_rate": 4.584150472454983e-07, + "logits/chosen": -2.654686212539673, + "logits/rejected": -2.612287998199463, + "logps/chosen": -185.30715942382812, + "logps/rejected": -178.09970092773438, + "loss": 0.5742, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6409145593643188, + "rewards/margins": 1.7053394317626953, + "rewards/rejected": -3.3462538719177246, "step": 2180 }, { - "epoch": 0.55, - "learning_rate": 4.5304746746559307e-07, - "logits/chosen": -2.807870864868164, - "logits/rejected": -2.7482285499572754, - "logps/chosen": -346.4349670410156, - "logps/rejected": -309.8635559082031, - "loss": 0.6759, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.917128324508667, - "rewards/margins": 0.6684790253639221, - "rewards/rejected": -2.5856072902679443, + "epoch": 0.53, + "learning_rate": 4.579693349973257e-07, + "logits/chosen": -2.7771549224853516, + "logits/rejected": -2.819648265838623, + "logps/chosen": -181.3798370361328, + "logps/rejected": -208.90597534179688, + "loss": 0.5593, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1666500568389893, + "rewards/margins": 2.3403663635253906, + "rewards/rejected": -3.50701642036438, "step": 2190 }, { - "epoch": 0.56, - "learning_rate": 4.525793465031364e-07, - "logits/chosen": -2.6755623817443848, - "logits/rejected": -2.6319668292999268, - "logps/chosen": -365.0047302246094, - "logps/rejected": -225.29183959960938, - "loss": 0.5749, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.633305311203003, - "rewards/margins": 1.0927493572235107, - "rewards/rejected": -2.7260546684265137, + "epoch": 0.53, + "learning_rate": 4.5752362274915315e-07, + "logits/chosen": -2.8941094875335693, + "logits/rejected": -2.838167190551758, + "logps/chosen": -225.2128143310547, + "logps/rejected": -205.5693359375, + "loss": 0.4757, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5870344638824463, + "rewards/margins": 1.3279521465301514, + "rewards/rejected": -2.9149863719940186, "step": 2200 }, { - "epoch": 0.56, - "learning_rate": 4.521112255406797e-07, - "logits/chosen": -2.551112413406372, - "logits/rejected": -2.5428919792175293, - "logps/chosen": -232.1934356689453, - "logps/rejected": -289.0122375488281, - "loss": 0.8049, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.458327054977417, - "rewards/margins": 1.503689169883728, - "rewards/rejected": -2.9620163440704346, + "epoch": 0.53, + "eval_logits/chosen": -2.71147084236145, + "eval_logits/rejected": -2.6881513595581055, + "eval_logps/chosen": -214.4038543701172, + "eval_logps/rejected": -215.9462127685547, + "eval_loss": 0.5297456979751587, + "eval_rewards/accuracies": 0.6324999928474426, + "eval_rewards/chosen": -1.8442835807800293, + "eval_rewards/margins": 1.311030626296997, + "eval_rewards/rejected": -3.1553144454956055, + "eval_runtime": 131.8793, + "eval_samples_per_second": 23.931, + "eval_steps_per_second": 0.379, + "step": 2200 + }, + { + "epoch": 0.53, + "learning_rate": 4.5707791050098055e-07, + "logits/chosen": -2.782172679901123, + "logits/rejected": -2.808525323867798, + "logps/chosen": -302.11431884765625, + "logps/rejected": -256.6675109863281, + "loss": 0.5994, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09623777866363525, + "rewards/margins": 1.9226150512695312, + "rewards/rejected": -2.018852472305298, "step": 2210 }, { - "epoch": 0.56, - "learning_rate": 4.51643104578223e-07, - "logits/chosen": -2.6861534118652344, - "logits/rejected": -2.6878790855407715, - "logps/chosen": -263.35772705078125, - "logps/rejected": -240.6010284423828, - "loss": 0.5831, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.3718329966068268, - "rewards/margins": 1.8208471536636353, - "rewards/rejected": -2.1926798820495605, + "epoch": 0.53, + "learning_rate": 4.5663219825280795e-07, + "logits/chosen": -2.832003116607666, + "logits/rejected": -2.806697130203247, + "logps/chosen": -247.9135284423828, + "logps/rejected": -220.4112548828125, + "loss": 0.7586, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.977599024772644, + "rewards/margins": 0.6515631675720215, + "rewards/rejected": -1.6291621923446655, "step": 2220 }, { - "epoch": 0.56, - "learning_rate": 4.511749836157663e-07, - "logits/chosen": -2.6632721424102783, - "logits/rejected": -2.5350358486175537, - "logps/chosen": -270.24200439453125, - "logps/rejected": -227.2047882080078, - "loss": 0.6363, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.4736149311065674, - "rewards/margins": 1.2423803806304932, - "rewards/rejected": -2.7159953117370605, + "epoch": 0.54, + "learning_rate": 4.561864860046354e-07, + "logits/chosen": -2.8176069259643555, + "logits/rejected": -2.8338370323181152, + "logps/chosen": -212.6001434326172, + "logps/rejected": -167.97206115722656, + "loss": 0.5817, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5276968479156494, + "rewards/margins": 0.6183308959007263, + "rewards/rejected": -2.1460278034210205, "step": 2230 }, { - "epoch": 0.57, - "learning_rate": 4.507068626533096e-07, - "logits/chosen": -2.6859564781188965, - "logits/rejected": -2.616516351699829, - "logps/chosen": -337.3983459472656, - "logps/rejected": -304.076904296875, - "loss": 0.6378, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.1171061992645264, - "rewards/margins": 1.7147471904754639, - "rewards/rejected": -2.8318533897399902, + "epoch": 0.54, + "learning_rate": 4.557407737564628e-07, + "logits/chosen": -2.744023084640503, + "logits/rejected": -2.7330851554870605, + "logps/chosen": -202.021484375, + "logps/rejected": -212.80172729492188, + "loss": 0.4718, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.0353754758834839, + "rewards/margins": 1.3258020877838135, + "rewards/rejected": -2.361177682876587, "step": 2240 }, { - "epoch": 0.57, - "learning_rate": 4.502387416908529e-07, - "logits/chosen": -2.7045998573303223, - "logits/rejected": -2.6133079528808594, - "logps/chosen": -452.3338928222656, - "logps/rejected": -380.03009033203125, - "loss": 0.5329, + "epoch": 0.54, + "learning_rate": 4.552950615082902e-07, + "logits/chosen": -2.6584181785583496, + "logits/rejected": -2.6827573776245117, + "logps/chosen": -236.74026489257812, + "logps/rejected": -284.8752746582031, + "loss": 0.6218, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.8934322595596313, - "rewards/margins": 2.0240795612335205, - "rewards/rejected": -2.9175117015838623, + "rewards/chosen": -0.37710869312286377, + "rewards/margins": 1.659280776977539, + "rewards/rejected": -2.0363893508911133, "step": 2250 }, { - "epoch": 0.57, - "learning_rate": 4.497706207283962e-07, - "logits/chosen": -2.5197484493255615, - "logits/rejected": -2.619424819946289, - "logps/chosen": -271.44000244140625, - "logps/rejected": -364.75384521484375, - "loss": 0.5912, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.892828345298767, - "rewards/margins": 0.9958688616752625, - "rewards/rejected": -2.888697385787964, + "epoch": 0.54, + "learning_rate": 4.548493492601176e-07, + "logits/chosen": -3.0061564445495605, + "logits/rejected": -2.9000630378723145, + "logps/chosen": -276.2821960449219, + "logps/rejected": -293.7289733886719, + "loss": 0.5003, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7436341047286987, + "rewards/margins": 2.0284717082977295, + "rewards/rejected": -2.7721059322357178, "step": 2260 }, { - "epoch": 0.57, - "learning_rate": 4.493024997659395e-07, - "logits/chosen": -2.718838930130005, - "logits/rejected": -2.617269515991211, - "logps/chosen": -298.60443115234375, - "logps/rejected": -247.50552368164062, - "loss": 0.4695, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.534011960029602, - "rewards/margins": 1.599219560623169, - "rewards/rejected": -2.1332316398620605, + "epoch": 0.55, + "learning_rate": 4.544036370119451e-07, + "logits/chosen": -3.0206172466278076, + "logits/rejected": -2.9449515342712402, + "logps/chosen": -422.12054443359375, + "logps/rejected": -345.63836669921875, + "loss": 0.4699, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1289310455322266, + "rewards/margins": 1.121739387512207, + "rewards/rejected": -2.2506701946258545, "step": 2270 }, { - "epoch": 0.58, - "learning_rate": 4.488343788034828e-07, - "logits/chosen": -2.6473498344421387, - "logits/rejected": -2.563596487045288, - "logps/chosen": -200.87728881835938, - "logps/rejected": -181.6173858642578, - "loss": 0.5965, + "epoch": 0.55, + "learning_rate": 4.539579247637725e-07, + "logits/chosen": -2.775986671447754, + "logits/rejected": -2.739438533782959, + "logps/chosen": -271.61370849609375, + "logps/rejected": -237.7899169921875, + "loss": 0.5473, "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.9050285220146179, - "rewards/margins": 1.7684764862060547, - "rewards/rejected": -2.6735050678253174, + "rewards/chosen": -0.3526901602745056, + "rewards/margins": 1.7576888799667358, + "rewards/rejected": -2.110379457473755, "step": 2280 }, { - "epoch": 0.58, - "learning_rate": 4.483662578410261e-07, - "logits/chosen": -2.6025519371032715, - "logits/rejected": -2.5912060737609863, - "logps/chosen": -240.58151245117188, - "logps/rejected": -267.31304931640625, - "loss": 0.5615, + "epoch": 0.55, + "learning_rate": 4.535122125155999e-07, + "logits/chosen": -2.573038339614868, + "logits/rejected": -2.503798723220825, + "logps/chosen": -358.23394775390625, + "logps/rejected": -318.8902893066406, + "loss": 0.7367, "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.6238843202590942, - "rewards/margins": 2.195256471633911, - "rewards/rejected": -2.819140672683716, + "rewards/chosen": -1.6468093395233154, + "rewards/margins": 0.8769130706787109, + "rewards/rejected": -2.5237224102020264, "step": 2290 }, { - "epoch": 0.58, - "learning_rate": 4.478981368785694e-07, - "logits/chosen": -2.5795750617980957, - "logits/rejected": -2.5501458644866943, - "logps/chosen": -216.99716186523438, - "logps/rejected": -218.18185424804688, - "loss": 0.5537, + "epoch": 0.55, + "learning_rate": 4.5306650026742734e-07, + "logits/chosen": -2.874817371368408, + "logits/rejected": -2.7666258811950684, + "logps/chosen": -251.40005493164062, + "logps/rejected": -196.70022583007812, + "loss": 0.4771, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.6390549540519714, - "rewards/margins": 1.9194520711898804, - "rewards/rejected": -2.5585074424743652, + "rewards/chosen": -2.5447452068328857, + "rewards/margins": 1.531328797340393, + "rewards/rejected": -4.07607364654541, "step": 2300 }, { - "epoch": 0.58, - "learning_rate": 4.474300159161127e-07, - "logits/chosen": -2.545380115509033, - "logits/rejected": -2.4202880859375, - "logps/chosen": -211.42227172851562, - "logps/rejected": -239.5775146484375, - "loss": 0.5645, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.7160159349441528, - "rewards/margins": 0.9380934834480286, - "rewards/rejected": -1.654109239578247, + "epoch": 0.55, + "eval_logits/chosen": -2.6622838973999023, + "eval_logits/rejected": -2.6414976119995117, + "eval_logps/chosen": -219.30126953125, + "eval_logps/rejected": -221.83599853515625, + "eval_loss": 0.5386084318161011, + "eval_rewards/accuracies": 0.6499999761581421, + "eval_rewards/chosen": -2.3340256214141846, + "eval_rewards/margins": 1.410265326499939, + "eval_rewards/rejected": -3.744290828704834, + "eval_runtime": 131.6348, + "eval_samples_per_second": 23.975, + "eval_steps_per_second": 0.38, + "step": 2300 + }, + { + "epoch": 0.56, + "learning_rate": 4.5262078801925474e-07, + "logits/chosen": -2.735877275466919, + "logits/rejected": -2.710038900375366, + "logps/chosen": -203.05795288085938, + "logps/rejected": -202.85787963867188, + "loss": 0.5453, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7748769521713257, + "rewards/margins": 0.8943048715591431, + "rewards/rejected": -2.6691815853118896, "step": 2310 }, { - "epoch": 0.59, - "learning_rate": 4.46961894953656e-07, - "logits/chosen": -2.40726900100708, - "logits/rejected": -2.431823253631592, - "logps/chosen": -223.0937042236328, - "logps/rejected": -196.89883422851562, - "loss": 0.6963, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.6814838647842407, - "rewards/margins": 0.3216429352760315, - "rewards/rejected": -2.003127098083496, + "epoch": 0.56, + "learning_rate": 4.5217507577108214e-07, + "logits/chosen": -2.892054557800293, + "logits/rejected": -2.8468434810638428, + "logps/chosen": -214.32424926757812, + "logps/rejected": -244.46792602539062, + "loss": 0.5889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.122611403465271, + "rewards/margins": 1.3384064435958862, + "rewards/rejected": -2.4610178470611572, "step": 2320 }, { - "epoch": 0.59, - "learning_rate": 4.4649377399119933e-07, - "logits/chosen": -2.835279703140259, - "logits/rejected": -2.740386486053467, - "logps/chosen": -321.75152587890625, - "logps/rejected": -288.8874206542969, - "loss": 0.5352, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.5283945798873901, - "rewards/margins": 2.2066421508789062, - "rewards/rejected": -2.735036611557007, + "epoch": 0.56, + "learning_rate": 4.517293635229096e-07, + "logits/chosen": -2.9242873191833496, + "logits/rejected": -2.861582040786743, + "logps/chosen": -241.2045440673828, + "logps/rejected": -207.21731567382812, + "loss": 0.5296, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.547773540019989, + "rewards/margins": 1.613692045211792, + "rewards/rejected": -2.161465644836426, "step": 2330 }, { - "epoch": 0.59, - "learning_rate": 4.460256530287426e-07, - "logits/chosen": -2.5671308040618896, - "logits/rejected": -2.5428531169891357, - "logps/chosen": -403.35406494140625, - "logps/rejected": -291.94573974609375, - "loss": 0.5566, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.4538414478302002, - "rewards/margins": 0.6209135055541992, - "rewards/rejected": -2.0747549533843994, + "epoch": 0.56, + "learning_rate": 4.51283651274737e-07, + "logits/chosen": -2.631446361541748, + "logits/rejected": -2.59982967376709, + "logps/chosen": -237.553466796875, + "logps/rejected": -242.0279541015625, + "loss": 0.472, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8881114721298218, + "rewards/margins": 2.5885801315307617, + "rewards/rejected": -4.476691722869873, "step": 2340 }, { - "epoch": 0.59, - "learning_rate": 4.4555753206628596e-07, - "logits/chosen": -2.6901965141296387, - "logits/rejected": -2.624437093734741, - "logps/chosen": -179.4101104736328, - "logps/rejected": -185.3762969970703, - "loss": 0.7162, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.7822803258895874, - "rewards/margins": 1.7482283115386963, - "rewards/rejected": -2.530508279800415, + "epoch": 0.57, + "learning_rate": 4.508379390265644e-07, + "logits/chosen": -2.764129877090454, + "logits/rejected": -2.721900224685669, + "logps/chosen": -200.16578674316406, + "logps/rejected": -174.96267700195312, + "loss": 0.5488, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4990546703338623, + "rewards/margins": 1.1366833448410034, + "rewards/rejected": -2.635737895965576, "step": 2350 }, { - "epoch": 0.6, - "learning_rate": 4.450894111038292e-07, - "logits/chosen": -2.7932639122009277, - "logits/rejected": -2.770897626876831, - "logps/chosen": -296.88995361328125, - "logps/rejected": -244.643310546875, - "loss": 0.5341, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1266002655029297, - "rewards/margins": 1.2736326456069946, - "rewards/rejected": -2.400233030319214, + "epoch": 0.57, + "learning_rate": 4.5039222677839186e-07, + "logits/chosen": -2.8360979557037354, + "logits/rejected": -2.86533784866333, + "logps/chosen": -338.47589111328125, + "logps/rejected": -310.8025817871094, + "loss": 0.448, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7962379455566406, + "rewards/margins": 1.0437629222869873, + "rewards/rejected": -2.840000867843628, "step": 2360 }, { - "epoch": 0.6, - "learning_rate": 4.446212901413725e-07, - "logits/chosen": -2.3377389907836914, - "logits/rejected": -2.5143706798553467, - "logps/chosen": -312.0408630371094, - "logps/rejected": -325.20440673828125, - "loss": 0.6169, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.4157869815826416, - "rewards/margins": 0.7878390550613403, - "rewards/rejected": -2.2036261558532715, + "epoch": 0.57, + "learning_rate": 4.4994651453021926e-07, + "logits/chosen": -2.663184404373169, + "logits/rejected": -2.689486026763916, + "logps/chosen": -237.29244995117188, + "logps/rejected": -247.76034545898438, + "loss": 0.4726, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2554551362991333, + "rewards/margins": 2.2178115844726562, + "rewards/rejected": -3.473267078399658, "step": 2370 }, { - "epoch": 0.6, - "learning_rate": 4.4415316917891583e-07, - "logits/chosen": -2.5377869606018066, - "logits/rejected": -2.651761293411255, - "logps/chosen": -188.7794647216797, - "logps/rejected": -275.84033203125, - "loss": 0.6268, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.4358875751495361, - "rewards/margins": 0.8894082307815552, - "rewards/rejected": -2.3252956867218018, + "epoch": 0.57, + "learning_rate": 4.4950080228204666e-07, + "logits/chosen": -2.7717154026031494, + "logits/rejected": -2.7869515419006348, + "logps/chosen": -328.4939880371094, + "logps/rejected": -348.368408203125, + "loss": 0.5404, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9268843531608582, + "rewards/margins": 2.2614705562591553, + "rewards/rejected": -3.1883554458618164, "step": 2380 }, { - "epoch": 0.6, - "learning_rate": 4.436850482164591e-07, - "logits/chosen": -2.949428081512451, - "logits/rejected": -2.8552987575531006, - "logps/chosen": -290.80780029296875, - "logps/rejected": -259.5678405761719, - "loss": 0.546, + "epoch": 0.58, + "learning_rate": 4.490550900338741e-07, + "logits/chosen": -2.850925922393799, + "logits/rejected": -2.685128688812256, + "logps/chosen": -231.131103515625, + "logps/rejected": -229.58480834960938, + "loss": 0.4294, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.5419038534164429, - "rewards/margins": 1.150870442390442, - "rewards/rejected": -1.6927744150161743, + "rewards/chosen": -1.2299256324768066, + "rewards/margins": 1.9218822717666626, + "rewards/rejected": -3.151808261871338, "step": 2390 }, { - "epoch": 0.61, - "learning_rate": 4.432169272540024e-07, - "logits/chosen": -2.7359933853149414, - "logits/rejected": -2.7360599040985107, - "logps/chosen": -278.16497802734375, - "logps/rejected": -320.8052673339844, - "loss": 0.6529, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.9255379438400269, - "rewards/margins": 0.9193289875984192, - "rewards/rejected": -1.8448671102523804, + "epoch": 0.58, + "learning_rate": 4.486093777857015e-07, + "logits/chosen": -2.718325138092041, + "logits/rejected": -2.5954809188842773, + "logps/chosen": -186.8014373779297, + "logps/rejected": -204.5731964111328, + "loss": 0.481, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.9186789393424988, + "rewards/margins": 2.4303152561187744, + "rewards/rejected": -3.348994493484497, "step": 2400 }, { - "epoch": 0.61, - "learning_rate": 4.427488062915457e-07, - "logits/chosen": -2.6792635917663574, - "logits/rejected": -2.724548816680908, - "logps/chosen": -227.6698760986328, - "logps/rejected": -268.72247314453125, - "loss": 0.6107, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.18764783442020416, - "rewards/margins": 1.3458220958709717, - "rewards/rejected": -1.533469557762146, + "epoch": 0.58, + "eval_logits/chosen": -2.6292777061462402, + "eval_logits/rejected": -2.6073172092437744, + "eval_logps/chosen": -212.0459747314453, + "eval_logps/rejected": -215.19302368164062, + "eval_loss": 0.5355206727981567, + "eval_rewards/accuracies": 0.6549999713897705, + "eval_rewards/chosen": -1.6084953546524048, + "eval_rewards/margins": 1.4714986085891724, + "eval_rewards/rejected": -3.079993963241577, + "eval_runtime": 132.0355, + "eval_samples_per_second": 23.903, + "eval_steps_per_second": 0.379, + "step": 2400 + }, + { + "epoch": 0.58, + "learning_rate": 4.481636655375289e-07, + "logits/chosen": -2.8074159622192383, + "logits/rejected": -2.76961612701416, + "logps/chosen": -287.4902648925781, + "logps/rejected": -326.78326416015625, + "loss": 0.6271, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7717764973640442, + "rewards/margins": 1.5434802770614624, + "rewards/rejected": -2.3152568340301514, "step": 2410 }, { - "epoch": 0.61, - "learning_rate": 4.4228068532908904e-07, - "logits/chosen": -2.506946086883545, - "logits/rejected": -2.5913844108581543, - "logps/chosen": -218.8012237548828, - "logps/rejected": -284.5220642089844, - "loss": 0.5946, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.7076995372772217, - "rewards/margins": 1.5795643329620361, - "rewards/rejected": -3.287263870239258, + "epoch": 0.58, + "learning_rate": 4.4771795328935633e-07, + "logits/chosen": -2.8075175285339355, + "logits/rejected": -2.775505542755127, + "logps/chosen": -273.9107666015625, + "logps/rejected": -225.21401977539062, + "loss": 1.0423, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.14238066971302032, + "rewards/margins": 1.3789294958114624, + "rewards/rejected": -1.5213100910186768, "step": 2420 }, { - "epoch": 0.61, - "learning_rate": 4.4181256436663233e-07, - "logits/chosen": -2.5103375911712646, - "logits/rejected": -2.548065185546875, - "logps/chosen": -248.3396453857422, - "logps/rejected": -225.8044891357422, - "loss": 0.5725, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.242130994796753, - "rewards/margins": 1.2999765872955322, - "rewards/rejected": -2.542107343673706, + "epoch": 0.58, + "learning_rate": 4.472722410411838e-07, + "logits/chosen": -2.726250648498535, + "logits/rejected": -2.78377628326416, + "logps/chosen": -194.5577850341797, + "logps/rejected": -219.876708984375, + "loss": 0.5328, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.43630847334861755, + "rewards/margins": 2.2991766929626465, + "rewards/rejected": -2.735485553741455, "step": 2430 }, { - "epoch": 0.62, - "learning_rate": 4.413444434041756e-07, - "logits/chosen": -2.590561628341675, - "logits/rejected": -2.5754146575927734, - "logps/chosen": -268.98944091796875, - "logps/rejected": -290.4842529296875, - "loss": 0.6782, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.8283954858779907, - "rewards/margins": 0.2944660782814026, - "rewards/rejected": -2.122861385345459, + "epoch": 0.59, + "learning_rate": 4.468265287930112e-07, + "logits/chosen": -2.8487045764923096, + "logits/rejected": -2.7860751152038574, + "logps/chosen": -269.79327392578125, + "logps/rejected": -311.254150390625, + "loss": 0.5311, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8831332921981812, + "rewards/margins": 0.7656612396240234, + "rewards/rejected": -1.6487945318222046, "step": 2440 }, { - "epoch": 0.62, - "learning_rate": 4.408763224417189e-07, - "logits/chosen": -2.510091781616211, - "logits/rejected": -2.4711225032806396, - "logps/chosen": -236.9765167236328, - "logps/rejected": -224.4086151123047, - "loss": 0.5842, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.2400747537612915, - "rewards/margins": 1.2969255447387695, - "rewards/rejected": -2.5370001792907715, + "epoch": 0.59, + "learning_rate": 4.463808165448386e-07, + "logits/chosen": -2.830655813217163, + "logits/rejected": -2.7488274574279785, + "logps/chosen": -289.8877258300781, + "logps/rejected": -244.2772979736328, + "loss": 0.5237, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8461278676986694, + "rewards/margins": 1.3117009401321411, + "rewards/rejected": -2.1578288078308105, "step": 2450 }, { - "epoch": 0.62, - "learning_rate": 4.404082014792622e-07, - "logits/chosen": -2.5807337760925293, - "logits/rejected": -2.552704334259033, - "logps/chosen": -358.95721435546875, - "logps/rejected": -306.596435546875, - "loss": 0.5798, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.365069031715393, - "rewards/margins": 1.3823387622833252, - "rewards/rejected": -2.747407913208008, + "epoch": 0.59, + "learning_rate": 4.4593510429666605e-07, + "logits/chosen": -2.784964084625244, + "logits/rejected": -2.8206772804260254, + "logps/chosen": -259.8763122558594, + "logps/rejected": -265.6490478515625, + "loss": 0.5955, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9333363771438599, + "rewards/margins": 0.9113370776176453, + "rewards/rejected": -1.84467351436615, "step": 2460 }, { - "epoch": 0.62, - "learning_rate": 4.3994008051680554e-07, - "logits/chosen": -2.700641632080078, - "logits/rejected": -2.6497087478637695, - "logps/chosen": -359.22210693359375, - "logps/rejected": -443.8907165527344, - "loss": 0.6075, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.6151984930038452, - "rewards/margins": 1.5008728504180908, - "rewards/rejected": -3.1160712242126465, + "epoch": 0.59, + "learning_rate": 4.4548939204849345e-07, + "logits/chosen": -2.8166115283966064, + "logits/rejected": -2.8520078659057617, + "logps/chosen": -260.6202697753906, + "logps/rejected": -249.4342041015625, + "loss": 0.5734, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.9711859226226807, + "rewards/margins": 0.6429948806762695, + "rewards/rejected": -2.61418080329895, "step": 2470 }, { - "epoch": 0.63, - "learning_rate": 4.3947195955434883e-07, - "logits/chosen": -2.2459988594055176, - "logits/rejected": -2.2005584239959717, - "logps/chosen": -274.14263916015625, - "logps/rejected": -352.2812194824219, - "loss": 0.6611, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.5034785270690918, - "rewards/margins": 0.2487478256225586, - "rewards/rejected": -1.7522262334823608, + "epoch": 0.6, + "learning_rate": 4.4504367980032085e-07, + "logits/chosen": -2.8487656116485596, + "logits/rejected": -2.8241896629333496, + "logps/chosen": -320.11627197265625, + "logps/rejected": -298.5158386230469, + "loss": 0.4754, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9501481056213379, + "rewards/margins": 1.5180814266204834, + "rewards/rejected": -2.4682297706604004, "step": 2480 }, { - "epoch": 0.63, - "learning_rate": 4.390038385918921e-07, - "logits/chosen": -2.651543140411377, - "logits/rejected": -2.6370761394500732, - "logps/chosen": -270.1546325683594, - "logps/rejected": -262.6888427734375, - "loss": 0.8213, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.9153323173522949, - "rewards/margins": 0.7845653295516968, - "rewards/rejected": -1.6998974084854126, + "epoch": 0.6, + "learning_rate": 4.445979675521483e-07, + "logits/chosen": -2.731095552444458, + "logits/rejected": -2.5452120304107666, + "logps/chosen": -310.227294921875, + "logps/rejected": -190.8519287109375, + "loss": 0.7222, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.376519799232483, + "rewards/margins": 1.2727152109146118, + "rewards/rejected": -2.6492347717285156, "step": 2490 }, { - "epoch": 0.63, - "learning_rate": 4.385357176294354e-07, - "logits/chosen": -2.6818630695343018, - "logits/rejected": -2.501220464706421, - "logps/chosen": -290.70037841796875, - "logps/rejected": -244.542236328125, - "loss": 0.6459, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.761728048324585, - "rewards/margins": 0.9028828740119934, - "rewards/rejected": -2.6646108627319336, + "epoch": 0.6, + "learning_rate": 4.441522553039757e-07, + "logits/chosen": -2.7786083221435547, + "logits/rejected": -2.662031650543213, + "logps/chosen": -250.9847869873047, + "logps/rejected": -258.3805236816406, + "loss": 0.523, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3603966236114502, + "rewards/margins": 1.1309810876846313, + "rewards/rejected": -2.491377830505371, "step": 2500 }, { - "epoch": 0.63, - "learning_rate": 4.3806759666697875e-07, - "logits/chosen": -2.6011970043182373, - "logits/rejected": -2.5796284675598145, - "logps/chosen": -267.5291442871094, - "logps/rejected": -274.05657958984375, - "loss": 0.5122, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.3027671575546265, - "rewards/margins": 1.2686413526535034, - "rewards/rejected": -2.571408748626709, + "epoch": 0.6, + "eval_logits/chosen": -2.6394147872924805, + "eval_logits/rejected": -2.61344575881958, + "eval_logps/chosen": -222.0998077392578, + "eval_logps/rejected": -226.74586486816406, + "eval_loss": 0.5130844116210938, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -2.613879680633545, + "eval_rewards/margins": 1.6213963031768799, + "eval_rewards/rejected": -4.2352752685546875, + "eval_runtime": 131.8391, + "eval_samples_per_second": 23.938, + "eval_steps_per_second": 0.379, + "step": 2500 + }, + { + "epoch": 0.6, + "learning_rate": 4.437065430558031e-07, + "logits/chosen": -2.795571804046631, + "logits/rejected": -2.79761004447937, + "logps/chosen": -289.6370849609375, + "logps/rejected": -330.11480712890625, + "loss": 0.6438, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1583130359649658, + "rewards/margins": 1.3368995189666748, + "rewards/rejected": -2.4952125549316406, "step": 2510 }, { - "epoch": 0.64, - "learning_rate": 4.3759947570452204e-07, - "logits/chosen": -2.5026752948760986, - "logits/rejected": -2.3501696586608887, - "logps/chosen": -218.0870819091797, - "logps/rejected": -272.39178466796875, - "loss": 0.654, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.630427896976471, - "rewards/margins": 1.6490509510040283, - "rewards/rejected": -2.2794787883758545, + "epoch": 0.61, + "learning_rate": 4.4326083080763057e-07, + "logits/chosen": -2.7203285694122314, + "logits/rejected": -2.772153854370117, + "logps/chosen": -228.51303100585938, + "logps/rejected": -249.1352081298828, + "loss": 0.4518, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9621639251708984, + "rewards/margins": 2.0491206645965576, + "rewards/rejected": -4.011284828186035, "step": 2520 }, { - "epoch": 0.64, - "learning_rate": 4.3713135474206533e-07, - "logits/chosen": -2.3893485069274902, - "logits/rejected": -2.3126299381256104, - "logps/chosen": -315.6602478027344, - "logps/rejected": -283.17169189453125, - "loss": 0.5226, + "epoch": 0.61, + "learning_rate": 4.4281511855945797e-07, + "logits/chosen": -2.818629741668701, + "logits/rejected": -2.6276965141296387, + "logps/chosen": -201.41331481933594, + "logps/rejected": -166.3119354248047, + "loss": 0.6599, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.1809231042861938, - "rewards/margins": 1.385221242904663, - "rewards/rejected": -2.5661442279815674, + "rewards/chosen": -1.7534383535385132, + "rewards/margins": 1.4937427043914795, + "rewards/rejected": -3.247180938720703, "step": 2530 }, { - "epoch": 0.64, - "learning_rate": 4.366632337796086e-07, - "logits/chosen": -2.6477644443511963, - "logits/rejected": -2.729999542236328, - "logps/chosen": -321.12933349609375, - "logps/rejected": -333.33648681640625, - "loss": 0.7278, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.9740563631057739, - "rewards/margins": 0.9034310579299927, - "rewards/rejected": -1.8774875402450562, + "epoch": 0.61, + "learning_rate": 4.423694063112854e-07, + "logits/chosen": -2.8043015003204346, + "logits/rejected": -2.7701640129089355, + "logps/chosen": -192.11373901367188, + "logps/rejected": -215.9029998779297, + "loss": 0.5518, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.6874727010726929, + "rewards/margins": 0.9937618374824524, + "rewards/rejected": -2.68123459815979, "step": 2540 }, { - "epoch": 0.64, - "learning_rate": 4.3619511281715196e-07, - "logits/chosen": -2.3891119956970215, - "logits/rejected": -2.4518914222717285, - "logps/chosen": -345.5533752441406, - "logps/rejected": -297.24609375, - "loss": 0.5454, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.45164117217063904, - "rewards/margins": 2.1465792655944824, - "rewards/rejected": -2.598220109939575, + "epoch": 0.61, + "learning_rate": 4.419236940631129e-07, + "logits/chosen": -2.673311710357666, + "logits/rejected": -2.6688475608825684, + "logps/chosen": -211.974853515625, + "logps/rejected": -239.287109375, + "loss": 1.3641, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.075792074203491, + "rewards/margins": 1.3870487213134766, + "rewards/rejected": -3.462841033935547, "step": 2550 }, { - "epoch": 0.65, - "learning_rate": 4.3572699185469525e-07, - "logits/chosen": -2.724630117416382, - "logits/rejected": -2.5989553928375244, - "logps/chosen": -305.9207458496094, - "logps/rejected": -310.9513244628906, - "loss": 0.5299, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.9471807479858398, - "rewards/margins": 1.425126314163208, - "rewards/rejected": -2.372307062149048, + "epoch": 0.62, + "learning_rate": 4.414779818149403e-07, + "logits/chosen": -2.809424877166748, + "logits/rejected": -2.7823243141174316, + "logps/chosen": -254.0945281982422, + "logps/rejected": -238.4109344482422, + "loss": 0.5558, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7369651794433594, + "rewards/margins": 2.277014970779419, + "rewards/rejected": -4.013979911804199, "step": 2560 }, { - "epoch": 0.65, - "learning_rate": 4.3525887089223854e-07, - "logits/chosen": -2.5453360080718994, - "logits/rejected": -2.501199245452881, - "logps/chosen": -276.5281677246094, - "logps/rejected": -264.8980407714844, - "loss": 0.8216, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.0282284021377563, - "rewards/margins": 1.2830857038497925, - "rewards/rejected": -2.311314344406128, + "epoch": 0.62, + "learning_rate": 4.410322695667677e-07, + "logits/chosen": -2.7471108436584473, + "logits/rejected": -2.6674394607543945, + "logps/chosen": -263.4889221191406, + "logps/rejected": -307.37554931640625, + "loss": 0.6139, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3566365242004395, + "rewards/margins": 1.3324594497680664, + "rewards/rejected": -3.689095973968506, "step": 2570 }, { - "epoch": 0.65, - "learning_rate": 4.3479074992978183e-07, - "logits/chosen": -2.536196708679199, - "logits/rejected": -2.4768316745758057, - "logps/chosen": -333.0994567871094, - "logps/rejected": -244.43875122070312, - "loss": 0.5424, + "epoch": 0.62, + "learning_rate": 4.4058655731859515e-07, + "logits/chosen": -2.66255521774292, + "logits/rejected": -2.639153003692627, + "logps/chosen": -283.4830627441406, + "logps/rejected": -264.3409729003906, + "loss": 0.4614, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.5456063747406006, - "rewards/margins": 0.5869501829147339, - "rewards/rejected": -2.132556438446045, + "rewards/chosen": -1.5424706935882568, + "rewards/margins": 1.0029847621917725, + "rewards/rejected": -2.5454554557800293, "step": 2580 }, { - "epoch": 0.65, - "learning_rate": 4.343226289673251e-07, - "logits/chosen": -2.6771626472473145, - "logits/rejected": -2.6559603214263916, - "logps/chosen": -341.78936767578125, - "logps/rejected": -319.7326965332031, - "loss": 0.4778, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.0155402421951294, - "rewards/margins": 1.3975855112075806, - "rewards/rejected": -2.41312575340271, + "epoch": 0.62, + "learning_rate": 4.4014084507042255e-07, + "logits/chosen": -2.7876970767974854, + "logits/rejected": -2.697725296020508, + "logps/chosen": -268.5246887207031, + "logps/rejected": -300.9275207519531, + "loss": 0.5982, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.080693483352661, + "rewards/margins": 1.983074426651001, + "rewards/rejected": -4.063767433166504, "step": 2590 }, { - "epoch": 0.66, - "learning_rate": 4.3385450800486846e-07, - "logits/chosen": -2.4650776386260986, - "logits/rejected": -2.4052162170410156, - "logps/chosen": -260.9008483886719, - "logps/rejected": -206.55044555664062, - "loss": 0.5196, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.0318548679351807, - "rewards/margins": 2.524167060852051, - "rewards/rejected": -3.5560219287872314, + "epoch": 0.63, + "learning_rate": 4.3969513282224995e-07, + "logits/chosen": -2.79573130607605, + "logits/rejected": -2.713449001312256, + "logps/chosen": -233.8760223388672, + "logps/rejected": -213.77099609375, + "loss": 0.6263, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7437509298324585, + "rewards/margins": 1.448844313621521, + "rewards/rejected": -3.1925952434539795, "step": 2600 }, { - "epoch": 0.66, - "learning_rate": 4.3338638704241175e-07, - "logits/chosen": -2.79402494430542, - "logits/rejected": -2.622647762298584, - "logps/chosen": -415.70050048828125, - "logps/rejected": -334.04962158203125, - "loss": 0.6465, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -2.2122654914855957, - "rewards/margins": 1.0877020359039307, - "rewards/rejected": -3.2999675273895264, + "epoch": 0.63, + "eval_logits/chosen": -2.6360855102539062, + "eval_logits/rejected": -2.618863105773926, + "eval_logps/chosen": -222.57470703125, + "eval_logps/rejected": -224.93099975585938, + "eval_loss": 0.528740644454956, + "eval_rewards/accuracies": 0.6449999809265137, + "eval_rewards/chosen": -2.6613693237304688, + "eval_rewards/margins": 1.3924200534820557, + "eval_rewards/rejected": -4.0537896156311035, + "eval_runtime": 132.0172, + "eval_samples_per_second": 23.906, + "eval_steps_per_second": 0.379, + "step": 2600 + }, + { + "epoch": 0.63, + "learning_rate": 4.3924942057407735e-07, + "logits/chosen": -2.7918946743011475, + "logits/rejected": -2.721294641494751, + "logps/chosen": -261.44390869140625, + "logps/rejected": -286.2633056640625, + "loss": 0.6312, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7504851818084717, + "rewards/margins": 0.9053371548652649, + "rewards/rejected": -2.655822515487671, "step": 2610 }, { - "epoch": 0.66, - "learning_rate": 4.3291826607995504e-07, - "logits/chosen": -2.5749564170837402, - "logits/rejected": -2.6640853881835938, - "logps/chosen": -267.54583740234375, - "logps/rejected": -270.51739501953125, - "loss": 0.6652, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.0356361865997314, - "rewards/margins": 1.8222503662109375, - "rewards/rejected": -2.857886791229248, + "epoch": 0.63, + "learning_rate": 4.388037083259048e-07, + "logits/chosen": -2.7784228324890137, + "logits/rejected": -2.735015869140625, + "logps/chosen": -386.7674560546875, + "logps/rejected": -328.40618896484375, + "loss": 0.5588, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1267964839935303, + "rewards/margins": 2.2151296138763428, + "rewards/rejected": -3.3419265747070312, "step": 2620 }, { - "epoch": 0.66, - "learning_rate": 4.3245014511749833e-07, - "logits/chosen": -2.6502137184143066, - "logits/rejected": -2.5814380645751953, - "logps/chosen": -284.9609375, - "logps/rejected": -341.3908996582031, - "loss": 0.5912, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.013614535331726, - "rewards/margins": 1.0198663473129272, - "rewards/rejected": -2.0334811210632324, + "epoch": 0.63, + "learning_rate": 4.383579960777322e-07, + "logits/chosen": -2.694390296936035, + "logits/rejected": -2.7619845867156982, + "logps/chosen": -254.3197479248047, + "logps/rejected": -252.53079223632812, + "loss": 0.4764, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.096728801727295, + "rewards/margins": 1.2311856746673584, + "rewards/rejected": -3.3279151916503906, "step": 2630 }, { - "epoch": 0.67, - "learning_rate": 4.3198202415504167e-07, - "logits/chosen": -2.5044636726379395, - "logits/rejected": -2.4996304512023926, - "logps/chosen": -197.9936065673828, - "logps/rejected": -231.295654296875, - "loss": 0.5785, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.9943052530288696, - "rewards/margins": 0.5037738680839539, - "rewards/rejected": -1.4980791807174683, + "epoch": 0.64, + "learning_rate": 4.379122838295596e-07, + "logits/chosen": -2.7858147621154785, + "logits/rejected": -2.7133541107177734, + "logps/chosen": -357.72540283203125, + "logps/rejected": -338.99298095703125, + "loss": 0.5249, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5808498859405518, + "rewards/margins": 1.501114845275879, + "rewards/rejected": -3.0819649696350098, "step": 2640 }, { - "epoch": 0.67, - "learning_rate": 4.3151390319258496e-07, - "logits/chosen": -2.6049554347991943, - "logits/rejected": -2.5984387397766113, - "logps/chosen": -154.5291290283203, - "logps/rejected": -180.2733154296875, - "loss": 0.5901, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.0797511339187622, - "rewards/margins": 1.2088053226470947, - "rewards/rejected": -2.2885565757751465, + "epoch": 0.64, + "learning_rate": 4.3746657158138707e-07, + "logits/chosen": -2.481762647628784, + "logits/rejected": -2.400148868560791, + "logps/chosen": -218.25064086914062, + "logps/rejected": -193.4365997314453, + "loss": 0.6371, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.1716020107269287, + "rewards/margins": 0.3693930506706238, + "rewards/rejected": -2.5409951210021973, "step": 2650 }, { - "epoch": 0.67, - "learning_rate": 4.310457822301282e-07, - "logits/chosen": -2.6208643913269043, - "logits/rejected": -2.6619465351104736, - "logps/chosen": -329.0693054199219, - "logps/rejected": -323.13616943359375, - "loss": 0.6039, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.8519481420516968, - "rewards/margins": 1.0418014526367188, - "rewards/rejected": -2.893749713897705, + "epoch": 0.64, + "learning_rate": 4.370208593332145e-07, + "logits/chosen": -2.725480318069458, + "logits/rejected": -2.6749582290649414, + "logps/chosen": -328.45574951171875, + "logps/rejected": -251.87307739257812, + "loss": 0.5603, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0079989433288574, + "rewards/margins": 1.9361118078231812, + "rewards/rejected": -3.94411039352417, "step": 2660 }, { - "epoch": 0.67, - "learning_rate": 4.3057766126767154e-07, - "logits/chosen": -2.710085391998291, - "logits/rejected": -2.647268533706665, - "logps/chosen": -318.821044921875, - "logps/rejected": -331.76300048828125, - "loss": 0.5378, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.5609668493270874, - "rewards/margins": 1.7650535106658936, - "rewards/rejected": -3.3260207176208496, + "epoch": 0.64, + "learning_rate": 4.365751470850419e-07, + "logits/chosen": -2.8072752952575684, + "logits/rejected": -2.7648260593414307, + "logps/chosen": -288.92864990234375, + "logps/rejected": -234.51937866210938, + "loss": 0.677, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.372525930404663, + "rewards/margins": 1.291799783706665, + "rewards/rejected": -2.6643261909484863, "step": 2670 }, { - "epoch": 0.68, - "learning_rate": 4.3010954030521483e-07, - "logits/chosen": -2.488053798675537, - "logits/rejected": -2.5420925617218018, - "logps/chosen": -304.56951904296875, - "logps/rejected": -276.8664245605469, - "loss": 0.5102, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.8793163299560547, - "rewards/margins": 1.3756439685821533, - "rewards/rejected": -3.254960298538208, + "epoch": 0.65, + "learning_rate": 4.3612943483686933e-07, + "logits/chosen": -2.701993703842163, + "logits/rejected": -2.7323803901672363, + "logps/chosen": -252.71444702148438, + "logps/rejected": -254.0679168701172, + "loss": 0.6178, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.1768698692321777, + "rewards/margins": 0.8909411430358887, + "rewards/rejected": -3.0678107738494873, "step": 2680 }, { - "epoch": 0.68, - "learning_rate": 4.2964141934275817e-07, - "logits/chosen": -2.6673781871795654, - "logits/rejected": -2.6666882038116455, - "logps/chosen": -192.8503875732422, - "logps/rejected": -213.07363891601562, - "loss": 0.5483, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.7033116817474365, - "rewards/margins": 1.4978384971618652, - "rewards/rejected": -3.201150417327881, + "epoch": 0.65, + "learning_rate": 4.3568372258869674e-07, + "logits/chosen": -2.718327283859253, + "logits/rejected": -2.7185487747192383, + "logps/chosen": -238.07412719726562, + "logps/rejected": -260.9333801269531, + "loss": 0.6459, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.024975299835205, + "rewards/margins": 1.5170586109161377, + "rewards/rejected": -3.5420336723327637, "step": 2690 }, { - "epoch": 0.68, - "learning_rate": 4.2917329838030146e-07, - "logits/chosen": -2.6083192825317383, - "logits/rejected": -2.6443545818328857, - "logps/chosen": -167.23509216308594, - "logps/rejected": -206.14404296875, - "loss": 0.4846, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.8999881744384766, - "rewards/margins": 1.4323008060455322, - "rewards/rejected": -3.3322887420654297, + "epoch": 0.65, + "learning_rate": 4.3523801034052414e-07, + "logits/chosen": -2.755694627761841, + "logits/rejected": -2.7638964653015137, + "logps/chosen": -255.44039916992188, + "logps/rejected": -253.13687133789062, + "loss": 0.5973, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.9046614170074463, + "rewards/margins": 0.5747832655906677, + "rewards/rejected": -2.4794445037841797, "step": 2700 }, { - "epoch": 0.69, - "learning_rate": 4.2870517741784475e-07, - "logits/chosen": -2.658937931060791, - "logits/rejected": -2.673595666885376, - "logps/chosen": -341.39056396484375, - "logps/rejected": -346.138671875, - "loss": 0.5855, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -2.2022271156311035, - "rewards/margins": 0.9777986407279968, - "rewards/rejected": -3.180025577545166, + "epoch": 0.65, + "eval_logits/chosen": -2.6317129135131836, + "eval_logits/rejected": -2.6167306900024414, + "eval_logps/chosen": -223.0499267578125, + "eval_logps/rejected": -225.64060974121094, + "eval_loss": 0.5132200121879578, + "eval_rewards/accuracies": 0.625, + "eval_rewards/chosen": -2.7088897228240967, + "eval_rewards/margins": 1.4158623218536377, + "eval_rewards/rejected": -4.124752044677734, + "eval_runtime": 132.1823, + "eval_samples_per_second": 23.876, + "eval_steps_per_second": 0.378, + "step": 2700 + }, + { + "epoch": 0.65, + "learning_rate": 4.347922980923516e-07, + "logits/chosen": -2.737522602081299, + "logits/rejected": -2.798412799835205, + "logps/chosen": -234.379150390625, + "logps/rejected": -294.6796875, + "loss": 0.5697, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.009643077850342, + "rewards/margins": 1.8822177648544312, + "rewards/rejected": -3.8918609619140625, "step": 2710 }, { - "epoch": 0.69, - "learning_rate": 4.2823705645538804e-07, - "logits/chosen": -2.581702470779419, - "logits/rejected": -2.4730591773986816, - "logps/chosen": -285.0960388183594, - "logps/rejected": -292.950927734375, - "loss": 0.6057, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.297670602798462, - "rewards/margins": 1.7100436687469482, - "rewards/rejected": -3.0077145099639893, + "epoch": 0.65, + "learning_rate": 4.34346585844179e-07, + "logits/chosen": -2.845856189727783, + "logits/rejected": -2.769813060760498, + "logps/chosen": -226.72396850585938, + "logps/rejected": -200.29049682617188, + "loss": 0.6423, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.47199285030365, + "rewards/margins": 1.572796106338501, + "rewards/rejected": -3.0447888374328613, "step": 2720 }, { - "epoch": 0.69, - "learning_rate": 4.277689354929314e-07, - "logits/chosen": -2.655735492706299, - "logits/rejected": -2.5294718742370605, - "logps/chosen": -340.40887451171875, - "logps/rejected": -293.7754821777344, - "loss": 0.5714, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.2910304069519043, - "rewards/margins": 1.173147201538086, - "rewards/rejected": -3.4641776084899902, + "epoch": 0.66, + "learning_rate": 4.339008735960064e-07, + "logits/chosen": -2.9076998233795166, + "logits/rejected": -2.8425519466400146, + "logps/chosen": -363.76837158203125, + "logps/rejected": -291.3793029785156, + "loss": 0.5945, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.453529953956604, + "rewards/margins": 0.6546291708946228, + "rewards/rejected": -2.108159303665161, "step": 2730 }, { - "epoch": 0.69, - "learning_rate": 4.2730081453047467e-07, - "logits/chosen": -2.3572351932525635, - "logits/rejected": -2.313126564025879, - "logps/chosen": -216.91470336914062, - "logps/rejected": -229.12130737304688, - "loss": 0.5627, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2121973037719727, - "rewards/margins": 2.0007376670837402, - "rewards/rejected": -3.212934970855713, + "epoch": 0.66, + "learning_rate": 4.3345516134783386e-07, + "logits/chosen": -2.745168685913086, + "logits/rejected": -2.7224972248077393, + "logps/chosen": -268.2827453613281, + "logps/rejected": -236.50961303710938, + "loss": 0.5016, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7496297359466553, + "rewards/margins": 1.273491382598877, + "rewards/rejected": -3.0231211185455322, "step": 2740 }, { - "epoch": 0.7, - "learning_rate": 4.2683269356801796e-07, - "logits/chosen": -2.639946460723877, - "logits/rejected": -2.4425880908966064, - "logps/chosen": -367.25152587890625, - "logps/rejected": -230.43032836914062, - "loss": 0.5015, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.3880975246429443, - "rewards/margins": 2.261265516281128, - "rewards/rejected": -3.6493630409240723, + "epoch": 0.66, + "learning_rate": 4.3300944909966126e-07, + "logits/chosen": -2.5741584300994873, + "logits/rejected": -2.5889527797698975, + "logps/chosen": -357.14324951171875, + "logps/rejected": -328.6924133300781, + "loss": 0.5419, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9981743097305298, + "rewards/margins": 1.4585250616073608, + "rewards/rejected": -3.4566993713378906, "step": 2750 }, { - "epoch": 0.7, - "learning_rate": 4.2636457260556125e-07, - "logits/chosen": -2.4713833332061768, - "logits/rejected": -2.4109320640563965, - "logps/chosen": -223.4802703857422, - "logps/rejected": -208.6910858154297, - "loss": 0.6571, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.5514646768569946, - "rewards/margins": 2.1207289695739746, - "rewards/rejected": -3.6721935272216797, + "epoch": 0.66, + "learning_rate": 4.3256373685148866e-07, + "logits/chosen": -2.6161623001098633, + "logits/rejected": -2.6070687770843506, + "logps/chosen": -276.72796630859375, + "logps/rejected": -256.005126953125, + "loss": 0.4725, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5618906021118164, + "rewards/margins": 1.3479634523391724, + "rewards/rejected": -2.9098541736602783, "step": 2760 }, { - "epoch": 0.7, - "learning_rate": 4.2589645164310454e-07, - "logits/chosen": -2.6536941528320312, - "logits/rejected": -2.58679461479187, - "logps/chosen": -281.8035888671875, - "logps/rejected": -220.51327514648438, - "loss": 0.6474, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.821862816810608, - "rewards/margins": 1.2211488485336304, - "rewards/rejected": -3.0430119037628174, + "epoch": 0.67, + "learning_rate": 4.3211802460331606e-07, + "logits/chosen": -2.766573429107666, + "logits/rejected": -2.8617796897888184, + "logps/chosen": -294.6518249511719, + "logps/rejected": -303.6689453125, + "loss": 0.6238, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4811642169952393, + "rewards/margins": 2.103609323501587, + "rewards/rejected": -3.584773540496826, "step": 2770 }, { - "epoch": 0.7, - "learning_rate": 4.254283306806479e-07, - "logits/chosen": -2.838618516921997, - "logits/rejected": -2.7523787021636963, - "logps/chosen": -281.54168701171875, - "logps/rejected": -240.41146850585938, - "loss": 0.59, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.8369686007499695, - "rewards/margins": 1.0852001905441284, - "rewards/rejected": -1.9221687316894531, + "epoch": 0.67, + "learning_rate": 4.316723123551435e-07, + "logits/chosen": -2.9174141883850098, + "logits/rejected": -2.875657796859741, + "logps/chosen": -320.91876220703125, + "logps/rejected": -331.48773193359375, + "loss": 0.602, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5797592401504517, + "rewards/margins": 0.3916727900505066, + "rewards/rejected": -1.9714317321777344, "step": 2780 }, { - "epoch": 0.71, - "learning_rate": 4.2496020971819117e-07, - "logits/chosen": -2.658292293548584, - "logits/rejected": -2.6537060737609863, - "logps/chosen": -325.69561767578125, - "logps/rejected": -277.89501953125, - "loss": 0.6677, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.4621500968933105, - "rewards/margins": 1.1805541515350342, - "rewards/rejected": -2.6427040100097656, + "epoch": 0.67, + "learning_rate": 4.312266001069709e-07, + "logits/chosen": -2.8858160972595215, + "logits/rejected": -2.8740060329437256, + "logps/chosen": -266.24163818359375, + "logps/rejected": -235.0825958251953, + "loss": 0.5254, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.593672513961792, + "rewards/margins": 1.2809340953826904, + "rewards/rejected": -2.8746068477630615, "step": 2790 }, { - "epoch": 0.71, - "learning_rate": 4.2449208875573446e-07, - "logits/chosen": -2.772883176803589, - "logits/rejected": -2.631762981414795, - "logps/chosen": -373.33929443359375, - "logps/rejected": -327.4913635253906, - "loss": 0.5467, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.8034371137619019, - "rewards/margins": 1.0818064212799072, - "rewards/rejected": -1.8852436542510986, + "epoch": 0.67, + "learning_rate": 4.307808878587983e-07, + "logits/chosen": -2.717970371246338, + "logits/rejected": -2.4995810985565186, + "logps/chosen": -281.24639892578125, + "logps/rejected": -283.611083984375, + "loss": 0.8209, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3469922542572021, + "rewards/margins": 0.31869930028915405, + "rewards/rejected": -1.6656917333602905, "step": 2800 }, { - "epoch": 0.71, - "learning_rate": 4.2402396779327775e-07, - "logits/chosen": -2.6546027660369873, - "logits/rejected": -2.644376039505005, - "logps/chosen": -250.03372192382812, - "logps/rejected": -314.5126953125, - "loss": 0.5764, + "epoch": 0.67, + "eval_logits/chosen": -2.580322265625, + "eval_logits/rejected": -2.560467481613159, + "eval_logps/chosen": -223.04615783691406, + "eval_logps/rejected": -226.2637481689453, + "eval_loss": 0.5164612531661987, + "eval_rewards/accuracies": 0.625, + "eval_rewards/chosen": -2.708512306213379, + "eval_rewards/margins": 1.4785544872283936, + "eval_rewards/rejected": -4.187067031860352, + "eval_runtime": 131.9594, + "eval_samples_per_second": 23.916, + "eval_steps_per_second": 0.379, + "step": 2800 + }, + { + "epoch": 0.68, + "learning_rate": 4.303351756106258e-07, + "logits/chosen": -2.685749053955078, + "logits/rejected": -2.661101818084717, + "logps/chosen": -224.30142211914062, + "logps/rejected": -262.2064514160156, + "loss": 0.6181, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1385111808776855, - "rewards/margins": 1.938734769821167, - "rewards/rejected": -3.0772461891174316, + "rewards/chosen": -2.1005451679229736, + "rewards/margins": 1.160252332687378, + "rewards/rejected": -3.2607975006103516, "step": 2810 }, { - "epoch": 0.71, - "learning_rate": 4.235558468308211e-07, - "logits/chosen": -2.407291889190674, - "logits/rejected": -2.4491875171661377, - "logps/chosen": -278.3325500488281, - "logps/rejected": -268.4944152832031, - "loss": 0.4692, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.0057759284973145, - "rewards/margins": 1.8401225805282593, - "rewards/rejected": -2.845898389816284, + "epoch": 0.68, + "learning_rate": 4.298894633624532e-07, + "logits/chosen": -2.841351270675659, + "logits/rejected": -2.7626795768737793, + "logps/chosen": -343.43548583984375, + "logps/rejected": -325.53985595703125, + "loss": 0.5869, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5469176769256592, + "rewards/margins": 2.126919984817505, + "rewards/rejected": -3.673837661743164, "step": 2820 }, { - "epoch": 0.72, - "learning_rate": 4.230877258683644e-07, - "logits/chosen": -2.615633487701416, - "logits/rejected": -2.541612148284912, - "logps/chosen": -250.8361053466797, - "logps/rejected": -178.80010986328125, - "loss": 0.7134, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.8592203855514526, - "rewards/margins": 1.3873034715652466, - "rewards/rejected": -2.2465240955352783, + "epoch": 0.68, + "learning_rate": 4.294437511142806e-07, + "logits/chosen": -2.76352596282959, + "logits/rejected": -2.7234859466552734, + "logps/chosen": -408.36517333984375, + "logps/rejected": -284.08837890625, + "loss": 0.4752, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1387109756469727, + "rewards/margins": 1.9080737829208374, + "rewards/rejected": -3.0467848777770996, "step": 2830 }, { - "epoch": 0.72, - "learning_rate": 4.2261960490590767e-07, - "logits/chosen": -2.609163522720337, - "logits/rejected": -2.5728368759155273, - "logps/chosen": -301.5133361816406, - "logps/rejected": -281.16314697265625, - "loss": 0.519, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.4930236339569092, - "rewards/margins": 1.7747684717178345, - "rewards/rejected": -3.267792224884033, + "epoch": 0.68, + "learning_rate": 4.2899803886610804e-07, + "logits/chosen": -2.870086431503296, + "logits/rejected": -2.7910094261169434, + "logps/chosen": -321.8905944824219, + "logps/rejected": -256.9248962402344, + "loss": 0.4466, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.896380603313446, + "rewards/margins": 1.4080432653427124, + "rewards/rejected": -2.3044238090515137, "step": 2840 }, { - "epoch": 0.72, - "learning_rate": 4.2215148394345096e-07, - "logits/chosen": -2.752432107925415, - "logits/rejected": -2.6862025260925293, - "logps/chosen": -332.0007629394531, - "logps/rejected": -251.06582641601562, - "loss": 0.4968, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.6779168844223022, - "rewards/margins": 0.8063459396362305, - "rewards/rejected": -2.484262704849243, + "epoch": 0.69, + "learning_rate": 4.2855232661793545e-07, + "logits/chosen": -2.835108518600464, + "logits/rejected": -2.875153064727783, + "logps/chosen": -297.2215576171875, + "logps/rejected": -313.4309997558594, + "loss": 0.6548, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.449418544769287, + "rewards/margins": 0.7681323289871216, + "rewards/rejected": -3.2175509929656982, "step": 2850 }, { - "epoch": 0.72, - "learning_rate": 4.2168336298099425e-07, - "logits/chosen": -2.6456823348999023, - "logits/rejected": -2.539421558380127, - "logps/chosen": -184.9524688720703, - "logps/rejected": -195.1049346923828, - "loss": 0.5587, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.0221726894378662, - "rewards/margins": 1.4521821737289429, - "rewards/rejected": -2.4743552207946777, + "epoch": 0.69, + "learning_rate": 4.2810661436976285e-07, + "logits/chosen": -2.796283721923828, + "logits/rejected": -2.8260245323181152, + "logps/chosen": -234.15185546875, + "logps/rejected": -271.3819885253906, + "loss": 0.5608, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7194916009902954, + "rewards/margins": 1.8227384090423584, + "rewards/rejected": -2.5422301292419434, "step": 2860 }, { - "epoch": 0.73, - "learning_rate": 4.212152420185376e-07, - "logits/chosen": -2.8353781700134277, - "logits/rejected": -2.7753653526306152, - "logps/chosen": -284.77081298828125, - "logps/rejected": -305.65374755859375, - "loss": 0.5973, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -2.002152442932129, - "rewards/margins": 0.8888009786605835, - "rewards/rejected": -2.890953540802002, + "epoch": 0.69, + "learning_rate": 4.276609021215903e-07, + "logits/chosen": -2.7258098125457764, + "logits/rejected": -2.6597933769226074, + "logps/chosen": -223.6670379638672, + "logps/rejected": -190.68643188476562, + "loss": 0.6395, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.159341335296631, + "rewards/margins": 0.8163015246391296, + "rewards/rejected": -2.9756431579589844, "step": 2870 }, { - "epoch": 0.73, - "learning_rate": 4.207471210560808e-07, - "logits/chosen": -2.7503039836883545, - "logits/rejected": -2.548729181289673, - "logps/chosen": -317.0842590332031, - "logps/rejected": -323.949462890625, - "loss": 0.6178, + "epoch": 0.69, + "learning_rate": 4.272151898734177e-07, + "logits/chosen": -2.7377333641052246, + "logits/rejected": -2.7785956859588623, + "logps/chosen": -369.17462158203125, + "logps/rejected": -329.52911376953125, + "loss": 0.6358, "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.9967567920684814, - "rewards/margins": 1.4991819858551025, - "rewards/rejected": -3.495938539505005, + "rewards/chosen": -1.505967378616333, + "rewards/margins": 2.0116610527038574, + "rewards/rejected": -3.5176281929016113, "step": 2880 }, { - "epoch": 0.73, - "learning_rate": 4.2027900009362417e-07, - "logits/chosen": -2.707176685333252, - "logits/rejected": -2.716992139816284, - "logps/chosen": -258.08221435546875, - "logps/rejected": -381.6121520996094, - "loss": 0.4701, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.2269902229309082, - "rewards/margins": 3.490187406539917, - "rewards/rejected": -4.717177867889404, + "epoch": 0.7, + "learning_rate": 4.267694776252451e-07, + "logits/chosen": -2.8501858711242676, + "logits/rejected": -2.7005438804626465, + "logps/chosen": -250.0526580810547, + "logps/rejected": -296.701416015625, + "loss": 0.7034, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0717856884002686, + "rewards/margins": 1.6147472858428955, + "rewards/rejected": -3.686532497406006, "step": 2890 }, { - "epoch": 0.73, - "learning_rate": 4.1981087913116746e-07, - "logits/chosen": -2.7810075283050537, - "logits/rejected": -2.7391176223754883, - "logps/chosen": -258.90576171875, - "logps/rejected": -228.8199462890625, - "loss": 0.535, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.5134868621826172, - "rewards/margins": 1.1353763341903687, - "rewards/rejected": -2.6488633155822754, + "epoch": 0.7, + "learning_rate": 4.2632376537707257e-07, + "logits/chosen": -2.738267421722412, + "logits/rejected": -2.678217887878418, + "logps/chosen": -216.82681274414062, + "logps/rejected": -227.5718536376953, + "loss": 0.5625, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.5090479850769043, + "rewards/margins": 1.4844462871551514, + "rewards/rejected": -3.9934945106506348, "step": 2900 }, { - "epoch": 0.74, - "learning_rate": 4.193427581687108e-07, - "logits/chosen": -2.92142915725708, - "logits/rejected": -2.7393126487731934, - "logps/chosen": -458.55621337890625, - "logps/rejected": -362.22039794921875, - "loss": 0.5891, - "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": -2.4719746112823486, - "rewards/margins": 0.5298187732696533, - "rewards/rejected": -3.001793146133423, + "epoch": 0.7, + "eval_logits/chosen": -2.6162710189819336, + "eval_logits/rejected": -2.589109182357788, + "eval_logps/chosen": -230.7079315185547, + "eval_logps/rejected": -234.76235961914062, + "eval_loss": 0.5117350816726685, + "eval_rewards/accuracies": 0.6324999928474426, + "eval_rewards/chosen": -3.4746899604797363, + "eval_rewards/margins": 1.5622371435165405, + "eval_rewards/rejected": -5.036926746368408, + "eval_runtime": 132.0921, + "eval_samples_per_second": 23.892, + "eval_steps_per_second": 0.379, + "step": 2900 + }, + { + "epoch": 0.7, + "learning_rate": 4.2587805312889997e-07, + "logits/chosen": -2.7481343746185303, + "logits/rejected": -2.819925308227539, + "logps/chosen": -206.71347045898438, + "logps/rejected": -252.79647827148438, + "loss": 0.5463, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9367557764053345, + "rewards/margins": 1.3272382020950317, + "rewards/rejected": -3.263993740081787, "step": 2910 }, { - "epoch": 0.74, - "learning_rate": 4.188746372062541e-07, - "logits/chosen": -2.731666088104248, - "logits/rejected": -2.667985439300537, - "logps/chosen": -369.28753662109375, - "logps/rejected": -301.48187255859375, - "loss": 0.6573, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.5690410137176514, - "rewards/margins": 0.5976849794387817, - "rewards/rejected": -2.1667261123657227, + "epoch": 0.7, + "learning_rate": 4.2543234088072737e-07, + "logits/chosen": -2.790902614593506, + "logits/rejected": -2.705190658569336, + "logps/chosen": -273.88177490234375, + "logps/rejected": -231.716552734375, + "loss": 0.7946, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.978755235671997, + "rewards/margins": 0.3346253037452698, + "rewards/rejected": -2.313380718231201, "step": 2920 }, { - "epoch": 0.74, - "learning_rate": 4.184065162437974e-07, - "logits/chosen": -2.7661612033843994, - "logits/rejected": -2.759232759475708, - "logps/chosen": -351.2125549316406, - "logps/rejected": -406.3238220214844, - "loss": 0.533, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3426258563995361, - "rewards/margins": 1.159343957901001, - "rewards/rejected": -2.501969814300537, + "epoch": 0.71, + "learning_rate": 4.249866286325548e-07, + "logits/chosen": -2.8605992794036865, + "logits/rejected": -2.833580732345581, + "logps/chosen": -248.93368530273438, + "logps/rejected": -293.63140869140625, + "loss": 0.6977, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5869662761688232, + "rewards/margins": 0.7034615278244019, + "rewards/rejected": -2.2904276847839355, "step": 2930 }, { - "epoch": 0.74, - "learning_rate": 4.1793839528134067e-07, - "logits/chosen": -2.6930956840515137, - "logits/rejected": -2.6457581520080566, - "logps/chosen": -241.09414672851562, - "logps/rejected": -242.9203643798828, - "loss": 0.5967, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.6755502223968506, - "rewards/margins": 2.0493533611297607, - "rewards/rejected": -2.7249033451080322, + "epoch": 0.71, + "learning_rate": 4.2454091638438223e-07, + "logits/chosen": -2.7230257987976074, + "logits/rejected": -2.829660177230835, + "logps/chosen": -268.4234313964844, + "logps/rejected": -269.2342224121094, + "loss": 0.6812, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.669065475463867, + "rewards/margins": 1.3194401264190674, + "rewards/rejected": -3.9885058403015137, "step": 2940 }, { - "epoch": 0.75, - "learning_rate": 4.17470274318884e-07, - "logits/chosen": -2.784623622894287, - "logits/rejected": -2.6285128593444824, - "logps/chosen": -266.16241455078125, - "logps/rejected": -191.28790283203125, - "loss": 0.598, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.225970983505249, - "rewards/margins": 1.7024619579315186, - "rewards/rejected": -2.9284331798553467, + "epoch": 0.71, + "learning_rate": 4.2409520413620963e-07, + "logits/chosen": -2.8879504203796387, + "logits/rejected": -2.849337339401245, + "logps/chosen": -296.11224365234375, + "logps/rejected": -259.64013671875, + "loss": 0.508, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6054847240447998, + "rewards/margins": 1.2133591175079346, + "rewards/rejected": -2.8188436031341553, "step": 2950 }, { - "epoch": 0.75, - "learning_rate": 4.170021533564273e-07, - "logits/chosen": -2.810001850128174, - "logits/rejected": -2.641927480697632, - "logps/chosen": -349.1688232421875, - "logps/rejected": -250.1381378173828, - "loss": 0.6235, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3024142980575562, - "rewards/margins": 1.194219708442688, - "rewards/rejected": -2.496634006500244, + "epoch": 0.71, + "learning_rate": 4.2364949188803704e-07, + "logits/chosen": -2.842182159423828, + "logits/rejected": -2.756669521331787, + "logps/chosen": -189.5736541748047, + "logps/rejected": -236.970458984375, + "loss": 0.5603, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7173433303833008, + "rewards/margins": 1.4532891511917114, + "rewards/rejected": -3.1706321239471436, "step": 2960 }, { - "epoch": 0.75, - "learning_rate": 4.1653403239397053e-07, - "logits/chosen": -2.920943021774292, - "logits/rejected": -2.827078342437744, - "logps/chosen": -388.9732971191406, - "logps/rejected": -319.05474853515625, - "loss": 0.7266, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.081215500831604, - "rewards/margins": 0.7131466269493103, - "rewards/rejected": -1.7943620681762695, + "epoch": 0.71, + "learning_rate": 4.232037796398645e-07, + "logits/chosen": -2.871962547302246, + "logits/rejected": -2.8441059589385986, + "logps/chosen": -216.8768310546875, + "logps/rejected": -210.2751007080078, + "loss": 0.4745, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4282208681106567, + "rewards/margins": 1.5495332479476929, + "rewards/rejected": -2.9777543544769287, "step": 2970 }, { - "epoch": 0.75, - "learning_rate": 4.160659114315139e-07, - "logits/chosen": -2.6681265830993652, - "logits/rejected": -2.6208529472351074, - "logps/chosen": -246.3448028564453, - "logps/rejected": -256.88446044921875, - "loss": 0.6632, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.1406188011169434, - "rewards/margins": 2.058380603790283, - "rewards/rejected": -3.1989991664886475, + "epoch": 0.72, + "learning_rate": 4.227580673916919e-07, + "logits/chosen": -2.8575963973999023, + "logits/rejected": -2.7912721633911133, + "logps/chosen": -343.3385925292969, + "logps/rejected": -301.3266296386719, + "loss": 0.57, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7823689579963684, + "rewards/margins": 2.1271798610687256, + "rewards/rejected": -2.909548759460449, "step": 2980 }, { - "epoch": 0.76, - "learning_rate": 4.1559779046905717e-07, - "logits/chosen": -2.7833802700042725, - "logits/rejected": -2.676621437072754, - "logps/chosen": -341.5685729980469, - "logps/rejected": -268.85015869140625, - "loss": 0.5589, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.4013714790344238, - "rewards/margins": 1.5592626333236694, - "rewards/rejected": -2.9606339931488037, + "epoch": 0.72, + "learning_rate": 4.223123551435193e-07, + "logits/chosen": -2.9277520179748535, + "logits/rejected": -2.832444667816162, + "logps/chosen": -230.76278686523438, + "logps/rejected": -223.6134033203125, + "loss": 0.6033, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4688810110092163, + "rewards/margins": 1.3876698017120361, + "rewards/rejected": -2.856550931930542, "step": 2990 }, { - "epoch": 0.76, - "learning_rate": 4.151296695066005e-07, - "logits/chosen": -2.8344216346740723, - "logits/rejected": -2.7577290534973145, - "logps/chosen": -271.50860595703125, - "logps/rejected": -270.90576171875, - "loss": 0.5541, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.7711751461029053, - "rewards/margins": 0.8914017677307129, - "rewards/rejected": -2.662576913833618, + "epoch": 0.72, + "learning_rate": 4.2186664289534675e-07, + "logits/chosen": -2.7943618297576904, + "logits/rejected": -2.8380627632141113, + "logps/chosen": -317.232177734375, + "logps/rejected": -305.44940185546875, + "loss": 0.5913, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9965862035751343, + "rewards/margins": 1.4427000284194946, + "rewards/rejected": -2.439286470413208, "step": 3000 }, { - "epoch": 0.76, - "learning_rate": 4.146615485441438e-07, - "logits/chosen": -2.62028169631958, - "logits/rejected": -2.5463624000549316, - "logps/chosen": -329.8163146972656, - "logps/rejected": -278.805908203125, - "loss": 0.5247, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -2.1455769538879395, - "rewards/margins": 1.8340288400650024, - "rewards/rejected": -3.9796054363250732, + "epoch": 0.72, + "eval_logits/chosen": -2.6631782054901123, + "eval_logits/rejected": -2.6420865058898926, + "eval_logps/chosen": -221.8050537109375, + "eval_logps/rejected": -228.2149200439453, + "eval_loss": 0.5163535475730896, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -2.584404706954956, + "eval_rewards/margins": 1.7977792024612427, + "eval_rewards/rejected": -4.382184028625488, + "eval_runtime": 132.2961, + "eval_samples_per_second": 23.856, + "eval_steps_per_second": 0.378, + "step": 3000 + }, + { + "epoch": 0.72, + "learning_rate": 4.2142093064717416e-07, + "logits/chosen": -2.6681129932403564, + "logits/rejected": -2.7343590259552, + "logps/chosen": -180.8966827392578, + "logps/rejected": -225.6484375, + "loss": 0.5211, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.968076229095459, + "rewards/margins": 2.2136971950531006, + "rewards/rejected": -3.1817736625671387, "step": 3010 }, { - "epoch": 0.76, - "learning_rate": 4.141934275816871e-07, - "logits/chosen": -2.6459527015686035, - "logits/rejected": -2.6695361137390137, - "logps/chosen": -280.35699462890625, - "logps/rejected": -262.6058654785156, - "loss": 0.6609, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.1435942649841309, - "rewards/margins": 1.60695481300354, - "rewards/rejected": -2.750548839569092, + "epoch": 0.73, + "learning_rate": 4.2097521839900156e-07, + "logits/chosen": -2.6173248291015625, + "logits/rejected": -2.634675979614258, + "logps/chosen": -315.01031494140625, + "logps/rejected": -258.7602844238281, + "loss": 0.5231, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.515277624130249, + "rewards/margins": 1.4592519998550415, + "rewards/rejected": -2.974529504776001, "step": 3020 }, { - "epoch": 0.77, - "learning_rate": 4.137253066192304e-07, - "logits/chosen": -2.807156562805176, - "logits/rejected": -2.7334413528442383, - "logps/chosen": -327.09344482421875, - "logps/rejected": -289.5278625488281, - "loss": 0.537, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.9722150564193726, - "rewards/margins": 1.2871454954147339, - "rewards/rejected": -2.2593607902526855, + "epoch": 0.73, + "learning_rate": 4.20529506150829e-07, + "logits/chosen": -2.850180149078369, + "logits/rejected": -2.6990010738372803, + "logps/chosen": -216.4237518310547, + "logps/rejected": -271.6128234863281, + "loss": 0.5315, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2702429294586182, + "rewards/margins": 2.8379485607147217, + "rewards/rejected": -4.10819149017334, "step": 3030 }, { - "epoch": 0.77, - "learning_rate": 4.132571856567737e-07, - "logits/chosen": -2.73054838180542, - "logits/rejected": -2.613467216491699, - "logps/chosen": -325.3888244628906, - "logps/rejected": -293.91192626953125, - "loss": 0.6763, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.6591287851333618, - "rewards/margins": 0.35142362117767334, - "rewards/rejected": -2.0105526447296143, + "epoch": 0.73, + "learning_rate": 4.200837939026564e-07, + "logits/chosen": -2.7470576763153076, + "logits/rejected": -2.741525411605835, + "logps/chosen": -225.9722442626953, + "logps/rejected": -229.1326446533203, + "loss": 0.5389, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1506850719451904, + "rewards/margins": 2.079385995864868, + "rewards/rejected": -3.2300708293914795, "step": 3040 }, { - "epoch": 0.77, - "learning_rate": 4.12789064694317e-07, - "logits/chosen": -2.6410369873046875, - "logits/rejected": -2.5918993949890137, - "logps/chosen": -286.21051025390625, - "logps/rejected": -279.4210205078125, - "loss": 0.6697, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2338840961456299, - "rewards/margins": 1.201923131942749, - "rewards/rejected": -2.435807704925537, + "epoch": 0.73, + "learning_rate": 4.196380816544838e-07, + "logits/chosen": -2.6515331268310547, + "logits/rejected": -2.725721836090088, + "logps/chosen": -236.8667755126953, + "logps/rejected": -196.12484741210938, + "loss": 0.6235, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.18961039185523987, + "rewards/margins": 1.1574599742889404, + "rewards/rejected": -1.3470706939697266, "step": 3050 }, { - "epoch": 0.77, - "learning_rate": 4.123209437318603e-07, - "logits/chosen": -2.6927359104156494, - "logits/rejected": -2.6920785903930664, - "logps/chosen": -403.5760498046875, - "logps/rejected": -293.9722595214844, - "loss": 0.5659, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.5857149362564087, - "rewards/margins": 0.5379332304000854, - "rewards/rejected": -2.123648166656494, - "step": 3060 + "epoch": 0.74, + "learning_rate": 4.191923694063113e-07, + "logits/chosen": -2.923741102218628, + "logits/rejected": -2.850276470184326, + "logps/chosen": -291.39129638671875, + "logps/rejected": -243.1573944091797, + "loss": 0.9608, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9407339096069336, + "rewards/margins": 1.1975892782211304, + "rewards/rejected": -2.1383233070373535, + "step": 3060 }, { - "epoch": 0.78, - "learning_rate": 4.118528227694036e-07, - "logits/chosen": -2.538649082183838, - "logits/rejected": -2.5093741416931152, - "logps/chosen": -229.52725219726562, - "logps/rejected": -292.178466796875, - "loss": 0.5763, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.477867841720581, - "rewards/margins": 1.1251497268676758, - "rewards/rejected": -2.603017568588257, + "epoch": 0.74, + "learning_rate": 4.187466571581387e-07, + "logits/chosen": -2.7704670429229736, + "logits/rejected": -2.743314027786255, + "logps/chosen": -272.50640869140625, + "logps/rejected": -193.0053253173828, + "loss": 0.62, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.1542600393295288, + "rewards/margins": 1.0787429809570312, + "rewards/rejected": -2.2330029010772705, "step": 3070 }, { - "epoch": 0.78, - "learning_rate": 4.113847018069469e-07, - "logits/chosen": -2.6631672382354736, - "logits/rejected": -2.600651741027832, - "logps/chosen": -307.30609130859375, - "logps/rejected": -316.18426513671875, - "loss": 0.5171, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.586888074874878, - "rewards/margins": 1.2167094945907593, - "rewards/rejected": -2.8035976886749268, + "epoch": 0.74, + "learning_rate": 4.183009449099661e-07, + "logits/chosen": -2.7297980785369873, + "logits/rejected": -2.681535243988037, + "logps/chosen": -342.48883056640625, + "logps/rejected": -292.08294677734375, + "loss": 0.6297, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.397939920425415, + "rewards/margins": 1.863065481185913, + "rewards/rejected": -3.261005401611328, "step": 3080 }, { - "epoch": 0.78, - "learning_rate": 4.109165808444902e-07, - "logits/chosen": -2.6414685249328613, - "logits/rejected": -2.5432944297790527, - "logps/chosen": -282.6369934082031, - "logps/rejected": -255.5457000732422, - "loss": 0.6611, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.7821651697158813, - "rewards/margins": 1.5107762813568115, - "rewards/rejected": -3.2929415702819824, + "epoch": 0.74, + "learning_rate": 4.178552326617935e-07, + "logits/chosen": -2.6804895401000977, + "logits/rejected": -2.633514404296875, + "logps/chosen": -252.39779663085938, + "logps/rejected": -248.11868286132812, + "loss": 0.5121, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.125462293624878, + "rewards/margins": 2.908383369445801, + "rewards/rejected": -4.033844947814941, "step": 3090 }, { - "epoch": 0.78, - "learning_rate": 4.104484598820335e-07, - "logits/chosen": -2.742868661880493, - "logits/rejected": -2.6158576011657715, - "logps/chosen": -297.921875, - "logps/rejected": -295.4357604980469, - "loss": 0.5415, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3601291179656982, - "rewards/margins": 1.7456867694854736, - "rewards/rejected": -3.10581636428833, + "epoch": 0.75, + "learning_rate": 4.1740952041362094e-07, + "logits/chosen": -2.7150685787200928, + "logits/rejected": -2.6344127655029297, + "logps/chosen": -280.14202880859375, + "logps/rejected": -304.5433654785156, + "loss": 0.7441, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4521244764328003, + "rewards/margins": 0.3552326261997223, + "rewards/rejected": -1.8073571920394897, "step": 3100 }, { - "epoch": 0.79, - "learning_rate": 4.099803389195768e-07, - "logits/chosen": -2.6755316257476807, - "logits/rejected": -2.6158745288848877, - "logps/chosen": -228.6281280517578, - "logps/rejected": -174.53689575195312, - "loss": 0.5777, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.9730864763259888, - "rewards/margins": 0.5768686532974243, - "rewards/rejected": -1.549955129623413, + "epoch": 0.75, + "eval_logits/chosen": -2.6464803218841553, + "eval_logits/rejected": -2.6254334449768066, + "eval_logps/chosen": -220.8607940673828, + "eval_logps/rejected": -227.27622985839844, + "eval_loss": 0.5174666047096252, + "eval_rewards/accuracies": 0.6725000143051147, + "eval_rewards/chosen": -2.4899778366088867, + "eval_rewards/margins": 1.7983382940292358, + "eval_rewards/rejected": -4.288315773010254, + "eval_runtime": 132.0495, + "eval_samples_per_second": 23.9, + "eval_steps_per_second": 0.379, + "step": 3100 + }, + { + "epoch": 0.75, + "learning_rate": 4.1696380816544834e-07, + "logits/chosen": -2.8466691970825195, + "logits/rejected": -2.8690829277038574, + "logps/chosen": -305.29412841796875, + "logps/rejected": -354.5791320800781, + "loss": 0.6765, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7334659099578857, + "rewards/margins": 1.8862203359603882, + "rewards/rejected": -3.6196866035461426, "step": 3110 }, { - "epoch": 0.79, - "learning_rate": 4.095122179571201e-07, - "logits/chosen": -2.855614185333252, - "logits/rejected": -2.75842022895813, - "logps/chosen": -370.797119140625, - "logps/rejected": -288.92547607421875, - "loss": 0.6124, - "rewards/accuracies": 0.5, - "rewards/chosen": -2.317105293273926, - "rewards/margins": 0.5193389654159546, - "rewards/rejected": -2.836444139480591, + "epoch": 0.75, + "learning_rate": 4.1651809591727575e-07, + "logits/chosen": -2.7417259216308594, + "logits/rejected": -2.6553258895874023, + "logps/chosen": -202.55335998535156, + "logps/rejected": -251.43392944335938, + "loss": 0.5804, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.216357946395874, + "rewards/margins": 1.9447059631347656, + "rewards/rejected": -4.161064147949219, "step": 3120 }, { - "epoch": 0.79, - "learning_rate": 4.0904409699466343e-07, - "logits/chosen": -2.551896095275879, - "logits/rejected": -2.552696704864502, - "logps/chosen": -204.35296630859375, - "logps/rejected": -230.80282592773438, - "loss": 0.5603, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.911942720413208, - "rewards/margins": 1.3499393463134766, - "rewards/rejected": -3.2618820667266846, + "epoch": 0.75, + "learning_rate": 4.160723836691032e-07, + "logits/chosen": -2.6539032459259033, + "logits/rejected": -2.6280667781829834, + "logps/chosen": -279.409912109375, + "logps/rejected": -259.25140380859375, + "loss": 0.6704, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.332376003265381, + "rewards/margins": 1.6656761169433594, + "rewards/rejected": -3.998051881790161, "step": 3130 }, { - "epoch": 0.79, - "learning_rate": 4.085759760322067e-07, - "logits/chosen": -2.778578519821167, - "logits/rejected": -2.698376417160034, - "logps/chosen": -189.66526794433594, - "logps/rejected": -198.605712890625, - "loss": 0.5461, + "epoch": 0.76, + "learning_rate": 4.156266714209306e-07, + "logits/chosen": -2.9397025108337402, + "logits/rejected": -2.9069294929504395, + "logps/chosen": -252.04751586914062, + "logps/rejected": -298.326416015625, + "loss": 0.5141, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.7923952341079712, - "rewards/margins": 1.0926042795181274, - "rewards/rejected": -2.8849995136260986, + "rewards/chosen": -2.712002754211426, + "rewards/margins": 1.291800856590271, + "rewards/rejected": -4.003803730010986, "step": 3140 }, { - "epoch": 0.8, - "learning_rate": 4.0810785506975e-07, - "logits/chosen": -2.641099691390991, - "logits/rejected": -2.5993666648864746, - "logps/chosen": -179.12158203125, - "logps/rejected": -188.12527465820312, - "loss": 0.4533, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.4460694789886475, - "rewards/margins": 1.0726109743118286, - "rewards/rejected": -2.5186808109283447, + "epoch": 0.76, + "learning_rate": 4.15180959172758e-07, + "logits/chosen": -2.8749005794525146, + "logits/rejected": -2.862103223800659, + "logps/chosen": -242.1090850830078, + "logps/rejected": -205.1970977783203, + "loss": 0.6121, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.7022042274475098, + "rewards/margins": 0.7931037545204163, + "rewards/rejected": -3.4953079223632812, "step": 3150 }, { - "epoch": 0.8, - "learning_rate": 4.076397341072933e-07, - "logits/chosen": -2.62526798248291, - "logits/rejected": -2.607677936553955, - "logps/chosen": -210.54129028320312, - "logps/rejected": -272.2312927246094, - "loss": 0.581, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.301488995552063, - "rewards/margins": 0.9993396997451782, - "rewards/rejected": -2.300828695297241, + "epoch": 0.76, + "learning_rate": 4.1473524692458546e-07, + "logits/chosen": -2.7872793674468994, + "logits/rejected": -2.7768478393554688, + "logps/chosen": -261.559326171875, + "logps/rejected": -243.8427734375, + "loss": 0.4852, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.308211088180542, + "rewards/margins": 0.8918790817260742, + "rewards/rejected": -3.2000904083251953, "step": 3160 }, { - "epoch": 0.8, - "learning_rate": 4.071716131448366e-07, - "logits/chosen": -2.711860179901123, - "logits/rejected": -2.6672210693359375, - "logps/chosen": -294.1248779296875, - "logps/rejected": -276.2283935546875, - "loss": 0.571, + "epoch": 0.76, + "learning_rate": 4.1428953467641287e-07, + "logits/chosen": -2.762392997741699, + "logits/rejected": -2.770738124847412, + "logps/chosen": -214.235107421875, + "logps/rejected": -194.26441955566406, + "loss": 0.546, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.4649314284324646, - "rewards/margins": 1.3288383483886719, - "rewards/rejected": -1.7937695980072021, + "rewards/chosen": -1.9100580215454102, + "rewards/margins": 1.5261926651000977, + "rewards/rejected": -3.436250686645508, "step": 3170 }, { - "epoch": 0.8, - "learning_rate": 4.0670349218237993e-07, - "logits/chosen": -2.5365118980407715, - "logits/rejected": -2.5782217979431152, - "logps/chosen": -335.83917236328125, - "logps/rejected": -262.0423583984375, - "loss": 0.4983, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.8767711520195007, - "rewards/margins": 1.6427284479141235, - "rewards/rejected": -2.5194995403289795, + "epoch": 0.77, + "learning_rate": 4.1384382242824027e-07, + "logits/chosen": -2.8301382064819336, + "logits/rejected": -2.664149284362793, + "logps/chosen": -218.0136260986328, + "logps/rejected": -160.65834045410156, + "loss": 0.6072, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.562560796737671, + "rewards/margins": 0.4816276431083679, + "rewards/rejected": -2.0441884994506836, "step": 3180 }, { - "epoch": 0.81, - "learning_rate": 4.0623537121992316e-07, - "logits/chosen": -2.634230852127075, - "logits/rejected": -2.648371934890747, - "logps/chosen": -328.01885986328125, - "logps/rejected": -252.741943359375, - "loss": 0.575, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.053867816925049, - "rewards/margins": 1.3023773431777954, - "rewards/rejected": -3.356245517730713, + "epoch": 0.77, + "learning_rate": 4.133981101800677e-07, + "logits/chosen": -2.840327739715576, + "logits/rejected": -2.8039889335632324, + "logps/chosen": -292.4952087402344, + "logps/rejected": -334.70831298828125, + "loss": 0.6327, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9457414746284485, + "rewards/margins": 0.8799258470535278, + "rewards/rejected": -1.8256676197052002, "step": 3190 }, { - "epoch": 0.81, - "learning_rate": 4.057672502574665e-07, - "logits/chosen": -2.7491402626037598, - "logits/rejected": -2.7113699913024902, - "logps/chosen": -366.0289001464844, - "logps/rejected": -295.0087585449219, - "loss": 0.5942, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.1792293787002563, - "rewards/margins": 1.7449098825454712, - "rewards/rejected": -2.9241390228271484, + "epoch": 0.77, + "learning_rate": 4.1295239793189513e-07, + "logits/chosen": -2.902097702026367, + "logits/rejected": -2.7434048652648926, + "logps/chosen": -243.239990234375, + "logps/rejected": -212.5474395751953, + "loss": 0.6169, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.7089574337005615, + "rewards/margins": 0.9152014851570129, + "rewards/rejected": -2.6241588592529297, "step": 3200 }, { - "epoch": 0.81, - "learning_rate": 4.052991292950098e-07, - "logits/chosen": -2.7164413928985596, - "logits/rejected": -2.7283341884613037, - "logps/chosen": -244.5092315673828, - "logps/rejected": -243.07958984375, - "loss": 0.5719, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.5343577861785889, - "rewards/margins": 0.30130812525749207, - "rewards/rejected": -1.8356659412384033, + "epoch": 0.77, + "eval_logits/chosen": -2.6774706840515137, + "eval_logits/rejected": -2.65167236328125, + "eval_logps/chosen": -218.45034790039062, + "eval_logps/rejected": -223.05889892578125, + "eval_loss": 0.516303300857544, + "eval_rewards/accuracies": 0.6600000262260437, + "eval_rewards/chosen": -2.248932123184204, + "eval_rewards/margins": 1.6176486015319824, + "eval_rewards/rejected": -3.8665812015533447, + "eval_runtime": 132.0418, + "eval_samples_per_second": 23.902, + "eval_steps_per_second": 0.379, + "step": 3200 + }, + { + "epoch": 0.77, + "learning_rate": 4.1250668568372253e-07, + "logits/chosen": -2.7115418910980225, + "logits/rejected": -2.6640706062316895, + "logps/chosen": -202.72142028808594, + "logps/rejected": -234.97793579101562, + "loss": 0.5523, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.112852692604065, + "rewards/margins": 1.6642510890960693, + "rewards/rejected": -2.777103900909424, "step": 3210 }, { - "epoch": 0.81, - "learning_rate": 4.0483100833255314e-07, - "logits/chosen": -2.690765619277954, - "logits/rejected": -2.544368028640747, - "logps/chosen": -251.3297576904297, - "logps/rejected": -224.3122100830078, - "loss": 0.5907, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.7500336170196533, - "rewards/margins": 1.2233902215957642, - "rewards/rejected": -2.973423719406128, + "epoch": 0.77, + "learning_rate": 4.1206097343555e-07, + "logits/chosen": -2.684051036834717, + "logits/rejected": -2.725161552429199, + "logps/chosen": -207.09140014648438, + "logps/rejected": -212.0842742919922, + "loss": 0.5388, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9394035339355469, + "rewards/margins": 1.6471328735351562, + "rewards/rejected": -2.586536407470703, "step": 3220 }, { - "epoch": 0.82, - "learning_rate": 4.0436288737009643e-07, - "logits/chosen": -2.56693959236145, - "logits/rejected": -2.575343608856201, - "logps/chosen": -218.8804473876953, - "logps/rejected": -245.62911987304688, - "loss": 0.557, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.9216814041137695, - "rewards/margins": 1.8012382984161377, - "rewards/rejected": -3.7229199409484863, + "epoch": 0.78, + "learning_rate": 4.116152611873774e-07, + "logits/chosen": -2.757572650909424, + "logits/rejected": -2.719525098800659, + "logps/chosen": -261.9958190917969, + "logps/rejected": -280.16259765625, + "loss": 0.4942, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.293212652206421, + "rewards/margins": 1.5744187831878662, + "rewards/rejected": -2.867631435394287, "step": 3230 }, { - "epoch": 0.82, - "learning_rate": 4.038947664076397e-07, - "logits/chosen": -2.5427839756011963, - "logits/rejected": -2.329836368560791, - "logps/chosen": -255.80868530273438, - "logps/rejected": -224.1576690673828, - "loss": 0.5536, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.7845814228057861, - "rewards/margins": 1.403369665145874, - "rewards/rejected": -3.187950849533081, + "epoch": 0.78, + "learning_rate": 4.1116954893920485e-07, + "logits/chosen": -2.9839398860931396, + "logits/rejected": -2.7981276512145996, + "logps/chosen": -319.7491455078125, + "logps/rejected": -253.24435424804688, + "loss": 0.6379, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6714880466461182, + "rewards/margins": 2.326796531677246, + "rewards/rejected": -3.998284101486206, "step": 3240 }, { - "epoch": 0.82, - "learning_rate": 4.03426645445183e-07, - "logits/chosen": -2.501692533493042, - "logits/rejected": -2.4710707664489746, - "logps/chosen": -273.8977966308594, - "logps/rejected": -273.548095703125, - "loss": 0.5992, + "epoch": 0.78, + "learning_rate": 4.107238366910323e-07, + "logits/chosen": -2.785733461380005, + "logits/rejected": -2.787304639816284, + "logps/chosen": -276.7082824707031, + "logps/rejected": -289.84771728515625, + "loss": 0.6479, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.00805401802063, - "rewards/margins": 1.3584315776824951, - "rewards/rejected": -3.366485595703125, + "rewards/chosen": -1.8403196334838867, + "rewards/margins": 1.976406455039978, + "rewards/rejected": -3.816725969314575, "step": 3250 }, { - "epoch": 0.82, - "learning_rate": 4.0295852448272635e-07, - "logits/chosen": -2.6749606132507324, - "logits/rejected": -2.640605926513672, - "logps/chosen": -208.001708984375, - "logps/rejected": -244.11300659179688, - "loss": 0.564, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.029116153717041, - "rewards/margins": 1.3018925189971924, - "rewards/rejected": -3.3310089111328125, + "epoch": 0.78, + "learning_rate": 4.102781244428597e-07, + "logits/chosen": -2.888366937637329, + "logits/rejected": -2.8376317024230957, + "logps/chosen": -280.24224853515625, + "logps/rejected": -240.0019989013672, + "loss": 0.5354, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5033063888549805, + "rewards/margins": 1.2687366008758545, + "rewards/rejected": -2.772042989730835, "step": 3260 }, { - "epoch": 0.83, - "learning_rate": 4.0249040352026964e-07, - "logits/chosen": -2.686748504638672, - "logits/rejected": -2.5494863986968994, - "logps/chosen": -302.6435546875, - "logps/rejected": -251.1964874267578, - "loss": 0.6419, - "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": -2.0372931957244873, - "rewards/margins": 0.9006452560424805, - "rewards/rejected": -2.9379382133483887, + "epoch": 0.79, + "learning_rate": 4.098324121946871e-07, + "logits/chosen": -2.877849578857422, + "logits/rejected": -2.759117841720581, + "logps/chosen": -254.46304321289062, + "logps/rejected": -257.31671142578125, + "loss": 0.5151, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5496079921722412, + "rewards/margins": 2.2586841583251953, + "rewards/rejected": -3.8082923889160156, "step": 3270 }, { - "epoch": 0.83, - "learning_rate": 4.020222825578129e-07, - "logits/chosen": -2.7011072635650635, - "logits/rejected": -2.6204285621643066, - "logps/chosen": -292.6033630371094, - "logps/rejected": -324.64434814453125, - "loss": 0.5648, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.684535264968872, - "rewards/margins": 1.2165940999984741, - "rewards/rejected": -2.9011292457580566, + "epoch": 0.79, + "learning_rate": 4.093866999465145e-07, + "logits/chosen": -2.970949649810791, + "logits/rejected": -2.828958034515381, + "logps/chosen": -303.1264343261719, + "logps/rejected": -304.41412353515625, + "loss": 0.723, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8294920921325684, + "rewards/margins": 1.3070285320281982, + "rewards/rejected": -2.1365208625793457, "step": 3280 }, { - "epoch": 0.83, - "learning_rate": 4.015541615953562e-07, - "logits/chosen": -2.535747528076172, - "logits/rejected": -2.582158088684082, - "logps/chosen": -195.5382843017578, - "logps/rejected": -261.56365966796875, - "loss": 0.6563, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -2.721742868423462, - "rewards/margins": 0.9087635278701782, - "rewards/rejected": -3.6305060386657715, + "epoch": 0.79, + "learning_rate": 4.0894098769834197e-07, + "logits/chosen": -2.791436195373535, + "logits/rejected": -2.679800033569336, + "logps/chosen": -309.3990173339844, + "logps/rejected": -291.8839111328125, + "loss": 0.5399, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3520158529281616, + "rewards/margins": 1.8990980386734009, + "rewards/rejected": -3.2511138916015625, "step": 3290 }, { - "epoch": 0.83, - "learning_rate": 4.010860406328995e-07, - "logits/chosen": -2.808954954147339, - "logits/rejected": -2.696636915206909, - "logps/chosen": -361.0929260253906, - "logps/rejected": -290.98779296875, - "loss": 0.6535, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -2.4368767738342285, - "rewards/margins": 0.8197757601737976, - "rewards/rejected": -3.256652355194092, + "epoch": 0.79, + "learning_rate": 4.0849527545016937e-07, + "logits/chosen": -2.682048797607422, + "logits/rejected": -2.663224697113037, + "logps/chosen": -352.7359313964844, + "logps/rejected": -334.49859619140625, + "loss": 0.5347, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11662639677524567, + "rewards/margins": 1.7480242252349854, + "rewards/rejected": -1.8646503686904907, "step": 3300 }, { - "epoch": 0.84, - "learning_rate": 4.0061791967044285e-07, - "logits/chosen": -2.429372787475586, - "logits/rejected": -2.4101641178131104, - "logps/chosen": -292.60931396484375, - "logps/rejected": -245.3379669189453, - "loss": 0.6289, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -2.1441898345947266, - "rewards/margins": 1.6892770528793335, - "rewards/rejected": -3.8334667682647705, + "epoch": 0.79, + "eval_logits/chosen": -2.6908528804779053, + "eval_logits/rejected": -2.6711535453796387, + "eval_logps/chosen": -222.66001892089844, + "eval_logps/rejected": -228.23675537109375, + "eval_loss": 0.5222463607788086, + "eval_rewards/accuracies": 0.637499988079071, + "eval_rewards/chosen": -2.6698992252349854, + "eval_rewards/margins": 1.714467167854309, + "eval_rewards/rejected": -4.384366512298584, + "eval_runtime": 132.191, + "eval_samples_per_second": 23.875, + "eval_steps_per_second": 0.378, + "step": 3300 + }, + { + "epoch": 0.8, + "learning_rate": 4.0804956320199677e-07, + "logits/chosen": -2.665228843688965, + "logits/rejected": -2.6133179664611816, + "logps/chosen": -208.2563018798828, + "logps/rejected": -214.9170379638672, + "loss": 0.4921, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2633521556854248, + "rewards/margins": 0.980221152305603, + "rewards/rejected": -2.2435734272003174, "step": 3310 }, { - "epoch": 0.84, - "learning_rate": 4.0014979870798614e-07, - "logits/chosen": -2.67567777633667, - "logits/rejected": -2.740675687789917, - "logps/chosen": -295.6293029785156, - "logps/rejected": -416.08221435546875, - "loss": 0.6084, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.9729360342025757, - "rewards/margins": 1.8700878620147705, - "rewards/rejected": -3.8430240154266357, + "epoch": 0.8, + "learning_rate": 4.0760385095382423e-07, + "logits/chosen": -2.7662155628204346, + "logits/rejected": -2.759779453277588, + "logps/chosen": -341.47320556640625, + "logps/rejected": -256.7302551269531, + "loss": 0.8424, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6086572408676147, + "rewards/margins": 1.3617091178894043, + "rewards/rejected": -1.9703662395477295, "step": 3320 }, { - "epoch": 0.84, - "learning_rate": 3.9968167774552943e-07, - "logits/chosen": -2.6133923530578613, - "logits/rejected": -2.5547168254852295, - "logps/chosen": -234.68698120117188, - "logps/rejected": -259.75048828125, - "loss": 0.4013, + "epoch": 0.8, + "learning_rate": 4.0715813870565163e-07, + "logits/chosen": -2.7792305946350098, + "logits/rejected": -2.678015947341919, + "logps/chosen": -256.3700866699219, + "logps/rejected": -361.92999267578125, + "loss": 0.4706, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.157472610473633, - "rewards/margins": 1.3976643085479736, - "rewards/rejected": -3.5551364421844482, + "rewards/chosen": -1.9827537536621094, + "rewards/margins": 2.301004409790039, + "rewards/rejected": -4.283758163452148, "step": 3330 }, { - "epoch": 0.84, - "learning_rate": 3.992135567830727e-07, - "logits/chosen": -2.5629830360412598, - "logits/rejected": -2.607271671295166, - "logps/chosen": -232.90493774414062, - "logps/rejected": -259.7083435058594, - "loss": 0.5546, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1839420795440674, - "rewards/margins": 2.026728391647339, - "rewards/rejected": -3.2106704711914062, + "epoch": 0.8, + "learning_rate": 4.0671242645747903e-07, + "logits/chosen": -2.7010245323181152, + "logits/rejected": -2.669950485229492, + "logps/chosen": -324.13037109375, + "logps/rejected": -348.9970703125, + "loss": 0.5615, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.0223013162612915, + "rewards/margins": 0.8672122955322266, + "rewards/rejected": -1.889513611793518, "step": 3340 }, { - "epoch": 0.85, - "learning_rate": 3.9874543582061606e-07, - "logits/chosen": -2.6749119758605957, - "logits/rejected": -2.4684460163116455, - "logps/chosen": -257.3249816894531, - "logps/rejected": -186.1034698486328, - "loss": 0.5217, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.893366813659668, - "rewards/margins": 1.852936029434204, - "rewards/rejected": -3.746302843093872, + "epoch": 0.81, + "learning_rate": 4.062667142093065e-07, + "logits/chosen": -2.755650043487549, + "logits/rejected": -2.7682108879089355, + "logps/chosen": -333.831787109375, + "logps/rejected": -289.2721252441406, + "loss": 0.4737, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3894363641738892, + "rewards/margins": 0.9578593373298645, + "rewards/rejected": -2.3472955226898193, "step": 3350 }, { - "epoch": 0.85, - "learning_rate": 3.9827731485815935e-07, - "logits/chosen": -2.6366989612579346, - "logits/rejected": -2.5729355812072754, - "logps/chosen": -281.87969970703125, - "logps/rejected": -295.0253601074219, - "loss": 0.6457, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -3.02162504196167, - "rewards/margins": 0.9705334901809692, - "rewards/rejected": -3.9921584129333496, + "epoch": 0.81, + "learning_rate": 4.058210019611339e-07, + "logits/chosen": -2.706803560256958, + "logits/rejected": -2.6700148582458496, + "logps/chosen": -248.3615264892578, + "logps/rejected": -259.49176025390625, + "loss": 0.5909, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7057228088378906, + "rewards/margins": 1.5244674682617188, + "rewards/rejected": -3.2301902770996094, "step": 3360 }, { - "epoch": 0.85, - "learning_rate": 3.978091938957026e-07, - "logits/chosen": -2.4280495643615723, - "logits/rejected": -2.409846782684326, - "logps/chosen": -249.6262969970703, - "logps/rejected": -197.26663208007812, - "loss": 0.5377, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.532490849494934, - "rewards/margins": 1.3351542949676514, - "rewards/rejected": -2.867644786834717, + "epoch": 0.81, + "learning_rate": 4.053752897129613e-07, + "logits/chosen": -2.7024283409118652, + "logits/rejected": -2.601698160171509, + "logps/chosen": -278.46124267578125, + "logps/rejected": -225.0982208251953, + "loss": 0.5999, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7203390002250671, + "rewards/margins": 3.0689213275909424, + "rewards/rejected": -3.7892603874206543, "step": 3370 }, { - "epoch": 0.85, - "learning_rate": 3.9734107293324593e-07, - "logits/chosen": -2.3564815521240234, - "logits/rejected": -2.418379545211792, - "logps/chosen": -230.7577667236328, - "logps/rejected": -210.77334594726562, - "loss": 0.5286, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.699113368988037, - "rewards/margins": 0.9843389391899109, - "rewards/rejected": -3.6834521293640137, + "epoch": 0.81, + "learning_rate": 4.0492957746478875e-07, + "logits/chosen": -2.7170844078063965, + "logits/rejected": -2.689716100692749, + "logps/chosen": -338.5250549316406, + "logps/rejected": -331.200439453125, + "loss": 0.5322, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0604665279388428, + "rewards/margins": 1.8095000982284546, + "rewards/rejected": -2.869966983795166, "step": 3380 }, { - "epoch": 0.86, - "learning_rate": 3.968729519707892e-07, - "logits/chosen": -2.6940178871154785, - "logits/rejected": -2.5732855796813965, - "logps/chosen": -276.3876953125, - "logps/rejected": -253.4854278564453, - "loss": 0.5725, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.361846923828125, - "rewards/margins": 0.9445775747299194, - "rewards/rejected": -2.306424617767334, + "epoch": 0.82, + "learning_rate": 4.0448386521661615e-07, + "logits/chosen": -2.826956033706665, + "logits/rejected": -2.738678455352783, + "logps/chosen": -229.26992797851562, + "logps/rejected": -220.6785888671875, + "loss": 0.5798, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2285170555114746, + "rewards/margins": 1.1528253555297852, + "rewards/rejected": -3.3813424110412598, "step": 3390 }, { - "epoch": 0.86, - "learning_rate": 3.9640483100833256e-07, - "logits/chosen": -2.7425854206085205, - "logits/rejected": -2.730180501937866, - "logps/chosen": -398.3658142089844, - "logps/rejected": -334.22869873046875, - "loss": 0.5644, + "epoch": 0.82, + "learning_rate": 4.0403815296844356e-07, + "logits/chosen": -2.8549716472625732, + "logits/rejected": -2.758796215057373, + "logps/chosen": -275.7738037109375, + "logps/rejected": -342.7076110839844, + "loss": 0.5369, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1900866031646729, - "rewards/margins": 1.9019054174423218, - "rewards/rejected": -3.091992139816284, + "rewards/chosen": -1.2324062585830688, + "rewards/margins": 1.932786226272583, + "rewards/rejected": -3.1651923656463623, "step": 3400 }, { - "epoch": 0.86, - "learning_rate": 3.959367100458758e-07, - "logits/chosen": -2.6171083450317383, - "logits/rejected": -2.6043202877044678, - "logps/chosen": -289.6757507324219, - "logps/rejected": -265.671630859375, - "loss": 0.5571, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2667036056518555, - "rewards/margins": 1.299369215965271, - "rewards/rejected": -2.566072702407837, + "epoch": 0.82, + "eval_logits/chosen": -2.559478521347046, + "eval_logits/rejected": -2.530360221862793, + "eval_logps/chosen": -223.6710968017578, + "eval_logps/rejected": -230.74485778808594, + "eval_loss": 0.5243595838546753, + "eval_rewards/accuracies": 0.6600000262260437, + "eval_rewards/chosen": -2.7710094451904297, + "eval_rewards/margins": 1.8641690015792847, + "eval_rewards/rejected": -4.635178089141846, + "eval_runtime": 132.0589, + "eval_samples_per_second": 23.898, + "eval_steps_per_second": 0.379, + "step": 3400 + }, + { + "epoch": 0.82, + "learning_rate": 4.03592440720271e-07, + "logits/chosen": -2.7602005004882812, + "logits/rejected": -2.6876800060272217, + "logps/chosen": -279.74737548828125, + "logps/rejected": -275.16094970703125, + "loss": 0.4916, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.1874918937683105, + "rewards/margins": 2.0317811965942383, + "rewards/rejected": -4.219273090362549, "step": 3410 }, { - "epoch": 0.86, - "learning_rate": 3.9546858908341914e-07, - "logits/chosen": -2.6402785778045654, - "logits/rejected": -2.543848752975464, - "logps/chosen": -292.3985290527344, - "logps/rejected": -254.4654083251953, - "loss": 0.5835, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.4257738590240479, - "rewards/margins": 1.4361470937728882, - "rewards/rejected": -2.8619208335876465, + "epoch": 0.82, + "learning_rate": 4.031467284720984e-07, + "logits/chosen": -2.7744486331939697, + "logits/rejected": -2.694875478744507, + "logps/chosen": -227.4954071044922, + "logps/rejected": -202.5278778076172, + "loss": 0.4842, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5334289073944092, + "rewards/margins": 1.9471044540405273, + "rewards/rejected": -3.4805335998535156, "step": 3420 }, { - "epoch": 0.87, - "learning_rate": 3.950004681209624e-07, - "logits/chosen": -2.537668228149414, - "logits/rejected": -2.5646309852600098, - "logps/chosen": -211.93148803710938, - "logps/rejected": -216.738037109375, - "loss": 0.5892, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.6508975028991699, - "rewards/margins": 1.7442976236343384, - "rewards/rejected": -2.3951950073242188, + "epoch": 0.83, + "learning_rate": 4.027010162239258e-07, + "logits/chosen": -2.76536226272583, + "logits/rejected": -2.7031362056732178, + "logps/chosen": -248.09506225585938, + "logps/rejected": -193.00314331054688, + "loss": 0.6467, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6645818948745728, + "rewards/margins": 2.320861577987671, + "rewards/rejected": -2.985443115234375, "step": 3430 }, { - "epoch": 0.87, - "learning_rate": 3.9453234715850577e-07, - "logits/chosen": -2.6993932723999023, - "logits/rejected": -2.5626072883605957, - "logps/chosen": -358.39227294921875, - "logps/rejected": -276.04742431640625, - "loss": 0.5386, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.695052981376648, - "rewards/margins": 2.2782647609710693, - "rewards/rejected": -2.9733176231384277, + "epoch": 0.83, + "learning_rate": 4.022553039757532e-07, + "logits/chosen": -2.888481378555298, + "logits/rejected": -2.8760275840759277, + "logps/chosen": -283.21197509765625, + "logps/rejected": -312.6656799316406, + "loss": 0.6338, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.2900383472442627, + "rewards/margins": 0.8005240559577942, + "rewards/rejected": -3.090562343597412, "step": 3440 }, { - "epoch": 0.87, - "learning_rate": 3.9406422619604906e-07, - "logits/chosen": -2.7179439067840576, - "logits/rejected": -2.6138925552368164, - "logps/chosen": -378.1282958984375, - "logps/rejected": -288.8641357421875, - "loss": 0.5758, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.9554533958435059, - "rewards/margins": 1.3336598873138428, - "rewards/rejected": -2.2891132831573486, + "epoch": 0.83, + "learning_rate": 4.018095917275807e-07, + "logits/chosen": -2.843820095062256, + "logits/rejected": -2.7807846069335938, + "logps/chosen": -333.4847717285156, + "logps/rejected": -260.08221435546875, + "loss": 0.5944, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.6706252098083496, + "rewards/margins": 0.11674753576517105, + "rewards/rejected": -2.7873733043670654, "step": 3450 }, { - "epoch": 0.87, - "learning_rate": 3.9359610523359235e-07, - "logits/chosen": -2.6427860260009766, - "logits/rejected": -2.5691373348236084, - "logps/chosen": -247.28494262695312, - "logps/rejected": -264.9586181640625, - "loss": 0.5398, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.0383670330047607, - "rewards/margins": 1.094294786453247, - "rewards/rejected": -2.1326615810394287, + "epoch": 0.83, + "learning_rate": 4.013638794794081e-07, + "logits/chosen": -2.905905246734619, + "logits/rejected": -2.7520716190338135, + "logps/chosen": -296.47027587890625, + "logps/rejected": -282.2842712402344, + "loss": 0.4748, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2645063400268555, + "rewards/margins": 1.9935411214828491, + "rewards/rejected": -3.258047103881836, "step": 3460 }, { - "epoch": 0.88, - "learning_rate": 3.9312798427113564e-07, - "logits/chosen": -2.5711920261383057, - "logits/rejected": -2.5556704998016357, - "logps/chosen": -189.369384765625, - "logps/rejected": -230.4828338623047, - "loss": 0.5015, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.8589425086975098, - "rewards/margins": 1.2133837938308716, - "rewards/rejected": -2.072326183319092, + "epoch": 0.84, + "learning_rate": 4.009181672312355e-07, + "logits/chosen": -2.764261484146118, + "logits/rejected": -2.811213254928589, + "logps/chosen": -244.48934936523438, + "logps/rejected": -235.26821899414062, + "loss": 0.5649, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5248193740844727, + "rewards/margins": 1.6903337240219116, + "rewards/rejected": -3.215153217315674, "step": 3470 }, { - "epoch": 0.88, - "learning_rate": 3.926598633086789e-07, - "logits/chosen": -2.505418062210083, - "logits/rejected": -2.6115143299102783, - "logps/chosen": -305.3795471191406, - "logps/rejected": -264.7258605957031, - "loss": 0.6931, - "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": -2.0123074054718018, - "rewards/margins": -0.07890516519546509, - "rewards/rejected": -1.933402419090271, + "epoch": 0.84, + "learning_rate": 4.0047245498306294e-07, + "logits/chosen": -2.8663015365600586, + "logits/rejected": -2.751207113265991, + "logps/chosen": -254.2701416015625, + "logps/rejected": -270.5858459472656, + "loss": 0.6003, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0607060194015503, + "rewards/margins": 1.6508439779281616, + "rewards/rejected": -2.711550235748291, "step": 3480 }, { - "epoch": 0.88, - "learning_rate": 3.9219174234622227e-07, - "logits/chosen": -2.527928113937378, - "logits/rejected": -2.6276683807373047, - "logps/chosen": -282.0664367675781, - "logps/rejected": -267.37109375, - "loss": 0.5965, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.4060749113559723, - "rewards/margins": 1.2708396911621094, - "rewards/rejected": -1.6769145727157593, + "epoch": 0.84, + "learning_rate": 4.0002674273489034e-07, + "logits/chosen": -2.5828440189361572, + "logits/rejected": -2.622328281402588, + "logps/chosen": -287.8699645996094, + "logps/rejected": -262.64117431640625, + "loss": 0.4798, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7128021717071533, + "rewards/margins": 1.4308149814605713, + "rewards/rejected": -3.1436171531677246, "step": 3490 }, { - "epoch": 0.88, - "learning_rate": 3.917236213837655e-07, - "logits/chosen": -2.7274959087371826, - "logits/rejected": -2.577826976776123, - "logps/chosen": -331.56549072265625, - "logps/rejected": -279.29083251953125, - "loss": 0.4947, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.7532097697257996, - "rewards/margins": 1.5866296291351318, - "rewards/rejected": -2.339839220046997, + "epoch": 0.84, + "learning_rate": 3.9958103048671774e-07, + "logits/chosen": -2.809256076812744, + "logits/rejected": -2.7442965507507324, + "logps/chosen": -206.04623413085938, + "logps/rejected": -180.84036254882812, + "loss": 0.5613, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4523483514785767, + "rewards/margins": 1.1587363481521606, + "rewards/rejected": -2.6110849380493164, "step": 3500 }, { - "epoch": 0.89, - "learning_rate": 3.9125550042130885e-07, - "logits/chosen": -2.564699649810791, - "logits/rejected": -2.598123550415039, - "logps/chosen": -278.11785888671875, - "logps/rejected": -316.83502197265625, - "loss": 0.5313, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2439638376235962, - "rewards/margins": 0.5872949361801147, - "rewards/rejected": -1.831258773803711, + "epoch": 0.84, + "eval_logits/chosen": -2.560426712036133, + "eval_logits/rejected": -2.5348458290100098, + "eval_logps/chosen": -233.6063232421875, + "eval_logps/rejected": -241.16635131835938, + "eval_loss": 0.5431033372879028, + "eval_rewards/accuracies": 0.6474999785423279, + "eval_rewards/chosen": -3.764530897140503, + "eval_rewards/margins": 1.9127956628799438, + "eval_rewards/rejected": -5.677326679229736, + "eval_runtime": 132.1301, + "eval_samples_per_second": 23.886, + "eval_steps_per_second": 0.378, + "step": 3500 + }, + { + "epoch": 0.84, + "learning_rate": 3.991353182385452e-07, + "logits/chosen": -2.799347162246704, + "logits/rejected": -2.7776851654052734, + "logps/chosen": -357.83978271484375, + "logps/rejected": -288.75201416015625, + "loss": 0.5959, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6087825298309326, + "rewards/margins": 2.169447660446167, + "rewards/rejected": -3.7782301902770996, "step": 3510 }, { - "epoch": 0.89, - "learning_rate": 3.9078737945885214e-07, - "logits/chosen": -2.628638744354248, - "logits/rejected": -2.470625400543213, - "logps/chosen": -342.5315856933594, - "logps/rejected": -275.0693054199219, - "loss": 0.7085, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.970574140548706, - "rewards/margins": 1.1266072988510132, - "rewards/rejected": -3.097181797027588, + "epoch": 0.85, + "learning_rate": 3.986896059903726e-07, + "logits/chosen": -2.4091360569000244, + "logits/rejected": -2.4166347980499268, + "logps/chosen": -216.064697265625, + "logps/rejected": -171.97769165039062, + "loss": 0.582, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.16249215602874756, + "rewards/margins": 2.555345296859741, + "rewards/rejected": -2.717837333679199, "step": 3520 }, { - "epoch": 0.89, - "learning_rate": 3.903192584963955e-07, - "logits/chosen": -2.676095485687256, - "logits/rejected": -2.6351985931396484, - "logps/chosen": -349.648193359375, - "logps/rejected": -269.4110412597656, - "loss": 0.6365, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.7081565856933594, - "rewards/margins": 1.273766279220581, - "rewards/rejected": -2.9819226264953613, + "epoch": 0.85, + "learning_rate": 3.982438937422e-07, + "logits/chosen": -2.589353084564209, + "logits/rejected": -2.5620620250701904, + "logps/chosen": -223.41940307617188, + "logps/rejected": -228.9301300048828, + "loss": 0.541, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4173853397369385, + "rewards/margins": 2.2482168674468994, + "rewards/rejected": -3.665602445602417, "step": 3530 }, { - "epoch": 0.89, - "learning_rate": 3.8985113753393877e-07, - "logits/chosen": -2.4113409519195557, - "logits/rejected": -2.4867024421691895, - "logps/chosen": -209.7042999267578, - "logps/rejected": -235.5690155029297, - "loss": 0.5081, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2396607398986816, - "rewards/margins": 1.8427131175994873, - "rewards/rejected": -3.082373857498169, + "epoch": 0.85, + "learning_rate": 3.9779818149402746e-07, + "logits/chosen": -2.613518476486206, + "logits/rejected": -2.5435214042663574, + "logps/chosen": -168.79859924316406, + "logps/rejected": -133.8692169189453, + "loss": 0.5653, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.078601121902466, + "rewards/margins": 0.9717855453491211, + "rewards/rejected": -3.050386667251587, "step": 3540 }, { - "epoch": 0.9, - "learning_rate": 3.8938301657148206e-07, - "logits/chosen": -2.6304538249969482, - "logits/rejected": -2.5243570804595947, - "logps/chosen": -293.2393798828125, - "logps/rejected": -311.14263916015625, - "loss": 0.5648, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.9866330027580261, - "rewards/margins": 2.1932902336120605, - "rewards/rejected": -3.1799235343933105, + "epoch": 0.85, + "learning_rate": 3.9735246924585486e-07, + "logits/chosen": -2.7209832668304443, + "logits/rejected": -2.7087225914001465, + "logps/chosen": -198.72506713867188, + "logps/rejected": -165.67263793945312, + "loss": 0.5501, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4326510429382324, + "rewards/margins": 1.8340715169906616, + "rewards/rejected": -3.2667224407196045, "step": 3550 }, { - "epoch": 0.9, - "learning_rate": 3.8891489560902535e-07, - "logits/chosen": -2.3965888023376465, - "logits/rejected": -2.5309622287750244, - "logps/chosen": -242.2054443359375, - "logps/rejected": -270.7049255371094, - "loss": 0.577, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3094321489334106, - "rewards/margins": 1.3607757091522217, - "rewards/rejected": -2.6702075004577637, + "epoch": 0.86, + "learning_rate": 3.9690675699768227e-07, + "logits/chosen": -2.7536797523498535, + "logits/rejected": -2.7060484886169434, + "logps/chosen": -222.2887420654297, + "logps/rejected": -225.86422729492188, + "loss": 0.5066, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9209111928939819, + "rewards/margins": 1.7878525257110596, + "rewards/rejected": -2.708763599395752, "step": 3560 }, { - "epoch": 0.9, - "learning_rate": 3.884467746465687e-07, - "logits/chosen": -2.592451333999634, - "logits/rejected": -2.6167538166046143, - "logps/chosen": -231.6024627685547, - "logps/rejected": -326.56378173828125, - "loss": 0.6839, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8817847967147827, - "rewards/margins": 2.196343183517456, - "rewards/rejected": -3.0781280994415283, + "epoch": 0.86, + "learning_rate": 3.964610447495097e-07, + "logits/chosen": -2.634460926055908, + "logits/rejected": -2.5598464012145996, + "logps/chosen": -267.879638671875, + "logps/rejected": -250.5550994873047, + "loss": 0.614, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.9559977054595947, + "rewards/margins": 1.4461504220962524, + "rewards/rejected": -4.402148246765137, "step": 3570 }, { - "epoch": 0.9, - "learning_rate": 3.87978653684112e-07, - "logits/chosen": -2.794201612472534, - "logits/rejected": -2.5453829765319824, - "logps/chosen": -418.9364318847656, - "logps/rejected": -227.43875122070312, - "loss": 0.4803, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.071514368057251, - "rewards/margins": 1.1174607276916504, - "rewards/rejected": -3.1889748573303223, + "epoch": 0.86, + "learning_rate": 3.960153325013371e-07, + "logits/chosen": -2.7306995391845703, + "logits/rejected": -2.848644495010376, + "logps/chosen": -331.008056640625, + "logps/rejected": -389.3710021972656, + "loss": 0.6225, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8164174556732178, + "rewards/margins": 0.8572849035263062, + "rewards/rejected": -2.6737027168273926, "step": 3580 }, { - "epoch": 0.91, - "learning_rate": 3.875105327216552e-07, - "logits/chosen": -2.723081350326538, - "logits/rejected": -2.688534736633301, - "logps/chosen": -254.62332153320312, - "logps/rejected": -248.84072875976562, - "loss": 0.5947, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.0840532779693604, - "rewards/margins": 1.7423394918441772, - "rewards/rejected": -2.826392650604248, + "epoch": 0.86, + "learning_rate": 3.9556962025316453e-07, + "logits/chosen": -2.6308114528656006, + "logits/rejected": -2.5287985801696777, + "logps/chosen": -333.4461669921875, + "logps/rejected": -303.1726379394531, + "loss": 0.6664, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.6369529962539673, + "rewards/margins": 1.260517954826355, + "rewards/rejected": -2.8974709510803223, "step": 3590 }, { - "epoch": 0.91, - "learning_rate": 3.8704241175919856e-07, - "logits/chosen": -2.6092543601989746, - "logits/rejected": -2.518146514892578, - "logps/chosen": -291.47137451171875, - "logps/rejected": -250.5021209716797, - "loss": 0.5874, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.7550883889198303, - "rewards/margins": 1.8083865642547607, - "rewards/rejected": -2.5634751319885254, + "epoch": 0.87, + "learning_rate": 3.9512390800499193e-07, + "logits/chosen": -2.820922613143921, + "logits/rejected": -2.7313172817230225, + "logps/chosen": -254.3642120361328, + "logps/rejected": -252.00033569335938, + "loss": 0.6395, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.5080201625823975, + "rewards/margins": 1.8081779479980469, + "rewards/rejected": -4.316198348999023, "step": 3600 }, { - "epoch": 0.91, - "learning_rate": 3.8657429079674185e-07, - "logits/chosen": -2.7659926414489746, - "logits/rejected": -2.703761577606201, - "logps/chosen": -304.2853088378906, - "logps/rejected": -325.69012451171875, - "loss": 0.6338, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.4947483539581299, - "rewards/margins": 0.9108313322067261, - "rewards/rejected": -2.4055798053741455, - "step": 3610 - }, + "epoch": 0.87, + "eval_logits/chosen": -2.577817440032959, + "eval_logits/rejected": -2.5479116439819336, + "eval_logps/chosen": -234.62742614746094, + "eval_logps/rejected": -241.28671264648438, + "eval_loss": 0.5332222580909729, + "eval_rewards/accuracies": 0.6524999737739563, + "eval_rewards/chosen": -3.8666422367095947, + "eval_rewards/margins": 1.8227207660675049, + "eval_rewards/rejected": -5.6893630027771, + "eval_runtime": 132.0472, + "eval_samples_per_second": 23.901, + "eval_steps_per_second": 0.379, + "step": 3600 + }, { - "epoch": 0.92, - "learning_rate": 3.861061698342852e-07, - "logits/chosen": -2.6591992378234863, - "logits/rejected": -2.55127215385437, - "logps/chosen": -316.5832214355469, - "logps/rejected": -230.5525360107422, - "loss": 0.5693, + "epoch": 0.87, + "learning_rate": 3.946781957568194e-07, + "logits/chosen": -2.7825145721435547, + "logits/rejected": -2.759437084197998, + "logps/chosen": -267.33795166015625, + "logps/rejected": -302.57061767578125, + "loss": 0.5882, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.738835096359253, + "rewards/margins": 0.822882354259491, + "rewards/rejected": -2.5617175102233887, + "step": 3610 + }, + { + "epoch": 0.87, + "learning_rate": 3.942324835086468e-07, + "logits/chosen": -2.7459561824798584, + "logits/rejected": -2.8053696155548096, + "logps/chosen": -301.9945373535156, + "logps/rejected": -255.42529296875, + "loss": 0.5364, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.1001923084259033, - "rewards/margins": 1.9854466915130615, - "rewards/rejected": -3.085639238357544, + "rewards/chosen": -1.5883409976959229, + "rewards/margins": 1.9795383214950562, + "rewards/rejected": -3.5678791999816895, "step": 3620 }, { - "epoch": 0.92, - "learning_rate": 3.856380488718284e-07, - "logits/chosen": -2.67317795753479, - "logits/rejected": -2.627720594406128, - "logps/chosen": -236.6415557861328, - "logps/rejected": -306.64239501953125, - "loss": 0.6112, + "epoch": 0.87, + "learning_rate": 3.937867712604742e-07, + "logits/chosen": -2.849888324737549, + "logits/rejected": -2.7563107013702393, + "logps/chosen": -403.3260498046875, + "logps/rejected": -311.11871337890625, + "loss": 0.5544, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.4338676929473877, - "rewards/margins": 2.2288482189178467, - "rewards/rejected": -3.6627159118652344, + "rewards/chosen": -1.209285855293274, + "rewards/margins": 1.720338225364685, + "rewards/rejected": -2.929624080657959, "step": 3630 }, { - "epoch": 0.92, - "learning_rate": 3.8516992790937177e-07, - "logits/chosen": -2.586552619934082, - "logits/rejected": -2.6333885192871094, - "logps/chosen": -260.02496337890625, - "logps/rejected": -262.3716125488281, - "loss": 0.6356, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.574707269668579, - "rewards/margins": 1.0051276683807373, - "rewards/rejected": -2.5798351764678955, + "epoch": 0.88, + "learning_rate": 3.9334105901230165e-07, + "logits/chosen": -2.521254062652588, + "logits/rejected": -2.4202935695648193, + "logps/chosen": -335.81317138671875, + "logps/rejected": -342.54986572265625, + "loss": 0.4439, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5694057941436768, + "rewards/margins": 2.845444917678833, + "rewards/rejected": -4.414850234985352, "step": 3640 }, { - "epoch": 0.92, - "learning_rate": 3.8470180694691506e-07, - "logits/chosen": -2.8641533851623535, - "logits/rejected": -2.7077131271362305, - "logps/chosen": -271.4937438964844, - "logps/rejected": -222.1265411376953, - "loss": 0.615, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.424896001815796, - "rewards/margins": 1.3194665908813477, - "rewards/rejected": -2.7443625926971436, + "epoch": 0.88, + "learning_rate": 3.9289534676412905e-07, + "logits/chosen": -2.636383533477783, + "logits/rejected": -2.6689443588256836, + "logps/chosen": -226.60971069335938, + "logps/rejected": -240.5894012451172, + "loss": 0.6526, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.8428890705108643, + "rewards/margins": 0.903119683265686, + "rewards/rejected": -2.74600887298584, "step": 3650 }, { - "epoch": 0.93, - "learning_rate": 3.842336859844584e-07, - "logits/chosen": -2.6335389614105225, - "logits/rejected": -2.5206198692321777, - "logps/chosen": -303.037353515625, - "logps/rejected": -293.91668701171875, - "loss": 0.5565, + "epoch": 0.88, + "learning_rate": 3.9244963451595645e-07, + "logits/chosen": -2.707064390182495, + "logits/rejected": -2.6527113914489746, + "logps/chosen": -204.14195251464844, + "logps/rejected": -202.14462280273438, + "loss": 0.5952, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.4828656911849976, - "rewards/margins": 1.1465017795562744, - "rewards/rejected": -2.6293673515319824, + "rewards/chosen": -1.8457982540130615, + "rewards/margins": 1.7863857746124268, + "rewards/rejected": -3.632183790206909, "step": 3660 }, { - "epoch": 0.93, - "learning_rate": 3.837655650220017e-07, - "logits/chosen": -2.773890256881714, - "logits/rejected": -2.586940050125122, - "logps/chosen": -332.7080993652344, - "logps/rejected": -269.8664855957031, - "loss": 0.6619, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.16353440284729, - "rewards/margins": 1.4180161952972412, - "rewards/rejected": -3.581550121307373, + "epoch": 0.88, + "learning_rate": 3.920039222677839e-07, + "logits/chosen": -2.662675380706787, + "logits/rejected": -2.561508893966675, + "logps/chosen": -218.0263671875, + "logps/rejected": -265.77490234375, + "loss": 0.5407, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3727507591247559, + "rewards/margins": 2.5847787857055664, + "rewards/rejected": -3.957529067993164, "step": 3670 }, { - "epoch": 0.93, - "learning_rate": 3.832974440595449e-07, - "logits/chosen": -2.72123122215271, - "logits/rejected": -2.6845922470092773, - "logps/chosen": -299.73162841796875, - "logps/rejected": -271.2549743652344, - "loss": 0.7304, + "epoch": 0.89, + "learning_rate": 3.915582100196113e-07, + "logits/chosen": -2.7710025310516357, + "logits/rejected": -2.7594571113586426, + "logps/chosen": -186.926513671875, + "logps/rejected": -266.11163330078125, + "loss": 0.6079, "rewards/accuracies": 0.75, - "rewards/chosen": -2.399379253387451, - "rewards/margins": 0.6283119916915894, - "rewards/rejected": -3.02769136428833, + "rewards/chosen": -1.6701900959014893, + "rewards/margins": 1.5521122217178345, + "rewards/rejected": -3.2223026752471924, "step": 3680 }, { - "epoch": 0.93, - "learning_rate": 3.8282932309708827e-07, - "logits/chosen": -2.725205183029175, - "logits/rejected": -2.6237640380859375, - "logps/chosen": -241.66439819335938, - "logps/rejected": -225.9160614013672, - "loss": 0.5719, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.980405569076538, - "rewards/margins": 0.9876116514205933, - "rewards/rejected": -2.968017101287842, + "epoch": 0.89, + "learning_rate": 3.911124977714387e-07, + "logits/chosen": -2.6601173877716064, + "logits/rejected": -2.7156665325164795, + "logps/chosen": -137.65895080566406, + "logps/rejected": -212.44247436523438, + "loss": 0.5322, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.7452404499053955, + "rewards/margins": 1.1229227781295776, + "rewards/rejected": -3.8681633472442627, "step": 3690 }, { - "epoch": 0.94, - "learning_rate": 3.8236120213463156e-07, - "logits/chosen": -2.708747386932373, - "logits/rejected": -2.711845874786377, - "logps/chosen": -246.9578857421875, - "logps/rejected": -265.8954772949219, - "loss": 0.5576, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -2.7569336891174316, - "rewards/margins": 0.7664319276809692, - "rewards/rejected": -3.5233657360076904, + "epoch": 0.89, + "learning_rate": 3.9066678552326617e-07, + "logits/chosen": -2.8254523277282715, + "logits/rejected": -2.670154571533203, + "logps/chosen": -365.99114990234375, + "logps/rejected": -337.25128173828125, + "loss": 0.6552, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9288088083267212, + "rewards/margins": 1.3776509761810303, + "rewards/rejected": -3.306459903717041, "step": 3700 }, { - "epoch": 0.94, - "learning_rate": 3.818930811721749e-07, - "logits/chosen": -2.563875198364258, - "logits/rejected": -2.60050892829895, - "logps/chosen": -277.04351806640625, - "logps/rejected": -283.3923034667969, - "loss": 0.6133, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -2.120920419692993, - "rewards/margins": 0.8064680099487305, - "rewards/rejected": -2.9273884296417236, + "epoch": 0.89, + "eval_logits/chosen": -2.4901480674743652, + "eval_logits/rejected": -2.4579715728759766, + "eval_logps/chosen": -225.12937927246094, + "eval_logps/rejected": -231.69898986816406, + "eval_loss": 0.5148530602455139, + "eval_rewards/accuracies": 0.6524999737739563, + "eval_rewards/chosen": -2.916835069656372, + "eval_rewards/margins": 1.8137555122375488, + "eval_rewards/rejected": -4.730589866638184, + "eval_runtime": 132.2567, + "eval_samples_per_second": 23.863, + "eval_steps_per_second": 0.378, + "step": 3700 + }, + { + "epoch": 0.89, + "learning_rate": 3.902210732750936e-07, + "logits/chosen": -2.679245948791504, + "logits/rejected": -2.572080373764038, + "logps/chosen": -252.30636596679688, + "logps/rejected": -331.4736022949219, + "loss": 0.6061, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.073190450668335, + "rewards/margins": 0.6728688478469849, + "rewards/rejected": -2.7460594177246094, "step": 3710 }, { - "epoch": 0.94, - "learning_rate": 3.8142496020971813e-07, - "logits/chosen": -2.6808571815490723, - "logits/rejected": -2.6824426651000977, - "logps/chosen": -266.8973388671875, - "logps/rejected": -216.49423217773438, - "loss": 0.5389, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3810316324234009, - "rewards/margins": 1.0641003847122192, - "rewards/rejected": -2.44513201713562, + "epoch": 0.9, + "learning_rate": 3.89775361026921e-07, + "logits/chosen": -2.632016181945801, + "logits/rejected": -2.720738410949707, + "logps/chosen": -258.62640380859375, + "logps/rejected": -247.09042358398438, + "loss": 0.5392, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8232755661010742, + "rewards/margins": 1.5221580266952515, + "rewards/rejected": -3.3454337120056152, "step": 3720 }, { - "epoch": 0.94, - "learning_rate": 3.809568392472615e-07, - "logits/chosen": -2.7239327430725098, - "logits/rejected": -2.701735496520996, - "logps/chosen": -301.28521728515625, - "logps/rejected": -270.1116943359375, - "loss": 0.5956, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.071234941482544, - "rewards/margins": 1.517927885055542, - "rewards/rejected": -2.589162826538086, + "epoch": 0.9, + "learning_rate": 3.8932964877874843e-07, + "logits/chosen": -2.6614232063293457, + "logits/rejected": -2.646655559539795, + "logps/chosen": -226.7962188720703, + "logps/rejected": -265.7986145019531, + "loss": 0.6564, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.015622615814209, + "rewards/margins": 1.1390827894210815, + "rewards/rejected": -2.15470552444458, "step": 3730 }, { - "epoch": 0.95, - "learning_rate": 3.8048871828480477e-07, - "logits/chosen": -2.5309438705444336, - "logits/rejected": -2.4692294597625732, - "logps/chosen": -316.5550842285156, - "logps/rejected": -302.6878662109375, - "loss": 0.4316, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.82965886592865, - "rewards/margins": 1.7301852703094482, - "rewards/rejected": -3.559844493865967, + "epoch": 0.9, + "learning_rate": 3.8888393653057584e-07, + "logits/chosen": -2.476155996322632, + "logits/rejected": -2.5315704345703125, + "logps/chosen": -248.3118896484375, + "logps/rejected": -168.81814575195312, + "loss": 0.6548, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0070793628692627, + "rewards/margins": 1.059119701385498, + "rewards/rejected": -3.0661988258361816, "step": 3740 }, { - "epoch": 0.95, - "learning_rate": 3.800205973223481e-07, - "logits/chosen": -2.6489920616149902, - "logits/rejected": -2.551776647567749, - "logps/chosen": -227.8351593017578, - "logps/rejected": -247.8078155517578, - "loss": 0.7572, + "epoch": 0.9, + "learning_rate": 3.8843822428240324e-07, + "logits/chosen": -2.5508644580841064, + "logits/rejected": -2.652991771697998, + "logps/chosen": -189.95370483398438, + "logps/rejected": -226.12472534179688, + "loss": 0.4454, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -2.2301652431488037, - "rewards/margins": 1.3657900094985962, - "rewards/rejected": -3.5959556102752686, + "rewards/chosen": -2.0008137226104736, + "rewards/margins": 0.8059350252151489, + "rewards/rejected": -2.806748867034912, "step": 3750 }, { - "epoch": 0.95, - "learning_rate": 3.795524763598914e-07, - "logits/chosen": -2.2900452613830566, - "logits/rejected": -2.341315746307373, - "logps/chosen": -368.0122375488281, - "logps/rejected": -338.74951171875, - "loss": 0.5997, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.7872066497802734, - "rewards/margins": 3.213815689086914, - "rewards/rejected": -5.001022815704346, + "epoch": 0.9, + "learning_rate": 3.8799251203423064e-07, + "logits/chosen": -2.8311209678649902, + "logits/rejected": -2.787224292755127, + "logps/chosen": -334.48736572265625, + "logps/rejected": -348.693359375, + "loss": 0.5171, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3315805196762085, + "rewards/margins": 1.2268116474151611, + "rewards/rejected": -2.558392286300659, "step": 3760 }, { - "epoch": 0.95, - "learning_rate": 3.790843553974347e-07, - "logits/chosen": -2.4849908351898193, - "logits/rejected": -2.551020860671997, - "logps/chosen": -263.210693359375, - "logps/rejected": -397.0281066894531, - "loss": 0.4803, + "epoch": 0.91, + "learning_rate": 3.875467997860581e-07, + "logits/chosen": -2.7187821865081787, + "logits/rejected": -2.619144916534424, + "logps/chosen": -212.1243896484375, + "logps/rejected": -176.82835388183594, + "loss": 0.6273, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.4165332317352295, - "rewards/margins": 1.1031792163848877, - "rewards/rejected": -3.519712448120117, + "rewards/chosen": -1.676042914390564, + "rewards/margins": 0.6634700894355774, + "rewards/rejected": -2.339512825012207, "step": 3770 }, { - "epoch": 0.96, - "learning_rate": 3.78616234434978e-07, - "logits/chosen": -2.51816987991333, - "logits/rejected": -2.5692131519317627, - "logps/chosen": -258.7332458496094, - "logps/rejected": -332.8305969238281, - "loss": 0.5861, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.619706392288208, - "rewards/margins": 1.3120684623718262, - "rewards/rejected": -3.9317753314971924, + "epoch": 0.91, + "learning_rate": 3.871010875378855e-07, + "logits/chosen": -2.4955334663391113, + "logits/rejected": -2.554507255554199, + "logps/chosen": -274.38531494140625, + "logps/rejected": -285.43536376953125, + "loss": 0.5729, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.40419697761535645, + "rewards/margins": 2.684859275817871, + "rewards/rejected": -3.0890562534332275, "step": 3780 }, { - "epoch": 0.96, - "learning_rate": 3.7814811347252127e-07, - "logits/chosen": -2.289250612258911, - "logits/rejected": -2.404963970184326, - "logps/chosen": -224.2694549560547, - "logps/rejected": -220.3228759765625, - "loss": 0.6365, - "rewards/accuracies": 0.5, - "rewards/chosen": -2.5331289768218994, - "rewards/margins": 0.5771879553794861, - "rewards/rejected": -3.110316753387451, + "epoch": 0.91, + "learning_rate": 3.866553752897129e-07, + "logits/chosen": -2.6523427963256836, + "logits/rejected": -2.7260308265686035, + "logps/chosen": -253.6539764404297, + "logps/rejected": -322.428955078125, + "loss": 0.557, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3323105573654175, + "rewards/margins": 2.0486483573913574, + "rewards/rejected": -3.3809590339660645, "step": 3790 }, { - "epoch": 0.96, - "learning_rate": 3.776799925100646e-07, - "logits/chosen": -2.3783960342407227, - "logits/rejected": -2.2490427494049072, - "logps/chosen": -397.93804931640625, - "logps/rejected": -365.0287780761719, - "loss": 0.7984, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -2.592641592025757, - "rewards/margins": -0.39536601305007935, - "rewards/rejected": -2.1972756385803223, + "epoch": 0.91, + "learning_rate": 3.8620966304154036e-07, + "logits/chosen": -2.6505486965179443, + "logits/rejected": -2.6866531372070312, + "logps/chosen": -251.415771484375, + "logps/rejected": -230.0135040283203, + "loss": 0.6381, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.1353700160980225, + "rewards/margins": 0.7971321940422058, + "rewards/rejected": -2.932502269744873, "step": 3800 }, { - "epoch": 0.96, - "learning_rate": 3.7721187154760784e-07, - "logits/chosen": -2.3933181762695312, - "logits/rejected": -2.3748421669006348, - "logps/chosen": -261.06378173828125, - "logps/rejected": -215.88504028320312, - "loss": 0.6671, + "epoch": 0.91, + "eval_logits/chosen": -2.499129056930542, + "eval_logits/rejected": -2.4730007648468018, + "eval_logps/chosen": -222.14324951171875, + "eval_logps/rejected": -227.39642333984375, + "eval_loss": 0.5081271529197693, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -2.618225336074829, + "eval_rewards/margins": 1.6821056604385376, + "eval_rewards/rejected": -4.300331115722656, + "eval_runtime": 132.1041, + "eval_samples_per_second": 23.89, + "eval_steps_per_second": 0.378, + "step": 3800 + }, + { + "epoch": 0.92, + "learning_rate": 3.8576395079336776e-07, + "logits/chosen": -2.773014545440674, + "logits/rejected": -2.750072956085205, + "logps/chosen": -290.4387512207031, + "logps/rejected": -378.2158203125, + "loss": 0.5415, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.1627757549285889, - "rewards/margins": 0.8094266653060913, - "rewards/rejected": -1.9722025394439697, + "rewards/chosen": -1.5297836065292358, + "rewards/margins": 0.9236348867416382, + "rewards/rejected": -2.453418254852295, "step": 3810 }, { - "epoch": 0.97, - "learning_rate": 3.767437505851512e-07, - "logits/chosen": -2.6702141761779785, - "logits/rejected": -2.6176254749298096, - "logps/chosen": -281.86773681640625, - "logps/rejected": -348.5569763183594, - "loss": 0.5429, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.463614821434021, - "rewards/margins": 1.5381866693496704, - "rewards/rejected": -3.0018012523651123, + "epoch": 0.92, + "learning_rate": 3.8531823854519516e-07, + "logits/chosen": -2.7695765495300293, + "logits/rejected": -2.735790729522705, + "logps/chosen": -253.1417999267578, + "logps/rejected": -280.3123779296875, + "loss": 0.5337, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.658485770225525, + "rewards/margins": 1.5891600847244263, + "rewards/rejected": -3.247645616531372, "step": 3820 }, { - "epoch": 0.97, - "learning_rate": 3.762756296226945e-07, - "logits/chosen": -2.3716139793395996, - "logits/rejected": -2.35888671875, - "logps/chosen": -261.81512451171875, - "logps/rejected": -226.60299682617188, - "loss": 0.5125, + "epoch": 0.92, + "learning_rate": 3.848725262970226e-07, + "logits/chosen": -2.740900754928589, + "logits/rejected": -2.74745512008667, + "logps/chosen": -270.62310791015625, + "logps/rejected": -314.9116516113281, + "loss": 0.64, "rewards/accuracies": 0.75, - "rewards/chosen": -1.3739840984344482, - "rewards/margins": 1.4622911214828491, - "rewards/rejected": -2.836275339126587, + "rewards/chosen": -2.0330982208251953, + "rewards/margins": 1.5113351345062256, + "rewards/rejected": -3.54443359375, "step": 3830 }, { - "epoch": 0.97, - "learning_rate": 3.758075086602378e-07, - "logits/chosen": -2.617845058441162, - "logits/rejected": -2.6614153385162354, - "logps/chosen": -240.1200714111328, - "logps/rejected": -294.9786071777344, - "loss": 0.4858, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.050519347190857, - "rewards/margins": 2.296268939971924, - "rewards/rejected": -3.346787929534912, + "epoch": 0.92, + "learning_rate": 3.8442681404885e-07, + "logits/chosen": -2.7074077129364014, + "logits/rejected": -2.752732753753662, + "logps/chosen": -231.3099365234375, + "logps/rejected": -244.00411987304688, + "loss": 0.4956, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7288591861724854, + "rewards/margins": 1.7826036214828491, + "rewards/rejected": -3.511462688446045, "step": 3840 }, { - "epoch": 0.97, - "learning_rate": 3.7533938769778105e-07, - "logits/chosen": -2.5451509952545166, - "logits/rejected": -2.451148271560669, - "logps/chosen": -258.9147033691406, - "logps/rejected": -250.61575317382812, - "loss": 0.6505, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.864426851272583, - "rewards/margins": 0.6109026074409485, - "rewards/rejected": -2.4753293991088867, + "epoch": 0.93, + "learning_rate": 3.839811018006774e-07, + "logits/chosen": -2.73136043548584, + "logits/rejected": -2.7185912132263184, + "logps/chosen": -302.53521728515625, + "logps/rejected": -318.34893798828125, + "loss": 0.7121, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4784200191497803, + "rewards/margins": 1.503691554069519, + "rewards/rejected": -3.9821114540100098, "step": 3850 }, { - "epoch": 0.98, - "learning_rate": 3.748712667353244e-07, - "logits/chosen": -2.7763352394104004, - "logits/rejected": -2.7048943042755127, - "logps/chosen": -369.334716796875, - "logps/rejected": -304.45758056640625, - "loss": 0.6338, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.1332387924194336, - "rewards/margins": 0.9427889585494995, - "rewards/rejected": -3.0760276317596436, + "epoch": 0.93, + "learning_rate": 3.835353895525049e-07, + "logits/chosen": -2.6867222785949707, + "logits/rejected": -2.6257197856903076, + "logps/chosen": -235.4755401611328, + "logps/rejected": -251.58090209960938, + "loss": 0.4903, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.582584023475647, + "rewards/margins": 2.0604348182678223, + "rewards/rejected": -3.6430187225341797, "step": 3860 }, { - "epoch": 0.98, - "learning_rate": 3.744031457728677e-07, - "logits/chosen": -2.4644370079040527, - "logits/rejected": -2.4173831939697266, - "logps/chosen": -319.7100524902344, - "logps/rejected": -272.42559814453125, - "loss": 0.5365, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.5952672958374023, - "rewards/margins": 2.1468255519866943, - "rewards/rejected": -3.742093324661255, + "epoch": 0.93, + "learning_rate": 3.830896773043323e-07, + "logits/chosen": -2.484771251678467, + "logits/rejected": -2.438969850540161, + "logps/chosen": -260.0497741699219, + "logps/rejected": -297.59033203125, + "loss": 0.5586, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.405928134918213, + "rewards/margins": 2.229367733001709, + "rewards/rejected": -3.635295867919922, "step": 3870 }, { - "epoch": 0.98, - "learning_rate": 3.7393502481041103e-07, - "logits/chosen": -2.7287070751190186, - "logits/rejected": -2.740267276763916, - "logps/chosen": -383.4566345214844, - "logps/rejected": -363.9472961425781, - "loss": 0.5002, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -2.6474063396453857, - "rewards/margins": 1.0410480499267578, - "rewards/rejected": -3.6884543895721436, + "epoch": 0.93, + "learning_rate": 3.826439650561597e-07, + "logits/chosen": -2.8689968585968018, + "logits/rejected": -2.7582883834838867, + "logps/chosen": -323.91632080078125, + "logps/rejected": -322.7571105957031, + "loss": 0.6079, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.454990863800049, + "rewards/margins": 0.8642986416816711, + "rewards/rejected": -3.3192896842956543, "step": 3880 }, { - "epoch": 0.98, - "learning_rate": 3.734669038479543e-07, - "logits/chosen": -2.6365084648132324, - "logits/rejected": -2.4641079902648926, - "logps/chosen": -363.31915283203125, - "logps/rejected": -291.7921142578125, - "loss": 0.6346, + "epoch": 0.94, + "learning_rate": 3.8219825280798714e-07, + "logits/chosen": -2.6461687088012695, + "logits/rejected": -2.6319615840911865, + "logps/chosen": -255.50912475585938, + "logps/rejected": -243.58554077148438, + "loss": 0.7378, "rewards/accuracies": 0.75, - "rewards/chosen": -1.3033270835876465, - "rewards/margins": 2.174389600753784, - "rewards/rejected": -3.4777164459228516, + "rewards/chosen": -1.4754579067230225, + "rewards/margins": 1.322139859199524, + "rewards/rejected": -2.797597885131836, "step": 3890 }, { - "epoch": 0.99, - "learning_rate": 3.7299878288549755e-07, - "logits/chosen": -2.745086431503296, - "logits/rejected": -2.551102876663208, - "logps/chosen": -321.5898132324219, - "logps/rejected": -275.5663146972656, - "loss": 0.5264, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.8006789684295654, - "rewards/margins": 1.495180368423462, - "rewards/rejected": -4.295859336853027, + "epoch": 0.94, + "learning_rate": 3.8175254055981455e-07, + "logits/chosen": -2.670517683029175, + "logits/rejected": -2.6806640625, + "logps/chosen": -293.5143127441406, + "logps/rejected": -250.64920043945312, + "loss": 0.5355, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4953172206878662, + "rewards/margins": 1.9487268924713135, + "rewards/rejected": -3.4440436363220215, "step": 3900 }, { - "epoch": 0.99, - "learning_rate": 3.725306619230409e-07, - "logits/chosen": -2.579899549484253, - "logits/rejected": -2.564140796661377, - "logps/chosen": -250.631103515625, - "logps/rejected": -247.8636016845703, - "loss": 0.5685, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.100395679473877, - "rewards/margins": 2.7038025856018066, - "rewards/rejected": -3.8041980266571045, + "epoch": 0.94, + "eval_logits/chosen": -2.6064794063568115, + "eval_logits/rejected": -2.5875136852264404, + "eval_logps/chosen": -221.26336669921875, + "eval_logps/rejected": -226.86886596679688, + "eval_loss": 0.509952962398529, + "eval_rewards/accuracies": 0.6474999785423279, + "eval_rewards/chosen": -2.5302348136901855, + "eval_rewards/margins": 1.717340350151062, + "eval_rewards/rejected": -4.247575283050537, + "eval_runtime": 132.2303, + "eval_samples_per_second": 23.867, + "eval_steps_per_second": 0.378, + "step": 3900 + }, + { + "epoch": 0.94, + "learning_rate": 3.8130682831164195e-07, + "logits/chosen": -2.738208055496216, + "logits/rejected": -2.777827024459839, + "logps/chosen": -296.6704406738281, + "logps/rejected": -287.9441223144531, + "loss": 0.4663, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0642380714416504, + "rewards/margins": 2.0133893489837646, + "rewards/rejected": -3.077627658843994, "step": 3910 }, { - "epoch": 0.99, - "learning_rate": 3.720625409605842e-07, - "logits/chosen": -2.5308988094329834, - "logits/rejected": -2.525057315826416, - "logps/chosen": -297.98529052734375, - "logps/rejected": -257.6971130371094, - "loss": 0.5155, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.13560429215431213, - "rewards/margins": 2.4298863410949707, - "rewards/rejected": -2.56549072265625, + "epoch": 0.94, + "learning_rate": 3.8086111606346946e-07, + "logits/chosen": -2.7026431560516357, + "logits/rejected": -2.6036548614501953, + "logps/chosen": -282.5904541015625, + "logps/rejected": -304.15631103515625, + "loss": 0.5412, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8530317544937134, + "rewards/margins": 2.578052043914795, + "rewards/rejected": -3.4310836791992188, "step": 3920 }, { - "epoch": 0.99, - "learning_rate": 3.7159441999812753e-07, - "logits/chosen": -2.6410772800445557, - "logits/rejected": -2.5101382732391357, - "logps/chosen": -234.18124389648438, - "logps/rejected": -279.98370361328125, - "loss": 0.7214, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -2.2928466796875, - "rewards/margins": 0.7621484994888306, - "rewards/rejected": -3.054995059967041, + "epoch": 0.95, + "learning_rate": 3.8041540381529686e-07, + "logits/chosen": -2.784578800201416, + "logits/rejected": -2.7550692558288574, + "logps/chosen": -288.7332763671875, + "logps/rejected": -275.85662841796875, + "loss": 0.5836, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4746872186660767, + "rewards/margins": 1.975242018699646, + "rewards/rejected": -3.4499289989471436, "step": 3930 }, { - "epoch": 1.0, - "learning_rate": 3.7112629903567076e-07, - "logits/chosen": -2.872884750366211, - "logits/rejected": -2.4980196952819824, - "logps/chosen": -281.6012878417969, - "logps/rejected": -180.0428009033203, - "loss": 0.4839, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.9578030109405518, - "rewards/margins": 1.8885114192962646, - "rewards/rejected": -3.8463146686553955, + "epoch": 0.95, + "learning_rate": 3.7996969156712426e-07, + "logits/chosen": -2.939732789993286, + "logits/rejected": -2.8839194774627686, + "logps/chosen": -418.07000732421875, + "logps/rejected": -319.2259521484375, + "loss": 0.5463, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0764384269714355, + "rewards/margins": 1.9666048288345337, + "rewards/rejected": -4.043043613433838, "step": 3940 }, { - "epoch": 1.0, - "learning_rate": 3.706581780732141e-07, - "logits/chosen": -2.7477738857269287, - "logits/rejected": -2.658379316329956, - "logps/chosen": -236.2034912109375, - "logps/rejected": -243.9905242919922, - "loss": 0.6138, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.6140636205673218, - "rewards/margins": 1.226651906967163, - "rewards/rejected": -2.8407154083251953, + "epoch": 0.95, + "learning_rate": 3.7952397931895167e-07, + "logits/chosen": -2.861053705215454, + "logits/rejected": -2.8309762477874756, + "logps/chosen": -303.13482666015625, + "logps/rejected": -319.7237243652344, + "loss": 0.6092, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.3278403282165527, + "rewards/margins": 2.2359471321105957, + "rewards/rejected": -4.563787937164307, "step": 3950 }, { - "epoch": 1.0, - "learning_rate": 3.701900571107574e-07, - "logits/chosen": -2.5700225830078125, - "logits/rejected": -2.509644031524658, - "logps/chosen": -277.822265625, - "logps/rejected": -263.8984680175781, - "loss": 0.3053, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.8047565221786499, - "rewards/margins": 4.11063289642334, - "rewards/rejected": -4.915389537811279, + "epoch": 0.95, + "learning_rate": 3.790782670707791e-07, + "logits/chosen": -2.8640716075897217, + "logits/rejected": -2.9300031661987305, + "logps/chosen": -215.1240234375, + "logps/rejected": -266.8131103515625, + "loss": 0.567, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6753451824188232, + "rewards/margins": 0.917506992816925, + "rewards/rejected": -2.5928521156311035, "step": 3960 }, { - "epoch": 1.0, - "learning_rate": 3.6972193614830074e-07, - "logits/chosen": -2.705073118209839, - "logits/rejected": -2.610149383544922, - "logps/chosen": -239.9237060546875, - "logps/rejected": -314.971923828125, - "loss": 0.139, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.38272005319595337, - "rewards/margins": 6.647641658782959, - "rewards/rejected": -6.26492166519165, + "epoch": 0.96, + "learning_rate": 3.786325548226065e-07, + "logits/chosen": -2.864936351776123, + "logits/rejected": -2.8951430320739746, + "logps/chosen": -236.265380859375, + "logps/rejected": -291.41180419921875, + "loss": 0.5954, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.0504260063171387, + "rewards/margins": 1.0682042837142944, + "rewards/rejected": -3.1186306476593018, "step": 3970 }, { - "epoch": 1.01, - "learning_rate": 3.6925381518584403e-07, - "logits/chosen": -2.7207839488983154, - "logits/rejected": -2.585111141204834, - "logps/chosen": -426.0726623535156, - "logps/rejected": -304.66510009765625, - "loss": 0.0802, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.1401253938674927, - "rewards/margins": 7.029414176940918, - "rewards/rejected": -5.889288425445557, + "epoch": 0.96, + "learning_rate": 3.7818684257443393e-07, + "logits/chosen": -2.73911452293396, + "logits/rejected": -2.713984727859497, + "logps/chosen": -265.76263427734375, + "logps/rejected": -284.48126220703125, + "loss": 0.5748, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8313690423965454, + "rewards/margins": 1.8258765935897827, + "rewards/rejected": -2.657245635986328, "step": 3980 }, { - "epoch": 1.01, - "learning_rate": 3.6878569422338726e-07, - "logits/chosen": -2.681621551513672, - "logits/rejected": -2.5620551109313965, - "logps/chosen": -309.2017822265625, - "logps/rejected": -323.23931884765625, - "loss": 0.1826, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.27074792981147766, - "rewards/margins": 5.536995887756348, - "rewards/rejected": -5.807744026184082, + "epoch": 0.96, + "learning_rate": 3.777411303262614e-07, + "logits/chosen": -2.740730047225952, + "logits/rejected": -2.8384454250335693, + "logps/chosen": -395.585205078125, + "logps/rejected": -369.36834716796875, + "loss": 0.5122, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.454293966293335, + "rewards/margins": 2.1194934844970703, + "rewards/rejected": -3.573786973953247, "step": 3990 }, { - "epoch": 1.01, - "learning_rate": 3.683175732609306e-07, - "logits/chosen": -2.7098963260650635, - "logits/rejected": -2.4533934593200684, - "logps/chosen": -325.30316162109375, - "logps/rejected": -380.95135498046875, - "loss": 0.0694, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.07627948373556137, - "rewards/margins": 7.224770545959473, - "rewards/rejected": -7.148490905761719, + "epoch": 0.96, + "learning_rate": 3.772954180780888e-07, + "logits/chosen": -2.760805368423462, + "logits/rejected": -2.7401845455169678, + "logps/chosen": -210.9258270263672, + "logps/rejected": -188.4416046142578, + "loss": 0.5488, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0753872394561768, + "rewards/margins": 2.0571601390838623, + "rewards/rejected": -3.132547378540039, "step": 4000 }, { - "epoch": 1.01, - "learning_rate": 3.678494522984739e-07, - "logits/chosen": -2.7862906455993652, - "logits/rejected": -2.6643624305725098, - "logps/chosen": -218.59738159179688, - "logps/rejected": -219.8732147216797, - "loss": 0.1955, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.7133752703666687, - "rewards/margins": 5.267045021057129, - "rewards/rejected": -4.553668975830078, + "epoch": 0.96, + "eval_logits/chosen": -2.7215051651000977, + "eval_logits/rejected": -2.701664686203003, + "eval_logps/chosen": -227.50132751464844, + "eval_logps/rejected": -232.7318115234375, + "eval_loss": 0.5164242386817932, + "eval_rewards/accuracies": 0.6549999713897705, + "eval_rewards/chosen": -3.1540322303771973, + "eval_rewards/margins": 1.6798410415649414, + "eval_rewards/rejected": -4.8338727951049805, + "eval_runtime": 132.2463, + "eval_samples_per_second": 23.865, + "eval_steps_per_second": 0.378, + "step": 4000 + }, + { + "epoch": 0.97, + "learning_rate": 3.768497058299162e-07, + "logits/chosen": -2.841407299041748, + "logits/rejected": -2.8201231956481934, + "logps/chosen": -245.4801788330078, + "logps/rejected": -283.57049560546875, + "loss": 0.4582, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.151103973388672, + "rewards/margins": 1.493875503540039, + "rewards/rejected": -3.644979476928711, "step": 4010 }, { - "epoch": 1.02, - "learning_rate": 3.6738133133601724e-07, - "logits/chosen": -2.7759945392608643, - "logits/rejected": -2.7921369075775146, - "logps/chosen": -261.9883117675781, - "logps/rejected": -340.5843200683594, - "loss": 0.0495, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.1419750452041626, - "rewards/margins": 7.45736837387085, - "rewards/rejected": -6.31539249420166, + "epoch": 0.97, + "learning_rate": 3.7640399358174365e-07, + "logits/chosen": -2.8127903938293457, + "logits/rejected": -2.6954169273376465, + "logps/chosen": -344.28778076171875, + "logps/rejected": -251.55416870117188, + "loss": 0.4683, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8597686290740967, + "rewards/margins": 1.2121803760528564, + "rewards/rejected": -4.071949005126953, "step": 4020 }, { - "epoch": 1.02, - "learning_rate": 3.669132103735605e-07, - "logits/chosen": -2.646761417388916, - "logits/rejected": -2.5379443168640137, - "logps/chosen": -260.28765869140625, - "logps/rejected": -246.83193969726562, - "loss": 0.1136, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.24251198768615723, - "rewards/margins": 5.654573917388916, - "rewards/rejected": -5.897086143493652, + "epoch": 0.97, + "learning_rate": 3.7595828133357105e-07, + "logits/chosen": -2.9195544719696045, + "logits/rejected": -2.8331875801086426, + "logps/chosen": -303.34661865234375, + "logps/rejected": -246.0027618408203, + "loss": 0.4881, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6333646774291992, + "rewards/margins": 1.5879805088043213, + "rewards/rejected": -3.2213454246520996, "step": 4030 }, { - "epoch": 1.02, - "learning_rate": 3.664450894111038e-07, - "logits/chosen": -2.5841357707977295, - "logits/rejected": -2.53714919090271, - "logps/chosen": -287.8470153808594, - "logps/rejected": -314.3433837890625, - "loss": 0.1137, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.2378253936767578, - "rewards/margins": 7.138622283935547, - "rewards/rejected": -7.376448154449463, + "epoch": 0.97, + "learning_rate": 3.7551256908539845e-07, + "logits/chosen": -2.8977580070495605, + "logits/rejected": -2.824528455734253, + "logps/chosen": -208.7256317138672, + "logps/rejected": -194.3167724609375, + "loss": 0.9101, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.1041762828826904, + "rewards/margins": 1.9525047540664673, + "rewards/rejected": -4.0566816329956055, "step": 4040 }, { - "epoch": 1.02, - "learning_rate": 3.659769684486471e-07, - "logits/chosen": -2.703209161758423, - "logits/rejected": -2.571162700653076, - "logps/chosen": -247.2056427001953, - "logps/rejected": -289.3986511230469, - "loss": 0.0958, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.07882566750049591, - "rewards/margins": 6.572531700134277, - "rewards/rejected": -6.493705749511719, + "epoch": 0.97, + "learning_rate": 3.750668568372259e-07, + "logits/chosen": -2.6830484867095947, + "logits/rejected": -2.60634446144104, + "logps/chosen": -289.63812255859375, + "logps/rejected": -358.77410888671875, + "loss": 0.701, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.819648265838623, + "rewards/margins": 5.366222858428955, + "rewards/rejected": -8.185871124267578, "step": 4050 }, { - "epoch": 1.03, - "learning_rate": 3.6550884748619045e-07, - "logits/chosen": -2.5277256965637207, - "logits/rejected": -2.565796375274658, - "logps/chosen": -261.80841064453125, - "logps/rejected": -318.0531311035156, - "loss": 0.0731, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.227341890335083, - "rewards/margins": 9.081512451171875, - "rewards/rejected": -7.854170322418213, + "epoch": 0.98, + "learning_rate": 3.746211445890533e-07, + "logits/chosen": -2.8870174884796143, + "logits/rejected": -2.8342325687408447, + "logps/chosen": -241.5248260498047, + "logps/rejected": -235.63180541992188, + "loss": 0.5309, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8523916006088257, + "rewards/margins": 1.8193457126617432, + "rewards/rejected": -3.6717376708984375, "step": 4060 }, { - "epoch": 1.03, - "learning_rate": 3.650407265237337e-07, - "logits/chosen": -2.5151851177215576, - "logits/rejected": -2.4258155822753906, - "logps/chosen": -224.4687042236328, - "logps/rejected": -311.28936767578125, - "loss": 0.1217, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.26925137639045715, - "rewards/margins": 4.458642959594727, - "rewards/rejected": -4.7278947830200195, + "epoch": 0.98, + "learning_rate": 3.741754323408807e-07, + "logits/chosen": -2.851644277572632, + "logits/rejected": -2.7207632064819336, + "logps/chosen": -347.4605712890625, + "logps/rejected": -293.8634948730469, + "loss": 0.7056, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.732881546020508, + "rewards/margins": 0.8935607075691223, + "rewards/rejected": -3.6264424324035645, "step": 4070 }, { - "epoch": 1.03, - "learning_rate": 3.6457260556127703e-07, - "logits/chosen": -2.613365650177002, - "logits/rejected": -2.623168468475342, - "logps/chosen": -212.8934783935547, - "logps/rejected": -294.2333984375, - "loss": 0.1228, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.08440116047859192, - "rewards/margins": 8.041141510009766, - "rewards/rejected": -8.125543594360352, + "epoch": 0.98, + "learning_rate": 3.7372972009270817e-07, + "logits/chosen": -2.725109815597534, + "logits/rejected": -2.646479845046997, + "logps/chosen": -223.190673828125, + "logps/rejected": -280.4437255859375, + "loss": 0.6356, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.13977313041687, + "rewards/margins": 1.8631305694580078, + "rewards/rejected": -4.002903938293457, "step": 4080 }, { - "epoch": 1.03, - "learning_rate": 3.641044845988203e-07, - "logits/chosen": -2.67268443107605, - "logits/rejected": -2.607306480407715, - "logps/chosen": -221.0255889892578, - "logps/rejected": -288.19696044921875, - "loss": 0.074, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5305021405220032, - "rewards/margins": 7.2046942710876465, - "rewards/rejected": -6.674191474914551, + "epoch": 0.98, + "learning_rate": 3.7328400784453557e-07, + "logits/chosen": -2.9094722270965576, + "logits/rejected": -2.908536195755005, + "logps/chosen": -293.22491455078125, + "logps/rejected": -266.91644287109375, + "loss": 0.6004, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1433205604553223, + "rewards/margins": 1.847665786743164, + "rewards/rejected": -3.9909870624542236, "step": 4090 }, { - "epoch": 1.04, - "learning_rate": 3.636363636363636e-07, - "logits/chosen": -2.359762668609619, - "logits/rejected": -2.412950038909912, - "logps/chosen": -240.56201171875, - "logps/rejected": -267.10150146484375, - "loss": 0.0971, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.5591071844100952, - "rewards/margins": 5.897465705871582, - "rewards/rejected": -6.456572532653809, + "epoch": 0.99, + "learning_rate": 3.72838295596363e-07, + "logits/chosen": -2.8822057247161865, + "logits/rejected": -2.8010752201080322, + "logps/chosen": -330.4580383300781, + "logps/rejected": -267.7075500488281, + "loss": 0.6802, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.019509792327881, + "rewards/margins": 0.9538524746894836, + "rewards/rejected": -2.9733619689941406, "step": 4100 }, { - "epoch": 1.04, - "learning_rate": 3.6316824267390695e-07, - "logits/chosen": -2.675454616546631, - "logits/rejected": -2.515897274017334, - "logps/chosen": -310.7024841308594, - "logps/rejected": -344.6875305175781, - "loss": 0.0756, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.8981047868728638, - "rewards/margins": 9.891563415527344, - "rewards/rejected": -8.99345874786377, + "epoch": 0.99, + "eval_logits/chosen": -2.6250314712524414, + "eval_logits/rejected": -2.6009910106658936, + "eval_logps/chosen": -222.02069091796875, + "eval_logps/rejected": -227.3087158203125, + "eval_loss": 0.5134173631668091, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -2.6059675216674805, + "eval_rewards/margins": 1.685595989227295, + "eval_rewards/rejected": -4.291563510894775, + "eval_runtime": 132.2851, + "eval_samples_per_second": 23.858, + "eval_steps_per_second": 0.378, + "step": 4100 + }, + { + "epoch": 0.99, + "learning_rate": 3.723925833481904e-07, + "logits/chosen": -2.866441249847412, + "logits/rejected": -2.6965231895446777, + "logps/chosen": -416.38494873046875, + "logps/rejected": -239.02639770507812, + "loss": 0.5024, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0345304012298584, + "rewards/margins": 0.8612769246101379, + "rewards/rejected": -2.8958075046539307, "step": 4110 }, { - "epoch": 1.04, - "learning_rate": 3.627001217114502e-07, - "logits/chosen": -2.698408603668213, - "logits/rejected": -2.574500322341919, - "logps/chosen": -219.25167846679688, - "logps/rejected": -214.8889923095703, - "loss": 0.1992, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.4875507354736328, - "rewards/margins": 3.8590054512023926, - "rewards/rejected": -4.346555709838867, + "epoch": 0.99, + "learning_rate": 3.7194687110001783e-07, + "logits/chosen": -2.335099935531616, + "logits/rejected": -2.4469590187072754, + "logps/chosen": -237.5718994140625, + "logps/rejected": -233.95883178710938, + "loss": 0.5539, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.372395396232605, + "rewards/margins": 1.9789094924926758, + "rewards/rejected": -3.3513050079345703, "step": 4120 }, { - "epoch": 1.04, - "learning_rate": 3.622320007489935e-07, - "logits/chosen": -2.582207202911377, - "logits/rejected": -2.5340628623962402, - "logps/chosen": -264.3891296386719, - "logps/rejected": -235.1367645263672, - "loss": 0.1439, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6553934812545776, - "rewards/margins": 6.19410514831543, - "rewards/rejected": -5.538712024688721, + "epoch": 0.99, + "learning_rate": 3.7150115885184524e-07, + "logits/chosen": -2.873213768005371, + "logits/rejected": -2.6832902431488037, + "logps/chosen": -219.3606414794922, + "logps/rejected": -214.99319458007812, + "loss": 0.4685, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9941431283950806, + "rewards/margins": 1.4484153985977173, + "rewards/rejected": -3.442558765411377, "step": 4130 }, { - "epoch": 1.05, - "learning_rate": 3.617638797865368e-07, - "logits/chosen": -2.4984068870544434, - "logits/rejected": -2.4741299152374268, - "logps/chosen": -209.7914581298828, - "logps/rejected": -269.10748291015625, - "loss": 0.0869, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.32389122247695923, - "rewards/margins": 6.044379234313965, - "rewards/rejected": -6.368269920349121, + "epoch": 1.0, + "learning_rate": 3.7105544660367264e-07, + "logits/chosen": -2.7354140281677246, + "logits/rejected": -2.695600986480713, + "logps/chosen": -261.8937683105469, + "logps/rejected": -201.8359375, + "loss": 0.5759, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.0913310050964355, + "rewards/margins": 1.272998571395874, + "rewards/rejected": -3.3643295764923096, "step": 4140 }, { - "epoch": 1.05, - "learning_rate": 3.6129575882408016e-07, - "logits/chosen": -2.6042675971984863, - "logits/rejected": -2.5595932006835938, - "logps/chosen": -239.3861846923828, - "logps/rejected": -258.2563171386719, - "loss": 0.0979, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.48330157995224, - "rewards/margins": 5.9926300048828125, - "rewards/rejected": -5.5093278884887695, + "epoch": 1.0, + "learning_rate": 3.706097343555001e-07, + "logits/chosen": -2.8658010959625244, + "logits/rejected": -2.8177363872528076, + "logps/chosen": -286.46673583984375, + "logps/rejected": -293.17718505859375, + "loss": 0.6371, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.4398696422576904, + "rewards/margins": 0.5984171628952026, + "rewards/rejected": -3.0382871627807617, "step": 4150 }, { - "epoch": 1.05, - "learning_rate": 3.608276378616234e-07, - "logits/chosen": -2.443714141845703, - "logits/rejected": -2.2847142219543457, - "logps/chosen": -271.3092346191406, - "logps/rejected": -215.4811248779297, - "loss": 0.0999, + "epoch": 1.0, + "learning_rate": 3.701640221073275e-07, + "logits/chosen": -2.712001323699951, + "logits/rejected": -2.647970199584961, + "logps/chosen": -355.46502685546875, + "logps/rejected": -275.7532653808594, + "loss": 0.4768, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.6948297619819641, - "rewards/margins": 5.3410773277282715, - "rewards/rejected": -6.035906791687012, + "rewards/chosen": -0.8509339094161987, + "rewards/margins": 4.578801155090332, + "rewards/rejected": -5.42973518371582, "step": 4160 }, { - "epoch": 1.05, - "learning_rate": 3.6035951689916674e-07, - "logits/chosen": -2.599687099456787, - "logits/rejected": -2.506901979446411, - "logps/chosen": -190.1280059814453, - "logps/rejected": -262.118896484375, - "loss": 0.1492, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.496035099029541, - "rewards/margins": 5.447064399719238, - "rewards/rejected": -4.9510297775268555, + "epoch": 1.0, + "learning_rate": 3.697183098591549e-07, + "logits/chosen": -2.848571300506592, + "logits/rejected": -2.823004722595215, + "logps/chosen": -285.56182861328125, + "logps/rejected": -354.7408142089844, + "loss": 0.2267, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.257232129573822, + "rewards/margins": 6.21987771987915, + "rewards/rejected": -5.962646484375, "step": 4170 }, { - "epoch": 1.06, - "learning_rate": 3.5989139593671e-07, - "logits/chosen": -2.5147697925567627, - "logits/rejected": -2.5242176055908203, - "logps/chosen": -240.4428253173828, - "logps/rejected": -321.82061767578125, - "loss": 0.1165, + "epoch": 1.01, + "learning_rate": 3.6927259761098236e-07, + "logits/chosen": -2.6139652729034424, + "logits/rejected": -2.682786226272583, + "logps/chosen": -244.648681640625, + "logps/rejected": -358.69940185546875, + "loss": 0.089, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9775978922843933, - "rewards/margins": 6.624411106109619, - "rewards/rejected": -5.646812915802002, + "rewards/chosen": 0.4484066069126129, + "rewards/margins": 7.467729091644287, + "rewards/rejected": -7.019321441650391, "step": 4180 }, { - "epoch": 1.06, - "learning_rate": 3.5942327497425337e-07, - "logits/chosen": -2.503431797027588, - "logits/rejected": -2.459929943084717, - "logps/chosen": -207.2917022705078, - "logps/rejected": -317.4582824707031, - "loss": 0.0999, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.0375446118414402, - "rewards/margins": 6.4059157371521, - "rewards/rejected": -6.4434614181518555, + "epoch": 1.01, + "learning_rate": 3.6882688536280976e-07, + "logits/chosen": -2.6833653450012207, + "logits/rejected": -2.7394802570343018, + "logps/chosen": -219.50576782226562, + "logps/rejected": -286.9923400878906, + "loss": 0.1015, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.399767279624939, + "rewards/margins": 4.965181827545166, + "rewards/rejected": -6.3649492263793945, "step": 4190 }, { - "epoch": 1.06, - "learning_rate": 3.5895515401179666e-07, - "logits/chosen": -2.4857592582702637, - "logits/rejected": -2.416965961456299, - "logps/chosen": -263.7533264160156, - "logps/rejected": -320.15362548828125, - "loss": 0.1649, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.4429326057434082, - "rewards/margins": 7.078021049499512, - "rewards/rejected": -5.6350884437561035, + "epoch": 1.01, + "learning_rate": 3.6838117311463716e-07, + "logits/chosen": -2.60537052154541, + "logits/rejected": -2.5601916313171387, + "logps/chosen": -249.0813751220703, + "logps/rejected": -311.14178466796875, + "loss": 0.0976, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.0010917186737060547, + "rewards/margins": 5.988485813140869, + "rewards/rejected": -5.9895782470703125, "step": 4200 }, { - "epoch": 1.06, - "learning_rate": 3.584870330493399e-07, - "logits/chosen": -2.7715916633605957, - "logits/rejected": -2.681565999984741, - "logps/chosen": -331.1893615722656, - "logps/rejected": -335.10223388671875, - "loss": 0.0935, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.20631642639636993, - "rewards/margins": 6.091833591461182, - "rewards/rejected": -5.885517120361328, - "step": 4210 + "epoch": 1.01, + "eval_logits/chosen": -2.5027899742126465, + "eval_logits/rejected": -2.4720711708068848, + "eval_logps/chosen": -226.84625244140625, + "eval_logps/rejected": -234.8874053955078, + "eval_loss": 0.50312340259552, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -3.0885236263275146, + "eval_rewards/margins": 1.9609071016311646, + "eval_rewards/rejected": -5.049430847167969, + "eval_runtime": 132.3445, + "eval_samples_per_second": 23.847, + "eval_steps_per_second": 0.378, + "step": 4200 }, { - "epoch": 1.07, - "learning_rate": 3.5801891208688324e-07, - "logits/chosen": -2.6815414428710938, - "logits/rejected": -2.6054797172546387, - "logps/chosen": -200.813720703125, - "logps/rejected": -217.6241912841797, - "loss": 0.1042, + "epoch": 1.01, + "learning_rate": 3.679354608664646e-07, + "logits/chosen": -2.82269549369812, + "logits/rejected": -2.7591030597686768, + "logps/chosen": -249.41879272460938, + "logps/rejected": -317.8497314453125, + "loss": 0.0883, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02186262607574463, + "rewards/margins": 6.281726360321045, + "rewards/rejected": -6.303589344024658, + "step": 4210 + }, + { + "epoch": 1.02, + "learning_rate": 3.67489748618292e-07, + "logits/chosen": -2.7081894874572754, + "logits/rejected": -2.5566070079803467, + "logps/chosen": -197.99549865722656, + "logps/rejected": -246.48654174804688, + "loss": 0.0971, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.9837878346443176, - "rewards/margins": 4.837677001953125, - "rewards/rejected": -5.821465015411377, + "rewards/chosen": -1.6126645803451538, + "rewards/margins": 3.921424388885498, + "rewards/rejected": -5.534089088439941, "step": 4220 }, { - "epoch": 1.07, - "learning_rate": 3.575507911244265e-07, - "logits/chosen": -2.59967303276062, - "logits/rejected": -2.607329845428467, - "logps/chosen": -241.1183624267578, - "logps/rejected": -331.64117431640625, - "loss": 0.1341, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5841931104660034, - "rewards/margins": 7.46688985824585, - "rewards/rejected": -6.882697105407715, + "epoch": 1.02, + "learning_rate": 3.670440363701194e-07, + "logits/chosen": -2.75780987739563, + "logits/rejected": -2.6960082054138184, + "logps/chosen": -276.0903625488281, + "logps/rejected": -318.72039794921875, + "loss": 0.1228, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9815553426742554, + "rewards/margins": 7.108693599700928, + "rewards/rejected": -6.127139091491699, "step": 4230 }, { - "epoch": 1.07, - "learning_rate": 3.5708267016196987e-07, - "logits/chosen": -2.7304348945617676, - "logits/rejected": -2.553729772567749, - "logps/chosen": -252.4587860107422, - "logps/rejected": -267.918212890625, - "loss": 0.0772, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.876346230506897, - "rewards/margins": 5.990228652954102, - "rewards/rejected": -5.113882064819336, + "epoch": 1.02, + "learning_rate": 3.665983241219469e-07, + "logits/chosen": -2.7243752479553223, + "logits/rejected": -2.605213165283203, + "logps/chosen": -238.5880889892578, + "logps/rejected": -259.71148681640625, + "loss": 0.1017, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.3822143077850342, + "rewards/margins": 6.5152082443237305, + "rewards/rejected": -6.897422790527344, "step": 4240 }, { - "epoch": 1.07, - "learning_rate": 3.566145491995131e-07, - "logits/chosen": -2.666968584060669, - "logits/rejected": -2.579244613647461, - "logps/chosen": -280.7061462402344, - "logps/rejected": -300.51776123046875, - "loss": 0.1279, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.13451166450977325, - "rewards/margins": 7.642842769622803, - "rewards/rejected": -7.7773542404174805, + "epoch": 1.02, + "learning_rate": 3.661526118737743e-07, + "logits/chosen": -2.5577034950256348, + "logits/rejected": -2.5197629928588867, + "logps/chosen": -367.8518371582031, + "logps/rejected": -372.84381103515625, + "loss": 0.0864, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5348130464553833, + "rewards/margins": 5.824860572814941, + "rewards/rejected": -6.359673500061035, "step": 4250 }, { - "epoch": 1.08, - "learning_rate": 3.5614642823705645e-07, - "logits/chosen": -2.742469310760498, - "logits/rejected": -2.7256650924682617, - "logps/chosen": -232.1375732421875, - "logps/rejected": -248.3732147216797, - "loss": 0.1574, + "epoch": 1.03, + "learning_rate": 3.657068996256017e-07, + "logits/chosen": -2.458911657333374, + "logits/rejected": -2.414825916290283, + "logps/chosen": -251.5448760986328, + "logps/rejected": -328.6988830566406, + "loss": 0.109, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.0827386379241943, - "rewards/margins": 4.552823543548584, - "rewards/rejected": -3.4700851440429688, + "rewards/chosen": 0.5778032541275024, + "rewards/margins": 6.741812229156494, + "rewards/rejected": -6.164009094238281, "step": 4260 }, { - "epoch": 1.08, - "learning_rate": 3.5567830727459974e-07, - "logits/chosen": -2.4802334308624268, - "logits/rejected": -2.3722081184387207, - "logps/chosen": -245.95205688476562, - "logps/rejected": -424.5274353027344, - "loss": 0.0612, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.5879344940185547, - "rewards/margins": 8.373659133911133, - "rewards/rejected": -6.7857255935668945, + "epoch": 1.03, + "learning_rate": 3.6526118737742914e-07, + "logits/chosen": -2.7192983627319336, + "logits/rejected": -2.6738905906677246, + "logps/chosen": -226.1861114501953, + "logps/rejected": -295.11871337890625, + "loss": 0.0853, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4454830884933472, + "rewards/margins": 6.335725784301758, + "rewards/rejected": -4.890242576599121, "step": 4270 }, { - "epoch": 1.08, - "learning_rate": 3.552101863121431e-07, - "logits/chosen": -2.5735154151916504, - "logits/rejected": -2.550457715988159, - "logps/chosen": -264.8874206542969, - "logps/rejected": -269.48980712890625, - "loss": 0.1089, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.5131327509880066, - "rewards/margins": 7.48690128326416, - "rewards/rejected": -8.000033378601074, + "epoch": 1.03, + "learning_rate": 3.6481547512925654e-07, + "logits/chosen": -2.3934969902038574, + "logits/rejected": -2.478426218032837, + "logps/chosen": -177.85971069335938, + "logps/rejected": -256.49920654296875, + "loss": 0.104, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3021724820137024, + "rewards/margins": 7.044008731842041, + "rewards/rejected": -6.741835594177246, "step": 4280 }, { - "epoch": 1.08, - "learning_rate": 3.5474206534968637e-07, - "logits/chosen": -2.6528215408325195, - "logits/rejected": -2.4966320991516113, - "logps/chosen": -270.1402587890625, - "logps/rejected": -250.8711395263672, - "loss": 0.0816, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.6331374645233154, - "rewards/margins": 7.091406345367432, - "rewards/rejected": -6.458268642425537, + "epoch": 1.03, + "learning_rate": 3.6436976288108395e-07, + "logits/chosen": -2.4851231575012207, + "logits/rejected": -2.5054361820220947, + "logps/chosen": -285.0626525878906, + "logps/rejected": -376.2096252441406, + "loss": 0.1249, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.3241591155529022, + "rewards/margins": 7.1120429039001465, + "rewards/rejected": -6.787884712219238, "step": 4290 }, { - "epoch": 1.09, - "learning_rate": 3.542739443872296e-07, - "logits/chosen": -2.5830702781677246, - "logits/rejected": -2.5414295196533203, - "logps/chosen": -259.9555358886719, - "logps/rejected": -311.7438049316406, - "loss": 0.0647, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.13916687667369843, - "rewards/margins": 5.850489616394043, - "rewards/rejected": -5.9896559715271, + "epoch": 1.03, + "learning_rate": 3.6392405063291135e-07, + "logits/chosen": -2.8106791973114014, + "logits/rejected": -2.7396514415740967, + "logps/chosen": -294.4912109375, + "logps/rejected": -335.8979187011719, + "loss": 0.0839, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8070405125617981, + "rewards/margins": 6.383404731750488, + "rewards/rejected": -5.576363563537598, "step": 4300 }, { - "epoch": 1.09, - "learning_rate": 3.5380582342477295e-07, - "logits/chosen": -2.585474729537964, - "logits/rejected": -2.639191150665283, - "logps/chosen": -239.0613555908203, - "logps/rejected": -289.3290100097656, - "loss": 0.1227, + "epoch": 1.03, + "eval_logits/chosen": -2.4238035678863525, + "eval_logits/rejected": -2.3885602951049805, + "eval_logps/chosen": -229.43016052246094, + "eval_logps/rejected": -238.7591552734375, + "eval_loss": 0.5027004480361938, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -3.346914052963257, + "eval_rewards/margins": 2.089693546295166, + "eval_rewards/rejected": -5.436607837677002, + "eval_runtime": 132.236, + "eval_samples_per_second": 23.866, + "eval_steps_per_second": 0.378, + "step": 4300 + }, + { + "epoch": 1.04, + "learning_rate": 3.634783383847388e-07, + "logits/chosen": -2.725675582885742, + "logits/rejected": -2.5204176902770996, + "logps/chosen": -350.4329833984375, + "logps/rejected": -281.46435546875, + "loss": 0.0871, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.19983701407909393, - "rewards/margins": 6.089356422424316, - "rewards/rejected": -6.289193153381348, + "rewards/chosen": 0.29073232412338257, + "rewards/margins": 6.241979122161865, + "rewards/rejected": -5.951246738433838, "step": 4310 }, { - "epoch": 1.09, - "learning_rate": 3.5333770246231624e-07, - "logits/chosen": -2.4982070922851562, - "logits/rejected": -2.427825927734375, - "logps/chosen": -320.6855163574219, - "logps/rejected": -309.69256591796875, - "loss": 0.1027, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9687162637710571, - "rewards/margins": 8.043795585632324, - "rewards/rejected": -9.012511253356934, + "epoch": 1.04, + "learning_rate": 3.630326261365662e-07, + "logits/chosen": -2.788583278656006, + "logits/rejected": -2.531651496887207, + "logps/chosen": -220.9932861328125, + "logps/rejected": -214.7703857421875, + "loss": 0.0512, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7996373176574707, + "rewards/margins": 4.645855903625488, + "rewards/rejected": -5.445493221282959, "step": 4320 }, { - "epoch": 1.09, - "learning_rate": 3.528695814998596e-07, - "logits/chosen": -2.4943461418151855, - "logits/rejected": -2.4896931648254395, - "logps/chosen": -238.29458618164062, - "logps/rejected": -280.5413513183594, - "loss": 0.0592, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.0275102853775024, - "rewards/margins": 6.780093193054199, - "rewards/rejected": -7.807603359222412, + "epoch": 1.04, + "learning_rate": 3.625869138883936e-07, + "logits/chosen": -2.696899890899658, + "logits/rejected": -2.736288070678711, + "logps/chosen": -220.5087432861328, + "logps/rejected": -265.0604248046875, + "loss": 0.1062, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.37364688515663147, + "rewards/margins": 7.606331825256348, + "rewards/rejected": -7.232684135437012, "step": 4330 }, { - "epoch": 1.1, - "learning_rate": 3.524014605374028e-07, - "logits/chosen": -2.7054009437561035, - "logits/rejected": -2.6643614768981934, - "logps/chosen": -212.01318359375, - "logps/rejected": -324.78717041015625, - "loss": 0.0652, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3500351309776306, - "rewards/margins": 9.12161922454834, - "rewards/rejected": -9.471654891967773, + "epoch": 1.04, + "learning_rate": 3.6214120164022107e-07, + "logits/chosen": -2.6355111598968506, + "logits/rejected": -2.637110471725464, + "logps/chosen": -175.37379455566406, + "logps/rejected": -273.8680419921875, + "loss": 0.0693, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4862394332885742, + "rewards/margins": 5.740817070007324, + "rewards/rejected": -6.22705602645874, "step": 4340 }, { - "epoch": 1.1, - "learning_rate": 3.5193333957494616e-07, - "logits/chosen": -2.4785401821136475, - "logits/rejected": -2.505174160003662, - "logps/chosen": -207.61343383789062, - "logps/rejected": -348.1514587402344, - "loss": 0.0704, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.1490641087293625, - "rewards/margins": 8.631427764892578, - "rewards/rejected": -8.482362747192383, + "epoch": 1.05, + "learning_rate": 3.6169548939204847e-07, + "logits/chosen": -2.371464252471924, + "logits/rejected": -2.4779508113861084, + "logps/chosen": -193.6864013671875, + "logps/rejected": -251.49853515625, + "loss": 0.0594, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8208333253860474, + "rewards/margins": 4.712620258331299, + "rewards/rejected": -5.533453941345215, "step": 4350 }, { - "epoch": 1.1, - "learning_rate": 3.5146521861248945e-07, - "logits/chosen": -2.54978609085083, - "logits/rejected": -2.507051944732666, - "logps/chosen": -285.1393127441406, - "logps/rejected": -300.57354736328125, - "loss": 0.0967, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.9839023351669312, - "rewards/margins": 8.747843742370605, - "rewards/rejected": -6.763941764831543, + "epoch": 1.05, + "learning_rate": 3.6124977714387587e-07, + "logits/chosen": -2.450424909591675, + "logits/rejected": -2.4948172569274902, + "logps/chosen": -225.9076385498047, + "logps/rejected": -272.49700927734375, + "loss": 0.947, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.38960337638855, + "rewards/margins": 5.2783355712890625, + "rewards/rejected": -7.667939186096191, "step": 4360 }, { - "epoch": 1.1, - "learning_rate": 3.509970976500328e-07, - "logits/chosen": -2.670477867126465, - "logits/rejected": -2.6304116249084473, - "logps/chosen": -283.92144775390625, - "logps/rejected": -384.2460632324219, - "loss": 0.0937, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.8794275522232056, - "rewards/margins": 11.181547164916992, - "rewards/rejected": -9.302119255065918, + "epoch": 1.05, + "learning_rate": 3.6080406489570333e-07, + "logits/chosen": -2.490787982940674, + "logits/rejected": -2.535414934158325, + "logps/chosen": -242.22421264648438, + "logps/rejected": -310.5423278808594, + "loss": 0.0636, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5611072778701782, + "rewards/margins": 6.9396257400512695, + "rewards/rejected": -8.500733375549316, "step": 4370 }, { - "epoch": 1.11, - "learning_rate": 3.50528976687576e-07, - "logits/chosen": -2.586055040359497, - "logits/rejected": -2.5961456298828125, - "logps/chosen": -241.93124389648438, - "logps/rejected": -340.2915954589844, - "loss": 0.113, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.8442662954330444, - "rewards/margins": 8.11192512512207, - "rewards/rejected": -6.26765775680542, + "epoch": 1.05, + "learning_rate": 3.6035835264753073e-07, + "logits/chosen": -2.5034115314483643, + "logits/rejected": -2.4764208793640137, + "logps/chosen": -201.92776489257812, + "logps/rejected": -281.9005432128906, + "loss": 0.1695, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.8855290412902832, + "rewards/margins": 8.374197006225586, + "rewards/rejected": -9.259726524353027, "step": 4380 }, { - "epoch": 1.11, - "learning_rate": 3.5006085572511937e-07, - "logits/chosen": -2.651855945587158, - "logits/rejected": -2.473231792449951, - "logps/chosen": -312.76507568359375, - "logps/rejected": -317.0950927734375, - "loss": 0.0473, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.09365153312683105, - "rewards/margins": 7.239400386810303, - "rewards/rejected": -7.145749092102051, + "epoch": 1.06, + "learning_rate": 3.5991264039935813e-07, + "logits/chosen": -2.5301244258880615, + "logits/rejected": -2.4678752422332764, + "logps/chosen": -194.5750732421875, + "logps/rejected": -197.36917114257812, + "loss": 0.0873, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5885003805160522, + "rewards/margins": 6.027111053466797, + "rewards/rejected": -6.6156110763549805, "step": 4390 }, { - "epoch": 1.11, - "learning_rate": 3.4959273476266266e-07, - "logits/chosen": -2.7039191722869873, - "logits/rejected": -2.6823935508728027, - "logps/chosen": -242.7147979736328, - "logps/rejected": -268.5030822753906, - "loss": 0.0781, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9241912961006165, - "rewards/margins": 4.920010089874268, - "rewards/rejected": -3.995819091796875, + "epoch": 1.06, + "learning_rate": 3.594669281511856e-07, + "logits/chosen": -2.5054192543029785, + "logits/rejected": -2.404689311981201, + "logps/chosen": -265.75518798828125, + "logps/rejected": -212.1231689453125, + "loss": 0.0788, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0553008317947388, + "rewards/margins": 6.590175628662109, + "rewards/rejected": -7.645476341247559, "step": 4400 }, { - "epoch": 1.11, - "learning_rate": 3.4912461380020595e-07, - "logits/chosen": -2.694061517715454, - "logits/rejected": -2.5867414474487305, - "logps/chosen": -329.20806884765625, - "logps/rejected": -339.77227783203125, - "loss": 0.0937, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.02051043510437, - "rewards/margins": 10.045517921447754, - "rewards/rejected": -8.025007247924805, + "epoch": 1.06, + "eval_logits/chosen": -2.227496862411499, + "eval_logits/rejected": -2.180467128753662, + "eval_logps/chosen": -240.2678680419922, + "eval_logps/rejected": -252.96142578125, + "eval_loss": 0.5398357510566711, + "eval_rewards/accuracies": 0.6775000095367432, + "eval_rewards/chosen": -4.430687427520752, + "eval_rewards/margins": 2.4261465072631836, + "eval_rewards/rejected": -6.8568339347839355, + "eval_runtime": 133.4901, + "eval_samples_per_second": 23.642, + "eval_steps_per_second": 0.375, + "step": 4400 + }, + { + "epoch": 1.06, + "learning_rate": 3.59021215903013e-07, + "logits/chosen": -2.3675990104675293, + "logits/rejected": -2.423901319503784, + "logps/chosen": -203.9844970703125, + "logps/rejected": -278.0522155761719, + "loss": 0.0829, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0473463535308838, + "rewards/margins": 6.782293796539307, + "rewards/rejected": -7.829640865325928, "step": 4410 }, { - "epoch": 1.12, - "learning_rate": 3.486564928377493e-07, - "logits/chosen": -2.6415634155273438, - "logits/rejected": -2.554490566253662, - "logps/chosen": -249.7362060546875, - "logps/rejected": -265.8996887207031, - "loss": 0.0812, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.46163004636764526, - "rewards/margins": 5.617148399353027, - "rewards/rejected": -6.0787787437438965, + "epoch": 1.06, + "learning_rate": 3.585755036548404e-07, + "logits/chosen": -2.5545144081115723, + "logits/rejected": -2.4003384113311768, + "logps/chosen": -296.57135009765625, + "logps/rejected": -309.2122497558594, + "loss": 0.1516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2418675422668457, + "rewards/margins": 7.7992095947265625, + "rewards/rejected": -8.041077613830566, "step": 4420 }, { - "epoch": 1.12, - "learning_rate": 3.481883718752925e-07, - "logits/chosen": -2.595177173614502, - "logits/rejected": -2.6248831748962402, - "logps/chosen": -311.8207092285156, - "logps/rejected": -321.56243896484375, - "loss": 0.1348, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5490500330924988, - "rewards/margins": 6.183700084686279, - "rewards/rejected": -6.732749938964844, + "epoch": 1.07, + "learning_rate": 3.5812979140666785e-07, + "logits/chosen": -2.5682432651519775, + "logits/rejected": -2.451465368270874, + "logps/chosen": -261.8687744140625, + "logps/rejected": -332.8811340332031, + "loss": 0.0561, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09469322860240936, + "rewards/margins": 8.882295608520508, + "rewards/rejected": -8.787601470947266, "step": 4430 }, { - "epoch": 1.12, - "learning_rate": 3.4772025091283587e-07, - "logits/chosen": -2.6946587562561035, - "logits/rejected": -2.681180477142334, - "logps/chosen": -285.69488525390625, - "logps/rejected": -354.4988708496094, - "loss": 0.1563, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.4733982980251312, - "rewards/margins": 7.104119777679443, - "rewards/rejected": -7.577518463134766, + "epoch": 1.07, + "learning_rate": 3.5768407915849525e-07, + "logits/chosen": -2.4848480224609375, + "logits/rejected": -2.5554747581481934, + "logps/chosen": -217.7174835205078, + "logps/rejected": -301.87945556640625, + "loss": 0.2685, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3529406785964966, + "rewards/margins": 7.304051876068115, + "rewards/rejected": -8.656991958618164, "step": 4440 }, { - "epoch": 1.12, - "learning_rate": 3.4725212995037916e-07, - "logits/chosen": -2.6871225833892822, - "logits/rejected": -2.4688057899475098, - "logps/chosen": -252.8527069091797, - "logps/rejected": -235.73483276367188, - "loss": 0.0967, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8882217407226562, - "rewards/margins": 5.205448627471924, - "rewards/rejected": -6.093670845031738, + "epoch": 1.07, + "learning_rate": 3.5723836691032266e-07, + "logits/chosen": -2.6528122425079346, + "logits/rejected": -2.47171950340271, + "logps/chosen": -277.3251037597656, + "logps/rejected": -296.85015869140625, + "loss": 0.0985, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4608760476112366, + "rewards/margins": 7.703942775726318, + "rewards/rejected": -7.243067264556885, "step": 4450 }, { - "epoch": 1.13, - "learning_rate": 3.467840089879225e-07, - "logits/chosen": -2.4395699501037598, - "logits/rejected": -2.5508391857147217, - "logps/chosen": -266.2554931640625, - "logps/rejected": -329.2310485839844, - "loss": 0.1904, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.0424487590789795, - "rewards/margins": 9.269670486450195, - "rewards/rejected": -8.227221488952637, + "epoch": 1.07, + "learning_rate": 3.5679265466215006e-07, + "logits/chosen": -2.566707134246826, + "logits/rejected": -2.545395612716675, + "logps/chosen": -282.0556640625, + "logps/rejected": -323.8487548828125, + "loss": 0.0594, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25296300649642944, + "rewards/margins": 7.253639221191406, + "rewards/rejected": -7.5066022872924805, "step": 4460 }, { - "epoch": 1.13, - "learning_rate": 3.4631588802546573e-07, - "logits/chosen": -2.6959567070007324, - "logits/rejected": -2.6026034355163574, - "logps/chosen": -318.25616455078125, - "logps/rejected": -392.3797302246094, - "loss": 0.1572, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.7935773134231567, - "rewards/margins": 7.944984436035156, - "rewards/rejected": -7.151407718658447, + "epoch": 1.08, + "learning_rate": 3.563469424139775e-07, + "logits/chosen": -2.6020054817199707, + "logits/rejected": -2.408296585083008, + "logps/chosen": -274.94879150390625, + "logps/rejected": -287.31719970703125, + "loss": 0.1576, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1979066133499146, + "rewards/margins": 7.276200294494629, + "rewards/rejected": -8.47410774230957, "step": 4470 }, { - "epoch": 1.13, - "learning_rate": 3.458477670630091e-07, - "logits/chosen": -2.707530975341797, - "logits/rejected": -2.6383745670318604, - "logps/chosen": -271.60198974609375, - "logps/rejected": -291.4852600097656, - "loss": 0.078, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.16344894468784332, - "rewards/margins": 5.676591873168945, - "rewards/rejected": -5.84004020690918, + "epoch": 1.08, + "learning_rate": 3.559012301658049e-07, + "logits/chosen": -2.335019826889038, + "logits/rejected": -2.2550578117370605, + "logps/chosen": -303.6737365722656, + "logps/rejected": -381.5570983886719, + "loss": 0.2015, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8513988256454468, + "rewards/margins": 7.191327095031738, + "rewards/rejected": -9.042726516723633, "step": 4480 }, { - "epoch": 1.13, - "learning_rate": 3.4537964610055237e-07, - "logits/chosen": -2.642083168029785, - "logits/rejected": -2.554753541946411, - "logps/chosen": -357.892578125, - "logps/rejected": -368.09686279296875, - "loss": 0.1444, + "epoch": 1.08, + "learning_rate": 3.554555179176323e-07, + "logits/chosen": -2.4753079414367676, + "logits/rejected": -2.3464515209198, + "logps/chosen": -256.3263244628906, + "logps/rejected": -271.3084716796875, + "loss": 0.1384, "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 1.3861753940582275, - "rewards/margins": 9.830830574035645, - "rewards/rejected": -8.44465446472168, + "rewards/chosen": 0.5284283757209778, + "rewards/margins": 8.214263916015625, + "rewards/rejected": -7.685835361480713, "step": 4490 }, { - "epoch": 1.14, - "learning_rate": 3.449115251380957e-07, - "logits/chosen": -2.560091972351074, - "logits/rejected": -2.6622557640075684, - "logps/chosen": -228.8826141357422, - "logps/rejected": -300.7447204589844, - "loss": 0.0706, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.33011412620544434, - "rewards/margins": 8.04318904876709, - "rewards/rejected": -8.37330436706543, + "epoch": 1.08, + "learning_rate": 3.550098056694598e-07, + "logits/chosen": -2.5263915061950684, + "logits/rejected": -2.612776279449463, + "logps/chosen": -216.1134033203125, + "logps/rejected": -360.3942565917969, + "loss": 0.0701, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3528536260128021, + "rewards/margins": 8.515606880187988, + "rewards/rejected": -8.868459701538086, "step": 4500 }, { - "epoch": 1.14, - "learning_rate": 3.44443404175639e-07, - "logits/chosen": -2.764373302459717, - "logits/rejected": -2.7491188049316406, - "logps/chosen": -359.4165954589844, - "logps/rejected": -319.29095458984375, - "loss": 0.1212, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6747967600822449, - "rewards/margins": 6.897503852844238, - "rewards/rejected": -7.572301387786865, + "epoch": 1.08, + "eval_logits/chosen": -2.2436559200286865, + "eval_logits/rejected": -2.1935153007507324, + "eval_logps/chosen": -239.7000732421875, + "eval_logps/rejected": -255.3717041015625, + "eval_loss": 0.5431502461433411, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -4.373907089233398, + "eval_rewards/margins": 2.723952531814575, + "eval_rewards/rejected": -7.097860336303711, + "eval_runtime": 132.7336, + "eval_samples_per_second": 23.777, + "eval_steps_per_second": 0.377, + "step": 4500 + }, + { + "epoch": 1.09, + "learning_rate": 3.545640934212872e-07, + "logits/chosen": -2.565476417541504, + "logits/rejected": -2.3339710235595703, + "logps/chosen": -237.61129760742188, + "logps/rejected": -227.32192993164062, + "loss": 0.0612, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6042459607124329, + "rewards/margins": 8.681166648864746, + "rewards/rejected": -8.076921463012695, "step": 4510 }, { - "epoch": 1.14, - "learning_rate": 3.4397528321318223e-07, - "logits/chosen": -2.7152912616729736, - "logits/rejected": -2.70686674118042, - "logps/chosen": -314.1294250488281, - "logps/rejected": -342.2290954589844, - "loss": 0.0982, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.29457229375839233, - "rewards/margins": 6.949118614196777, - "rewards/rejected": -7.243691444396973, + "epoch": 1.09, + "learning_rate": 3.541183811731146e-07, + "logits/chosen": -2.5199179649353027, + "logits/rejected": -2.5182766914367676, + "logps/chosen": -271.98052978515625, + "logps/rejected": -282.78900146484375, + "loss": 0.0578, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06733126938343048, + "rewards/margins": 8.899940490722656, + "rewards/rejected": -8.96727180480957, "step": 4520 }, { - "epoch": 1.15, - "learning_rate": 3.435071622507256e-07, - "logits/chosen": -2.5364561080932617, - "logits/rejected": -2.527350902557373, - "logps/chosen": -284.78472900390625, - "logps/rejected": -373.36517333984375, - "loss": 0.1348, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.7594577074050903, - "rewards/margins": 8.074015617370605, - "rewards/rejected": -8.833474159240723, + "epoch": 1.09, + "learning_rate": 3.5367266892494204e-07, + "logits/chosen": -2.6337881088256836, + "logits/rejected": -2.545973300933838, + "logps/chosen": -298.5050964355469, + "logps/rejected": -298.3019714355469, + "loss": 0.066, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3560872077941895, + "rewards/margins": 5.780177116394043, + "rewards/rejected": -7.136263847351074, "step": 4530 }, { - "epoch": 1.15, - "learning_rate": 3.4303904128826887e-07, - "logits/chosen": -2.800316333770752, - "logits/rejected": -2.845433235168457, - "logps/chosen": -252.117431640625, - "logps/rejected": -345.842529296875, - "loss": 0.1883, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.2000238597393036, - "rewards/margins": 9.164016723632812, - "rewards/rejected": -8.963993072509766, + "epoch": 1.09, + "learning_rate": 3.5322695667676944e-07, + "logits/chosen": -2.511812210083008, + "logits/rejected": -2.4725537300109863, + "logps/chosen": -203.6138458251953, + "logps/rejected": -303.0009460449219, + "loss": 0.0885, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6344941854476929, + "rewards/margins": 7.031195640563965, + "rewards/rejected": -7.665688991546631, "step": 4540 }, { - "epoch": 1.15, - "learning_rate": 3.425709203258122e-07, - "logits/chosen": -2.722050905227661, - "logits/rejected": -2.7154502868652344, - "logps/chosen": -240.56130981445312, - "logps/rejected": -348.01336669921875, - "loss": 0.0954, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.9766684770584106, - "rewards/margins": 6.593020439147949, - "rewards/rejected": -7.5696892738342285, + "epoch": 1.1, + "learning_rate": 3.5278124442859684e-07, + "logits/chosen": -2.5235538482666016, + "logits/rejected": -2.491658926010132, + "logps/chosen": -240.34835815429688, + "logps/rejected": -352.4381103515625, + "loss": 0.0792, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8006995916366577, + "rewards/margins": 11.205166816711426, + "rewards/rejected": -10.40446662902832, "step": 4550 }, { - "epoch": 1.15, - "learning_rate": 3.4210279936335544e-07, - "logits/chosen": -2.747832775115967, - "logits/rejected": -2.6978516578674316, - "logps/chosen": -357.05572509765625, - "logps/rejected": -413.34906005859375, - "loss": 0.0741, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.316377639770508, - "rewards/margins": 11.051581382751465, - "rewards/rejected": -8.735204696655273, + "epoch": 1.1, + "learning_rate": 3.523355321804243e-07, + "logits/chosen": -2.6103668212890625, + "logits/rejected": -2.637732744216919, + "logps/chosen": -304.49053955078125, + "logps/rejected": -387.01751708984375, + "loss": 0.1344, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.8124135732650757, + "rewards/margins": 7.884924411773682, + "rewards/rejected": -7.072511196136475, "step": 4560 }, { - "epoch": 1.16, - "learning_rate": 3.416346784008988e-07, - "logits/chosen": -2.6359996795654297, - "logits/rejected": -2.667074203491211, - "logps/chosen": -311.69647216796875, - "logps/rejected": -352.11676025390625, - "loss": 0.1039, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.8526498079299927, - "rewards/margins": 8.114217758178711, - "rewards/rejected": -8.966867446899414, + "epoch": 1.1, + "learning_rate": 3.518898199322517e-07, + "logits/chosen": -2.6975045204162598, + "logits/rejected": -2.6109113693237305, + "logps/chosen": -341.67010498046875, + "logps/rejected": -328.21160888671875, + "loss": 0.1154, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.39225417375564575, + "rewards/margins": 6.560842037200928, + "rewards/rejected": -6.168587684631348, "step": 4570 }, { - "epoch": 1.16, - "learning_rate": 3.411665574384421e-07, - "logits/chosen": -2.653594970703125, - "logits/rejected": -2.585784435272217, - "logps/chosen": -350.00238037109375, - "logps/rejected": -349.8188781738281, - "loss": 0.1114, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.14014729857444763, - "rewards/margins": 7.541351318359375, - "rewards/rejected": -7.4012041091918945, + "epoch": 1.1, + "learning_rate": 3.514441076840791e-07, + "logits/chosen": -2.5345098972320557, + "logits/rejected": -2.476235866546631, + "logps/chosen": -213.5447235107422, + "logps/rejected": -306.55084228515625, + "loss": 0.1685, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8820182681083679, + "rewards/margins": 6.847736358642578, + "rewards/rejected": -7.729754447937012, "step": 4580 }, { - "epoch": 1.16, - "learning_rate": 3.406984364759854e-07, - "logits/chosen": -2.7345433235168457, - "logits/rejected": -2.7317304611206055, - "logps/chosen": -285.2073974609375, - "logps/rejected": -375.93621826171875, - "loss": 0.085, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3569324314594269, - "rewards/margins": 7.82189416885376, - "rewards/rejected": -8.178826332092285, + "epoch": 1.1, + "learning_rate": 3.5099839543590656e-07, + "logits/chosen": -2.402772903442383, + "logits/rejected": -2.4007513523101807, + "logps/chosen": -304.1368713378906, + "logps/rejected": -528.6956787109375, + "loss": 0.1436, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7980182766914368, + "rewards/margins": 12.790114402770996, + "rewards/rejected": -11.992096900939941, "step": 4590 }, { - "epoch": 1.16, - "learning_rate": 3.4023031551352865e-07, - "logits/chosen": -2.5770225524902344, - "logits/rejected": -2.5470917224884033, - "logps/chosen": -216.72360229492188, - "logps/rejected": -259.01409912109375, - "loss": 0.0678, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.25177958607673645, - "rewards/margins": 5.54573917388916, - "rewards/rejected": -5.797519683837891, + "epoch": 1.11, + "learning_rate": 3.50552683187734e-07, + "logits/chosen": -2.635467052459717, + "logits/rejected": -2.5282578468322754, + "logps/chosen": -339.1761169433594, + "logps/rejected": -308.29852294921875, + "loss": 0.0959, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.43784698843955994, + "rewards/margins": 6.931831359863281, + "rewards/rejected": -6.493984222412109, "step": 4600 }, { - "epoch": 1.17, - "learning_rate": 3.3976219455107194e-07, - "logits/chosen": -2.4865548610687256, - "logits/rejected": -2.4736034870147705, - "logps/chosen": -241.6890106201172, - "logps/rejected": -293.00665283203125, - "loss": 0.0821, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.0089930295944214, - "rewards/margins": 5.638051509857178, - "rewards/rejected": -6.6470441818237305, + "epoch": 1.11, + "eval_logits/chosen": -2.3271560668945312, + "eval_logits/rejected": -2.2859771251678467, + "eval_logps/chosen": -235.74502563476562, + "eval_logps/rejected": -247.6283721923828, + "eval_loss": 0.5362380743026733, + "eval_rewards/accuracies": 0.6899999976158142, + "eval_rewards/chosen": -3.978400468826294, + "eval_rewards/margins": 2.3451268672943115, + "eval_rewards/rejected": -6.323526382446289, + "eval_runtime": 132.7857, + "eval_samples_per_second": 23.768, + "eval_steps_per_second": 0.377, + "step": 4600 + }, + { + "epoch": 1.11, + "learning_rate": 3.501069709395614e-07, + "logits/chosen": -2.7260050773620605, + "logits/rejected": -2.667550802230835, + "logps/chosen": -288.2752380371094, + "logps/rejected": -295.8538513183594, + "loss": 0.0898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07223912328481674, + "rewards/margins": 7.642079830169678, + "rewards/rejected": -7.714318752288818, "step": 4610 }, { - "epoch": 1.17, - "learning_rate": 3.392940735886153e-07, - "logits/chosen": -2.670954704284668, - "logits/rejected": -2.6923716068267822, - "logps/chosen": -236.7271728515625, - "logps/rejected": -395.5998229980469, - "loss": 0.0744, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.6401809453964233, - "rewards/margins": 11.688632011413574, - "rewards/rejected": -10.048450469970703, + "epoch": 1.11, + "learning_rate": 3.496612586913889e-07, + "logits/chosen": -2.592433452606201, + "logits/rejected": -2.5326685905456543, + "logps/chosen": -355.7337646484375, + "logps/rejected": -375.61810302734375, + "loss": 0.1024, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5456161499023438, + "rewards/margins": 9.507122993469238, + "rewards/rejected": -8.961506843566895, "step": 4620 }, { - "epoch": 1.17, - "learning_rate": 3.388259526261586e-07, - "logits/chosen": -2.740015983581543, - "logits/rejected": -2.6743392944335938, - "logps/chosen": -323.6190185546875, - "logps/rejected": -393.6664733886719, - "loss": 0.1134, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.9889957904815674, - "rewards/margins": 11.044259071350098, - "rewards/rejected": -9.05526351928711, + "epoch": 1.11, + "learning_rate": 3.492155464432163e-07, + "logits/chosen": -2.772498607635498, + "logits/rejected": -2.6470677852630615, + "logps/chosen": -391.77978515625, + "logps/rejected": -311.52947998046875, + "loss": 0.0508, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7689968943595886, + "rewards/margins": 6.9812493324279785, + "rewards/rejected": -6.212252140045166, "step": 4630 }, { - "epoch": 1.17, - "learning_rate": 3.383578316637019e-07, - "logits/chosen": -2.745980978012085, - "logits/rejected": -2.6363847255706787, - "logps/chosen": -222.12136840820312, - "logps/rejected": -237.53622436523438, - "loss": 0.0668, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.422921746969223, - "rewards/margins": 5.303702354431152, - "rewards/rejected": -5.726624011993408, + "epoch": 1.12, + "learning_rate": 3.487698341950437e-07, + "logits/chosen": -2.6463332176208496, + "logits/rejected": -2.582622528076172, + "logps/chosen": -219.696533203125, + "logps/rejected": -249.7501983642578, + "loss": 0.0924, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6100350618362427, + "rewards/margins": 8.142468452453613, + "rewards/rejected": -8.752503395080566, "step": 4640 }, { - "epoch": 1.18, - "learning_rate": 3.3788971070124515e-07, - "logits/chosen": -2.6138062477111816, - "logits/rejected": -2.5201363563537598, - "logps/chosen": -296.3515625, - "logps/rejected": -307.6025085449219, - "loss": 0.0929, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3155912458896637, - "rewards/margins": 6.940545558929443, - "rewards/rejected": -7.256136417388916, + "epoch": 1.12, + "learning_rate": 3.483241219468711e-07, + "logits/chosen": -2.5509085655212402, + "logits/rejected": -2.643665313720703, + "logps/chosen": -245.15481567382812, + "logps/rejected": -352.7142333984375, + "loss": 0.1185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.651538610458374, + "rewards/margins": 7.158326148986816, + "rewards/rejected": -7.8098649978637695, "step": 4650 }, { - "epoch": 1.18, - "learning_rate": 3.374215897387885e-07, - "logits/chosen": -2.8091039657592773, - "logits/rejected": -2.78580904006958, - "logps/chosen": -320.61651611328125, - "logps/rejected": -311.21051025390625, - "loss": 0.0875, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5112084746360779, - "rewards/margins": 6.893043518066406, - "rewards/rejected": -6.381835460662842, + "epoch": 1.12, + "learning_rate": 3.4787840969869854e-07, + "logits/chosen": -2.544185161590576, + "logits/rejected": -2.5211024284362793, + "logps/chosen": -282.38079833984375, + "logps/rejected": -336.9720764160156, + "loss": 0.0738, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7497472763061523, + "rewards/margins": 8.456887245178223, + "rewards/rejected": -9.206633567810059, "step": 4660 }, { - "epoch": 1.18, - "learning_rate": 3.369534687763318e-07, - "logits/chosen": -2.6944401264190674, - "logits/rejected": -2.6727585792541504, - "logps/chosen": -221.2980194091797, - "logps/rejected": -306.5845947265625, - "loss": 0.0994, + "epoch": 1.12, + "learning_rate": 3.4743269745052594e-07, + "logits/chosen": -2.669247627258301, + "logits/rejected": -2.607506275177002, + "logps/chosen": -191.70323181152344, + "logps/rejected": -238.08578491210938, + "loss": 0.0699, "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.10195112228393555, - "rewards/margins": 8.119444847106934, - "rewards/rejected": -8.017494201660156, + "rewards/chosen": -1.4127624034881592, + "rewards/margins": 6.265049934387207, + "rewards/rejected": -7.677813529968262, "step": 4670 }, { - "epoch": 1.18, - "learning_rate": 3.3648534781387513e-07, - "logits/chosen": -2.3914220333099365, - "logits/rejected": -2.2504477500915527, - "logps/chosen": -424.81341552734375, - "logps/rejected": -369.765869140625, - "loss": 0.086, + "epoch": 1.13, + "learning_rate": 3.4698698520235335e-07, + "logits/chosen": -2.587364673614502, + "logits/rejected": -2.480146646499634, + "logps/chosen": -208.7715606689453, + "logps/rejected": -297.1170959472656, + "loss": 0.095, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.4902187585830688, - "rewards/margins": 6.820440769195557, - "rewards/rejected": -8.310659408569336, + "rewards/chosen": -0.8396922945976257, + "rewards/margins": 7.457869052886963, + "rewards/rejected": -8.297561645507812, "step": 4680 }, { - "epoch": 1.19, - "learning_rate": 3.3601722685141836e-07, - "logits/chosen": -2.843759059906006, - "logits/rejected": -2.71614670753479, - "logps/chosen": -283.5601501464844, - "logps/rejected": -440.706298828125, - "loss": 0.0882, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.2339136600494385, - "rewards/margins": 10.507527351379395, - "rewards/rejected": -9.273611068725586, + "epoch": 1.13, + "learning_rate": 3.465412729541808e-07, + "logits/chosen": -2.4993512630462646, + "logits/rejected": -2.4447569847106934, + "logps/chosen": -306.6724853515625, + "logps/rejected": -343.8681945800781, + "loss": 0.0658, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.442502498626709, + "rewards/margins": 6.827431678771973, + "rewards/rejected": -9.269933700561523, "step": 4690 }, { - "epoch": 1.19, - "learning_rate": 3.355491058889617e-07, - "logits/chosen": -2.504859447479248, - "logits/rejected": -2.501770496368408, - "logps/chosen": -257.53802490234375, - "logps/rejected": -385.2470703125, - "loss": 0.0999, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.575905442237854, - "rewards/margins": 7.368882656097412, - "rewards/rejected": -8.944788932800293, + "epoch": 1.13, + "learning_rate": 3.460955607060082e-07, + "logits/chosen": -2.7185981273651123, + "logits/rejected": -2.6675920486450195, + "logps/chosen": -329.14398193359375, + "logps/rejected": -330.47552490234375, + "loss": 0.1177, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.15235868096351624, + "rewards/margins": 6.318484306335449, + "rewards/rejected": -6.4708428382873535, "step": 4700 }, { - "epoch": 1.19, - "learning_rate": 3.35080984926505e-07, - "logits/chosen": -2.812898635864258, - "logits/rejected": -2.7729899883270264, - "logps/chosen": -301.29193115234375, - "logps/rejected": -374.1706848144531, - "loss": 0.0783, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.030539250001311302, - "rewards/margins": 7.485116481781006, - "rewards/rejected": -7.4545769691467285, + "epoch": 1.13, + "eval_logits/chosen": -2.3682165145874023, + "eval_logits/rejected": -2.32588791847229, + "eval_logps/chosen": -237.89369201660156, + "eval_logps/rejected": -252.82949829101562, + "eval_loss": 0.5411165952682495, + "eval_rewards/accuracies": 0.6800000071525574, + "eval_rewards/chosen": -4.193268299102783, + "eval_rewards/margins": 2.650369644165039, + "eval_rewards/rejected": -6.8436384201049805, + "eval_runtime": 132.8212, + "eval_samples_per_second": 23.761, + "eval_steps_per_second": 0.376, + "step": 4700 + }, + { + "epoch": 1.13, + "learning_rate": 3.456498484578356e-07, + "logits/chosen": -2.7592391967773438, + "logits/rejected": -2.6942925453186035, + "logps/chosen": -222.16946411132812, + "logps/rejected": -292.944091796875, + "loss": 0.1104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08918152004480362, + "rewards/margins": 6.620908260345459, + "rewards/rejected": -6.710089683532715, "step": 4710 }, { - "epoch": 1.19, - "learning_rate": 3.346128639640483e-07, - "logits/chosen": -2.7590649127960205, - "logits/rejected": -2.6491782665252686, - "logps/chosen": -356.0698547363281, - "logps/rejected": -367.9351501464844, - "loss": 0.2066, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5648988485336304, - "rewards/margins": 6.791413307189941, - "rewards/rejected": -7.356311798095703, + "epoch": 1.14, + "learning_rate": 3.4520413620966306e-07, + "logits/chosen": -2.539498805999756, + "logits/rejected": -2.586599588394165, + "logps/chosen": -194.7329559326172, + "logps/rejected": -309.6669921875, + "loss": 0.1663, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.045355200767517, + "rewards/margins": 8.331487655639648, + "rewards/rejected": -7.286134243011475, "step": 4720 }, { - "epoch": 1.2, - "learning_rate": 3.3414474300159163e-07, - "logits/chosen": -2.4159865379333496, - "logits/rejected": -2.5030646324157715, - "logps/chosen": -272.33172607421875, - "logps/rejected": -328.46258544921875, - "loss": 0.1267, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.2051588296890259, - "rewards/margins": 7.337274074554443, - "rewards/rejected": -8.54243278503418, + "epoch": 1.14, + "learning_rate": 3.4475842396149047e-07, + "logits/chosen": -2.544127941131592, + "logits/rejected": -2.5349574089050293, + "logps/chosen": -229.8127899169922, + "logps/rejected": -345.43902587890625, + "loss": 0.0747, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.279430091381073, + "rewards/margins": 7.787337303161621, + "rewards/rejected": -8.066767692565918, "step": 4730 }, { - "epoch": 1.2, - "learning_rate": 3.3367662203913486e-07, - "logits/chosen": -2.6770882606506348, - "logits/rejected": -2.4457528591156006, - "logps/chosen": -304.1426696777344, - "logps/rejected": -284.6874084472656, - "loss": 0.0785, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.562188982963562, - "rewards/margins": 8.102972030639648, - "rewards/rejected": -8.665160179138184, + "epoch": 1.14, + "learning_rate": 3.4431271171331787e-07, + "logits/chosen": -2.6528468132019043, + "logits/rejected": -2.482433319091797, + "logps/chosen": -293.0240173339844, + "logps/rejected": -320.66949462890625, + "loss": 0.1169, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.465946763753891, + "rewards/margins": 6.958950042724609, + "rewards/rejected": -7.424896240234375, "step": 4740 }, { - "epoch": 1.2, - "learning_rate": 3.332085010766782e-07, - "logits/chosen": -2.6974499225616455, - "logits/rejected": -2.589693784713745, - "logps/chosen": -322.5655212402344, - "logps/rejected": -326.5779113769531, - "loss": 0.1351, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.5035828948020935, - "rewards/margins": 7.923377990722656, - "rewards/rejected": -7.41979455947876, + "epoch": 1.14, + "learning_rate": 3.438669994651453e-07, + "logits/chosen": -2.5880727767944336, + "logits/rejected": -2.513706922531128, + "logps/chosen": -201.38583374023438, + "logps/rejected": -324.92034912109375, + "loss": 0.0796, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5520162582397461, + "rewards/margins": 9.13754653930664, + "rewards/rejected": -8.585530281066895, "step": 4750 }, { - "epoch": 1.2, - "learning_rate": 3.327403801142215e-07, - "logits/chosen": -2.536750555038452, - "logits/rejected": -2.445263624191284, - "logps/chosen": -219.2605743408203, - "logps/rejected": -300.7672119140625, - "loss": 0.0623, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.11792264133691788, - "rewards/margins": 8.396120071411133, - "rewards/rejected": -8.514042854309082, + "epoch": 1.15, + "learning_rate": 3.4342128721697273e-07, + "logits/chosen": -2.5701522827148438, + "logits/rejected": -2.4036645889282227, + "logps/chosen": -254.9263153076172, + "logps/rejected": -267.2220153808594, + "loss": 0.1423, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.150063157081604, + "rewards/margins": 5.408053398132324, + "rewards/rejected": -6.558116912841797, "step": 4760 }, { - "epoch": 1.21, - "learning_rate": 3.3227225915176484e-07, - "logits/chosen": -2.708127498626709, - "logits/rejected": -2.5230722427368164, - "logps/chosen": -322.44451904296875, - "logps/rejected": -329.8259582519531, - "loss": 0.061, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.0522180795669556, - "rewards/margins": 9.174871444702148, - "rewards/rejected": -8.12265396118164, + "epoch": 1.15, + "learning_rate": 3.4297557496880013e-07, + "logits/chosen": -2.7101962566375732, + "logits/rejected": -2.5529160499572754, + "logps/chosen": -307.70550537109375, + "logps/rejected": -323.7502136230469, + "loss": 0.055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13209082186222076, + "rewards/margins": 8.014756202697754, + "rewards/rejected": -8.146845817565918, "step": 4770 }, { - "epoch": 1.21, - "learning_rate": 3.318041381893081e-07, - "logits/chosen": -2.3173861503601074, - "logits/rejected": -2.3874242305755615, - "logps/chosen": -357.01116943359375, - "logps/rejected": -348.6958312988281, - "loss": 0.0629, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.6036861538887024, - "rewards/margins": 6.142181873321533, - "rewards/rejected": -6.7458672523498535, + "epoch": 1.15, + "learning_rate": 3.425298627206276e-07, + "logits/chosen": -2.438110828399658, + "logits/rejected": -2.5066728591918945, + "logps/chosen": -224.5572052001953, + "logps/rejected": -314.0223083496094, + "loss": 0.1131, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0630565881729126, + "rewards/margins": 7.094944953918457, + "rewards/rejected": -7.031888484954834, "step": 4780 }, { - "epoch": 1.21, - "learning_rate": 3.313360172268514e-07, - "logits/chosen": -2.5158562660217285, - "logits/rejected": -2.5699989795684814, - "logps/chosen": -304.5142822265625, - "logps/rejected": -355.92901611328125, - "loss": 0.1261, + "epoch": 1.15, + "learning_rate": 3.42084150472455e-07, + "logits/chosen": -2.677676200866699, + "logits/rejected": -2.4428439140319824, + "logps/chosen": -260.1240539550781, + "logps/rejected": -322.24114990234375, + "loss": 0.2174, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.19178393483161926, - "rewards/margins": 7.122007846832275, - "rewards/rejected": -7.313791751861572, + "rewards/chosen": -1.865907907485962, + "rewards/margins": 7.181874752044678, + "rewards/rejected": -9.047781944274902, "step": 4790 }, { - "epoch": 1.21, - "learning_rate": 3.308678962643947e-07, - "logits/chosen": -2.7196245193481445, - "logits/rejected": -2.601762294769287, - "logps/chosen": -333.9893798828125, - "logps/rejected": -354.0044250488281, - "loss": 0.0989, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.6505855321884155, - "rewards/margins": 9.573948860168457, - "rewards/rejected": -7.923361778259277, + "epoch": 1.16, + "learning_rate": 3.416384382242824e-07, + "logits/chosen": -2.5137696266174316, + "logits/rejected": -2.5519754886627197, + "logps/chosen": -209.14480590820312, + "logps/rejected": -273.2664489746094, + "loss": 0.1651, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.874838650226593, + "rewards/margins": 6.106249809265137, + "rewards/rejected": -6.981088161468506, "step": 4800 }, { - "epoch": 1.22, - "learning_rate": 3.3039977530193805e-07, - "logits/chosen": -2.6281702518463135, - "logits/rejected": -2.604680299758911, - "logps/chosen": -213.1340789794922, - "logps/rejected": -235.78866577148438, - "loss": 0.0867, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.426002562046051, - "rewards/margins": 6.862097263336182, - "rewards/rejected": -7.288099765777588, + "epoch": 1.16, + "eval_logits/chosen": -2.3139142990112305, + "eval_logits/rejected": -2.2752561569213867, + "eval_logps/chosen": -244.1190185546875, + "eval_logps/rejected": -251.6221466064453, + "eval_loss": 0.5736638307571411, + "eval_rewards/accuracies": 0.6700000166893005, + "eval_rewards/chosen": -4.815803050994873, + "eval_rewards/margins": 1.9071028232574463, + "eval_rewards/rejected": -6.722906112670898, + "eval_runtime": 132.5951, + "eval_samples_per_second": 23.802, + "eval_steps_per_second": 0.377, + "step": 4800 + }, + { + "epoch": 1.16, + "learning_rate": 3.411927259761098e-07, + "logits/chosen": -2.599360704421997, + "logits/rejected": -2.530350685119629, + "logps/chosen": -227.35000610351562, + "logps/rejected": -254.7102508544922, + "loss": 0.1268, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8383834958076477, + "rewards/margins": 6.42560338973999, + "rewards/rejected": -7.2639875411987305, "step": 4810 }, { - "epoch": 1.22, - "learning_rate": 3.299316543394813e-07, - "logits/chosen": -2.5487630367279053, - "logits/rejected": -2.3977303504943848, - "logps/chosen": -308.79803466796875, - "logps/rejected": -307.55865478515625, - "loss": 0.1295, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.6266343593597412, - "rewards/margins": 7.827906608581543, - "rewards/rejected": -8.454541206359863, + "epoch": 1.16, + "learning_rate": 3.4074701372793725e-07, + "logits/chosen": -2.609750509262085, + "logits/rejected": -2.3782925605773926, + "logps/chosen": -238.8041229248047, + "logps/rejected": -273.5411376953125, + "loss": 0.1, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.1289758682250977, + "rewards/margins": 6.446280479431152, + "rewards/rejected": -8.575257301330566, "step": 4820 }, { - "epoch": 1.22, - "learning_rate": 3.294635333770246e-07, - "logits/chosen": -2.5832231044769287, - "logits/rejected": -2.551140546798706, - "logps/chosen": -260.5826416015625, - "logps/rejected": -373.8498840332031, - "loss": 0.0825, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.6590865254402161, - "rewards/margins": 10.208707809448242, - "rewards/rejected": -9.54962158203125, + "epoch": 1.16, + "learning_rate": 3.4030130147976465e-07, + "logits/chosen": -2.5147125720977783, + "logits/rejected": -2.529360055923462, + "logps/chosen": -322.40313720703125, + "logps/rejected": -392.3272399902344, + "loss": 0.0876, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.04858759790658951, + "rewards/margins": 9.451199531555176, + "rewards/rejected": -9.499788284301758, "step": 4830 }, { - "epoch": 1.22, - "learning_rate": 3.289954124145679e-07, - "logits/chosen": -2.5220448970794678, - "logits/rejected": -2.5066912174224854, - "logps/chosen": -262.2845764160156, - "logps/rejected": -257.2518005371094, - "loss": 0.2061, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.8599599599838257, - "rewards/margins": 5.666739463806152, - "rewards/rejected": -7.526698112487793, + "epoch": 1.16, + "learning_rate": 3.3985558923159206e-07, + "logits/chosen": -2.6147255897521973, + "logits/rejected": -2.5742712020874023, + "logps/chosen": -286.0497741699219, + "logps/rejected": -307.2676696777344, + "loss": 0.0789, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1921635866165161, + "rewards/margins": 8.052736282348633, + "rewards/rejected": -9.244898796081543, "step": 4840 }, { - "epoch": 1.23, - "learning_rate": 3.285272914521112e-07, - "logits/chosen": -2.4081637859344482, - "logits/rejected": -2.360440254211426, - "logps/chosen": -210.6362762451172, - "logps/rejected": -366.09283447265625, - "loss": 0.1449, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.3536659479141235, - "rewards/margins": 6.394976615905762, - "rewards/rejected": -7.748641014099121, + "epoch": 1.17, + "learning_rate": 3.394098769834195e-07, + "logits/chosen": -2.7272768020629883, + "logits/rejected": -2.753826856613159, + "logps/chosen": -315.0127258300781, + "logps/rejected": -388.98699951171875, + "loss": 0.1145, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4324567317962646, + "rewards/margins": 7.052580833435059, + "rewards/rejected": -8.485038757324219, "step": 4850 }, { - "epoch": 1.23, - "learning_rate": 3.2805917048965455e-07, - "logits/chosen": -2.679720640182495, - "logits/rejected": -2.585204839706421, - "logps/chosen": -300.98736572265625, - "logps/rejected": -262.6742858886719, - "loss": 0.0843, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.46402186155319214, - "rewards/margins": 5.079150199890137, - "rewards/rejected": -5.5431718826293945, + "epoch": 1.17, + "learning_rate": 3.389641647352469e-07, + "logits/chosen": -2.710136890411377, + "logits/rejected": -2.659062623977661, + "logps/chosen": -191.16256713867188, + "logps/rejected": -300.2677307128906, + "loss": 0.0853, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.317988872528076, + "rewards/margins": 6.529849052429199, + "rewards/rejected": -8.847837448120117, "step": 4860 }, { - "epoch": 1.23, - "learning_rate": 3.275910495271978e-07, - "logits/chosen": -2.487708330154419, - "logits/rejected": -2.5284218788146973, - "logps/chosen": -231.89138793945312, - "logps/rejected": -356.9471740722656, - "loss": 0.0977, + "epoch": 1.17, + "learning_rate": 3.385184524870743e-07, + "logits/chosen": -2.782958745956421, + "logits/rejected": -2.6507785320281982, + "logps/chosen": -358.86138916015625, + "logps/rejected": -265.9754333496094, + "loss": 0.0866, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.6061888933181763, - "rewards/margins": 8.890503883361816, - "rewards/rejected": -8.284314155578613, + "rewards/chosen": -1.2561192512512207, + "rewards/margins": 5.585455417633057, + "rewards/rejected": -6.841574668884277, "step": 4870 }, { - "epoch": 1.23, - "learning_rate": 3.271229285647411e-07, - "logits/chosen": -2.5985474586486816, - "logits/rejected": -2.5037901401519775, - "logps/chosen": -319.89703369140625, - "logps/rejected": -444.4794921875, - "loss": 0.1161, + "epoch": 1.17, + "learning_rate": 3.380727402389018e-07, + "logits/chosen": -2.715045213699341, + "logits/rejected": -2.64947772026062, + "logps/chosen": -216.85092163085938, + "logps/rejected": -374.9673767089844, + "loss": 0.1258, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.2769446074962616, - "rewards/margins": 8.904375076293945, - "rewards/rejected": -8.627429008483887, + "rewards/chosen": -1.7327502965927124, + "rewards/margins": 8.963811874389648, + "rewards/rejected": -10.696561813354492, "step": 4880 }, { - "epoch": 1.24, - "learning_rate": 3.266548076022844e-07, - "logits/chosen": -2.5160458087921143, - "logits/rejected": -2.3146345615386963, - "logps/chosen": -263.0496826171875, - "logps/rejected": -347.59539794921875, - "loss": 0.0951, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.4355698227882385, - "rewards/margins": 8.083341598510742, - "rewards/rejected": -8.518911361694336, + "epoch": 1.18, + "learning_rate": 3.376270279907292e-07, + "logits/chosen": -2.8498542308807373, + "logits/rejected": -2.718228340148926, + "logps/chosen": -300.4696350097656, + "logps/rejected": -258.2084045410156, + "loss": 0.1316, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.26651516556739807, + "rewards/margins": 6.5299859046936035, + "rewards/rejected": -6.796500205993652, "step": 4890 }, { - "epoch": 1.24, - "learning_rate": 3.2618668663982776e-07, - "logits/chosen": -2.3110601902008057, - "logits/rejected": -2.304619073867798, - "logps/chosen": -289.2687072753906, - "logps/rejected": -312.71832275390625, - "loss": 0.0807, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.874498724937439, - "rewards/margins": 5.567784309387207, - "rewards/rejected": -6.442282676696777, + "epoch": 1.18, + "learning_rate": 3.371813157425566e-07, + "logits/chosen": -2.6546387672424316, + "logits/rejected": -2.5926167964935303, + "logps/chosen": -350.89385986328125, + "logps/rejected": -285.74664306640625, + "loss": 0.1298, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0249881744384766, + "rewards/margins": 6.851668357849121, + "rewards/rejected": -7.876657962799072, "step": 4900 }, { - "epoch": 1.24, - "learning_rate": 3.25718565677371e-07, - "logits/chosen": -2.3375391960144043, - "logits/rejected": -2.280052661895752, - "logps/chosen": -351.435791015625, - "logps/rejected": -335.129150390625, - "loss": 0.1079, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.4033665060997009, - "rewards/margins": 8.237372398376465, - "rewards/rejected": -7.834007263183594, + "epoch": 1.18, + "eval_logits/chosen": -2.5187530517578125, + "eval_logits/rejected": -2.4855663776397705, + "eval_logps/chosen": -242.4874267578125, + "eval_logps/rejected": -252.8262481689453, + "eval_loss": 0.5527775287628174, + "eval_rewards/accuracies": 0.6825000047683716, + "eval_rewards/chosen": -4.6526408195495605, + "eval_rewards/margins": 2.1906745433807373, + "eval_rewards/rejected": -6.843315601348877, + "eval_runtime": 132.4858, + "eval_samples_per_second": 23.821, + "eval_steps_per_second": 0.377, + "step": 4900 + }, + { + "epoch": 1.18, + "learning_rate": 3.3673560349438404e-07, + "logits/chosen": -2.679253101348877, + "logits/rejected": -2.8103015422821045, + "logps/chosen": -229.9272918701172, + "logps/rejected": -364.17401123046875, + "loss": 0.1334, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9747058749198914, + "rewards/margins": 9.557844161987305, + "rewards/rejected": -10.532548904418945, "step": 4910 }, { - "epoch": 1.24, - "learning_rate": 3.252504447149143e-07, - "logits/chosen": -2.599074363708496, - "logits/rejected": -2.519040107727051, - "logps/chosen": -295.12054443359375, - "logps/rejected": -278.61279296875, - "loss": 0.0756, + "epoch": 1.18, + "learning_rate": 3.3628989124621144e-07, + "logits/chosen": -2.8708176612854004, + "logits/rejected": -2.730346202850342, + "logps/chosen": -218.0589141845703, + "logps/rejected": -248.0902862548828, + "loss": 0.1046, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.35003671050071716, - "rewards/margins": 5.749134063720703, - "rewards/rejected": -6.099170207977295, + "rewards/chosen": -0.3006719946861267, + "rewards/margins": 7.540896415710449, + "rewards/rejected": -7.841568946838379, "step": 4920 }, { - "epoch": 1.25, - "learning_rate": 3.247823237524576e-07, - "logits/chosen": -2.363374948501587, - "logits/rejected": -2.219956159591675, - "logps/chosen": -272.5213317871094, - "logps/rejected": -356.702880859375, - "loss": 0.0551, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.160892128944397, - "rewards/margins": 6.272316932678223, - "rewards/rejected": -7.433209419250488, + "epoch": 1.19, + "learning_rate": 3.3584417899803884e-07, + "logits/chosen": -2.8268866539001465, + "logits/rejected": -2.6961302757263184, + "logps/chosen": -283.0794677734375, + "logps/rejected": -303.4607849121094, + "loss": 0.0849, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5420979261398315, + "rewards/margins": 6.114461898803711, + "rewards/rejected": -7.65656042098999, "step": 4930 }, { - "epoch": 1.25, - "learning_rate": 3.243142027900009e-07, - "logits/chosen": -2.46087384223938, - "logits/rejected": -2.4047253131866455, - "logps/chosen": -330.67987060546875, - "logps/rejected": -315.9459533691406, - "loss": 0.0981, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.39595699310302734, - "rewards/margins": 8.342355728149414, - "rewards/rejected": -7.9463982582092285, + "epoch": 1.19, + "learning_rate": 3.353984667498663e-07, + "logits/chosen": -2.526602268218994, + "logits/rejected": -2.678028106689453, + "logps/chosen": -256.60125732421875, + "logps/rejected": -292.89447021484375, + "loss": 0.1421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9405930638313293, + "rewards/margins": 6.148417949676514, + "rewards/rejected": -7.089011192321777, "step": 4940 }, { - "epoch": 1.25, - "learning_rate": 3.2384608182754426e-07, - "logits/chosen": -2.372825860977173, - "logits/rejected": -2.363440752029419, - "logps/chosen": -313.22259521484375, - "logps/rejected": -392.95330810546875, - "loss": 0.077, + "epoch": 1.19, + "learning_rate": 3.349527545016937e-07, + "logits/chosen": -2.708590269088745, + "logits/rejected": -2.596550226211548, + "logps/chosen": -374.00347900390625, + "logps/rejected": -392.03533935546875, + "loss": 0.0924, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.1961669921875, - "rewards/margins": 8.941523551940918, - "rewards/rejected": -10.137690544128418, + "rewards/chosen": -0.5931267738342285, + "rewards/margins": 6.627572536468506, + "rewards/rejected": -7.220698356628418, "step": 4950 }, { - "epoch": 1.25, - "learning_rate": 3.233779608650875e-07, - "logits/chosen": -2.536463499069214, - "logits/rejected": -2.463451623916626, - "logps/chosen": -299.01849365234375, - "logps/rejected": -307.9565734863281, - "loss": 0.1718, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.06701017916202545, - "rewards/margins": 6.308807373046875, - "rewards/rejected": -6.37581729888916, + "epoch": 1.19, + "learning_rate": 3.345070422535211e-07, + "logits/chosen": -2.5236542224884033, + "logits/rejected": -2.6090996265411377, + "logps/chosen": -154.9342041015625, + "logps/rejected": -260.1464538574219, + "loss": 0.0815, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.21829000115394592, + "rewards/margins": 7.69723653793335, + "rewards/rejected": -7.91552734375, "step": 4960 }, { - "epoch": 1.26, - "learning_rate": 3.2290983990263084e-07, - "logits/chosen": -2.4642741680145264, - "logits/rejected": -2.242976665496826, - "logps/chosen": -297.47772216796875, - "logps/rejected": -272.5084228515625, - "loss": 0.0671, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.45554548501968384, - "rewards/margins": 7.599942207336426, - "rewards/rejected": -7.144396781921387, + "epoch": 1.2, + "learning_rate": 3.340613300053485e-07, + "logits/chosen": -2.9121837615966797, + "logits/rejected": -2.765437126159668, + "logps/chosen": -295.09576416015625, + "logps/rejected": -415.7850036621094, + "loss": 0.1274, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.07239139825105667, + "rewards/margins": 8.920417785644531, + "rewards/rejected": -8.992809295654297, "step": 4970 }, { - "epoch": 1.26, - "learning_rate": 3.224417189401741e-07, - "logits/chosen": -2.6214382648468018, - "logits/rejected": -2.4945359230041504, - "logps/chosen": -313.21820068359375, - "logps/rejected": -415.0000915527344, - "loss": 0.0588, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.43819770216941833, - "rewards/margins": 8.810786247253418, - "rewards/rejected": -9.248983383178711, + "epoch": 1.2, + "learning_rate": 3.3361561775717596e-07, + "logits/chosen": -2.479745388031006, + "logits/rejected": -2.4983603954315186, + "logps/chosen": -201.68295288085938, + "logps/rejected": -242.7544708251953, + "loss": 0.0656, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40303295850753784, + "rewards/margins": 6.061512470245361, + "rewards/rejected": -6.464545249938965, "step": 4980 }, { - "epoch": 1.26, - "learning_rate": 3.2197359797771747e-07, - "logits/chosen": -2.313692569732666, - "logits/rejected": -2.2011497020721436, - "logps/chosen": -218.2009735107422, - "logps/rejected": -247.17532348632812, - "loss": 0.111, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.7616634368896484, - "rewards/margins": 4.7421064376831055, - "rewards/rejected": -6.503769874572754, + "epoch": 1.2, + "learning_rate": 3.3316990550900336e-07, + "logits/chosen": -2.7950642108917236, + "logits/rejected": -2.7174344062805176, + "logps/chosen": -265.5577392578125, + "logps/rejected": -295.66949462890625, + "loss": 0.2313, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5985932350158691, + "rewards/margins": 8.149724960327148, + "rewards/rejected": -8.748318672180176, "step": 4990 }, { - "epoch": 1.26, - "learning_rate": 3.215054770152607e-07, - "logits/chosen": -2.337268352508545, - "logits/rejected": -2.1621477603912354, - "logps/chosen": -224.9978790283203, - "logps/rejected": -239.8637237548828, - "loss": 0.0488, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3969880938529968, - "rewards/margins": 7.433840751647949, - "rewards/rejected": -7.8308281898498535, + "epoch": 1.2, + "learning_rate": 3.3272419326083077e-07, + "logits/chosen": -2.5686748027801514, + "logits/rejected": -2.6225666999816895, + "logps/chosen": -260.56121826171875, + "logps/rejected": -240.3380126953125, + "loss": 0.1143, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.135986804962158, + "rewards/margins": 5.733573913574219, + "rewards/rejected": -7.869560241699219, "step": 5000 }, { - "epoch": 1.27, - "learning_rate": 3.2103735605280405e-07, - "logits/chosen": -2.4363958835601807, - "logits/rejected": -2.274914026260376, - "logps/chosen": -250.6156463623047, - "logps/rejected": -246.60397338867188, - "loss": 0.1126, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.40992647409439087, - "rewards/margins": 6.342892169952393, - "rewards/rejected": -6.7528181076049805, + "epoch": 1.2, + "eval_logits/chosen": -2.5541951656341553, + "eval_logits/rejected": -2.5190439224243164, + "eval_logps/chosen": -242.17335510253906, + "eval_logps/rejected": -255.1999969482422, + "eval_loss": 0.5511711835861206, + "eval_rewards/accuracies": 0.6800000071525574, + "eval_rewards/chosen": -4.621235370635986, + "eval_rewards/margins": 2.459453821182251, + "eval_rewards/rejected": -7.080688953399658, + "eval_runtime": 132.4706, + "eval_samples_per_second": 23.824, + "eval_steps_per_second": 0.377, + "step": 5000 + }, + { + "epoch": 1.21, + "learning_rate": 3.322784810126582e-07, + "logits/chosen": -2.740840435028076, + "logits/rejected": -2.7909092903137207, + "logps/chosen": -235.01016235351562, + "logps/rejected": -258.4129333496094, + "loss": 0.1798, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9214189052581787, + "rewards/margins": 4.721066474914551, + "rewards/rejected": -6.642485618591309, "step": 5010 }, { - "epoch": 1.27, - "learning_rate": 3.2056923509034734e-07, - "logits/chosen": -2.478621482849121, - "logits/rejected": -2.4945502281188965, - "logps/chosen": -282.9931335449219, - "logps/rejected": -350.892578125, - "loss": 0.083, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5662134289741516, - "rewards/margins": 9.96242904663086, - "rewards/rejected": -9.396215438842773, + "epoch": 1.21, + "learning_rate": 3.318327687644856e-07, + "logits/chosen": -2.8321995735168457, + "logits/rejected": -2.831120252609253, + "logps/chosen": -335.288818359375, + "logps/rejected": -344.8019714355469, + "loss": 0.0988, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.948270320892334, + "rewards/margins": 7.080456733703613, + "rewards/rejected": -8.028726577758789, "step": 5020 }, { - "epoch": 1.27, - "learning_rate": 3.201011141278906e-07, - "logits/chosen": -2.539477586746216, - "logits/rejected": -2.6019339561462402, - "logps/chosen": -190.00582885742188, - "logps/rejected": -270.9634704589844, - "loss": 0.0953, + "epoch": 1.21, + "learning_rate": 3.3138705651631303e-07, + "logits/chosen": -2.750998020172119, + "logits/rejected": -2.779369354248047, + "logps/chosen": -295.0296630859375, + "logps/rejected": -279.68878173828125, + "loss": 0.1164, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.5821501016616821, - "rewards/margins": 8.267257690429688, - "rewards/rejected": -8.849408149719238, + "rewards/chosen": -1.8295824527740479, + "rewards/margins": 5.146444797515869, + "rewards/rejected": -6.976027011871338, "step": 5030 }, { - "epoch": 1.27, - "learning_rate": 3.196329931654339e-07, - "logits/chosen": -2.580522060394287, - "logits/rejected": -2.344433069229126, - "logps/chosen": -213.58950805664062, - "logps/rejected": -231.41171264648438, - "loss": 0.043, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.2818623185157776, - "rewards/margins": 6.4354963302612305, - "rewards/rejected": -6.153634071350098, + "epoch": 1.21, + "learning_rate": 3.309413442681405e-07, + "logits/chosen": -2.8488826751708984, + "logits/rejected": -2.7452878952026367, + "logps/chosen": -227.227294921875, + "logps/rejected": -320.05657958984375, + "loss": 0.0729, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09394042938947678, + "rewards/margins": 6.831442832946777, + "rewards/rejected": -6.925383567810059, "step": 5040 }, { - "epoch": 1.28, - "learning_rate": 3.191648722029772e-07, - "logits/chosen": -2.538010597229004, - "logits/rejected": -2.5986104011535645, - "logps/chosen": -334.59130859375, - "logps/rejected": -352.0889892578125, - "loss": 0.075, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7573752403259277, - "rewards/margins": 7.762686252593994, - "rewards/rejected": -7.005311489105225, + "epoch": 1.22, + "learning_rate": 3.304956320199679e-07, + "logits/chosen": -2.506798267364502, + "logits/rejected": -2.5822832584381104, + "logps/chosen": -216.282470703125, + "logps/rejected": -281.2460021972656, + "loss": 0.0707, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2941832542419434, + "rewards/margins": 6.209543704986572, + "rewards/rejected": -7.503726959228516, "step": 5050 }, { - "epoch": 1.28, - "learning_rate": 3.1869675124052055e-07, - "logits/chosen": -2.246711254119873, - "logits/rejected": -2.192532539367676, - "logps/chosen": -273.1240234375, - "logps/rejected": -297.4888610839844, - "loss": 0.0721, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.6416655778884888, - "rewards/margins": 7.805489540100098, - "rewards/rejected": -8.447155952453613, + "epoch": 1.22, + "learning_rate": 3.300499197717953e-07, + "logits/chosen": -2.795393466949463, + "logits/rejected": -2.6193509101867676, + "logps/chosen": -319.36798095703125, + "logps/rejected": -335.1795959472656, + "loss": 0.0753, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.13285976648330688, + "rewards/margins": 9.38614273071289, + "rewards/rejected": -9.519001960754395, "step": 5060 }, { - "epoch": 1.28, - "learning_rate": 3.1822863027806384e-07, - "logits/chosen": -2.2593188285827637, - "logits/rejected": -2.304441213607788, - "logps/chosen": -214.68161010742188, - "logps/rejected": -375.0998229980469, - "loss": 0.1026, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5545988082885742, - "rewards/margins": 11.266571998596191, - "rewards/rejected": -12.82116985321045, + "epoch": 1.22, + "learning_rate": 3.2960420752362275e-07, + "logits/chosen": -2.790848970413208, + "logits/rejected": -2.6933062076568604, + "logps/chosen": -228.9560089111328, + "logps/rejected": -273.2444763183594, + "loss": 0.077, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6011162996292114, + "rewards/margins": 8.987518310546875, + "rewards/rejected": -9.588634490966797, "step": 5070 }, { - "epoch": 1.28, - "learning_rate": 3.177605093156072e-07, - "logits/chosen": -2.4767723083496094, - "logits/rejected": -2.3222458362579346, - "logps/chosen": -209.7731475830078, - "logps/rejected": -316.46942138671875, - "loss": 0.0994, + "epoch": 1.22, + "learning_rate": 3.2915849527545015e-07, + "logits/chosen": -2.755242109298706, + "logits/rejected": -2.7851879596710205, + "logps/chosen": -226.41183471679688, + "logps/rejected": -368.63677978515625, + "loss": 0.0666, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.6914139986038208, - "rewards/margins": 6.875144958496094, - "rewards/rejected": -7.566558837890625, + "rewards/chosen": -0.5428380370140076, + "rewards/margins": 8.593741416931152, + "rewards/rejected": -9.136579513549805, "step": 5080 }, { - "epoch": 1.29, - "learning_rate": 3.172923883531504e-07, - "logits/chosen": -2.4502346515655518, - "logits/rejected": -2.4465484619140625, - "logps/chosen": -260.17474365234375, - "logps/rejected": -325.09649658203125, - "loss": 0.2634, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.13710087537765503, - "rewards/margins": 7.22597599029541, - "rewards/rejected": -7.363077640533447, + "epoch": 1.23, + "learning_rate": 3.2871278302727755e-07, + "logits/chosen": -2.7453033924102783, + "logits/rejected": -2.747636556625366, + "logps/chosen": -145.54763793945312, + "logps/rejected": -267.27777099609375, + "loss": 0.1007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3984285891056061, + "rewards/margins": 5.825728893280029, + "rewards/rejected": -5.427300453186035, "step": 5090 }, { - "epoch": 1.29, - "learning_rate": 3.1682426739069376e-07, - "logits/chosen": -2.5645554065704346, - "logits/rejected": -2.2416329383850098, - "logps/chosen": -294.3785705566406, - "logps/rejected": -285.55218505859375, - "loss": 0.1383, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.3467468321323395, - "rewards/margins": 5.447103977203369, - "rewards/rejected": -5.793850898742676, + "epoch": 1.23, + "learning_rate": 3.28267070779105e-07, + "logits/chosen": -2.7796096801757812, + "logits/rejected": -2.6456923484802246, + "logps/chosen": -202.91152954101562, + "logps/rejected": -210.74325561523438, + "loss": 0.1145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6019163131713867, + "rewards/margins": 6.083104610443115, + "rewards/rejected": -6.68502140045166, "step": 5100 }, { - "epoch": 1.29, - "learning_rate": 3.1635614642823705e-07, - "logits/chosen": -2.532900333404541, - "logits/rejected": -2.584439754486084, - "logps/chosen": -228.9880828857422, - "logps/rejected": -327.6162414550781, - "loss": 0.1279, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.023542404174804688, - "rewards/margins": 6.46780252456665, - "rewards/rejected": -6.491345405578613, + "epoch": 1.23, + "eval_logits/chosen": -2.6008267402648926, + "eval_logits/rejected": -2.5736794471740723, + "eval_logps/chosen": -236.55941772460938, + "eval_logps/rejected": -250.5395965576172, + "eval_loss": 0.549608588218689, + "eval_rewards/accuracies": 0.6775000095367432, + "eval_rewards/chosen": -4.059840202331543, + "eval_rewards/margins": 2.554811477661133, + "eval_rewards/rejected": -6.614652156829834, + "eval_runtime": 132.4656, + "eval_samples_per_second": 23.825, + "eval_steps_per_second": 0.377, + "step": 5100 + }, + { + "epoch": 1.23, + "learning_rate": 3.278213585309324e-07, + "logits/chosen": -2.8164000511169434, + "logits/rejected": -2.7588438987731934, + "logps/chosen": -239.1763916015625, + "logps/rejected": -258.04486083984375, + "loss": 0.1071, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6350772380828857, + "rewards/margins": 5.319411277770996, + "rewards/rejected": -6.9544878005981445, "step": 5110 }, { - "epoch": 1.29, - "learning_rate": 3.1588802546578033e-07, - "logits/chosen": -2.691772937774658, - "logits/rejected": -2.6354057788848877, - "logps/chosen": -274.74908447265625, - "logps/rejected": -227.7584991455078, - "loss": 0.126, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.2150676250457764, - "rewards/margins": 6.647829532623291, - "rewards/rejected": -5.432761192321777, + "epoch": 1.23, + "learning_rate": 3.273756462827598e-07, + "logits/chosen": -2.5732250213623047, + "logits/rejected": -2.593712091445923, + "logps/chosen": -174.12368774414062, + "logps/rejected": -297.83465576171875, + "loss": 0.0628, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4870151877403259, + "rewards/margins": 7.844290256500244, + "rewards/rejected": -8.331304550170898, "step": 5120 }, { - "epoch": 1.3, - "learning_rate": 3.154199045033236e-07, - "logits/chosen": -2.686004161834717, - "logits/rejected": -2.56087064743042, - "logps/chosen": -267.67657470703125, - "logps/rejected": -256.8505859375, - "loss": 0.0943, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.1638038456439972, - "rewards/margins": 5.643346309661865, - "rewards/rejected": -5.807150840759277, + "epoch": 1.23, + "learning_rate": 3.269299340345872e-07, + "logits/chosen": -2.608652353286743, + "logits/rejected": -2.5631163120269775, + "logps/chosen": -275.79022216796875, + "logps/rejected": -346.18414306640625, + "loss": 0.1391, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.4670010805130005, + "rewards/margins": 11.63456916809082, + "rewards/rejected": -10.167566299438477, "step": 5130 }, { - "epoch": 1.3, - "learning_rate": 3.149517835408669e-07, - "logits/chosen": -2.780150890350342, - "logits/rejected": -2.749056100845337, - "logps/chosen": -260.70452880859375, - "logps/rejected": -246.4949951171875, - "loss": 0.1032, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.6062029600143433, - "rewards/margins": 5.213749885559082, - "rewards/rejected": -5.81995153427124, + "epoch": 1.24, + "learning_rate": 3.2648422178641467e-07, + "logits/chosen": -2.7001876831054688, + "logits/rejected": -2.6927671432495117, + "logps/chosen": -279.2212219238281, + "logps/rejected": -353.4243469238281, + "loss": 0.1127, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7953327894210815, + "rewards/margins": 7.900286674499512, + "rewards/rejected": -8.695619583129883, "step": 5140 }, { - "epoch": 1.3, - "learning_rate": 3.1448366257841026e-07, - "logits/chosen": -2.6148600578308105, - "logits/rejected": -2.5808730125427246, - "logps/chosen": -188.8590545654297, - "logps/rejected": -271.2063903808594, - "loss": 0.109, + "epoch": 1.24, + "learning_rate": 3.260385095382421e-07, + "logits/chosen": -2.6843762397766113, + "logits/rejected": -2.752167224884033, + "logps/chosen": -183.01651000976562, + "logps/rejected": -245.5830078125, + "loss": 0.0938, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.014142001047730446, - "rewards/margins": 6.740876197814941, - "rewards/rejected": -6.726733207702637, + "rewards/chosen": 0.07583768665790558, + "rewards/margins": 5.993206977844238, + "rewards/rejected": -5.9173688888549805, "step": 5150 }, { - "epoch": 1.3, - "learning_rate": 3.1401554161595354e-07, - "logits/chosen": -2.544335126876831, - "logits/rejected": -2.4183132648468018, - "logps/chosen": -299.56256103515625, - "logps/rejected": -318.59283447265625, - "loss": 0.094, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.3409278392791748, - "rewards/margins": 7.766920566558838, - "rewards/rejected": -7.4259934425354, + "epoch": 1.24, + "learning_rate": 3.255927972900695e-07, + "logits/chosen": -2.7239246368408203, + "logits/rejected": -2.6669764518737793, + "logps/chosen": -339.6507263183594, + "logps/rejected": -392.0833435058594, + "loss": 0.0696, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4986976683139801, + "rewards/margins": 8.557465553283691, + "rewards/rejected": -8.058767318725586, "step": 5160 }, { - "epoch": 1.31, - "learning_rate": 3.135474206534969e-07, - "logits/chosen": -2.4106175899505615, - "logits/rejected": -2.454444646835327, - "logps/chosen": -252.49008178710938, - "logps/rejected": -335.98138427734375, - "loss": 0.318, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3366869390010834, - "rewards/margins": 6.528241157531738, - "rewards/rejected": -6.864927768707275, + "epoch": 1.24, + "learning_rate": 3.2514708504189693e-07, + "logits/chosen": -2.492598295211792, + "logits/rejected": -2.439323902130127, + "logps/chosen": -304.1005554199219, + "logps/rejected": -345.717529296875, + "loss": 0.071, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0371806621551514, + "rewards/margins": 7.382077693939209, + "rewards/rejected": -8.419259071350098, "step": 5170 }, { - "epoch": 1.31, - "learning_rate": 3.130792996910401e-07, - "logits/chosen": -2.7149930000305176, - "logits/rejected": -2.665344715118408, - "logps/chosen": -232.3904266357422, - "logps/rejected": -329.61187744140625, - "loss": 0.1881, + "epoch": 1.25, + "learning_rate": 3.2470137279372434e-07, + "logits/chosen": -2.719870090484619, + "logits/rejected": -2.6928839683532715, + "logps/chosen": -266.06756591796875, + "logps/rejected": -353.38165283203125, + "loss": 0.0917, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.6869542002677917, - "rewards/margins": 6.880588531494141, - "rewards/rejected": -7.567543029785156, + "rewards/chosen": -0.0018859386909753084, + "rewards/margins": 7.535738468170166, + "rewards/rejected": -7.537625312805176, "step": 5180 }, { - "epoch": 1.31, - "learning_rate": 3.1261117872858347e-07, - "logits/chosen": -2.739945411682129, - "logits/rejected": -2.588160276412964, - "logps/chosen": -382.86700439453125, - "logps/rejected": -346.7525329589844, - "loss": 0.0714, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6857783794403076, - "rewards/margins": 7.36517333984375, - "rewards/rejected": -8.05095100402832, + "epoch": 1.25, + "learning_rate": 3.2425566054555174e-07, + "logits/chosen": -2.6234517097473145, + "logits/rejected": -2.6319172382354736, + "logps/chosen": -192.5091094970703, + "logps/rejected": -255.5124053955078, + "loss": 0.1261, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.1070797443389893, + "rewards/margins": 6.69888162612915, + "rewards/rejected": -8.805960655212402, "step": 5190 }, { - "epoch": 1.31, - "learning_rate": 3.1214305776612676e-07, - "logits/chosen": -2.639806032180786, - "logits/rejected": -2.6764893531799316, - "logps/chosen": -253.75650024414062, - "logps/rejected": -262.29376220703125, - "loss": 0.1138, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3462933897972107, - "rewards/margins": 5.708548069000244, - "rewards/rejected": -6.054841041564941, + "epoch": 1.25, + "learning_rate": 3.238099482973792e-07, + "logits/chosen": -2.7816214561462402, + "logits/rejected": -2.6054234504699707, + "logps/chosen": -312.97784423828125, + "logps/rejected": -329.66259765625, + "loss": 0.2324, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.3320107460021973, + "rewards/margins": 5.9293317794799805, + "rewards/rejected": -8.261343002319336, "step": 5200 }, { - "epoch": 1.32, - "learning_rate": 3.116749368036701e-07, - "logits/chosen": -2.5353024005889893, - "logits/rejected": -2.475508689880371, - "logps/chosen": -313.263916015625, - "logps/rejected": -308.0852355957031, - "loss": 0.1445, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.26782792806625366, - "rewards/margins": 8.699701309204102, - "rewards/rejected": -8.431873321533203, + "epoch": 1.25, + "eval_logits/chosen": -2.4736592769622803, + "eval_logits/rejected": -2.4382150173187256, + "eval_logps/chosen": -245.61148071289062, + "eval_logps/rejected": -261.0058288574219, + "eval_loss": 0.5523704886436462, + "eval_rewards/accuracies": 0.6725000143051147, + "eval_rewards/chosen": -4.965047359466553, + "eval_rewards/margins": 2.696227788925171, + "eval_rewards/rejected": -7.661274433135986, + "eval_runtime": 132.225, + "eval_samples_per_second": 23.868, + "eval_steps_per_second": 0.378, + "step": 5200 + }, + { + "epoch": 1.25, + "learning_rate": 3.233642360492066e-07, + "logits/chosen": -2.759204387664795, + "logits/rejected": -2.553602695465088, + "logps/chosen": -259.856201171875, + "logps/rejected": -370.1678771972656, + "loss": 0.0887, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8292741775512695, + "rewards/margins": 8.80259895324707, + "rewards/rejected": -9.631872177124023, "step": 5210 }, { - "epoch": 1.32, - "learning_rate": 3.1120681584121333e-07, - "logits/chosen": -2.51269268989563, - "logits/rejected": -2.4065051078796387, - "logps/chosen": -364.6129455566406, - "logps/rejected": -401.416748046875, - "loss": 0.083, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7194173336029053, - "rewards/margins": 9.384866714477539, - "rewards/rejected": -8.665449142456055, + "epoch": 1.26, + "learning_rate": 3.22918523801034e-07, + "logits/chosen": -2.7774031162261963, + "logits/rejected": -2.6909408569335938, + "logps/chosen": -372.52801513671875, + "logps/rejected": -337.82861328125, + "loss": 0.0549, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1395309716463089, + "rewards/margins": 8.282355308532715, + "rewards/rejected": -8.142824172973633, "step": 5220 }, { - "epoch": 1.32, - "learning_rate": 3.107386948787566e-07, - "logits/chosen": -2.64780855178833, - "logits/rejected": -2.580000400543213, - "logps/chosen": -239.59799194335938, - "logps/rejected": -263.75860595703125, - "loss": 0.0784, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.8628127574920654, - "rewards/margins": 5.6662468910217285, - "rewards/rejected": -7.529058933258057, + "epoch": 1.26, + "learning_rate": 3.2247281155286146e-07, + "logits/chosen": -2.5862934589385986, + "logits/rejected": -2.5209126472473145, + "logps/chosen": -331.7264404296875, + "logps/rejected": -404.93280029296875, + "loss": 0.119, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.02903413772583, + "rewards/margins": 8.425878524780273, + "rewards/rejected": -9.454913139343262, "step": 5230 }, { - "epoch": 1.32, - "learning_rate": 3.1027057391629997e-07, - "logits/chosen": -2.7219760417938232, - "logits/rejected": -2.5608391761779785, - "logps/chosen": -235.74038696289062, - "logps/rejected": -348.9249572753906, - "loss": 0.1029, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.319959819316864, - "rewards/margins": 7.829257011413574, - "rewards/rejected": -8.14921760559082, + "epoch": 1.26, + "learning_rate": 3.2202709930468886e-07, + "logits/chosen": -2.5465495586395264, + "logits/rejected": -2.489522933959961, + "logps/chosen": -321.5809326171875, + "logps/rejected": -520.196533203125, + "loss": 0.1046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11693539470434189, + "rewards/margins": 18.513683319091797, + "rewards/rejected": -18.39674949645996, "step": 5240 }, { - "epoch": 1.33, - "learning_rate": 3.0980245295384325e-07, - "logits/chosen": -2.2276036739349365, - "logits/rejected": -2.187509536743164, - "logps/chosen": -247.76754760742188, - "logps/rejected": -262.46905517578125, - "loss": 0.1006, + "epoch": 1.26, + "learning_rate": 3.2158138705651626e-07, + "logits/chosen": -2.5862770080566406, + "logits/rejected": -2.5724239349365234, + "logps/chosen": -357.66192626953125, + "logps/rejected": -473.635009765625, + "loss": 0.0824, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.9565721750259399, - "rewards/margins": 5.925984859466553, - "rewards/rejected": -6.882556915283203, + "rewards/chosen": 0.12269718945026398, + "rewards/margins": 9.088176727294922, + "rewards/rejected": -8.96548080444336, "step": 5250 }, { - "epoch": 1.33, - "learning_rate": 3.093343319913866e-07, - "logits/chosen": -2.4842872619628906, - "logits/rejected": -2.506471633911133, - "logps/chosen": -232.65634155273438, - "logps/rejected": -322.9957580566406, - "loss": 0.2545, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.0278587341308594, - "rewards/margins": 6.862580299377441, - "rewards/rejected": -7.890439033508301, + "epoch": 1.27, + "learning_rate": 3.211356748083437e-07, + "logits/chosen": -2.6964962482452393, + "logits/rejected": -2.6302378177642822, + "logps/chosen": -321.17633056640625, + "logps/rejected": -261.5325622558594, + "loss": 0.0823, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.456024169921875, + "rewards/margins": 6.357872486114502, + "rewards/rejected": -7.813896179199219, "step": 5260 }, { - "epoch": 1.33, - "learning_rate": 3.0886621102892983e-07, - "logits/chosen": -2.5557355880737305, - "logits/rejected": -2.34470796585083, - "logps/chosen": -269.01129150390625, - "logps/rejected": -328.1927490234375, - "loss": 0.1072, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.42379847168922424, - "rewards/margins": 7.9827117919921875, - "rewards/rejected": -7.558913230895996, + "epoch": 1.27, + "learning_rate": 3.206899625601711e-07, + "logits/chosen": -2.5549778938293457, + "logits/rejected": -2.484894037246704, + "logps/chosen": -213.2004852294922, + "logps/rejected": -336.80706787109375, + "loss": 0.0738, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.1724932193756104, + "rewards/margins": 6.60608434677124, + "rewards/rejected": -8.77857780456543, "step": 5270 }, { - "epoch": 1.33, - "learning_rate": 3.083980900664732e-07, - "logits/chosen": -2.5487945079803467, - "logits/rejected": -2.4814343452453613, - "logps/chosen": -300.04327392578125, - "logps/rejected": -354.25762939453125, - "loss": 0.0879, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.19205832481384277, - "rewards/margins": 7.899884223937988, - "rewards/rejected": -8.09194278717041, + "epoch": 1.27, + "learning_rate": 3.202442503119985e-07, + "logits/chosen": -2.6541330814361572, + "logits/rejected": -2.5694010257720947, + "logps/chosen": -222.997802734375, + "logps/rejected": -273.3302307128906, + "loss": 0.0561, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.224632978439331, + "rewards/margins": 6.345826148986816, + "rewards/rejected": -8.570459365844727, "step": 5280 }, { - "epoch": 1.34, - "learning_rate": 3.0792996910401647e-07, - "logits/chosen": -2.5001492500305176, - "logits/rejected": -2.453411102294922, - "logps/chosen": -245.8662109375, - "logps/rejected": -261.9246520996094, - "loss": 0.0955, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.09629590809345245, - "rewards/margins": 7.062813758850098, - "rewards/rejected": -7.159110069274902, + "epoch": 1.27, + "learning_rate": 3.1979853806382603e-07, + "logits/chosen": -2.6275217533111572, + "logits/rejected": -2.529531478881836, + "logps/chosen": -287.9963684082031, + "logps/rejected": -413.64337158203125, + "loss": 0.0857, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2760472297668457, + "rewards/margins": 6.546228885650635, + "rewards/rejected": -8.82227611541748, "step": 5290 }, { - "epoch": 1.34, - "learning_rate": 3.074618481415598e-07, - "logits/chosen": -2.3594117164611816, - "logits/rejected": -2.2448248863220215, - "logps/chosen": -250.38961791992188, - "logps/rejected": -226.07034301757812, - "loss": 0.0924, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.12966462969779968, - "rewards/margins": 5.99800443649292, - "rewards/rejected": -6.127669334411621, + "epoch": 1.28, + "learning_rate": 3.1935282581565344e-07, + "logits/chosen": -2.668487787246704, + "logits/rejected": -2.6314032077789307, + "logps/chosen": -274.003173828125, + "logps/rejected": -275.71795654296875, + "loss": 0.0867, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4710385799407959, + "rewards/margins": 7.950554847717285, + "rewards/rejected": -8.421592712402344, "step": 5300 }, { - "epoch": 1.34, - "learning_rate": 3.0699372717910304e-07, - "logits/chosen": -2.446681022644043, - "logits/rejected": -2.406707286834717, - "logps/chosen": -267.0587463378906, - "logps/rejected": -302.5667724609375, - "loss": 0.0784, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.03188369423151016, - "rewards/margins": 8.295231819152832, - "rewards/rejected": -8.327116012573242, + "epoch": 1.28, + "eval_logits/chosen": -2.4702041149139404, + "eval_logits/rejected": -2.4366860389709473, + "eval_logps/chosen": -245.5292205810547, + "eval_logps/rejected": -261.1644592285156, + "eval_loss": 0.5449301600456238, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -4.956819534301758, + "eval_rewards/margins": 2.7203195095062256, + "eval_rewards/rejected": -7.6771392822265625, + "eval_runtime": 132.1411, + "eval_samples_per_second": 23.884, + "eval_steps_per_second": 0.378, + "step": 5300 + }, + { + "epoch": 1.28, + "learning_rate": 3.1890711356748084e-07, + "logits/chosen": -2.5286965370178223, + "logits/rejected": -2.57186222076416, + "logps/chosen": -265.4423828125, + "logps/rejected": -285.1353454589844, + "loss": 0.0801, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.47563061118125916, + "rewards/margins": 7.681891441345215, + "rewards/rejected": -8.157522201538086, "step": 5310 }, { - "epoch": 1.34, - "learning_rate": 3.065256062166464e-07, - "logits/chosen": -2.5380733013153076, - "logits/rejected": -2.412759780883789, - "logps/chosen": -228.8946990966797, - "logps/rejected": -260.20880126953125, - "loss": 0.1016, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5675271153450012, - "rewards/margins": 6.607524871826172, - "rewards/rejected": -7.175052642822266, + "epoch": 1.28, + "learning_rate": 3.1846140131930824e-07, + "logits/chosen": -2.568870782852173, + "logits/rejected": -2.496760606765747, + "logps/chosen": -280.03533935546875, + "logps/rejected": -282.7202453613281, + "loss": 0.1222, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6181378364562988, + "rewards/margins": 6.926832675933838, + "rewards/rejected": -7.544970512390137, "step": 5320 }, { - "epoch": 1.35, - "learning_rate": 3.060574852541897e-07, - "logits/chosen": -2.536606788635254, - "logits/rejected": -2.5423636436462402, - "logps/chosen": -238.0750732421875, - "logps/rejected": -361.2891845703125, - "loss": 0.0686, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.8750554919242859, - "rewards/margins": 6.947751522064209, - "rewards/rejected": -7.822807312011719, + "epoch": 1.28, + "learning_rate": 3.180156890711357e-07, + "logits/chosen": -2.582235813140869, + "logits/rejected": -2.338684558868408, + "logps/chosen": -262.03472900390625, + "logps/rejected": -258.9203796386719, + "loss": 0.0751, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.010696632787585258, + "rewards/margins": 7.7717413902282715, + "rewards/rejected": -7.761044979095459, "step": 5330 }, { - "epoch": 1.35, - "learning_rate": 3.0558936429173296e-07, - "logits/chosen": -2.3694663047790527, - "logits/rejected": -2.361489772796631, - "logps/chosen": -176.83859252929688, - "logps/rejected": -229.7638397216797, - "loss": 0.104, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.37298643589019775, - "rewards/margins": 4.905221939086914, - "rewards/rejected": -5.278207778930664, + "epoch": 1.29, + "learning_rate": 3.175699768229631e-07, + "logits/chosen": -2.5130810737609863, + "logits/rejected": -2.5166637897491455, + "logps/chosen": -265.3890075683594, + "logps/rejected": -319.9589538574219, + "loss": 0.0603, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2969924807548523, + "rewards/margins": 8.435912132263184, + "rewards/rejected": -8.732906341552734, "step": 5340 }, { - "epoch": 1.35, - "learning_rate": 3.0512124332927625e-07, - "logits/chosen": -2.567654609680176, - "logits/rejected": -2.5228219032287598, - "logps/chosen": -383.1378173828125, - "logps/rejected": -327.1722106933594, - "loss": 0.0697, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.13435129821300507, - "rewards/margins": 7.013735294342041, - "rewards/rejected": -6.8793840408325195, + "epoch": 1.29, + "learning_rate": 3.171242645747905e-07, + "logits/chosen": -2.453415632247925, + "logits/rejected": -2.4161128997802734, + "logps/chosen": -253.13021850585938, + "logps/rejected": -268.65887451171875, + "loss": 0.1987, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.40642133355140686, + "rewards/margins": 8.008938789367676, + "rewards/rejected": -8.415359497070312, "step": 5350 }, { - "epoch": 1.35, - "learning_rate": 3.0465312236681954e-07, - "logits/chosen": -2.4223361015319824, - "logits/rejected": -2.3703174591064453, - "logps/chosen": -241.94107055664062, - "logps/rejected": -277.2035827636719, - "loss": 0.0595, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.31000375747680664, - "rewards/margins": 7.710813999176025, - "rewards/rejected": -8.020816802978516, + "epoch": 1.29, + "learning_rate": 3.1667855232661796e-07, + "logits/chosen": -2.744293212890625, + "logits/rejected": -2.6942994594573975, + "logps/chosen": -288.07489013671875, + "logps/rejected": -408.0615539550781, + "loss": 0.1225, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.06573915481567383, + "rewards/margins": 9.440717697143555, + "rewards/rejected": -9.50645637512207, "step": 5360 }, { - "epoch": 1.36, - "learning_rate": 3.041850014043629e-07, - "logits/chosen": -2.4581074714660645, - "logits/rejected": -2.3692710399627686, - "logps/chosen": -312.61236572265625, - "logps/rejected": -466.7472229003906, - "loss": 0.0994, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3494400978088379, - "rewards/margins": 9.886107444763184, - "rewards/rejected": -10.23554801940918, + "epoch": 1.29, + "learning_rate": 3.1623284007844536e-07, + "logits/chosen": -2.7829785346984863, + "logits/rejected": -2.696958303451538, + "logps/chosen": -244.95263671875, + "logps/rejected": -251.4464569091797, + "loss": 0.0865, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22969529032707214, + "rewards/margins": 5.57951545715332, + "rewards/rejected": -5.809210777282715, "step": 5370 }, { - "epoch": 1.36, - "learning_rate": 3.037168804419062e-07, - "logits/chosen": -2.4162583351135254, - "logits/rejected": -2.347888708114624, - "logps/chosen": -233.33657836914062, - "logps/rejected": -285.4001159667969, - "loss": 0.06, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.32537588477134705, - "rewards/margins": 7.4720563888549805, - "rewards/rejected": -7.797431945800781, + "epoch": 1.29, + "learning_rate": 3.1578712783027276e-07, + "logits/chosen": -2.699413776397705, + "logits/rejected": -2.7575314044952393, + "logps/chosen": -237.43685913085938, + "logps/rejected": -352.1213684082031, + "loss": 0.1011, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9225679636001587, + "rewards/margins": 8.103208541870117, + "rewards/rejected": -9.025776863098145, "step": 5380 }, { - "epoch": 1.36, - "learning_rate": 3.032487594794495e-07, - "logits/chosen": -2.5550405979156494, - "logits/rejected": -2.5950775146484375, - "logps/chosen": -282.7281188964844, - "logps/rejected": -376.10205078125, - "loss": 0.0927, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.0375185012817383, - "rewards/margins": 9.317535400390625, - "rewards/rejected": -8.280016899108887, + "epoch": 1.3, + "learning_rate": 3.153414155821002e-07, + "logits/chosen": -2.8399507999420166, + "logits/rejected": -2.7902252674102783, + "logps/chosen": -262.2862854003906, + "logps/rejected": -350.62933349609375, + "loss": 0.1271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.028121400624513626, + "rewards/margins": 7.400224208831787, + "rewards/rejected": -7.372103214263916, "step": 5390 }, { - "epoch": 1.37, - "learning_rate": 3.0278063851699275e-07, - "logits/chosen": -2.3756508827209473, - "logits/rejected": -2.281033992767334, - "logps/chosen": -216.5684814453125, - "logps/rejected": -313.62030029296875, - "loss": 0.0738, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8860958814620972, - "rewards/margins": 6.265264987945557, - "rewards/rejected": -7.151360511779785, + "epoch": 1.3, + "learning_rate": 3.148957033339276e-07, + "logits/chosen": -2.686304807662964, + "logits/rejected": -2.679694414138794, + "logps/chosen": -281.16754150390625, + "logps/rejected": -356.1285095214844, + "loss": 0.0503, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.8717133402824402, + "rewards/margins": 7.832782745361328, + "rewards/rejected": -8.704496383666992, "step": 5400 }, { - "epoch": 1.37, - "learning_rate": 3.023125175545361e-07, - "logits/chosen": -2.414106845855713, - "logits/rejected": -2.336822509765625, - "logps/chosen": -247.4856414794922, - "logps/rejected": -265.3743591308594, - "loss": 0.1013, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.24329610168933868, - "rewards/margins": 6.677481174468994, - "rewards/rejected": -6.920777320861816, + "epoch": 1.3, + "eval_logits/chosen": -2.455679416656494, + "eval_logits/rejected": -2.423499584197998, + "eval_logps/chosen": -241.64488220214844, + "eval_logps/rejected": -256.252685546875, + "eval_loss": 0.5350882411003113, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -4.568386077880859, + "eval_rewards/margins": 2.617575168609619, + "eval_rewards/rejected": -7.1859612464904785, + "eval_runtime": 132.087, + "eval_samples_per_second": 23.893, + "eval_steps_per_second": 0.379, + "step": 5400 + }, + { + "epoch": 1.3, + "learning_rate": 3.14449991085755e-07, + "logits/chosen": -2.612245798110962, + "logits/rejected": -2.5397238731384277, + "logps/chosen": -229.0033416748047, + "logps/rejected": -317.2799377441406, + "loss": 0.0783, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.07087705284357071, + "rewards/margins": 9.942754745483398, + "rewards/rejected": -10.013631820678711, "step": 5410 }, { - "epoch": 1.37, - "learning_rate": 3.018443965920794e-07, - "logits/chosen": -2.40796160697937, - "logits/rejected": -2.5906550884246826, - "logps/chosen": -311.5230407714844, - "logps/rejected": -433.317626953125, - "loss": 0.0861, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6146554946899414, - "rewards/margins": 8.28280258178711, - "rewards/rejected": -8.897459030151367, + "epoch": 1.3, + "learning_rate": 3.140042788375825e-07, + "logits/chosen": -2.6079039573669434, + "logits/rejected": -2.54746150970459, + "logps/chosen": -193.78372192382812, + "logps/rejected": -259.4725036621094, + "loss": 0.0983, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9956803321838379, + "rewards/margins": 8.180231094360352, + "rewards/rejected": -9.175910949707031, "step": 5420 }, { - "epoch": 1.37, - "learning_rate": 3.013762756296227e-07, - "logits/chosen": -2.696668863296509, - "logits/rejected": -2.546956777572632, - "logps/chosen": -251.2556915283203, - "logps/rejected": -289.1623840332031, - "loss": 0.0825, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.5907089710235596, - "rewards/margins": 6.483929634094238, - "rewards/rejected": -8.074637413024902, + "epoch": 1.31, + "learning_rate": 3.135585665894099e-07, + "logits/chosen": -2.7893662452697754, + "logits/rejected": -2.809215545654297, + "logps/chosen": -274.98675537109375, + "logps/rejected": -303.9874572753906, + "loss": 0.1241, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7706942558288574, + "rewards/margins": 7.017721652984619, + "rewards/rejected": -7.788415431976318, "step": 5430 }, { - "epoch": 1.38, - "learning_rate": 3.0090815466716596e-07, - "logits/chosen": -2.555224895477295, - "logits/rejected": -2.459705114364624, - "logps/chosen": -312.50482177734375, - "logps/rejected": -346.53387451171875, - "loss": 0.0872, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.4651520252227783, - "rewards/margins": 7.669609069824219, - "rewards/rejected": -8.134759902954102, + "epoch": 1.31, + "learning_rate": 3.131128543412373e-07, + "logits/chosen": -2.6171457767486572, + "logits/rejected": -2.447368621826172, + "logps/chosen": -277.42864990234375, + "logps/rejected": -272.22015380859375, + "loss": 0.0931, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2212035655975342, + "rewards/margins": 7.682473659515381, + "rewards/rejected": -8.903676986694336, "step": 5440 }, { - "epoch": 1.38, - "learning_rate": 3.0044003370470925e-07, - "logits/chosen": -2.4516243934631348, - "logits/rejected": -2.2774457931518555, - "logps/chosen": -330.15582275390625, - "logps/rejected": -310.63946533203125, - "loss": 0.0741, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.050064899027347565, - "rewards/margins": 6.6439690589904785, - "rewards/rejected": -6.5939040184021, + "epoch": 1.31, + "learning_rate": 3.1266714209306474e-07, + "logits/chosen": -2.7194457054138184, + "logits/rejected": -2.4638209342956543, + "logps/chosen": -228.48623657226562, + "logps/rejected": -291.41253662109375, + "loss": 0.0483, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1238467693328857, + "rewards/margins": 7.38750696182251, + "rewards/rejected": -8.5113525390625, "step": 5450 }, { - "epoch": 1.38, - "learning_rate": 2.999719127422526e-07, - "logits/chosen": -2.441774606704712, - "logits/rejected": -2.4072556495666504, - "logps/chosen": -236.97335815429688, - "logps/rejected": -306.00140380859375, - "loss": 0.1016, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.8134746551513672, - "rewards/margins": 6.758217811584473, - "rewards/rejected": -8.571691513061523, + "epoch": 1.31, + "learning_rate": 3.1222142984489215e-07, + "logits/chosen": -2.669907331466675, + "logits/rejected": -2.5879030227661133, + "logps/chosen": -300.0238342285156, + "logps/rejected": -306.6994323730469, + "loss": 0.1473, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9195632934570312, + "rewards/margins": 6.539907932281494, + "rewards/rejected": -8.459470748901367, "step": 5460 }, { - "epoch": 1.38, - "learning_rate": 2.995037917797959e-07, - "logits/chosen": -2.4176416397094727, - "logits/rejected": -2.3949923515319824, - "logps/chosen": -280.667236328125, - "logps/rejected": -353.26507568359375, - "loss": 0.0999, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.2733783721923828, - "rewards/margins": 7.296590328216553, - "rewards/rejected": -7.569968223571777, + "epoch": 1.32, + "learning_rate": 3.1177571759671955e-07, + "logits/chosen": -2.885690450668335, + "logits/rejected": -2.807842493057251, + "logps/chosen": -379.2716064453125, + "logps/rejected": -329.15155029296875, + "loss": 0.0691, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5091997385025024, + "rewards/margins": 7.824720859527588, + "rewards/rejected": -7.315522193908691, "step": 5470 }, { - "epoch": 1.39, - "learning_rate": 2.9903567081733923e-07, - "logits/chosen": -2.6557836532592773, - "logits/rejected": -2.504343032836914, - "logps/chosen": -311.24371337890625, - "logps/rejected": -369.2570495605469, - "loss": 0.1336, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.3242943286895752, - "rewards/margins": 7.029200077056885, - "rewards/rejected": -8.353494644165039, + "epoch": 1.32, + "learning_rate": 3.1133000534854695e-07, + "logits/chosen": -2.643221378326416, + "logits/rejected": -2.5729973316192627, + "logps/chosen": -322.3549499511719, + "logps/rejected": -378.7335205078125, + "loss": 0.0857, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9613577127456665, + "rewards/margins": 6.077725887298584, + "rewards/rejected": -8.039083480834961, "step": 5480 }, { - "epoch": 1.39, - "learning_rate": 2.9856754985488246e-07, - "logits/chosen": -2.8352675437927246, - "logits/rejected": -2.565420627593994, - "logps/chosen": -266.55560302734375, - "logps/rejected": -273.0650329589844, - "loss": 0.0972, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.7270272970199585, - "rewards/margins": 7.272873878479004, - "rewards/rejected": -6.545845985412598, + "epoch": 1.32, + "learning_rate": 3.108842931003744e-07, + "logits/chosen": -2.5762481689453125, + "logits/rejected": -2.712843894958496, + "logps/chosen": -256.5275573730469, + "logps/rejected": -339.96551513671875, + "loss": 0.1337, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8788062930107117, + "rewards/margins": 5.873172760009766, + "rewards/rejected": -6.751979827880859, "step": 5490 }, { - "epoch": 1.39, - "learning_rate": 2.980994288924258e-07, - "logits/chosen": -2.438821792602539, - "logits/rejected": -2.379103660583496, - "logps/chosen": -238.837158203125, - "logps/rejected": -324.5002136230469, - "loss": 0.0805, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3304581642150879, - "rewards/margins": 7.2548418045043945, - "rewards/rejected": -7.585301399230957, + "epoch": 1.32, + "learning_rate": 3.104385808522018e-07, + "logits/chosen": -2.5719332695007324, + "logits/rejected": -2.4445595741271973, + "logps/chosen": -305.4701232910156, + "logps/rejected": -286.2919616699219, + "loss": 0.0977, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7869125604629517, + "rewards/margins": 6.059802055358887, + "rewards/rejected": -6.846714973449707, "step": 5500 }, { - "epoch": 1.39, - "learning_rate": 2.976313079299691e-07, - "logits/chosen": -2.37760066986084, - "logits/rejected": -2.4221079349517822, - "logps/chosen": -189.891357421875, - "logps/rejected": -291.51153564453125, - "loss": 0.1455, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.0570133924484253, - "rewards/margins": 6.477774143218994, - "rewards/rejected": -7.534787178039551, + "epoch": 1.32, + "eval_logits/chosen": -2.561383008956909, + "eval_logits/rejected": -2.5311341285705566, + "eval_logps/chosen": -241.5596923828125, + "eval_logps/rejected": -255.70962524414062, + "eval_loss": 0.5430763363838196, + "eval_rewards/accuracies": 0.6549999713897705, + "eval_rewards/chosen": -4.559868335723877, + "eval_rewards/margins": 2.571784019470215, + "eval_rewards/rejected": -7.13165283203125, + "eval_runtime": 132.2428, + "eval_samples_per_second": 23.865, + "eval_steps_per_second": 0.378, + "step": 5500 + }, + { + "epoch": 1.33, + "learning_rate": 3.099928686040292e-07, + "logits/chosen": -2.3726134300231934, + "logits/rejected": -2.3346734046936035, + "logps/chosen": -147.59933471679688, + "logps/rejected": -225.5486297607422, + "loss": 0.0924, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.05254455655813217, + "rewards/margins": 7.699334144592285, + "rewards/rejected": -7.64678955078125, "step": 5510 }, { - "epoch": 1.4, - "learning_rate": 2.9716318696751244e-07, - "logits/chosen": -2.6192290782928467, - "logits/rejected": -2.759930372238159, - "logps/chosen": -193.07093811035156, - "logps/rejected": -374.53155517578125, - "loss": 0.1248, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.1276962757110596, - "rewards/margins": 6.811429500579834, - "rewards/rejected": -7.939126014709473, + "epoch": 1.33, + "learning_rate": 3.0954715635585667e-07, + "logits/chosen": -2.7603375911712646, + "logits/rejected": -2.6560516357421875, + "logps/chosen": -280.23272705078125, + "logps/rejected": -286.752197265625, + "loss": 0.1862, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07860752195119858, + "rewards/margins": 7.284377098083496, + "rewards/rejected": -7.2057695388793945, "step": 5520 }, { - "epoch": 1.4, - "learning_rate": 2.966950660050557e-07, - "logits/chosen": -2.473038911819458, - "logits/rejected": -2.470531940460205, - "logps/chosen": -208.4632568359375, - "logps/rejected": -346.54901123046875, - "loss": 0.0749, + "epoch": 1.33, + "learning_rate": 3.0910144410768407e-07, + "logits/chosen": -2.6072757244110107, + "logits/rejected": -2.6516242027282715, + "logps/chosen": -268.14251708984375, + "logps/rejected": -367.84649658203125, + "loss": 0.0587, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.7250261306762695, - "rewards/margins": 6.926937103271484, - "rewards/rejected": -7.651963233947754, + "rewards/chosen": -2.8313350677490234, + "rewards/margins": 6.771770477294922, + "rewards/rejected": -9.603106498718262, "step": 5530 }, { - "epoch": 1.4, - "learning_rate": 2.9622694504259896e-07, - "logits/chosen": -2.854395866394043, - "logits/rejected": -2.8448264598846436, - "logps/chosen": -313.43011474609375, - "logps/rejected": -380.42047119140625, - "loss": 0.0608, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4461665749549866, - "rewards/margins": 7.274533271789551, - "rewards/rejected": -6.828366279602051, + "epoch": 1.33, + "learning_rate": 3.086557318595115e-07, + "logits/chosen": -2.6141247749328613, + "logits/rejected": -2.581433057785034, + "logps/chosen": -338.5303039550781, + "logps/rejected": -292.84466552734375, + "loss": 0.0996, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6784435510635376, + "rewards/margins": 6.654521942138672, + "rewards/rejected": -8.332964897155762, "step": 5540 }, { - "epoch": 1.4, - "learning_rate": 2.957588240801423e-07, - "logits/chosen": -2.5442984104156494, - "logits/rejected": -2.3909056186676025, - "logps/chosen": -340.7677001953125, - "logps/rejected": -361.9802551269531, - "loss": 0.0682, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.6735048294067383, - "rewards/margins": 4.611882209777832, - "rewards/rejected": -6.28538703918457, + "epoch": 1.34, + "learning_rate": 3.0821001961133893e-07, + "logits/chosen": -2.7999234199523926, + "logits/rejected": -2.662100315093994, + "logps/chosen": -273.3230895996094, + "logps/rejected": -308.8775634765625, + "loss": 0.1049, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.027463411912322044, + "rewards/margins": 8.653151512145996, + "rewards/rejected": -8.680615425109863, "step": 5550 }, { - "epoch": 1.41, - "learning_rate": 2.952907031176856e-07, - "logits/chosen": -2.2576093673706055, - "logits/rejected": -2.2817511558532715, - "logps/chosen": -182.24478149414062, - "logps/rejected": -332.81268310546875, - "loss": 0.054, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.7802220582962036, - "rewards/margins": 7.97159481048584, - "rewards/rejected": -8.75181770324707, + "epoch": 1.34, + "learning_rate": 3.0776430736316633e-07, + "logits/chosen": -2.587996006011963, + "logits/rejected": -2.480525255203247, + "logps/chosen": -267.45501708984375, + "logps/rejected": -361.61468505859375, + "loss": 0.0747, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7066177129745483, + "rewards/margins": 7.623175621032715, + "rewards/rejected": -9.329792976379395, "step": 5560 }, { - "epoch": 1.41, - "learning_rate": 2.948225821552289e-07, - "logits/chosen": -2.4663827419281006, - "logits/rejected": -2.5319385528564453, - "logps/chosen": -180.53253173828125, - "logps/rejected": -316.0972595214844, - "loss": 0.1364, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.3494766354560852, - "rewards/margins": 9.86876106262207, - "rewards/rejected": -9.51928424835205, + "epoch": 1.34, + "learning_rate": 3.0731859511499374e-07, + "logits/chosen": -2.6902801990509033, + "logits/rejected": -2.582404613494873, + "logps/chosen": -274.99468994140625, + "logps/rejected": -358.1728210449219, + "loss": 0.0779, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026019524782896042, + "rewards/margins": 8.530031204223633, + "rewards/rejected": -8.556051254272461, "step": 5570 }, { - "epoch": 1.41, - "learning_rate": 2.9435446119277217e-07, - "logits/chosen": -2.7857749462127686, - "logits/rejected": -2.664595127105713, - "logps/chosen": -317.0216064453125, - "logps/rejected": -350.7251892089844, - "loss": 0.0648, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3388091027736664, - "rewards/margins": 8.139375686645508, - "rewards/rejected": -8.478184700012207, + "epoch": 1.34, + "learning_rate": 3.068728828668212e-07, + "logits/chosen": -2.7614896297454834, + "logits/rejected": -2.6439006328582764, + "logps/chosen": -289.13677978515625, + "logps/rejected": -322.6458435058594, + "loss": 0.1078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9467754364013672, + "rewards/margins": 7.337257385253906, + "rewards/rejected": -8.284032821655273, "step": 5580 }, { - "epoch": 1.41, - "learning_rate": 2.938863402303155e-07, - "logits/chosen": -2.7515158653259277, - "logits/rejected": -2.567390203475952, - "logps/chosen": -326.83209228515625, - "logps/rejected": -353.8697204589844, - "loss": 0.0821, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.1699625253677368, - "rewards/margins": 7.984126091003418, - "rewards/rejected": -9.154088973999023, + "epoch": 1.35, + "learning_rate": 3.064271706186486e-07, + "logits/chosen": -2.80676007270813, + "logits/rejected": -2.7875328063964844, + "logps/chosen": -320.6369934082031, + "logps/rejected": -391.30462646484375, + "loss": 0.0662, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3034350275993347, + "rewards/margins": 7.06372594833374, + "rewards/rejected": -7.367160797119141, "step": 5590 }, { - "epoch": 1.42, - "learning_rate": 2.934182192678588e-07, - "logits/chosen": -2.676360607147217, - "logits/rejected": -2.38034987449646, - "logps/chosen": -261.83428955078125, - "logps/rejected": -288.66094970703125, - "loss": 0.1106, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.187965750694275, - "rewards/margins": 6.701111793518066, - "rewards/rejected": -7.889077186584473, + "epoch": 1.35, + "learning_rate": 3.05981458370476e-07, + "logits/chosen": -2.5499396324157715, + "logits/rejected": -2.6085877418518066, + "logps/chosen": -288.1993713378906, + "logps/rejected": -293.7450866699219, + "loss": 0.1564, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1891370266675949, + "rewards/margins": 7.340639591217041, + "rewards/rejected": -7.529776096343994, "step": 5600 }, { - "epoch": 1.42, - "learning_rate": 2.9295009830540215e-07, - "logits/chosen": -2.336933135986328, - "logits/rejected": -2.0914289951324463, - "logps/chosen": -319.1845703125, - "logps/rejected": -405.6087341308594, - "loss": 0.0786, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.1506112813949585, - "rewards/margins": 7.663626194000244, - "rewards/rejected": -8.814237594604492, + "epoch": 1.35, + "eval_logits/chosen": -2.397571325302124, + "eval_logits/rejected": -2.3498072624206543, + "eval_logps/chosen": -247.39111328125, + "eval_logps/rejected": -264.9027404785156, + "eval_loss": 0.551169753074646, + "eval_rewards/accuracies": 0.675000011920929, + "eval_rewards/chosen": -5.143011569976807, + "eval_rewards/margins": 2.907952070236206, + "eval_rewards/rejected": -8.050963401794434, + "eval_runtime": 132.1483, + "eval_samples_per_second": 23.882, + "eval_steps_per_second": 0.378, + "step": 5600 + }, + { + "epoch": 1.35, + "learning_rate": 3.0553574612230345e-07, + "logits/chosen": -2.542187213897705, + "logits/rejected": -2.575735569000244, + "logps/chosen": -257.846923828125, + "logps/rejected": -317.1521911621094, + "loss": 0.1911, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3285924792289734, + "rewards/margins": 6.40179443359375, + "rewards/rejected": -6.730387210845947, "step": 5610 }, { - "epoch": 1.42, - "learning_rate": 2.924819773429454e-07, - "logits/chosen": -2.5694477558135986, - "logits/rejected": -2.3845672607421875, - "logps/chosen": -309.91021728515625, - "logps/rejected": -286.1216735839844, - "loss": 0.0766, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.1635410785675049, - "rewards/margins": 6.091832160949707, - "rewards/rejected": -7.255372524261475, + "epoch": 1.35, + "learning_rate": 3.0509003387413086e-07, + "logits/chosen": -2.6285223960876465, + "logits/rejected": -2.5457143783569336, + "logps/chosen": -279.45367431640625, + "logps/rejected": -217.64535522460938, + "loss": 0.1585, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.06619924306869507, + "rewards/margins": 5.675633907318115, + "rewards/rejected": -5.741833686828613, "step": 5620 }, { - "epoch": 1.42, - "learning_rate": 2.9201385638048867e-07, - "logits/chosen": -2.5705244541168213, - "logits/rejected": -2.4809091091156006, - "logps/chosen": -285.5097961425781, - "logps/rejected": -325.88372802734375, - "loss": 0.1075, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.2106363773345947, - "rewards/margins": 10.178508758544922, - "rewards/rejected": -8.967872619628906, + "epoch": 1.35, + "learning_rate": 3.0464432162595826e-07, + "logits/chosen": -2.6819653511047363, + "logits/rejected": -2.5688071250915527, + "logps/chosen": -380.29290771484375, + "logps/rejected": -422.2904357910156, + "loss": 0.2763, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.266469955444336, + "rewards/margins": 10.178750038146973, + "rewards/rejected": -8.91227912902832, "step": 5630 }, { - "epoch": 1.43, - "learning_rate": 2.91545735418032e-07, - "logits/chosen": -2.4400763511657715, - "logits/rejected": -2.473245143890381, - "logps/chosen": -231.61257934570312, - "logps/rejected": -289.244873046875, - "loss": 0.0826, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.42569226026535034, - "rewards/margins": 7.356956481933594, - "rewards/rejected": -7.782649040222168, + "epoch": 1.36, + "learning_rate": 3.0419860937778566e-07, + "logits/chosen": -2.5895729064941406, + "logits/rejected": -2.562610387802124, + "logps/chosen": -298.11761474609375, + "logps/rejected": -442.8126525878906, + "loss": 0.0959, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7508869171142578, + "rewards/margins": 12.820283889770508, + "rewards/rejected": -12.069396018981934, "step": 5640 }, { - "epoch": 1.43, - "learning_rate": 2.910776144555753e-07, - "logits/chosen": -2.447082757949829, - "logits/rejected": -2.4613442420959473, - "logps/chosen": -246.17288208007812, - "logps/rejected": -291.76287841796875, - "loss": 0.0777, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.5513318777084351, - "rewards/margins": 7.071081638336182, - "rewards/rejected": -6.519749641418457, + "epoch": 1.36, + "learning_rate": 3.037528971296131e-07, + "logits/chosen": -2.2984566688537598, + "logits/rejected": -2.454714059829712, + "logps/chosen": -234.3234100341797, + "logps/rejected": -316.863525390625, + "loss": 0.0866, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3430708944797516, + "rewards/margins": 9.358939170837402, + "rewards/rejected": -9.702009201049805, "step": 5650 }, { - "epoch": 1.43, - "learning_rate": 2.906094934931186e-07, - "logits/chosen": -2.573009490966797, - "logits/rejected": -2.3336191177368164, - "logps/chosen": -336.81903076171875, - "logps/rejected": -252.26974487304688, - "loss": 0.0923, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.25829944014549255, - "rewards/margins": 7.9574151039123535, - "rewards/rejected": -7.699115753173828, + "epoch": 1.36, + "learning_rate": 3.033071848814405e-07, + "logits/chosen": -2.540438175201416, + "logits/rejected": -2.4125802516937256, + "logps/chosen": -251.1460418701172, + "logps/rejected": -255.10275268554688, + "loss": 0.1142, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7258079051971436, + "rewards/margins": 6.141571044921875, + "rewards/rejected": -7.867378234863281, "step": 5660 }, { - "epoch": 1.43, - "learning_rate": 2.901413725306619e-07, - "logits/chosen": -2.550891399383545, - "logits/rejected": -2.451150417327881, - "logps/chosen": -295.5380554199219, - "logps/rejected": -313.8006286621094, - "loss": 0.0791, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.783888816833496, - "rewards/margins": 5.338407516479492, - "rewards/rejected": -7.1222968101501465, + "epoch": 1.36, + "learning_rate": 3.028614726332679e-07, + "logits/chosen": -2.2878785133361816, + "logits/rejected": -2.3755807876586914, + "logps/chosen": -223.54153442382812, + "logps/rejected": -386.21405029296875, + "loss": 0.0967, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7180954813957214, + "rewards/margins": 7.181421756744385, + "rewards/rejected": -7.899518013000488, "step": 5670 }, { - "epoch": 1.44, - "learning_rate": 2.896732515682052e-07, - "logits/chosen": -2.3952527046203613, - "logits/rejected": -2.389531373977661, - "logps/chosen": -204.33877563476562, - "logps/rejected": -264.1290588378906, - "loss": 0.1216, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.787590742111206, - "rewards/margins": 7.53562068939209, - "rewards/rejected": -6.748030185699463, + "epoch": 1.37, + "learning_rate": 3.024157603850954e-07, + "logits/chosen": -2.38838267326355, + "logits/rejected": -2.3276867866516113, + "logps/chosen": -308.7379150390625, + "logps/rejected": -425.5436096191406, + "loss": 0.0905, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1380276679992676, + "rewards/margins": 7.7468156814575195, + "rewards/rejected": -9.884842872619629, "step": 5680 }, { - "epoch": 1.44, - "learning_rate": 2.892051306057485e-07, - "logits/chosen": -2.5032591819763184, - "logits/rejected": -2.518265724182129, - "logps/chosen": -296.32977294921875, - "logps/rejected": -414.72564697265625, - "loss": 0.0841, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.2691758871078491, - "rewards/margins": 9.437515258789062, - "rewards/rejected": -8.168340682983398, + "epoch": 1.37, + "learning_rate": 3.019700481369228e-07, + "logits/chosen": -2.411196231842041, + "logits/rejected": -2.3630738258361816, + "logps/chosen": -226.0814666748047, + "logps/rejected": -266.2834167480469, + "loss": 0.0796, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.739190697669983, + "rewards/margins": 7.474464416503906, + "rewards/rejected": -9.213655471801758, "step": 5690 }, { - "epoch": 1.44, - "learning_rate": 2.8873700964329186e-07, - "logits/chosen": -2.558290481567383, - "logits/rejected": -2.5007545948028564, - "logps/chosen": -289.505859375, - "logps/rejected": -322.5883483886719, - "loss": 0.1034, + "epoch": 1.37, + "learning_rate": 3.015243358887502e-07, + "logits/chosen": -2.3483874797821045, + "logits/rejected": -2.293850898742676, + "logps/chosen": -300.916259765625, + "logps/rejected": -269.19439697265625, + "loss": 0.0967, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.3201775848865509, - "rewards/margins": 7.209942817687988, - "rewards/rejected": -6.889764308929443, + "rewards/chosen": 0.0681561678647995, + "rewards/margins": 8.878449440002441, + "rewards/rejected": -8.810293197631836, "step": 5700 }, { - "epoch": 1.44, - "learning_rate": 2.882688886808351e-07, - "logits/chosen": -2.454603672027588, - "logits/rejected": -2.4256834983825684, - "logps/chosen": -193.55093383789062, - "logps/rejected": -257.0700988769531, - "loss": 0.1765, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.116165041923523, - "rewards/margins": 7.95557165145874, - "rewards/rejected": -6.839406490325928, + "epoch": 1.37, + "eval_logits/chosen": -2.2630622386932373, + "eval_logits/rejected": -2.2110085487365723, + "eval_logps/chosen": -241.03347778320312, + "eval_logps/rejected": -258.89892578125, + "eval_loss": 0.5519627928733826, + "eval_rewards/accuracies": 0.675000011920929, + "eval_rewards/chosen": -4.507246494293213, + "eval_rewards/margins": 2.9433352947235107, + "eval_rewards/rejected": -7.450582504272461, + "eval_runtime": 132.1508, + "eval_samples_per_second": 23.882, + "eval_steps_per_second": 0.378, + "step": 5700 + }, + { + "epoch": 1.37, + "learning_rate": 3.0107862364057764e-07, + "logits/chosen": -2.312290668487549, + "logits/rejected": -2.155432939529419, + "logps/chosen": -147.79873657226562, + "logps/rejected": -259.18792724609375, + "loss": 0.1393, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6257935166358948, + "rewards/margins": 7.950065612792969, + "rewards/rejected": -8.575858116149902, "step": 5710 }, { - "epoch": 1.45, - "learning_rate": 2.8780076771837844e-07, - "logits/chosen": -2.575314521789551, - "logits/rejected": -2.5462193489074707, - "logps/chosen": -220.2628936767578, - "logps/rejected": -283.34088134765625, - "loss": 0.0921, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6230512857437134, - "rewards/margins": 6.939806938171387, - "rewards/rejected": -7.562857627868652, + "epoch": 1.38, + "learning_rate": 3.0063291139240504e-07, + "logits/chosen": -2.683090925216675, + "logits/rejected": -2.5158116817474365, + "logps/chosen": -265.20050048828125, + "logps/rejected": -321.0158996582031, + "loss": 0.1168, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9063001871109009, + "rewards/margins": 7.655638694763184, + "rewards/rejected": -9.561939239501953, "step": 5720 }, { - "epoch": 1.45, - "learning_rate": 2.873326467559217e-07, - "logits/chosen": -2.588092803955078, - "logits/rejected": -2.4266304969787598, - "logps/chosen": -218.74368286132812, - "logps/rejected": -260.84228515625, - "loss": 0.1046, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.0295507907867432, - "rewards/margins": 6.963271141052246, - "rewards/rejected": -5.933720588684082, + "epoch": 1.38, + "learning_rate": 3.0018719914423245e-07, + "logits/chosen": -2.5897376537323, + "logits/rejected": -2.646237373352051, + "logps/chosen": -273.7349853515625, + "logps/rejected": -356.7886962890625, + "loss": 0.0613, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.8004722595214844, + "rewards/margins": 8.781227111816406, + "rewards/rejected": -11.58169937133789, "step": 5730 }, { - "epoch": 1.45, - "learning_rate": 2.86864525793465e-07, - "logits/chosen": -2.608156681060791, - "logits/rejected": -2.5254130363464355, - "logps/chosen": -178.9147491455078, - "logps/rejected": -278.29290771484375, - "loss": 0.0738, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.29764580726623535, - "rewards/margins": 5.203714370727539, - "rewards/rejected": -5.5013604164123535, + "epoch": 1.38, + "learning_rate": 2.997414868960599e-07, + "logits/chosen": -2.482520818710327, + "logits/rejected": -2.4815404415130615, + "logps/chosen": -201.9820556640625, + "logps/rejected": -350.8991394042969, + "loss": 0.1306, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.7341091632843018, + "rewards/margins": 7.285149574279785, + "rewards/rejected": -10.019259452819824, "step": 5740 }, { - "epoch": 1.45, - "learning_rate": 2.863964048310083e-07, - "logits/chosen": -2.6046547889709473, - "logits/rejected": -2.3902244567871094, - "logps/chosen": -258.7455749511719, - "logps/rejected": -235.589599609375, - "loss": 0.0817, + "epoch": 1.38, + "learning_rate": 2.992957746478873e-07, + "logits/chosen": -2.494835376739502, + "logits/rejected": -2.471322774887085, + "logps/chosen": -174.05007934570312, + "logps/rejected": -320.26824951171875, + "loss": 0.0879, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.6193943023681641, - "rewards/margins": 5.428850173950195, - "rewards/rejected": -6.048244476318359, + "rewards/chosen": -2.707693338394165, + "rewards/margins": 7.08135986328125, + "rewards/rejected": -9.789053916931152, "step": 5750 }, { - "epoch": 1.46, - "learning_rate": 2.859282838685516e-07, - "logits/chosen": -2.4807581901550293, - "logits/rejected": -2.586901903152466, - "logps/chosen": -215.5895538330078, - "logps/rejected": -336.99664306640625, - "loss": 0.1096, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.0720269680023193, - "rewards/margins": 5.663225173950195, - "rewards/rejected": -6.735252380371094, + "epoch": 1.39, + "learning_rate": 2.988500623997147e-07, + "logits/chosen": -2.54711651802063, + "logits/rejected": -2.461714744567871, + "logps/chosen": -217.9943389892578, + "logps/rejected": -281.04205322265625, + "loss": 0.1065, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7643048763275146, + "rewards/margins": 6.749370574951172, + "rewards/rejected": -8.513675689697266, "step": 5760 }, { - "epoch": 1.46, - "learning_rate": 2.8546016290609494e-07, - "logits/chosen": -2.515242338180542, - "logits/rejected": -2.443443775177002, - "logps/chosen": -224.4276885986328, - "logps/rejected": -299.26336669921875, - "loss": 0.1309, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.5686905980110168, - "rewards/margins": 9.281776428222656, - "rewards/rejected": -9.850465774536133, + "epoch": 1.39, + "learning_rate": 2.9840435015154216e-07, + "logits/chosen": -2.3755505084991455, + "logits/rejected": -2.1987833976745605, + "logps/chosen": -260.4344482421875, + "logps/rejected": -268.816162109375, + "loss": 0.1413, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1388332843780518, + "rewards/margins": 7.831198215484619, + "rewards/rejected": -8.970032691955566, "step": 5770 }, { - "epoch": 1.46, - "learning_rate": 2.849920419436382e-07, - "logits/chosen": -2.3371036052703857, - "logits/rejected": -2.2131152153015137, - "logps/chosen": -278.50762939453125, - "logps/rejected": -278.50030517578125, - "loss": 0.0994, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.818460464477539, - "rewards/margins": 5.5670084953308105, - "rewards/rejected": -7.38546895980835, + "epoch": 1.39, + "learning_rate": 2.9795863790336957e-07, + "logits/chosen": -2.0446441173553467, + "logits/rejected": -2.1578776836395264, + "logps/chosen": -213.67953491210938, + "logps/rejected": -307.00396728515625, + "loss": 0.1803, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.52001953125, + "rewards/margins": 6.617905616760254, + "rewards/rejected": -9.13792610168457, "step": 5780 }, { - "epoch": 1.46, - "learning_rate": 2.845239209811815e-07, - "logits/chosen": -2.3276255130767822, - "logits/rejected": -2.200223922729492, - "logps/chosen": -199.0684814453125, - "logps/rejected": -212.3662872314453, - "loss": 0.0566, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.0024425506126135588, - "rewards/margins": 7.235228061676025, - "rewards/rejected": -7.2376708984375, + "epoch": 1.39, + "learning_rate": 2.9751292565519697e-07, + "logits/chosen": -2.689487934112549, + "logits/rejected": -2.5234534740448, + "logps/chosen": -324.85760498046875, + "logps/rejected": -267.6849365234375, + "loss": 0.2388, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6510416269302368, + "rewards/margins": 6.351454734802246, + "rewards/rejected": -8.002496719360352, "step": 5790 }, { - "epoch": 1.47, - "learning_rate": 2.840558000187248e-07, - "logits/chosen": -2.686560869216919, - "logits/rejected": -2.4994020462036133, - "logps/chosen": -344.4832763671875, - "logps/rejected": -400.8691101074219, - "loss": 0.0751, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.36507248878479004, - "rewards/margins": 8.268793106079102, - "rewards/rejected": -7.903720855712891, + "epoch": 1.4, + "learning_rate": 2.9706721340702437e-07, + "logits/chosen": -2.430111885070801, + "logits/rejected": -2.497422695159912, + "logps/chosen": -259.12298583984375, + "logps/rejected": -346.84722900390625, + "loss": 0.2046, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.451986312866211, + "rewards/margins": 9.522740364074707, + "rewards/rejected": -10.974726676940918, "step": 5800 }, { - "epoch": 1.47, - "learning_rate": 2.8358767905626815e-07, - "logits/chosen": -2.262820243835449, - "logits/rejected": -2.2107882499694824, - "logps/chosen": -200.62533569335938, - "logps/rejected": -208.8029022216797, - "loss": 0.1287, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.7861111164093018, - "rewards/margins": 5.80421781539917, - "rewards/rejected": -7.590329170227051, + "epoch": 1.4, + "eval_logits/chosen": -2.267742395401001, + "eval_logits/rejected": -2.215470314025879, + "eval_logps/chosen": -251.28880310058594, + "eval_logps/rejected": -269.7067565917969, + "eval_loss": 0.5587701201438904, + "eval_rewards/accuracies": 0.6800000071525574, + "eval_rewards/chosen": -5.532778263092041, + "eval_rewards/margins": 2.9985909461975098, + "eval_rewards/rejected": -8.53136920928955, + "eval_runtime": 131.9811, + "eval_samples_per_second": 23.913, + "eval_steps_per_second": 0.379, + "step": 5800 + }, + { + "epoch": 1.4, + "learning_rate": 2.9662150115885183e-07, + "logits/chosen": -2.301166534423828, + "logits/rejected": -2.4247069358825684, + "logps/chosen": -295.5283508300781, + "logps/rejected": -367.54217529296875, + "loss": 0.1201, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6161748170852661, + "rewards/margins": 9.013157844543457, + "rewards/rejected": -9.629331588745117, "step": 5810 }, { - "epoch": 1.47, - "learning_rate": 2.8311955809381143e-07, - "logits/chosen": -2.5503885746002197, - "logits/rejected": -2.4902098178863525, - "logps/chosen": -298.7713928222656, - "logps/rejected": -394.04833984375, - "loss": 0.0399, + "epoch": 1.4, + "learning_rate": 2.9617578891067923e-07, + "logits/chosen": -2.5498721599578857, + "logits/rejected": -2.4222207069396973, + "logps/chosen": -256.13702392578125, + "logps/rejected": -323.8200378417969, + "loss": 0.0437, "rewards/accuracies": 1.0, - "rewards/chosen": 0.4204397201538086, - "rewards/margins": 9.799768447875977, - "rewards/rejected": -9.379328727722168, + "rewards/chosen": -1.353124976158142, + "rewards/margins": 8.275461196899414, + "rewards/rejected": -9.62858772277832, "step": 5820 }, { - "epoch": 1.47, - "learning_rate": 2.826514371313548e-07, - "logits/chosen": -2.4589803218841553, - "logits/rejected": -2.4627013206481934, - "logps/chosen": -288.97943115234375, - "logps/rejected": -333.4466857910156, - "loss": 0.0799, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.8154493570327759, - "rewards/margins": 9.827569961547852, - "rewards/rejected": -9.012121200561523, + "epoch": 1.4, + "learning_rate": 2.9573007666250663e-07, + "logits/chosen": -2.4855666160583496, + "logits/rejected": -2.229447603225708, + "logps/chosen": -304.44970703125, + "logps/rejected": -293.0791015625, + "loss": 0.0666, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.2711284160614014, + "rewards/margins": 7.172418117523193, + "rewards/rejected": -10.443546295166016, "step": 5830 }, { - "epoch": 1.48, - "learning_rate": 2.82183316168898e-07, - "logits/chosen": -2.542847156524658, - "logits/rejected": -2.2891502380371094, - "logps/chosen": -265.6089172363281, - "logps/rejected": -256.6798095703125, - "loss": 0.1162, + "epoch": 1.41, + "learning_rate": 2.952843644143341e-07, + "logits/chosen": -2.299452304840088, + "logits/rejected": -2.33542537689209, + "logps/chosen": -373.319091796875, + "logps/rejected": -446.96319580078125, + "loss": 0.1887, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.137566328048706, - "rewards/margins": 7.110358238220215, - "rewards/rejected": -9.2479248046875, + "rewards/chosen": -2.214571714401245, + "rewards/margins": 5.634344577789307, + "rewards/rejected": -7.848916053771973, "step": 5840 }, { - "epoch": 1.48, - "learning_rate": 2.817151952064413e-07, - "logits/chosen": -2.2505059242248535, - "logits/rejected": -2.116539239883423, - "logps/chosen": -225.38626098632812, - "logps/rejected": -257.52081298828125, - "loss": 0.0794, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.6695985794067383, - "rewards/margins": 6.193515300750732, - "rewards/rejected": -7.8631134033203125, + "epoch": 1.41, + "learning_rate": 2.948386521661615e-07, + "logits/chosen": -2.7000741958618164, + "logits/rejected": -2.5803558826446533, + "logps/chosen": -236.0352783203125, + "logps/rejected": -283.18328857421875, + "loss": 0.127, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.0870792865753174, + "rewards/margins": 7.902926445007324, + "rewards/rejected": -10.990006446838379, "step": 5850 }, { - "epoch": 1.48, - "learning_rate": 2.8124707424398465e-07, - "logits/chosen": -2.6086325645446777, - "logits/rejected": -2.5266706943511963, - "logps/chosen": -244.1230010986328, - "logps/rejected": -297.35198974609375, - "loss": 0.0762, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.30047744512557983, - "rewards/margins": 8.088984489440918, - "rewards/rejected": -8.389463424682617, + "epoch": 1.41, + "learning_rate": 2.943929399179889e-07, + "logits/chosen": -2.6893811225891113, + "logits/rejected": -2.657405376434326, + "logps/chosen": -244.1248779296875, + "logps/rejected": -295.976318359375, + "loss": 0.1198, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.7635507583618164, + "rewards/margins": 6.506453514099121, + "rewards/rejected": -9.270003318786621, "step": 5860 }, { - "epoch": 1.48, - "learning_rate": 2.8077895328152793e-07, - "logits/chosen": -2.6594889163970947, - "logits/rejected": -2.5432751178741455, - "logps/chosen": -274.76116943359375, - "logps/rejected": -362.2596130371094, - "loss": 0.0862, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.45065465569496155, - "rewards/margins": 9.219099998474121, - "rewards/rejected": -9.669755935668945, + "epoch": 1.41, + "learning_rate": 2.9394722766981635e-07, + "logits/chosen": -2.4838690757751465, + "logits/rejected": -2.3061392307281494, + "logps/chosen": -326.0255432128906, + "logps/rejected": -323.9358215332031, + "loss": 0.1602, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9008700847625732, + "rewards/margins": 7.302412986755371, + "rewards/rejected": -9.203283309936523, "step": 5870 }, { - "epoch": 1.49, - "learning_rate": 2.803108323190712e-07, - "logits/chosen": -2.459502935409546, - "logits/rejected": -2.391540288925171, - "logps/chosen": -211.6705780029297, - "logps/rejected": -267.565185546875, - "loss": 0.1507, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.5549541115760803, - "rewards/margins": 7.696126461029053, - "rewards/rejected": -8.251081466674805, + "epoch": 1.42, + "learning_rate": 2.9350151542164375e-07, + "logits/chosen": -2.4550797939300537, + "logits/rejected": -2.3240678310394287, + "logps/chosen": -283.52569580078125, + "logps/rejected": -280.5987243652344, + "loss": 0.0935, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0691368579864502, + "rewards/margins": 8.332475662231445, + "rewards/rejected": -9.401610374450684, "step": 5880 }, { - "epoch": 1.49, - "learning_rate": 2.798427113566145e-07, - "logits/chosen": -2.5379414558410645, - "logits/rejected": -2.41731595993042, - "logps/chosen": -215.3525848388672, - "logps/rejected": -307.221435546875, - "loss": 0.128, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.5225763320922852, - "rewards/margins": 6.014660835266113, - "rewards/rejected": -7.537237644195557, + "epoch": 1.42, + "learning_rate": 2.9305580317347116e-07, + "logits/chosen": -2.299201011657715, + "logits/rejected": -2.2921295166015625, + "logps/chosen": -296.1810607910156, + "logps/rejected": -406.615478515625, + "loss": 0.1459, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1589936017990112, + "rewards/margins": 8.78157901763916, + "rewards/rejected": -9.940572738647461, "step": 5890 }, { - "epoch": 1.49, - "learning_rate": 2.7937459039415786e-07, - "logits/chosen": -2.419687032699585, - "logits/rejected": -2.4673666954040527, - "logps/chosen": -203.66171264648438, - "logps/rejected": -299.1502685546875, - "loss": 0.0774, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.26614704728126526, - "rewards/margins": 7.863767147064209, - "rewards/rejected": -7.597620487213135, + "epoch": 1.42, + "learning_rate": 2.926100909252986e-07, + "logits/chosen": -2.5493826866149902, + "logits/rejected": -2.363455057144165, + "logps/chosen": -224.7209930419922, + "logps/rejected": -236.0917205810547, + "loss": 0.0985, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6585172414779663, + "rewards/margins": 6.071114540100098, + "rewards/rejected": -7.729632377624512, "step": 5900 }, { - "epoch": 1.49, - "learning_rate": 2.7890646943170114e-07, - "logits/chosen": -2.6347222328186035, - "logits/rejected": -2.705169439315796, - "logps/chosen": -290.8692626953125, - "logps/rejected": -386.2977600097656, - "loss": 0.1117, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.419967532157898, - "rewards/margins": 9.840895652770996, - "rewards/rejected": -8.420928955078125, + "epoch": 1.42, + "eval_logits/chosen": -2.307695150375366, + "eval_logits/rejected": -2.2606279850006104, + "eval_logps/chosen": -247.87648010253906, + "eval_logps/rejected": -263.8137512207031, + "eval_loss": 0.5429248213768005, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -5.191547870635986, + "eval_rewards/margins": 2.7505156993865967, + "eval_rewards/rejected": -7.94206428527832, + "eval_runtime": 132.2272, + "eval_samples_per_second": 23.868, + "eval_steps_per_second": 0.378, + "step": 5900 + }, + { + "epoch": 1.42, + "learning_rate": 2.92164378677126e-07, + "logits/chosen": -2.5546793937683105, + "logits/rejected": -2.3833582401275635, + "logps/chosen": -369.3504943847656, + "logps/rejected": -296.03070068359375, + "loss": 0.1209, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1729577779769897, + "rewards/margins": 6.624330997467041, + "rewards/rejected": -7.797287940979004, "step": 5910 }, { - "epoch": 1.5, - "learning_rate": 2.784383484692445e-07, - "logits/chosen": -2.620143175125122, - "logits/rejected": -2.575716495513916, - "logps/chosen": -234.12960815429688, - "logps/rejected": -333.3351745605469, - "loss": 0.0795, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.7579079866409302, - "rewards/margins": 6.541896820068359, - "rewards/rejected": -7.299803733825684, + "epoch": 1.42, + "learning_rate": 2.917186664289534e-07, + "logits/chosen": -2.650674343109131, + "logits/rejected": -2.6440083980560303, + "logps/chosen": -305.36651611328125, + "logps/rejected": -360.25555419921875, + "loss": 0.1259, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0498480796813965, + "rewards/margins": 7.585900783538818, + "rewards/rejected": -8.635749816894531, "step": 5920 }, { - "epoch": 1.5, - "learning_rate": 2.779702275067877e-07, - "logits/chosen": -2.6059043407440186, - "logits/rejected": -2.5090622901916504, - "logps/chosen": -333.93701171875, - "logps/rejected": -346.04095458984375, - "loss": 0.0702, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5566359162330627, - "rewards/margins": 8.704923629760742, - "rewards/rejected": -8.148286819458008, + "epoch": 1.43, + "learning_rate": 2.912729541807809e-07, + "logits/chosen": -2.5265231132507324, + "logits/rejected": -2.414250373840332, + "logps/chosen": -298.4662170410156, + "logps/rejected": -292.9589538574219, + "loss": 0.1522, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9175195693969727, + "rewards/margins": 7.501967430114746, + "rewards/rejected": -8.419486999511719, "step": 5930 }, { - "epoch": 1.5, - "learning_rate": 2.77502106544331e-07, - "logits/chosen": -2.5816235542297363, - "logits/rejected": -2.607320547103882, - "logps/chosen": -202.93458557128906, - "logps/rejected": -324.5149230957031, - "loss": 0.0928, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.5764981508255005, - "rewards/margins": 6.660517692565918, - "rewards/rejected": -7.237016201019287, + "epoch": 1.43, + "learning_rate": 2.908272419326083e-07, + "logits/chosen": -2.4942710399627686, + "logits/rejected": -2.4520766735076904, + "logps/chosen": -285.1364440917969, + "logps/rejected": -339.1588439941406, + "loss": 0.1473, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8445135354995728, + "rewards/margins": 6.957829475402832, + "rewards/rejected": -7.802342891693115, "step": 5940 }, { - "epoch": 1.5, - "learning_rate": 2.7703398558187435e-07, - "logits/chosen": -2.6109724044799805, - "logits/rejected": -2.443455219268799, - "logps/chosen": -278.3966064453125, - "logps/rejected": -274.39251708984375, - "loss": 0.1078, + "epoch": 1.43, + "learning_rate": 2.903815296844357e-07, + "logits/chosen": -2.48777437210083, + "logits/rejected": -2.42653226852417, + "logps/chosen": -210.6765899658203, + "logps/rejected": -291.66363525390625, + "loss": 0.0574, "rewards/accuracies": 1.0, - "rewards/chosen": -0.7179809808731079, - "rewards/margins": 6.091689109802246, - "rewards/rejected": -6.809670448303223, + "rewards/chosen": -0.43162697553634644, + "rewards/margins": 6.811394691467285, + "rewards/rejected": -7.2430219650268555, "step": 5950 }, { - "epoch": 1.51, - "learning_rate": 2.7656586461941764e-07, - "logits/chosen": -2.5273938179016113, - "logits/rejected": -2.4414260387420654, - "logps/chosen": -255.90646362304688, - "logps/rejected": -310.47418212890625, - "loss": 0.095, + "epoch": 1.43, + "learning_rate": 2.899358174362631e-07, + "logits/chosen": -2.549114227294922, + "logits/rejected": -2.3700621128082275, + "logps/chosen": -324.35333251953125, + "logps/rejected": -325.03729248046875, + "loss": 0.0743, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.8377459645271301, - "rewards/margins": 8.048016548156738, - "rewards/rejected": -8.885762214660645, + "rewards/chosen": -2.6961145401000977, + "rewards/margins": 6.383924961090088, + "rewards/rejected": -9.080039024353027, "step": 5960 }, { - "epoch": 1.51, - "learning_rate": 2.7609774365696093e-07, - "logits/chosen": -2.461449146270752, - "logits/rejected": -2.3967373371124268, - "logps/chosen": -269.59979248046875, - "logps/rejected": -303.8924560546875, - "loss": 0.0864, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.5161443948745728, - "rewards/margins": 6.825022220611572, - "rewards/rejected": -7.341165065765381, + "epoch": 1.44, + "learning_rate": 2.894901051880906e-07, + "logits/chosen": -2.5293917655944824, + "logits/rejected": -2.535958766937256, + "logps/chosen": -299.3890075683594, + "logps/rejected": -395.7617492675781, + "loss": 0.104, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.045948050916194916, + "rewards/margins": 8.89492416381836, + "rewards/rejected": -8.940872192382812, "step": 5970 }, { - "epoch": 1.51, - "learning_rate": 2.756296226945042e-07, - "logits/chosen": -2.6349711418151855, - "logits/rejected": -2.652991533279419, - "logps/chosen": -228.6748504638672, - "logps/rejected": -292.7082824707031, - "loss": 0.1179, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.43261289596557617, - "rewards/margins": 7.230783939361572, - "rewards/rejected": -7.663397312164307, + "epoch": 1.44, + "learning_rate": 2.89044392939918e-07, + "logits/chosen": -2.3320322036743164, + "logits/rejected": -2.282362461090088, + "logps/chosen": -273.87091064453125, + "logps/rejected": -270.1268005371094, + "loss": 0.0683, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2193177938461304, + "rewards/margins": 7.491827487945557, + "rewards/rejected": -8.711146354675293, "step": 5980 }, { - "epoch": 1.51, - "learning_rate": 2.7516150173204757e-07, - "logits/chosen": -2.4800076484680176, - "logits/rejected": -2.265038013458252, - "logps/chosen": -218.5613555908203, - "logps/rejected": -243.19091796875, - "loss": 0.0493, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.0557198524475098, - "rewards/margins": 5.498206615447998, - "rewards/rejected": -7.55392599105835, + "epoch": 1.44, + "learning_rate": 2.885986806917454e-07, + "logits/chosen": -2.5077028274536133, + "logits/rejected": -2.4737260341644287, + "logps/chosen": -256.36444091796875, + "logps/rejected": -262.4673767089844, + "loss": 0.1564, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.814154863357544, + "rewards/margins": 6.36250114440918, + "rewards/rejected": -8.176656723022461, "step": 5990 }, { - "epoch": 1.52, - "learning_rate": 2.7469338076959085e-07, - "logits/chosen": -2.6435389518737793, - "logits/rejected": -2.5455713272094727, - "logps/chosen": -222.0457763671875, - "logps/rejected": -301.1404113769531, - "loss": 0.1007, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5636796951293945, - "rewards/margins": 7.443596839904785, - "rewards/rejected": -6.879917144775391, + "epoch": 1.44, + "learning_rate": 2.8815296844357285e-07, + "logits/chosen": -2.574638843536377, + "logits/rejected": -2.3348612785339355, + "logps/chosen": -288.05169677734375, + "logps/rejected": -321.83355712890625, + "loss": 0.1398, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0945594310760498, + "rewards/margins": 8.49699592590332, + "rewards/rejected": -9.591554641723633, "step": 6000 }, { - "epoch": 1.52, - "learning_rate": 2.7422525980713414e-07, - "logits/chosen": -2.673046588897705, - "logits/rejected": -2.3299639225006104, - "logps/chosen": -270.10064697265625, - "logps/rejected": -286.19610595703125, - "loss": 0.0637, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.03729300573468208, - "rewards/margins": 8.096498489379883, - "rewards/rejected": -8.13379192352295, + "epoch": 1.44, + "eval_logits/chosen": -2.2809441089630127, + "eval_logits/rejected": -2.2290520668029785, + "eval_logps/chosen": -245.72242736816406, + "eval_logps/rejected": -263.7706298828125, + "eval_loss": 0.5349838137626648, + "eval_rewards/accuracies": 0.6800000071525574, + "eval_rewards/chosen": -4.976140975952148, + "eval_rewards/margins": 2.9616143703460693, + "eval_rewards/rejected": -7.937755107879639, + "eval_runtime": 132.0847, + "eval_samples_per_second": 23.894, + "eval_steps_per_second": 0.379, + "step": 6000 + }, + { + "epoch": 1.45, + "learning_rate": 2.8770725619540026e-07, + "logits/chosen": -2.652148962020874, + "logits/rejected": -2.557687997817993, + "logps/chosen": -311.33233642578125, + "logps/rejected": -296.98492431640625, + "loss": 0.0774, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.25186121463775635, + "rewards/margins": 7.013228416442871, + "rewards/rejected": -7.265089988708496, "step": 6010 }, { - "epoch": 1.52, - "learning_rate": 2.7375713884467743e-07, - "logits/chosen": -2.5294830799102783, - "logits/rejected": -2.4918341636657715, - "logps/chosen": -271.35174560546875, - "logps/rejected": -361.06256103515625, - "loss": 0.1391, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.08345876634120941, - "rewards/margins": 8.526935577392578, - "rewards/rejected": -8.443475723266602, + "epoch": 1.45, + "learning_rate": 2.8726154394722766e-07, + "logits/chosen": -2.3547909259796143, + "logits/rejected": -2.4295361042022705, + "logps/chosen": -312.8769836425781, + "logps/rejected": -392.81549072265625, + "loss": 0.0728, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.254342555999756, + "rewards/margins": 8.663065910339355, + "rewards/rejected": -10.917407989501953, "step": 6020 }, { - "epoch": 1.52, - "learning_rate": 2.732890178822208e-07, - "logits/chosen": -2.491140127182007, - "logits/rejected": -2.444007635116577, - "logps/chosen": -255.1403350830078, - "logps/rejected": -389.2249450683594, - "loss": 0.0816, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6935482025146484, - "rewards/margins": 8.629409790039062, - "rewards/rejected": -9.322957992553711, + "epoch": 1.45, + "learning_rate": 2.868158316990551e-07, + "logits/chosen": -2.290584087371826, + "logits/rejected": -2.39445161819458, + "logps/chosen": -223.8242950439453, + "logps/rejected": -289.13934326171875, + "loss": 0.0759, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2975921034812927, + "rewards/margins": 9.252206802368164, + "rewards/rejected": -9.549798965454102, "step": 6030 }, { - "epoch": 1.53, - "learning_rate": 2.7282089691976406e-07, - "logits/chosen": -2.7475171089172363, - "logits/rejected": -2.7083027362823486, - "logps/chosen": -222.727783203125, - "logps/rejected": -315.22821044921875, - "loss": 0.1316, + "epoch": 1.45, + "learning_rate": 2.863701194508825e-07, + "logits/chosen": -2.5682883262634277, + "logits/rejected": -2.5016331672668457, + "logps/chosen": -297.325439453125, + "logps/rejected": -299.564453125, + "loss": 0.1126, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.23866835236549377, - "rewards/margins": 8.23305892944336, - "rewards/rejected": -7.99439001083374, + "rewards/chosen": -1.114385724067688, + "rewards/margins": 7.214772701263428, + "rewards/rejected": -8.329157829284668, "step": 6040 }, { - "epoch": 1.53, - "learning_rate": 2.7235277595730735e-07, - "logits/chosen": -2.478139877319336, - "logits/rejected": -2.435267925262451, - "logps/chosen": -312.53863525390625, - "logps/rejected": -334.06842041015625, - "loss": 0.1077, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.7183864116668701, - "rewards/margins": 7.949586391448975, - "rewards/rejected": -8.66797161102295, + "epoch": 1.46, + "learning_rate": 2.859244072027099e-07, + "logits/chosen": -2.4681122303009033, + "logits/rejected": -2.4523766040802, + "logps/chosen": -352.3961486816406, + "logps/rejected": -339.02099609375, + "loss": 0.2319, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.5710694789886475, + "rewards/margins": 7.753049373626709, + "rewards/rejected": -10.324119567871094, "step": 6050 }, { - "epoch": 1.53, - "learning_rate": 2.7188465499485064e-07, - "logits/chosen": -2.7338013648986816, - "logits/rejected": -2.608670473098755, - "logps/chosen": -393.45013427734375, - "logps/rejected": -444.36407470703125, - "loss": 0.0509, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.3149901032447815, - "rewards/margins": 9.215995788574219, - "rewards/rejected": -8.901005744934082, + "epoch": 1.46, + "learning_rate": 2.854786949545374e-07, + "logits/chosen": -2.7102081775665283, + "logits/rejected": -2.651808500289917, + "logps/chosen": -392.7288818359375, + "logps/rejected": -388.1726989746094, + "loss": 0.0637, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2489074170589447, + "rewards/margins": 8.206884384155273, + "rewards/rejected": -7.957977294921875, "step": 6060 }, { - "epoch": 1.53, - "learning_rate": 2.7141653403239393e-07, - "logits/chosen": -2.6936416625976562, - "logits/rejected": -2.6677496433258057, - "logps/chosen": -298.617431640625, - "logps/rejected": -334.5692443847656, - "loss": 0.1159, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.571147084236145, - "rewards/margins": 7.877842903137207, - "rewards/rejected": -7.306695461273193, + "epoch": 1.46, + "learning_rate": 2.850329827063648e-07, + "logits/chosen": -2.5755245685577393, + "logits/rejected": -2.5351719856262207, + "logps/chosen": -214.2162628173828, + "logps/rejected": -302.987548828125, + "loss": 0.0949, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1477195024490356, + "rewards/margins": 9.774003982543945, + "rewards/rejected": -10.921723365783691, "step": 6070 }, { - "epoch": 1.54, - "learning_rate": 2.709484130699373e-07, - "logits/chosen": -2.438176393508911, - "logits/rejected": -2.3903422355651855, - "logps/chosen": -298.58282470703125, - "logps/rejected": -266.06695556640625, - "loss": 0.0981, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.18747439980506897, - "rewards/margins": 7.7106194496154785, - "rewards/rejected": -7.523144721984863, + "epoch": 1.46, + "learning_rate": 2.845872704581922e-07, + "logits/chosen": -2.5941195487976074, + "logits/rejected": -2.5147435665130615, + "logps/chosen": -270.2546691894531, + "logps/rejected": -282.0974426269531, + "loss": 0.046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8139473795890808, + "rewards/margins": 7.396145820617676, + "rewards/rejected": -8.210092544555664, "step": 6080 }, { - "epoch": 1.54, - "learning_rate": 2.7048029210748056e-07, - "logits/chosen": -2.6650218963623047, - "logits/rejected": -2.4819769859313965, - "logps/chosen": -331.0292053222656, - "logps/rejected": -271.9941711425781, - "loss": 0.1013, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5644257068634033, - "rewards/margins": 4.562926292419434, - "rewards/rejected": -6.127351760864258, + "epoch": 1.47, + "learning_rate": 2.8414155821001964e-07, + "logits/chosen": -2.6342711448669434, + "logits/rejected": -2.583918809890747, + "logps/chosen": -290.14495849609375, + "logps/rejected": -264.99774169921875, + "loss": 0.1042, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8355423212051392, + "rewards/margins": 4.614483833312988, + "rewards/rejected": -6.450026035308838, "step": 6090 }, { - "epoch": 1.54, - "learning_rate": 2.7001217114502385e-07, - "logits/chosen": -2.766846179962158, - "logits/rejected": -2.6548171043395996, - "logps/chosen": -249.60818481445312, - "logps/rejected": -301.04132080078125, - "loss": 0.0747, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.7441790699958801, - "rewards/margins": 8.869656562805176, - "rewards/rejected": -8.12547779083252, + "epoch": 1.47, + "learning_rate": 2.8369584596184704e-07, + "logits/chosen": -2.6662182807922363, + "logits/rejected": -2.526212453842163, + "logps/chosen": -296.58966064453125, + "logps/rejected": -314.0143737792969, + "loss": 0.099, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.2500333786010742, + "rewards/margins": 10.89311408996582, + "rewards/rejected": -9.64307975769043, "step": 6100 }, { - "epoch": 1.54, - "learning_rate": 2.6954405018256714e-07, - "logits/chosen": -2.476426601409912, - "logits/rejected": -2.4908547401428223, - "logps/chosen": -340.268798828125, - "logps/rejected": -395.8197326660156, - "loss": 0.0972, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.0655219554901123, - "rewards/margins": 6.699605464935303, - "rewards/rejected": -7.765127658843994, + "epoch": 1.47, + "eval_logits/chosen": -2.3859267234802246, + "eval_logits/rejected": -2.336207628250122, + "eval_logps/chosen": -242.16331481933594, + "eval_logps/rejected": -259.3891906738281, + "eval_loss": 0.543978214263916, + "eval_rewards/accuracies": 0.6650000214576721, + "eval_rewards/chosen": -4.620230674743652, + "eval_rewards/margins": 2.8793792724609375, + "eval_rewards/rejected": -7.49960994720459, + "eval_runtime": 132.0653, + "eval_samples_per_second": 23.897, + "eval_steps_per_second": 0.379, + "step": 6100 + }, + { + "epoch": 1.47, + "learning_rate": 2.8325013371367444e-07, + "logits/chosen": -2.4673752784729004, + "logits/rejected": -2.4710025787353516, + "logps/chosen": -217.2989044189453, + "logps/rejected": -285.23358154296875, + "loss": 0.2325, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3893321752548218, + "rewards/margins": 7.431262016296387, + "rewards/rejected": -8.820595741271973, "step": 6110 }, { - "epoch": 1.55, - "learning_rate": 2.690759292201105e-07, - "logits/chosen": -2.606290578842163, - "logits/rejected": -2.5720295906066895, - "logps/chosen": -264.5345764160156, - "logps/rejected": -287.35064697265625, - "loss": 0.122, + "epoch": 1.47, + "learning_rate": 2.828044214655019e-07, + "logits/chosen": -2.5103814601898193, + "logits/rejected": -2.4794182777404785, + "logps/chosen": -226.72787475585938, + "logps/rejected": -339.3985290527344, + "loss": 0.1039, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.07059669494628906, - "rewards/margins": 7.116732597351074, - "rewards/rejected": -7.046135902404785, + "rewards/chosen": -0.7129805684089661, + "rewards/margins": 8.581960678100586, + "rewards/rejected": -9.294940948486328, "step": 6120 }, { - "epoch": 1.55, - "learning_rate": 2.686078082576538e-07, - "logits/chosen": -2.607025146484375, - "logits/rejected": -2.63069486618042, - "logps/chosen": -259.15875244140625, - "logps/rejected": -405.8233337402344, - "loss": 0.0805, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.288994699716568, - "rewards/margins": 8.880585670471191, - "rewards/rejected": -9.169580459594727, + "epoch": 1.48, + "learning_rate": 2.823587092173293e-07, + "logits/chosen": -2.484416961669922, + "logits/rejected": -2.5024631023406982, + "logps/chosen": -240.6263885498047, + "logps/rejected": -333.2454528808594, + "loss": 0.0671, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08175155520439148, + "rewards/margins": 6.6725006103515625, + "rewards/rejected": -6.754253387451172, "step": 6130 }, { - "epoch": 1.55, - "learning_rate": 2.681396872951971e-07, - "logits/chosen": -2.6519951820373535, - "logits/rejected": -2.664268970489502, - "logps/chosen": -333.8085632324219, - "logps/rejected": -403.4785461425781, - "loss": 0.0562, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8596887588500977, - "rewards/margins": 8.478861808776855, - "rewards/rejected": -9.338550567626953, + "epoch": 1.48, + "learning_rate": 2.819129969691567e-07, + "logits/chosen": -2.6409966945648193, + "logits/rejected": -2.6407971382141113, + "logps/chosen": -399.09124755859375, + "logps/rejected": -483.02117919921875, + "loss": 0.0826, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7288281917572021, + "rewards/margins": 10.948076248168945, + "rewards/rejected": -10.21924877166748, "step": 6140 }, { - "epoch": 1.55, - "learning_rate": 2.6767156633274035e-07, - "logits/chosen": -2.642038583755493, - "logits/rejected": -2.377959966659546, - "logps/chosen": -240.6234130859375, - "logps/rejected": -286.7399597167969, - "loss": 0.1153, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.029271280393004417, - "rewards/margins": 7.734196662902832, - "rewards/rejected": -7.704924583435059, + "epoch": 1.48, + "learning_rate": 2.814672847209841e-07, + "logits/chosen": -2.6207234859466553, + "logits/rejected": -2.598151922225952, + "logps/chosen": -259.26104736328125, + "logps/rejected": -246.7240447998047, + "loss": 0.1062, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9522031545639038, + "rewards/margins": 4.404966831207275, + "rewards/rejected": -6.357170581817627, "step": 6150 }, { - "epoch": 1.56, - "learning_rate": 2.6720344537028364e-07, - "logits/chosen": -2.6156229972839355, - "logits/rejected": -2.625122308731079, - "logps/chosen": -405.40093994140625, - "logps/rejected": -423.067626953125, - "loss": 0.0686, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3577374815940857, - "rewards/margins": 8.518488883972168, - "rewards/rejected": -8.876226425170898, + "epoch": 1.48, + "learning_rate": 2.8102157247281156e-07, + "logits/chosen": -2.334829807281494, + "logits/rejected": -2.409487724304199, + "logps/chosen": -148.61184692382812, + "logps/rejected": -320.7638244628906, + "loss": 0.1092, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.26359644532203674, + "rewards/margins": 11.49001693725586, + "rewards/rejected": -11.753612518310547, "step": 6160 }, { - "epoch": 1.56, - "learning_rate": 2.66735324407827e-07, - "logits/chosen": -2.3296422958374023, - "logits/rejected": -2.411426544189453, - "logps/chosen": -196.38916015625, - "logps/rejected": -244.12405395507812, - "loss": 0.1481, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.3764898777008057, - "rewards/margins": 5.746121406555176, - "rewards/rejected": -7.122610569000244, + "epoch": 1.48, + "learning_rate": 2.8057586022463897e-07, + "logits/chosen": -2.556227207183838, + "logits/rejected": -2.437958240509033, + "logps/chosen": -292.95330810546875, + "logps/rejected": -295.3667297363281, + "loss": 0.0758, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.31266456842422485, + "rewards/margins": 7.257256984710693, + "rewards/rejected": -7.569921970367432, "step": 6170 }, { - "epoch": 1.56, - "learning_rate": 2.662672034453703e-07, - "logits/chosen": -2.6419670581817627, - "logits/rejected": -2.539872169494629, - "logps/chosen": -249.18057250976562, - "logps/rejected": -309.7854309082031, - "loss": 0.092, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.07812035083770752, - "rewards/margins": 8.424200057983398, - "rewards/rejected": -8.346078872680664, + "epoch": 1.49, + "learning_rate": 2.8013014797646637e-07, + "logits/chosen": -2.5164313316345215, + "logits/rejected": -2.4446310997009277, + "logps/chosen": -335.8703308105469, + "logps/rejected": -354.5244445800781, + "loss": 0.1271, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.42862653732299805, + "rewards/margins": 8.303886413574219, + "rewards/rejected": -8.732512474060059, "step": 6180 }, { - "epoch": 1.56, - "learning_rate": 2.6579908248291356e-07, - "logits/chosen": -2.5486984252929688, - "logits/rejected": -2.5054383277893066, - "logps/chosen": -257.5140075683594, - "logps/rejected": -296.4288024902344, - "loss": 0.0929, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3803873658180237, - "rewards/margins": 7.806270599365234, - "rewards/rejected": -8.18665885925293, + "epoch": 1.49, + "learning_rate": 2.796844357282938e-07, + "logits/chosen": -2.46992826461792, + "logits/rejected": -2.332427978515625, + "logps/chosen": -296.15887451171875, + "logps/rejected": -349.50067138671875, + "loss": 0.1219, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.16386635601520538, + "rewards/margins": 9.603449821472168, + "rewards/rejected": -9.439584732055664, "step": 6190 }, { - "epoch": 1.57, - "learning_rate": 2.6533096152045685e-07, - "logits/chosen": -2.5460028648376465, - "logits/rejected": -2.613973617553711, - "logps/chosen": -348.6800537109375, - "logps/rejected": -482.733642578125, - "loss": 0.0409, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.12141378223896027, - "rewards/margins": 8.33647632598877, - "rewards/rejected": -8.457890510559082, + "epoch": 1.49, + "learning_rate": 2.7923872348012123e-07, + "logits/chosen": -2.268820285797119, + "logits/rejected": -2.394498825073242, + "logps/chosen": -201.5729522705078, + "logps/rejected": -379.7137451171875, + "loss": 0.1279, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1567596197128296, + "rewards/margins": 7.547829627990723, + "rewards/rejected": -8.704588890075684, "step": 6200 }, { - "epoch": 1.57, - "learning_rate": 2.648628405580002e-07, - "logits/chosen": -2.289564847946167, - "logits/rejected": -2.3023674488067627, - "logps/chosen": -245.26651000976562, - "logps/rejected": -272.0384826660156, - "loss": 0.0793, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.3192493915557861, - "rewards/margins": 7.366220951080322, - "rewards/rejected": -8.685470581054688, + "epoch": 1.49, + "eval_logits/chosen": -2.273364305496216, + "eval_logits/rejected": -2.2275524139404297, + "eval_logps/chosen": -245.42166137695312, + "eval_logps/rejected": -262.301513671875, + "eval_loss": 0.538863480091095, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -4.946064472198486, + "eval_rewards/margins": 2.8447742462158203, + "eval_rewards/rejected": -7.790838718414307, + "eval_runtime": 132.0831, + "eval_samples_per_second": 23.894, + "eval_steps_per_second": 0.379, + "step": 6200 + }, + { + "epoch": 1.49, + "learning_rate": 2.7879301123194863e-07, + "logits/chosen": -2.4446260929107666, + "logits/rejected": -2.4825809001922607, + "logps/chosen": -370.47833251953125, + "logps/rejected": -364.43438720703125, + "loss": 0.1114, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9154040813446045, + "rewards/margins": 6.077618598937988, + "rewards/rejected": -7.993022918701172, "step": 6210 }, { - "epoch": 1.57, - "learning_rate": 2.643947195955435e-07, - "logits/chosen": -2.4759864807128906, - "logits/rejected": -2.3297553062438965, - "logps/chosen": -260.42449951171875, - "logps/rejected": -326.72003173828125, - "loss": 0.0926, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6568080186843872, - "rewards/margins": 8.137785911560059, - "rewards/rejected": -8.794594764709473, + "epoch": 1.5, + "learning_rate": 2.783472989837761e-07, + "logits/chosen": -2.439441680908203, + "logits/rejected": -2.425654172897339, + "logps/chosen": -237.7493896484375, + "logps/rejected": -339.28717041015625, + "loss": 0.1552, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3277128338813782, + "rewards/margins": 9.261073112487793, + "rewards/rejected": -9.588786125183105, "step": 6220 }, { - "epoch": 1.57, - "learning_rate": 2.639265986330868e-07, - "logits/chosen": -2.4743053913116455, - "logits/rejected": -2.4329488277435303, - "logps/chosen": -284.16986083984375, - "logps/rejected": -234.8544158935547, - "loss": 0.0916, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.8003547191619873, - "rewards/margins": 4.773205757141113, - "rewards/rejected": -6.573559761047363, + "epoch": 1.5, + "learning_rate": 2.779015867356035e-07, + "logits/chosen": -2.7057745456695557, + "logits/rejected": -2.4995524883270264, + "logps/chosen": -358.54791259765625, + "logps/rejected": -385.1791076660156, + "loss": 0.1269, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6208736896514893, + "rewards/margins": 8.121594429016113, + "rewards/rejected": -9.742467880249023, "step": 6230 }, { - "epoch": 1.58, - "learning_rate": 2.6345847767063006e-07, - "logits/chosen": -2.6427693367004395, - "logits/rejected": -2.5126216411590576, - "logps/chosen": -278.27716064453125, - "logps/rejected": -337.318115234375, - "loss": 0.1066, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.15482717752456665, - "rewards/margins": 7.757693290710449, - "rewards/rejected": -7.602866172790527, + "epoch": 1.5, + "learning_rate": 2.774558744874309e-07, + "logits/chosen": -2.6789374351501465, + "logits/rejected": -2.655383348464966, + "logps/chosen": -242.5365753173828, + "logps/rejected": -317.4726257324219, + "loss": 0.0787, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4868624210357666, + "rewards/margins": 7.507850646972656, + "rewards/rejected": -8.994712829589844, "step": 6240 }, { - "epoch": 1.58, - "learning_rate": 2.6299035670817335e-07, - "logits/chosen": -2.5959510803222656, - "logits/rejected": -2.570957660675049, - "logps/chosen": -298.49114990234375, - "logps/rejected": -396.47198486328125, - "loss": 0.0724, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.1863725185394287, - "rewards/margins": 6.7948126792907715, - "rewards/rejected": -7.9811835289001465, + "epoch": 1.5, + "learning_rate": 2.7701016223925835e-07, + "logits/chosen": -2.5996620655059814, + "logits/rejected": -2.6002743244171143, + "logps/chosen": -244.5210723876953, + "logps/rejected": -406.82733154296875, + "loss": 0.0624, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3323901295661926, + "rewards/margins": 10.137161254882812, + "rewards/rejected": -9.804771423339844, "step": 6250 }, { - "epoch": 1.58, - "learning_rate": 2.625222357457167e-07, - "logits/chosen": -2.5576119422912598, - "logits/rejected": -2.371354579925537, - "logps/chosen": -241.24801635742188, - "logps/rejected": -237.8208770751953, - "loss": 0.0968, + "epoch": 1.51, + "learning_rate": 2.7656444999108575e-07, + "logits/chosen": -2.6050620079040527, + "logits/rejected": -2.5804266929626465, + "logps/chosen": -267.5068664550781, + "logps/rejected": -395.11346435546875, + "loss": 0.1022, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.9517194032669067, - "rewards/margins": 5.712420463562012, - "rewards/rejected": -6.664140224456787, + "rewards/chosen": 0.7538161277770996, + "rewards/margins": 10.944032669067383, + "rewards/rejected": -10.190216064453125, "step": 6260 }, { - "epoch": 1.58, - "learning_rate": 2.6205411478326e-07, - "logits/chosen": -2.635572671890259, - "logits/rejected": -2.7240185737609863, - "logps/chosen": -332.22381591796875, - "logps/rejected": -367.07818603515625, - "loss": 0.1099, + "epoch": 1.51, + "learning_rate": 2.7611873774291315e-07, + "logits/chosen": -2.4656999111175537, + "logits/rejected": -2.3853886127471924, + "logps/chosen": -333.6300964355469, + "logps/rejected": -298.3754577636719, + "loss": 0.1086, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.5952962040901184, - "rewards/margins": 7.662569999694824, - "rewards/rejected": -8.257866859436035, + "rewards/chosen": -0.9208229780197144, + "rewards/margins": 9.539531707763672, + "rewards/rejected": -10.460355758666992, "step": 6270 }, { - "epoch": 1.59, - "learning_rate": 2.6158599382080327e-07, - "logits/chosen": -2.5941269397735596, - "logits/rejected": -2.5424253940582275, - "logps/chosen": -269.71881103515625, - "logps/rejected": -277.7208251953125, - "loss": 0.0993, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.6491734981536865, - "rewards/margins": 8.246459007263184, - "rewards/rejected": -6.597285270690918, + "epoch": 1.51, + "learning_rate": 2.756730254947406e-07, + "logits/chosen": -2.52311635017395, + "logits/rejected": -2.4596641063690186, + "logps/chosen": -189.1370849609375, + "logps/rejected": -308.7192687988281, + "loss": 0.1117, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8102662563323975, + "rewards/margins": 8.257328987121582, + "rewards/rejected": -10.067595481872559, "step": 6280 }, { - "epoch": 1.59, - "learning_rate": 2.6111787285834656e-07, - "logits/chosen": -2.282750368118286, - "logits/rejected": -2.1928257942199707, - "logps/chosen": -291.94964599609375, - "logps/rejected": -323.4311218261719, - "loss": 0.137, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.7096611261367798, - "rewards/margins": 8.644102096557617, - "rewards/rejected": -7.934440612792969, + "epoch": 1.51, + "learning_rate": 2.75227313246568e-07, + "logits/chosen": -2.7163500785827637, + "logits/rejected": -2.744774341583252, + "logps/chosen": -242.19015502929688, + "logps/rejected": -230.99264526367188, + "loss": 0.1574, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2066490650177002, + "rewards/margins": 5.5687994956970215, + "rewards/rejected": -6.775448799133301, "step": 6290 }, { - "epoch": 1.59, - "learning_rate": 2.606497518958899e-07, - "logits/chosen": -2.480043411254883, - "logits/rejected": -2.45155930519104, - "logps/chosen": -274.54913330078125, - "logps/rejected": -396.34930419921875, - "loss": 0.0791, + "epoch": 1.52, + "learning_rate": 2.747816009983954e-07, + "logits/chosen": -2.557614803314209, + "logits/rejected": -2.4608089923858643, + "logps/chosen": -194.5203094482422, + "logps/rejected": -287.93988037109375, + "loss": 0.0778, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.7529920339584351, - "rewards/margins": 7.522666931152344, - "rewards/rejected": -8.275659561157227, + "rewards/chosen": -1.4155465364456177, + "rewards/margins": 5.7504777908325195, + "rewards/rejected": -7.166024684906006, "step": 6300 }, { - "epoch": 1.6, - "learning_rate": 2.601816309334332e-07, - "logits/chosen": -2.3269565105438232, - "logits/rejected": -2.212367057800293, - "logps/chosen": -215.3049774169922, - "logps/rejected": -289.00567626953125, - "loss": 0.0842, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.5885213613510132, - "rewards/margins": 8.83895492553711, - "rewards/rejected": -9.427475929260254, + "epoch": 1.52, + "eval_logits/chosen": -2.5193357467651367, + "eval_logits/rejected": -2.4781494140625, + "eval_logps/chosen": -245.5110321044922, + "eval_logps/rejected": -263.3570251464844, + "eval_loss": 0.545120894908905, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -4.955002784729004, + "eval_rewards/margins": 2.941389799118042, + "eval_rewards/rejected": -7.896392822265625, + "eval_runtime": 132.091, + "eval_samples_per_second": 23.893, + "eval_steps_per_second": 0.379, + "step": 6300 + }, + { + "epoch": 1.52, + "learning_rate": 2.743358887502228e-07, + "logits/chosen": -2.5674338340759277, + "logits/rejected": -2.6931252479553223, + "logps/chosen": -232.01467895507812, + "logps/rejected": -284.7641906738281, + "loss": 0.158, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8049416542053223, + "rewards/margins": 6.449332237243652, + "rewards/rejected": -9.25427532196045, "step": 6310 }, { - "epoch": 1.6, - "learning_rate": 2.597135099709765e-07, - "logits/chosen": -2.575462818145752, - "logits/rejected": -2.493577480316162, - "logps/chosen": -364.81634521484375, - "logps/rejected": -351.1142578125, - "loss": 0.1075, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5452896356582642, - "rewards/margins": 7.818143367767334, - "rewards/rejected": -7.272853851318359, + "epoch": 1.52, + "learning_rate": 2.738901765020503e-07, + "logits/chosen": -2.753450870513916, + "logits/rejected": -2.7570910453796387, + "logps/chosen": -271.8170471191406, + "logps/rejected": -321.32586669921875, + "loss": 0.1298, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.128871440887451, + "rewards/margins": 6.492806434631348, + "rewards/rejected": -9.621678352355957, "step": 6320 }, { - "epoch": 1.6, - "learning_rate": 2.5924538900851977e-07, - "logits/chosen": -2.531485080718994, - "logits/rejected": -2.5142064094543457, - "logps/chosen": -234.1458282470703, - "logps/rejected": -293.10308837890625, - "loss": 0.108, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.367737352848053, - "rewards/margins": 6.96529483795166, - "rewards/rejected": -7.333032131195068, + "epoch": 1.52, + "learning_rate": 2.734444642538777e-07, + "logits/chosen": -2.770362138748169, + "logits/rejected": -2.667633056640625, + "logps/chosen": -279.14044189453125, + "logps/rejected": -295.1834716796875, + "loss": 0.1257, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.5739901065826416, + "rewards/margins": 8.314129829406738, + "rewards/rejected": -7.740139007568359, "step": 6330 }, { - "epoch": 1.6, - "learning_rate": 2.587772680460631e-07, - "logits/chosen": -2.566310167312622, - "logits/rejected": -2.4758496284484863, - "logps/chosen": -364.1842346191406, - "logps/rejected": -333.62384033203125, - "loss": 0.055, + "epoch": 1.53, + "learning_rate": 2.729987520057051e-07, + "logits/chosen": -2.8073010444641113, + "logits/rejected": -2.844325304031372, + "logps/chosen": -270.79583740234375, + "logps/rejected": -341.4439697265625, + "loss": 0.1217, "rewards/accuracies": 1.0, - "rewards/chosen": 0.9204639196395874, - "rewards/margins": 8.889691352844238, - "rewards/rejected": -7.969226837158203, + "rewards/chosen": 1.226733922958374, + "rewards/margins": 10.273815155029297, + "rewards/rejected": -9.047080039978027, "step": 6340 }, { - "epoch": 1.61, - "learning_rate": 2.583091470836064e-07, - "logits/chosen": -2.4522814750671387, - "logits/rejected": -2.479931354522705, - "logps/chosen": -317.1643981933594, - "logps/rejected": -353.9148254394531, - "loss": 0.0866, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9924957156181335, - "rewards/margins": 7.705667972564697, - "rewards/rejected": -6.713172912597656, + "epoch": 1.53, + "learning_rate": 2.7255303975753254e-07, + "logits/chosen": -2.6258037090301514, + "logits/rejected": -2.5558743476867676, + "logps/chosen": -220.7984161376953, + "logps/rejected": -360.7315673828125, + "loss": 0.0893, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.602647304534912, + "rewards/margins": 7.148091793060303, + "rewards/rejected": -9.750738143920898, "step": 6350 }, { - "epoch": 1.61, - "learning_rate": 2.578410261211497e-07, - "logits/chosen": -2.390688419342041, - "logits/rejected": -2.3841347694396973, - "logps/chosen": -225.24075317382812, - "logps/rejected": -381.5619201660156, - "loss": 0.1551, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.04876537248492241, - "rewards/margins": 11.332666397094727, - "rewards/rejected": -11.283902168273926, + "epoch": 1.53, + "learning_rate": 2.7210732750935994e-07, + "logits/chosen": -2.6252262592315674, + "logits/rejected": -2.6275007724761963, + "logps/chosen": -252.8973388671875, + "logps/rejected": -258.74127197265625, + "loss": 0.1897, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2168649435043335, + "rewards/margins": 5.642578125, + "rewards/rejected": -6.859443664550781, "step": 6360 }, { - "epoch": 1.61, - "learning_rate": 2.57372905158693e-07, - "logits/chosen": -2.4534378051757812, - "logits/rejected": -2.3759846687316895, - "logps/chosen": -367.37713623046875, - "logps/rejected": -324.92864990234375, - "loss": 0.1195, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.09103012084960938, - "rewards/margins": 8.274453163146973, - "rewards/rejected": -8.365483283996582, + "epoch": 1.53, + "learning_rate": 2.7166161526118734e-07, + "logits/chosen": -2.6454825401306152, + "logits/rejected": -2.544174909591675, + "logps/chosen": -367.6883239746094, + "logps/rejected": -364.240966796875, + "loss": 0.0602, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04706361144781113, + "rewards/margins": 9.600566864013672, + "rewards/rejected": -9.553503036499023, "step": 6370 }, { - "epoch": 1.61, - "learning_rate": 2.5690478419623627e-07, - "logits/chosen": -2.593829393386841, - "logits/rejected": -2.536083698272705, - "logps/chosen": -245.0706024169922, - "logps/rejected": -372.5894775390625, - "loss": 0.0701, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.8631905317306519, - "rewards/margins": 9.275471687316895, - "rewards/rejected": -10.13866138458252, + "epoch": 1.54, + "learning_rate": 2.712159030130148e-07, + "logits/chosen": -2.625828266143799, + "logits/rejected": -2.5344960689544678, + "logps/chosen": -288.52593994140625, + "logps/rejected": -292.865478515625, + "loss": 0.081, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.495135545730591, + "rewards/margins": 5.297086238861084, + "rewards/rejected": -8.792220115661621, "step": 6380 }, { - "epoch": 1.62, - "learning_rate": 2.564366632337796e-07, - "logits/chosen": -2.532728433609009, - "logits/rejected": -2.518982410430908, - "logps/chosen": -161.19578552246094, - "logps/rejected": -306.3395080566406, - "loss": 0.0699, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.0426340326666832, - "rewards/margins": 9.686996459960938, - "rewards/rejected": -9.644363403320312, + "epoch": 1.54, + "learning_rate": 2.707701907648422e-07, + "logits/chosen": -2.5542495250701904, + "logits/rejected": -2.5069243907928467, + "logps/chosen": -293.36474609375, + "logps/rejected": -313.1446228027344, + "loss": 0.1376, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4597678184509277, + "rewards/margins": 7.706934928894043, + "rewards/rejected": -9.166703224182129, "step": 6390 }, { - "epoch": 1.62, - "learning_rate": 2.559685422713229e-07, - "logits/chosen": -2.5918660163879395, - "logits/rejected": -2.5031676292419434, - "logps/chosen": -298.0547790527344, - "logps/rejected": -390.0431213378906, - "loss": 0.0742, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.7646089792251587, - "rewards/margins": 7.8302788734436035, - "rewards/rejected": -8.594887733459473, + "epoch": 1.54, + "learning_rate": 2.703244785166696e-07, + "logits/chosen": -2.6298329830169678, + "logits/rejected": -2.5626161098480225, + "logps/chosen": -193.88504028320312, + "logps/rejected": -216.7042236328125, + "loss": 0.0911, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4972293376922607, + "rewards/margins": 6.809592247009277, + "rewards/rejected": -8.306821823120117, "step": 6400 }, { - "epoch": 1.62, - "learning_rate": 2.555004213088662e-07, - "logits/chosen": -2.6251654624938965, - "logits/rejected": -2.649387836456299, - "logps/chosen": -300.34613037109375, - "logps/rejected": -409.62548828125, - "loss": 0.0819, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.8415437936782837, - "rewards/margins": 10.491747856140137, - "rewards/rejected": -8.6502046585083, + "epoch": 1.54, + "eval_logits/chosen": -2.4048027992248535, + "eval_logits/rejected": -2.3604178428649902, + "eval_logps/chosen": -250.5128173828125, + "eval_logps/rejected": -267.53240966796875, + "eval_loss": 0.5412023067474365, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -5.4551801681518555, + "eval_rewards/margins": 2.858752727508545, + "eval_rewards/rejected": -8.313933372497559, + "eval_runtime": 131.9745, + "eval_samples_per_second": 23.914, + "eval_steps_per_second": 0.379, + "step": 6400 + }, + { + "epoch": 1.54, + "learning_rate": 2.6987876626849706e-07, + "logits/chosen": -2.5710575580596924, + "logits/rejected": -2.5692861080169678, + "logps/chosen": -200.3492431640625, + "logps/rejected": -255.5147705078125, + "loss": 0.0615, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8119436502456665, + "rewards/margins": 6.283229351043701, + "rewards/rejected": -8.095173835754395, "step": 6410 }, { - "epoch": 1.62, - "learning_rate": 2.550323003464095e-07, - "logits/chosen": -2.3623738288879395, - "logits/rejected": -2.245854139328003, - "logps/chosen": -243.4949188232422, - "logps/rejected": -234.89468383789062, - "loss": 0.0967, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.059647560119629, - "rewards/margins": 6.525529384613037, - "rewards/rejected": -8.585176467895508, + "epoch": 1.55, + "learning_rate": 2.6943305402032446e-07, + "logits/chosen": -2.6577858924865723, + "logits/rejected": -2.4518516063690186, + "logps/chosen": -365.58966064453125, + "logps/rejected": -308.8609619140625, + "loss": 0.0745, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.041567906737327576, + "rewards/margins": 9.455829620361328, + "rewards/rejected": -9.497397422790527, "step": 6420 }, { - "epoch": 1.63, - "learning_rate": 2.545641793839528e-07, - "logits/chosen": -2.6008667945861816, - "logits/rejected": -2.637789726257324, - "logps/chosen": -192.2373504638672, - "logps/rejected": -267.214111328125, - "loss": 0.1195, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.7727561593055725, - "rewards/margins": 8.40786075592041, - "rewards/rejected": -7.635104179382324, + "epoch": 1.55, + "learning_rate": 2.6898734177215186e-07, + "logits/chosen": -2.5847132205963135, + "logits/rejected": -2.482497215270996, + "logps/chosen": -217.7862548828125, + "logps/rejected": -256.5086669921875, + "loss": 0.0801, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5570067167282104, + "rewards/margins": 5.74337100982666, + "rewards/rejected": -7.300376892089844, "step": 6430 }, { - "epoch": 1.63, - "learning_rate": 2.540960584214961e-07, - "logits/chosen": -2.643723964691162, - "logits/rejected": -2.5780606269836426, - "logps/chosen": -286.4054260253906, - "logps/rejected": -300.36834716796875, - "loss": 0.0951, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.12415985763072968, - "rewards/margins": 7.626623630523682, - "rewards/rejected": -7.502463340759277, + "epoch": 1.55, + "learning_rate": 2.685416295239793e-07, + "logits/chosen": -2.606306552886963, + "logits/rejected": -2.6333842277526855, + "logps/chosen": -303.5548400878906, + "logps/rejected": -351.7878112792969, + "loss": 0.0902, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.4206736087799072, + "rewards/margins": 7.630602836608887, + "rewards/rejected": -9.051277160644531, "step": 6440 }, { - "epoch": 1.63, - "learning_rate": 2.5362793745903946e-07, - "logits/chosen": -2.520909309387207, - "logits/rejected": -2.569328784942627, - "logps/chosen": -220.816650390625, - "logps/rejected": -351.45928955078125, - "loss": 0.1153, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.2902345359325409, - "rewards/margins": 7.306796073913574, - "rewards/rejected": -7.597031593322754, + "epoch": 1.55, + "learning_rate": 2.680959172758067e-07, + "logits/chosen": -2.6123645305633545, + "logits/rejected": -2.5800538063049316, + "logps/chosen": -242.76815795898438, + "logps/rejected": -262.8008728027344, + "loss": 0.1175, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9002971649169922, + "rewards/margins": 4.8772196769714355, + "rewards/rejected": -6.777516841888428, "step": 6450 }, { - "epoch": 1.63, - "learning_rate": 2.531598164965827e-07, - "logits/chosen": -2.7431952953338623, - "logits/rejected": -2.5619826316833496, - "logps/chosen": -279.5010986328125, - "logps/rejected": -286.8915100097656, - "loss": 0.0826, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.9281533360481262, - "rewards/margins": 6.31351375579834, - "rewards/rejected": -7.2416672706604, + "epoch": 1.55, + "learning_rate": 2.676502050276341e-07, + "logits/chosen": -2.6654367446899414, + "logits/rejected": -2.525826930999756, + "logps/chosen": -267.04541015625, + "logps/rejected": -329.3205261230469, + "loss": 0.0968, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0808510780334473, + "rewards/margins": 6.370826721191406, + "rewards/rejected": -8.451677322387695, "step": 6460 }, { - "epoch": 1.64, - "learning_rate": 2.52691695534126e-07, - "logits/chosen": -2.461831569671631, - "logits/rejected": -2.447096824645996, - "logps/chosen": -245.5779571533203, - "logps/rejected": -284.29473876953125, - "loss": 0.1038, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.645194411277771, - "rewards/margins": 7.570770263671875, - "rewards/rejected": -8.215965270996094, + "epoch": 1.56, + "learning_rate": 2.6720449277946153e-07, + "logits/chosen": -2.515866756439209, + "logits/rejected": -2.6269681453704834, + "logps/chosen": -334.1268310546875, + "logps/rejected": -438.0691833496094, + "loss": 0.1015, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.6662344932556152, + "rewards/margins": 9.552573204040527, + "rewards/rejected": -8.88633918762207, "step": 6470 }, { - "epoch": 1.64, - "learning_rate": 2.522235745716693e-07, - "logits/chosen": -2.7126336097717285, - "logits/rejected": -2.641799211502075, - "logps/chosen": -252.3530731201172, - "logps/rejected": -305.55706787109375, - "loss": 0.0437, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7650710940361023, - "rewards/margins": 7.985662937164307, - "rewards/rejected": -7.2205915451049805, + "epoch": 1.56, + "learning_rate": 2.66758780531289e-07, + "logits/chosen": -2.7897913455963135, + "logits/rejected": -2.670492649078369, + "logps/chosen": -207.9557647705078, + "logps/rejected": -260.10137939453125, + "loss": 0.0874, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6581312417984009, + "rewards/margins": 6.606692314147949, + "rewards/rejected": -7.264822959899902, "step": 6480 }, { - "epoch": 1.64, - "learning_rate": 2.517554536092126e-07, - "logits/chosen": -2.561034917831421, - "logits/rejected": -2.5873913764953613, - "logps/chosen": -219.373291015625, - "logps/rejected": -346.26123046875, - "loss": 0.0884, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.11245546489953995, - "rewards/margins": 7.357387542724609, - "rewards/rejected": -7.469842433929443, + "epoch": 1.56, + "learning_rate": 2.663130682831164e-07, + "logits/chosen": -2.6898562908172607, + "logits/rejected": -2.609832286834717, + "logps/chosen": -273.21917724609375, + "logps/rejected": -251.85183715820312, + "loss": 0.0856, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.06722860783338547, + "rewards/margins": 7.260194301605225, + "rewards/rejected": -7.327422142028809, "step": 6490 }, { - "epoch": 1.64, - "learning_rate": 2.512873326467559e-07, - "logits/chosen": -2.4951984882354736, - "logits/rejected": -2.5137152671813965, - "logps/chosen": -274.0867614746094, - "logps/rejected": -345.02178955078125, - "loss": 0.1047, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.48362818360328674, - "rewards/margins": 8.117841720581055, - "rewards/rejected": -8.601469039916992, + "epoch": 1.56, + "learning_rate": 2.658673560349438e-07, + "logits/chosen": -2.5431177616119385, + "logits/rejected": -2.5567047595977783, + "logps/chosen": -249.8641815185547, + "logps/rejected": -307.4685974121094, + "loss": 0.2149, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5301803946495056, + "rewards/margins": 7.6516523361206055, + "rewards/rejected": -8.181832313537598, "step": 6500 }, { - "epoch": 1.65, - "learning_rate": 2.508192116842992e-07, - "logits/chosen": -2.5647077560424805, - "logits/rejected": -2.4434280395507812, - "logps/chosen": -227.68240356445312, - "logps/rejected": -282.48980712890625, - "loss": 0.087, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.21963413059711456, - "rewards/margins": 7.6169867515563965, - "rewards/rejected": -7.836620330810547, + "epoch": 1.56, + "eval_logits/chosen": -2.446145534515381, + "eval_logits/rejected": -2.4011125564575195, + "eval_logps/chosen": -240.4731903076172, + "eval_logps/rejected": -257.58734130859375, + "eval_loss": 0.5241071581840515, + "eval_rewards/accuracies": 0.6725000143051147, + "eval_rewards/chosen": -4.451216697692871, + "eval_rewards/margins": 2.86820912361145, + "eval_rewards/rejected": -7.319425106048584, + "eval_runtime": 131.9336, + "eval_samples_per_second": 23.921, + "eval_steps_per_second": 0.379, + "step": 6500 + }, + { + "epoch": 1.57, + "learning_rate": 2.6542164378677125e-07, + "logits/chosen": -2.5523993968963623, + "logits/rejected": -2.580894947052002, + "logps/chosen": -204.18283081054688, + "logps/rejected": -281.56011962890625, + "loss": 0.2485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6025975942611694, + "rewards/margins": 6.649613857269287, + "rewards/rejected": -7.252211570739746, "step": 6510 }, { - "epoch": 1.65, - "learning_rate": 2.5035109072184253e-07, - "logits/chosen": -2.4927191734313965, - "logits/rejected": -2.355459690093994, - "logps/chosen": -237.4644775390625, - "logps/rejected": -275.3338317871094, - "loss": 0.0832, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.9448397755622864, - "rewards/margins": 7.046795845031738, - "rewards/rejected": -7.991635322570801, + "epoch": 1.57, + "learning_rate": 2.6497593153859865e-07, + "logits/chosen": -2.817427396774292, + "logits/rejected": -2.8023767471313477, + "logps/chosen": -288.5300598144531, + "logps/rejected": -335.1292419433594, + "loss": 0.0475, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40045562386512756, + "rewards/margins": 7.118527412414551, + "rewards/rejected": -7.518982887268066, "step": 6520 }, { - "epoch": 1.65, - "learning_rate": 2.498829697593858e-07, - "logits/chosen": -2.7306220531463623, - "logits/rejected": -2.5981738567352295, - "logps/chosen": -292.41632080078125, - "logps/rejected": -398.3863830566406, - "loss": 0.0958, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.7138475179672241, - "rewards/margins": 9.044022560119629, - "rewards/rejected": -9.7578706741333, + "epoch": 1.57, + "learning_rate": 2.6453021929042605e-07, + "logits/chosen": -2.6209304332733154, + "logits/rejected": -2.5499496459960938, + "logps/chosen": -242.03311157226562, + "logps/rejected": -359.74896240234375, + "loss": 0.0929, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21108956634998322, + "rewards/margins": 7.636587619781494, + "rewards/rejected": -7.847676753997803, "step": 6530 }, { - "epoch": 1.65, - "learning_rate": 2.494148487969291e-07, - "logits/chosen": -2.5870468616485596, - "logits/rejected": -2.4440600872039795, - "logps/chosen": -307.70513916015625, - "logps/rejected": -365.44696044921875, - "loss": 0.1477, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.8003414273262024, - "rewards/margins": 11.231703758239746, - "rewards/rejected": -10.431364059448242, + "epoch": 1.57, + "learning_rate": 2.640845070422535e-07, + "logits/chosen": -2.4486939907073975, + "logits/rejected": -2.3721094131469727, + "logps/chosen": -207.40811157226562, + "logps/rejected": -195.20480346679688, + "loss": 0.103, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1045043468475342, + "rewards/margins": 4.896298885345459, + "rewards/rejected": -6.000802993774414, "step": 6540 }, { - "epoch": 1.66, - "learning_rate": 2.489467278344724e-07, - "logits/chosen": -2.5128679275512695, - "logits/rejected": -2.475287675857544, - "logps/chosen": -233.4687957763672, - "logps/rejected": -257.11376953125, - "loss": 0.0656, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.2492097616195679, - "rewards/margins": 8.299988746643066, - "rewards/rejected": -7.050778865814209, + "epoch": 1.58, + "learning_rate": 2.636387947940809e-07, + "logits/chosen": -2.664353132247925, + "logits/rejected": -2.6787848472595215, + "logps/chosen": -241.99267578125, + "logps/rejected": -276.84771728515625, + "loss": 0.1461, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6894559860229492, + "rewards/margins": 5.331470489501953, + "rewards/rejected": -7.020925998687744, "step": 6550 }, { - "epoch": 1.66, - "learning_rate": 2.484786068720157e-07, - "logits/chosen": -2.547872543334961, - "logits/rejected": -2.586104154586792, - "logps/chosen": -264.0244140625, - "logps/rejected": -343.283935546875, - "loss": 0.0833, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.10393998771905899, - "rewards/margins": 9.039542198181152, - "rewards/rejected": -8.935602188110352, + "epoch": 1.58, + "learning_rate": 2.631930825459083e-07, + "logits/chosen": -2.6161677837371826, + "logits/rejected": -2.5490145683288574, + "logps/chosen": -192.01504516601562, + "logps/rejected": -322.1249084472656, + "loss": 0.0695, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8095295429229736, + "rewards/margins": 7.257183074951172, + "rewards/rejected": -9.066713333129883, "step": 6560 }, { - "epoch": 1.66, - "learning_rate": 2.4801048590955903e-07, - "logits/chosen": -2.45554518699646, - "logits/rejected": -2.4026269912719727, - "logps/chosen": -310.2799377441406, - "logps/rejected": -418.31463623046875, - "loss": 0.066, + "epoch": 1.58, + "learning_rate": 2.6274737029773577e-07, + "logits/chosen": -2.591261386871338, + "logits/rejected": -2.639180898666382, + "logps/chosen": -199.4892578125, + "logps/rejected": -340.40850830078125, + "loss": 0.0909, "rewards/accuracies": 1.0, - "rewards/chosen": -0.7158440351486206, - "rewards/margins": 11.024736404418945, - "rewards/rejected": -11.740580558776855, + "rewards/chosen": -0.03453409671783447, + "rewards/margins": 8.318466186523438, + "rewards/rejected": -8.352999687194824, "step": 6570 }, { - "epoch": 1.66, - "learning_rate": 2.475423649471023e-07, - "logits/chosen": -2.699667453765869, - "logits/rejected": -2.7050626277923584, - "logps/chosen": -288.3102722167969, - "logps/rejected": -335.96221923828125, - "loss": 0.1254, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9094661474227905, - "rewards/margins": 6.560095310211182, - "rewards/rejected": -7.469560146331787, + "epoch": 1.58, + "learning_rate": 2.6230165804956317e-07, + "logits/chosen": -2.587012529373169, + "logits/rejected": -2.4184250831604004, + "logps/chosen": -276.42169189453125, + "logps/rejected": -329.2543029785156, + "loss": 0.0739, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7662872076034546, + "rewards/margins": 6.234993934631348, + "rewards/rejected": -8.00128173828125, "step": 6580 }, { - "epoch": 1.67, - "learning_rate": 2.470742439846456e-07, - "logits/chosen": -2.6850056648254395, - "logits/rejected": -2.447831630706787, - "logps/chosen": -289.18353271484375, - "logps/rejected": -295.9617614746094, - "loss": 0.0962, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.7664966583251953, - "rewards/margins": 4.887343406677246, - "rewards/rejected": -6.6538405418396, + "epoch": 1.59, + "learning_rate": 2.618559458013906e-07, + "logits/chosen": -2.8391337394714355, + "logits/rejected": -2.7069647312164307, + "logps/chosen": -283.3822937011719, + "logps/rejected": -314.87200927734375, + "loss": 0.0975, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.1805918961763382, + "rewards/margins": 8.450132369995117, + "rewards/rejected": -8.269540786743164, "step": 6590 }, { - "epoch": 1.67, - "learning_rate": 2.466061230221889e-07, - "logits/chosen": -2.601702928543091, - "logits/rejected": -2.488107204437256, - "logps/chosen": -287.02020263671875, - "logps/rejected": -321.2840270996094, - "loss": 0.1292, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.7866836786270142, - "rewards/margins": 4.828063488006592, - "rewards/rejected": -6.614747047424316, + "epoch": 1.59, + "learning_rate": 2.6141023355321803e-07, + "logits/chosen": -2.543520450592041, + "logits/rejected": -2.4865641593933105, + "logps/chosen": -408.915771484375, + "logps/rejected": -398.7325439453125, + "loss": 0.1739, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9992796182632446, + "rewards/margins": 7.654056549072266, + "rewards/rejected": -9.653336524963379, "step": 6600 }, { - "epoch": 1.67, - "learning_rate": 2.4613800205973224e-07, - "logits/chosen": -2.545788288116455, - "logits/rejected": -2.5788114070892334, - "logps/chosen": -228.93972778320312, - "logps/rejected": -291.91680908203125, - "loss": 0.0651, + "epoch": 1.59, + "eval_logits/chosen": -2.4576714038848877, + "eval_logits/rejected": -2.414294481277466, + "eval_logps/chosen": -246.1035614013672, + "eval_logps/rejected": -261.8999328613281, + "eval_loss": 0.5329164862632751, + "eval_rewards/accuracies": 0.6825000047683716, + "eval_rewards/chosen": -5.014252185821533, + "eval_rewards/margins": 2.7364330291748047, + "eval_rewards/rejected": -7.75068473815918, + "eval_runtime": 132.2237, + "eval_samples_per_second": 23.869, + "eval_steps_per_second": 0.378, + "step": 6600 + }, + { + "epoch": 1.59, + "learning_rate": 2.6096452130504543e-07, + "logits/chosen": -2.6992452144622803, + "logits/rejected": -2.677377223968506, + "logps/chosen": -279.94281005859375, + "logps/rejected": -449.54833984375, + "loss": 0.089, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.0478473901748657, - "rewards/margins": 8.06389045715332, - "rewards/rejected": -9.111737251281738, + "rewards/chosen": -0.5037785172462463, + "rewards/margins": 10.878410339355469, + "rewards/rejected": -11.38218879699707, "step": 6610 }, { - "epoch": 1.67, - "learning_rate": 2.4566988109727553e-07, - "logits/chosen": -2.6414923667907715, - "logits/rejected": -2.5644431114196777, - "logps/chosen": -228.28369140625, - "logps/rejected": -265.0378112792969, - "loss": 0.0969, + "epoch": 1.59, + "learning_rate": 2.6051880905687284e-07, + "logits/chosen": -2.687520742416382, + "logits/rejected": -2.674999237060547, + "logps/chosen": -287.6028747558594, + "logps/rejected": -290.20660400390625, + "loss": 0.0977, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.6914323568344116, - "rewards/margins": 4.776876449584961, - "rewards/rejected": -6.468308925628662, + "rewards/chosen": -0.1383069008588791, + "rewards/margins": 6.374781608581543, + "rewards/rejected": -6.513087272644043, "step": 6620 }, { - "epoch": 1.68, - "learning_rate": 2.452017601348188e-07, - "logits/chosen": -2.5595173835754395, - "logits/rejected": -2.523301362991333, - "logps/chosen": -321.19122314453125, - "logps/rejected": -370.2229309082031, - "loss": 0.124, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.3089625835418701, - "rewards/margins": 5.994938373565674, - "rewards/rejected": -7.303900718688965, + "epoch": 1.6, + "learning_rate": 2.6007309680870024e-07, + "logits/chosen": -2.6049275398254395, + "logits/rejected": -2.4614250659942627, + "logps/chosen": -300.57891845703125, + "logps/rejected": -338.83349609375, + "loss": 0.0864, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9794005155563354, + "rewards/margins": 8.73996639251709, + "rewards/rejected": -9.719367027282715, "step": 6630 }, { - "epoch": 1.68, - "learning_rate": 2.447336391723621e-07, - "logits/chosen": -2.6744046211242676, - "logits/rejected": -2.500985622406006, - "logps/chosen": -302.8641662597656, - "logps/rejected": -312.8785095214844, - "loss": 0.0595, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.256498247385025, - "rewards/margins": 8.627106666564941, - "rewards/rejected": -8.37060832977295, + "epoch": 1.6, + "learning_rate": 2.596273845605277e-07, + "logits/chosen": -2.668069839477539, + "logits/rejected": -2.5372109413146973, + "logps/chosen": -372.4585876464844, + "logps/rejected": -319.42840576171875, + "loss": 0.0852, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3613933324813843, + "rewards/margins": 8.843244552612305, + "rewards/rejected": -8.481851577758789, "step": 6640 }, { - "epoch": 1.68, - "learning_rate": 2.4426551820990546e-07, - "logits/chosen": -2.772514581680298, - "logits/rejected": -2.5799484252929688, - "logps/chosen": -309.5052185058594, - "logps/rejected": -348.5119934082031, - "loss": 0.0934, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.509163498878479, - "rewards/margins": 6.6776251792907715, - "rewards/rejected": -8.186788558959961, + "epoch": 1.6, + "learning_rate": 2.591816723123551e-07, + "logits/chosen": -2.583962917327881, + "logits/rejected": -2.477149724960327, + "logps/chosen": -261.5204772949219, + "logps/rejected": -380.90814208984375, + "loss": 0.0869, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.454642117023468, + "rewards/margins": 12.749290466308594, + "rewards/rejected": -12.294649124145508, "step": 6650 }, { - "epoch": 1.68, - "learning_rate": 2.437973972474487e-07, - "logits/chosen": -2.7458884716033936, - "logits/rejected": -2.7266435623168945, - "logps/chosen": -264.4335021972656, - "logps/rejected": -250.4994354248047, - "loss": 0.0909, + "epoch": 1.6, + "learning_rate": 2.5873596006418255e-07, + "logits/chosen": -2.456943988800049, + "logits/rejected": -2.550523519515991, + "logps/chosen": -174.7811737060547, + "logps/rejected": -221.3059844970703, + "loss": 0.1022, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.5879275798797607, - "rewards/margins": 5.835958480834961, - "rewards/rejected": -7.423886775970459, + "rewards/chosen": -1.790693998336792, + "rewards/margins": 5.0040717124938965, + "rewards/rejected": -6.794766426086426, "step": 6660 }, { - "epoch": 1.69, - "learning_rate": 2.4332927628499203e-07, - "logits/chosen": -2.5481972694396973, - "logits/rejected": -2.609694004058838, - "logps/chosen": -274.73199462890625, - "logps/rejected": -350.5310363769531, - "loss": 0.0603, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.5865224599838257, - "rewards/margins": 7.221885681152344, - "rewards/rejected": -7.808408260345459, + "epoch": 1.61, + "learning_rate": 2.5829024781601e-07, + "logits/chosen": -2.4249067306518555, + "logits/rejected": -2.460820198059082, + "logps/chosen": -291.3958435058594, + "logps/rejected": -432.06842041015625, + "loss": 0.1202, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.00011756420281017199, + "rewards/margins": 10.620519638061523, + "rewards/rejected": -10.620401382446289, "step": 6670 }, { - "epoch": 1.69, - "learning_rate": 2.428611553225353e-07, - "logits/chosen": -2.6879124641418457, - "logits/rejected": -2.711090564727783, - "logps/chosen": -310.59771728515625, - "logps/rejected": -443.3534240722656, - "loss": 0.1009, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.01967158354818821, - "rewards/margins": 8.676813125610352, - "rewards/rejected": -8.69648551940918, + "epoch": 1.61, + "learning_rate": 2.578445355678374e-07, + "logits/chosen": -2.5823750495910645, + "logits/rejected": -2.5110130310058594, + "logps/chosen": -182.38796997070312, + "logps/rejected": -244.4145050048828, + "loss": 0.0964, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.88779878616333, + "rewards/margins": 6.077536582946777, + "rewards/rejected": -8.96533489227295, "step": 6680 }, { - "epoch": 1.69, - "learning_rate": 2.423930343600786e-07, - "logits/chosen": -2.64151668548584, - "logits/rejected": -2.7272515296936035, - "logps/chosen": -250.64956665039062, - "logps/rejected": -479.2918395996094, - "loss": 0.2103, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.5805938243865967, - "rewards/margins": 9.652566909790039, - "rewards/rejected": -11.233160018920898, + "epoch": 1.61, + "learning_rate": 2.573988233196648e-07, + "logits/chosen": -2.469998836517334, + "logits/rejected": -2.479484796524048, + "logps/chosen": -214.8350067138672, + "logps/rejected": -365.8271179199219, + "loss": 0.081, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9874442219734192, + "rewards/margins": 10.283195495605469, + "rewards/rejected": -11.270639419555664, "step": 6690 }, { - "epoch": 1.69, - "learning_rate": 2.4192491339762195e-07, - "logits/chosen": -2.4825167655944824, - "logits/rejected": -2.4715609550476074, - "logps/chosen": -286.91864013671875, - "logps/rejected": -352.4754638671875, - "loss": 0.0877, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.8185830116271973, - "rewards/margins": 6.200232028961182, - "rewards/rejected": -9.018815040588379, + "epoch": 1.61, + "learning_rate": 2.5695311107149227e-07, + "logits/chosen": -2.6444027423858643, + "logits/rejected": -2.5192294120788574, + "logps/chosen": -321.697021484375, + "logps/rejected": -395.98309326171875, + "loss": 0.0842, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6234207153320312, + "rewards/margins": 10.761842727661133, + "rewards/rejected": -12.385263442993164, "step": 6700 }, { - "epoch": 1.7, - "learning_rate": 2.4145679243516524e-07, - "logits/chosen": -2.6975979804992676, - "logits/rejected": -2.7159650325775146, - "logps/chosen": -239.55581665039062, - "logps/rejected": -321.30322265625, - "loss": 0.0949, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.047037959098816, - "rewards/margins": 6.473003387451172, - "rewards/rejected": -7.520041465759277, - "step": 6710 + "epoch": 1.61, + "eval_logits/chosen": -2.437563180923462, + "eval_logits/rejected": -2.387695074081421, + "eval_logps/chosen": -247.15597534179688, + "eval_logps/rejected": -265.2489318847656, + "eval_loss": 0.5395439863204956, + "eval_rewards/accuracies": 0.6800000071525574, + "eval_rewards/chosen": -5.119494438171387, + "eval_rewards/margins": 2.9660897254943848, + "eval_rewards/rejected": -8.085583686828613, + "eval_runtime": 131.9703, + "eval_samples_per_second": 23.914, + "eval_steps_per_second": 0.379, + "step": 6700 }, { - "epoch": 1.7, - "learning_rate": 2.4098867147270853e-07, - "logits/chosen": -2.7223079204559326, - "logits/rejected": -2.674180746078491, - "logps/chosen": -336.23590087890625, - "logps/rejected": -390.67169189453125, - "loss": 0.0824, + "epoch": 1.61, + "learning_rate": 2.565073988233197e-07, + "logits/chosen": -2.613145351409912, + "logits/rejected": -2.488893508911133, + "logps/chosen": -184.1219024658203, + "logps/rejected": -279.33843994140625, + "loss": 0.1074, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.45447224378585815, - "rewards/margins": 10.92662239074707, - "rewards/rejected": -10.472149848937988, + "rewards/chosen": -1.9139735698699951, + "rewards/margins": 7.507667541503906, + "rewards/rejected": -9.42164134979248, + "step": 6710 + }, + { + "epoch": 1.62, + "learning_rate": 2.560616865751471e-07, + "logits/chosen": -2.7501156330108643, + "logits/rejected": -2.743370532989502, + "logps/chosen": -293.50408935546875, + "logps/rejected": -349.14111328125, + "loss": 0.0798, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.18803198635578156, + "rewards/margins": 8.618574142456055, + "rewards/rejected": -8.806605339050293, "step": 6720 }, { - "epoch": 1.7, - "learning_rate": 2.405205505102518e-07, - "logits/chosen": -2.695161819458008, - "logits/rejected": -2.6044206619262695, - "logps/chosen": -287.9726867675781, - "logps/rejected": -309.17095947265625, - "loss": 0.0571, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6049206852912903, - "rewards/margins": 7.56399393081665, - "rewards/rejected": -8.168913841247559, + "epoch": 1.62, + "learning_rate": 2.5561597432697453e-07, + "logits/chosen": -2.7439446449279785, + "logits/rejected": -2.630293130874634, + "logps/chosen": -269.8843688964844, + "logps/rejected": -380.7567138671875, + "loss": 0.0727, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.01217429619282484, + "rewards/margins": 9.621479988098145, + "rewards/rejected": -9.633654594421387, "step": 6730 }, { - "epoch": 1.7, - "learning_rate": 2.4005242954779516e-07, - "logits/chosen": -2.5688953399658203, - "logits/rejected": -2.490004062652588, - "logps/chosen": -186.72337341308594, - "logps/rejected": -260.4896545410156, - "loss": 0.1629, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.2096558809280396, - "rewards/margins": 6.041118144989014, - "rewards/rejected": -7.2507734298706055, + "epoch": 1.62, + "learning_rate": 2.5517026207880194e-07, + "logits/chosen": -2.7878904342651367, + "logits/rejected": -2.5107624530792236, + "logps/chosen": -343.46453857421875, + "logps/rejected": -269.94403076171875, + "loss": 0.1026, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.569779098033905, + "rewards/margins": 6.804880619049072, + "rewards/rejected": -7.374659061431885, "step": 6740 }, { - "epoch": 1.71, - "learning_rate": 2.3958430858533845e-07, - "logits/chosen": -2.559790849685669, - "logits/rejected": -2.5062003135681152, - "logps/chosen": -243.90713500976562, - "logps/rejected": -420.903564453125, - "loss": 0.1199, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.14399679005146027, - "rewards/margins": 12.745423316955566, - "rewards/rejected": -12.60142707824707, + "epoch": 1.62, + "learning_rate": 2.5472454983062934e-07, + "logits/chosen": -2.44608211517334, + "logits/rejected": -2.4434010982513428, + "logps/chosen": -288.37237548828125, + "logps/rejected": -319.5035705566406, + "loss": 0.1536, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0117526054382324, + "rewards/margins": 5.843101978302002, + "rewards/rejected": -7.854853630065918, "step": 6750 }, { - "epoch": 1.71, - "learning_rate": 2.3911618762288174e-07, - "logits/chosen": -2.6574254035949707, - "logits/rejected": -2.565380811691284, - "logps/chosen": -222.82992553710938, - "logps/rejected": -308.78271484375, - "loss": 0.086, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.3074750900268555, - "rewards/margins": 6.395266056060791, - "rewards/rejected": -7.7027411460876465, + "epoch": 1.63, + "learning_rate": 2.542788375824568e-07, + "logits/chosen": -2.7693092823028564, + "logits/rejected": -2.6921029090881348, + "logps/chosen": -288.9019470214844, + "logps/rejected": -323.41986083984375, + "loss": 0.158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9345978498458862, + "rewards/margins": 6.151586532592773, + "rewards/rejected": -7.086184501647949, "step": 6760 }, { - "epoch": 1.71, - "learning_rate": 2.3864806666042503e-07, - "logits/chosen": -2.383507490158081, - "logits/rejected": -2.495840311050415, - "logps/chosen": -289.81036376953125, - "logps/rejected": -347.7547302246094, - "loss": 0.0574, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.4515106678009033, - "rewards/margins": 7.600310325622559, - "rewards/rejected": -9.051820755004883, + "epoch": 1.63, + "learning_rate": 2.538331253342842e-07, + "logits/chosen": -2.6897239685058594, + "logits/rejected": -2.558520793914795, + "logps/chosen": -259.8335876464844, + "logps/rejected": -293.4278869628906, + "loss": 0.0792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43900036811828613, + "rewards/margins": 8.149229049682617, + "rewards/rejected": -8.588228225708008, "step": 6770 }, { - "epoch": 1.71, - "learning_rate": 2.3817994569796835e-07, - "logits/chosen": -2.4468441009521484, - "logits/rejected": -2.4855294227600098, - "logps/chosen": -219.2752685546875, - "logps/rejected": -316.3480529785156, - "loss": 0.0904, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.7793846130371094, - "rewards/margins": 5.968520641326904, - "rewards/rejected": -9.747904777526855, + "epoch": 1.63, + "learning_rate": 2.533874130861116e-07, + "logits/chosen": -2.66084623336792, + "logits/rejected": -2.683711290359497, + "logps/chosen": -256.7698059082031, + "logps/rejected": -320.4084167480469, + "loss": 0.1194, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.19505897164344788, + "rewards/margins": 6.576790809631348, + "rewards/rejected": -6.381731986999512, "step": 6780 }, { - "epoch": 1.72, - "learning_rate": 2.3771182473551166e-07, - "logits/chosen": -2.5500540733337402, - "logits/rejected": -2.5283660888671875, - "logps/chosen": -210.5634307861328, - "logps/rejected": -265.06378173828125, - "loss": 0.1372, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.719752311706543, - "rewards/margins": 7.075891017913818, - "rewards/rejected": -8.79564380645752, + "epoch": 1.63, + "learning_rate": 2.5294170083793906e-07, + "logits/chosen": -2.833439826965332, + "logits/rejected": -2.8388583660125732, + "logps/chosen": -298.8921203613281, + "logps/rejected": -324.91180419921875, + "loss": 0.0907, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9625449180603027, + "rewards/margins": 6.888250827789307, + "rewards/rejected": -9.85079574584961, "step": 6790 }, { - "epoch": 1.72, - "learning_rate": 2.3724370377305495e-07, - "logits/chosen": -2.453866958618164, - "logits/rejected": -2.2946243286132812, - "logps/chosen": -305.92694091796875, - "logps/rejected": -296.0342712402344, - "loss": 0.0778, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.159287691116333, - "rewards/margins": 6.641240119934082, - "rewards/rejected": -8.800528526306152, + "epoch": 1.64, + "learning_rate": 2.5249598858976646e-07, + "logits/chosen": -2.675136089324951, + "logits/rejected": -2.584604024887085, + "logps/chosen": -316.5179443359375, + "logps/rejected": -334.19189453125, + "loss": 0.105, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8175735473632812, + "rewards/margins": 10.322064399719238, + "rewards/rejected": -9.504490852355957, "step": 6800 }, { - "epoch": 1.72, - "learning_rate": 2.3677558281059827e-07, - "logits/chosen": -2.5608479976654053, - "logits/rejected": -2.425877571105957, - "logps/chosen": -310.08770751953125, - "logps/rejected": -382.2318420410156, - "loss": 0.0921, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.0401217937469482, - "rewards/margins": 7.864357948303223, - "rewards/rejected": -9.904478073120117, + "epoch": 1.64, + "eval_logits/chosen": -2.4323160648345947, + "eval_logits/rejected": -2.379817008972168, + "eval_logps/chosen": -245.3402862548828, + "eval_logps/rejected": -261.9502868652344, + "eval_loss": 0.5422552824020386, + "eval_rewards/accuracies": 0.6775000095367432, + "eval_rewards/chosen": -4.937926769256592, + "eval_rewards/margins": 2.8177926540374756, + "eval_rewards/rejected": -7.755719184875488, + "eval_runtime": 131.982, + "eval_samples_per_second": 23.912, + "eval_steps_per_second": 0.379, + "step": 6800 + }, + { + "epoch": 1.64, + "learning_rate": 2.5205027634159386e-07, + "logits/chosen": -2.7946159839630127, + "logits/rejected": -2.5344414710998535, + "logps/chosen": -260.5481872558594, + "logps/rejected": -261.27069091796875, + "loss": 0.1107, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9300878047943115, + "rewards/margins": 5.995010852813721, + "rewards/rejected": -7.9250993728637695, "step": 6810 }, { - "epoch": 1.72, - "learning_rate": 2.3630746184814153e-07, - "logits/chosen": -2.493360996246338, - "logits/rejected": -2.489497661590576, - "logps/chosen": -265.77734375, - "logps/rejected": -298.32769775390625, - "loss": 0.0836, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.8773454427719116, - "rewards/margins": 7.0327863693237305, - "rewards/rejected": -8.910131454467773, + "epoch": 1.64, + "learning_rate": 2.5160456409342126e-07, + "logits/chosen": -2.816343069076538, + "logits/rejected": -2.7874741554260254, + "logps/chosen": -328.5304870605469, + "logps/rejected": -323.9268798828125, + "loss": 0.0989, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01662883721292019, + "rewards/margins": 7.812521934509277, + "rewards/rejected": -7.8291497230529785, "step": 6820 }, { - "epoch": 1.73, - "learning_rate": 2.3583934088568485e-07, - "logits/chosen": -2.4660093784332275, - "logits/rejected": -2.43588924407959, - "logps/chosen": -270.72589111328125, - "logps/rejected": -375.72332763671875, - "loss": 0.0781, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8436657786369324, - "rewards/margins": 8.026527404785156, - "rewards/rejected": -8.870193481445312, + "epoch": 1.64, + "learning_rate": 2.511588518452487e-07, + "logits/chosen": -2.795189142227173, + "logits/rejected": -2.6112332344055176, + "logps/chosen": -292.189697265625, + "logps/rejected": -342.4793395996094, + "loss": 0.097, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.248765230178833, + "rewards/margins": 6.475526332855225, + "rewards/rejected": -8.724291801452637, "step": 6830 }, { - "epoch": 1.73, - "learning_rate": 2.3537121992322814e-07, - "logits/chosen": -2.517906665802002, - "logits/rejected": -2.3145463466644287, - "logps/chosen": -289.3856506347656, - "logps/rejected": -351.4459533691406, - "loss": 0.0948, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.6315228939056396, - "rewards/margins": 7.835135459899902, - "rewards/rejected": -8.466657638549805, + "epoch": 1.65, + "learning_rate": 2.507131395970761e-07, + "logits/chosen": -2.750699758529663, + "logits/rejected": -2.7412402629852295, + "logps/chosen": -281.0845642089844, + "logps/rejected": -394.5185852050781, + "loss": 0.1152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8283578753471375, + "rewards/margins": 9.283575057983398, + "rewards/rejected": -10.111932754516602, "step": 6840 }, { - "epoch": 1.73, - "learning_rate": 2.3490309896077145e-07, - "logits/chosen": -2.646665573120117, - "logits/rejected": -2.6209945678710938, - "logps/chosen": -337.2581481933594, - "logps/rejected": -352.4207458496094, - "loss": 0.1138, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.6701911687850952, - "rewards/margins": 5.531129360198975, - "rewards/rejected": -7.201320648193359, + "epoch": 1.65, + "learning_rate": 2.502674273489035e-07, + "logits/chosen": -2.7499070167541504, + "logits/rejected": -2.585718870162964, + "logps/chosen": -332.30810546875, + "logps/rejected": -342.9530334472656, + "loss": 0.094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08875688910484314, + "rewards/margins": 8.316621780395508, + "rewards/rejected": -8.405378341674805, "step": 6850 }, { - "epoch": 1.73, - "learning_rate": 2.3443497799831474e-07, - "logits/chosen": -2.38620924949646, - "logits/rejected": -2.3328304290771484, - "logps/chosen": -249.17434692382812, - "logps/rejected": -329.9270324707031, - "loss": 0.0697, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7047498226165771, - "rewards/margins": 8.854387283325195, - "rewards/rejected": -10.559137344360352, + "epoch": 1.65, + "learning_rate": 2.49821715100731e-07, + "logits/chosen": -2.8606324195861816, + "logits/rejected": -2.656543254852295, + "logps/chosen": -276.4691162109375, + "logps/rejected": -339.7984924316406, + "loss": 0.0539, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.13015088438987732, + "rewards/margins": 9.504716873168945, + "rewards/rejected": -9.374567031860352, "step": 6860 }, { - "epoch": 1.74, - "learning_rate": 2.3396685703585806e-07, - "logits/chosen": -2.4927468299865723, - "logits/rejected": -2.560324192047119, - "logps/chosen": -180.02432250976562, - "logps/rejected": -261.7504577636719, - "loss": 0.11, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.4752345085144043, - "rewards/margins": 5.783717155456543, - "rewards/rejected": -8.258952140808105, + "epoch": 1.65, + "learning_rate": 2.493760028525584e-07, + "logits/chosen": -2.5060436725616455, + "logits/rejected": -2.4398794174194336, + "logps/chosen": -209.08154296875, + "logps/rejected": -266.82379150390625, + "loss": 0.0799, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0843982696533203, + "rewards/margins": 6.8950676918029785, + "rewards/rejected": -7.979465484619141, "step": 6870 }, { - "epoch": 1.74, - "learning_rate": 2.3349873607340137e-07, - "logits/chosen": -2.5648436546325684, - "logits/rejected": -2.3717598915100098, - "logps/chosen": -277.2408447265625, - "logps/rejected": -287.49285888671875, - "loss": 0.0593, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.4586201310157776, - "rewards/margins": 6.640373229980469, - "rewards/rejected": -7.09899377822876, + "epoch": 1.66, + "learning_rate": 2.489302906043858e-07, + "logits/chosen": -2.77506685256958, + "logits/rejected": -2.7071878910064697, + "logps/chosen": -270.93707275390625, + "logps/rejected": -407.3302917480469, + "loss": 0.129, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.7129808664321899, + "rewards/margins": 9.967083930969238, + "rewards/rejected": -9.254103660583496, "step": 6880 }, { - "epoch": 1.74, - "learning_rate": 2.3303061511094466e-07, - "logits/chosen": -2.6955389976501465, - "logits/rejected": -2.5326249599456787, - "logps/chosen": -327.55389404296875, - "logps/rejected": -347.0709533691406, - "loss": 0.0627, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5011809468269348, - "rewards/margins": 8.189516067504883, - "rewards/rejected": -8.690695762634277, + "epoch": 1.66, + "learning_rate": 2.4848457835621324e-07, + "logits/chosen": -2.708089828491211, + "logits/rejected": -2.60959792137146, + "logps/chosen": -229.55307006835938, + "logps/rejected": -349.6488342285156, + "loss": 0.0791, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6899232268333435, + "rewards/margins": 9.56570816040039, + "rewards/rejected": -10.255631446838379, "step": 6890 }, { - "epoch": 1.74, - "learning_rate": 2.3256249414848798e-07, - "logits/chosen": -2.476616382598877, - "logits/rejected": -2.397434949874878, - "logps/chosen": -242.52053833007812, - "logps/rejected": -335.563232421875, - "loss": 0.194, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.2094663381576538, - "rewards/margins": 9.379359245300293, - "rewards/rejected": -10.588825225830078, + "epoch": 1.66, + "learning_rate": 2.4803886610804065e-07, + "logits/chosen": -2.68363618850708, + "logits/rejected": -2.667495012283325, + "logps/chosen": -189.0015869140625, + "logps/rejected": -308.7129821777344, + "loss": 0.086, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6877439618110657, + "rewards/margins": 7.841957092285156, + "rewards/rejected": -8.529701232910156, "step": 6900 }, { - "epoch": 1.75, - "learning_rate": 2.3209437318603127e-07, - "logits/chosen": -2.455580234527588, - "logits/rejected": -2.6129374504089355, - "logps/chosen": -234.93679809570312, - "logps/rejected": -267.9778137207031, - "loss": 0.0831, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.7409460544586182, - "rewards/margins": 6.319588661193848, - "rewards/rejected": -7.060534477233887, + "epoch": 1.66, + "eval_logits/chosen": -2.4382877349853516, + "eval_logits/rejected": -2.387030601501465, + "eval_logps/chosen": -239.5587921142578, + "eval_logps/rejected": -255.54940795898438, + "eval_loss": 0.53505939245224, + "eval_rewards/accuracies": 0.6775000095367432, + "eval_rewards/chosen": -4.359776020050049, + "eval_rewards/margins": 2.755856990814209, + "eval_rewards/rejected": -7.1156325340271, + "eval_runtime": 132.0588, + "eval_samples_per_second": 23.898, + "eval_steps_per_second": 0.379, + "step": 6900 + }, + { + "epoch": 1.66, + "learning_rate": 2.4759315385986805e-07, + "logits/chosen": -2.6302647590637207, + "logits/rejected": -2.7092225551605225, + "logps/chosen": -215.89694213867188, + "logps/rejected": -337.82781982421875, + "loss": 0.1225, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9565473794937134, + "rewards/margins": 5.960141181945801, + "rewards/rejected": -6.916687965393066, "step": 6910 }, { - "epoch": 1.75, - "learning_rate": 2.3162625222357456e-07, - "logits/chosen": -2.249671220779419, - "logits/rejected": -2.421679735183716, - "logps/chosen": -220.7417449951172, - "logps/rejected": -348.41351318359375, - "loss": 0.1057, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.4988008141517639, - "rewards/margins": 9.099369049072266, - "rewards/rejected": -9.598170280456543, + "epoch": 1.67, + "learning_rate": 2.471474416116955e-07, + "logits/chosen": -2.4610588550567627, + "logits/rejected": -2.3604302406311035, + "logps/chosen": -242.921142578125, + "logps/rejected": -317.09271240234375, + "loss": 0.1893, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.0648877620697021, + "rewards/margins": 8.815712928771973, + "rewards/rejected": -9.880599975585938, "step": 6920 }, { - "epoch": 1.75, - "learning_rate": 2.3115813126111785e-07, - "logits/chosen": -2.5640151500701904, - "logits/rejected": -2.563455820083618, - "logps/chosen": -228.15603637695312, - "logps/rejected": -277.2568359375, - "loss": 0.1121, + "epoch": 1.67, + "learning_rate": 2.467017293635229e-07, + "logits/chosen": -2.8157455921173096, + "logits/rejected": -2.7946114540100098, + "logps/chosen": -241.2842559814453, + "logps/rejected": -349.528076171875, + "loss": 0.0827, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.2560606002807617, - "rewards/margins": 6.815365791320801, - "rewards/rejected": -7.071427345275879, + "rewards/chosen": -0.3087575435638428, + "rewards/margins": 6.308024883270264, + "rewards/rejected": -6.616782188415527, "step": 6930 }, + { + "epoch": 1.67, + "learning_rate": 2.462560171153503e-07, + "logits/chosen": -2.608351469039917, + "logits/rejected": -2.61462140083313, + "logps/chosen": -290.62713623046875, + "logps/rejected": -346.190185546875, + "loss": 0.0956, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.45347294211387634, + "rewards/margins": 9.4335355758667, + "rewards/rejected": -8.980062484741211, + "step": 6940 + }, + { + "epoch": 1.67, + "learning_rate": 2.4581030486717777e-07, + "logits/chosen": -2.402801513671875, + "logits/rejected": -2.337174892425537, + "logps/chosen": -328.09173583984375, + "logps/rejected": -392.1811828613281, + "loss": 0.1539, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8712444305419922, + "rewards/margins": 9.416337013244629, + "rewards/rejected": -8.545092582702637, + "step": 6950 + }, + { + "epoch": 1.68, + "learning_rate": 2.4536459261900517e-07, + "logits/chosen": -2.7248482704162598, + "logits/rejected": -2.81314754486084, + "logps/chosen": -230.143798828125, + "logps/rejected": -327.3935546875, + "loss": 0.1006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28216123580932617, + "rewards/margins": 8.926961898803711, + "rewards/rejected": -9.209123611450195, + "step": 6960 + }, + { + "epoch": 1.68, + "learning_rate": 2.4491888037083257e-07, + "logits/chosen": -2.5695297718048096, + "logits/rejected": -2.5491271018981934, + "logps/chosen": -337.0064392089844, + "logps/rejected": -405.1397705078125, + "loss": 0.1027, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8940232992172241, + "rewards/margins": 9.002965927124023, + "rewards/rejected": -10.896989822387695, + "step": 6970 + }, + { + "epoch": 1.68, + "learning_rate": 2.4447316812266e-07, + "logits/chosen": -2.737940549850464, + "logits/rejected": -2.690535068511963, + "logps/chosen": -238.6956024169922, + "logps/rejected": -239.56362915039062, + "loss": 0.1125, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.32410043478012085, + "rewards/margins": 6.964704990386963, + "rewards/rejected": -6.6406049728393555, + "step": 6980 + }, + { + "epoch": 1.68, + "learning_rate": 2.4402745587448743e-07, + "logits/chosen": -2.668504238128662, + "logits/rejected": -2.614301919937134, + "logps/chosen": -276.4830322265625, + "logps/rejected": -332.14630126953125, + "loss": 0.0797, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.353655219078064, + "rewards/margins": 5.739809989929199, + "rewards/rejected": -7.093465328216553, + "step": 6990 + }, + { + "epoch": 1.68, + "learning_rate": 2.4358174362631483e-07, + "logits/chosen": -2.592217206954956, + "logits/rejected": -2.517693042755127, + "logps/chosen": -240.7083740234375, + "logps/rejected": -329.5331726074219, + "loss": 0.0622, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.42312636971473694, + "rewards/margins": 7.415658473968506, + "rewards/rejected": -7.838784694671631, + "step": 7000 + }, + { + "epoch": 1.68, + "eval_logits/chosen": -2.4779374599456787, + "eval_logits/rejected": -2.427616834640503, + "eval_logps/chosen": -242.79150390625, + "eval_logps/rejected": -260.9709777832031, + "eval_loss": 0.5394036173820496, + "eval_rewards/accuracies": 0.6825000047683716, + "eval_rewards/chosen": -4.68304967880249, + "eval_rewards/margins": 2.9747402667999268, + "eval_rewards/rejected": -7.657789707183838, + "eval_runtime": 132.0975, + "eval_samples_per_second": 23.891, + "eval_steps_per_second": 0.379, + "step": 7000 + }, + { + "epoch": 1.69, + "learning_rate": 2.4313603137814224e-07, + "logits/chosen": -2.8185973167419434, + "logits/rejected": -2.6999199390411377, + "logps/chosen": -291.8052978515625, + "logps/rejected": -264.32916259765625, + "loss": 0.1165, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.03381216526031494, + "rewards/margins": 8.385944366455078, + "rewards/rejected": -8.419755935668945, + "step": 7010 + }, + { + "epoch": 1.69, + "learning_rate": 2.426903191299697e-07, + "logits/chosen": -2.7404980659484863, + "logits/rejected": -2.616826295852661, + "logps/chosen": -276.4908142089844, + "logps/rejected": -291.42218017578125, + "loss": 0.1021, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.044650495052337646, + "rewards/margins": 6.940688133239746, + "rewards/rejected": -6.896038055419922, + "step": 7020 + }, + { + "epoch": 1.69, + "learning_rate": 2.422446068817971e-07, + "logits/chosen": -2.731682062149048, + "logits/rejected": -2.604588508605957, + "logps/chosen": -261.1102600097656, + "logps/rejected": -298.77288818359375, + "loss": 0.1021, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5248648524284363, + "rewards/margins": 6.572853088378906, + "rewards/rejected": -7.09771728515625, + "step": 7030 + }, + { + "epoch": 1.69, + "learning_rate": 2.417988946336245e-07, + "logits/chosen": -2.7587058544158936, + "logits/rejected": -2.689257860183716, + "logps/chosen": -249.2584228515625, + "logps/rejected": -367.74566650390625, + "loss": 0.0645, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.074500322341919, + "rewards/margins": 9.59827995300293, + "rewards/rejected": -8.523778915405273, + "step": 7040 + }, + { + "epoch": 1.7, + "learning_rate": 2.4135318238545195e-07, + "logits/chosen": -2.8093748092651367, + "logits/rejected": -2.6664083003997803, + "logps/chosen": -302.90234375, + "logps/rejected": -379.04754638671875, + "loss": 0.0614, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6072776913642883, + "rewards/margins": 9.818187713623047, + "rewards/rejected": -10.425464630126953, + "step": 7050 + }, + { + "epoch": 1.7, + "learning_rate": 2.4090747013727936e-07, + "logits/chosen": -2.57035493850708, + "logits/rejected": -2.575605869293213, + "logps/chosen": -340.317138671875, + "logps/rejected": -327.8518981933594, + "loss": 0.1268, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.0726079940795898, + "rewards/margins": 7.638877868652344, + "rewards/rejected": -8.711485862731934, + "step": 7060 + }, + { + "epoch": 1.7, + "learning_rate": 2.4046175788910676e-07, + "logits/chosen": -2.763205051422119, + "logits/rejected": -2.729142904281616, + "logps/chosen": -233.8443603515625, + "logps/rejected": -353.21868896484375, + "loss": 0.1626, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9505976438522339, + "rewards/margins": 6.790528297424316, + "rewards/rejected": -7.74112606048584, + "step": 7070 + }, + { + "epoch": 1.7, + "learning_rate": 2.400160456409342e-07, + "logits/chosen": -2.7113826274871826, + "logits/rejected": -2.669304609298706, + "logps/chosen": -201.74424743652344, + "logps/rejected": -313.32342529296875, + "loss": 0.1585, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4821749925613403, + "rewards/margins": 7.0733819007873535, + "rewards/rejected": -8.555557250976562, + "step": 7080 + }, + { + "epoch": 1.71, + "learning_rate": 2.395703333927616e-07, + "logits/chosen": -2.6256930828094482, + "logits/rejected": -2.663498878479004, + "logps/chosen": -200.3671417236328, + "logps/rejected": -307.119140625, + "loss": 0.1288, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.09732379764318466, + "rewards/margins": 7.844536781311035, + "rewards/rejected": -7.941859245300293, + "step": 7090 + }, + { + "epoch": 1.71, + "learning_rate": 2.39124621144589e-07, + "logits/chosen": -2.709343671798706, + "logits/rejected": -2.6565096378326416, + "logps/chosen": -224.8814239501953, + "logps/rejected": -304.5251770019531, + "loss": 0.0973, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.959080696105957, + "rewards/margins": 9.313790321350098, + "rewards/rejected": -8.354707717895508, + "step": 7100 + }, + { + "epoch": 1.71, + "eval_logits/chosen": -2.3563954830169678, + "eval_logits/rejected": -2.300964593887329, + "eval_logps/chosen": -243.43643188476562, + "eval_logps/rejected": -260.9595947265625, + "eval_loss": 0.53191077709198, + "eval_rewards/accuracies": 0.675000011920929, + "eval_rewards/chosen": -4.747539520263672, + "eval_rewards/margins": 2.909111261367798, + "eval_rewards/rejected": -7.656650543212891, + "eval_runtime": 132.1254, + "eval_samples_per_second": 23.886, + "eval_steps_per_second": 0.378, + "step": 7100 + }, + { + "epoch": 1.71, + "learning_rate": 2.386789088964165e-07, + "logits/chosen": -2.679600238800049, + "logits/rejected": -2.644308090209961, + "logps/chosen": -314.29693603515625, + "logps/rejected": -346.9539489746094, + "loss": 0.1259, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9009653329849243, + "rewards/margins": 6.139172077178955, + "rewards/rejected": -7.040136814117432, + "step": 7110 + }, + { + "epoch": 1.71, + "learning_rate": 2.3823319664824388e-07, + "logits/chosen": -2.5172038078308105, + "logits/rejected": -2.5531249046325684, + "logps/chosen": -205.6974639892578, + "logps/rejected": -293.9311828613281, + "loss": 0.079, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.874517560005188, + "rewards/margins": 6.828620910644531, + "rewards/rejected": -7.703138828277588, + "step": 7120 + }, + { + "epoch": 1.72, + "learning_rate": 2.3778748440007128e-07, + "logits/chosen": -2.7703957557678223, + "logits/rejected": -2.6555066108703613, + "logps/chosen": -311.3225402832031, + "logps/rejected": -278.5381164550781, + "loss": 0.0874, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5287644863128662, + "rewards/margins": 6.611566066741943, + "rewards/rejected": -7.140329837799072, + "step": 7130 + }, + { + "epoch": 1.72, + "learning_rate": 2.373417721518987e-07, + "logits/chosen": -2.551462411880493, + "logits/rejected": -2.5756757259368896, + "logps/chosen": -226.84228515625, + "logps/rejected": -322.23486328125, + "loss": 0.0998, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6650134325027466, + "rewards/margins": 8.317803382873535, + "rewards/rejected": -8.982815742492676, + "step": 7140 + }, + { + "epoch": 1.72, + "learning_rate": 2.3689605990372614e-07, + "logits/chosen": -2.7335731983184814, + "logits/rejected": -2.680335283279419, + "logps/chosen": -263.3839416503906, + "logps/rejected": -303.4400939941406, + "loss": 0.0873, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0720853805541992, + "rewards/margins": 7.760520935058594, + "rewards/rejected": -8.832606315612793, + "step": 7150 + }, + { + "epoch": 1.72, + "learning_rate": 2.3645034765555354e-07, + "logits/chosen": -2.6269562244415283, + "logits/rejected": -2.6114819049835205, + "logps/chosen": -189.7339324951172, + "logps/rejected": -291.15081787109375, + "loss": 0.1101, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2032740116119385, + "rewards/margins": 9.132551193237305, + "rewards/rejected": -10.335824966430664, + "step": 7160 + }, + { + "epoch": 1.73, + "learning_rate": 2.36004635407381e-07, + "logits/chosen": -2.6511261463165283, + "logits/rejected": -2.7067880630493164, + "logps/chosen": -255.1636199951172, + "logps/rejected": -348.8571472167969, + "loss": 0.12, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.509678065776825, + "rewards/margins": 8.13435173034668, + "rewards/rejected": -8.644031524658203, + "step": 7170 + }, + { + "epoch": 1.73, + "learning_rate": 2.3555892315920843e-07, + "logits/chosen": -2.7220358848571777, + "logits/rejected": -2.6488194465637207, + "logps/chosen": -247.86367797851562, + "logps/rejected": -248.39077758789062, + "loss": 0.1343, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1571508646011353, + "rewards/margins": 5.116828918457031, + "rewards/rejected": -6.273979187011719, + "step": 7180 + }, + { + "epoch": 1.73, + "learning_rate": 2.3511321091103583e-07, + "logits/chosen": -2.8051559925079346, + "logits/rejected": -2.735093832015991, + "logps/chosen": -283.8224792480469, + "logps/rejected": -429.597900390625, + "loss": 0.159, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8221418857574463, + "rewards/margins": 8.526994705200195, + "rewards/rejected": -10.349136352539062, + "step": 7190 + }, + { + "epoch": 1.73, + "learning_rate": 2.3466749866286326e-07, + "logits/chosen": -2.7978286743164062, + "logits/rejected": -2.6827690601348877, + "logps/chosen": -375.71368408203125, + "logps/rejected": -385.3860778808594, + "loss": 0.1052, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.14973331987857819, + "rewards/margins": 8.908220291137695, + "rewards/rejected": -9.057953834533691, + "step": 7200 + }, + { + "epoch": 1.73, + "eval_logits/chosen": -2.4200572967529297, + "eval_logits/rejected": -2.3696177005767822, + "eval_logps/chosen": -241.93289184570312, + "eval_logps/rejected": -259.7779235839844, + "eval_loss": 0.5284144282341003, + "eval_rewards/accuracies": 0.675000011920929, + "eval_rewards/chosen": -4.597188949584961, + "eval_rewards/margins": 2.9412925243377686, + "eval_rewards/rejected": -7.53848123550415, + "eval_runtime": 132.3526, + "eval_samples_per_second": 23.845, + "eval_steps_per_second": 0.378, + "step": 7200 + }, + { + "epoch": 1.74, + "learning_rate": 2.3422178641469066e-07, + "logits/chosen": -2.5901970863342285, + "logits/rejected": -2.5686583518981934, + "logps/chosen": -242.0441436767578, + "logps/rejected": -297.9270935058594, + "loss": 0.0942, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8504171371459961, + "rewards/margins": 7.879048824310303, + "rewards/rejected": -8.729467391967773, + "step": 7210 + }, + { + "epoch": 1.74, + "learning_rate": 2.337760741665181e-07, + "logits/chosen": -2.6589884757995605, + "logits/rejected": -2.6443393230438232, + "logps/chosen": -304.5870361328125, + "logps/rejected": -413.0279235839844, + "loss": 0.0447, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11173856258392334, + "rewards/margins": 9.459914207458496, + "rewards/rejected": -9.57165241241455, + "step": 7220 + }, + { + "epoch": 1.74, + "learning_rate": 2.3333036191834552e-07, + "logits/chosen": -2.58132004737854, + "logits/rejected": -2.5422065258026123, + "logps/chosen": -203.72744750976562, + "logps/rejected": -239.21908569335938, + "loss": 0.1121, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9320341348648071, + "rewards/margins": 4.503114223480225, + "rewards/rejected": -6.435147762298584, + "step": 7230 + }, + { + "epoch": 1.74, + "learning_rate": 2.3288464967017293e-07, + "logits/chosen": -2.790097951889038, + "logits/rejected": -2.5973305702209473, + "logps/chosen": -295.18780517578125, + "logps/rejected": -394.1920471191406, + "loss": 0.1008, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9543908834457397, + "rewards/margins": 6.669290065765381, + "rewards/rejected": -8.62368106842041, + "step": 7240 + }, + { + "epoch": 1.74, + "learning_rate": 2.3243893742200035e-07, + "logits/chosen": -2.57969331741333, + "logits/rejected": -2.6340432167053223, + "logps/chosen": -299.4039611816406, + "logps/rejected": -323.65057373046875, + "loss": 0.1135, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.460325241088867, + "rewards/margins": 4.559673309326172, + "rewards/rejected": -7.019999027252197, + "step": 7250 + }, { "epoch": 1.75, - "learning_rate": 2.3069001029866116e-07, - "logits/chosen": -2.3587443828582764, - "logits/rejected": -2.3043508529663086, - "logps/chosen": -257.75775146484375, - "logps/rejected": -232.22909545898438, - "loss": 0.104, + "learning_rate": 2.3199322517382778e-07, + "logits/chosen": -2.7026455402374268, + "logits/rejected": -2.6360459327697754, + "logps/chosen": -269.4342041015625, + "logps/rejected": -352.17724609375, + "loss": 0.1301, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.49799785017967224, + "rewards/margins": 8.591739654541016, + "rewards/rejected": -9.089736938476562, + "step": 7260 + }, + { + "epoch": 1.75, + "learning_rate": 2.315475129256552e-07, + "logits/chosen": -2.6452369689941406, + "logits/rejected": -2.6692497730255127, + "logps/chosen": -222.3269500732422, + "logps/rejected": -343.4007568359375, + "loss": 0.0813, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8904783725738525, + "rewards/margins": 6.369973182678223, + "rewards/rejected": -8.260451316833496, + "step": 7270 + }, + { + "epoch": 1.75, + "learning_rate": 2.3110180067748262e-07, + "logits/chosen": -2.5933916568756104, + "logits/rejected": -2.5813040733337402, + "logps/chosen": -229.5352783203125, + "logps/rejected": -265.1120300292969, + "loss": 0.1, "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.6903194189071655, - "rewards/margins": 5.616563320159912, - "rewards/rejected": -7.306881904602051, - "step": 6940 + "rewards/chosen": -2.5162527561187744, + "rewards/margins": 5.829333782196045, + "rewards/rejected": -8.345586776733398, + "step": 7280 + }, + { + "epoch": 1.75, + "learning_rate": 2.3065608842931002e-07, + "logits/chosen": -2.663539171218872, + "logits/rejected": -2.569728374481201, + "logps/chosen": -379.2259521484375, + "logps/rejected": -321.4938049316406, + "loss": 0.0587, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2845958471298218, + "rewards/margins": 6.981713771820068, + "rewards/rejected": -8.266308784484863, + "step": 7290 }, { "epoch": 1.76, - "learning_rate": 2.3022188933620445e-07, - "logits/chosen": -2.5451407432556152, - "logits/rejected": -2.4094347953796387, - "logps/chosen": -291.5643005371094, - "logps/rejected": -332.65875244140625, - "loss": 0.0361, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5125554800033569, - "rewards/margins": 7.864842414855957, - "rewards/rejected": -8.377397537231445, - "step": 6950 + "learning_rate": 2.3021037618113745e-07, + "logits/chosen": -2.3304195404052734, + "logits/rejected": -2.3825669288635254, + "logps/chosen": -381.06402587890625, + "logps/rejected": -330.9862365722656, + "loss": 0.0645, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.911083996295929, + "rewards/margins": 6.9651899337768555, + "rewards/rejected": -7.876273155212402, + "step": 7300 }, { "epoch": 1.76, - "learning_rate": 2.2975376837374777e-07, - "logits/chosen": -2.498363971710205, - "logits/rejected": -2.5168495178222656, - "logps/chosen": -276.8007507324219, - "logps/rejected": -270.8402099609375, - "loss": 0.0863, + "eval_logits/chosen": -2.3440210819244385, + "eval_logits/rejected": -2.2856647968292236, + "eval_logps/chosen": -245.7831268310547, + "eval_logps/rejected": -264.6048278808594, + "eval_loss": 0.5338801741600037, + "eval_rewards/accuracies": 0.6775000095367432, + "eval_rewards/chosen": -4.982211112976074, + "eval_rewards/margins": 3.038963556289673, + "eval_rewards/rejected": -8.021175384521484, + "eval_runtime": 132.1118, + "eval_samples_per_second": 23.889, + "eval_steps_per_second": 0.378, + "step": 7300 + }, + { + "epoch": 1.76, + "learning_rate": 2.2976466393296488e-07, + "logits/chosen": -2.5880045890808105, + "logits/rejected": -2.5642735958099365, + "logps/chosen": -233.44772338867188, + "logps/rejected": -290.04962158203125, + "loss": 0.1544, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.0414700023829937, - "rewards/margins": 7.689317226409912, - "rewards/rejected": -7.647847652435303, - "step": 6960 + "rewards/chosen": -1.418136477470398, + "rewards/margins": 7.5532941818237305, + "rewards/rejected": -8.971430778503418, + "step": 7310 }, { "epoch": 1.76, - "learning_rate": 2.2928564741129106e-07, - "logits/chosen": -2.657831907272339, - "logits/rejected": -2.4639506340026855, - "logps/chosen": -412.9681091308594, - "logps/rejected": -304.3277282714844, - "loss": 0.0825, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8967636227607727, - "rewards/margins": 6.674870491027832, - "rewards/rejected": -7.571633815765381, - "step": 6970 + "learning_rate": 2.2931895168479228e-07, + "logits/chosen": -2.384918212890625, + "logits/rejected": -2.425996780395508, + "logps/chosen": -277.6245422363281, + "logps/rejected": -367.5646667480469, + "loss": 0.1315, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.36100435256958, + "rewards/margins": 8.907506942749023, + "rewards/rejected": -12.268511772155762, + "step": 7320 }, { "epoch": 1.76, - "learning_rate": 2.2881752644883437e-07, - "logits/chosen": -2.576490640640259, - "logits/rejected": -2.5533933639526367, - "logps/chosen": -324.05462646484375, - "logps/rejected": -329.29339599609375, - "loss": 0.1431, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.7502419352531433, - "rewards/margins": 7.129467964172363, - "rewards/rejected": -7.8797101974487305, - "step": 6980 + "learning_rate": 2.288732394366197e-07, + "logits/chosen": -2.637505292892456, + "logits/rejected": -2.437636613845825, + "logps/chosen": -238.11196899414062, + "logps/rejected": -261.47930908203125, + "loss": 0.0928, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9223560094833374, + "rewards/margins": 7.016798496246338, + "rewards/rejected": -8.939155578613281, + "step": 7330 }, { "epoch": 1.77, - "learning_rate": 2.283494054863777e-07, - "logits/chosen": -2.5980420112609863, - "logits/rejected": -2.373439311981201, - "logps/chosen": -349.974365234375, - "logps/rejected": -433.7562561035156, - "loss": 0.0873, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7957502007484436, - "rewards/margins": 9.475018501281738, - "rewards/rejected": -8.679269790649414, - "step": 6990 + "learning_rate": 2.2842752718844714e-07, + "logits/chosen": -2.775888204574585, + "logits/rejected": -2.7374014854431152, + "logps/chosen": -305.4595642089844, + "logps/rejected": -322.2148132324219, + "loss": 0.1392, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8518146276473999, + "rewards/margins": 8.282339096069336, + "rewards/rejected": -9.134153366088867, + "step": 7340 }, { "epoch": 1.77, - "learning_rate": 2.2788128452392098e-07, - "logits/chosen": -2.405667543411255, - "logits/rejected": -2.24853515625, - "logps/chosen": -247.5191650390625, - "logps/rejected": -276.60723876953125, - "loss": 0.0855, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.1435565948486328, - "rewards/margins": 6.712456703186035, - "rewards/rejected": -7.856014251708984, - "step": 7000 + "learning_rate": 2.2798181494027454e-07, + "logits/chosen": -2.438192129135132, + "logits/rejected": -2.5132827758789062, + "logps/chosen": -264.43670654296875, + "logps/rejected": -291.0517578125, + "loss": 0.0705, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8488703966140747, + "rewards/margins": 7.184966087341309, + "rewards/rejected": -8.033838272094727, + "step": 7350 }, { "epoch": 1.77, - "learning_rate": 2.274131635614643e-07, - "logits/chosen": -2.345731258392334, - "logits/rejected": -2.2325825691223145, - "logps/chosen": -295.4849548339844, - "logps/rejected": -341.0472412109375, - "loss": 0.0638, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4678739011287689, - "rewards/margins": 7.9846062660217285, - "rewards/rejected": -7.516732215881348, - "step": 7010 + "learning_rate": 2.2753610269210197e-07, + "logits/chosen": -2.6082770824432373, + "logits/rejected": -2.5533294677734375, + "logps/chosen": -224.6868896484375, + "logps/rejected": -310.91815185546875, + "loss": 0.0888, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.13552884757518768, + "rewards/margins": 8.522917747497559, + "rewards/rejected": -8.658447265625, + "step": 7360 }, { "epoch": 1.77, - "learning_rate": 2.2694504259900756e-07, - "logits/chosen": -2.706740617752075, - "logits/rejected": -2.68070650100708, - "logps/chosen": -251.5078125, - "logps/rejected": -267.70477294921875, - "loss": 0.0968, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.2914077639579773, - "rewards/margins": 6.667860507965088, - "rewards/rejected": -6.959267616271973, - "step": 7020 + "learning_rate": 2.2709039044392937e-07, + "logits/chosen": -2.705390214920044, + "logits/rejected": -2.4593687057495117, + "logps/chosen": -224.9893035888672, + "logps/rejected": -311.37396240234375, + "loss": 0.0524, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.041104815900325775, + "rewards/margins": 9.800336837768555, + "rewards/rejected": -9.759233474731445, + "step": 7370 }, { "epoch": 1.78, - "learning_rate": 2.2647692163655087e-07, - "logits/chosen": -2.697540521621704, - "logits/rejected": -2.433387517929077, - "logps/chosen": -240.4569091796875, - "logps/rejected": -249.18798828125, - "loss": 0.0813, + "learning_rate": 2.266446781957568e-07, + "logits/chosen": -2.667921543121338, + "logits/rejected": -2.4788870811462402, + "logps/chosen": -251.2928466796875, + "logps/rejected": -257.45068359375, + "loss": 0.0899, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.07182397693395615, + "rewards/margins": 8.074679374694824, + "rewards/rejected": -8.00285530090332, + "step": 7380 + }, + { + "epoch": 1.78, + "learning_rate": 2.2619896594758423e-07, + "logits/chosen": -2.739666223526001, + "logits/rejected": -2.6384711265563965, + "logps/chosen": -359.79071044921875, + "logps/rejected": -391.1249694824219, + "loss": 0.0825, "rewards/accuracies": 1.0, - "rewards/chosen": -1.3658664226531982, - "rewards/margins": 5.591639518737793, - "rewards/rejected": -6.957505702972412, - "step": 7030 + "rewards/chosen": 1.4219744205474854, + "rewards/margins": 10.46330738067627, + "rewards/rejected": -9.041333198547363, + "step": 7390 }, { "epoch": 1.78, - "learning_rate": 2.2600880067409416e-07, - "logits/chosen": -2.5349862575531006, - "logits/rejected": -2.370959758758545, - "logps/chosen": -289.0957336425781, - "logps/rejected": -315.78192138671875, - "loss": 0.1125, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.2642221450805664, - "rewards/margins": 8.159451484680176, - "rewards/rejected": -9.423672676086426, - "step": 7040 + "learning_rate": 2.2575325369941164e-07, + "logits/chosen": -2.561511754989624, + "logits/rejected": -2.488967180252075, + "logps/chosen": -254.07925415039062, + "logps/rejected": -288.2048645019531, + "loss": 0.0923, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.937911033630371, + "rewards/margins": 5.389759540557861, + "rewards/rejected": -7.327670097351074, + "step": 7400 }, { "epoch": 1.78, - "learning_rate": 2.2554067971163748e-07, - "logits/chosen": -2.5603203773498535, - "logits/rejected": -2.5768723487854004, - "logps/chosen": -293.04669189453125, - "logps/rejected": -282.08416748046875, - "loss": 0.0597, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.6653440594673157, - "rewards/margins": 6.230250358581543, - "rewards/rejected": -6.895594120025635, - "step": 7050 + "eval_logits/chosen": -2.3150076866149902, + "eval_logits/rejected": -2.2563118934631348, + "eval_logps/chosen": -242.3295135498047, + "eval_logps/rejected": -261.0246276855469, + "eval_loss": 0.538532555103302, + "eval_rewards/accuracies": 0.6650000214576721, + "eval_rewards/chosen": -4.636850833892822, + "eval_rewards/margins": 3.0263025760650635, + "eval_rewards/rejected": -7.663153648376465, + "eval_runtime": 132.1446, + "eval_samples_per_second": 23.883, + "eval_steps_per_second": 0.378, + "step": 7400 }, { "epoch": 1.78, - "learning_rate": 2.2507255874918077e-07, - "logits/chosen": -2.4578702449798584, - "logits/rejected": -2.349565029144287, - "logps/chosen": -191.6605682373047, - "logps/rejected": -267.44464111328125, - "loss": 0.0556, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3502098023891449, - "rewards/margins": 9.301736831665039, - "rewards/rejected": -9.651947975158691, - "step": 7060 + "learning_rate": 2.2530754145123907e-07, + "logits/chosen": -2.461325168609619, + "logits/rejected": -2.3739752769470215, + "logps/chosen": -200.18081665039062, + "logps/rejected": -280.34088134765625, + "loss": 0.1141, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.567082166671753, + "rewards/margins": 5.990841865539551, + "rewards/rejected": -8.557924270629883, + "step": 7410 }, { "epoch": 1.79, - "learning_rate": 2.2460443778672408e-07, - "logits/chosen": -2.788151264190674, - "logits/rejected": -2.52620530128479, - "logps/chosen": -288.56158447265625, - "logps/rejected": -306.52960205078125, - "loss": 0.098, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.22187399864196777, - "rewards/margins": 8.754189491271973, - "rewards/rejected": -8.97606372833252, - "step": 7070 + "learning_rate": 2.248618292030665e-07, + "logits/chosen": -2.538097381591797, + "logits/rejected": -2.472212314605713, + "logps/chosen": -286.78515625, + "logps/rejected": -312.0413513183594, + "loss": 0.0962, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7170461416244507, + "rewards/margins": 6.334829330444336, + "rewards/rejected": -8.051875114440918, + "step": 7420 }, { "epoch": 1.79, - "learning_rate": 2.2413631682426737e-07, - "logits/chosen": -2.605219602584839, - "logits/rejected": -2.574493885040283, - "logps/chosen": -331.01458740234375, - "logps/rejected": -378.16302490234375, - "loss": 0.0972, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.7465782165527344, - "rewards/margins": 11.760537147521973, - "rewards/rejected": -10.013957023620605, - "step": 7080 + "learning_rate": 2.244161169548939e-07, + "logits/chosen": -2.6596620082855225, + "logits/rejected": -2.5989315509796143, + "logps/chosen": -206.81430053710938, + "logps/rejected": -244.4898223876953, + "loss": 0.0981, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23001742362976074, + "rewards/margins": 7.438671112060547, + "rewards/rejected": -7.668688774108887, + "step": 7430 }, { "epoch": 1.79, - "learning_rate": 2.236681958618107e-07, - "logits/chosen": -2.598388195037842, - "logits/rejected": -2.4072489738464355, - "logps/chosen": -294.28790283203125, - "logps/rejected": -289.75323486328125, - "loss": 0.1413, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.3980717062950134, - "rewards/margins": 7.004315376281738, - "rewards/rejected": -7.4023871421813965, - "step": 7090 + "learning_rate": 2.2397040470672133e-07, + "logits/chosen": -2.6181411743164062, + "logits/rejected": -2.4394915103912354, + "logps/chosen": -201.66659545898438, + "logps/rejected": -251.88565063476562, + "loss": 0.123, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3851864337921143, + "rewards/margins": 6.120608329772949, + "rewards/rejected": -7.505795478820801, + "step": 7440 }, { "epoch": 1.79, - "learning_rate": 2.23200074899354e-07, - "logits/chosen": -2.652200937271118, - "logits/rejected": -2.6423587799072266, - "logps/chosen": -267.423828125, - "logps/rejected": -380.0479431152344, - "loss": 0.08, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.3439788520336151, - "rewards/margins": 7.537928104400635, - "rewards/rejected": -7.881906986236572, - "step": 7100 + "learning_rate": 2.2352469245854873e-07, + "logits/chosen": -2.680032253265381, + "logits/rejected": -2.6020634174346924, + "logps/chosen": -223.3247833251953, + "logps/rejected": -276.1643981933594, + "loss": 0.1372, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8675986528396606, + "rewards/margins": 6.381720542907715, + "rewards/rejected": -7.249318599700928, + "step": 7450 }, { "epoch": 1.8, - "learning_rate": 2.227319539368973e-07, - "logits/chosen": -2.6082496643066406, - "logits/rejected": -2.4751124382019043, - "logps/chosen": -238.06454467773438, - "logps/rejected": -234.53146362304688, - "loss": 0.0793, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.09598349034786224, - "rewards/margins": 6.674858093261719, - "rewards/rejected": -6.578874111175537, - "step": 7110 + "learning_rate": 2.2307898021037616e-07, + "logits/chosen": -2.68167781829834, + "logits/rejected": -2.5055346488952637, + "logps/chosen": -263.083984375, + "logps/rejected": -311.91168212890625, + "loss": 0.1154, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.19335949420928955, + "rewards/margins": 8.216484069824219, + "rewards/rejected": -8.023124694824219, + "step": 7460 }, { "epoch": 1.8, - "learning_rate": 2.2226383297444058e-07, - "logits/chosen": -2.6969308853149414, - "logits/rejected": -2.634843349456787, - "logps/chosen": -292.68414306640625, - "logps/rejected": -342.77752685546875, - "loss": 0.1412, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.7074816226959229, - "rewards/margins": 7.93694543838501, - "rewards/rejected": -8.644426345825195, - "step": 7120 + "learning_rate": 2.226332679622036e-07, + "logits/chosen": -2.482551336288452, + "logits/rejected": -2.4507856369018555, + "logps/chosen": -309.1131591796875, + "logps/rejected": -443.99224853515625, + "loss": 0.0773, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7441800832748413, + "rewards/margins": 7.502471923828125, + "rewards/rejected": -9.246652603149414, + "step": 7470 }, { "epoch": 1.8, - "learning_rate": 2.2179571201198387e-07, - "logits/chosen": -2.823737382888794, - "logits/rejected": -2.614729404449463, - "logps/chosen": -326.89892578125, - "logps/rejected": -340.80279541015625, - "loss": 0.1154, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.7879875898361206, - "rewards/margins": 8.97514533996582, - "rewards/rejected": -8.18715763092041, - "step": 7130 + "learning_rate": 2.22187555714031e-07, + "logits/chosen": -2.6848702430725098, + "logits/rejected": -2.526034116744995, + "logps/chosen": -273.9921569824219, + "logps/rejected": -289.3262023925781, + "loss": 0.0871, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.3857364058494568, + "rewards/margins": 9.072836875915527, + "rewards/rejected": -8.687101364135742, + "step": 7480 }, { "epoch": 1.8, - "learning_rate": 2.213275910495272e-07, - "logits/chosen": -2.5659549236297607, - "logits/rejected": -2.409237861633301, - "logps/chosen": -289.281005859375, - "logps/rejected": -292.24603271484375, - "loss": 0.1, + "learning_rate": 2.2174184346585842e-07, + "logits/chosen": -2.4992496967315674, + "logits/rejected": -2.5616142749786377, + "logps/chosen": -217.97653198242188, + "logps/rejected": -336.658203125, + "loss": 0.1617, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.997495412826538, - "rewards/margins": 9.539546012878418, - "rewards/rejected": -7.542050838470459, - "step": 7140 + "rewards/chosen": -1.1197841167449951, + "rewards/margins": 7.244405269622803, + "rewards/rejected": -8.364189147949219, + "step": 7490 }, { "epoch": 1.81, - "learning_rate": 2.2085947008707048e-07, - "logits/chosen": -2.7370827198028564, - "logits/rejected": -2.5288028717041016, - "logps/chosen": -210.69241333007812, - "logps/rejected": -243.98434448242188, - "loss": 0.1095, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.514499306678772, - "rewards/margins": 5.825979709625244, - "rewards/rejected": -7.340478420257568, - "step": 7150 + "learning_rate": 2.2129613121768585e-07, + "logits/chosen": -2.564615249633789, + "logits/rejected": -2.5288052558898926, + "logps/chosen": -227.24386596679688, + "logps/rejected": -303.5227966308594, + "loss": 0.0842, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2124767303466797, + "rewards/margins": 7.121157169342041, + "rewards/rejected": -9.333633422851562, + "step": 7500 }, { "epoch": 1.81, - "learning_rate": 2.203913491246138e-07, - "logits/chosen": -2.6789844036102295, - "logits/rejected": -2.664365530014038, - "logps/chosen": -300.47686767578125, - "logps/rejected": -307.5674743652344, - "loss": 0.097, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.961271047592163, - "rewards/margins": 6.479406833648682, - "rewards/rejected": -8.440677642822266, - "step": 7160 + "eval_logits/chosen": -2.32869291305542, + "eval_logits/rejected": -2.280797004699707, + "eval_logps/chosen": -244.6661376953125, + "eval_logps/rejected": -261.157958984375, + "eval_loss": 0.5394155383110046, + "eval_rewards/accuracies": 0.6600000262260437, + "eval_rewards/chosen": -4.870510101318359, + "eval_rewards/margins": 2.805976629257202, + "eval_rewards/rejected": -7.676486968994141, + "eval_runtime": 131.9802, + "eval_samples_per_second": 23.913, + "eval_steps_per_second": 0.379, + "step": 7500 }, { "epoch": 1.81, - "learning_rate": 2.1992322816215708e-07, - "logits/chosen": -2.6289384365081787, - "logits/rejected": -2.4877371788024902, - "logps/chosen": -264.2879943847656, - "logps/rejected": -271.01910400390625, - "loss": 0.0916, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.2111540585756302, - "rewards/margins": 7.823189735412598, - "rewards/rejected": -8.034343719482422, - "step": 7170 + "learning_rate": 2.2085041896951328e-07, + "logits/chosen": -2.651597261428833, + "logits/rejected": -2.5351715087890625, + "logps/chosen": -233.18887329101562, + "logps/rejected": -314.96484375, + "loss": 0.0933, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20833830535411835, + "rewards/margins": 10.049888610839844, + "rewards/rejected": -9.841550827026367, + "step": 7510 }, { "epoch": 1.81, - "learning_rate": 2.194551071997004e-07, - "logits/chosen": -2.6112618446350098, - "logits/rejected": -2.5769357681274414, - "logps/chosen": -301.84149169921875, - "logps/rejected": -337.9112243652344, - "loss": 0.0946, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.090156689286232, - "rewards/margins": 8.0753173828125, - "rewards/rejected": -8.165472984313965, - "step": 7180 + "learning_rate": 2.204047067213407e-07, + "logits/chosen": -2.500227212905884, + "logits/rejected": -2.3441576957702637, + "logps/chosen": -277.38812255859375, + "logps/rejected": -467.77227783203125, + "loss": 0.0658, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3476669788360596, + "rewards/margins": 13.493830680847168, + "rewards/rejected": -14.841497421264648, + "step": 7520 + }, + { + "epoch": 1.81, + "learning_rate": 2.199589944731681e-07, + "logits/chosen": -2.623532772064209, + "logits/rejected": -2.6285688877105713, + "logps/chosen": -204.9327850341797, + "logps/rejected": -354.1318359375, + "loss": 0.1881, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1996500492095947, + "rewards/margins": 8.017396926879883, + "rewards/rejected": -9.217047691345215, + "step": 7530 + }, + { + "epoch": 1.81, + "learning_rate": 2.1951328222499554e-07, + "logits/chosen": -2.6879019737243652, + "logits/rejected": -2.3995656967163086, + "logps/chosen": -282.3447570800781, + "logps/rejected": -341.80194091796875, + "loss": 0.1613, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.062755823135376, + "rewards/margins": 5.728832244873047, + "rewards/rejected": -6.79158878326416, + "step": 7540 }, { "epoch": 1.82, - "learning_rate": 2.189869862372437e-07, - "logits/chosen": -2.807898998260498, - "logits/rejected": -2.5814788341522217, - "logps/chosen": -289.78948974609375, - "logps/rejected": -307.60565185546875, - "loss": 0.0797, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6811469793319702, - "rewards/margins": 6.347805976867676, - "rewards/rejected": -7.028953552246094, - "step": 7190 + "learning_rate": 2.1906756997682297e-07, + "logits/chosen": -2.6619040966033936, + "logits/rejected": -2.6494619846343994, + "logps/chosen": -223.7069549560547, + "logps/rejected": -303.066162109375, + "loss": 0.1037, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.924419105052948, + "rewards/margins": 8.06522274017334, + "rewards/rejected": -8.989643096923828, + "step": 7550 + }, + { + "epoch": 1.82, + "learning_rate": 2.1862185772865037e-07, + "logits/chosen": -2.427685260772705, + "logits/rejected": -2.417823553085327, + "logps/chosen": -220.85061645507812, + "logps/rejected": -283.2967834472656, + "loss": 0.0979, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9303325414657593, + "rewards/margins": 6.390061378479004, + "rewards/rejected": -8.320395469665527, + "step": 7560 + }, + { + "epoch": 1.82, + "learning_rate": 2.181761454804778e-07, + "logits/chosen": -2.670269012451172, + "logits/rejected": -2.678056240081787, + "logps/chosen": -223.07571411132812, + "logps/rejected": -274.32550048828125, + "loss": 0.2251, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1056216955184937, + "rewards/margins": 6.7892656326293945, + "rewards/rejected": -7.8948869705200195, + "step": 7570 }, { "epoch": 1.82, - "learning_rate": 2.18518865274787e-07, - "logits/chosen": -2.5731379985809326, - "logits/rejected": -2.39717173576355, - "logps/chosen": -255.71981811523438, - "logps/rejected": -259.46124267578125, - "loss": 0.1267, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.0866820812225342, - "rewards/margins": 5.789783000946045, - "rewards/rejected": -6.87646484375, - "step": 7200 + "learning_rate": 2.1773043323230523e-07, + "logits/chosen": -2.591601610183716, + "logits/rejected": -2.612534761428833, + "logps/chosen": -203.68833923339844, + "logps/rejected": -235.7775421142578, + "loss": 0.1326, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0697176456451416, + "rewards/margins": 6.125297546386719, + "rewards/rejected": -7.195015907287598, + "step": 7580 }, { - "epoch": 1.82, - "learning_rate": 2.1805074431233032e-07, - "logits/chosen": -2.474670886993408, - "logits/rejected": -2.4122226238250732, - "logps/chosen": -314.0117492675781, - "logps/rejected": -416.59649658203125, - "loss": 0.0809, + "epoch": 1.83, + "learning_rate": 2.1728472098413263e-07, + "logits/chosen": -2.7008602619171143, + "logits/rejected": -2.565767288208008, + "logps/chosen": -326.1660461425781, + "logps/rejected": -376.441650390625, + "loss": 0.0862, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 1.323627233505249, - "rewards/margins": 11.513708114624023, - "rewards/rejected": -10.190080642700195, - "step": 7210 + "rewards/chosen": 0.34923577308654785, + "rewards/margins": 8.05476188659668, + "rewards/rejected": -7.705525875091553, + "step": 7590 }, { "epoch": 1.83, - "learning_rate": 2.1758262334987358e-07, - "logits/chosen": -2.6376280784606934, - "logits/rejected": -2.7935502529144287, - "logps/chosen": -304.44403076171875, - "logps/rejected": -479.2804260253906, - "loss": 0.0759, + "learning_rate": 2.1683900873596006e-07, + "logits/chosen": -2.5438268184661865, + "logits/rejected": -2.539036750793457, + "logps/chosen": -164.1432647705078, + "logps/rejected": -298.6413269042969, + "loss": 0.1178, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.33253157138824463, - "rewards/margins": 10.46351432800293, - "rewards/rejected": -10.130983352661133, - "step": 7220 + "rewards/chosen": -1.1467936038970947, + "rewards/margins": 7.0994553565979, + "rewards/rejected": -8.246248245239258, + "step": 7600 }, { "epoch": 1.83, - "learning_rate": 2.171145023874169e-07, - "logits/chosen": -2.5788180828094482, - "logits/rejected": -2.365527391433716, - "logps/chosen": -329.9599914550781, - "logps/rejected": -326.90673828125, - "loss": 0.0777, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.3725981116294861, - "rewards/margins": 6.738213539123535, - "rewards/rejected": -7.110811710357666, - "step": 7230 + "eval_logits/chosen": -2.446324110031128, + "eval_logits/rejected": -2.4021761417388916, + "eval_logps/chosen": -243.94573974609375, + "eval_logps/rejected": -260.0276184082031, + "eval_loss": 0.5252702832221985, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -4.79847526550293, + "eval_rewards/margins": 2.7649755477905273, + "eval_rewards/rejected": -7.563450813293457, + "eval_runtime": 132.0571, + "eval_samples_per_second": 23.899, + "eval_steps_per_second": 0.379, + "step": 7600 }, { "epoch": 1.83, - "learning_rate": 2.166463814249602e-07, - "logits/chosen": -2.4922292232513428, - "logits/rejected": -2.5368895530700684, - "logps/chosen": -225.52914428710938, - "logps/rejected": -326.6394348144531, - "loss": 0.1136, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.11458321660757065, - "rewards/margins": 7.7642316818237305, - "rewards/rejected": -7.878814697265625, - "step": 7240 + "learning_rate": 2.163932964877875e-07, + "logits/chosen": -2.506002187728882, + "logits/rejected": -2.481971263885498, + "logps/chosen": -221.18936157226562, + "logps/rejected": -273.1419372558594, + "loss": 0.0774, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6983006000518799, + "rewards/margins": 7.25876522064209, + "rewards/rejected": -8.957064628601074, + "step": 7610 }, { "epoch": 1.83, - "learning_rate": 2.161782604625035e-07, - "logits/chosen": -2.851933240890503, - "logits/rejected": -2.7211337089538574, - "logps/chosen": -320.7086486816406, - "logps/rejected": -369.9259033203125, - "loss": 0.0703, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.8477370142936707, - "rewards/margins": 8.635969161987305, - "rewards/rejected": -7.788232326507568, - "step": 7250 + "learning_rate": 2.159475842396149e-07, + "logits/chosen": -2.5664591789245605, + "logits/rejected": -2.5360147953033447, + "logps/chosen": -278.72808837890625, + "logps/rejected": -337.9598693847656, + "loss": 0.1131, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3152107000350952, + "rewards/margins": 7.291049003601074, + "rewards/rejected": -8.606260299682617, + "step": 7620 }, { "epoch": 1.84, - "learning_rate": 2.157101395000468e-07, - "logits/chosen": -2.5511319637298584, - "logits/rejected": -2.5334312915802, - "logps/chosen": -306.78057861328125, - "logps/rejected": -398.1654052734375, - "loss": 0.124, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.13714860379695892, - "rewards/margins": 10.0731201171875, - "rewards/rejected": -10.210268020629883, - "step": 7260 + "learning_rate": 2.1550187199144233e-07, + "logits/chosen": -2.578714370727539, + "logits/rejected": -2.5630674362182617, + "logps/chosen": -279.19573974609375, + "logps/rejected": -337.86907958984375, + "loss": 0.1191, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.021250318735837936, + "rewards/margins": 8.285003662109375, + "rewards/rejected": -8.263752937316895, + "step": 7630 }, { "epoch": 1.84, - "learning_rate": 2.152420185375901e-07, - "logits/chosen": -2.6786410808563232, - "logits/rejected": -2.5691006183624268, - "logps/chosen": -236.7671356201172, - "logps/rejected": -323.62188720703125, - "loss": 0.0962, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.0288159847259521, - "rewards/margins": 8.96338176727295, - "rewards/rejected": -9.992198944091797, - "step": 7270 + "learning_rate": 2.1505615974326973e-07, + "logits/chosen": -2.633155584335327, + "logits/rejected": -2.6528987884521484, + "logps/chosen": -240.26559448242188, + "logps/rejected": -362.24578857421875, + "loss": 0.0797, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3059333860874176, + "rewards/margins": 8.460762023925781, + "rewards/rejected": -8.766695976257324, + "step": 7640 }, { "epoch": 1.84, - "learning_rate": 2.147738975751334e-07, - "logits/chosen": -2.6515941619873047, - "logits/rejected": -2.3960585594177246, - "logps/chosen": -273.30029296875, - "logps/rejected": -370.7136535644531, - "loss": 0.0744, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.4314087927341461, - "rewards/margins": 9.314390182495117, - "rewards/rejected": -8.88298225402832, - "step": 7280 + "learning_rate": 2.1461044749509716e-07, + "logits/chosen": -2.6852028369903564, + "logits/rejected": -2.6269357204437256, + "logps/chosen": -281.8564453125, + "logps/rejected": -404.3232116699219, + "loss": 0.0779, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10787633806467056, + "rewards/margins": 10.228116035461426, + "rewards/rejected": -10.335990905761719, + "step": 7650 }, { "epoch": 1.84, - "learning_rate": 2.1430577661267671e-07, - "logits/chosen": -2.606426239013672, - "logits/rejected": -2.6145050525665283, - "logps/chosen": -338.6814270019531, - "logps/rejected": -360.7559814453125, - "loss": 0.1138, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.5850367546081543, - "rewards/margins": 11.156438827514648, - "rewards/rejected": -9.571401596069336, - "step": 7290 + "learning_rate": 2.141647352469246e-07, + "logits/chosen": -2.606945753097534, + "logits/rejected": -2.5903420448303223, + "logps/chosen": -291.482421875, + "logps/rejected": -398.61138916015625, + "loss": 0.2162, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.0812334269285202, + "rewards/margins": 7.445326805114746, + "rewards/rejected": -7.364092826843262, + "step": 7660 }, { "epoch": 1.85, - "learning_rate": 2.1383765565022e-07, - "logits/chosen": -2.4672369956970215, - "logits/rejected": -2.5930633544921875, - "logps/chosen": -213.44735717773438, - "logps/rejected": -447.8428649902344, - "loss": 0.0545, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.3330296576023102, - "rewards/margins": 9.776864051818848, - "rewards/rejected": -9.443833351135254, - "step": 7300 + "learning_rate": 2.13719022998752e-07, + "logits/chosen": -2.866992235183716, + "logits/rejected": -2.6583330631256104, + "logps/chosen": -383.8420715332031, + "logps/rejected": -344.3113098144531, + "loss": 0.1049, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.18078795075416565, + "rewards/margins": 9.435612678527832, + "rewards/rejected": -9.2548246383667, + "step": 7670 }, { "epoch": 1.85, - "learning_rate": 2.1336953468776332e-07, - "logits/chosen": -2.655038595199585, - "logits/rejected": -2.5842857360839844, - "logps/chosen": -210.86434936523438, - "logps/rejected": -270.0993957519531, - "loss": 0.1019, + "learning_rate": 2.1327331075057942e-07, + "logits/chosen": -2.6465487480163574, + "logits/rejected": -2.5750632286071777, + "logps/chosen": -314.3196716308594, + "logps/rejected": -314.7630615234375, + "loss": 0.0468, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7735605239868164, + "rewards/margins": 8.784982681274414, + "rewards/rejected": -8.011421203613281, + "step": 7680 + }, + { + "epoch": 1.85, + "learning_rate": 2.1282759850240685e-07, + "logits/chosen": -2.6840643882751465, + "logits/rejected": -2.625324249267578, + "logps/chosen": -218.67141723632812, + "logps/rejected": -281.2733459472656, + "loss": 0.1266, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.6795251369476318, - "rewards/margins": 6.547571659088135, - "rewards/rejected": -7.2270965576171875, - "step": 7310 + "rewards/chosen": 0.2676972448825836, + "rewards/margins": 8.17557430267334, + "rewards/rejected": -7.907877445220947, + "step": 7690 }, { "epoch": 1.85, - "learning_rate": 2.1290141372530663e-07, - "logits/chosen": -2.729170083999634, - "logits/rejected": -2.6793854236602783, - "logps/chosen": -344.52044677734375, - "logps/rejected": -432.5497131347656, - "loss": 0.0438, + "learning_rate": 2.1238188625423425e-07, + "logits/chosen": -2.5301029682159424, + "logits/rejected": -2.5229175090789795, + "logps/chosen": -211.6637725830078, + "logps/rejected": -322.996826171875, + "loss": 0.1255, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.254820317029953, - "rewards/margins": 9.785828590393066, - "rewards/rejected": -9.531007766723633, - "step": 7320 + "rewards/chosen": -1.8442342281341553, + "rewards/margins": 6.535142421722412, + "rewards/rejected": -8.379377365112305, + "step": 7700 }, { "epoch": 1.85, - "learning_rate": 2.124332927628499e-07, - "logits/chosen": -2.456355571746826, - "logits/rejected": -2.330481767654419, - "logps/chosen": -214.1356964111328, - "logps/rejected": -252.86752319335938, - "loss": 0.0637, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.11180851608514786, - "rewards/margins": 7.105473518371582, - "rewards/rejected": -7.217282772064209, - "step": 7330 + "eval_logits/chosen": -2.5501203536987305, + "eval_logits/rejected": -2.5072529315948486, + "eval_logps/chosen": -242.96844482421875, + "eval_logps/rejected": -258.755615234375, + "eval_loss": 0.5355476140975952, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -4.700740814208984, + "eval_rewards/margins": 2.735515832901001, + "eval_rewards/rejected": -7.43625545501709, + "eval_runtime": 131.9245, + "eval_samples_per_second": 23.923, + "eval_steps_per_second": 0.379, + "step": 7700 }, { "epoch": 1.86, - "learning_rate": 2.119651718003932e-07, - "logits/chosen": -2.4621005058288574, - "logits/rejected": -2.4193661212921143, - "logps/chosen": -273.9238586425781, - "logps/rejected": -351.8313903808594, - "loss": 0.1036, + "learning_rate": 2.1193617400606168e-07, + "logits/chosen": -2.7850518226623535, + "logits/rejected": -2.659503936767578, + "logps/chosen": -251.50332641601562, + "logps/rejected": -222.73001098632812, + "loss": 0.1028, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.4636752605438232, - "rewards/margins": 8.42043685913086, - "rewards/rejected": -9.884112358093262, - "step": 7340 + "rewards/chosen": -1.6358991861343384, + "rewards/margins": 5.284237861633301, + "rewards/rejected": -6.92013692855835, + "step": 7710 }, { "epoch": 1.86, - "learning_rate": 2.114970508379365e-07, - "logits/chosen": -2.611341714859009, - "logits/rejected": -2.4828543663024902, - "logps/chosen": -271.2765808105469, - "logps/rejected": -346.5517272949219, - "loss": 0.0514, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.7445017695426941, - "rewards/margins": 7.4684295654296875, - "rewards/rejected": -8.212930679321289, - "step": 7350 + "learning_rate": 2.1149046175788908e-07, + "logits/chosen": -2.5619523525238037, + "logits/rejected": -2.5342695713043213, + "logps/chosen": -230.004150390625, + "logps/rejected": -327.9783935546875, + "loss": 0.0611, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2647145986557007, + "rewards/margins": 6.328179359436035, + "rewards/rejected": -7.592893123626709, + "step": 7720 }, { "epoch": 1.86, - "learning_rate": 2.1102892987547982e-07, - "logits/chosen": -2.58764910697937, - "logits/rejected": -2.609360694885254, - "logps/chosen": -235.40536499023438, - "logps/rejected": -340.46533203125, - "loss": 0.0732, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.4518706798553467, - "rewards/margins": 7.273972988128662, - "rewards/rejected": -7.725844383239746, - "step": 7360 + "learning_rate": 2.110447495097165e-07, + "logits/chosen": -2.652963161468506, + "logits/rejected": -2.5709705352783203, + "logps/chosen": -194.14453125, + "logps/rejected": -312.2419738769531, + "loss": 0.0656, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.940604031085968, + "rewards/margins": 8.206774711608887, + "rewards/rejected": -9.147378921508789, + "step": 7730 }, { "epoch": 1.86, - "learning_rate": 2.105608089130231e-07, - "logits/chosen": -2.7084922790527344, - "logits/rejected": -2.6220972537994385, - "logps/chosen": -262.4686584472656, - "logps/rejected": -316.953125, - "loss": 0.0511, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.7665785551071167, - "rewards/margins": 8.81516170501709, - "rewards/rejected": -9.581741333007812, - "step": 7370 + "learning_rate": 2.1059903726154394e-07, + "logits/chosen": -2.78930926322937, + "logits/rejected": -2.768345355987549, + "logps/chosen": -293.38189697265625, + "logps/rejected": -272.0474853515625, + "loss": 0.086, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0656585693359375, + "rewards/margins": 4.420651912689209, + "rewards/rejected": -6.4863104820251465, + "step": 7740 }, { "epoch": 1.87, - "learning_rate": 2.1009268795056642e-07, - "logits/chosen": -2.3753809928894043, - "logits/rejected": -2.2589664459228516, - "logps/chosen": -272.1087951660156, - "logps/rejected": -286.30279541015625, - "loss": 0.0677, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8401474952697754, - "rewards/margins": 6.868135929107666, - "rewards/rejected": -7.708283424377441, - "step": 7380 + "learning_rate": 2.1015332501337135e-07, + "logits/chosen": -2.8142502307891846, + "logits/rejected": -2.7967958450317383, + "logps/chosen": -288.67584228515625, + "logps/rejected": -338.80975341796875, + "loss": 0.1453, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.044142626225948334, + "rewards/margins": 8.233192443847656, + "rewards/rejected": -8.189050674438477, + "step": 7750 }, { "epoch": 1.87, - "learning_rate": 2.096245669881097e-07, - "logits/chosen": -2.447204113006592, - "logits/rejected": -2.477513551712036, - "logps/chosen": -187.57974243164062, - "logps/rejected": -261.90338134765625, - "loss": 0.1106, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.4475107192993164, - "rewards/margins": 6.116883754730225, - "rewards/rejected": -6.564394474029541, - "step": 7390 + "learning_rate": 2.0970761276519877e-07, + "logits/chosen": -2.7419071197509766, + "logits/rejected": -2.7851767539978027, + "logps/chosen": -289.52667236328125, + "logps/rejected": -402.12762451171875, + "loss": 0.0981, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6060962080955505, + "rewards/margins": 8.396982192993164, + "rewards/rejected": -9.003077507019043, + "step": 7760 }, { "epoch": 1.87, - "learning_rate": 2.0915644602565303e-07, - "logits/chosen": -2.64605712890625, - "logits/rejected": -2.6174445152282715, - "logps/chosen": -266.545654296875, - "logps/rejected": -323.08856201171875, - "loss": 0.0647, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.3667122721672058, - "rewards/margins": 9.014110565185547, - "rewards/rejected": -8.647397994995117, - "step": 7400 + "learning_rate": 2.092619005170262e-07, + "logits/chosen": -2.7153637409210205, + "logits/rejected": -2.701239824295044, + "logps/chosen": -368.2999572753906, + "logps/rejected": -387.92486572265625, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3965792953968048, + "rewards/margins": 9.607254028320312, + "rewards/rejected": -9.210674285888672, + "step": 7770 }, { "epoch": 1.87, - "learning_rate": 2.0868832506319632e-07, - "logits/chosen": -2.531986713409424, - "logits/rejected": -2.54695987701416, - "logps/chosen": -243.2427978515625, - "logps/rejected": -461.019287109375, - "loss": 0.0968, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.1118009090423584, - "rewards/margins": 9.954936981201172, - "rewards/rejected": -10.066737174987793, - "step": 7410 + "learning_rate": 2.088161882688536e-07, + "logits/chosen": -2.74839448928833, + "logits/rejected": -2.6897144317626953, + "logps/chosen": -267.82342529296875, + "logps/rejected": -311.11041259765625, + "loss": 0.1245, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8902170658111572, + "rewards/margins": 5.893338203430176, + "rewards/rejected": -7.783555030822754, + "step": 7780 + }, + { + "epoch": 1.87, + "learning_rate": 2.0837047602068104e-07, + "logits/chosen": -2.8124265670776367, + "logits/rejected": -2.6543900966644287, + "logps/chosen": -305.02850341796875, + "logps/rejected": -317.5285949707031, + "loss": 0.0683, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0398132801055908, + "rewards/margins": 6.994645595550537, + "rewards/rejected": -8.03445816040039, + "step": 7790 }, { "epoch": 1.88, - "learning_rate": 2.0822020410073963e-07, - "logits/chosen": -2.595608711242676, - "logits/rejected": -2.468226194381714, - "logps/chosen": -311.4830627441406, - "logps/rejected": -302.28363037109375, - "loss": 0.0691, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9550451040267944, - "rewards/margins": 6.499562740325928, - "rewards/rejected": -7.454607963562012, - "step": 7420 + "learning_rate": 2.0792476377250844e-07, + "logits/chosen": -2.5986790657043457, + "logits/rejected": -2.6101126670837402, + "logps/chosen": -346.0367126464844, + "logps/rejected": -549.4420166015625, + "loss": 0.1541, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1621568202972412, + "rewards/margins": 19.80972671508789, + "rewards/rejected": -20.971885681152344, + "step": 7800 }, { "epoch": 1.88, - "learning_rate": 2.0775208313828292e-07, - "logits/chosen": -2.715292453765869, - "logits/rejected": -2.664085865020752, - "logps/chosen": -254.71871948242188, - "logps/rejected": -304.4339599609375, - "loss": 0.0522, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.2248115837574005, - "rewards/margins": 6.619701385498047, - "rewards/rejected": -6.844512939453125, - "step": 7430 + "eval_logits/chosen": -2.4036145210266113, + "eval_logits/rejected": -2.355100154876709, + "eval_logps/chosen": -245.2546844482422, + "eval_logps/rejected": -260.8583679199219, + "eval_loss": 0.5439911484718323, + "eval_rewards/accuracies": 0.6499999761581421, + "eval_rewards/chosen": -4.929368019104004, + "eval_rewards/margins": 2.717161178588867, + "eval_rewards/rejected": -7.646529674530029, + "eval_runtime": 131.998, + "eval_samples_per_second": 23.909, + "eval_steps_per_second": 0.379, + "step": 7800 }, { "epoch": 1.88, - "learning_rate": 2.072839621758262e-07, - "logits/chosen": -2.4319987297058105, - "logits/rejected": -2.4993152618408203, - "logps/chosen": -272.27923583984375, - "logps/rejected": -266.61456298828125, - "loss": 0.0518, + "learning_rate": 2.0747905152433587e-07, + "logits/chosen": -2.6249797344207764, + "logits/rejected": -2.6913743019104004, + "logps/chosen": -271.9580993652344, + "logps/rejected": -284.90972900390625, + "loss": 0.1237, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.692047119140625, - "rewards/margins": 6.355321884155273, - "rewards/rejected": -8.047369003295898, - "step": 7440 + "rewards/chosen": -3.7007839679718018, + "rewards/margins": 4.087651252746582, + "rewards/rejected": -7.7884345054626465, + "step": 7810 }, { "epoch": 1.88, - "learning_rate": 2.0681584121336953e-07, - "logits/chosen": -2.679299831390381, - "logits/rejected": -2.6855454444885254, - "logps/chosen": -248.4481201171875, - "logps/rejected": -376.2951354980469, - "loss": 0.0956, + "learning_rate": 2.070333392761633e-07, + "logits/chosen": -2.5934550762176514, + "logits/rejected": -2.4892807006835938, + "logps/chosen": -213.82901000976562, + "logps/rejected": -305.619384765625, + "loss": 0.07, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.08112213760614395, - "rewards/margins": 7.70186710357666, - "rewards/rejected": -7.782988548278809, - "step": 7450 + "rewards/chosen": -0.19481240212917328, + "rewards/margins": 8.749310493469238, + "rewards/rejected": -8.944124221801758, + "step": 7820 }, { - "epoch": 1.89, - "learning_rate": 2.0634772025091282e-07, - "logits/chosen": -2.3245654106140137, - "logits/rejected": -2.3124887943267822, - "logps/chosen": -237.27978515625, - "logps/rejected": -272.12652587890625, - "loss": 0.0964, + "epoch": 1.88, + "learning_rate": 2.065876270279907e-07, + "logits/chosen": -2.61063289642334, + "logits/rejected": -2.4288926124572754, + "logps/chosen": -273.6852111816406, + "logps/rejected": -318.98663330078125, + "loss": 0.0528, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.0427350997924805, - "rewards/margins": 6.740478515625, - "rewards/rejected": -8.783212661743164, - "step": 7460 + "rewards/chosen": -2.0701587200164795, + "rewards/margins": 8.105264663696289, + "rewards/rejected": -10.175421714782715, + "step": 7830 }, { "epoch": 1.89, - "learning_rate": 2.0587959928845613e-07, - "logits/chosen": -2.2935822010040283, - "logits/rejected": -2.4650814533233643, - "logps/chosen": -267.4075927734375, - "logps/rejected": -303.23565673828125, - "loss": 0.049, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.3434972763061523, - "rewards/margins": 8.509553909301758, - "rewards/rejected": -9.85305118560791, - "step": 7470 + "learning_rate": 2.0614191477981813e-07, + "logits/chosen": -2.2563159465789795, + "logits/rejected": -2.3084681034088135, + "logps/chosen": -269.3587341308594, + "logps/rejected": -327.7840270996094, + "loss": 0.0906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1475660800933838, + "rewards/margins": 9.29503345489502, + "rewards/rejected": -10.442598342895508, + "step": 7840 }, { "epoch": 1.89, - "learning_rate": 2.0541147832599942e-07, - "logits/chosen": -2.5621564388275146, - "logits/rejected": -2.5696120262145996, - "logps/chosen": -270.3358459472656, - "logps/rejected": -301.4221496582031, - "loss": 0.1126, + "learning_rate": 2.0569620253164559e-07, + "logits/chosen": -2.5513806343078613, + "logits/rejected": -2.508632183074951, + "logps/chosen": -174.73211669921875, + "logps/rejected": -192.35537719726562, + "loss": 0.1019, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9965864419937134, + "rewards/margins": 5.334261894226074, + "rewards/rejected": -7.33084774017334, + "step": 7850 + }, + { + "epoch": 1.89, + "learning_rate": 2.05250490283473e-07, + "logits/chosen": -2.4502675533294678, + "logits/rejected": -2.5528974533081055, + "logps/chosen": -303.0812683105469, + "logps/rejected": -304.05975341796875, + "loss": 0.0775, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.2438421249389648, - "rewards/margins": 6.812567234039307, - "rewards/rejected": -8.056408882141113, - "step": 7480 + "rewards/chosen": -1.3185979127883911, + "rewards/margins": 7.151303291320801, + "rewards/rejected": -8.469901084899902, + "step": 7860 }, { "epoch": 1.89, - "learning_rate": 2.0494335736354274e-07, - "logits/chosen": -2.524179458618164, - "logits/rejected": -2.4801383018493652, - "logps/chosen": -293.1369934082031, - "logps/rejected": -440.0506286621094, - "loss": 0.0783, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.30194908380508423, - "rewards/margins": 9.120832443237305, - "rewards/rejected": -9.422780990600586, - "step": 7490 + "learning_rate": 2.0480477803530042e-07, + "logits/chosen": -2.713970422744751, + "logits/rejected": -2.644402027130127, + "logps/chosen": -386.41680908203125, + "logps/rejected": -379.39227294921875, + "loss": 0.1051, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.20769663155078888, + "rewards/margins": 8.661986351013184, + "rewards/rejected": -8.869683265686035, + "step": 7870 }, { "epoch": 1.9, - "learning_rate": 2.0447523640108603e-07, - "logits/chosen": -2.46061635017395, - "logits/rejected": -2.450618267059326, - "logps/chosen": -308.24365234375, - "logps/rejected": -366.5410461425781, - "loss": 0.0504, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.354601502418518, - "rewards/margins": 8.541120529174805, - "rewards/rejected": -9.895721435546875, - "step": 7500 + "learning_rate": 2.0435906578712782e-07, + "logits/chosen": -2.6116013526916504, + "logits/rejected": -2.4812984466552734, + "logps/chosen": -244.54345703125, + "logps/rejected": -415.404296875, + "loss": 0.1302, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11553603410720825, + "rewards/margins": 11.803556442260742, + "rewards/rejected": -11.919092178344727, + "step": 7880 }, { "epoch": 1.9, - "learning_rate": 2.0400711543862934e-07, - "logits/chosen": -2.558875560760498, - "logits/rejected": -2.4312326908111572, - "logps/chosen": -307.14453125, - "logps/rejected": -400.5588684082031, - "loss": 0.0861, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.27744191884994507, - "rewards/margins": 12.654280662536621, - "rewards/rejected": -12.376840591430664, - "step": 7510 + "learning_rate": 2.0391335353895525e-07, + "logits/chosen": -2.6290132999420166, + "logits/rejected": -2.556884288787842, + "logps/chosen": -279.66571044921875, + "logps/rejected": -358.2665710449219, + "loss": 0.0741, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.2475175857543945, + "rewards/margins": 8.194717407226562, + "rewards/rejected": -10.442234992980957, + "step": 7890 }, { "epoch": 1.9, - "learning_rate": 2.0353899447617263e-07, - "logits/chosen": -2.7522482872009277, - "logits/rejected": -2.647388458251953, - "logps/chosen": -332.55987548828125, - "logps/rejected": -371.76434326171875, - "loss": 0.0509, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.23856863379478455, - "rewards/margins": 9.279437065124512, - "rewards/rejected": -9.040868759155273, - "step": 7520 + "learning_rate": 2.0346764129078268e-07, + "logits/chosen": -2.856350898742676, + "logits/rejected": -2.763349771499634, + "logps/chosen": -338.95635986328125, + "logps/rejected": -341.5370178222656, + "loss": 0.0893, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3789913654327393, + "rewards/margins": 6.404806613922119, + "rewards/rejected": -7.7837982177734375, + "step": 7900 }, { "epoch": 1.9, - "learning_rate": 2.0307087351371592e-07, - "logits/chosen": -2.3382153511047363, - "logits/rejected": -2.2408435344696045, - "logps/chosen": -285.0376892089844, - "logps/rejected": -289.60760498046875, - "loss": 0.107, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.278595209121704, - "rewards/margins": 7.759607791900635, - "rewards/rejected": -9.038202285766602, - "step": 7530 + "eval_logits/chosen": -2.3784422874450684, + "eval_logits/rejected": -2.321394205093384, + "eval_logps/chosen": -248.09593200683594, + "eval_logps/rejected": -267.6338806152344, + "eval_loss": 0.5396592617034912, + "eval_rewards/accuracies": 0.6575000286102295, + "eval_rewards/chosen": -5.213491439819336, + "eval_rewards/margins": 3.1105873584747314, + "eval_rewards/rejected": -8.324078559875488, + "eval_runtime": 131.8808, + "eval_samples_per_second": 23.931, + "eval_steps_per_second": 0.379, + "step": 7900 }, { - "epoch": 1.91, - "learning_rate": 2.0260275255125924e-07, - "logits/chosen": -2.3935012817382812, - "logits/rejected": -2.4903364181518555, - "logps/chosen": -255.71084594726562, - "logps/rejected": -360.177978515625, - "loss": 0.108, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.7504819631576538, - "rewards/margins": 7.956636905670166, - "rewards/rejected": -9.70711898803711, - "step": 7540 + "epoch": 1.9, + "learning_rate": 2.0302192904261008e-07, + "logits/chosen": -2.5201923847198486, + "logits/rejected": -2.557476043701172, + "logps/chosen": -279.41461181640625, + "logps/rejected": -303.5218811035156, + "loss": 0.106, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4836001396179199, + "rewards/margins": 7.964742183685303, + "rewards/rejected": -8.448343276977539, + "step": 7910 }, { "epoch": 1.91, - "learning_rate": 2.0213463158880253e-07, - "logits/chosen": -2.4879226684570312, - "logits/rejected": -2.4252655506134033, - "logps/chosen": -328.02435302734375, - "logps/rejected": -385.58612060546875, - "loss": 0.0671, + "learning_rate": 2.025762167944375e-07, + "logits/chosen": -2.630619764328003, + "logits/rejected": -2.531430721282959, + "logps/chosen": -343.33404541015625, + "logps/rejected": -296.44439697265625, + "loss": 0.0476, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.0912948846817017, - "rewards/margins": 10.426485061645508, - "rewards/rejected": -9.33519172668457, - "step": 7550 + "rewards/chosen": -1.1706873178482056, + "rewards/margins": 7.957624912261963, + "rewards/rejected": -9.128311157226562, + "step": 7920 }, { "epoch": 1.91, - "learning_rate": 2.0166651062634584e-07, - "logits/chosen": -2.42702317237854, - "logits/rejected": -2.4872756004333496, - "logps/chosen": -293.220703125, - "logps/rejected": -341.46490478515625, - "loss": 0.0857, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.6263461112976074, - "rewards/margins": 9.559865951538086, - "rewards/rejected": -12.186211585998535, - "step": 7560 + "learning_rate": 2.0213050454626494e-07, + "logits/chosen": -2.4779675006866455, + "logits/rejected": -2.418358087539673, + "logps/chosen": -291.142822265625, + "logps/rejected": -292.71368408203125, + "loss": 0.0995, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2799273729324341, + "rewards/margins": 8.684205055236816, + "rewards/rejected": -8.964131355285645, + "step": 7930 }, { "epoch": 1.91, - "learning_rate": 2.0119838966388913e-07, - "logits/chosen": -2.5302624702453613, - "logits/rejected": -2.486345052719116, - "logps/chosen": -266.01739501953125, - "logps/rejected": -352.3585510253906, - "loss": 0.0657, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.9551617503166199, - "rewards/margins": 9.321630477905273, - "rewards/rejected": -10.276793479919434, - "step": 7570 + "learning_rate": 2.0168479229809234e-07, + "logits/chosen": -2.482870101928711, + "logits/rejected": -2.4456191062927246, + "logps/chosen": -170.06320190429688, + "logps/rejected": -201.51309204101562, + "loss": 0.175, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.899862289428711, + "rewards/margins": 6.18842887878418, + "rewards/rejected": -8.08829116821289, + "step": 7940 }, { - "epoch": 1.92, - "learning_rate": 2.0073026870143245e-07, - "logits/chosen": -2.3224892616271973, - "logits/rejected": -2.189652919769287, - "logps/chosen": -279.7569274902344, - "logps/rejected": -243.7903594970703, - "loss": 0.0699, + "epoch": 1.91, + "learning_rate": 2.0123908004991977e-07, + "logits/chosen": -2.7418711185455322, + "logits/rejected": -2.668391466140747, + "logps/chosen": -342.06915283203125, + "logps/rejected": -375.1126403808594, + "loss": 0.0857, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.153075933456421, - "rewards/margins": 5.087791919708252, - "rewards/rejected": -6.240868091583252, - "step": 7580 + "rewards/chosen": -1.2378227710723877, + "rewards/margins": 7.8989410400390625, + "rewards/rejected": -9.136762619018555, + "step": 7950 }, { "epoch": 1.92, - "learning_rate": 2.0026214773897574e-07, - "logits/chosen": -2.5566675662994385, - "logits/rejected": -2.4298269748687744, - "logps/chosen": -286.26812744140625, - "logps/rejected": -471.37152099609375, - "loss": 0.0764, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9075733423233032, - "rewards/margins": 9.651289939880371, - "rewards/rejected": -10.558862686157227, - "step": 7590 + "learning_rate": 2.0079336780174718e-07, + "logits/chosen": -2.6857995986938477, + "logits/rejected": -2.6013073921203613, + "logps/chosen": -238.41659545898438, + "logps/rejected": -355.93743896484375, + "loss": 0.0852, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.4674594402313232, + "rewards/margins": 6.5976667404174805, + "rewards/rejected": -8.065126419067383, + "step": 7960 }, { "epoch": 1.92, - "learning_rate": 1.9979402677651905e-07, - "logits/chosen": -2.4859347343444824, - "logits/rejected": -2.332869052886963, - "logps/chosen": -302.6961364746094, - "logps/rejected": -289.33233642578125, - "loss": 0.1151, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.1341279745101929, - "rewards/margins": 8.396425247192383, - "rewards/rejected": -9.530553817749023, - "step": 7600 + "learning_rate": 2.003476555535746e-07, + "logits/chosen": -2.52712345123291, + "logits/rejected": -2.588061809539795, + "logps/chosen": -224.71859741210938, + "logps/rejected": -227.07241821289062, + "loss": 0.1807, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2789312601089478, + "rewards/margins": 5.685016632080078, + "rewards/rejected": -6.9639482498168945, + "step": 7970 }, { "epoch": 1.92, - "learning_rate": 1.9932590581406234e-07, - "logits/chosen": -2.49135160446167, - "logits/rejected": -2.3705971240997314, - "logps/chosen": -235.57321166992188, - "logps/rejected": -392.3863220214844, - "loss": 0.097, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.1869460493326187, - "rewards/margins": 8.586758613586426, - "rewards/rejected": -8.773704528808594, - "step": 7610 + "learning_rate": 1.9990194330540203e-07, + "logits/chosen": -2.6547811031341553, + "logits/rejected": -2.626857280731201, + "logps/chosen": -385.37060546875, + "logps/rejected": -443.29718017578125, + "loss": 0.0887, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.077797293663025, + "rewards/margins": 11.06875991821289, + "rewards/rejected": -9.99096393585205, + "step": 7980 + }, + { + "epoch": 1.92, + "learning_rate": 1.9945623105722944e-07, + "logits/chosen": -2.546708345413208, + "logits/rejected": -2.469754695892334, + "logps/chosen": -187.1951904296875, + "logps/rejected": -291.9125671386719, + "loss": 0.1095, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5086857080459595, + "rewards/margins": 8.782590866088867, + "rewards/rejected": -10.291276931762695, + "step": 7990 }, { "epoch": 1.93, - "learning_rate": 1.9885778485160566e-07, - "logits/chosen": -2.483900547027588, - "logits/rejected": -2.541287660598755, - "logps/chosen": -283.54876708984375, - "logps/rejected": -357.29583740234375, - "loss": 0.0855, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.6025287508964539, - "rewards/margins": 7.6768479347229, - "rewards/rejected": -8.279376029968262, - "step": 7620 + "learning_rate": 1.9901051880905687e-07, + "logits/chosen": -2.5901618003845215, + "logits/rejected": -2.4430928230285645, + "logps/chosen": -329.35382080078125, + "logps/rejected": -278.3038635253906, + "loss": 0.1203, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8766376972198486, + "rewards/margins": 6.404065132141113, + "rewards/rejected": -8.280701637268066, + "step": 8000 }, { "epoch": 1.93, - "learning_rate": 1.9838966388914892e-07, - "logits/chosen": -2.3701515197753906, - "logits/rejected": -2.5350399017333984, - "logps/chosen": -291.2542419433594, - "logps/rejected": -467.56756591796875, - "loss": 0.0716, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.4438214898109436, - "rewards/margins": 11.625341415405273, - "rewards/rejected": -11.181519508361816, - "step": 7630 + "eval_logits/chosen": -2.496852159500122, + "eval_logits/rejected": -2.450942277908325, + "eval_logps/chosen": -244.60537719726562, + "eval_logps/rejected": -262.9913330078125, + "eval_loss": 0.5295895338058472, + "eval_rewards/accuracies": 0.6549999713897705, + "eval_rewards/chosen": -4.864434719085693, + "eval_rewards/margins": 2.995391607284546, + "eval_rewards/rejected": -7.85982608795166, + "eval_runtime": 132.0947, + "eval_samples_per_second": 23.892, + "eval_steps_per_second": 0.379, + "step": 8000 }, { "epoch": 1.93, - "learning_rate": 1.9792154292669224e-07, - "logits/chosen": -2.595479726791382, - "logits/rejected": -2.625075578689575, - "logps/chosen": -277.4136047363281, - "logps/rejected": -371.3047180175781, - "loss": 0.1188, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5710428953170776, - "rewards/margins": 10.5836763381958, - "rewards/rejected": -10.012632369995117, - "step": 7640 + "learning_rate": 1.985648065608843e-07, + "logits/chosen": -2.5610668659210205, + "logits/rejected": -2.5320394039154053, + "logps/chosen": -344.0411376953125, + "logps/rejected": -406.90509033203125, + "loss": 0.1178, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.09236741065979, + "rewards/margins": 8.439855575561523, + "rewards/rejected": -9.53222370147705, + "step": 8010 }, { "epoch": 1.93, - "learning_rate": 1.9745342196423555e-07, - "logits/chosen": -2.531487464904785, - "logits/rejected": -2.466464042663574, - "logps/chosen": -296.78155517578125, - "logps/rejected": -376.5704650878906, - "loss": 0.0856, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.18947431445121765, - "rewards/margins": 9.353696823120117, - "rewards/rejected": -9.543170928955078, - "step": 7650 + "learning_rate": 1.981190943127117e-07, + "logits/chosen": -2.696176052093506, + "logits/rejected": -2.65200138092041, + "logps/chosen": -383.5420227050781, + "logps/rejected": -256.7798156738281, + "loss": 0.051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.83172607421875, + "rewards/margins": 5.847100257873535, + "rewards/rejected": -6.678825378417969, + "step": 8020 + }, + { + "epoch": 1.93, + "learning_rate": 1.9767338206453913e-07, + "logits/chosen": -2.5560178756713867, + "logits/rejected": -2.4728081226348877, + "logps/chosen": -243.54385375976562, + "logps/rejected": -347.6253356933594, + "loss": 0.1364, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9548274874687195, + "rewards/margins": 8.861523628234863, + "rewards/rejected": -9.816350936889648, + "step": 8030 }, { "epoch": 1.94, - "learning_rate": 1.9698530100177884e-07, - "logits/chosen": -2.4659383296966553, - "logits/rejected": -2.4665517807006836, - "logps/chosen": -309.4673156738281, - "logps/rejected": -407.2119140625, - "loss": 0.0671, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9733579754829407, - "rewards/margins": 7.893436431884766, - "rewards/rejected": -8.866793632507324, - "step": 7660 + "learning_rate": 1.9722766981636653e-07, + "logits/chosen": -2.4439332485198975, + "logits/rejected": -2.4561045169830322, + "logps/chosen": -300.6723327636719, + "logps/rejected": -331.7294006347656, + "loss": 0.0852, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8925291299819946, + "rewards/margins": 6.072808265686035, + "rewards/rejected": -7.965336799621582, + "step": 8040 }, { "epoch": 1.94, - "learning_rate": 1.9651718003932216e-07, - "logits/chosen": -2.1824698448181152, - "logits/rejected": -2.095674753189087, - "logps/chosen": -282.605712890625, - "logps/rejected": -280.44488525390625, - "loss": 0.0824, + "learning_rate": 1.9678195756819396e-07, + "logits/chosen": -2.7666091918945312, + "logits/rejected": -2.7808451652526855, + "logps/chosen": -343.18426513671875, + "logps/rejected": -372.3157653808594, + "loss": 0.1745, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.8554694056510925, - "rewards/margins": 6.769730567932129, - "rewards/rejected": -7.6252007484436035, - "step": 7670 + "rewards/chosen": -0.25246429443359375, + "rewards/margins": 8.844322204589844, + "rewards/rejected": -9.096786499023438, + "step": 8050 }, { "epoch": 1.94, - "learning_rate": 1.9604905907686545e-07, - "logits/chosen": -2.369614839553833, - "logits/rejected": -2.2198705673217773, - "logps/chosen": -324.4106750488281, - "logps/rejected": -367.41461181640625, - "loss": 0.1118, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.5607366561889648, - "rewards/margins": 7.041375160217285, - "rewards/rejected": -8.60211181640625, - "step": 7680 + "learning_rate": 1.963362453200214e-07, + "logits/chosen": -2.4762940406799316, + "logits/rejected": -2.403837203979492, + "logps/chosen": -225.29672241210938, + "logps/rejected": -302.9705505371094, + "loss": 0.0936, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1442207098007202, + "rewards/margins": 7.724308967590332, + "rewards/rejected": -8.868529319763184, + "step": 8060 }, { "epoch": 1.94, - "learning_rate": 1.9558093811440876e-07, - "logits/chosen": -2.494131565093994, - "logits/rejected": -2.4976108074188232, - "logps/chosen": -244.96316528320312, - "logps/rejected": -326.7379455566406, - "loss": 0.0849, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.0386927127838135, - "rewards/margins": 6.035378456115723, - "rewards/rejected": -8.074070930480957, - "step": 7690 + "learning_rate": 1.958905330718488e-07, + "logits/chosen": -2.4947333335876465, + "logits/rejected": -2.31681227684021, + "logps/chosen": -166.21688842773438, + "logps/rejected": -208.54916381835938, + "loss": 0.0551, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.137930154800415, + "rewards/margins": 4.669826030731201, + "rewards/rejected": -5.807755947113037, + "step": 8070 + }, + { + "epoch": 1.94, + "learning_rate": 1.9544482082367622e-07, + "logits/chosen": -2.649096965789795, + "logits/rejected": -2.5682971477508545, + "logps/chosen": -300.778564453125, + "logps/rejected": -409.8912048339844, + "loss": 0.0947, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6549450159072876, + "rewards/margins": 7.110496520996094, + "rewards/rejected": -8.76544189453125, + "step": 8080 }, { "epoch": 1.95, - "learning_rate": 1.9511281715195205e-07, - "logits/chosen": -2.569685697555542, - "logits/rejected": -2.408794641494751, - "logps/chosen": -253.5305633544922, - "logps/rejected": -320.5977783203125, - "loss": 0.088, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.7515738010406494, - "rewards/margins": 6.714259147644043, - "rewards/rejected": -8.46583366394043, - "step": 7700 + "learning_rate": 1.9499910857550365e-07, + "logits/chosen": -2.489932060241699, + "logits/rejected": -2.536132335662842, + "logps/chosen": -292.3930358886719, + "logps/rejected": -326.5362854003906, + "loss": 0.052, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1598970890045166, + "rewards/margins": 7.666459083557129, + "rewards/rejected": -7.826356410980225, + "step": 8090 }, { "epoch": 1.95, - "learning_rate": 1.9464469618949537e-07, - "logits/chosen": -2.457524538040161, - "logits/rejected": -2.29757022857666, - "logps/chosen": -262.61212158203125, - "logps/rejected": -294.0771484375, - "loss": 0.0492, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.6144840717315674, - "rewards/margins": 6.872900485992432, - "rewards/rejected": -8.487383842468262, - "step": 7710 + "learning_rate": 1.9455339632733105e-07, + "logits/chosen": -2.5426011085510254, + "logits/rejected": -2.46451473236084, + "logps/chosen": -225.72988891601562, + "logps/rejected": -318.11737060546875, + "loss": 0.1018, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.484419107437134, + "rewards/margins": 7.18081521987915, + "rewards/rejected": -9.665234565734863, + "step": 8100 }, { "epoch": 1.95, - "learning_rate": 1.9417657522703866e-07, - "logits/chosen": -2.5344772338867188, - "logits/rejected": -2.473503589630127, - "logps/chosen": -253.647216796875, - "logps/rejected": -355.26422119140625, - "loss": 0.0902, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.004709136672317982, - "rewards/margins": 8.908960342407227, - "rewards/rejected": -8.904251098632812, - "step": 7720 + "eval_logits/chosen": -2.4671175479888916, + "eval_logits/rejected": -2.4193367958068848, + "eval_logps/chosen": -249.4323272705078, + "eval_logps/rejected": -269.311279296875, + "eval_loss": 0.5381121039390564, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -5.347128391265869, + "eval_rewards/margins": 3.144692897796631, + "eval_rewards/rejected": -8.4918212890625, + "eval_runtime": 131.8178, + "eval_samples_per_second": 23.942, + "eval_steps_per_second": 0.379, + "step": 8100 }, { "epoch": 1.95, - "learning_rate": 1.9370845426458197e-07, - "logits/chosen": -2.347165584564209, - "logits/rejected": -2.3553099632263184, - "logps/chosen": -199.9696502685547, - "logps/rejected": -317.84295654296875, - "loss": 0.0735, + "learning_rate": 1.9410768407915848e-07, + "logits/chosen": -2.5479893684387207, + "logits/rejected": -2.475780963897705, + "logps/chosen": -213.58932495117188, + "logps/rejected": -332.92327880859375, + "loss": 0.1291, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.1516036987304688, - "rewards/margins": 6.281300067901611, - "rewards/rejected": -8.432904243469238, - "step": 7730 + "rewards/chosen": -1.070810317993164, + "rewards/margins": 7.458122253417969, + "rewards/rejected": -8.528932571411133, + "step": 8110 }, { - "epoch": 1.96, - "learning_rate": 1.9324033330212524e-07, - "logits/chosen": -2.6470491886138916, - "logits/rejected": -2.4362094402313232, - "logps/chosen": -248.72830200195312, - "logps/rejected": -278.7564697265625, - "loss": 0.0773, + "epoch": 1.95, + "learning_rate": 1.9366197183098589e-07, + "logits/chosen": -2.6560816764831543, + "logits/rejected": -2.6043269634246826, + "logps/chosen": -186.26986694335938, + "logps/rejected": -362.4632873535156, + "loss": 0.0746, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.181691288948059, - "rewards/margins": 7.5501861572265625, - "rewards/rejected": -8.731878280639648, - "step": 7740 + "rewards/chosen": -0.08851809799671173, + "rewards/margins": 11.514171600341797, + "rewards/rejected": -11.602689743041992, + "step": 8120 }, { "epoch": 1.96, - "learning_rate": 1.9277221233966855e-07, - "logits/chosen": -2.4118123054504395, - "logits/rejected": -2.496093273162842, - "logps/chosen": -251.16970825195312, - "logps/rejected": -275.74749755859375, - "loss": 0.0745, + "learning_rate": 1.9321625958281332e-07, + "logits/chosen": -2.7491543292999268, + "logits/rejected": -2.604905128479004, + "logps/chosen": -235.55526733398438, + "logps/rejected": -317.4085388183594, + "loss": 0.0872, "rewards/accuracies": 1.0, - "rewards/chosen": -1.6367145776748657, - "rewards/margins": 7.31857442855835, - "rewards/rejected": -8.955288887023926, - "step": 7750 + "rewards/chosen": -0.2736426293849945, + "rewards/margins": 10.446340560913086, + "rewards/rejected": -10.71998405456543, + "step": 8130 }, { "epoch": 1.96, - "learning_rate": 1.9230409137721187e-07, - "logits/chosen": -2.350545883178711, - "logits/rejected": -2.2462215423583984, - "logps/chosen": -307.57269287109375, - "logps/rejected": -275.0626525878906, - "loss": 0.0645, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.4821748733520508, - "rewards/margins": 9.54548454284668, - "rewards/rejected": -10.02765941619873, - "step": 7760 + "learning_rate": 1.9277054733464074e-07, + "logits/chosen": -2.5898823738098145, + "logits/rejected": -2.5371153354644775, + "logps/chosen": -246.69979858398438, + "logps/rejected": -356.67962646484375, + "loss": 0.0863, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4393095076084137, + "rewards/margins": 8.542855262756348, + "rewards/rejected": -8.98216438293457, + "step": 8140 }, { "epoch": 1.96, - "learning_rate": 1.9183597041475516e-07, - "logits/chosen": -2.6003167629241943, - "logits/rejected": -2.481722593307495, - "logps/chosen": -257.1544494628906, - "logps/rejected": -341.75543212890625, - "loss": 0.0574, + "learning_rate": 1.9232483508646815e-07, + "logits/chosen": -2.5945725440979004, + "logits/rejected": -2.5346193313598633, + "logps/chosen": -296.9828186035156, + "logps/rejected": -294.2388610839844, + "loss": 0.1253, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3400607109069824, + "rewards/margins": 12.43709945678711, + "rewards/rejected": -10.097040176391602, + "step": 8150 + }, + { + "epoch": 1.96, + "learning_rate": 1.9187912283829558e-07, + "logits/chosen": -2.762641668319702, + "logits/rejected": -2.509420394897461, + "logps/chosen": -223.868408203125, + "logps/rejected": -222.8394317626953, + "loss": 0.1855, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9357603192329407, + "rewards/margins": 5.512927532196045, + "rewards/rejected": -6.448688507080078, + "step": 8160 + }, + { + "epoch": 1.97, + "learning_rate": 1.91433410590123e-07, + "logits/chosen": -2.713310956954956, + "logits/rejected": -2.6913247108459473, + "logps/chosen": -280.3653869628906, + "logps/rejected": -370.3204345703125, + "loss": 0.1244, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.420027494430542, + "rewards/margins": 8.768559455871582, + "rewards/rejected": -8.348531723022461, + "step": 8170 + }, + { + "epoch": 1.97, + "learning_rate": 1.909876983419504e-07, + "logits/chosen": -2.5790278911590576, + "logits/rejected": -2.573655366897583, + "logps/chosen": -251.7449493408203, + "logps/rejected": -323.3601379394531, + "loss": 0.1279, "rewards/accuracies": 1.0, - "rewards/chosen": -1.408017873764038, - "rewards/margins": 7.46950626373291, - "rewards/rejected": -8.877523422241211, - "step": 7770 + "rewards/chosen": -0.10781435668468475, + "rewards/margins": 9.65677547454834, + "rewards/rejected": -9.764589309692383, + "step": 8180 }, { "epoch": 1.97, - "learning_rate": 1.9136784945229847e-07, - "logits/chosen": -2.6348369121551514, - "logits/rejected": -2.52321195602417, - "logps/chosen": -281.85699462890625, - "logps/rejected": -319.59405517578125, - "loss": 0.1264, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.7298279404640198, - "rewards/margins": 8.692323684692383, - "rewards/rejected": -9.422151565551758, - "step": 7780 + "learning_rate": 1.9054198609377787e-07, + "logits/chosen": -2.799701690673828, + "logits/rejected": -2.6298649311065674, + "logps/chosen": -278.739501953125, + "logps/rejected": -260.8819274902344, + "loss": 0.078, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.2646152973175049, + "rewards/margins": 7.12109375, + "rewards/rejected": -8.385709762573242, + "step": 8190 }, { "epoch": 1.97, - "learning_rate": 1.9089972848984176e-07, - "logits/chosen": -2.6521339416503906, - "logits/rejected": -2.540344476699829, - "logps/chosen": -305.10418701171875, - "logps/rejected": -263.56634521484375, - "loss": 0.0689, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.8900665044784546, - "rewards/margins": 6.8570051193237305, - "rewards/rejected": -7.747071743011475, - "step": 7790 + "learning_rate": 1.900962738456053e-07, + "logits/chosen": -2.6792545318603516, + "logits/rejected": -2.6745057106018066, + "logps/chosen": -189.7362060546875, + "logps/rejected": -358.11407470703125, + "loss": 0.0767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12119893729686737, + "rewards/margins": 11.700910568237305, + "rewards/rejected": -11.82210922241211, + "step": 8200 }, { "epoch": 1.97, - "learning_rate": 1.9043160752738508e-07, - "logits/chosen": -2.67622709274292, - "logits/rejected": -2.6399292945861816, - "logps/chosen": -355.0856628417969, - "logps/rejected": -337.4562683105469, - "loss": 0.0999, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6816396713256836, - "rewards/margins": 7.101809501647949, - "rewards/rejected": -7.783450126647949, - "step": 7800 + "eval_logits/chosen": -2.5328567028045654, + "eval_logits/rejected": -2.4873476028442383, + "eval_logps/chosen": -248.1124267578125, + "eval_logps/rejected": -268.1266784667969, + "eval_loss": 0.5385783910751343, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -5.215142250061035, + "eval_rewards/margins": 3.158216953277588, + "eval_rewards/rejected": -8.373359680175781, + "eval_runtime": 131.9123, + "eval_samples_per_second": 23.925, + "eval_steps_per_second": 0.379, + "step": 8200 }, { - "epoch": 1.97, - "learning_rate": 1.8996348656492837e-07, - "logits/chosen": -2.376469135284424, - "logits/rejected": -2.32849383354187, - "logps/chosen": -207.29660034179688, - "logps/rejected": -284.89007568359375, - "loss": 0.079, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.5479943752288818, - "rewards/margins": 6.303579330444336, - "rewards/rejected": -7.851573944091797, - "step": 7810 + "epoch": 1.98, + "learning_rate": 1.896505615974327e-07, + "logits/chosen": -2.6724190711975098, + "logits/rejected": -2.635072708129883, + "logps/chosen": -211.46078491210938, + "logps/rejected": -405.99066162109375, + "loss": 0.1025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8314496874809265, + "rewards/margins": 10.228105545043945, + "rewards/rejected": -11.059555053710938, + "step": 8210 }, { "epoch": 1.98, - "learning_rate": 1.8949536560247168e-07, - "logits/chosen": -2.3354849815368652, - "logits/rejected": -2.3749680519104004, - "logps/chosen": -181.08316040039062, - "logps/rejected": -217.5195770263672, - "loss": 0.0984, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.27036499977111816, - "rewards/margins": 6.646331787109375, - "rewards/rejected": -6.916696071624756, - "step": 7820 + "learning_rate": 1.8920484934926013e-07, + "logits/chosen": -2.6155571937561035, + "logits/rejected": -2.577794313430786, + "logps/chosen": -179.88742065429688, + "logps/rejected": -251.8545379638672, + "loss": 0.0791, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4434904158115387, + "rewards/margins": 7.135495662689209, + "rewards/rejected": -7.578986167907715, + "step": 8220 }, { "epoch": 1.98, - "learning_rate": 1.8902724464001497e-07, - "logits/chosen": -2.676802158355713, - "logits/rejected": -2.4680111408233643, - "logps/chosen": -322.8711853027344, - "logps/rejected": -424.1873474121094, - "loss": 0.1034, + "learning_rate": 1.8875913710108753e-07, + "logits/chosen": -2.640148639678955, + "logits/rejected": -2.4899373054504395, + "logps/chosen": -259.8675231933594, + "logps/rejected": -249.64013671875, + "loss": 0.0928, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.44950175285339355, - "rewards/margins": 8.20262336730957, - "rewards/rejected": -8.652125358581543, - "step": 7830 + "rewards/chosen": -3.0020036697387695, + "rewards/margins": 6.6126813888549805, + "rewards/rejected": -9.614686012268066, + "step": 8230 }, { "epoch": 1.98, - "learning_rate": 1.8855912367755826e-07, - "logits/chosen": -2.494516372680664, - "logits/rejected": -2.5215835571289062, - "logps/chosen": -227.83432006835938, - "logps/rejected": -321.3638916015625, - "loss": 0.0906, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.327940821647644, - "rewards/margins": 6.914625644683838, - "rewards/rejected": -8.24256706237793, - "step": 7840 + "learning_rate": 1.8831342485291496e-07, + "logits/chosen": -2.8600502014160156, + "logits/rejected": -2.792074203491211, + "logps/chosen": -238.5311279296875, + "logps/rejected": -364.1710510253906, + "loss": 0.0462, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07950621098279953, + "rewards/margins": 9.228584289550781, + "rewards/rejected": -9.149078369140625, + "step": 8240 }, { - "epoch": 1.98, - "learning_rate": 1.8809100271510155e-07, - "logits/chosen": -2.4991791248321533, - "logits/rejected": -2.3919272422790527, - "logps/chosen": -243.00161743164062, - "logps/rejected": -311.275390625, - "loss": 0.0558, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.8239580392837524, - "rewards/margins": 7.247486114501953, - "rewards/rejected": -9.071443557739258, - "step": 7850 + "epoch": 1.99, + "learning_rate": 1.878677126047424e-07, + "logits/chosen": -2.578575611114502, + "logits/rejected": -2.5697288513183594, + "logps/chosen": -306.8039245605469, + "logps/rejected": -302.91839599609375, + "loss": 0.1767, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1644628047943115, + "rewards/margins": 9.136687278747559, + "rewards/rejected": -10.301149368286133, + "step": 8250 }, { "epoch": 1.99, - "learning_rate": 1.8762288175264487e-07, - "logits/chosen": -2.4490151405334473, - "logits/rejected": -2.3293774127960205, - "logps/chosen": -335.91619873046875, - "logps/rejected": -364.5838928222656, - "loss": 0.1274, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.4316998720169067, - "rewards/margins": 7.8372673988342285, - "rewards/rejected": -9.268967628479004, - "step": 7860 + "learning_rate": 1.874220003565698e-07, + "logits/chosen": -2.7323837280273438, + "logits/rejected": -2.714454174041748, + "logps/chosen": -290.375244140625, + "logps/rejected": -332.1299743652344, + "loss": 0.0912, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6332210302352905, + "rewards/margins": 8.374825477600098, + "rewards/rejected": -10.008047103881836, + "step": 8260 }, { "epoch": 1.99, - "learning_rate": 1.8715476079018818e-07, - "logits/chosen": -2.497100591659546, - "logits/rejected": -2.379605770111084, - "logps/chosen": -226.01351928710938, - "logps/rejected": -261.7375793457031, - "loss": 0.2659, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.5435857772827148, - "rewards/margins": 6.394545078277588, - "rewards/rejected": -7.9381303787231445, - "step": 7870 + "learning_rate": 1.8697628810839722e-07, + "logits/chosen": -2.76542592048645, + "logits/rejected": -2.67024564743042, + "logps/chosen": -427.3636779785156, + "logps/rejected": -323.9165344238281, + "loss": 0.0812, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4392527937889099, + "rewards/margins": 8.453493118286133, + "rewards/rejected": -8.014241218566895, + "step": 8270 }, { "epoch": 1.99, - "learning_rate": 1.8668663982773147e-07, - "logits/chosen": -2.4984803199768066, - "logits/rejected": -2.4084489345550537, - "logps/chosen": -278.9674377441406, - "logps/rejected": -300.3205261230469, - "loss": 0.1404, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5822738409042358, - "rewards/margins": 8.082863807678223, - "rewards/rejected": -8.665136337280273, - "step": 7880 + "learning_rate": 1.8653057586022465e-07, + "logits/chosen": -2.6926684379577637, + "logits/rejected": -2.629220724105835, + "logps/chosen": -273.28399658203125, + "logps/rejected": -327.89324951171875, + "loss": 0.0633, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6484842300415039, + "rewards/margins": 8.712350845336914, + "rewards/rejected": -8.063865661621094, + "step": 8280 }, { - "epoch": 1.99, - "learning_rate": 1.862185188652748e-07, - "logits/chosen": -2.2636733055114746, - "logits/rejected": -2.1600308418273926, - "logps/chosen": -235.1582794189453, - "logps/rejected": -430.0575256347656, - "loss": 0.0936, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.06023601442575455, - "rewards/margins": 12.86986255645752, - "rewards/rejected": -12.930097579956055, - "step": 7890 + "epoch": 2.0, + "learning_rate": 1.8608486361205205e-07, + "logits/chosen": -2.604889154434204, + "logits/rejected": -2.561067581176758, + "logps/chosen": -271.330078125, + "logps/rejected": -347.42889404296875, + "loss": 0.1449, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2322348356246948, + "rewards/margins": 9.401994705200195, + "rewards/rejected": -10.63422966003418, + "step": 8290 }, { "epoch": 2.0, - "learning_rate": 1.8575039790281808e-07, - "logits/chosen": -2.718794822692871, - "logits/rejected": -2.6176838874816895, - "logps/chosen": -297.5869140625, - "logps/rejected": -413.158203125, - "loss": 0.1766, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.0013257265090942, - "rewards/margins": 9.500307083129883, - "rewards/rejected": -10.501633644104004, - "step": 7900 + "learning_rate": 1.8563915136387948e-07, + "logits/chosen": -2.636011838912964, + "logits/rejected": -2.547791004180908, + "logps/chosen": -267.44732666015625, + "logps/rejected": -222.2125244140625, + "loss": 0.0801, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4617294073104858, + "rewards/margins": 5.576994895935059, + "rewards/rejected": -7.038724422454834, + "step": 8300 }, { "epoch": 2.0, - "learning_rate": 1.852822769403614e-07, - "logits/chosen": -2.5108580589294434, - "logits/rejected": -2.5221803188323975, - "logps/chosen": -266.2281188964844, - "logps/rejected": -489.9151306152344, - "loss": 0.051, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.1882187128067017, - "rewards/margins": 11.635540008544922, - "rewards/rejected": -12.823759078979492, - "step": 7910 + "eval_logits/chosen": -2.486698627471924, + "eval_logits/rejected": -2.434771776199341, + "eval_logps/chosen": -254.06394958496094, + "eval_logps/rejected": -274.7842102050781, + "eval_loss": 0.5429018139839172, + "eval_rewards/accuracies": 0.6575000286102295, + "eval_rewards/chosen": -5.810294151306152, + "eval_rewards/margins": 3.228818416595459, + "eval_rewards/rejected": -9.039112091064453, + "eval_runtime": 131.7281, + "eval_samples_per_second": 23.958, + "eval_steps_per_second": 0.38, + "step": 8300 }, { "epoch": 2.0, - "learning_rate": 1.8481415597790468e-07, - "logits/chosen": -2.561251163482666, - "logits/rejected": -2.601534605026245, - "logps/chosen": -256.61260986328125, - "logps/rejected": -371.5245666503906, - "loss": 0.0257, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.7856419086456299, - "rewards/margins": 9.267403602600098, - "rewards/rejected": -10.053044319152832, - "step": 7920 + "learning_rate": 1.8519343911570688e-07, + "logits/chosen": -2.4333741664886475, + "logits/rejected": -2.343681573867798, + "logps/chosen": -227.2239227294922, + "logps/rejected": -378.5175476074219, + "loss": 0.1149, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.0861270427703857, + "rewards/margins": 8.761810302734375, + "rewards/rejected": -10.847936630249023, + "step": 8310 }, { "epoch": 2.0, - "learning_rate": 1.84346035015448e-07, - "logits/chosen": -2.456209659576416, - "logits/rejected": -2.4456276893615723, - "logps/chosen": -269.7339782714844, - "logps/rejected": -317.511474609375, - "loss": 0.0271, + "learning_rate": 1.8474772686753431e-07, + "logits/chosen": -2.632474422454834, + "logits/rejected": -2.481940746307373, + "logps/chosen": -251.89047241210938, + "logps/rejected": -295.0331726074219, + "loss": 0.0484, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.502909779548645, + "rewards/margins": 8.537437438964844, + "rewards/rejected": -10.040348052978516, + "step": 8320 + }, + { + "epoch": 2.0, + "learning_rate": 1.8430201461936174e-07, + "logits/chosen": -2.5819125175476074, + "logits/rejected": -2.621971368789673, + "logps/chosen": -192.30247497558594, + "logps/rejected": -319.94122314453125, + "loss": 0.0361, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.5396373867988586, - "rewards/margins": 9.451229095458984, - "rewards/rejected": -9.990866661071777, - "step": 7930 + "rewards/chosen": -0.7425268888473511, + "rewards/margins": 8.981419563293457, + "rewards/rejected": -9.723945617675781, + "step": 8330 }, { "epoch": 2.01, - "learning_rate": 1.8387791405299126e-07, - "logits/chosen": -2.3685498237609863, - "logits/rejected": -2.5117557048797607, - "logps/chosen": -263.42596435546875, - "logps/rejected": -358.99200439453125, - "loss": 0.0242, + "learning_rate": 1.8385630237118915e-07, + "logits/chosen": -2.8007924556732178, + "logits/rejected": -2.741535186767578, + "logps/chosen": -287.9234313964844, + "logps/rejected": -362.2257995605469, + "loss": 0.0302, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.06902637332677841, - "rewards/margins": 11.954598426818848, - "rewards/rejected": -12.023625373840332, - "step": 7940 + "rewards/chosen": -1.950335144996643, + "rewards/margins": 10.085515975952148, + "rewards/rejected": -12.035852432250977, + "step": 8340 }, { "epoch": 2.01, - "learning_rate": 1.8340979309053458e-07, - "logits/chosen": -2.643951892852783, - "logits/rejected": -2.4455976486206055, - "logps/chosen": -264.4395446777344, - "logps/rejected": -310.5169982910156, - "loss": 0.0164, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.0862385481595993, - "rewards/margins": 10.593948364257812, - "rewards/rejected": -10.68018627166748, - "step": 7950 + "learning_rate": 1.8341059012301658e-07, + "logits/chosen": -2.512610673904419, + "logits/rejected": -2.5478758811950684, + "logps/chosen": -292.9354248046875, + "logps/rejected": -442.20147705078125, + "loss": 0.036, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7576490640640259, + "rewards/margins": 9.573620796203613, + "rewards/rejected": -11.331270217895508, + "step": 8350 }, { "epoch": 2.01, - "learning_rate": 1.829416721280779e-07, - "logits/chosen": -2.6394400596618652, - "logits/rejected": -2.614877939224243, - "logps/chosen": -283.7574157714844, - "logps/rejected": -358.73687744140625, - "loss": 0.0389, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.05305292457342148, - "rewards/margins": 9.071102142333984, - "rewards/rejected": -9.12415599822998, - "step": 7960 + "learning_rate": 1.82964877874844e-07, + "logits/chosen": -2.567701816558838, + "logits/rejected": -2.6314425468444824, + "logps/chosen": -213.1210479736328, + "logps/rejected": -288.82269287109375, + "loss": 0.0235, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5137118101119995, + "rewards/margins": 7.740043640136719, + "rewards/rejected": -8.253755569458008, + "step": 8360 }, { "epoch": 2.01, - "learning_rate": 1.8247355116562118e-07, - "logits/chosen": -2.4935081005096436, - "logits/rejected": -2.388760805130005, - "logps/chosen": -250.9310760498047, - "logps/rejected": -325.74835205078125, - "loss": 0.0234, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.061603307723999, - "rewards/margins": 6.261511325836182, - "rewards/rejected": -7.32311487197876, - "step": 7970 + "learning_rate": 1.825191656266714e-07, + "logits/chosen": -2.7058777809143066, + "logits/rejected": -2.6668639183044434, + "logps/chosen": -295.42303466796875, + "logps/rejected": -297.51226806640625, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03315739706158638, + "rewards/margins": 10.056219100952148, + "rewards/rejected": -10.089376449584961, + "step": 8370 }, { "epoch": 2.02, - "learning_rate": 1.820054302031645e-07, - "logits/chosen": -2.652278423309326, - "logits/rejected": -2.5909059047698975, - "logps/chosen": -204.5234375, - "logps/rejected": -269.10760498046875, - "loss": 0.0267, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3194776177406311, - "rewards/margins": 8.187994003295898, - "rewards/rejected": -8.50747299194336, - "step": 7980 + "learning_rate": 1.8207345337849884e-07, + "logits/chosen": -2.599362850189209, + "logits/rejected": -2.615999460220337, + "logps/chosen": -249.38247680664062, + "logps/rejected": -313.9382629394531, + "loss": 0.0388, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.276896595954895, + "rewards/margins": 9.06472396850586, + "rewards/rejected": -10.341619491577148, + "step": 8380 }, { "epoch": 2.02, - "learning_rate": 1.815373092407078e-07, - "logits/chosen": -2.5203022956848145, - "logits/rejected": -2.4558236598968506, - "logps/chosen": -259.03582763671875, - "logps/rejected": -283.21807861328125, - "loss": 0.0216, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.8201421499252319, - "rewards/margins": 6.852190971374512, - "rewards/rejected": -7.672332763671875, - "step": 7990 + "learning_rate": 1.8162774113032624e-07, + "logits/chosen": -2.346391201019287, + "logits/rejected": -2.2419934272766113, + "logps/chosen": -249.8375244140625, + "logps/rejected": -366.44683837890625, + "loss": 0.035, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6584094762802124, + "rewards/margins": 8.798803329467773, + "rewards/rejected": -10.457212448120117, + "step": 8390 }, { "epoch": 2.02, - "learning_rate": 1.810691882782511e-07, - "logits/chosen": -2.5171782970428467, - "logits/rejected": -2.3930742740631104, - "logps/chosen": -310.53656005859375, - "logps/rejected": -249.26687622070312, - "loss": 0.0274, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.053589820861816406, - "rewards/margins": 8.388738632202148, - "rewards/rejected": -8.335149765014648, - "step": 8000 + "learning_rate": 1.8118202888215367e-07, + "logits/chosen": -2.6704933643341064, + "logits/rejected": -2.606541872024536, + "logps/chosen": -310.34149169921875, + "logps/rejected": -318.3885192871094, + "loss": 0.034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3962994813919067, + "rewards/margins": 7.977416038513184, + "rewards/rejected": -9.3737154006958, + "step": 8400 }, { "epoch": 2.02, - "learning_rate": 1.806010673157944e-07, - "logits/chosen": -2.5312087535858154, - "logits/rejected": -2.4033422470092773, - "logps/chosen": -299.86273193359375, - "logps/rejected": -359.2492370605469, - "loss": 0.0208, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.21362681686878204, - "rewards/margins": 8.105158805847168, - "rewards/rejected": -8.318785667419434, - "step": 8010 + "eval_logits/chosen": -2.427152395248413, + "eval_logits/rejected": -2.3678908348083496, + "eval_logps/chosen": -253.86766052246094, + "eval_logps/rejected": -276.8174743652344, + "eval_loss": 0.5565958619117737, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -5.79066276550293, + "eval_rewards/margins": 3.4517767429351807, + "eval_rewards/rejected": -9.242439270019531, + "eval_runtime": 131.8134, + "eval_samples_per_second": 23.943, + "eval_steps_per_second": 0.379, + "step": 8400 }, { - "epoch": 2.03, - "learning_rate": 1.801329463533377e-07, - "logits/chosen": -2.6355605125427246, - "logits/rejected": -2.521852970123291, - "logps/chosen": -234.41146850585938, - "logps/rejected": -390.18963623046875, - "loss": 0.0278, + "epoch": 2.02, + "learning_rate": 1.807363166339811e-07, + "logits/chosen": -2.7115583419799805, + "logits/rejected": -2.583926200866699, + "logps/chosen": -240.94540405273438, + "logps/rejected": -428.8079528808594, + "loss": 0.0221, "rewards/accuracies": 1.0, - "rewards/chosen": 0.062195561826229095, - "rewards/margins": 10.18429183959961, - "rewards/rejected": -10.122096061706543, - "step": 8020 + "rewards/chosen": -1.298967719078064, + "rewards/margins": 10.36292839050293, + "rewards/rejected": -11.661896705627441, + "step": 8410 }, { "epoch": 2.03, - "learning_rate": 1.79664825390881e-07, - "logits/chosen": -2.543912410736084, - "logits/rejected": -2.3445167541503906, - "logps/chosen": -274.00604248046875, - "logps/rejected": -264.05810546875, - "loss": 0.0171, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.859461784362793, - "rewards/margins": 8.486991882324219, - "rewards/rejected": -9.346452713012695, - "step": 8030 + "learning_rate": 1.802906043858085e-07, + "logits/chosen": -2.5375876426696777, + "logits/rejected": -2.5282256603240967, + "logps/chosen": -245.84805297851562, + "logps/rejected": -326.02740478515625, + "loss": 0.0544, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.797917366027832, + "rewards/margins": 8.332709312438965, + "rewards/rejected": -10.130627632141113, + "step": 8420 }, { "epoch": 2.03, - "learning_rate": 1.791967044284243e-07, - "logits/chosen": -2.34971284866333, - "logits/rejected": -2.3091883659362793, - "logps/chosen": -189.92611694335938, - "logps/rejected": -276.96270751953125, - "loss": 0.0207, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5734513998031616, - "rewards/margins": 7.828526496887207, - "rewards/rejected": -9.401978492736816, - "step": 8040 + "learning_rate": 1.7984489213763593e-07, + "logits/chosen": -2.725847005844116, + "logits/rejected": -2.6780190467834473, + "logps/chosen": -275.8963317871094, + "logps/rejected": -372.16851806640625, + "loss": 0.0425, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.3954068422317505, + "rewards/margins": 10.925408363342285, + "rewards/rejected": -12.32081413269043, + "step": 8430 }, { "epoch": 2.03, - "learning_rate": 1.7872858346596758e-07, - "logits/chosen": -2.5440075397491455, - "logits/rejected": -2.3190009593963623, - "logps/chosen": -291.01593017578125, - "logps/rejected": -358.1510009765625, - "loss": 0.0401, + "learning_rate": 1.7939917988946336e-07, + "logits/chosen": -2.3743064403533936, + "logits/rejected": -2.3651223182678223, + "logps/chosen": -237.2482452392578, + "logps/rejected": -266.89056396484375, + "loss": 0.0305, "rewards/accuracies": 1.0, - "rewards/chosen": -0.44751954078674316, - "rewards/margins": 8.990605354309082, - "rewards/rejected": -9.438124656677246, - "step": 8050 + "rewards/chosen": -1.51999032497406, + "rewards/margins": 9.4834566116333, + "rewards/rejected": -11.003446578979492, + "step": 8440 + }, + { + "epoch": 2.03, + "learning_rate": 1.7895346764129076e-07, + "logits/chosen": -2.687239170074463, + "logits/rejected": -2.538908004760742, + "logps/chosen": -318.89630126953125, + "logps/rejected": -357.08477783203125, + "loss": 0.0258, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5173957943916321, + "rewards/margins": 10.455747604370117, + "rewards/rejected": -9.938352584838867, + "step": 8450 }, { "epoch": 2.04, - "learning_rate": 1.782604625035109e-07, - "logits/chosen": -2.489872455596924, - "logits/rejected": -2.409348249435425, - "logps/chosen": -235.9306640625, - "logps/rejected": -279.5631103515625, - "loss": 0.0205, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.1051379442214966, - "rewards/margins": 7.312878608703613, - "rewards/rejected": -8.41801643371582, - "step": 8060 + "learning_rate": 1.785077553931182e-07, + "logits/chosen": -2.686711311340332, + "logits/rejected": -2.5898630619049072, + "logps/chosen": -233.9967041015625, + "logps/rejected": -396.8772277832031, + "loss": 0.0201, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.22783681750297546, + "rewards/margins": 11.44433307647705, + "rewards/rejected": -11.67216968536377, + "step": 8460 }, { "epoch": 2.04, - "learning_rate": 1.777923415410542e-07, - "logits/chosen": -2.252562999725342, - "logits/rejected": -2.2914912700653076, - "logps/chosen": -222.335693359375, - "logps/rejected": -248.390380859375, - "loss": 0.0174, + "learning_rate": 1.780620431449456e-07, + "logits/chosen": -2.5094399452209473, + "logits/rejected": -2.3309948444366455, + "logps/chosen": -221.36325073242188, + "logps/rejected": -314.6371154785156, + "loss": 0.0331, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.3904139995574951, - "rewards/margins": 7.800166130065918, - "rewards/rejected": -9.190579414367676, - "step": 8070 + "rewards/chosen": -0.18073305487632751, + "rewards/margins": 11.4480562210083, + "rewards/rejected": -11.628789901733398, + "step": 8470 }, { "epoch": 2.04, - "learning_rate": 1.773242205785975e-07, - "logits/chosen": -2.641828775405884, - "logits/rejected": -2.566885232925415, - "logps/chosen": -332.9060363769531, - "logps/rejected": -376.92059326171875, - "loss": 0.0128, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.2503730356693268, - "rewards/margins": 10.505960464477539, - "rewards/rejected": -10.25558853149414, - "step": 8080 + "learning_rate": 1.7761633089677302e-07, + "logits/chosen": -2.6071910858154297, + "logits/rejected": -2.629465103149414, + "logps/chosen": -265.3450927734375, + "logps/rejected": -323.4317932128906, + "loss": 0.0335, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8022689819335938, + "rewards/margins": 8.804093360900879, + "rewards/rejected": -10.606361389160156, + "step": 8480 }, { "epoch": 2.04, - "learning_rate": 1.768560996161408e-07, - "logits/chosen": -2.5699288845062256, - "logits/rejected": -2.359781503677368, - "logps/chosen": -283.2177734375, - "logps/rejected": -267.2395935058594, - "loss": 0.1899, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8085619807243347, - "rewards/margins": 8.801640510559082, - "rewards/rejected": -9.610200881958008, - "step": 8090 + "learning_rate": 1.7717061864860045e-07, + "logits/chosen": -2.7370150089263916, + "logits/rejected": -2.6126275062561035, + "logps/chosen": -279.35992431640625, + "logps/rejected": -292.2677001953125, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2729337513446808, + "rewards/margins": 9.686235427856445, + "rewards/rejected": -9.413301467895508, + "step": 8490 }, { "epoch": 2.05, - "learning_rate": 1.763879786536841e-07, - "logits/chosen": -2.3845903873443604, - "logits/rejected": -2.3299880027770996, - "logps/chosen": -269.281005859375, - "logps/rejected": -364.01177978515625, - "loss": 0.0265, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.045176975429058075, - "rewards/margins": 11.046797752380371, - "rewards/rejected": -11.001619338989258, - "step": 8100 + "learning_rate": 1.7672490640042786e-07, + "logits/chosen": -2.5374059677124023, + "logits/rejected": -2.457620143890381, + "logps/chosen": -311.1575622558594, + "logps/rejected": -338.55517578125, + "loss": 0.0246, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.676213026046753, + "rewards/margins": 10.284980773925781, + "rewards/rejected": -11.961193084716797, + "step": 8500 }, { "epoch": 2.05, - "learning_rate": 1.7591985769122742e-07, - "logits/chosen": -2.6512389183044434, - "logits/rejected": -2.4565510749816895, - "logps/chosen": -293.4483642578125, - "logps/rejected": -316.6645812988281, - "loss": 0.0168, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.3632938861846924, - "rewards/margins": 7.443239688873291, - "rewards/rejected": -8.806532859802246, - "step": 8110 + "eval_logits/chosen": -2.3957743644714355, + "eval_logits/rejected": -2.333507776260376, + "eval_logps/chosen": -252.27833557128906, + "eval_logps/rejected": -275.9263916015625, + "eval_loss": 0.5758479833602905, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -5.631732940673828, + "eval_rewards/margins": 3.521597385406494, + "eval_rewards/rejected": -9.153331756591797, + "eval_runtime": 131.9377, + "eval_samples_per_second": 23.92, + "eval_steps_per_second": 0.379, + "step": 8500 }, { "epoch": 2.05, - "learning_rate": 1.754517367287707e-07, - "logits/chosen": -2.4491851329803467, - "logits/rejected": -2.3319764137268066, - "logps/chosen": -191.01324462890625, - "logps/rejected": -290.0277404785156, - "loss": 0.0316, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.18040040135383606, - "rewards/margins": 10.047754287719727, - "rewards/rejected": -10.228155136108398, - "step": 8120 + "learning_rate": 1.7627919415225529e-07, + "logits/chosen": -2.585289716720581, + "logits/rejected": -2.5478899478912354, + "logps/chosen": -216.0231170654297, + "logps/rejected": -328.88055419921875, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.875042736530304, + "rewards/margins": 12.213201522827148, + "rewards/rejected": -13.08824348449707, + "step": 8510 }, { - "epoch": 2.06, - "learning_rate": 1.7498361576631402e-07, - "logits/chosen": -2.584289073944092, - "logits/rejected": -2.410949945449829, - "logps/chosen": -220.06869506835938, - "logps/rejected": -292.25885009765625, - "loss": 0.0287, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.637312889099121, - "rewards/margins": 9.337835311889648, - "rewards/rejected": -10.975146293640137, - "step": 8130 + "epoch": 2.05, + "learning_rate": 1.7583348190408272e-07, + "logits/chosen": -2.2625935077667236, + "logits/rejected": -2.296347141265869, + "logps/chosen": -185.80349731445312, + "logps/rejected": -276.11492919921875, + "loss": 0.0407, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.706001877784729, + "rewards/margins": 12.223898887634277, + "rewards/rejected": -11.51789665222168, + "step": 8520 + }, + { + "epoch": 2.05, + "learning_rate": 1.7538776965591012e-07, + "logits/chosen": -2.6731491088867188, + "logits/rejected": -2.5285797119140625, + "logps/chosen": -220.24111938476562, + "logps/rejected": -294.22369384765625, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3502438068389893, + "rewards/margins": 9.295039176940918, + "rewards/rejected": -10.645281791687012, + "step": 8530 }, { "epoch": 2.06, - "learning_rate": 1.745154948038573e-07, - "logits/chosen": -2.5168070793151855, - "logits/rejected": -2.3504860401153564, - "logps/chosen": -291.9444885253906, - "logps/rejected": -430.6874084472656, - "loss": 0.0212, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.9920564889907837, - "rewards/margins": 9.87191390991211, - "rewards/rejected": -11.863969802856445, - "step": 8140 + "learning_rate": 1.7494205740773757e-07, + "logits/chosen": -2.501774311065674, + "logits/rejected": -2.466353178024292, + "logps/chosen": -258.7721862792969, + "logps/rejected": -350.9908447265625, + "loss": 0.0416, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7618000507354736, + "rewards/margins": 8.672781944274902, + "rewards/rejected": -10.434581756591797, + "step": 8540 }, { "epoch": 2.06, - "learning_rate": 1.740473738414006e-07, - "logits/chosen": -2.504371404647827, - "logits/rejected": -2.3894200325012207, - "logps/chosen": -317.1540222167969, - "logps/rejected": -293.84375, - "loss": 0.0229, + "learning_rate": 1.7449634515956498e-07, + "logits/chosen": -2.741926670074463, + "logits/rejected": -2.656954288482666, + "logps/chosen": -280.8689880371094, + "logps/rejected": -352.93304443359375, + "loss": 0.0312, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.0019736289978027, - "rewards/margins": 6.3885931968688965, - "rewards/rejected": -8.390567779541016, - "step": 8150 + "rewards/chosen": -0.4702383875846863, + "rewards/margins": 9.958547592163086, + "rewards/rejected": -10.42878532409668, + "step": 8550 }, { "epoch": 2.06, - "learning_rate": 1.735792528789439e-07, - "logits/chosen": -2.6090266704559326, - "logits/rejected": -2.4409031867980957, - "logps/chosen": -314.2073669433594, - "logps/rejected": -401.43157958984375, - "loss": 0.0287, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9937704801559448, - "rewards/margins": 9.054048538208008, - "rewards/rejected": -10.047819137573242, - "step": 8160 + "learning_rate": 1.740506329113924e-07, + "logits/chosen": -2.575545072555542, + "logits/rejected": -2.607477903366089, + "logps/chosen": -245.94775390625, + "logps/rejected": -293.87603759765625, + "loss": 0.0502, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8155536651611328, + "rewards/margins": 8.2267427444458, + "rewards/rejected": -10.042295455932617, + "step": 8560 + }, + { + "epoch": 2.06, + "learning_rate": 1.7360492066321984e-07, + "logits/chosen": -2.671506404876709, + "logits/rejected": -2.528284788131714, + "logps/chosen": -292.8680114746094, + "logps/rejected": -336.23797607421875, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01942155323922634, + "rewards/margins": 11.124090194702148, + "rewards/rejected": -11.104668617248535, + "step": 8570 + }, + { + "epoch": 2.06, + "learning_rate": 1.7315920841504724e-07, + "logits/chosen": -2.6238932609558105, + "logits/rejected": -2.4847018718719482, + "logps/chosen": -262.1552734375, + "logps/rejected": -282.15142822265625, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9679250717163086, + "rewards/margins": 8.12026309967041, + "rewards/rejected": -9.088189125061035, + "step": 8580 }, { "epoch": 2.07, - "learning_rate": 1.731111319164872e-07, - "logits/chosen": -2.321427345275879, - "logits/rejected": -2.2758946418762207, - "logps/chosen": -152.3300018310547, - "logps/rejected": -221.68136596679688, - "loss": 0.0069, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.7852057814598083, - "rewards/margins": 8.7341890335083, - "rewards/rejected": -9.519393920898438, - "step": 8170 + "learning_rate": 1.7271349616687467e-07, + "logits/chosen": -2.2478580474853516, + "logits/rejected": -2.1565537452697754, + "logps/chosen": -256.3941955566406, + "logps/rejected": -427.9732971191406, + "loss": 0.0449, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0428926944732666, + "rewards/margins": 12.040678024291992, + "rewards/rejected": -13.083572387695312, + "step": 8590 }, { "epoch": 2.07, - "learning_rate": 1.7264301095403052e-07, - "logits/chosen": -2.5600805282592773, - "logits/rejected": -2.400702714920044, - "logps/chosen": -327.7169494628906, - "logps/rejected": -362.20648193359375, - "loss": 0.0184, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.4652659893035889, - "rewards/margins": 10.510732650756836, - "rewards/rejected": -11.975997924804688, - "step": 8180 + "learning_rate": 1.722677839187021e-07, + "logits/chosen": -2.5653958320617676, + "logits/rejected": -2.5322914123535156, + "logps/chosen": -253.1537322998047, + "logps/rejected": -327.99798583984375, + "loss": 0.0187, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.9576309323310852, + "rewards/margins": 9.887900352478027, + "rewards/rejected": -10.845531463623047, + "step": 8600 }, { "epoch": 2.07, - "learning_rate": 1.721748899915738e-07, - "logits/chosen": -2.646230459213257, - "logits/rejected": -2.510148525238037, - "logps/chosen": -252.9543914794922, - "logps/rejected": -352.07916259765625, - "loss": 0.0117, + "eval_logits/chosen": -2.4165918827056885, + "eval_logits/rejected": -2.361424207687378, + "eval_logps/chosen": -251.7559051513672, + "eval_logps/rejected": -276.9613342285156, + "eval_loss": 0.5770373344421387, + "eval_rewards/accuracies": 0.6725000143051147, + "eval_rewards/chosen": -5.579489707946777, + "eval_rewards/margins": 3.6773383617401123, + "eval_rewards/rejected": -9.256827354431152, + "eval_runtime": 132.1991, + "eval_samples_per_second": 23.873, + "eval_steps_per_second": 0.378, + "step": 8600 + }, + { + "epoch": 2.07, + "learning_rate": 1.718220716705295e-07, + "logits/chosen": -2.648555040359497, + "logits/rejected": -2.5799965858459473, + "logps/chosen": -248.0072021484375, + "logps/rejected": -317.71514892578125, + "loss": 0.0176, "rewards/accuracies": 1.0, - "rewards/chosen": 0.06406328827142715, - "rewards/margins": 10.646974563598633, - "rewards/rejected": -10.582910537719727, - "step": 8190 + "rewards/chosen": -0.8246999979019165, + "rewards/margins": 9.367280006408691, + "rewards/rejected": -10.19197940826416, + "step": 8610 }, { "epoch": 2.07, - "learning_rate": 1.7170676902911713e-07, - "logits/chosen": -2.5234501361846924, - "logits/rejected": -2.3625144958496094, - "logps/chosen": -216.58914184570312, - "logps/rejected": -283.11175537109375, - "loss": 0.0169, + "learning_rate": 1.7137635942235693e-07, + "logits/chosen": -2.6260035037994385, + "logits/rejected": -2.5549159049987793, + "logps/chosen": -301.1200866699219, + "logps/rejected": -366.55859375, + "loss": 0.0135, "rewards/accuracies": 1.0, - "rewards/chosen": -0.7709776163101196, - "rewards/margins": 7.509818077087402, - "rewards/rejected": -8.280795097351074, - "step": 8200 + "rewards/chosen": -1.4152629375457764, + "rewards/margins": 11.261905670166016, + "rewards/rejected": -12.677168846130371, + "step": 8620 }, { "epoch": 2.08, - "learning_rate": 1.7123864806666042e-07, - "logits/chosen": -2.4937376976013184, - "logits/rejected": -2.333495616912842, - "logps/chosen": -267.9828796386719, - "logps/rejected": -351.60833740234375, - "loss": 0.0193, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5620592832565308, - "rewards/margins": 10.002971649169922, - "rewards/rejected": -11.565031051635742, - "step": 8210 + "learning_rate": 1.7093064717418433e-07, + "logits/chosen": -2.639784336090088, + "logits/rejected": -2.4940614700317383, + "logps/chosen": -264.86962890625, + "logps/rejected": -354.2148132324219, + "loss": 0.022, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.5261491537094116, + "rewards/margins": 11.026629447937012, + "rewards/rejected": -12.552778244018555, + "step": 8630 }, { "epoch": 2.08, - "learning_rate": 1.7077052710420373e-07, - "logits/chosen": -2.5776846408843994, - "logits/rejected": -2.494020938873291, - "logps/chosen": -236.8033447265625, - "logps/rejected": -289.5408935546875, - "loss": 0.0207, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.069629192352295, - "rewards/margins": 8.876639366149902, - "rewards/rejected": -10.946269035339355, - "step": 8220 + "learning_rate": 1.7048493492601176e-07, + "logits/chosen": -2.596727132797241, + "logits/rejected": -2.5123703479766846, + "logps/chosen": -272.24139404296875, + "logps/rejected": -296.4933166503906, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7782129049301147, + "rewards/margins": 10.783132553100586, + "rewards/rejected": -11.561345100402832, + "step": 8640 }, { "epoch": 2.08, - "learning_rate": 1.7030240614174702e-07, - "logits/chosen": -2.341704845428467, - "logits/rejected": -2.1591591835021973, - "logps/chosen": -207.3115234375, - "logps/rejected": -285.88458251953125, - "loss": 0.0171, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.1234140396118164, - "rewards/margins": 11.760368347167969, - "rewards/rejected": -10.636955261230469, - "step": 8230 + "learning_rate": 1.700392226778392e-07, + "logits/chosen": -2.492323398590088, + "logits/rejected": -2.597478151321411, + "logps/chosen": -234.81118774414062, + "logps/rejected": -339.72198486328125, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5566939115524292, + "rewards/margins": 13.372156143188477, + "rewards/rejected": -13.928850173950195, + "step": 8650 }, { "epoch": 2.08, - "learning_rate": 1.6983428517929034e-07, - "logits/chosen": -2.479773998260498, - "logits/rejected": -2.2539517879486084, - "logps/chosen": -366.88201904296875, - "logps/rejected": -370.52618408203125, - "loss": 0.0108, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.2932839393615723, - "rewards/margins": 7.529888153076172, - "rewards/rejected": -9.823171615600586, - "step": 8240 + "learning_rate": 1.695935104296666e-07, + "logits/chosen": -2.735934257507324, + "logits/rejected": -2.596465587615967, + "logps/chosen": -364.35662841796875, + "logps/rejected": -406.90338134765625, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23018774390220642, + "rewards/margins": 10.407444953918457, + "rewards/rejected": -10.637632369995117, + "step": 8660 }, { "epoch": 2.09, - "learning_rate": 1.693661642168336e-07, - "logits/chosen": -2.645338296890259, - "logits/rejected": -2.3822028636932373, - "logps/chosen": -244.0205535888672, - "logps/rejected": -349.7413024902344, - "loss": 0.0196, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.49810829758644104, - "rewards/margins": 10.42314338684082, - "rewards/rejected": -9.92503547668457, - "step": 8250 + "learning_rate": 1.6914779818149402e-07, + "logits/chosen": -2.5345845222473145, + "logits/rejected": -2.490156412124634, + "logps/chosen": -358.6170654296875, + "logps/rejected": -452.56048583984375, + "loss": 0.0375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6384512782096863, + "rewards/margins": 12.556425094604492, + "rewards/rejected": -11.917972564697266, + "step": 8670 }, { "epoch": 2.09, - "learning_rate": 1.6889804325437692e-07, - "logits/chosen": -2.571413040161133, - "logits/rejected": -2.4334158897399902, - "logps/chosen": -335.8421325683594, - "logps/rejected": -308.39263916015625, - "loss": 0.0186, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.9779975414276123, - "rewards/margins": 8.579587936401367, - "rewards/rejected": -11.557584762573242, - "step": 8260 + "learning_rate": 1.6870208593332145e-07, + "logits/chosen": -2.776972770690918, + "logits/rejected": -2.65968918800354, + "logps/chosen": -361.05218505859375, + "logps/rejected": -383.57781982421875, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1055375337600708, + "rewards/margins": 10.641298294067383, + "rewards/rejected": -11.746835708618164, + "step": 8680 }, { "epoch": 2.09, - "learning_rate": 1.684299222919202e-07, - "logits/chosen": -2.5503220558166504, - "logits/rejected": -2.539371967315674, - "logps/chosen": -194.6260986328125, - "logps/rejected": -334.47161865234375, - "loss": 0.0398, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.26310062408447266, - "rewards/margins": 11.362150192260742, - "rewards/rejected": -11.099050521850586, - "step": 8270 + "learning_rate": 1.6825637368514886e-07, + "logits/chosen": -2.5694427490234375, + "logits/rejected": -2.3911197185516357, + "logps/chosen": -326.6131896972656, + "logps/rejected": -282.97808837890625, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07688417285680771, + "rewards/margins": 11.150547981262207, + "rewards/rejected": -11.073664665222168, + "step": 8690 }, { "epoch": 2.09, - "learning_rate": 1.6796180132946352e-07, - "logits/chosen": -2.3966946601867676, - "logits/rejected": -2.3247835636138916, - "logps/chosen": -344.1413269042969, - "logps/rejected": -380.3586730957031, - "loss": 0.062, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.485003799200058, - "rewards/margins": 8.122628211975098, - "rewards/rejected": -8.60763168334961, - "step": 8280 + "learning_rate": 1.6781066143697628e-07, + "logits/chosen": -2.3880043029785156, + "logits/rejected": -2.459749221801758, + "logps/chosen": -218.9061737060547, + "logps/rejected": -354.92498779296875, + "loss": 0.0606, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.6321842670440674, + "rewards/margins": 11.147878646850586, + "rewards/rejected": -13.780062675476074, + "step": 8700 }, { - "epoch": 2.1, - "learning_rate": 1.6749368036700684e-07, - "logits/chosen": -2.525015354156494, - "logits/rejected": -2.412067413330078, - "logps/chosen": -264.25482177734375, - "logps/rejected": -356.77655029296875, - "loss": 0.0736, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.21086089313030243, - "rewards/margins": 9.446159362792969, - "rewards/rejected": -9.657018661499023, - "step": 8290 + "epoch": 2.09, + "eval_logits/chosen": -2.336517572402954, + "eval_logits/rejected": -2.2736802101135254, + "eval_logps/chosen": -267.15118408203125, + "eval_logps/rejected": -297.2460021972656, + "eval_loss": 0.6114829182624817, + "eval_rewards/accuracies": 0.675000011920929, + "eval_rewards/chosen": -7.1190185546875, + "eval_rewards/margins": 4.166274070739746, + "eval_rewards/rejected": -11.285292625427246, + "eval_runtime": 132.1012, + "eval_samples_per_second": 23.891, + "eval_steps_per_second": 0.378, + "step": 8700 }, { "epoch": 2.1, - "learning_rate": 1.6702555940455013e-07, - "logits/chosen": -2.270688056945801, - "logits/rejected": -2.3070790767669678, - "logps/chosen": -185.3845977783203, - "logps/rejected": -337.3004455566406, - "loss": 0.0214, + "learning_rate": 1.673649491888037e-07, + "logits/chosen": -2.669351100921631, + "logits/rejected": -2.5618691444396973, + "logps/chosen": -388.2505187988281, + "logps/rejected": -360.0283203125, + "loss": 0.0278, "rewards/accuracies": 1.0, - "rewards/chosen": -1.5996869802474976, - "rewards/margins": 10.957901000976562, - "rewards/rejected": -12.557588577270508, - "step": 8300 + "rewards/chosen": -0.9836538434028625, + "rewards/margins": 11.006999969482422, + "rewards/rejected": -11.990653038024902, + "step": 8710 }, { "epoch": 2.1, - "learning_rate": 1.6655743844209344e-07, - "logits/chosen": -2.582287073135376, - "logits/rejected": -2.5308005809783936, - "logps/chosen": -279.72979736328125, - "logps/rejected": -383.0921325683594, - "loss": 0.0192, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.091134786605835, - "rewards/margins": 9.544285774230957, - "rewards/rejected": -10.635419845581055, - "step": 8310 + "learning_rate": 1.6691923694063112e-07, + "logits/chosen": -2.434178352355957, + "logits/rejected": -2.240828037261963, + "logps/chosen": -231.48251342773438, + "logps/rejected": -322.2445373535156, + "loss": 0.0205, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0507824420928955, + "rewards/margins": 11.852723121643066, + "rewards/rejected": -12.903505325317383, + "step": 8720 }, { "epoch": 2.1, - "learning_rate": 1.6608931747963673e-07, - "logits/chosen": -2.496814727783203, - "logits/rejected": -2.4529342651367188, - "logps/chosen": -347.9647521972656, - "logps/rejected": -375.94781494140625, - "loss": 0.0143, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.0510023832321167, - "rewards/margins": 10.772096633911133, - "rewards/rejected": -10.721094131469727, - "step": 8320 + "learning_rate": 1.6647352469245855e-07, + "logits/chosen": -2.558774471282959, + "logits/rejected": -2.43123197555542, + "logps/chosen": -298.77374267578125, + "logps/rejected": -326.5734558105469, + "loss": 0.0354, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6990379691123962, + "rewards/margins": 9.126887321472168, + "rewards/rejected": -9.82592487335205, + "step": 8730 + }, + { + "epoch": 2.1, + "learning_rate": 1.6602781244428595e-07, + "logits/chosen": -2.5324432849884033, + "logits/rejected": -2.453352451324463, + "logps/chosen": -232.05087280273438, + "logps/rejected": -332.95831298828125, + "loss": 0.0427, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3409179151058197, + "rewards/margins": 10.205674171447754, + "rewards/rejected": -10.546591758728027, + "step": 8740 }, { "epoch": 2.11, - "learning_rate": 1.6562119651718005e-07, - "logits/chosen": -2.5629899501800537, - "logits/rejected": -2.434959888458252, - "logps/chosen": -246.48849487304688, - "logps/rejected": -348.2873229980469, - "loss": 0.0134, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9217214584350586, - "rewards/margins": 10.495768547058105, - "rewards/rejected": -11.417490005493164, - "step": 8330 + "learning_rate": 1.6558210019611338e-07, + "logits/chosen": -2.6918833255767822, + "logits/rejected": -2.640571117401123, + "logps/chosen": -343.3521423339844, + "logps/rejected": -389.19488525390625, + "loss": 0.018, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0499929189682007, + "rewards/margins": 12.304195404052734, + "rewards/rejected": -13.354188919067383, + "step": 8750 }, { "epoch": 2.11, - "learning_rate": 1.6515307555472334e-07, - "logits/chosen": -2.3305695056915283, - "logits/rejected": -2.317716598510742, - "logps/chosen": -251.8772430419922, - "logps/rejected": -313.6505432128906, - "loss": 0.0165, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3803676664829254, - "rewards/margins": 9.625941276550293, - "rewards/rejected": -10.006309509277344, - "step": 8340 + "learning_rate": 1.651363879479408e-07, + "logits/chosen": -2.46795654296875, + "logits/rejected": -2.324697494506836, + "logps/chosen": -236.7128448486328, + "logps/rejected": -284.4989929199219, + "loss": 0.057, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.217156171798706, + "rewards/margins": 8.084733963012695, + "rewards/rejected": -11.301889419555664, + "step": 8760 }, { "epoch": 2.11, - "learning_rate": 1.6468495459226663e-07, - "logits/chosen": -2.4659130573272705, - "logits/rejected": -2.4751675128936768, - "logps/chosen": -156.78298950195312, - "logps/rejected": -280.6018981933594, - "loss": 0.0165, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.27651506662368774, - "rewards/margins": 10.506505012512207, - "rewards/rejected": -10.78302001953125, - "step": 8350 + "learning_rate": 1.646906756997682e-07, + "logits/chosen": -2.5654988288879395, + "logits/rejected": -2.4846606254577637, + "logps/chosen": -395.7351989746094, + "logps/rejected": -362.2881774902344, + "loss": 0.0255, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.390353262424469, + "rewards/margins": 9.132757186889648, + "rewards/rejected": -9.523110389709473, + "step": 8770 }, { "epoch": 2.11, - "learning_rate": 1.6421683362980992e-07, - "logits/chosen": -2.323483943939209, - "logits/rejected": -2.4297595024108887, - "logps/chosen": -195.44876098632812, - "logps/rejected": -443.52691650390625, - "loss": 0.0107, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.0662974119186401, - "rewards/margins": 14.12694263458252, - "rewards/rejected": -15.19324016571045, - "step": 8360 + "learning_rate": 1.6424496345159564e-07, + "logits/chosen": -2.4784624576568604, + "logits/rejected": -2.374669075012207, + "logps/chosen": -316.1205139160156, + "logps/rejected": -406.8511657714844, + "loss": 0.0577, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.217925071716309, + "rewards/margins": 12.226975440979004, + "rewards/rejected": -16.444900512695312, + "step": 8780 }, { "epoch": 2.12, - "learning_rate": 1.6374871266735323e-07, - "logits/chosen": -2.4566922187805176, - "logits/rejected": -2.3604483604431152, - "logps/chosen": -276.23419189453125, - "logps/rejected": -290.4494323730469, - "loss": 0.0108, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.7056076526641846, - "rewards/margins": 10.065206527709961, - "rewards/rejected": -11.77081298828125, - "step": 8370 + "learning_rate": 1.6379925120342304e-07, + "logits/chosen": -2.458488941192627, + "logits/rejected": -2.3209900856018066, + "logps/chosen": -201.60546875, + "logps/rejected": -352.35003662109375, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.187859058380127, + "rewards/margins": 11.035813331604004, + "rewards/rejected": -14.223672866821289, + "step": 8790 }, { "epoch": 2.12, - "learning_rate": 1.6328059170489652e-07, - "logits/chosen": -2.63828706741333, - "logits/rejected": -2.462395429611206, - "logps/chosen": -289.7229919433594, - "logps/rejected": -303.6163330078125, - "loss": 0.0217, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.912933349609375, - "rewards/margins": 7.971318244934082, - "rewards/rejected": -9.88425064086914, - "step": 8380 + "learning_rate": 1.6335353895525047e-07, + "logits/chosen": -2.529459238052368, + "logits/rejected": -2.373972177505493, + "logps/chosen": -267.9228820800781, + "logps/rejected": -420.4358825683594, + "loss": 0.0402, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27717798948287964, + "rewards/margins": 13.667352676391602, + "rewards/rejected": -13.390174865722656, + "step": 8800 + }, + { + "epoch": 2.12, + "eval_logits/chosen": -2.2653732299804688, + "eval_logits/rejected": -2.200467586517334, + "eval_logps/chosen": -266.4918518066406, + "eval_logps/rejected": -295.7088623046875, + "eval_loss": 0.6163830757141113, + "eval_rewards/accuracies": 0.6600000262260437, + "eval_rewards/chosen": -7.053084850311279, + "eval_rewards/margins": 4.078494548797607, + "eval_rewards/rejected": -11.131579399108887, + "eval_runtime": 132.1493, + "eval_samples_per_second": 23.882, + "eval_steps_per_second": 0.378, + "step": 8800 }, { "epoch": 2.12, - "learning_rate": 1.6281247074243984e-07, - "logits/chosen": -2.365940809249878, - "logits/rejected": -2.3253707885742188, - "logps/chosen": -275.8111877441406, - "logps/rejected": -374.54693603515625, + "learning_rate": 1.629078267070779e-07, + "logits/chosen": -2.5365514755249023, + "logits/rejected": -2.351696491241455, + "logps/chosen": -260.916259765625, + "logps/rejected": -308.0904541015625, "loss": 0.0226, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.3288636207580566, - "rewards/margins": 12.161454200744629, - "rewards/rejected": -13.490318298339844, - "step": 8390 + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.5732786655426025, + "rewards/margins": 9.407404899597168, + "rewards/rejected": -11.980682373046875, + "step": 8810 }, { "epoch": 2.12, - "learning_rate": 1.6234434977998315e-07, - "logits/chosen": -2.3862948417663574, - "logits/rejected": -2.3025941848754883, - "logps/chosen": -292.1995849609375, - "logps/rejected": -398.25225830078125, - "loss": 0.0518, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.936740517616272, - "rewards/margins": 10.562994003295898, - "rewards/rejected": -12.499734878540039, - "step": 8400 + "learning_rate": 1.624621144589053e-07, + "logits/chosen": -2.563375949859619, + "logits/rejected": -2.533550500869751, + "logps/chosen": -237.24057006835938, + "logps/rejected": -376.21295166015625, + "loss": 0.0573, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7408981323242188, + "rewards/margins": 10.642239570617676, + "rewards/rejected": -12.383138656616211, + "step": 8820 }, { "epoch": 2.13, - "learning_rate": 1.6187622881752644e-07, - "logits/chosen": -2.454590320587158, - "logits/rejected": -2.471228837966919, - "logps/chosen": -235.47763061523438, - "logps/rejected": -368.3105163574219, - "loss": 0.0464, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.234427452087402, - "rewards/margins": 9.330568313598633, - "rewards/rejected": -13.564994812011719, - "step": 8410 + "learning_rate": 1.6201640221073273e-07, + "logits/chosen": -2.634030342102051, + "logits/rejected": -2.412334680557251, + "logps/chosen": -290.4249572753906, + "logps/rejected": -358.2115173339844, + "loss": 0.0422, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6466649770736694, + "rewards/margins": 11.438368797302246, + "rewards/rejected": -12.08503532409668, + "step": 8830 }, { "epoch": 2.13, - "learning_rate": 1.6140810785506976e-07, - "logits/chosen": -2.5940306186676025, - "logits/rejected": -2.3940608501434326, - "logps/chosen": -293.6297607421875, - "logps/rejected": -333.98248291015625, - "loss": 0.0236, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.9440231323242188, - "rewards/margins": 10.160451889038086, - "rewards/rejected": -12.104475975036621, - "step": 8420 + "learning_rate": 1.6157068996256016e-07, + "logits/chosen": -2.4653375148773193, + "logits/rejected": -2.4410319328308105, + "logps/chosen": -251.10128784179688, + "logps/rejected": -313.43658447265625, + "loss": 0.0372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.60115647315979, + "rewards/margins": 9.766569137573242, + "rewards/rejected": -10.36772632598877, + "step": 8840 }, { "epoch": 2.13, - "learning_rate": 1.6093998689261305e-07, - "logits/chosen": -2.4853529930114746, - "logits/rejected": -2.4013895988464355, - "logps/chosen": -269.6568298339844, - "logps/rejected": -433.7679748535156, - "loss": 0.019, + "learning_rate": 1.6112497771438757e-07, + "logits/chosen": -2.580268383026123, + "logits/rejected": -2.465444564819336, + "logps/chosen": -255.9634246826172, + "logps/rejected": -336.2716369628906, + "loss": 0.0358, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.2059710025787354, - "rewards/margins": 12.60078239440918, - "rewards/rejected": -13.806753158569336, - "step": 8430 + "rewards/chosen": -2.005524158477783, + "rewards/margins": 9.394083023071289, + "rewards/rejected": -11.399606704711914, + "step": 8850 }, { "epoch": 2.13, - "learning_rate": 1.6047186593015636e-07, - "logits/chosen": -2.5733630657196045, - "logits/rejected": -2.41157865524292, - "logps/chosen": -260.69903564453125, - "logps/rejected": -291.9221496582031, - "loss": 0.0136, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.5500335693359375, - "rewards/margins": 8.757722854614258, - "rewards/rejected": -11.307757377624512, - "step": 8440 + "learning_rate": 1.60679265466215e-07, + "logits/chosen": -2.490568161010742, + "logits/rejected": -2.4568448066711426, + "logps/chosen": -376.48785400390625, + "logps/rejected": -401.3404846191406, + "loss": 0.0401, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.3607647120952606, + "rewards/margins": 14.579371452331543, + "rewards/rejected": -14.940136909484863, + "step": 8860 + }, + { + "epoch": 2.13, + "learning_rate": 1.602335532180424e-07, + "logits/chosen": -2.511733055114746, + "logits/rejected": -2.4589807987213135, + "logps/chosen": -299.12969970703125, + "logps/rejected": -389.6202087402344, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6319599747657776, + "rewards/margins": 11.733071327209473, + "rewards/rejected": -12.365031242370605, + "step": 8870 }, { "epoch": 2.14, - "learning_rate": 1.6000374496769963e-07, - "logits/chosen": -2.5668065547943115, - "logits/rejected": -2.5014026165008545, - "logps/chosen": -255.66799926757812, - "logps/rejected": -373.7508239746094, - "loss": 0.1027, + "learning_rate": 1.5978784096986985e-07, + "logits/chosen": -2.3009090423583984, + "logits/rejected": -2.2098991870880127, + "logps/chosen": -304.6602783203125, + "logps/rejected": -435.3086853027344, + "loss": 0.0396, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.9714128375053406, + "rewards/margins": 15.813325881958008, + "rewards/rejected": -16.784738540649414, + "step": 8880 + }, + { + "epoch": 2.14, + "learning_rate": 1.5934212872169728e-07, + "logits/chosen": -2.3099558353424072, + "logits/rejected": -2.354621171951294, + "logps/chosen": -293.64208984375, + "logps/rejected": -374.3144226074219, + "loss": 0.0271, "rewards/accuracies": 1.0, - "rewards/chosen": -0.7974373698234558, - "rewards/margins": 11.770620346069336, - "rewards/rejected": -12.568058013916016, - "step": 8450 + "rewards/chosen": 0.12089452892541885, + "rewards/margins": 12.865636825561523, + "rewards/rejected": -12.744743347167969, + "step": 8890 }, { "epoch": 2.14, - "learning_rate": 1.5953562400524294e-07, - "logits/chosen": -2.6310415267944336, - "logits/rejected": -2.5319175720214844, - "logps/chosen": -262.56201171875, - "logps/rejected": -301.62335205078125, - "loss": 0.0186, + "learning_rate": 1.5889641647352469e-07, + "logits/chosen": -2.5336740016937256, + "logits/rejected": -2.5077195167541504, + "logps/chosen": -413.22515869140625, + "logps/rejected": -551.4707641601562, + "loss": 0.0263, "rewards/accuracies": 1.0, - "rewards/chosen": -1.2064162492752075, - "rewards/margins": 10.903173446655273, - "rewards/rejected": -12.109588623046875, - "step": 8460 + "rewards/chosen": -1.3416969776153564, + "rewards/margins": 12.972776412963867, + "rewards/rejected": -14.314474105834961, + "step": 8900 }, { "epoch": 2.14, - "learning_rate": 1.5906750304278623e-07, - "logits/chosen": -2.5703375339508057, - "logits/rejected": -2.5990467071533203, - "logps/chosen": -272.7796630859375, - "logps/rejected": -399.44158935546875, - "loss": 0.0192, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.6385996341705322, - "rewards/margins": 10.162915229797363, - "rewards/rejected": -12.801515579223633, - "step": 8470 + "eval_logits/chosen": -2.1660590171813965, + "eval_logits/rejected": -2.0957818031311035, + "eval_logps/chosen": -277.569580078125, + "eval_logps/rejected": -308.1033630371094, + "eval_loss": 0.6209201812744141, + "eval_rewards/accuracies": 0.6650000214576721, + "eval_rewards/chosen": -8.160855293273926, + "eval_rewards/margins": 4.2101731300354, + "eval_rewards/rejected": -12.371027946472168, + "eval_runtime": 132.3034, + "eval_samples_per_second": 23.854, + "eval_steps_per_second": 0.378, + "step": 8900 }, { "epoch": 2.14, - "learning_rate": 1.5859938208032955e-07, - "logits/chosen": -2.532130718231201, - "logits/rejected": -2.4024813175201416, - "logps/chosen": -396.67840576171875, - "logps/rejected": -454.4134216308594, - "loss": 0.0464, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.2578914165496826, - "rewards/margins": 10.379530906677246, - "rewards/rejected": -11.637422561645508, - "step": 8480 + "learning_rate": 1.5845070422535212e-07, + "logits/chosen": -2.3411030769348145, + "logits/rejected": -2.215272903442383, + "logps/chosen": -315.43035888671875, + "logps/rejected": -427.6231994628906, + "loss": 0.0426, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7919819355010986, + "rewards/margins": 12.08747673034668, + "rewards/rejected": -13.879457473754883, + "step": 8910 }, { "epoch": 2.15, - "learning_rate": 1.5813126111787284e-07, - "logits/chosen": -2.4982962608337402, - "logits/rejected": -2.474616289138794, - "logps/chosen": -397.2320251464844, - "logps/rejected": -552.1438598632812, - "loss": 0.0216, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.637967348098755, - "rewards/margins": 12.508705139160156, - "rewards/rejected": -15.146673202514648, - "step": 8490 + "learning_rate": 1.5800499197717954e-07, + "logits/chosen": -2.507469654083252, + "logits/rejected": -2.4018709659576416, + "logps/chosen": -353.97100830078125, + "logps/rejected": -532.8887939453125, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4683825969696045, + "rewards/margins": 15.863914489746094, + "rewards/rejected": -17.33229637145996, + "step": 8920 }, { "epoch": 2.15, - "learning_rate": 1.5766314015541615e-07, - "logits/chosen": -2.7057583332061768, - "logits/rejected": -2.437467575073242, - "logps/chosen": -283.351318359375, - "logps/rejected": -287.83544921875, - "loss": 0.0109, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8018553853034973, - "rewards/margins": 9.905305862426758, - "rewards/rejected": -10.707162857055664, - "step": 8500 + "learning_rate": 1.5755927972900695e-07, + "logits/chosen": -2.2565438747406006, + "logits/rejected": -2.2103662490844727, + "logps/chosen": -279.3439025878906, + "logps/rejected": -354.6036682128906, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3021779954433441, + "rewards/margins": 13.721453666687012, + "rewards/rejected": -14.023633003234863, + "step": 8930 }, { "epoch": 2.15, - "learning_rate": 1.5719501919295947e-07, - "logits/chosen": -2.7296359539031982, - "logits/rejected": -2.6196982860565186, - "logps/chosen": -330.23516845703125, - "logps/rejected": -404.32794189453125, - "loss": 0.0194, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6082562804222107, - "rewards/margins": 10.484846115112305, - "rewards/rejected": -11.093101501464844, - "step": 8510 + "learning_rate": 1.5711356748083438e-07, + "logits/chosen": -2.570256471633911, + "logits/rejected": -2.467932939529419, + "logps/chosen": -274.5641174316406, + "logps/rejected": -351.32940673828125, + "loss": 0.0325, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.906691312789917, + "rewards/margins": 9.975526809692383, + "rewards/rejected": -12.882217407226562, + "step": 8940 }, { "epoch": 2.15, - "learning_rate": 1.5672689823050276e-07, - "logits/chosen": -2.2502963542938232, - "logits/rejected": -2.183072566986084, - "logps/chosen": -266.4691467285156, - "logps/rejected": -300.840087890625, - "loss": 0.0168, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8021724820137024, - "rewards/margins": 10.726180076599121, - "rewards/rejected": -11.528352737426758, - "step": 8520 + "learning_rate": 1.566678552326618e-07, + "logits/chosen": -2.447014331817627, + "logits/rejected": -2.3372673988342285, + "logps/chosen": -269.1405029296875, + "logps/rejected": -271.47161865234375, + "loss": 0.0431, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5176638960838318, + "rewards/margins": 11.4063138961792, + "rewards/rejected": -10.888651847839355, + "step": 8950 }, { "epoch": 2.16, - "learning_rate": 1.5625877726804607e-07, - "logits/chosen": -2.5576233863830566, - "logits/rejected": -2.457980155944824, - "logps/chosen": -325.43280029296875, - "logps/rejected": -400.6141662597656, - "loss": 0.0794, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.25530198216438293, - "rewards/margins": 12.95799446105957, - "rewards/rejected": -12.702692031860352, - "step": 8530 + "learning_rate": 1.562221429844892e-07, + "logits/chosen": -2.4485888481140137, + "logits/rejected": -2.06019926071167, + "logps/chosen": -270.01446533203125, + "logps/rejected": -287.9712829589844, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0585132837295532, + "rewards/margins": 12.9647798538208, + "rewards/rejected": -14.023290634155273, + "step": 8960 }, { "epoch": 2.16, - "learning_rate": 1.5579065630558936e-07, - "logits/chosen": -2.40440034866333, - "logits/rejected": -2.240015745162964, - "logps/chosen": -312.38092041015625, - "logps/rejected": -383.36187744140625, - "loss": 0.0242, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.0788674354553223, - "rewards/margins": 11.552621841430664, - "rewards/rejected": -13.631490707397461, - "step": 8540 + "learning_rate": 1.5577643073631664e-07, + "logits/chosen": -2.3456592559814453, + "logits/rejected": -2.3666493892669678, + "logps/chosen": -219.8240203857422, + "logps/rejected": -346.49163818359375, + "loss": 0.0334, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9383699893951416, + "rewards/margins": 11.286222457885742, + "rewards/rejected": -13.224591255187988, + "step": 8970 }, { "epoch": 2.16, - "learning_rate": 1.5532253534313268e-07, - "logits/chosen": -2.654876232147217, - "logits/rejected": -2.5984530448913574, - "logps/chosen": -276.11053466796875, - "logps/rejected": -370.93743896484375, - "loss": 0.028, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.086482048034668, - "rewards/margins": 10.666714668273926, - "rewards/rejected": -11.753195762634277, - "step": 8550 + "learning_rate": 1.5533071848814404e-07, + "logits/chosen": -2.4346134662628174, + "logits/rejected": -2.3483047485351562, + "logps/chosen": -269.57244873046875, + "logps/rejected": -302.46771240234375, + "loss": 0.0331, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.0364601612091064, + "rewards/margins": 9.884992599487305, + "rewards/rejected": -12.921453475952148, + "step": 8980 }, { "epoch": 2.16, - "learning_rate": 1.5485441438067594e-07, - "logits/chosen": -2.6042940616607666, - "logits/rejected": -2.5303900241851807, - "logps/chosen": -276.2106018066406, - "logps/rejected": -304.36370849609375, - "loss": 0.0221, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.0825324058532715, - "rewards/margins": 9.108705520629883, - "rewards/rejected": -11.191239356994629, - "step": 8560 + "learning_rate": 1.5488500623997147e-07, + "logits/chosen": -2.2744264602661133, + "logits/rejected": -2.3571650981903076, + "logps/chosen": -252.5452117919922, + "logps/rejected": -406.44012451171875, + "loss": 0.0211, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.238948345184326, + "rewards/margins": 10.377193450927734, + "rewards/rejected": -13.616144180297852, + "step": 8990 }, { "epoch": 2.17, - "learning_rate": 1.5438629341821926e-07, - "logits/chosen": -2.4339449405670166, - "logits/rejected": -2.4021949768066406, - "logps/chosen": -234.2420654296875, - "logps/rejected": -386.6480407714844, - "loss": 0.0225, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.374173641204834, - "rewards/margins": 10.155722618103027, - "rewards/rejected": -12.529895782470703, - "step": 8570 + "learning_rate": 1.544392939917989e-07, + "logits/chosen": -2.356158971786499, + "logits/rejected": -2.2244272232055664, + "logps/chosen": -198.62962341308594, + "logps/rejected": -360.06512451171875, + "loss": 0.0242, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9871301651000977, + "rewards/margins": 11.81344223022461, + "rewards/rejected": -13.800573348999023, + "step": 9000 }, { "epoch": 2.17, - "learning_rate": 1.5391817245576255e-07, - "logits/chosen": -2.5122060775756836, - "logits/rejected": -2.4285624027252197, - "logps/chosen": -229.5922088623047, - "logps/rejected": -280.2808837890625, - "loss": 0.015, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.24378149211406708, - "rewards/margins": 10.249922752380371, - "rewards/rejected": -10.493703842163086, - "step": 8580 + "eval_logits/chosen": -2.230372428894043, + "eval_logits/rejected": -2.1650547981262207, + "eval_logps/chosen": -263.16217041015625, + "eval_logps/rejected": -292.0106201171875, + "eval_loss": 0.6042197346687317, + "eval_rewards/accuracies": 0.6725000143051147, + "eval_rewards/chosen": -6.720116138458252, + "eval_rewards/margins": 4.041637420654297, + "eval_rewards/rejected": -10.76175308227539, + "eval_runtime": 132.1589, + "eval_samples_per_second": 23.88, + "eval_steps_per_second": 0.378, + "step": 9000 }, { "epoch": 2.17, - "learning_rate": 1.5345005149330586e-07, - "logits/chosen": -2.523165464401245, - "logits/rejected": -2.4486472606658936, - "logps/chosen": -266.5024719238281, - "logps/rejected": -344.3494873046875, - "loss": 0.0299, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.16855347156524658, - "rewards/margins": 12.045059204101562, - "rewards/rejected": -12.213613510131836, - "step": 8590 + "learning_rate": 1.539935817436263e-07, + "logits/chosen": -2.4826877117156982, + "logits/rejected": -2.3789048194885254, + "logps/chosen": -252.3740692138672, + "logps/rejected": -349.7666931152344, + "loss": 0.0576, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8747469782829285, + "rewards/margins": 10.163451194763184, + "rewards/rejected": -11.038199424743652, + "step": 9010 }, { "epoch": 2.17, - "learning_rate": 1.5298193053084915e-07, - "logits/chosen": -2.5514864921569824, - "logits/rejected": -2.444061040878296, - "logps/chosen": -266.6001892089844, - "logps/rejected": -331.06170654296875, - "loss": 0.0176, + "learning_rate": 1.5354786949545373e-07, + "logits/chosen": -2.2955923080444336, + "logits/rejected": -2.2653818130493164, + "logps/chosen": -266.5935974121094, + "logps/rejected": -338.3429870605469, + "loss": 0.0341, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.7335854768753052, - "rewards/margins": 9.204207420349121, - "rewards/rejected": -9.937792778015137, - "step": 8600 + "rewards/chosen": -1.175144076347351, + "rewards/margins": 8.819084167480469, + "rewards/rejected": -9.99422836303711, + "step": 9020 }, { - "epoch": 2.18, - "learning_rate": 1.5251380956839247e-07, - "logits/chosen": -2.603799343109131, - "logits/rejected": -2.4442601203918457, - "logps/chosen": -330.43023681640625, - "logps/rejected": -305.4593811035156, - "loss": 0.0073, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.46179574728012085, - "rewards/margins": 11.207643508911133, - "rewards/rejected": -10.745849609375, - "step": 8610 + "epoch": 2.17, + "learning_rate": 1.5310215724728116e-07, + "logits/chosen": -2.5113468170166016, + "logits/rejected": -2.429802417755127, + "logps/chosen": -344.1707763671875, + "logps/rejected": -367.857421875, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.27444326877594, + "rewards/margins": 10.788863182067871, + "rewards/rejected": -12.06330680847168, + "step": 9030 }, { "epoch": 2.18, - "learning_rate": 1.5204568860593578e-07, - "logits/chosen": -2.5968942642211914, - "logits/rejected": -2.5343058109283447, - "logps/chosen": -282.2979736328125, - "logps/rejected": -334.32244873046875, - "loss": 0.0206, + "learning_rate": 1.5265644499910856e-07, + "logits/chosen": -2.405306816101074, + "logits/rejected": -2.200188636779785, + "logps/chosen": -227.342529296875, + "logps/rejected": -382.5558776855469, + "loss": 0.0338, "rewards/accuracies": 1.0, - "rewards/chosen": -1.5933091640472412, - "rewards/margins": 10.09453010559082, - "rewards/rejected": -11.687838554382324, - "step": 8620 + "rewards/chosen": 0.1724577397108078, + "rewards/margins": 12.219175338745117, + "rewards/rejected": -12.046717643737793, + "step": 9040 }, { "epoch": 2.18, - "learning_rate": 1.5157756764347907e-07, - "logits/chosen": -2.6372528076171875, - "logits/rejected": -2.5136094093322754, - "logps/chosen": -332.3785705566406, - "logps/rejected": -405.2020263671875, - "loss": 0.0186, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.054649591445923, - "rewards/margins": 10.335689544677734, - "rewards/rejected": -12.390339851379395, - "step": 8630 + "learning_rate": 1.52210732750936e-07, + "logits/chosen": -2.378784656524658, + "logits/rejected": -2.3644559383392334, + "logps/chosen": -185.2550506591797, + "logps/rejected": -382.40960693359375, + "loss": 0.0257, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.774221658706665, + "rewards/margins": 11.348250389099121, + "rewards/rejected": -13.122471809387207, + "step": 9050 }, { "epoch": 2.18, - "learning_rate": 1.511094466810224e-07, - "logits/chosen": -2.5080435276031494, - "logits/rejected": -2.451399326324463, - "logps/chosen": -210.416259765625, - "logps/rejected": -338.97479248046875, - "loss": 0.0698, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.2667189836502075, - "rewards/margins": 10.377790451049805, - "rewards/rejected": -11.64450740814209, - "step": 8640 + "learning_rate": 1.517650205027634e-07, + "logits/chosen": -2.344386339187622, + "logits/rejected": -2.2430851459503174, + "logps/chosen": -301.843994140625, + "logps/rejected": -454.56231689453125, + "loss": 0.0192, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.6924386024475098, + "rewards/margins": 13.1585054397583, + "rewards/rejected": -15.85094165802002, + "step": 9060 + }, + { + "epoch": 2.18, + "learning_rate": 1.5131930825459083e-07, + "logits/chosen": -2.1696395874023438, + "logits/rejected": -2.2282121181488037, + "logps/chosen": -258.8611145019531, + "logps/rejected": -320.43634033203125, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.088587999343872, + "rewards/margins": 10.888508796691895, + "rewards/rejected": -11.977095603942871, + "step": 9070 }, { "epoch": 2.19, - "learning_rate": 1.5064132571856568e-07, - "logits/chosen": -2.4267754554748535, - "logits/rejected": -2.339641809463501, - "logps/chosen": -277.78851318359375, - "logps/rejected": -439.1700744628906, - "loss": 0.0304, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.432140827178955, - "rewards/margins": 11.009929656982422, - "rewards/rejected": -13.442071914672852, - "step": 8650 + "learning_rate": 1.5087359600641826e-07, + "logits/chosen": -2.3222198486328125, + "logits/rejected": -2.387406587600708, + "logps/chosen": -320.2882385253906, + "logps/rejected": -483.84588623046875, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6956138610839844, + "rewards/margins": 13.782968521118164, + "rewards/rejected": -14.478582382202148, + "step": 9080 }, { "epoch": 2.19, - "learning_rate": 1.5017320475610897e-07, - "logits/chosen": -2.4750618934631348, - "logits/rejected": -2.5541577339172363, - "logps/chosen": -273.45489501953125, - "logps/rejected": -420.8013610839844, - "loss": 0.0229, + "learning_rate": 1.5042788375824566e-07, + "logits/chosen": -2.5829436779022217, + "logits/rejected": -2.234083652496338, + "logps/chosen": -254.4413604736328, + "logps/rejected": -325.9149169921875, + "loss": 0.0195, "rewards/accuracies": 1.0, - "rewards/chosen": -0.6541218757629395, - "rewards/margins": 12.435382843017578, - "rewards/rejected": -13.089506149291992, - "step": 8660 + "rewards/chosen": -0.7922242879867554, + "rewards/margins": 11.126602172851562, + "rewards/rejected": -11.91882610321045, + "step": 9090 }, { "epoch": 2.19, - "learning_rate": 1.4970508379365226e-07, - "logits/chosen": -2.5706381797790527, - "logits/rejected": -2.381520986557007, - "logps/chosen": -356.1487731933594, - "logps/rejected": -336.7781982421875, - "loss": 0.007, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6374567747116089, - "rewards/margins": 11.748361587524414, - "rewards/rejected": -12.385817527770996, - "step": 8670 + "learning_rate": 1.499821715100731e-07, + "logits/chosen": -2.437284469604492, + "logits/rejected": -2.3241400718688965, + "logps/chosen": -321.37286376953125, + "logps/rejected": -365.09295654296875, + "loss": 0.0383, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6497052907943726, + "rewards/margins": 9.808609962463379, + "rewards/rejected": -11.4583158493042, + "step": 9100 }, { "epoch": 2.19, - "learning_rate": 1.4923696283119557e-07, - "logits/chosen": -2.29630446434021, - "logits/rejected": -2.1913228034973145, - "logps/chosen": -283.35162353515625, - "logps/rejected": -354.83184814453125, - "loss": 0.0268, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.598496913909912, - "rewards/margins": 9.002246856689453, - "rewards/rejected": -11.60074234008789, - "step": 8680 + "eval_logits/chosen": -2.1661880016326904, + "eval_logits/rejected": -2.100615978240967, + "eval_logps/chosen": -273.85870361328125, + "eval_logps/rejected": -303.7488708496094, + "eval_loss": 0.6079710125923157, + "eval_rewards/accuracies": 0.675000011920929, + "eval_rewards/chosen": -7.789770126342773, + "eval_rewards/margins": 4.14580774307251, + "eval_rewards/rejected": -11.935577392578125, + "eval_runtime": 131.984, + "eval_samples_per_second": 23.912, + "eval_steps_per_second": 0.379, + "step": 9100 + }, + { + "epoch": 2.19, + "learning_rate": 1.4953645926190052e-07, + "logits/chosen": -1.9663139581680298, + "logits/rejected": -1.8209493160247803, + "logps/chosen": -266.14129638671875, + "logps/rejected": -377.9076843261719, + "loss": 0.0295, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.284947633743286, + "rewards/margins": 10.449875831604004, + "rewards/rejected": -13.734825134277344, + "step": 9110 + }, + { + "epoch": 2.19, + "learning_rate": 1.4909074701372792e-07, + "logits/chosen": -2.2690224647521973, + "logits/rejected": -2.036006450653076, + "logps/chosen": -299.5445861816406, + "logps/rejected": -301.4452209472656, + "loss": 0.1246, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.482708215713501, + "rewards/margins": 12.273618698120117, + "rewards/rejected": -13.756327629089355, + "step": 9120 }, { "epoch": 2.2, - "learning_rate": 1.4876884186873886e-07, - "logits/chosen": -2.284550428390503, - "logits/rejected": -2.113361358642578, - "logps/chosen": -234.25259399414062, - "logps/rejected": -332.28424072265625, - "loss": 0.0155, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.0471138954162598, - "rewards/margins": 10.27766227722168, - "rewards/rejected": -13.324775695800781, - "step": 8690 + "learning_rate": 1.4864503476555535e-07, + "logits/chosen": -2.2247190475463867, + "logits/rejected": -2.300473690032959, + "logps/chosen": -242.97433471679688, + "logps/rejected": -359.6977233886719, + "loss": 0.0316, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1927809715270996, + "rewards/margins": 9.915419578552246, + "rewards/rejected": -13.10820198059082, + "step": 9130 }, { "epoch": 2.2, - "learning_rate": 1.4830072090628218e-07, - "logits/chosen": -2.464995861053467, - "logits/rejected": -2.440201759338379, - "logps/chosen": -295.09588623046875, - "logps/rejected": -337.61492919921875, - "loss": 0.0259, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.0475311279296875, - "rewards/margins": 10.91230583190918, - "rewards/rejected": -11.959836959838867, - "step": 8700 + "learning_rate": 1.4819932251738275e-07, + "logits/chosen": -2.261200428009033, + "logits/rejected": -2.300924777984619, + "logps/chosen": -214.7405242919922, + "logps/rejected": -345.27239990234375, + "loss": 0.027, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8763138055801392, + "rewards/margins": 12.543069839477539, + "rewards/rejected": -13.419384956359863, + "step": 9140 }, { "epoch": 2.2, - "learning_rate": 1.4783259994382547e-07, - "logits/chosen": -2.500070571899414, - "logits/rejected": -2.3573880195617676, - "logps/chosen": -325.81646728515625, - "logps/rejected": -481.3152770996094, - "loss": 0.0074, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9885588884353638, - "rewards/margins": 15.744482040405273, - "rewards/rejected": -14.755925178527832, - "step": 8710 + "learning_rate": 1.4775361026921018e-07, + "logits/chosen": -2.2778267860412598, + "logits/rejected": -2.1965887546539307, + "logps/chosen": -253.302734375, + "logps/rejected": -334.212158203125, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1072163581848145, + "rewards/margins": 10.70435905456543, + "rewards/rejected": -13.811578750610352, + "step": 9150 }, { "epoch": 2.2, - "learning_rate": 1.4736447898136878e-07, - "logits/chosen": -2.5980992317199707, - "logits/rejected": -2.4638357162475586, - "logps/chosen": -326.902587890625, - "logps/rejected": -382.9718933105469, - "loss": 0.0241, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.2018386870622635, - "rewards/margins": 12.231120109558105, - "rewards/rejected": -12.432960510253906, - "step": 8720 + "learning_rate": 1.473078980210376e-07, + "logits/chosen": -2.337700843811035, + "logits/rejected": -2.2513089179992676, + "logps/chosen": -307.0190124511719, + "logps/rejected": -291.26092529296875, + "loss": 0.0335, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7315356731414795, + "rewards/margins": 9.64849853515625, + "rewards/rejected": -12.380033493041992, + "step": 9160 }, { "epoch": 2.21, - "learning_rate": 1.468963580189121e-07, - "logits/chosen": -2.485769748687744, - "logits/rejected": -2.4419784545898438, - "logps/chosen": -312.25067138671875, - "logps/rejected": -351.7271728515625, - "loss": 0.0266, + "learning_rate": 1.46862185772865e-07, + "logits/chosen": -2.507974624633789, + "logits/rejected": -2.3630850315093994, + "logps/chosen": -279.1249084472656, + "logps/rejected": -342.54498291015625, + "loss": 0.0521, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.1176087856292725, + "rewards/margins": 10.511555671691895, + "rewards/rejected": -12.62916374206543, + "step": 9170 + }, + { + "epoch": 2.21, + "learning_rate": 1.4641647352469244e-07, + "logits/chosen": -2.514815092086792, + "logits/rejected": -2.3654842376708984, + "logps/chosen": -248.6379852294922, + "logps/rejected": -459.70965576171875, + "loss": 0.0203, "rewards/accuracies": 1.0, - "rewards/chosen": -1.832437515258789, - "rewards/margins": 10.075966835021973, - "rewards/rejected": -11.908404350280762, - "step": 8730 + "rewards/chosen": -0.42885732650756836, + "rewards/margins": 12.995776176452637, + "rewards/rejected": -13.424633979797363, + "step": 9180 }, { "epoch": 2.21, - "learning_rate": 1.4642823705645539e-07, - "logits/chosen": -2.316826105117798, - "logits/rejected": -2.4810423851013184, - "logps/chosen": -236.44052124023438, - "logps/rejected": -311.9570617675781, - "loss": 0.038, + "learning_rate": 1.4597076127651987e-07, + "logits/chosen": -2.517486095428467, + "logits/rejected": -2.3683810234069824, + "logps/chosen": -331.0230407714844, + "logps/rejected": -350.1826171875, + "loss": 0.0194, "rewards/accuracies": 1.0, - "rewards/chosen": -1.018673300743103, - "rewards/margins": 9.659832954406738, - "rewards/rejected": -10.678506851196289, - "step": 8740 + "rewards/chosen": 0.27780014276504517, + "rewards/margins": 11.491705894470215, + "rewards/rejected": -11.213905334472656, + "step": 9190 }, { "epoch": 2.21, - "learning_rate": 1.459601160939987e-07, - "logits/chosen": -2.653926372528076, - "logits/rejected": -2.537010908126831, - "logps/chosen": -291.753662109375, - "logps/rejected": -322.4140319824219, - "loss": 0.0246, + "learning_rate": 1.4552504902834727e-07, + "logits/chosen": -2.4122374057769775, + "logits/rejected": -2.2545065879821777, + "logps/chosen": -349.33612060546875, + "logps/rejected": -302.2837219238281, + "loss": 0.0371, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.0316170454025269, - "rewards/margins": 9.18066692352295, - "rewards/rejected": -10.212284088134766, - "step": 8750 + "rewards/chosen": -2.4120290279388428, + "rewards/margins": 8.94854736328125, + "rewards/rejected": -11.360577583312988, + "step": 9200 }, { "epoch": 2.21, - "learning_rate": 1.4549199513154197e-07, - "logits/chosen": -2.6024010181427, - "logits/rejected": -2.4828600883483887, - "logps/chosen": -405.0433044433594, - "logps/rejected": -491.9237365722656, - "loss": 0.0178, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.7376415133476257, - "rewards/margins": 12.731585502624512, - "rewards/rejected": -13.46922779083252, - "step": 8760 + "eval_logits/chosen": -2.215470790863037, + "eval_logits/rejected": -2.155618667602539, + "eval_logps/chosen": -271.5960388183594, + "eval_logps/rejected": -301.44329833984375, + "eval_loss": 0.6149211525917053, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -7.5634989738464355, + "eval_rewards/margins": 4.141521453857422, + "eval_rewards/rejected": -11.705020904541016, + "eval_runtime": 132.187, + "eval_samples_per_second": 23.875, + "eval_steps_per_second": 0.378, + "step": 9200 }, { "epoch": 2.22, - "learning_rate": 1.4502387416908528e-07, - "logits/chosen": -2.42547869682312, - "logits/rejected": -2.299511671066284, - "logps/chosen": -236.38729858398438, - "logps/rejected": -299.8211364746094, - "loss": 0.0026, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.5624053478240967, - "rewards/margins": 9.061653137207031, - "rewards/rejected": -11.624059677124023, - "step": 8770 + "learning_rate": 1.450793367801747e-07, + "logits/chosen": -2.3780016899108887, + "logits/rejected": -2.2906293869018555, + "logps/chosen": -314.30657958984375, + "logps/rejected": -379.53179931640625, + "loss": 0.0447, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.712317943572998, + "rewards/margins": 11.500996589660645, + "rewards/rejected": -14.2133150100708, + "step": 9210 }, { "epoch": 2.22, - "learning_rate": 1.4455575320662857e-07, - "logits/chosen": -2.3677194118499756, - "logits/rejected": -2.2375476360321045, - "logps/chosen": -293.98370361328125, - "logps/rejected": -419.0365295410156, - "loss": 0.0109, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.0371956825256348, - "rewards/margins": 12.501256942749023, - "rewards/rejected": -14.5384521484375, - "step": 8780 + "learning_rate": 1.4463362453200213e-07, + "logits/chosen": -2.303220272064209, + "logits/rejected": -2.1944663524627686, + "logps/chosen": -237.29061889648438, + "logps/rejected": -268.38177490234375, + "loss": 0.0533, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.2472636699676514, + "rewards/margins": 10.482453346252441, + "rewards/rejected": -12.729717254638672, + "step": 9220 }, { "epoch": 2.22, - "learning_rate": 1.4408763224417189e-07, - "logits/chosen": -2.31605863571167, - "logits/rejected": -2.36974835395813, - "logps/chosen": -233.51315307617188, - "logps/rejected": -291.78143310546875, - "loss": 0.0341, + "learning_rate": 1.4418791228382956e-07, + "logits/chosen": -2.6654210090637207, + "logits/rejected": -2.4725797176361084, + "logps/chosen": -282.4718017578125, + "logps/rejected": -315.22467041015625, + "loss": 0.0399, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.6936259269714355, - "rewards/margins": 7.989305019378662, - "rewards/rejected": -10.682931900024414, - "step": 8790 + "rewards/chosen": -1.465663194656372, + "rewards/margins": 9.992634773254395, + "rewards/rejected": -11.458298683166504, + "step": 9230 }, { "epoch": 2.22, - "learning_rate": 1.4361951128171518e-07, - "logits/chosen": -2.484557628631592, - "logits/rejected": -2.4126553535461426, - "logps/chosen": -246.5937957763672, - "logps/rejected": -323.31671142578125, - "loss": 0.0249, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.5201890468597412, - "rewards/margins": 9.29806137084961, - "rewards/rejected": -10.818249702453613, - "step": 8800 + "learning_rate": 1.43742200035657e-07, + "logits/chosen": -2.5580074787139893, + "logits/rejected": -2.591865301132202, + "logps/chosen": -262.86260986328125, + "logps/rejected": -356.44317626953125, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.891562819480896, + "rewards/margins": 11.680395126342773, + "rewards/rejected": -13.57196044921875, + "step": 9240 }, { "epoch": 2.23, - "learning_rate": 1.431513903192585e-07, - "logits/chosen": -2.5416526794433594, - "logits/rejected": -2.418910264968872, - "logps/chosen": -273.0360107421875, - "logps/rejected": -309.00787353515625, - "loss": 0.0529, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.139587163925171, - "rewards/margins": 11.749044418334961, - "rewards/rejected": -13.888630867004395, - "step": 8810 + "learning_rate": 1.432964877874844e-07, + "logits/chosen": -2.549896717071533, + "logits/rejected": -2.2038583755493164, + "logps/chosen": -296.05181884765625, + "logps/rejected": -308.00762939453125, + "loss": 0.044, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.270108461380005, + "rewards/margins": 10.156574249267578, + "rewards/rejected": -13.42668342590332, + "step": 9250 }, { "epoch": 2.23, - "learning_rate": 1.4268326935680178e-07, - "logits/chosen": -2.2652955055236816, - "logits/rejected": -2.275373697280884, - "logps/chosen": -282.45245361328125, - "logps/rejected": -355.07855224609375, - "loss": 0.011, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.4622039794921875, - "rewards/margins": 12.305490493774414, - "rewards/rejected": -13.767694473266602, - "step": 8820 + "learning_rate": 1.4285077553931182e-07, + "logits/chosen": -2.394225597381592, + "logits/rejected": -2.246384382247925, + "logps/chosen": -245.6808319091797, + "logps/rejected": -386.2701721191406, + "loss": 0.0329, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.294296979904175, + "rewards/margins": 10.8009672164917, + "rewards/rejected": -14.095263481140137, + "step": 9260 }, { "epoch": 2.23, - "learning_rate": 1.422151483943451e-07, - "logits/chosen": -2.5491325855255127, - "logits/rejected": -2.653744697570801, - "logps/chosen": -300.6777648925781, - "logps/rejected": -386.43768310546875, - "loss": 0.0253, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.2964599132537842, - "rewards/margins": 12.887380599975586, - "rewards/rejected": -14.18384075164795, - "step": 8830 + "learning_rate": 1.4240506329113925e-07, + "logits/chosen": -2.492157459259033, + "logits/rejected": -2.44276762008667, + "logps/chosen": -256.7959899902344, + "logps/rejected": -307.61724853515625, + "loss": 0.0276, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5127564072608948, + "rewards/margins": 10.328193664550781, + "rewards/rejected": -10.840948104858398, + "step": 9270 }, { "epoch": 2.23, - "learning_rate": 1.417470274318884e-07, - "logits/chosen": -2.336543560028076, - "logits/rejected": -2.199955701828003, - "logps/chosen": -235.3468475341797, - "logps/rejected": -328.2801208496094, - "loss": 0.0107, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.0419225692749023, - "rewards/margins": 9.49148941040039, - "rewards/rejected": -11.53341293334961, - "step": 8840 + "learning_rate": 1.4195935104296666e-07, + "logits/chosen": -2.4448800086975098, + "logits/rejected": -2.4512434005737305, + "logps/chosen": -197.6912078857422, + "logps/rejected": -302.0965881347656, + "loss": 0.0304, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.3855786323547363, + "rewards/margins": 8.117597579956055, + "rewards/rejected": -11.503175735473633, + "step": 9280 }, { "epoch": 2.24, - "learning_rate": 1.412789064694317e-07, - "logits/chosen": -2.620922565460205, - "logits/rejected": -2.45328688621521, - "logps/chosen": -279.2070617675781, - "logps/rejected": -362.3523254394531, - "loss": 0.0314, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.4809529781341553, - "rewards/margins": 12.773890495300293, - "rewards/rejected": -14.254842758178711, - "step": 8850 + "learning_rate": 1.4151363879479409e-07, + "logits/chosen": -2.3412024974823, + "logits/rejected": -2.257275104522705, + "logps/chosen": -317.9329528808594, + "logps/rejected": -316.4639892578125, + "loss": 0.0275, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.4877772331237793, + "rewards/margins": 9.137595176696777, + "rewards/rejected": -12.625372886657715, + "step": 9290 }, { "epoch": 2.24, - "learning_rate": 1.4081078550697502e-07, - "logits/chosen": -2.3447766304016113, - "logits/rejected": -2.2119431495666504, - "logps/chosen": -223.18124389648438, - "logps/rejected": -394.81146240234375, - "loss": 0.026, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.897385358810425, - "rewards/margins": 10.753974914550781, - "rewards/rejected": -13.651362419128418, - "step": 8860 + "learning_rate": 1.4106792654662152e-07, + "logits/chosen": -2.5886285305023193, + "logits/rejected": -2.4372642040252686, + "logps/chosen": -299.9481201171875, + "logps/rejected": -345.8121032714844, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1748989373445511, + "rewards/margins": 12.7409029006958, + "rewards/rejected": -12.56600284576416, + "step": 9300 }, { "epoch": 2.24, - "learning_rate": 1.4034266454451828e-07, - "logits/chosen": -2.4083456993103027, - "logits/rejected": -2.4523205757141113, - "logps/chosen": -314.6570739746094, - "logps/rejected": -476.2359924316406, - "loss": 0.0058, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.19616852700710297, - "rewards/margins": 15.615038871765137, - "rewards/rejected": -15.418869018554688, - "step": 8870 + "eval_logits/chosen": -2.2398574352264404, + "eval_logits/rejected": -2.177757978439331, + "eval_logps/chosen": -277.6473083496094, + "eval_logps/rejected": -308.83966064453125, + "eval_loss": 0.6154993176460266, + "eval_rewards/accuracies": 0.6775000095367432, + "eval_rewards/chosen": -8.168630599975586, + "eval_rewards/margins": 4.27602481842041, + "eval_rewards/rejected": -12.444655418395996, + "eval_runtime": 132.2816, + "eval_samples_per_second": 23.858, + "eval_steps_per_second": 0.378, + "step": 9300 }, { "epoch": 2.24, - "learning_rate": 1.398745435820616e-07, - "logits/chosen": -2.2582082748413086, - "logits/rejected": -2.3017678260803223, - "logps/chosen": -309.89971923828125, - "logps/rejected": -369.45458984375, - "loss": 0.043, + "learning_rate": 1.4062221429844892e-07, + "logits/chosen": -2.4051930904388428, + "logits/rejected": -2.3327088356018066, + "logps/chosen": -259.2762145996094, + "logps/rejected": -419.8612365722656, + "loss": 0.1935, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.484740734100342, + "rewards/margins": 8.872762680053711, + "rewards/rejected": -12.357503890991211, + "step": 9310 + }, + { + "epoch": 2.24, + "learning_rate": 1.4017650205027635e-07, + "logits/chosen": -2.5085196495056152, + "logits/rejected": -2.5422542095184326, + "logps/chosen": -283.96514892578125, + "logps/rejected": -354.5263671875, + "loss": 0.0334, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.0767788887023926, + "rewards/margins": 10.314794540405273, + "rewards/rejected": -12.391573905944824, + "step": 9320 + }, + { + "epoch": 2.25, + "learning_rate": 1.3973078980210375e-07, + "logits/chosen": -2.539790630340576, + "logits/rejected": -2.4084389209747314, + "logps/chosen": -255.6092529296875, + "logps/rejected": -356.59381103515625, + "loss": 0.0212, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.068044662475586, - "rewards/margins": 11.423654556274414, - "rewards/rejected": -13.49169921875, - "step": 8880 + "rewards/chosen": -3.166245937347412, + "rewards/margins": 10.530374526977539, + "rewards/rejected": -13.696619987487793, + "step": 9330 + }, + { + "epoch": 2.25, + "learning_rate": 1.3928507755393118e-07, + "logits/chosen": -2.496283531188965, + "logits/rejected": -2.5676708221435547, + "logps/chosen": -308.56671142578125, + "logps/rejected": -410.56475830078125, + "loss": 0.0438, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.019418478012085, + "rewards/margins": 13.008565902709961, + "rewards/rejected": -14.027982711791992, + "step": 9340 }, { "epoch": 2.25, - "learning_rate": 1.3940642261960489e-07, - "logits/chosen": -2.507986068725586, - "logits/rejected": -2.3526625633239746, - "logps/chosen": -384.5350036621094, - "logps/rejected": -393.1593933105469, - "loss": 0.0225, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.5077223777770996, - "rewards/margins": 10.584321975708008, - "rewards/rejected": -13.09204387664795, - "step": 8890 + "learning_rate": 1.388393653057586e-07, + "logits/chosen": -2.680394411087036, + "logits/rejected": -2.5199759006500244, + "logps/chosen": -325.9722900390625, + "logps/rejected": -337.5803527832031, + "loss": 0.0214, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7766210436820984, + "rewards/margins": 8.828282356262207, + "rewards/rejected": -9.604904174804688, + "step": 9350 }, { "epoch": 2.25, - "learning_rate": 1.389383016571482e-07, - "logits/chosen": -2.3579325675964355, - "logits/rejected": -2.460432291030884, - "logps/chosen": -286.2724609375, - "logps/rejected": -325.6170959472656, - "loss": 0.0242, + "learning_rate": 1.38393653057586e-07, + "logits/chosen": -2.499191999435425, + "logits/rejected": -2.4237887859344482, + "logps/chosen": -204.8596649169922, + "logps/rejected": -261.20428466796875, + "loss": 0.0174, "rewards/accuracies": 1.0, - "rewards/chosen": -0.48190802335739136, - "rewards/margins": 9.54693603515625, - "rewards/rejected": -10.028843879699707, - "step": 8900 + "rewards/chosen": -0.04103156924247742, + "rewards/margins": 9.294739723205566, + "rewards/rejected": -9.335771560668945, + "step": 9360 }, { - "epoch": 2.25, - "learning_rate": 1.384701806946915e-07, - "logits/chosen": -2.3750059604644775, - "logits/rejected": -2.5095913410186768, - "logps/chosen": -215.9287109375, - "logps/rejected": -340.02880859375, - "loss": 0.0406, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.5825750827789307, - "rewards/margins": 9.099946975708008, - "rewards/rejected": -11.682523727416992, - "step": 8910 + "epoch": 2.26, + "learning_rate": 1.3794794080941344e-07, + "logits/chosen": -2.6455764770507812, + "logits/rejected": -2.509627103805542, + "logps/chosen": -308.97271728515625, + "logps/rejected": -346.7227478027344, + "loss": 0.0335, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.6119283437728882, + "rewards/margins": 9.723724365234375, + "rewards/rejected": -11.335652351379395, + "step": 9370 }, { - "epoch": 2.25, - "learning_rate": 1.380020597322348e-07, - "logits/chosen": -2.4824776649475098, - "logits/rejected": -2.3568711280822754, - "logps/chosen": -240.6768341064453, - "logps/rejected": -319.8079833984375, - "loss": 0.0102, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.2835326194763184, - "rewards/margins": 8.985818862915039, - "rewards/rejected": -12.269351959228516, - "step": 8920 + "epoch": 2.26, + "learning_rate": 1.3750222856124087e-07, + "logits/chosen": -2.6599440574645996, + "logits/rejected": -2.5252063274383545, + "logps/chosen": -283.8724060058594, + "logps/rejected": -318.01300048828125, + "loss": 0.0479, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.13417676091194153, + "rewards/margins": 12.659725189208984, + "rewards/rejected": -12.793903350830078, + "step": 9380 }, { "epoch": 2.26, - "learning_rate": 1.375339387697781e-07, - "logits/chosen": -2.248166561126709, - "logits/rejected": -2.1565823554992676, - "logps/chosen": -220.9330596923828, - "logps/rejected": -338.0259094238281, - "loss": 0.0314, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.05809593200683594, - "rewards/margins": 11.133580207824707, - "rewards/rejected": -11.191675186157227, - "step": 8930 + "learning_rate": 1.3705651631306827e-07, + "logits/chosen": -2.461179494857788, + "logits/rejected": -2.3756933212280273, + "logps/chosen": -194.1339569091797, + "logps/rejected": -293.0584411621094, + "loss": 0.0578, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3713197708129883, + "rewards/margins": 12.192877769470215, + "rewards/rejected": -13.564196586608887, + "step": 9390 }, { "epoch": 2.26, - "learning_rate": 1.370658178073214e-07, - "logits/chosen": -2.3465335369110107, - "logits/rejected": -2.1472835540771484, - "logps/chosen": -303.6405029296875, - "logps/rejected": -448.4706115722656, - "loss": 0.0118, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.8687835931777954, - "rewards/margins": 15.166003227233887, - "rewards/rejected": -17.034786224365234, - "step": 8940 + "learning_rate": 1.366108040648957e-07, + "logits/chosen": -2.545292615890503, + "logits/rejected": -2.5755152702331543, + "logps/chosen": -293.4745788574219, + "logps/rejected": -407.249267578125, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8673050999641418, + "rewards/margins": 12.070673942565918, + "rewards/rejected": -12.937980651855469, + "step": 9400 }, { "epoch": 2.26, - "learning_rate": 1.3659769684486473e-07, - "logits/chosen": -2.6118006706237793, - "logits/rejected": -2.6439757347106934, - "logps/chosen": -399.91021728515625, - "logps/rejected": -482.90924072265625, - "loss": 0.0681, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.12734787166118622, - "rewards/margins": 11.891292572021484, - "rewards/rejected": -11.763945579528809, - "step": 8950 + "eval_logits/chosen": -2.295834541320801, + "eval_logits/rejected": -2.2403228282928467, + "eval_logps/chosen": -274.2550048828125, + "eval_logps/rejected": -304.8091735839844, + "eval_loss": 0.6137393116950989, + "eval_rewards/accuracies": 0.6700000166893005, + "eval_rewards/chosen": -7.829398155212402, + "eval_rewards/margins": 4.212209701538086, + "eval_rewards/rejected": -12.041608810424805, + "eval_runtime": 132.3596, + "eval_samples_per_second": 23.844, + "eval_steps_per_second": 0.378, + "step": 9400 }, { "epoch": 2.26, - "learning_rate": 1.3612957588240802e-07, - "logits/chosen": -2.6660571098327637, - "logits/rejected": -2.5263445377349854, - "logps/chosen": -280.5657653808594, - "logps/rejected": -375.1638488769531, - "loss": 0.0044, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.1703410148620605, - "rewards/margins": 9.894917488098145, - "rewards/rejected": -12.065258026123047, - "step": 8960 + "learning_rate": 1.361650918167231e-07, + "logits/chosen": -2.384176731109619, + "logits/rejected": -2.3541951179504395, + "logps/chosen": -277.32928466796875, + "logps/rejected": -330.84307861328125, + "loss": 0.0593, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.691202163696289, + "rewards/margins": 8.922127723693848, + "rewards/rejected": -11.61332893371582, + "step": 9410 }, { "epoch": 2.27, - "learning_rate": 1.356614549199513e-07, - "logits/chosen": -2.3960671424865723, - "logits/rejected": -2.389824628829956, - "logps/chosen": -271.875732421875, - "logps/rejected": -421.37152099609375, - "loss": 0.0156, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.11441369354724884, - "rewards/margins": 11.73409366607666, - "rewards/rejected": -11.61967945098877, - "step": 8970 + "learning_rate": 1.3571937956855053e-07, + "logits/chosen": -2.5649285316467285, + "logits/rejected": -2.368058681488037, + "logps/chosen": -305.06719970703125, + "logps/rejected": -331.3009338378906, + "loss": 0.0275, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.207766056060791, + "rewards/margins": 13.491445541381836, + "rewards/rejected": -14.699213027954102, + "step": 9420 }, { "epoch": 2.27, - "learning_rate": 1.351933339574946e-07, - "logits/chosen": -2.498690605163574, - "logits/rejected": -2.442638874053955, - "logps/chosen": -332.7298889160156, - "logps/rejected": -372.57525634765625, - "loss": 0.0466, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.4244102239608765, - "rewards/margins": 10.737654685974121, - "rewards/rejected": -12.162065505981445, - "step": 8980 + "learning_rate": 1.3527366732037796e-07, + "logits/chosen": -2.5269012451171875, + "logits/rejected": -2.5459303855895996, + "logps/chosen": -363.5727233886719, + "logps/rejected": -446.6298828125, + "loss": 0.0872, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.667677402496338, + "rewards/margins": 9.697546005249023, + "rewards/rejected": -12.365221977233887, + "step": 9430 }, { "epoch": 2.27, - "learning_rate": 1.347252129950379e-07, - "logits/chosen": -2.467064619064331, - "logits/rejected": -2.436596393585205, - "logps/chosen": -338.3554992675781, - "logps/rejected": -352.9508361816406, - "loss": 0.0307, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.0824882984161377, - "rewards/margins": 8.874943733215332, - "rewards/rejected": -10.95743179321289, - "step": 8990 + "learning_rate": 1.3482795507220537e-07, + "logits/chosen": -2.597001314163208, + "logits/rejected": -2.5953915119171143, + "logps/chosen": -262.72509765625, + "logps/rejected": -417.89935302734375, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8631964921951294, + "rewards/margins": 16.523540496826172, + "rewards/rejected": -15.660344123840332, + "step": 9440 }, { - "epoch": 2.28, - "learning_rate": 1.342570920325812e-07, - "logits/chosen": -2.48591685295105, - "logits/rejected": -2.327716827392578, - "logps/chosen": -225.51107788085938, - "logps/rejected": -343.0364990234375, - "loss": 0.0263, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.3385100364685059, - "rewards/margins": 9.523209571838379, - "rewards/rejected": -10.861720085144043, - "step": 9000 + "epoch": 2.27, + "learning_rate": 1.343822428240328e-07, + "logits/chosen": -2.534240484237671, + "logits/rejected": -2.4725382328033447, + "logps/chosen": -312.21697998046875, + "logps/rejected": -394.55133056640625, + "loss": 0.0166, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.467499256134033, + "rewards/margins": 10.204544067382812, + "rewards/rejected": -13.672042846679688, + "step": 9450 }, { "epoch": 2.28, - "learning_rate": 1.3378897107012452e-07, - "logits/chosen": -2.4900214672088623, - "logits/rejected": -2.4207189083099365, - "logps/chosen": -335.5738525390625, - "logps/rejected": -325.5471496582031, - "loss": 0.0385, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.3286423683166504, - "rewards/margins": 10.190323829650879, - "rewards/rejected": -12.51896858215332, - "step": 9010 + "learning_rate": 1.3393653057586023e-07, + "logits/chosen": -2.3375308513641357, + "logits/rejected": -2.3017425537109375, + "logps/chosen": -247.7636260986328, + "logps/rejected": -336.9314880371094, + "loss": 0.023, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0907466411590576, + "rewards/margins": 9.062185287475586, + "rewards/rejected": -11.152931213378906, + "step": 9460 }, { "epoch": 2.28, - "learning_rate": 1.333208501076678e-07, - "logits/chosen": -2.5807812213897705, - "logits/rejected": -2.552722930908203, - "logps/chosen": -298.92108154296875, - "logps/rejected": -359.711181640625, - "loss": 0.0368, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.9593040943145752, - "rewards/margins": 9.846508979797363, - "rewards/rejected": -11.805811882019043, - "step": 9020 + "learning_rate": 1.3349081832768763e-07, + "logits/chosen": -2.531371593475342, + "logits/rejected": -2.462928056716919, + "logps/chosen": -292.72955322265625, + "logps/rejected": -324.849853515625, + "loss": 0.0232, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.4821574687957764, + "rewards/margins": 8.786005020141602, + "rewards/rejected": -11.268162727355957, + "step": 9470 }, { "epoch": 2.28, - "learning_rate": 1.3285272914521112e-07, - "logits/chosen": -2.588824510574341, - "logits/rejected": -2.516756057739258, - "logps/chosen": -343.57745361328125, - "logps/rejected": -385.2336730957031, - "loss": 0.018, + "learning_rate": 1.3304510607951506e-07, + "logits/chosen": -2.636730432510376, + "logits/rejected": -2.5447049140930176, + "logps/chosen": -247.34933471679688, + "logps/rejected": -309.447021484375, + "loss": 0.0187, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.6453635096549988, - "rewards/margins": 9.960227012634277, - "rewards/rejected": -10.6055908203125, - "step": 9030 + "rewards/chosen": 0.4195857048034668, + "rewards/margins": 12.351226806640625, + "rewards/rejected": -11.931640625, + "step": 9480 + }, + { + "epoch": 2.28, + "learning_rate": 1.3259939383134246e-07, + "logits/chosen": -2.4578869342803955, + "logits/rejected": -2.4700393676757812, + "logps/chosen": -274.8259582519531, + "logps/rejected": -368.4563903808594, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1490941047668457, + "rewards/margins": 12.789095878601074, + "rewards/rejected": -14.938189506530762, + "step": 9490 }, { "epoch": 2.29, - "learning_rate": 1.3238460818275444e-07, - "logits/chosen": -2.5509066581726074, - "logits/rejected": -2.501462459564209, - "logps/chosen": -232.3785858154297, - "logps/rejected": -356.4541931152344, - "loss": 0.0095, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5372840762138367, - "rewards/margins": 11.531806945800781, - "rewards/rejected": -12.069090843200684, - "step": 9040 + "learning_rate": 1.321536815831699e-07, + "logits/chosen": -2.5568885803222656, + "logits/rejected": -2.560631275177002, + "logps/chosen": -236.66342163085938, + "logps/rejected": -420.954345703125, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3946126103401184, + "rewards/margins": 13.331586837768555, + "rewards/rejected": -13.72619915008545, + "step": 9500 }, { "epoch": 2.29, - "learning_rate": 1.3191648722029773e-07, - "logits/chosen": -2.479043483734131, - "logits/rejected": -2.459160566329956, - "logps/chosen": -336.2101135253906, - "logps/rejected": -355.56170654296875, - "loss": 0.0149, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.3328600525856018, - "rewards/margins": 12.044123649597168, - "rewards/rejected": -11.711263656616211, - "step": 9050 + "eval_logits/chosen": -2.3496193885803223, + "eval_logits/rejected": -2.2925631999969482, + "eval_logps/chosen": -275.1883544921875, + "eval_logps/rejected": -307.2347412109375, + "eval_loss": 0.6238428354263306, + "eval_rewards/accuracies": 0.675000011920929, + "eval_rewards/chosen": -7.922736644744873, + "eval_rewards/margins": 4.361425399780273, + "eval_rewards/rejected": -12.284161567687988, + "eval_runtime": 132.2862, + "eval_samples_per_second": 23.857, + "eval_steps_per_second": 0.378, + "step": 9500 }, { "epoch": 2.29, - "learning_rate": 1.3144836625784104e-07, - "logits/chosen": -2.6534008979797363, - "logits/rejected": -2.392120361328125, - "logps/chosen": -354.12432861328125, - "logps/rejected": -394.28521728515625, - "loss": 0.0063, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8992952108383179, - "rewards/margins": 11.788612365722656, - "rewards/rejected": -12.687907218933105, - "step": 9060 + "learning_rate": 1.3170796933499732e-07, + "logits/chosen": -2.596870183944702, + "logits/rejected": -2.551544666290283, + "logps/chosen": -226.4156494140625, + "logps/rejected": -450.28851318359375, + "loss": 0.0258, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5191240310668945, + "rewards/margins": 12.775012969970703, + "rewards/rejected": -14.294137954711914, + "step": 9510 }, { "epoch": 2.29, - "learning_rate": 1.309802452953843e-07, - "logits/chosen": -2.467833995819092, - "logits/rejected": -2.377655029296875, - "logps/chosen": -172.44436645507812, - "logps/rejected": -307.0425109863281, - "loss": 0.009, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.1732361316680908, - "rewards/margins": 10.656147956848145, - "rewards/rejected": -11.829381942749023, - "step": 9070 + "learning_rate": 1.3126225708682472e-07, + "logits/chosen": -2.622180938720703, + "logits/rejected": -2.552677631378174, + "logps/chosen": -241.6460723876953, + "logps/rejected": -334.58270263671875, + "loss": 0.0335, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.2114663124084473, + "rewards/margins": 10.476550102233887, + "rewards/rejected": -13.688018798828125, + "step": 9520 + }, + { + "epoch": 2.29, + "learning_rate": 1.3081654483865215e-07, + "logits/chosen": -2.336742401123047, + "logits/rejected": -2.327411413192749, + "logps/chosen": -229.7757568359375, + "logps/rejected": -404.87872314453125, + "loss": 0.0493, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.6361451148986816, + "rewards/margins": 12.966715812683105, + "rewards/rejected": -15.602861404418945, + "step": 9530 }, { "epoch": 2.3, - "learning_rate": 1.3051212433292762e-07, - "logits/chosen": -2.5307626724243164, - "logits/rejected": -2.511751174926758, - "logps/chosen": -279.1362609863281, - "logps/rejected": -463.63214111328125, - "loss": 0.0171, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.1896442025899887, - "rewards/margins": 12.709389686584473, - "rewards/rejected": -12.519744873046875, - "step": 9080 + "learning_rate": 1.3037083259047958e-07, + "logits/chosen": -2.622959852218628, + "logits/rejected": -2.509721279144287, + "logps/chosen": -332.7505798339844, + "logps/rejected": -362.7456359863281, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.044748067855835, + "rewards/margins": 11.685507774353027, + "rewards/rejected": -12.730256080627441, + "step": 9540 }, { "epoch": 2.3, - "learning_rate": 1.300440033704709e-07, - "logits/chosen": -2.435119390487671, - "logits/rejected": -2.4567675590515137, - "logps/chosen": -257.6720275878906, - "logps/rejected": -317.12664794921875, - "loss": 0.0119, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.369809627532959, - "rewards/margins": 9.4765625, - "rewards/rejected": -11.846372604370117, - "step": 9090 + "learning_rate": 1.2992512034230698e-07, + "logits/chosen": -2.7111148834228516, + "logits/rejected": -2.644036054611206, + "logps/chosen": -288.38226318359375, + "logps/rejected": -446.72369384765625, + "loss": 0.044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4897494912147522, + "rewards/margins": 12.886700630187988, + "rewards/rejected": -13.37645149230957, + "step": 9550 }, { "epoch": 2.3, - "learning_rate": 1.2957588240801423e-07, - "logits/chosen": -2.7394285202026367, - "logits/rejected": -2.7037463188171387, - "logps/chosen": -427.0330505371094, - "logps/rejected": -442.126220703125, - "loss": 0.0206, + "learning_rate": 1.2947940809413444e-07, + "logits/chosen": -2.5800740718841553, + "logits/rejected": -2.4798507690429688, + "logps/chosen": -231.86795043945312, + "logps/rejected": -312.1792297363281, + "loss": 0.0157, "rewards/accuracies": 1.0, - "rewards/chosen": -1.711077332496643, - "rewards/margins": 10.261374473571777, - "rewards/rejected": -11.972452163696289, - "step": 9100 + "rewards/chosen": -1.5118236541748047, + "rewards/margins": 9.782416343688965, + "rewards/rejected": -11.294239044189453, + "step": 9560 }, { "epoch": 2.3, - "learning_rate": 1.2910776144555752e-07, - "logits/chosen": -2.494143009185791, - "logits/rejected": -2.436272144317627, - "logps/chosen": -254.1820068359375, - "logps/rejected": -328.72955322265625, - "loss": 0.0127, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5816042423248291, - "rewards/margins": 10.016374588012695, - "rewards/rejected": -10.597977638244629, - "step": 9110 + "learning_rate": 1.2903369584596184e-07, + "logits/chosen": -2.4425435066223145, + "logits/rejected": -2.327712297439575, + "logps/chosen": -192.24102783203125, + "logps/rejected": -291.2496643066406, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.220635890960693, + "rewards/margins": 10.254015922546387, + "rewards/rejected": -14.474653244018555, + "step": 9570 }, { "epoch": 2.31, - "learning_rate": 1.2863964048310083e-07, - "logits/chosen": -2.5131287574768066, - "logits/rejected": -2.4419400691986084, - "logps/chosen": -214.53964233398438, - "logps/rejected": -357.4662170410156, - "loss": 0.0219, + "learning_rate": 1.2858798359778927e-07, + "logits/chosen": -2.612334728240967, + "logits/rejected": -2.5917248725891113, + "logps/chosen": -302.56268310546875, + "logps/rejected": -374.328369140625, + "loss": 0.048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38091421127319336, + "rewards/margins": 11.838956832885742, + "rewards/rejected": -12.219871520996094, + "step": 9580 + }, + { + "epoch": 2.31, + "learning_rate": 1.281422713496167e-07, + "logits/chosen": -2.6529603004455566, + "logits/rejected": -2.5329434871673584, + "logps/chosen": -268.00885009765625, + "logps/rejected": -335.9152526855469, + "loss": 0.0306, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.9653714299201965, - "rewards/margins": 10.164517402648926, - "rewards/rejected": -11.129888534545898, - "step": 9120 + "rewards/chosen": -2.0708703994750977, + "rewards/margins": 9.467866897583008, + "rewards/rejected": -11.538736343383789, + "step": 9590 }, { "epoch": 2.31, - "learning_rate": 1.2817151952064412e-07, - "logits/chosen": -2.556823492050171, - "logits/rejected": -2.44260835647583, - "logps/chosen": -343.62738037109375, - "logps/rejected": -361.4820556640625, - "loss": 0.0174, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.45240944623947144, - "rewards/margins": 9.722356796264648, - "rewards/rejected": -10.17476749420166, - "step": 9130 + "learning_rate": 1.276965591014441e-07, + "logits/chosen": -2.537541627883911, + "logits/rejected": -2.5117154121398926, + "logps/chosen": -229.8907928466797, + "logps/rejected": -390.6585998535156, + "loss": 0.0412, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2214009314775467, + "rewards/margins": 12.097308158874512, + "rewards/rejected": -12.318709373474121, + "step": 9600 }, { "epoch": 2.31, - "learning_rate": 1.2770339855818744e-07, - "logits/chosen": -2.4272677898406982, - "logits/rejected": -2.476644515991211, - "logps/chosen": -273.29840087890625, - "logps/rejected": -372.9749755859375, - "loss": 0.0105, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.32155928015708923, - "rewards/margins": 12.134693145751953, - "rewards/rejected": -12.456252098083496, - "step": 9140 + "eval_logits/chosen": -2.296140670776367, + "eval_logits/rejected": -2.2376718521118164, + "eval_logps/chosen": -273.0552978515625, + "eval_logps/rejected": -304.16845703125, + "eval_loss": 0.6126354932785034, + "eval_rewards/accuracies": 0.6700000166893005, + "eval_rewards/chosen": -7.709428310394287, + "eval_rewards/margins": 4.268109321594238, + "eval_rewards/rejected": -11.97753620147705, + "eval_runtime": 132.3536, + "eval_samples_per_second": 23.845, + "eval_steps_per_second": 0.378, + "step": 9600 }, { "epoch": 2.31, - "learning_rate": 1.2723527759573075e-07, - "logits/chosen": -2.484036684036255, - "logits/rejected": -2.4402365684509277, - "logps/chosen": -264.90240478515625, - "logps/rejected": -367.96478271484375, - "loss": 0.0153, + "learning_rate": 1.2725084685327153e-07, + "logits/chosen": -2.482020616531372, + "logits/rejected": -2.4609808921813965, + "logps/chosen": -234.28176879882812, + "logps/rejected": -424.196533203125, + "loss": 0.0288, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2595276832580566, + "rewards/margins": 14.13378620147705, + "rewards/rejected": -16.393314361572266, + "step": 9610 + }, + { + "epoch": 2.32, + "learning_rate": 1.2680513460509896e-07, + "logits/chosen": -2.533411979675293, + "logits/rejected": -2.554663896560669, + "logps/chosen": -288.6836853027344, + "logps/rejected": -402.8774719238281, + "loss": 0.0558, "rewards/accuracies": 1.0, - "rewards/chosen": 0.5199756026268005, - "rewards/margins": 12.971882820129395, - "rewards/rejected": -12.451906204223633, - "step": 9150 + "rewards/chosen": -1.1928050518035889, + "rewards/margins": 12.324869155883789, + "rewards/rejected": -13.517674446105957, + "step": 9620 }, { "epoch": 2.32, - "learning_rate": 1.2676715663327404e-07, - "logits/chosen": -2.6081578731536865, - "logits/rejected": -2.5950417518615723, - "logps/chosen": -319.1241149902344, - "logps/rejected": -425.2018127441406, - "loss": 0.0235, + "learning_rate": 1.2635942235692637e-07, + "logits/chosen": -2.334124803543091, + "logits/rejected": -2.3364882469177246, + "logps/chosen": -262.31719970703125, + "logps/rejected": -394.1955261230469, + "loss": 0.0255, "rewards/accuracies": 1.0, - "rewards/chosen": 0.09917531162500381, - "rewards/margins": 11.424158096313477, - "rewards/rejected": -11.324983596801758, - "step": 9160 + "rewards/chosen": -1.4039243459701538, + "rewards/margins": 12.162347793579102, + "rewards/rejected": -13.566271781921387, + "step": 9630 }, { "epoch": 2.32, - "learning_rate": 1.2629903567081736e-07, - "logits/chosen": -2.36436128616333, - "logits/rejected": -2.354889392852783, - "logps/chosen": -259.3992919921875, - "logps/rejected": -392.87396240234375, - "loss": 0.0201, + "learning_rate": 1.259137101087538e-07, + "logits/chosen": -2.7416768074035645, + "logits/rejected": -2.649817943572998, + "logps/chosen": -285.6194152832031, + "logps/rejected": -333.78131103515625, + "loss": 0.0207, "rewards/accuracies": 1.0, - "rewards/chosen": -1.1882728338241577, - "rewards/margins": 11.059471130371094, - "rewards/rejected": -12.247743606567383, - "step": 9170 + "rewards/chosen": -1.3569809198379517, + "rewards/margins": 10.611593246459961, + "rewards/rejected": -11.968573570251465, + "step": 9640 }, { "epoch": 2.32, - "learning_rate": 1.2583091470836062e-07, - "logits/chosen": -2.565932035446167, - "logits/rejected": -2.3525166511535645, - "logps/chosen": -254.3878631591797, - "logps/rejected": -318.05963134765625, - "loss": 0.0233, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.5903030633926392, - "rewards/margins": 10.55272102355957, - "rewards/rejected": -11.143023490905762, - "step": 9180 + "learning_rate": 1.254679978605812e-07, + "logits/chosen": -2.615839958190918, + "logits/rejected": -2.37062668800354, + "logps/chosen": -329.7728576660156, + "logps/rejected": -370.6965637207031, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8861749768257141, + "rewards/margins": 11.890405654907227, + "rewards/rejected": -12.776578903198242, + "step": 9650 }, { "epoch": 2.32, - "learning_rate": 1.2536279374590394e-07, - "logits/chosen": -2.463155508041382, - "logits/rejected": -2.339812755584717, - "logps/chosen": -225.89944458007812, - "logps/rejected": -430.6494140625, - "loss": 0.0138, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.0934032201766968, - "rewards/margins": 11.839479446411133, - "rewards/rejected": -12.932882308959961, - "step": 9190 + "learning_rate": 1.2502228561240863e-07, + "logits/chosen": -2.6863248348236084, + "logits/rejected": -2.5339202880859375, + "logps/chosen": -337.3868103027344, + "logps/rejected": -345.2395935058594, + "loss": 0.0359, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1403334140777588, + "rewards/margins": 9.43513011932373, + "rewards/rejected": -10.575462341308594, + "step": 9660 }, { "epoch": 2.33, - "learning_rate": 1.2489467278344723e-07, - "logits/chosen": -2.4740588665008545, - "logits/rejected": -2.270139455795288, - "logps/chosen": -258.7538146972656, - "logps/rejected": -514.9562377929688, - "loss": 0.0298, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.0893930196762085, - "rewards/margins": 14.917033195495605, - "rewards/rejected": -16.006427764892578, - "step": 9200 + "learning_rate": 1.2457657336423606e-07, + "logits/chosen": -2.7165229320526123, + "logits/rejected": -2.630030393600464, + "logps/chosen": -303.4105529785156, + "logps/rejected": -427.3023376464844, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2123167514801025, + "rewards/margins": 14.350173950195312, + "rewards/rejected": -13.137857437133789, + "step": 9670 }, { "epoch": 2.33, - "learning_rate": 1.2442655182099054e-07, - "logits/chosen": -2.522426128387451, - "logits/rejected": -2.581494092941284, - "logps/chosen": -283.09307861328125, - "logps/rejected": -441.8668518066406, - "loss": 0.0257, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.10143113136291504, - "rewards/margins": 14.812362670898438, - "rewards/rejected": -14.913793563842773, - "step": 9210 + "learning_rate": 1.2413086111606346e-07, + "logits/chosen": -2.5882039070129395, + "logits/rejected": -2.4288363456726074, + "logps/chosen": -254.5582733154297, + "logps/rejected": -373.54327392578125, + "loss": 0.0408, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7697980403900146, + "rewards/margins": 11.864550590515137, + "rewards/rejected": -13.63434886932373, + "step": 9680 }, { "epoch": 2.33, - "learning_rate": 1.2395843085853386e-07, - "logits/chosen": -2.485147476196289, - "logits/rejected": -2.320444345474243, - "logps/chosen": -319.28997802734375, - "logps/rejected": -346.0876770019531, - "loss": 0.0212, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.241127848625183, - "rewards/margins": 10.240372657775879, - "rewards/rejected": -11.481500625610352, - "step": 9220 + "learning_rate": 1.236851488678909e-07, + "logits/chosen": -2.676398515701294, + "logits/rejected": -2.6112141609191895, + "logps/chosen": -324.1001892089844, + "logps/rejected": -520.2833862304688, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1287245750427246, + "rewards/margins": 12.324732780456543, + "rewards/rejected": -13.453455924987793, + "step": 9690 }, { "epoch": 2.33, - "learning_rate": 1.2349030989607715e-07, - "logits/chosen": -2.55083966255188, - "logits/rejected": -2.4898033142089844, - "logps/chosen": -255.58920288085938, - "logps/rejected": -358.78155517578125, - "loss": 0.0202, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.457207441329956, - "rewards/margins": 11.063669204711914, - "rewards/rejected": -12.520875930786133, - "step": 9230 + "learning_rate": 1.2323943661971832e-07, + "logits/chosen": -2.5960118770599365, + "logits/rejected": -2.6163535118103027, + "logps/chosen": -246.47561645507812, + "logps/rejected": -356.0195007324219, + "loss": 0.0413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06816250085830688, + "rewards/margins": 12.434173583984375, + "rewards/rejected": -12.502337455749512, + "step": 9700 + }, + { + "epoch": 2.33, + "eval_logits/chosen": -2.3099844455718994, + "eval_logits/rejected": -2.2505338191986084, + "eval_logps/chosen": -271.9911804199219, + "eval_logps/rejected": -303.114013671875, + "eval_loss": 0.6130265593528748, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -7.603017807006836, + "eval_rewards/margins": 4.269077301025391, + "eval_rewards/rejected": -11.872095108032227, + "eval_runtime": 132.082, + "eval_samples_per_second": 23.894, + "eval_steps_per_second": 0.379, + "step": 9700 }, { "epoch": 2.34, - "learning_rate": 1.2302218893362044e-07, - "logits/chosen": -2.5158586502075195, - "logits/rejected": -2.3378074169158936, - "logps/chosen": -288.1528625488281, - "logps/rejected": -298.3981628417969, - "loss": 0.0205, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.2239353656768799, - "rewards/margins": 8.507644653320312, - "rewards/rejected": -9.731579780578613, - "step": 9240 + "learning_rate": 1.2279372437154572e-07, + "logits/chosen": -2.611393451690674, + "logits/rejected": -2.4602303504943848, + "logps/chosen": -297.416259765625, + "logps/rejected": -343.02337646484375, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.003729224205017, + "rewards/margins": 13.627870559692383, + "rewards/rejected": -12.62414264678955, + "step": 9710 }, { "epoch": 2.34, - "learning_rate": 1.2255406797116375e-07, - "logits/chosen": -2.4785284996032715, - "logits/rejected": -2.3354721069335938, - "logps/chosen": -290.882568359375, - "logps/rejected": -308.3099670410156, - "loss": 0.016, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.4787499904632568, - "rewards/margins": 10.869338035583496, - "rewards/rejected": -12.348088264465332, - "step": 9250 + "learning_rate": 1.2234801212337315e-07, + "logits/chosen": -2.6968941688537598, + "logits/rejected": -2.633288860321045, + "logps/chosen": -263.99578857421875, + "logps/rejected": -370.8405456542969, + "loss": 0.0457, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.723885178565979, + "rewards/margins": 10.335236549377441, + "rewards/rejected": -12.059122085571289, + "step": 9720 }, { "epoch": 2.34, - "learning_rate": 1.2208594700870704e-07, - "logits/chosen": -2.5821151733398438, - "logits/rejected": -2.450249671936035, - "logps/chosen": -309.484130859375, - "logps/rejected": -372.41534423828125, - "loss": 0.0257, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7294740676879883, - "rewards/margins": 11.100632667541504, - "rewards/rejected": -12.830106735229492, - "step": 9260 + "learning_rate": 1.2190229987520055e-07, + "logits/chosen": -2.675483226776123, + "logits/rejected": -2.657735586166382, + "logps/chosen": -342.79071044921875, + "logps/rejected": -447.1167907714844, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7835860848426819, + "rewards/margins": 12.174049377441406, + "rewards/rejected": -12.957635879516602, + "step": 9730 }, { "epoch": 2.34, - "learning_rate": 1.2161782604625036e-07, - "logits/chosen": -2.374976634979248, - "logits/rejected": -2.3441929817199707, - "logps/chosen": -290.5916748046875, - "logps/rejected": -347.4356384277344, - "loss": 0.0163, + "learning_rate": 1.2145658762702798e-07, + "logits/chosen": -2.63665509223938, + "logits/rejected": -2.5963196754455566, + "logps/chosen": -230.5588836669922, + "logps/rejected": -317.8227233886719, + "loss": 0.0315, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.8753108978271484, - "rewards/margins": 8.887423515319824, - "rewards/rejected": -11.762734413146973, - "step": 9270 + "rewards/chosen": -2.044243097305298, + "rewards/margins": 11.024585723876953, + "rewards/rejected": -13.068829536437988, + "step": 9740 }, { "epoch": 2.35, - "learning_rate": 1.2114970508379365e-07, - "logits/chosen": -2.481886386871338, - "logits/rejected": -2.4045209884643555, - "logps/chosen": -195.5873565673828, - "logps/rejected": -339.64373779296875, - "loss": 0.0252, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.2074648141860962, - "rewards/margins": 9.534823417663574, - "rewards/rejected": -10.742288589477539, - "step": 9280 + "learning_rate": 1.210108753788554e-07, + "logits/chosen": -2.432250738143921, + "logits/rejected": -2.298980712890625, + "logps/chosen": -339.8245849609375, + "logps/rejected": -363.7259216308594, + "loss": 0.038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1412423849105835, + "rewards/margins": 12.045231819152832, + "rewards/rejected": -13.18647575378418, + "step": 9750 }, { "epoch": 2.35, - "learning_rate": 1.2068158412133694e-07, - "logits/chosen": -2.5621516704559326, - "logits/rejected": -2.567854404449463, - "logps/chosen": -298.40313720703125, - "logps/rejected": -410.24267578125, - "loss": 0.0122, + "learning_rate": 1.2056516313068281e-07, + "logits/chosen": -2.3919894695281982, + "logits/rejected": -2.199410915374756, + "logps/chosen": -302.87982177734375, + "logps/rejected": -470.61395263671875, + "loss": 0.0333, "rewards/accuracies": 1.0, - "rewards/chosen": 0.6887444257736206, - "rewards/margins": 12.210759162902832, - "rewards/rejected": -11.522014617919922, - "step": 9290 + "rewards/chosen": -3.032304525375366, + "rewards/margins": 14.317781448364258, + "rewards/rejected": -17.350086212158203, + "step": 9760 }, { "epoch": 2.35, - "learning_rate": 1.2021346315888025e-07, - "logits/chosen": -2.139488697052002, - "logits/rejected": -2.2088325023651123, - "logps/chosen": -208.7119140625, - "logps/rejected": -313.7248840332031, - "loss": 0.0204, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.9525458812713623, - "rewards/margins": 8.045282363891602, - "rewards/rejected": -10.997827529907227, - "step": 9300 + "learning_rate": 1.2011945088251024e-07, + "logits/chosen": -2.6363577842712402, + "logits/rejected": -2.6026124954223633, + "logps/chosen": -213.67977905273438, + "logps/rejected": -325.95745849609375, + "loss": 0.0066, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.43132483959198, + "rewards/margins": 10.520303726196289, + "rewards/rejected": -11.951627731323242, + "step": 9770 }, { "epoch": 2.35, - "learning_rate": 1.1974534219642357e-07, - "logits/chosen": -2.704371452331543, - "logits/rejected": -2.423408269882202, - "logps/chosen": -410.0580139160156, - "logps/rejected": -451.2791442871094, - "loss": 0.0138, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.14632168412208557, - "rewards/margins": 15.756739616394043, - "rewards/rejected": -15.610417366027832, - "step": 9310 + "learning_rate": 1.1967373863433767e-07, + "logits/chosen": -2.779999256134033, + "logits/rejected": -2.4936938285827637, + "logps/chosen": -360.51361083984375, + "logps/rejected": -325.37237548828125, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5159060955047607, + "rewards/margins": 10.373706817626953, + "rewards/rejected": -11.889612197875977, + "step": 9780 }, { "epoch": 2.36, - "learning_rate": 1.1927722123396686e-07, - "logits/chosen": -2.501812696456909, - "logits/rejected": -2.5868372917175293, - "logps/chosen": -320.36383056640625, - "logps/rejected": -485.62835693359375, - "loss": 0.0139, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.1520987749099731, - "rewards/margins": 12.831914901733398, - "rewards/rejected": -13.984013557434082, - "step": 9320 + "learning_rate": 1.1922802638616508e-07, + "logits/chosen": -2.607733964920044, + "logits/rejected": -2.42354416847229, + "logps/chosen": -262.10113525390625, + "logps/rejected": -377.81695556640625, + "loss": 0.0239, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.5643954277038574, + "rewards/margins": 11.185702323913574, + "rewards/rejected": -13.750099182128906, + "step": 9790 }, { "epoch": 2.36, - "learning_rate": 1.1880910027151016e-07, - "logits/chosen": -2.5127670764923096, - "logits/rejected": -2.3409245014190674, - "logps/chosen": -268.2634582519531, - "logps/rejected": -256.263671875, - "loss": 0.0186, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.45365089178085327, - "rewards/margins": 9.961838722229004, - "rewards/rejected": -9.508188247680664, - "step": 9330 + "learning_rate": 1.187823141379925e-07, + "logits/chosen": -2.672086238861084, + "logits/rejected": -2.6203792095184326, + "logps/chosen": -245.962890625, + "logps/rejected": -398.09979248046875, + "loss": 0.0361, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.213087797164917, + "rewards/margins": 10.098217010498047, + "rewards/rejected": -12.311304092407227, + "step": 9800 }, { "epoch": 2.36, - "learning_rate": 1.1834097930905346e-07, - "logits/chosen": -2.5624654293060303, - "logits/rejected": -2.5135717391967773, - "logps/chosen": -262.6741943359375, - "logps/rejected": -349.298095703125, - "loss": 0.0382, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.664712429046631, - "rewards/margins": 9.775838851928711, - "rewards/rejected": -12.440550804138184, - "step": 9340 + "eval_logits/chosen": -2.286639928817749, + "eval_logits/rejected": -2.224881172180176, + "eval_logps/chosen": -277.2341003417969, + "eval_logps/rejected": -310.4034423828125, + "eval_loss": 0.6247988939285278, + "eval_rewards/accuracies": 0.675000011920929, + "eval_rewards/chosen": -8.127306938171387, + "eval_rewards/margins": 4.473727703094482, + "eval_rewards/rejected": -12.601035118103027, + "eval_runtime": 132.3403, + "eval_samples_per_second": 23.848, + "eval_steps_per_second": 0.378, + "step": 9800 }, { "epoch": 2.36, - "learning_rate": 1.1787285834659675e-07, - "logits/chosen": -2.4618287086486816, - "logits/rejected": -2.3042941093444824, - "logps/chosen": -248.24282836914062, - "logps/rejected": -318.86090087890625, - "loss": 0.0463, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3148791193962097, - "rewards/margins": 9.420307159423828, - "rewards/rejected": -9.735186576843262, - "step": 9350 + "learning_rate": 1.1833660188981992e-07, + "logits/chosen": -2.5166168212890625, + "logits/rejected": -2.461851119995117, + "logps/chosen": -299.8094177246094, + "logps/rejected": -390.20477294921875, + "loss": 0.0451, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.7335727214813232, + "rewards/margins": 10.28907299041748, + "rewards/rejected": -13.022645950317383, + "step": 9810 + }, + { + "epoch": 2.36, + "learning_rate": 1.1789088964164735e-07, + "logits/chosen": -2.447730779647827, + "logits/rejected": -2.3857762813568115, + "logps/chosen": -238.92440795898438, + "logps/rejected": -390.6455993652344, + "loss": 0.0319, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9215359687805176, + "rewards/margins": 11.32634162902832, + "rewards/rejected": -14.24787712097168, + "step": 9820 }, { "epoch": 2.37, - "learning_rate": 1.1740473738414005e-07, - "logits/chosen": -2.482856273651123, - "logits/rejected": -2.5394861698150635, - "logps/chosen": -361.1878967285156, - "logps/rejected": -414.9200134277344, - "loss": 0.0301, + "learning_rate": 1.1744517739347477e-07, + "logits/chosen": -2.370506763458252, + "logits/rejected": -2.244635581970215, + "logps/chosen": -213.49697875976562, + "logps/rejected": -312.31207275390625, + "loss": 0.0334, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.191426992416382, - "rewards/margins": 11.798062324523926, - "rewards/rejected": -13.98948860168457, - "step": 9360 + "rewards/chosen": -2.4892866611480713, + "rewards/margins": 11.822929382324219, + "rewards/rejected": -14.312215805053711, + "step": 9830 }, { "epoch": 2.37, - "learning_rate": 1.1693661642168336e-07, - "logits/chosen": -2.445554733276367, - "logits/rejected": -2.4558825492858887, - "logps/chosen": -248.78958129882812, - "logps/rejected": -354.56280517578125, - "loss": 0.0133, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.0716218948364258, - "rewards/margins": 11.133877754211426, - "rewards/rejected": -12.205499649047852, - "step": 9370 + "learning_rate": 1.169994651453022e-07, + "logits/chosen": -2.581648111343384, + "logits/rejected": -2.45662260055542, + "logps/chosen": -337.57928466796875, + "logps/rejected": -355.1318664550781, + "loss": 0.0518, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.595403671264648, + "rewards/margins": 9.963247299194336, + "rewards/rejected": -14.5586519241333, + "step": 9840 }, { "epoch": 2.37, - "learning_rate": 1.1646849545922666e-07, - "logits/chosen": -2.413811445236206, - "logits/rejected": -2.4532883167266846, - "logps/chosen": -328.21826171875, - "logps/rejected": -404.242431640625, - "loss": 0.0167, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.2563796043395996, - "rewards/margins": 10.868789672851562, - "rewards/rejected": -14.12516975402832, - "step": 9380 + "learning_rate": 1.1655375289712961e-07, + "logits/chosen": -2.5107738971710205, + "logits/rejected": -2.4025065898895264, + "logps/chosen": -156.2109375, + "logps/rejected": -285.67010498046875, + "loss": 0.0752, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6317264437675476, + "rewards/margins": 11.287073135375977, + "rewards/rejected": -11.918798446655273, + "step": 9850 }, { "epoch": 2.37, - "learning_rate": 1.1600037449676996e-07, - "logits/chosen": -2.4219603538513184, - "logits/rejected": -2.388962507247925, - "logps/chosen": -275.37677001953125, - "logps/rejected": -308.3039245605469, - "loss": 0.024, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.2332690954208374, - "rewards/margins": 9.072051048278809, - "rewards/rejected": -10.305319786071777, - "step": 9390 + "learning_rate": 1.1610804064895703e-07, + "logits/chosen": -2.52023983001709, + "logits/rejected": -2.5326385498046875, + "logps/chosen": -210.8177947998047, + "logps/rejected": -321.39031982421875, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30382394790649414, + "rewards/margins": 12.216788291931152, + "rewards/rejected": -12.520612716674805, + "step": 9860 }, { "epoch": 2.38, - "learning_rate": 1.1553225353431325e-07, - "logits/chosen": -2.4700279235839844, - "logits/rejected": -2.3462586402893066, - "logps/chosen": -204.75643920898438, - "logps/rejected": -344.16973876953125, - "loss": 0.0123, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.9443933963775635, - "rewards/margins": 10.954790115356445, - "rewards/rejected": -13.89918327331543, - "step": 9400 + "learning_rate": 1.1566232840078444e-07, + "logits/chosen": -2.6442270278930664, + "logits/rejected": -2.537446975708008, + "logps/chosen": -311.75177001953125, + "logps/rejected": -411.7709045410156, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09315244108438492, + "rewards/margins": 12.612189292907715, + "rewards/rejected": -12.519037246704102, + "step": 9870 }, { "epoch": 2.38, - "learning_rate": 1.1506413257185655e-07, - "logits/chosen": -2.33776593208313, - "logits/rejected": -2.3722383975982666, - "logps/chosen": -352.3572998046875, - "logps/rejected": -380.77789306640625, - "loss": 0.0115, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.33109062910079956, - "rewards/margins": 11.438632011413574, - "rewards/rejected": -11.10754108428955, - "step": 9410 + "learning_rate": 1.1521661615261187e-07, + "logits/chosen": -2.4287455081939697, + "logits/rejected": -2.30899715423584, + "logps/chosen": -212.9908447265625, + "logps/rejected": -376.57501220703125, + "loss": 0.0242, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.574534893035889, + "rewards/margins": 9.451489448547363, + "rewards/rejected": -14.026025772094727, + "step": 9880 }, { "epoch": 2.38, - "learning_rate": 1.1459601160939987e-07, - "logits/chosen": -2.5390992164611816, - "logits/rejected": -2.3394479751586914, - "logps/chosen": -291.4183654785156, - "logps/rejected": -307.16278076171875, - "loss": 0.0293, + "learning_rate": 1.1477090390443929e-07, + "logits/chosen": -2.620258092880249, + "logits/rejected": -2.505781650543213, + "logps/chosen": -260.06182861328125, + "logps/rejected": -337.3952941894531, + "loss": 0.0258, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.1895270347595215, - "rewards/margins": 9.163797378540039, - "rewards/rejected": -12.353324890136719, - "step": 9420 + "rewards/chosen": -2.016458034515381, + "rewards/margins": 9.409826278686523, + "rewards/rejected": -11.426284790039062, + "step": 9890 + }, + { + "epoch": 2.38, + "learning_rate": 1.143251916562667e-07, + "logits/chosen": -2.396528482437134, + "logits/rejected": -2.3226048946380615, + "logps/chosen": -210.72518920898438, + "logps/rejected": -251.00595092773438, + "loss": 0.0289, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.4992809295654297, + "rewards/margins": 8.538719177246094, + "rewards/rejected": -11.038000106811523, + "step": 9900 + }, + { + "epoch": 2.38, + "eval_logits/chosen": -2.3067312240600586, + "eval_logits/rejected": -2.247349262237549, + "eval_logps/chosen": -275.88525390625, + "eval_logps/rejected": -308.2185363769531, + "eval_loss": 0.6191706657409668, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -7.992426872253418, + "eval_rewards/margins": 4.390118598937988, + "eval_rewards/rejected": -12.382543563842773, + "eval_runtime": 132.5321, + "eval_samples_per_second": 23.813, + "eval_steps_per_second": 0.377, + "step": 9900 }, { - "epoch": 2.38, - "learning_rate": 1.1412789064694317e-07, - "logits/chosen": -2.480581045150757, - "logits/rejected": -2.452671527862549, - "logps/chosen": -257.4129638671875, - "logps/rejected": -364.5929260253906, - "loss": 0.0351, + "epoch": 2.39, + "learning_rate": 1.1387947940809412e-07, + "logits/chosen": -2.5669615268707275, + "logits/rejected": -2.534998655319214, + "logps/chosen": -308.62481689453125, + "logps/rejected": -409.72576904296875, + "loss": 0.0206, "rewards/accuracies": 1.0, - "rewards/chosen": -1.3671848773956299, - "rewards/margins": 12.583539962768555, - "rewards/rejected": -13.950726509094238, - "step": 9430 + "rewards/chosen": -2.140786647796631, + "rewards/margins": 11.700662612915039, + "rewards/rejected": -13.841448783874512, + "step": 9910 }, { "epoch": 2.39, - "learning_rate": 1.1365976968448647e-07, - "logits/chosen": -2.4294826984405518, - "logits/rejected": -2.2800369262695312, - "logps/chosen": -264.89166259765625, - "logps/rejected": -358.344482421875, - "loss": 0.0216, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5198668241500854, - "rewards/margins": 11.957700729370117, - "rewards/rejected": -13.477566719055176, - "step": 9440 + "learning_rate": 1.1343376715992155e-07, + "logits/chosen": -2.5555453300476074, + "logits/rejected": -2.405968189239502, + "logps/chosen": -285.4302062988281, + "logps/rejected": -339.9258117675781, + "loss": 0.0319, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9601826667785645, + "rewards/margins": 10.785021781921387, + "rewards/rejected": -14.745203971862793, + "step": 9920 }, { "epoch": 2.39, - "learning_rate": 1.1319164872202978e-07, - "logits/chosen": -2.599461078643799, - "logits/rejected": -2.4103665351867676, - "logps/chosen": -346.68896484375, - "logps/rejected": -361.1012268066406, - "loss": 0.0247, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.0498604774475098, - "rewards/margins": 11.50390338897705, - "rewards/rejected": -13.553762435913086, - "step": 9450 + "learning_rate": 1.1298805491174897e-07, + "logits/chosen": -2.4446024894714355, + "logits/rejected": -2.3654847145080566, + "logps/chosen": -252.49124145507812, + "logps/rejected": -322.60150146484375, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5803260803222656, + "rewards/margins": 11.27856731414795, + "rewards/rejected": -14.858892440795898, + "step": 9930 }, { "epoch": 2.39, - "learning_rate": 1.1272352775957307e-07, - "logits/chosen": -2.304784059524536, - "logits/rejected": -2.3479790687561035, - "logps/chosen": -221.0005340576172, - "logps/rejected": -371.857666015625, - "loss": 0.0273, + "learning_rate": 1.1254234266357638e-07, + "logits/chosen": -2.7171902656555176, + "logits/rejected": -2.5758886337280273, + "logps/chosen": -356.2979431152344, + "logps/rejected": -413.72991943359375, + "loss": 0.0457, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.1888279914855957, - "rewards/margins": 11.113363265991211, - "rewards/rejected": -13.302189826965332, - "step": 9460 + "rewards/chosen": -2.645761489868164, + "rewards/margins": 10.4742431640625, + "rewards/rejected": -13.120004653930664, + "step": 9940 }, { "epoch": 2.39, - "learning_rate": 1.1225540679711637e-07, - "logits/chosen": -2.4169933795928955, - "logits/rejected": -2.522143840789795, - "logps/chosen": -269.467529296875, - "logps/rejected": -374.2334289550781, - "loss": 0.0199, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.357435703277588, - "rewards/margins": 9.489862442016602, - "rewards/rejected": -11.847297668457031, - "step": 9470 + "learning_rate": 1.1209663041540381e-07, + "logits/chosen": -2.4007773399353027, + "logits/rejected": -2.448000431060791, + "logps/chosen": -280.7416076660156, + "logps/rejected": -418.7703552246094, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8805170059204102, + "rewards/margins": 9.915616989135742, + "rewards/rejected": -11.796133041381836, + "step": 9950 }, { "epoch": 2.4, - "learning_rate": 1.1178728583465967e-07, - "logits/chosen": -2.375798463821411, - "logits/rejected": -2.2957353591918945, - "logps/chosen": -262.4629211425781, - "logps/rejected": -425.6641540527344, - "loss": 0.0326, + "learning_rate": 1.1165091816723123e-07, + "logits/chosen": -2.462285041809082, + "logits/rejected": -2.357201099395752, + "logps/chosen": -279.7169494628906, + "logps/rejected": -416.44000244140625, + "loss": 0.0251, "rewards/accuracies": 1.0, - "rewards/chosen": -2.5441296100616455, - "rewards/margins": 12.342843055725098, - "rewards/rejected": -14.886972427368164, - "step": 9480 + "rewards/chosen": -1.9342237710952759, + "rewards/margins": 11.604345321655273, + "rewards/rejected": -13.538569450378418, + "step": 9960 }, { "epoch": 2.4, - "learning_rate": 1.1131916487220297e-07, - "logits/chosen": -2.465956211090088, - "logits/rejected": -2.360863208770752, - "logps/chosen": -239.08798217773438, - "logps/rejected": -299.3997497558594, - "loss": 0.0207, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6827244758605957, - "rewards/margins": 10.561367988586426, - "rewards/rejected": -11.24409008026123, - "step": 9490 + "learning_rate": 1.1120520591905864e-07, + "logits/chosen": -2.3776774406433105, + "logits/rejected": -2.357255458831787, + "logps/chosen": -147.9410858154297, + "logps/rejected": -241.4050750732422, + "loss": 0.0376, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9215605854988098, + "rewards/margins": 11.041479110717773, + "rewards/rejected": -11.963040351867676, + "step": 9970 }, { "epoch": 2.4, - "learning_rate": 1.1085104390974628e-07, - "logits/chosen": -2.694605588912964, - "logits/rejected": -2.5516393184661865, - "logps/chosen": -361.26629638671875, - "logps/rejected": -474.02587890625, - "loss": 0.0252, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9704413414001465, - "rewards/margins": 12.27275276184082, - "rewards/rejected": -13.243192672729492, - "step": 9500 + "learning_rate": 1.1075949367088606e-07, + "logits/chosen": -2.635788679122925, + "logits/rejected": -2.558600902557373, + "logps/chosen": -326.90289306640625, + "logps/rejected": -343.66717529296875, + "loss": 0.0212, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.17625749111175537, + "rewards/margins": 11.242159843444824, + "rewards/rejected": -11.418416976928711, + "step": 9980 }, { "epoch": 2.4, - "learning_rate": 1.1038292294728956e-07, - "logits/chosen": -2.609485387802124, - "logits/rejected": -2.6828622817993164, - "logps/chosen": -199.8148956298828, - "logps/rejected": -420.41717529296875, - "loss": 0.0143, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8817756772041321, - "rewards/margins": 13.943887710571289, - "rewards/rejected": -14.825662612915039, - "step": 9510 + "learning_rate": 1.103137814227135e-07, + "logits/chosen": -2.329444408416748, + "logits/rejected": -2.3129782676696777, + "logps/chosen": -221.0357666015625, + "logps/rejected": -340.22198486328125, + "loss": 0.0203, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.532474517822266, + "rewards/margins": 9.228216171264648, + "rewards/rejected": -13.76069164276123, + "step": 9990 }, { "epoch": 2.41, - "learning_rate": 1.0991480198483287e-07, - "logits/chosen": -2.4950063228607178, - "logits/rejected": -2.3991646766662598, - "logps/chosen": -244.9822235107422, - "logps/rejected": -465.58447265625, - "loss": 0.027, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -4.1490936279296875, - "rewards/margins": 12.020115852355957, - "rewards/rejected": -16.169208526611328, - "step": 9520 + "learning_rate": 1.0986806917454092e-07, + "logits/chosen": -2.6066360473632812, + "logits/rejected": -2.510643720626831, + "logps/chosen": -213.7716064453125, + "logps/rejected": -294.8090515136719, + "loss": 0.038, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7541530132293701, + "rewards/margins": 10.131963729858398, + "rewards/rejected": -11.886116981506348, + "step": 10000 }, { "epoch": 2.41, - "learning_rate": 1.0944668102237618e-07, - "logits/chosen": -2.490548610687256, - "logits/rejected": -2.2655842304229736, - "logps/chosen": -310.09515380859375, - "logps/rejected": -302.21435546875, - "loss": 0.0202, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3610144257545471, - "rewards/margins": 10.774946212768555, - "rewards/rejected": -11.135960578918457, - "step": 9530 + "eval_logits/chosen": -2.293797016143799, + "eval_logits/rejected": -2.2312023639678955, + "eval_logps/chosen": -280.0753173828125, + "eval_logps/rejected": -313.0937194824219, + "eval_loss": 0.6250460147857666, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -8.411430358886719, + "eval_rewards/margins": 4.458633899688721, + "eval_rewards/rejected": -12.870064735412598, + "eval_runtime": 132.4257, + "eval_samples_per_second": 23.832, + "eval_steps_per_second": 0.378, + "step": 10000 }, { "epoch": 2.41, - "learning_rate": 1.0897856005991949e-07, - "logits/chosen": -2.5174641609191895, - "logits/rejected": -2.4812374114990234, - "logps/chosen": -318.38665771484375, - "logps/rejected": -402.00018310546875, - "loss": 0.0143, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.6079021692276, - "rewards/margins": 10.268229484558105, - "rewards/rejected": -11.87613296508789, - "step": 9540 + "learning_rate": 1.0942235692636834e-07, + "logits/chosen": -2.36403751373291, + "logits/rejected": -2.325706958770752, + "logps/chosen": -291.67279052734375, + "logps/rejected": -403.5048522949219, + "loss": 0.0227, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.5491859912872314, + "rewards/margins": 9.878864288330078, + "rewards/rejected": -13.42805004119873, + "step": 10010 }, { "epoch": 2.41, - "learning_rate": 1.0851043909746279e-07, - "logits/chosen": -2.3226113319396973, - "logits/rejected": -2.332900285720825, - "logps/chosen": -221.42001342773438, - "logps/rejected": -340.07373046875, - "loss": 0.0105, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.173617959022522, - "rewards/margins": 11.125650405883789, - "rewards/rejected": -12.299267768859863, - "step": 9550 + "learning_rate": 1.0897664467819575e-07, + "logits/chosen": -2.5598981380462646, + "logits/rejected": -2.383474349975586, + "logps/chosen": -244.7018280029297, + "logps/rejected": -367.6690368652344, + "loss": 0.0318, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.567253589630127, + "rewards/margins": 9.21489429473877, + "rewards/rejected": -13.782148361206055, + "step": 10020 + }, + { + "epoch": 2.41, + "learning_rate": 1.0853093243002318e-07, + "logits/chosen": -2.58016300201416, + "logits/rejected": -2.5092506408691406, + "logps/chosen": -392.6916198730469, + "logps/rejected": -357.5271911621094, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9882642030715942, + "rewards/margins": 12.676253318786621, + "rewards/rejected": -13.664517402648926, + "step": 10030 }, { "epoch": 2.42, - "learning_rate": 1.0804231813500608e-07, - "logits/chosen": -2.6040127277374268, - "logits/rejected": -2.653808116912842, - "logps/chosen": -227.6526336669922, - "logps/rejected": -364.04486083984375, - "loss": 0.0113, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.9157921075820923, - "rewards/margins": 10.997711181640625, - "rewards/rejected": -12.913503646850586, - "step": 9560 + "learning_rate": 1.080852201818506e-07, + "logits/chosen": -2.5156655311584473, + "logits/rejected": -2.398411512374878, + "logps/chosen": -298.3486633300781, + "logps/rejected": -350.2539978027344, + "loss": 0.0205, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.5151679515838623, + "rewards/margins": 10.046378135681152, + "rewards/rejected": -13.561546325683594, + "step": 10040 }, { "epoch": 2.42, - "learning_rate": 1.0757419717254938e-07, - "logits/chosen": -2.4118499755859375, - "logits/rejected": -2.430004596710205, - "logps/chosen": -235.00588989257812, - "logps/rejected": -390.02215576171875, - "loss": 0.0203, + "learning_rate": 1.0763950793367801e-07, + "logits/chosen": -2.6228508949279785, + "logits/rejected": -2.5314218997955322, + "logps/chosen": -255.3243408203125, + "logps/rejected": -383.2738037109375, + "loss": 0.0198, "rewards/accuracies": 1.0, - "rewards/chosen": 0.05648164823651314, - "rewards/margins": 15.429672241210938, - "rewards/rejected": -15.373188972473145, - "step": 9570 + "rewards/chosen": -0.3044343888759613, + "rewards/margins": 10.470191955566406, + "rewards/rejected": -10.774625778198242, + "step": 10050 }, { "epoch": 2.42, - "learning_rate": 1.0710607621009268e-07, - "logits/chosen": -2.375131607055664, - "logits/rejected": -2.3413290977478027, - "logps/chosen": -231.0093994140625, - "logps/rejected": -349.61065673828125, - "loss": 0.0116, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.6894218921661377, - "rewards/margins": 10.271553039550781, - "rewards/rejected": -12.960973739624023, - "step": 9580 + "learning_rate": 1.0719379568550543e-07, + "logits/chosen": -2.4695911407470703, + "logits/rejected": -2.411768913269043, + "logps/chosen": -300.9437561035156, + "logps/rejected": -367.5747375488281, + "loss": 0.0245, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5962684154510498, + "rewards/margins": 10.513290405273438, + "rewards/rejected": -12.109560012817383, + "step": 10060 }, { "epoch": 2.42, - "learning_rate": 1.0663795524763599e-07, - "logits/chosen": -2.2757697105407715, - "logits/rejected": -2.280949831008911, - "logps/chosen": -175.64895629882812, - "logps/rejected": -300.08734130859375, - "loss": 0.019, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.390406847000122, - "rewards/margins": 10.89712142944336, - "rewards/rejected": -13.287528991699219, - "step": 9590 + "learning_rate": 1.0674808343733286e-07, + "logits/chosen": -2.5905518531799316, + "logits/rejected": -2.497077703475952, + "logps/chosen": -282.8239440917969, + "logps/rejected": -362.37884521484375, + "loss": 0.041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4664157629013062, + "rewards/margins": 12.256301879882812, + "rewards/rejected": -13.72271728515625, + "step": 10070 }, { "epoch": 2.43, - "learning_rate": 1.0616983428517929e-07, - "logits/chosen": -2.48026704788208, - "logits/rejected": -2.4394729137420654, - "logps/chosen": -257.1330871582031, - "logps/rejected": -344.04376220703125, - "loss": 0.0236, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.5281569957733154, - "rewards/margins": 8.856816291809082, - "rewards/rejected": -12.38497257232666, - "step": 9600 + "learning_rate": 1.0630237118916027e-07, + "logits/chosen": -2.570895195007324, + "logits/rejected": -2.4456238746643066, + "logps/chosen": -286.3117980957031, + "logps/rejected": -396.8953857421875, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10510861873626709, + "rewards/margins": 15.796917915344238, + "rewards/rejected": -15.902026176452637, + "step": 10080 }, { "epoch": 2.43, - "learning_rate": 1.0570171332272258e-07, - "logits/chosen": -2.5033538341522217, - "logits/rejected": -2.3946590423583984, - "logps/chosen": -394.1955261230469, - "logps/rejected": -396.7699279785156, - "loss": 0.0201, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.2552104890346527, - "rewards/margins": 11.802976608276367, - "rewards/rejected": -12.058186531066895, - "step": 9610 + "learning_rate": 1.0585665894098769e-07, + "logits/chosen": -2.6075587272644043, + "logits/rejected": -2.5166683197021484, + "logps/chosen": -273.12835693359375, + "logps/rejected": -417.6787109375, + "loss": 0.0369, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7467008829116821, + "rewards/margins": 13.216984748840332, + "rewards/rejected": -13.96368408203125, + "step": 10090 }, { "epoch": 2.43, - "learning_rate": 1.0523359236026588e-07, - "logits/chosen": -2.4539897441864014, - "logits/rejected": -2.4002108573913574, - "logps/chosen": -290.2524719238281, - "logps/rejected": -446.51708984375, - "loss": 0.0096, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.071610927581787, - "rewards/margins": 11.670352935791016, - "rewards/rejected": -13.741963386535645, - "step": 9620 + "learning_rate": 1.0541094669281511e-07, + "logits/chosen": -2.5382962226867676, + "logits/rejected": -2.4988698959350586, + "logps/chosen": -336.58953857421875, + "logps/rejected": -386.9100036621094, + "loss": 0.0334, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33577728271484375, + "rewards/margins": 13.491998672485352, + "rewards/rejected": -13.156219482421875, + "step": 10100 }, { "epoch": 2.43, - "learning_rate": 1.0476547139780918e-07, - "logits/chosen": -2.487164258956909, - "logits/rejected": -2.5121045112609863, - "logps/chosen": -289.3143005371094, - "logps/rejected": -355.0195617675781, - "loss": 0.0161, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.06483952701091766, - "rewards/margins": 11.031146049499512, - "rewards/rejected": -11.09598445892334, - "step": 9630 + "eval_logits/chosen": -2.2946829795837402, + "eval_logits/rejected": -2.23030948638916, + "eval_logps/chosen": -287.7679443359375, + "eval_logps/rejected": -321.8813171386719, + "eval_loss": 0.6261496543884277, + "eval_rewards/accuracies": 0.6825000047683716, + "eval_rewards/chosen": -9.180694580078125, + "eval_rewards/margins": 4.5681304931640625, + "eval_rewards/rejected": -13.748824119567871, + "eval_runtime": 132.3626, + "eval_samples_per_second": 23.844, + "eval_steps_per_second": 0.378, + "step": 10100 + }, + { + "epoch": 2.43, + "learning_rate": 1.0496523444464254e-07, + "logits/chosen": -2.733978509902954, + "logits/rejected": -2.360318660736084, + "logps/chosen": -311.40435791015625, + "logps/rejected": -329.2252197265625, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1363164186477661, + "rewards/margins": 13.572772026062012, + "rewards/rejected": -14.709088325500488, + "step": 10110 }, { "epoch": 2.44, - "learning_rate": 1.042973504353525e-07, - "logits/chosen": -2.5066590309143066, - "logits/rejected": -2.42234468460083, - "logps/chosen": -224.26168823242188, - "logps/rejected": -343.7847595214844, - "loss": 0.0226, + "learning_rate": 1.0451952219646995e-07, + "logits/chosen": -2.6775975227355957, + "logits/rejected": -2.624905824661255, + "logps/chosen": -276.4446716308594, + "logps/rejected": -349.79754638671875, + "loss": 0.0501, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.7051777839660645, - "rewards/margins": 11.361398696899414, - "rewards/rejected": -12.06657600402832, - "step": 9640 + "rewards/chosen": -5.87570333480835, + "rewards/margins": 9.55392837524414, + "rewards/rejected": -15.429631233215332, + "step": 10120 }, { "epoch": 2.44, - "learning_rate": 1.038292294728958e-07, - "logits/chosen": -2.538245439529419, - "logits/rejected": -2.390660047531128, - "logps/chosen": -244.62551879882812, - "logps/rejected": -343.1199035644531, - "loss": 0.0232, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -3.566230058670044, - "rewards/margins": 9.027008056640625, - "rewards/rejected": -12.593238830566406, - "step": 9650 + "learning_rate": 1.0407380994829737e-07, + "logits/chosen": -2.602470874786377, + "logits/rejected": -2.4946277141571045, + "logps/chosen": -327.5586242675781, + "logps/rejected": -340.1914978027344, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1518864631652832, + "rewards/margins": 12.21545696258545, + "rewards/rejected": -13.367342948913574, + "step": 10130 }, { "epoch": 2.44, - "learning_rate": 1.0336110851043909e-07, - "logits/chosen": -2.3509981632232666, - "logits/rejected": -2.280942916870117, - "logps/chosen": -278.4637451171875, - "logps/rejected": -337.9454040527344, - "loss": 0.0212, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.56687331199646, - "rewards/margins": 10.138782501220703, - "rewards/rejected": -11.705657005310059, - "step": 9660 + "learning_rate": 1.0362809770012478e-07, + "logits/chosen": -2.5528435707092285, + "logits/rejected": -2.531981945037842, + "logps/chosen": -223.9571533203125, + "logps/rejected": -372.1998596191406, + "loss": 0.048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1788834035396576, + "rewards/margins": 11.404667854309082, + "rewards/rejected": -11.583551406860352, + "step": 10140 }, { "epoch": 2.44, - "learning_rate": 1.0289298754798239e-07, - "logits/chosen": -2.4716315269470215, - "logits/rejected": -2.550530195236206, - "logps/chosen": -304.77490234375, - "logps/rejected": -407.2782287597656, - "loss": 0.0355, + "learning_rate": 1.0318238545195221e-07, + "logits/chosen": -2.625014066696167, + "logits/rejected": -2.5478367805480957, + "logps/chosen": -269.31072998046875, + "logps/rejected": -317.3898620605469, + "loss": 0.0326, "rewards/accuracies": 1.0, - "rewards/chosen": -1.5738002061843872, - "rewards/margins": 11.042047500610352, - "rewards/rejected": -12.615848541259766, - "step": 9670 + "rewards/chosen": -1.1366454362869263, + "rewards/margins": 11.11518669128418, + "rewards/rejected": -12.251832962036133, + "step": 10150 }, { "epoch": 2.45, - "learning_rate": 1.024248665855257e-07, - "logits/chosen": -2.5989766120910645, - "logits/rejected": -2.4732518196105957, - "logps/chosen": -260.64581298828125, - "logps/rejected": -375.7298583984375, - "loss": 0.0199, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.0580739974975586, - "rewards/margins": 13.893926620483398, - "rewards/rejected": -14.952000617980957, - "step": 9680 + "learning_rate": 1.0273667320377964e-07, + "logits/chosen": -2.5161309242248535, + "logits/rejected": -2.3050713539123535, + "logps/chosen": -256.6337585449219, + "logps/rejected": -299.70013427734375, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8057987689971924, + "rewards/margins": 10.062045097351074, + "rewards/rejected": -12.86784553527832, + "step": 10160 }, { "epoch": 2.45, - "learning_rate": 1.01956745623069e-07, - "logits/chosen": -2.539679527282715, - "logits/rejected": -2.405225992202759, - "logps/chosen": -327.54522705078125, - "logps/rejected": -370.77813720703125, - "loss": 0.0111, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.4409523010253906, - "rewards/margins": 11.447229385375977, - "rewards/rejected": -12.88818073272705, - "step": 9690 + "learning_rate": 1.0229096095560706e-07, + "logits/chosen": -2.5865914821624756, + "logits/rejected": -2.3879811763763428, + "logps/chosen": -347.2486877441406, + "logps/rejected": -334.393798828125, + "loss": 0.0371, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.3725287914276123, + "rewards/margins": 10.65519905090332, + "rewards/rejected": -13.027727127075195, + "step": 10170 }, { "epoch": 2.45, - "learning_rate": 1.014886246606123e-07, - "logits/chosen": -2.3739826679229736, - "logits/rejected": -2.367341995239258, - "logps/chosen": -195.1979217529297, - "logps/rejected": -286.34320068359375, - "loss": 0.0184, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5407164096832275, - "rewards/margins": 9.584169387817383, - "rewards/rejected": -11.124883651733398, - "step": 9700 + "learning_rate": 1.0184524870743448e-07, + "logits/chosen": -2.6206719875335693, + "logits/rejected": -2.478562831878662, + "logps/chosen": -283.15142822265625, + "logps/rejected": -360.8038330078125, + "loss": 0.0542, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5782840251922607, + "rewards/margins": 10.573938369750977, + "rewards/rejected": -14.1522216796875, + "step": 10180 }, { "epoch": 2.45, - "learning_rate": 1.0102050369815559e-07, - "logits/chosen": -2.5260393619537354, - "logits/rejected": -2.2634921073913574, - "logps/chosen": -344.4964599609375, - "logps/rejected": -284.90545654296875, - "loss": 0.0157, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.781498908996582, - "rewards/margins": 10.852533340454102, - "rewards/rejected": -12.634031295776367, - "step": 9710 + "learning_rate": 1.013995364592619e-07, + "logits/chosen": -2.5559656620025635, + "logits/rejected": -2.5310585498809814, + "logps/chosen": -314.6678161621094, + "logps/rejected": -317.8202209472656, + "loss": 0.0416, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.259921073913574, + "rewards/margins": 9.425006866455078, + "rewards/rejected": -13.684926986694336, + "step": 10190 + }, + { + "epoch": 2.45, + "learning_rate": 1.0095382421108932e-07, + "logits/chosen": -2.51534104347229, + "logits/rejected": -2.3936638832092285, + "logps/chosen": -282.4815673828125, + "logps/rejected": -308.4475402832031, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.455737590789795, + "rewards/margins": 10.04170036315918, + "rewards/rejected": -12.497437477111816, + "step": 10200 + }, + { + "epoch": 2.45, + "eval_logits/chosen": -2.2451605796813965, + "eval_logits/rejected": -2.181662082672119, + "eval_logps/chosen": -294.175048828125, + "eval_logps/rejected": -327.1667175292969, + "eval_loss": 0.6373821496963501, + "eval_rewards/accuracies": 0.6650000214576721, + "eval_rewards/chosen": -9.821403503417969, + "eval_rewards/margins": 4.455959320068359, + "eval_rewards/rejected": -14.277362823486328, + "eval_runtime": 132.4199, + "eval_samples_per_second": 23.833, + "eval_steps_per_second": 0.378, + "step": 10200 }, { "epoch": 2.46, - "learning_rate": 1.0055238273569889e-07, - "logits/chosen": -2.525456190109253, - "logits/rejected": -2.550560235977173, - "logps/chosen": -259.0410461425781, - "logps/rejected": -403.1743469238281, - "loss": 0.0335, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.9123313426971436, - "rewards/margins": 9.142938613891602, - "rewards/rejected": -11.055269241333008, - "step": 9720 + "learning_rate": 1.0050811196291674e-07, + "logits/chosen": -2.5477230548858643, + "logits/rejected": -2.457096576690674, + "logps/chosen": -318.61688232421875, + "logps/rejected": -342.5857849121094, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1317331790924072, + "rewards/margins": 13.212003707885742, + "rewards/rejected": -14.34373664855957, + "step": 10210 }, { "epoch": 2.46, - "learning_rate": 1.000842617732422e-07, - "logits/chosen": -2.3116941452026367, - "logits/rejected": -2.2560160160064697, - "logps/chosen": -230.031005859375, - "logps/rejected": -308.4913330078125, - "loss": 0.0196, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.852482318878174, - "rewards/margins": 8.964485168457031, - "rewards/rejected": -11.816967964172363, - "step": 9730 + "learning_rate": 1.0006239971474415e-07, + "logits/chosen": -2.4535489082336426, + "logits/rejected": -2.289975881576538, + "logps/chosen": -261.37591552734375, + "logps/rejected": -268.0931091308594, + "loss": 0.0332, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.003266334533691, + "rewards/margins": 7.796333312988281, + "rewards/rejected": -12.799600601196289, + "step": 10220 }, { "epoch": 2.46, - "learning_rate": 9.96161408107855e-08, - "logits/chosen": -2.4614522457122803, - "logits/rejected": -2.355747938156128, - "logps/chosen": -289.4332275390625, - "logps/rejected": -358.6775817871094, - "loss": 0.0115, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.407428741455078, - "rewards/margins": 10.502245903015137, - "rewards/rejected": -13.909673690795898, - "step": 9740 + "learning_rate": 9.961668746657158e-08, + "logits/chosen": -2.5622589588165283, + "logits/rejected": -2.4903953075408936, + "logps/chosen": -339.3744201660156, + "logps/rejected": -380.05560302734375, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.748084545135498, + "rewards/margins": 13.121284484863281, + "rewards/rejected": -15.869367599487305, + "step": 10230 }, { "epoch": 2.46, - "learning_rate": 9.914801984832881e-08, - "logits/chosen": -2.6264684200286865, - "logits/rejected": -2.492650032043457, - "logps/chosen": -338.1631774902344, - "logps/rejected": -362.98101806640625, - "loss": 0.0048, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.27077144384384155, - "rewards/margins": 12.973017692565918, - "rewards/rejected": -12.70224666595459, - "step": 9750 + "learning_rate": 9.9170975218399e-08, + "logits/chosen": -2.4305975437164307, + "logits/rejected": -2.288562297821045, + "logps/chosen": -273.37127685546875, + "logps/rejected": -384.6258850097656, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.67263126373291, + "rewards/margins": 12.442885398864746, + "rewards/rejected": -15.115516662597656, + "step": 10240 }, { "epoch": 2.47, - "learning_rate": 9.867989888587212e-08, - "logits/chosen": -2.56974720954895, - "logits/rejected": -2.4586358070373535, - "logps/chosen": -375.69732666015625, - "logps/rejected": -473.6441345214844, - "loss": 0.0146, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.16659900546073914, - "rewards/margins": 11.835259437561035, - "rewards/rejected": -12.001858711242676, - "step": 9760 + "learning_rate": 9.872526297022641e-08, + "logits/chosen": -2.782963991165161, + "logits/rejected": -2.578382730484009, + "logps/chosen": -322.8417053222656, + "logps/rejected": -392.8038024902344, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.467021942138672, + "rewards/margins": 10.122015953063965, + "rewards/rejected": -12.589037895202637, + "step": 10250 }, { "epoch": 2.47, - "learning_rate": 9.82117779234154e-08, - "logits/chosen": -2.362417697906494, - "logits/rejected": -2.4201173782348633, - "logps/chosen": -202.46035766601562, - "logps/rejected": -443.12335205078125, - "loss": 0.0191, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.9886672496795654, - "rewards/margins": 12.812800407409668, - "rewards/rejected": -15.801467895507812, - "step": 9770 + "learning_rate": 9.827955072205383e-08, + "logits/chosen": -2.342217206954956, + "logits/rejected": -2.5006392002105713, + "logps/chosen": -188.2440185546875, + "logps/rejected": -363.35577392578125, + "loss": 0.0465, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.273957252502441, + "rewards/margins": 11.456818580627441, + "rewards/rejected": -15.730775833129883, + "step": 10260 }, { "epoch": 2.47, - "learning_rate": 9.774365696095871e-08, - "logits/chosen": -2.5065131187438965, - "logits/rejected": -2.3762283325195312, - "logps/chosen": -254.2418975830078, - "logps/rejected": -370.89971923828125, - "loss": 0.0057, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.246609687805176, - "rewards/margins": 11.305074691772461, - "rewards/rejected": -13.551684379577637, - "step": 9780 + "learning_rate": 9.783383847388126e-08, + "logits/chosen": -2.409719944000244, + "logits/rejected": -2.35951828956604, + "logps/chosen": -256.25555419921875, + "logps/rejected": -399.2924499511719, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.982908010482788, + "rewards/margins": 11.558671951293945, + "rewards/rejected": -13.541580200195312, + "step": 10270 }, { "epoch": 2.47, - "learning_rate": 9.727553599850201e-08, - "logits/chosen": -2.370347261428833, - "logits/rejected": -2.3666515350341797, - "logps/chosen": -273.8157958984375, - "logps/rejected": -380.5888977050781, - "loss": 0.0202, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6909661293029785, - "rewards/margins": 13.555691719055176, - "rewards/rejected": -14.246658325195312, - "step": 9790 + "learning_rate": 9.738812622570868e-08, + "logits/chosen": -2.631664752960205, + "logits/rejected": -2.5223374366760254, + "logps/chosen": -294.74896240234375, + "logps/rejected": -340.75762939453125, + "loss": 0.1085, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.424164295196533, + "rewards/margins": 9.47216510772705, + "rewards/rejected": -13.896328926086426, + "step": 10280 }, { "epoch": 2.48, - "learning_rate": 9.680741503604531e-08, - "logits/chosen": -2.531897783279419, - "logits/rejected": -2.4267642498016357, - "logps/chosen": -265.67303466796875, - "logps/rejected": -383.75384521484375, - "loss": 0.0063, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9569188356399536, - "rewards/margins": 11.649717330932617, - "rewards/rejected": -12.606637001037598, - "step": 9800 + "learning_rate": 9.694241397753609e-08, + "logits/chosen": -2.610546112060547, + "logits/rejected": -2.541151762008667, + "logps/chosen": -233.2912139892578, + "logps/rejected": -351.20404052734375, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.288332462310791, + "rewards/margins": 10.961092948913574, + "rewards/rejected": -12.249425888061523, + "step": 10290 }, { "epoch": 2.48, - "learning_rate": 9.633929407358862e-08, - "logits/chosen": -2.5569138526916504, - "logits/rejected": -2.396172046661377, - "logps/chosen": -285.2621765136719, - "logps/rejected": -310.68389892578125, - "loss": 0.007, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.2880444526672363, - "rewards/margins": 9.889970779418945, - "rewards/rejected": -12.178014755249023, - "step": 9810 + "learning_rate": 9.649670172936351e-08, + "logits/chosen": -2.7449355125427246, + "logits/rejected": -2.634033203125, + "logps/chosen": -325.7273864746094, + "logps/rejected": -421.8634338378906, + "loss": 0.0266, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.180922269821167, + "rewards/margins": 11.618342399597168, + "rewards/rejected": -14.79926586151123, + "step": 10300 }, { "epoch": 2.48, - "learning_rate": 9.58711731111319e-08, - "logits/chosen": -2.445523738861084, - "logits/rejected": -2.241865873336792, - "logps/chosen": -355.5696105957031, - "logps/rejected": -393.40972900390625, - "loss": 0.026, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.43356913328170776, - "rewards/margins": 12.471688270568848, - "rewards/rejected": -12.905256271362305, - "step": 9820 + "eval_logits/chosen": -2.3521158695220947, + "eval_logits/rejected": -2.2947356700897217, + "eval_logps/chosen": -279.23907470703125, + "eval_logps/rejected": -310.0836181640625, + "eval_loss": 0.6298311948776245, + "eval_rewards/accuracies": 0.6650000214576721, + "eval_rewards/chosen": -8.327805519104004, + "eval_rewards/margins": 4.241252422332764, + "eval_rewards/rejected": -12.56905746459961, + "eval_runtime": 132.2939, + "eval_samples_per_second": 23.856, + "eval_steps_per_second": 0.378, + "step": 10300 }, { "epoch": 2.48, - "learning_rate": 9.540305214867521e-08, - "logits/chosen": -2.519758701324463, - "logits/rejected": -2.4892241954803467, - "logps/chosen": -238.73196411132812, - "logps/rejected": -396.1922302246094, - "loss": 0.0104, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.1257517337799072, - "rewards/margins": 12.754495620727539, - "rewards/rejected": -13.880247116088867, - "step": 9830 + "learning_rate": 9.605098948119094e-08, + "logits/chosen": -2.3786349296569824, + "logits/rejected": -2.2514824867248535, + "logps/chosen": -333.1474609375, + "logps/rejected": -416.96142578125, + "loss": 0.0338, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6525623798370361, + "rewards/margins": 13.515604972839355, + "rewards/rejected": -15.168169021606445, + "step": 10310 + }, + { + "epoch": 2.48, + "learning_rate": 9.560527723301835e-08, + "logits/chosen": -2.5710606575012207, + "logits/rejected": -2.410841226577759, + "logps/chosen": -351.69708251953125, + "logps/rejected": -350.79364013671875, + "loss": 0.0516, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.712020397186279, + "rewards/margins": 9.47273063659668, + "rewards/rejected": -15.184751510620117, + "step": 10320 }, { "epoch": 2.49, - "learning_rate": 9.493493118621851e-08, - "logits/chosen": -2.6485443115234375, - "logits/rejected": -2.5044517517089844, - "logps/chosen": -353.70904541015625, - "logps/rejected": -471.74151611328125, - "loss": 0.0163, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.10478146374225616, - "rewards/margins": 14.041101455688477, - "rewards/rejected": -14.145881652832031, - "step": 9840 + "learning_rate": 9.515956498484578e-08, + "logits/chosen": -2.4247822761535645, + "logits/rejected": -2.545729398727417, + "logps/chosen": -313.1791076660156, + "logps/rejected": -367.5338134765625, + "loss": 0.0197, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.4451382160186768, + "rewards/margins": 9.958581924438477, + "rewards/rejected": -13.403719902038574, + "step": 10330 }, { "epoch": 2.49, - "learning_rate": 9.446681022376183e-08, - "logits/chosen": -2.4805495738983154, - "logits/rejected": -2.302246570587158, - "logps/chosen": -229.20370483398438, - "logps/rejected": -325.60711669921875, - "loss": 0.0118, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.490037441253662, - "rewards/margins": 14.055638313293457, - "rewards/rejected": -12.565601348876953, - "step": 9850 + "learning_rate": 9.47138527366732e-08, + "logits/chosen": -2.5888113975524902, + "logits/rejected": -2.551140069961548, + "logps/chosen": -314.3278503417969, + "logps/rejected": -412.1373596191406, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.000945806503296, + "rewards/margins": 13.686769485473633, + "rewards/rejected": -15.687715530395508, + "step": 10340 }, { "epoch": 2.49, - "learning_rate": 9.399868926130513e-08, - "logits/chosen": -2.7085890769958496, - "logits/rejected": -2.5546762943267822, - "logps/chosen": -302.5946960449219, - "logps/rejected": -341.9223327636719, - "loss": 0.028, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.5961366295814514, - "rewards/margins": 10.983573913574219, - "rewards/rejected": -11.579710006713867, - "step": 9860 + "learning_rate": 9.426814048850063e-08, + "logits/chosen": -2.664057493209839, + "logits/rejected": -2.5835258960723877, + "logps/chosen": -218.8526611328125, + "logps/rejected": -286.86846923828125, + "loss": 0.0368, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.882927656173706, + "rewards/margins": 11.654741287231445, + "rewards/rejected": -13.537668228149414, + "step": 10350 }, { "epoch": 2.49, - "learning_rate": 9.353056829884842e-08, - "logits/chosen": -2.4775044918060303, - "logits/rejected": -2.400139331817627, - "logps/chosen": -312.041015625, - "logps/rejected": -324.03338623046875, - "loss": 0.0158, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.2200387716293335, - "rewards/margins": 9.557716369628906, - "rewards/rejected": -10.777755737304688, - "step": 9870 + "learning_rate": 9.382242824032804e-08, + "logits/chosen": -2.5984394550323486, + "logits/rejected": -2.3874740600585938, + "logps/chosen": -255.716796875, + "logps/rejected": -384.89971923828125, + "loss": 0.0264, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7643020153045654, + "rewards/margins": 14.913984298706055, + "rewards/rejected": -17.678287506103516, + "step": 10360 }, { "epoch": 2.5, - "learning_rate": 9.306244733639172e-08, - "logits/chosen": -2.431150197982788, - "logits/rejected": -2.3163318634033203, - "logps/chosen": -235.7882080078125, - "logps/rejected": -308.95074462890625, - "loss": 0.0419, + "learning_rate": 9.337671599215546e-08, + "logits/chosen": -2.799041986465454, + "logits/rejected": -2.629462242126465, + "logps/chosen": -354.2980041503906, + "logps/rejected": -415.33392333984375, + "loss": 0.0302, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.866422176361084, + "rewards/margins": 11.385327339172363, + "rewards/rejected": -14.251749038696289, + "step": 10370 + }, + { + "epoch": 2.5, + "learning_rate": 9.293100374398288e-08, + "logits/chosen": -2.5886058807373047, + "logits/rejected": -2.508739709854126, + "logps/chosen": -240.0004119873047, + "logps/rejected": -407.1943359375, + "loss": 0.0265, "rewards/accuracies": 1.0, - "rewards/chosen": -2.804135799407959, - "rewards/margins": 8.712458610534668, - "rewards/rejected": -11.516595840454102, - "step": 9880 + "rewards/chosen": -2.6057114601135254, + "rewards/margins": 11.837148666381836, + "rewards/rejected": -14.442858695983887, + "step": 10380 }, { "epoch": 2.5, - "learning_rate": 9.259432637393502e-08, - "logits/chosen": -2.443704128265381, - "logits/rejected": -2.328042507171631, - "logps/chosen": -288.6375427246094, - "logps/rejected": -308.34906005859375, - "loss": 0.0332, + "learning_rate": 9.24852914958103e-08, + "logits/chosen": -2.529982328414917, + "logits/rejected": -2.4887478351593018, + "logps/chosen": -280.7078552246094, + "logps/rejected": -339.2616271972656, + "loss": 0.0472, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.7502975463867188, - "rewards/margins": 9.743748664855957, - "rewards/rejected": -12.494046211242676, - "step": 9890 + "rewards/chosen": -2.501162528991699, + "rewards/margins": 10.011792182922363, + "rewards/rejected": -12.512954711914062, + "step": 10390 }, { "epoch": 2.5, - "learning_rate": 9.212620541147833e-08, - "logits/chosen": -2.547600507736206, - "logits/rejected": -2.4481041431427, - "logps/chosen": -252.47866821289062, - "logps/rejected": -333.0774841308594, - "loss": 0.0261, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8022094964981079, - "rewards/margins": 12.065972328186035, - "rewards/rejected": -12.868181228637695, - "step": 9900 + "learning_rate": 9.203957924763772e-08, + "logits/chosen": -2.580894947052002, + "logits/rejected": -2.51870059967041, + "logps/chosen": -345.10369873046875, + "logps/rejected": -428.8946228027344, + "loss": 0.0423, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.1254000663757324, + "rewards/margins": 12.60618782043457, + "rewards/rejected": -14.731587409973145, + "step": 10400 + }, + { + "epoch": 2.5, + "eval_logits/chosen": -2.362020254135132, + "eval_logits/rejected": -2.3034491539001465, + "eval_logps/chosen": -283.4878845214844, + "eval_logps/rejected": -316.9453430175781, + "eval_loss": 0.6267058849334717, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -8.752685546875, + "eval_rewards/margins": 4.502540111541748, + "eval_rewards/rejected": -13.255226135253906, + "eval_runtime": 132.3141, + "eval_samples_per_second": 23.852, + "eval_steps_per_second": 0.378, + "step": 10400 }, { "epoch": 2.51, - "learning_rate": 9.165808444902163e-08, - "logits/chosen": -2.5878243446350098, - "logits/rejected": -2.607832908630371, - "logps/chosen": -299.35406494140625, - "logps/rejected": -416.83135986328125, - "loss": 0.0455, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.320135474205017, - "rewards/margins": 12.124985694885254, - "rewards/rejected": -13.445119857788086, - "step": 9910 + "learning_rate": 9.159386699946514e-08, + "logits/chosen": -2.5030887126922607, + "logits/rejected": -2.4565987586975098, + "logps/chosen": -294.126220703125, + "logps/rejected": -344.9376220703125, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8797158002853394, + "rewards/margins": 11.618200302124023, + "rewards/rejected": -12.497915267944336, + "step": 10410 }, { "epoch": 2.51, - "learning_rate": 9.118996348656492e-08, - "logits/chosen": -2.421605110168457, - "logits/rejected": -2.3391876220703125, - "logps/chosen": -212.12454223632812, - "logps/rejected": -311.01495361328125, - "loss": 0.0068, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.06767783313989639, - "rewards/margins": 10.119757652282715, - "rewards/rejected": -10.187434196472168, - "step": 9920 + "learning_rate": 9.114815475129255e-08, + "logits/chosen": -2.5555949211120605, + "logits/rejected": -2.477242946624756, + "logps/chosen": -234.8366241455078, + "logps/rejected": -339.01019287109375, + "loss": 0.0237, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.5718536376953125, + "rewards/margins": 10.904671669006348, + "rewards/rejected": -14.476526260375977, + "step": 10420 }, { "epoch": 2.51, - "learning_rate": 9.072184252410822e-08, - "logits/chosen": -2.3627657890319824, - "logits/rejected": -2.311931848526001, - "logps/chosen": -230.1654510498047, - "logps/rejected": -427.72674560546875, - "loss": 0.024, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9032829403877258, - "rewards/margins": 11.931960105895996, - "rewards/rejected": -12.835243225097656, - "step": 9930 + "learning_rate": 9.070244250311998e-08, + "logits/chosen": -2.433098316192627, + "logits/rejected": -2.5825276374816895, + "logps/chosen": -353.30621337890625, + "logps/rejected": -500.61285400390625, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.441417932510376, + "rewards/margins": 13.522689819335938, + "rewards/rejected": -14.96410846710205, + "step": 10430 }, { "epoch": 2.51, - "learning_rate": 9.025372156165152e-08, - "logits/chosen": -2.4075100421905518, - "logits/rejected": -2.3419651985168457, - "logps/chosen": -276.5333251953125, - "logps/rejected": -372.3677673339844, - "loss": 0.0305, + "learning_rate": 9.02567302549474e-08, + "logits/chosen": -2.4559600353240967, + "logits/rejected": -2.434187173843384, + "logps/chosen": -308.4311828613281, + "logps/rejected": -331.27490234375, + "loss": 0.0322, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.7151522636413574, - "rewards/margins": 10.86931324005127, - "rewards/rejected": -13.584465026855469, - "step": 9940 + "rewards/chosen": -3.0938048362731934, + "rewards/margins": 8.832279205322266, + "rewards/rejected": -11.9260835647583, + "step": 10440 }, { "epoch": 2.52, - "learning_rate": 8.978560059919482e-08, - "logits/chosen": -2.3619284629821777, - "logits/rejected": -2.4651925563812256, - "logps/chosen": -204.21096801757812, - "logps/rejected": -323.39202880859375, - "loss": 0.0195, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.7284538745880127, - "rewards/margins": 8.96215534210205, - "rewards/rejected": -10.6906099319458, - "step": 9950 + "learning_rate": 8.981101800677482e-08, + "logits/chosen": -2.6544010639190674, + "logits/rejected": -2.553966999053955, + "logps/chosen": -292.5162658691406, + "logps/rejected": -420.3251037597656, + "loss": 0.0336, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.859484076499939, + "rewards/margins": 13.371658325195312, + "rewards/rejected": -14.231144905090332, + "step": 10450 }, { "epoch": 2.52, - "learning_rate": 8.931747963673814e-08, - "logits/chosen": -2.414381504058838, - "logits/rejected": -2.4175143241882324, - "logps/chosen": -192.42576599121094, - "logps/rejected": -274.3816223144531, - "loss": 0.0139, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.4563064575195312, - "rewards/margins": 8.960624694824219, - "rewards/rejected": -11.416932106018066, - "step": 9960 + "learning_rate": 8.936530575860223e-08, + "logits/chosen": -2.6823782920837402, + "logits/rejected": -2.5964131355285645, + "logps/chosen": -272.8282165527344, + "logps/rejected": -414.7937927246094, + "loss": 0.0294, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.34129709005355835, + "rewards/margins": 13.988594055175781, + "rewards/rejected": -14.3298921585083, + "step": 10460 }, { "epoch": 2.52, - "learning_rate": 8.884935867428143e-08, - "logits/chosen": -2.4522461891174316, - "logits/rejected": -2.6154751777648926, - "logps/chosen": -191.37452697753906, - "logps/rejected": -345.6150817871094, - "loss": 0.0111, + "learning_rate": 8.891959351042966e-08, + "logits/chosen": -2.4881157875061035, + "logits/rejected": -2.4436609745025635, + "logps/chosen": -265.05535888671875, + "logps/rejected": -297.63543701171875, + "loss": 0.0301, "rewards/accuracies": 1.0, - "rewards/chosen": -0.5650936365127563, - "rewards/margins": 10.567245483398438, - "rewards/rejected": -11.13233757019043, - "step": 9970 + "rewards/chosen": -1.0244308710098267, + "rewards/margins": 9.457536697387695, + "rewards/rejected": -10.48196792602539, + "step": 10470 }, { "epoch": 2.52, - "learning_rate": 8.838123771182473e-08, - "logits/chosen": -2.7119133472442627, - "logits/rejected": -2.5880613327026367, - "logps/chosen": -296.63763427734375, - "logps/rejected": -398.0306091308594, - "loss": 0.0317, + "learning_rate": 8.847388126225708e-08, + "logits/chosen": -2.7403132915496826, + "logits/rejected": -2.5968213081359863, + "logps/chosen": -240.14120483398438, + "logps/rejected": -340.9478454589844, + "loss": 0.0495, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.7520133256912231, - "rewards/margins": 12.314008712768555, - "rewards/rejected": -11.561994552612305, - "step": 9980 + "rewards/chosen": -2.190155267715454, + "rewards/margins": 11.323097229003906, + "rewards/rejected": -13.513254165649414, + "step": 10480 + }, + { + "epoch": 2.52, + "learning_rate": 8.80281690140845e-08, + "logits/chosen": -2.538989305496216, + "logits/rejected": -2.508791446685791, + "logps/chosen": -196.5592041015625, + "logps/rejected": -318.3958435058594, + "loss": 0.0453, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3761475086212158, + "rewards/margins": 11.380166053771973, + "rewards/rejected": -12.756312370300293, + "step": 10490 }, { "epoch": 2.53, - "learning_rate": 8.791311674936804e-08, - "logits/chosen": -2.6000936031341553, - "logits/rejected": -2.5926265716552734, - "logps/chosen": -289.90057373046875, - "logps/rejected": -521.1102294921875, - "loss": 0.014, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6040196418762207, - "rewards/margins": 15.795877456665039, - "rewards/rejected": -15.191858291625977, - "step": 9990 + "learning_rate": 8.758245676591194e-08, + "logits/chosen": -2.5338008403778076, + "logits/rejected": -2.581998348236084, + "logps/chosen": -263.45587158203125, + "logps/rejected": -339.2855224609375, + "loss": 0.0329, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.4973080158233643, + "rewards/margins": 11.250231742858887, + "rewards/rejected": -13.747540473937988, + "step": 10500 + }, + { + "epoch": 2.53, + "eval_logits/chosen": -2.3422703742980957, + "eval_logits/rejected": -2.2819228172302246, + "eval_logps/chosen": -285.3151550292969, + "eval_logps/rejected": -319.9424133300781, + "eval_loss": 0.6386201977729797, + "eval_rewards/accuracies": 0.6700000166893005, + "eval_rewards/chosen": -8.935415267944336, + "eval_rewards/margins": 4.619517803192139, + "eval_rewards/rejected": -13.55493450164795, + "eval_runtime": 132.3207, + "eval_samples_per_second": 23.851, + "eval_steps_per_second": 0.378, + "step": 10500 }, { "epoch": 2.53, - "learning_rate": 8.744499578691134e-08, - "logits/chosen": -2.2788772583007812, - "logits/rejected": -2.2455339431762695, - "logps/chosen": -230.094482421875, - "logps/rejected": -294.2077331542969, - "loss": 0.0215, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.3577768802642822, - "rewards/margins": 10.781793594360352, - "rewards/rejected": -12.139569282531738, - "step": 10000 + "learning_rate": 8.713674451773935e-08, + "logits/chosen": -2.560263156890869, + "logits/rejected": -2.4164462089538574, + "logps/chosen": -221.02206420898438, + "logps/rejected": -299.6241760253906, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.742326021194458, + "rewards/margins": 11.723387718200684, + "rewards/rejected": -13.465713500976562, + "step": 10510 }, { "epoch": 2.53, - "learning_rate": 8.697687482445464e-08, - "logits/chosen": -2.37874174118042, - "logits/rejected": -2.263432025909424, - "logps/chosen": -267.250244140625, - "logps/rejected": -287.97381591796875, - "loss": 0.0293, + "learning_rate": 8.669103226956677e-08, + "logits/chosen": -2.604015350341797, + "logits/rejected": -2.453403949737549, + "logps/chosen": -238.9454803466797, + "logps/rejected": -342.53497314453125, + "loss": 0.0386, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.6297651529312134, - "rewards/margins": 10.835100173950195, - "rewards/rejected": -12.464864730834961, - "step": 10010 + "rewards/chosen": -3.5726406574249268, + "rewards/margins": 9.781166076660156, + "rewards/rejected": -13.35380744934082, + "step": 10520 }, { "epoch": 2.53, - "learning_rate": 8.650875386199793e-08, - "logits/chosen": -2.319446086883545, - "logits/rejected": -2.2784368991851807, - "logps/chosen": -272.7980041503906, - "logps/rejected": -331.6700744628906, - "loss": 0.0071, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4239766597747803, - "rewards/margins": 12.43993091583252, - "rewards/rejected": -12.01595401763916, - "step": 10020 + "learning_rate": 8.624532002139418e-08, + "logits/chosen": -2.3917243480682373, + "logits/rejected": -2.4111485481262207, + "logps/chosen": -248.33023071289062, + "logps/rejected": -302.805419921875, + "loss": 0.0527, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.320046424865723, + "rewards/margins": 7.962688446044922, + "rewards/rejected": -13.282734870910645, + "step": 10530 }, { "epoch": 2.54, - "learning_rate": 8.604063289954123e-08, - "logits/chosen": -2.3687281608581543, - "logits/rejected": -2.155780792236328, - "logps/chosen": -311.53936767578125, - "logps/rejected": -348.9579162597656, - "loss": 0.0385, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.45428338646888733, - "rewards/margins": 12.450576782226562, - "rewards/rejected": -12.904861450195312, - "step": 10030 + "learning_rate": 8.579960777322161e-08, + "logits/chosen": -2.586074113845825, + "logits/rejected": -2.667752504348755, + "logps/chosen": -246.10354614257812, + "logps/rejected": -350.5034484863281, + "loss": 0.0285, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.0459742546081543, + "rewards/margins": 8.832014083862305, + "rewards/rejected": -11.877988815307617, + "step": 10540 }, { "epoch": 2.54, - "learning_rate": 8.557251193708453e-08, - "logits/chosen": -2.408719539642334, - "logits/rejected": -2.3702263832092285, - "logps/chosen": -272.25091552734375, - "logps/rejected": -429.01751708984375, - "loss": 0.0311, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.8166736364364624, - "rewards/margins": 13.995099067687988, - "rewards/rejected": -15.811772346496582, - "step": 10040 + "learning_rate": 8.535389552504903e-08, + "logits/chosen": -2.5090839862823486, + "logits/rejected": -2.224073886871338, + "logps/chosen": -230.83071899414062, + "logps/rejected": -319.4825134277344, + "loss": 0.0536, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.72802209854126, + "rewards/margins": 9.805669784545898, + "rewards/rejected": -15.533693313598633, + "step": 10550 }, { "epoch": 2.54, - "learning_rate": 8.510439097462784e-08, - "logits/chosen": -2.5389504432678223, - "logits/rejected": -2.3398921489715576, - "logps/chosen": -295.73358154296875, - "logps/rejected": -410.68170166015625, - "loss": 0.0562, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.004246234893799, - "rewards/margins": 13.201797485351562, - "rewards/rejected": -15.20604419708252, - "step": 10050 + "learning_rate": 8.490818327687645e-08, + "logits/chosen": -2.566270351409912, + "logits/rejected": -2.310544729232788, + "logps/chosen": -282.0804443359375, + "logps/rejected": -419.6634826660156, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.303030490875244, + "rewards/margins": 14.08696174621582, + "rewards/rejected": -16.389989852905273, + "step": 10560 }, { "epoch": 2.54, - "learning_rate": 8.463627001217114e-08, - "logits/chosen": -2.0855343341827393, - "logits/rejected": -2.0940823554992676, - "logps/chosen": -206.9429473876953, - "logps/rejected": -316.58355712890625, - "loss": 0.0202, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.951176643371582, - "rewards/margins": 8.924515724182129, - "rewards/rejected": -11.875692367553711, - "step": 10060 + "learning_rate": 8.446247102870386e-08, + "logits/chosen": -2.5808005332946777, + "logits/rejected": -2.393312692642212, + "logps/chosen": -239.3979949951172, + "logps/rejected": -350.3460998535156, + "loss": 0.0454, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7821457386016846, + "rewards/margins": 11.313864707946777, + "rewards/rejected": -15.0960111618042, + "step": 10570 }, { "epoch": 2.55, - "learning_rate": 8.416814904971444e-08, - "logits/chosen": -2.6794869899749756, - "logits/rejected": -2.4969735145568848, - "logps/chosen": -294.0050964355469, - "logps/rejected": -331.2002258300781, - "loss": 0.0099, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5639679431915283, - "rewards/margins": 9.706094741821289, - "rewards/rejected": -11.270063400268555, - "step": 10070 + "learning_rate": 8.401675878053129e-08, + "logits/chosen": -2.669401168823242, + "logits/rejected": -2.684598445892334, + "logps/chosen": -342.88421630859375, + "logps/rejected": -435.95294189453125, + "loss": 0.0336, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.739382743835449, + "rewards/margins": 11.050076484680176, + "rewards/rejected": -16.789459228515625, + "step": 10580 }, { "epoch": 2.55, - "learning_rate": 8.370002808725774e-08, - "logits/chosen": -2.5366883277893066, - "logits/rejected": -2.519639492034912, - "logps/chosen": -227.18185424804688, - "logps/rejected": -352.42352294921875, - "loss": 0.0097, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.13427084684371948, - "rewards/margins": 14.200691223144531, - "rewards/rejected": -14.334962844848633, - "step": 10080 + "learning_rate": 8.357104653235871e-08, + "logits/chosen": -2.601301670074463, + "logits/rejected": -2.4707229137420654, + "logps/chosen": -301.48480224609375, + "logps/rejected": -380.2607116699219, + "loss": 0.0375, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9680715799331665, + "rewards/margins": 10.451790809631348, + "rewards/rejected": -12.419861793518066, + "step": 10590 }, { "epoch": 2.55, - "learning_rate": 8.323190712480105e-08, - "logits/chosen": -2.360050916671753, - "logits/rejected": -2.469874858856201, - "logps/chosen": -275.49981689453125, - "logps/rejected": -420.4410705566406, - "loss": 0.0234, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.37065351009368896, - "rewards/margins": 13.31440258026123, - "rewards/rejected": -13.68505573272705, - "step": 10090 + "learning_rate": 8.312533428418612e-08, + "logits/chosen": -2.5119433403015137, + "logits/rejected": -2.5108985900878906, + "logps/chosen": -266.65106201171875, + "logps/rejected": -327.990966796875, + "loss": 0.039, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.4838366508483887, + "rewards/margins": 9.881607055664062, + "rewards/rejected": -13.365443229675293, + "step": 10600 }, { "epoch": 2.55, - "learning_rate": 8.276378616234435e-08, - "logits/chosen": -2.5125153064727783, - "logits/rejected": -2.4173789024353027, - "logps/chosen": -170.84732055664062, - "logps/rejected": -282.93170166015625, - "loss": 0.0164, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.286710262298584, - "rewards/margins": 9.803840637207031, - "rewards/rejected": -12.090551376342773, - "step": 10100 + "eval_logits/chosen": -2.3527743816375732, + "eval_logits/rejected": -2.292367696762085, + "eval_logps/chosen": -279.51031494140625, + "eval_logps/rejected": -313.2565612792969, + "eval_loss": 0.6330491900444031, + "eval_rewards/accuracies": 0.6775000095367432, + "eval_rewards/chosen": -8.354928970336914, + "eval_rewards/margins": 4.531416416168213, + "eval_rewards/rejected": -12.886346817016602, + "eval_runtime": 132.2523, + "eval_samples_per_second": 23.863, + "eval_steps_per_second": 0.378, + "step": 10600 }, { - "epoch": 2.56, - "learning_rate": 8.229566519988765e-08, - "logits/chosen": -2.3780689239501953, - "logits/rejected": -2.3163952827453613, - "logps/chosen": -241.56271362304688, - "logps/rejected": -270.06591796875, - "loss": 0.0285, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.4059072732925415, - "rewards/margins": 10.28950309753418, - "rewards/rejected": -11.695409774780273, - "step": 10110 + "epoch": 2.55, + "learning_rate": 8.267962203601354e-08, + "logits/chosen": -2.4786009788513184, + "logits/rejected": -2.494345188140869, + "logps/chosen": -268.06549072265625, + "logps/rejected": -350.462158203125, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4662370681762695, + "rewards/margins": 10.99804401397705, + "rewards/rejected": -12.46428108215332, + "step": 10610 }, { "epoch": 2.56, - "learning_rate": 8.182754423743094e-08, - "logits/chosen": -2.333916187286377, - "logits/rejected": -2.1758086681365967, - "logps/chosen": -327.21319580078125, - "logps/rejected": -247.1071319580078, - "loss": 0.0103, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.4194493293762207, - "rewards/margins": 8.837392807006836, - "rewards/rejected": -12.256841659545898, - "step": 10120 + "learning_rate": 8.223390978784097e-08, + "logits/chosen": -2.5598702430725098, + "logits/rejected": -2.636221408843994, + "logps/chosen": -179.05477905273438, + "logps/rejected": -348.11322021484375, + "loss": 0.0288, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.151271343231201, + "rewards/margins": 10.681897163391113, + "rewards/rejected": -12.833168029785156, + "step": 10620 }, { "epoch": 2.56, - "learning_rate": 8.135942327497424e-08, - "logits/chosen": -2.3765933513641357, - "logits/rejected": -2.1695456504821777, - "logps/chosen": -347.8133850097656, - "logps/rejected": -362.703369140625, - "loss": 0.0155, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.11873920261859894, - "rewards/margins": 13.10112190246582, - "rewards/rejected": -12.982383728027344, - "step": 10130 + "learning_rate": 8.178819753966839e-08, + "logits/chosen": -2.66211199760437, + "logits/rejected": -2.514752149581909, + "logps/chosen": -365.4998779296875, + "logps/rejected": -425.56280517578125, + "loss": 0.0283, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7573599815368652, + "rewards/margins": 12.800786018371582, + "rewards/rejected": -13.558145523071289, + "step": 10630 }, { "epoch": 2.56, - "learning_rate": 8.089130231251755e-08, - "logits/chosen": -2.4001736640930176, - "logits/rejected": -2.3628358840942383, - "logps/chosen": -304.4840393066406, - "logps/rejected": -364.996337890625, - "loss": 0.0232, + "learning_rate": 8.13424852914958e-08, + "logits/chosen": -2.5688462257385254, + "logits/rejected": -2.568859338760376, + "logps/chosen": -268.8397521972656, + "logps/rejected": -360.98162841796875, + "loss": 0.0234, "rewards/accuracies": 1.0, - "rewards/chosen": 0.12035231292247772, - "rewards/margins": 13.0234375, - "rewards/rejected": -12.903085708618164, - "step": 10140 + "rewards/chosen": -2.270838737487793, + "rewards/margins": 11.449193954467773, + "rewards/rejected": -13.72003173828125, + "step": 10640 }, { - "epoch": 2.57, - "learning_rate": 8.042318135006085e-08, - "logits/chosen": -2.6083054542541504, - "logits/rejected": -2.470574378967285, - "logps/chosen": -307.7094421386719, - "logps/rejected": -354.67230224609375, - "loss": 0.0112, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.18828299641609192, - "rewards/margins": 12.188252449035645, - "rewards/rejected": -11.999969482421875, - "step": 10150 + "epoch": 2.56, + "learning_rate": 8.089677304332322e-08, + "logits/chosen": -2.402529239654541, + "logits/rejected": -2.3983139991760254, + "logps/chosen": -284.66009521484375, + "logps/rejected": -356.12322998046875, + "loss": 0.0385, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.005434179212898016, + "rewards/margins": 12.113489151000977, + "rewards/rejected": -12.108055114746094, + "step": 10650 }, { "epoch": 2.57, - "learning_rate": 7.995506038760415e-08, - "logits/chosen": -2.5575153827667236, - "logits/rejected": -2.4259867668151855, - "logps/chosen": -345.7724914550781, - "logps/rejected": -446.3836975097656, - "loss": 0.0333, + "learning_rate": 8.045106079515065e-08, + "logits/chosen": -2.362799644470215, + "logits/rejected": -2.3354263305664062, + "logps/chosen": -298.5352783203125, + "logps/rejected": -426.84637451171875, + "loss": 0.0336, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.486109495162964, - "rewards/margins": 12.340263366699219, - "rewards/rejected": -14.826370239257812, - "step": 10160 + "rewards/chosen": -3.16329288482666, + "rewards/margins": 13.24603271484375, + "rewards/rejected": -16.409324645996094, + "step": 10660 }, { "epoch": 2.57, - "learning_rate": 7.948693942514744e-08, - "logits/chosen": -2.618077039718628, - "logits/rejected": -2.6674914360046387, - "logps/chosen": -302.4067077636719, - "logps/rejected": -483.14501953125, - "loss": 0.0104, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.2378050833940506, - "rewards/margins": 14.155160903930664, - "rewards/rejected": -13.917353630065918, - "step": 10170 + "learning_rate": 8.000534854697808e-08, + "logits/chosen": -2.5995945930480957, + "logits/rejected": -2.4467437267303467, + "logps/chosen": -249.5279083251953, + "logps/rejected": -370.5962829589844, + "loss": 0.0407, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.550787448883057, + "rewards/margins": 10.217794418334961, + "rewards/rejected": -14.768580436706543, + "step": 10670 }, { "epoch": 2.57, - "learning_rate": 7.901881846269076e-08, - "logits/chosen": -2.5556085109710693, - "logits/rejected": -2.5346038341522217, - "logps/chosen": -276.97344970703125, - "logps/rejected": -341.01983642578125, - "loss": 0.0275, + "learning_rate": 7.955963629880549e-08, + "logits/chosen": -2.711503505706787, + "logits/rejected": -2.570249319076538, + "logps/chosen": -282.08880615234375, + "logps/rejected": -435.83074951171875, + "loss": 0.0268, "rewards/accuracies": 1.0, - "rewards/chosen": -0.3403923511505127, - "rewards/margins": 10.849390029907227, - "rewards/rejected": -11.189783096313477, - "step": 10180 + "rewards/chosen": -1.6130192279815674, + "rewards/margins": 14.018228530883789, + "rewards/rejected": -15.631248474121094, + "step": 10680 + }, + { + "epoch": 2.57, + "learning_rate": 7.911392405063291e-08, + "logits/chosen": -2.581488847732544, + "logits/rejected": -2.5314581394195557, + "logps/chosen": -214.5835723876953, + "logps/rejected": -292.8533630371094, + "loss": 0.0331, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6624927520751953, + "rewards/margins": 10.166467666625977, + "rewards/rejected": -11.828961372375488, + "step": 10690 }, { "epoch": 2.58, - "learning_rate": 7.855069750023406e-08, - "logits/chosen": -2.4308063983917236, - "logits/rejected": -2.3705153465270996, - "logps/chosen": -357.98162841796875, - "logps/rejected": -456.73394775390625, - "loss": 0.0252, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.5117690563201904, - "rewards/margins": 11.085670471191406, - "rewards/rejected": -12.597439765930176, - "step": 10190 + "learning_rate": 7.866821180246034e-08, + "logits/chosen": -2.727112293243408, + "logits/rejected": -2.7361741065979004, + "logps/chosen": -263.22320556640625, + "logps/rejected": -425.9029235839844, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6015348434448242, + "rewards/margins": 11.10651683807373, + "rewards/rejected": -12.708049774169922, + "step": 10700 }, { "epoch": 2.58, - "learning_rate": 7.808257653777736e-08, - "logits/chosen": -2.4045298099517822, - "logits/rejected": -2.304861545562744, - "logps/chosen": -252.59622192382812, - "logps/rejected": -342.4645080566406, - "loss": 0.0088, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.10743260383605957, - "rewards/margins": 14.025423049926758, - "rewards/rejected": -13.917988777160645, - "step": 10200 + "eval_logits/chosen": -2.292858123779297, + "eval_logits/rejected": -2.2319202423095703, + "eval_logps/chosen": -282.7149963378906, + "eval_logps/rejected": -316.1258239746094, + "eval_loss": 0.6336334943771362, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -8.675398826599121, + "eval_rewards/margins": 4.497876167297363, + "eval_rewards/rejected": -13.173274040222168, + "eval_runtime": 132.0139, + "eval_samples_per_second": 23.907, + "eval_steps_per_second": 0.379, + "step": 10700 }, { "epoch": 2.58, - "learning_rate": 7.761445557532067e-08, - "logits/chosen": -2.368105411529541, - "logits/rejected": -2.483786106109619, - "logps/chosen": -233.50930786132812, - "logps/rejected": -337.35333251953125, - "loss": 0.0148, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.8818254470825195, - "rewards/margins": 9.318094253540039, - "rewards/rejected": -11.199919700622559, - "step": 10210 + "learning_rate": 7.822249955428775e-08, + "logits/chosen": -2.6853411197662354, + "logits/rejected": -2.504542350769043, + "logps/chosen": -242.5248260498047, + "logps/rejected": -299.0537414550781, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5234460830688477, + "rewards/margins": 11.240208625793457, + "rewards/rejected": -11.763654708862305, + "step": 10710 }, { "epoch": 2.58, - "learning_rate": 7.714633461286397e-08, - "logits/chosen": -2.4343035221099854, - "logits/rejected": -2.398287057876587, - "logps/chosen": -286.1935119628906, - "logps/rejected": -382.47076416015625, - "loss": 0.0124, + "learning_rate": 7.777678730611517e-08, + "logits/chosen": -2.4415481090545654, + "logits/rejected": -2.4609909057617188, + "logps/chosen": -382.9562683105469, + "logps/rejected": -364.94110107421875, + "loss": 0.04, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2528579235076904, + "rewards/margins": 10.895849227905273, + "rewards/rejected": -14.148707389831543, + "step": 10720 + }, + { + "epoch": 2.58, + "learning_rate": 7.733107505794259e-08, + "logits/chosen": -2.4440081119537354, + "logits/rejected": -2.322683334350586, + "logps/chosen": -225.28317260742188, + "logps/rejected": -373.979248046875, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2343361377716064, + "rewards/margins": 12.532217025756836, + "rewards/rejected": -15.766552925109863, + "step": 10730 + }, + { + "epoch": 2.58, + "learning_rate": 7.688536280977002e-08, + "logits/chosen": -2.419126510620117, + "logits/rejected": -2.443342924118042, + "logps/chosen": -231.28909301757812, + "logps/rejected": -351.28912353515625, + "loss": 0.0232, "rewards/accuracies": 1.0, - "rewards/chosen": -1.5427314043045044, - "rewards/margins": 11.932634353637695, - "rewards/rejected": -13.475366592407227, - "step": 10220 + "rewards/chosen": -2.4508190155029297, + "rewards/margins": 11.187644004821777, + "rewards/rejected": -13.638463973999023, + "step": 10740 }, { "epoch": 2.59, - "learning_rate": 7.667821365040726e-08, - "logits/chosen": -2.5406994819641113, - "logits/rejected": -2.5999538898468018, - "logps/chosen": -417.00640869140625, - "logps/rejected": -448.77294921875, - "loss": 0.0145, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.8344262838363647, - "rewards/margins": 13.575149536132812, - "rewards/rejected": -12.74072265625, - "step": 10230 + "learning_rate": 7.643965056159743e-08, + "logits/chosen": -2.6441986560821533, + "logits/rejected": -2.6302781105041504, + "logps/chosen": -239.594970703125, + "logps/rejected": -364.35589599609375, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1983789205551147, + "rewards/margins": 10.848573684692383, + "rewards/rejected": -12.046953201293945, + "step": 10750 }, { "epoch": 2.59, - "learning_rate": 7.621009268795056e-08, - "logits/chosen": -2.561500072479248, - "logits/rejected": -2.3849411010742188, - "logps/chosen": -283.8385925292969, - "logps/rejected": -375.70819091796875, - "loss": 0.0163, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.6725066900253296, - "rewards/margins": 13.281970024108887, - "rewards/rejected": -14.954477310180664, - "step": 10240 + "learning_rate": 7.599393831342485e-08, + "logits/chosen": -2.4153685569763184, + "logits/rejected": -2.2735838890075684, + "logps/chosen": -257.03204345703125, + "logps/rejected": -391.2391052246094, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.229942560195923, + "rewards/margins": 14.766741752624512, + "rewards/rejected": -16.99668312072754, + "step": 10760 }, { "epoch": 2.59, - "learning_rate": 7.574197172549386e-08, - "logits/chosen": -2.305802822113037, - "logits/rejected": -2.366856813430786, - "logps/chosen": -264.9994201660156, - "logps/rejected": -385.60260009765625, - "loss": 0.0188, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8811006546020508, - "rewards/margins": 12.801080703735352, - "rewards/rejected": -13.682180404663086, - "step": 10250 + "learning_rate": 7.554822606525226e-08, + "logits/chosen": -2.6447205543518066, + "logits/rejected": -2.5658411979675293, + "logps/chosen": -307.20184326171875, + "logps/rejected": -377.345947265625, + "loss": 0.0279, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.941560745239258, + "rewards/margins": 8.738297462463379, + "rewards/rejected": -11.679858207702637, + "step": 10770 }, { "epoch": 2.59, - "learning_rate": 7.527385076303716e-08, - "logits/chosen": -2.336945056915283, - "logits/rejected": -2.31264066696167, - "logps/chosen": -275.8536376953125, - "logps/rejected": -343.1841125488281, - "loss": 0.0135, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.8622077703475952, - "rewards/margins": 11.723356246948242, - "rewards/rejected": -13.585565567016602, - "step": 10260 + "learning_rate": 7.510251381707969e-08, + "logits/chosen": -2.385326385498047, + "logits/rejected": -2.269052505493164, + "logps/chosen": -275.6403503417969, + "logps/rejected": -349.2908630371094, + "loss": 0.0393, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.5092380046844482, + "rewards/margins": 9.348655700683594, + "rewards/rejected": -11.857892990112305, + "step": 10780 }, { "epoch": 2.6, - "learning_rate": 7.480572980058047e-08, - "logits/chosen": -2.5478365421295166, - "logits/rejected": -2.541980266571045, - "logps/chosen": -231.7345733642578, - "logps/rejected": -281.2032470703125, - "loss": 0.0196, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.7049830555915833, - "rewards/margins": 7.908007621765137, - "rewards/rejected": -8.612991333007812, - "step": 10270 + "learning_rate": 7.465680156890711e-08, + "logits/chosen": -2.5282692909240723, + "logits/rejected": -2.566809892654419, + "logps/chosen": -266.95892333984375, + "logps/rejected": -389.5796813964844, + "loss": 0.0349, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1874176263809204, + "rewards/margins": 13.380450248718262, + "rewards/rejected": -14.56786823272705, + "step": 10790 }, { "epoch": 2.6, - "learning_rate": 7.433760883812376e-08, - "logits/chosen": -2.669074296951294, - "logits/rejected": -2.5242648124694824, - "logps/chosen": -355.5440979003906, - "logps/rejected": -615.2041625976562, - "loss": 0.0279, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.465815544128418, - "rewards/margins": 18.569738388061523, - "rewards/rejected": -17.103923797607422, - "step": 10280 + "learning_rate": 7.421108932073453e-08, + "logits/chosen": -2.604846239089966, + "logits/rejected": -2.645174026489258, + "logps/chosen": -247.1967315673828, + "logps/rejected": -444.3323669433594, + "loss": 0.0606, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.2870583534240723, + "rewards/margins": 10.787885665893555, + "rewards/rejected": -14.074945449829102, + "step": 10800 }, { "epoch": 2.6, - "learning_rate": 7.386948787566707e-08, - "logits/chosen": -2.5442676544189453, - "logits/rejected": -2.506998062133789, - "logps/chosen": -302.8976135253906, - "logps/rejected": -450.32257080078125, - "loss": 0.0113, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.4359608590602875, - "rewards/margins": 12.673746109008789, - "rewards/rejected": -13.109707832336426, - "step": 10290 + "eval_logits/chosen": -2.2731094360351562, + "eval_logits/rejected": -2.2115509510040283, + "eval_logps/chosen": -283.1195068359375, + "eval_logps/rejected": -315.2100524902344, + "eval_loss": 0.6299323439598083, + "eval_rewards/accuracies": 0.6700000166893005, + "eval_rewards/chosen": -8.715847969055176, + "eval_rewards/margins": 4.365845680236816, + "eval_rewards/rejected": -13.081694602966309, + "eval_runtime": 132.2817, + "eval_samples_per_second": 23.858, + "eval_steps_per_second": 0.378, + "step": 10800 }, { "epoch": 2.6, - "learning_rate": 7.340136691321037e-08, - "logits/chosen": -2.494982957839966, - "logits/rejected": -2.3586652278900146, - "logps/chosen": -286.8078308105469, - "logps/rejected": -388.41607666015625, - "loss": 0.0072, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.3223403692245483, - "rewards/margins": 12.64293384552002, - "rewards/rejected": -13.9652738571167, - "step": 10300 + "learning_rate": 7.376537707256194e-08, + "logits/chosen": -2.560774564743042, + "logits/rejected": -2.455479145050049, + "logps/chosen": -258.66412353515625, + "logps/rejected": -340.37554931640625, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.270498752593994, + "rewards/margins": 9.799797058105469, + "rewards/rejected": -12.070294380187988, + "step": 10810 + }, + { + "epoch": 2.6, + "learning_rate": 7.331966482438937e-08, + "logits/chosen": -2.460191249847412, + "logits/rejected": -2.395469903945923, + "logps/chosen": -235.09255981445312, + "logps/rejected": -329.55963134765625, + "loss": 0.0279, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.6527023315429688, + "rewards/margins": 11.900350570678711, + "rewards/rejected": -12.55305290222168, + "step": 10820 }, { "epoch": 2.61, - "learning_rate": 7.293324595075368e-08, - "logits/chosen": -2.5453314781188965, - "logits/rejected": -2.445521831512451, - "logps/chosen": -364.0186462402344, - "logps/rejected": -498.9871520996094, - "loss": 0.0272, + "learning_rate": 7.287395257621679e-08, + "logits/chosen": -2.456638813018799, + "logits/rejected": -2.3913378715515137, + "logps/chosen": -334.17242431640625, + "logps/rejected": -331.2437438964844, + "loss": 0.0252, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.6890239715576172, - "rewards/margins": 12.470296859741211, - "rewards/rejected": -13.159318923950195, - "step": 10310 + "rewards/chosen": -3.613585948944092, + "rewards/margins": 9.240872383117676, + "rewards/rejected": -12.854456901550293, + "step": 10830 }, { "epoch": 2.61, - "learning_rate": 7.246512498829698e-08, - "logits/chosen": -2.5487751960754395, - "logits/rejected": -2.3223040103912354, - "logps/chosen": -253.3618927001953, - "logps/rejected": -332.0083312988281, - "loss": 0.0236, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.7357704639434814, - "rewards/margins": 11.522217750549316, - "rewards/rejected": -14.257987976074219, - "step": 10320 + "learning_rate": 7.24282403280442e-08, + "logits/chosen": -2.6633636951446533, + "logits/rejected": -2.66283917427063, + "logps/chosen": -356.80401611328125, + "logps/rejected": -456.3463439941406, + "loss": 0.1325, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.268617630004883, + "rewards/margins": 9.673254013061523, + "rewards/rejected": -13.941873550415039, + "step": 10840 }, { "epoch": 2.61, - "learning_rate": 7.199700402584027e-08, - "logits/chosen": -2.178081512451172, - "logits/rejected": -2.2869818210601807, - "logps/chosen": -262.9320983886719, - "logps/rejected": -408.1930236816406, - "loss": 0.0277, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -3.4747023582458496, - "rewards/margins": 9.987930297851562, - "rewards/rejected": -13.462631225585938, - "step": 10330 + "learning_rate": 7.198252807987163e-08, + "logits/chosen": -2.507540702819824, + "logits/rejected": -2.504884958267212, + "logps/chosen": -203.59963989257812, + "logps/rejected": -348.88958740234375, + "loss": 0.0437, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.5577945709228516, + "rewards/margins": 11.011690139770508, + "rewards/rejected": -14.569483757019043, + "step": 10850 }, { "epoch": 2.61, - "learning_rate": 7.152888306338357e-08, - "logits/chosen": -2.255789041519165, - "logits/rejected": -2.1870675086975098, - "logps/chosen": -212.7251739501953, - "logps/rejected": -505.8531799316406, - "loss": 0.0102, + "learning_rate": 7.153681583169906e-08, + "logits/chosen": -2.5359976291656494, + "logits/rejected": -2.4084975719451904, + "logps/chosen": -206.982421875, + "logps/rejected": -292.72601318359375, + "loss": 0.0316, "rewards/accuracies": 1.0, - "rewards/chosen": -0.9834052324295044, - "rewards/margins": 15.1971435546875, - "rewards/rejected": -16.1805477142334, - "step": 10340 + "rewards/chosen": -2.091850757598877, + "rewards/margins": 10.337549209594727, + "rewards/rejected": -12.429400444030762, + "step": 10860 }, { "epoch": 2.62, - "learning_rate": 7.106076210092687e-08, - "logits/chosen": -2.4865643978118896, - "logits/rejected": -2.325779676437378, - "logps/chosen": -221.14950561523438, - "logps/rejected": -271.050048828125, - "loss": 0.0191, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.4804035425186157, - "rewards/margins": 9.898581504821777, - "rewards/rejected": -11.378985404968262, - "step": 10350 + "learning_rate": 7.109110358352648e-08, + "logits/chosen": -2.590742588043213, + "logits/rejected": -2.578723192214966, + "logps/chosen": -305.46075439453125, + "logps/rejected": -405.740234375, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.302114486694336, + "rewards/margins": 12.8751802444458, + "rewards/rejected": -16.177291870117188, + "step": 10870 }, { "epoch": 2.62, - "learning_rate": 7.059264113847018e-08, - "logits/chosen": -2.522327423095703, - "logits/rejected": -2.4662699699401855, - "logps/chosen": -383.5025329589844, - "logps/rejected": -463.1015625, - "loss": 0.0198, + "learning_rate": 7.06453913353539e-08, + "logits/chosen": -2.572540283203125, + "logits/rejected": -2.5231688022613525, + "logps/chosen": -260.60809326171875, + "logps/rejected": -396.53302001953125, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40392112731933594, + "rewards/margins": 13.815750122070312, + "rewards/rejected": -14.219671249389648, + "step": 10880 + }, + { + "epoch": 2.62, + "learning_rate": 7.019967908718131e-08, + "logits/chosen": -2.49473237991333, + "logits/rejected": -2.380850315093994, + "logps/chosen": -276.9905090332031, + "logps/rejected": -347.0209045410156, + "loss": 0.0186, "rewards/accuracies": 1.0, - "rewards/chosen": -1.0164690017700195, - "rewards/margins": 12.747421264648438, - "rewards/rejected": -13.763890266418457, - "step": 10360 + "rewards/chosen": -2.4326560497283936, + "rewards/margins": 11.627863883972168, + "rewards/rejected": -14.060519218444824, + "step": 10890 }, { "epoch": 2.62, - "learning_rate": 7.012452017601348e-08, - "logits/chosen": -2.5477945804595947, - "logits/rejected": -2.496734619140625, - "logps/chosen": -281.07708740234375, - "logps/rejected": -404.2988586425781, - "loss": 0.0261, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.1302664279937744, - "rewards/margins": 11.660921096801758, - "rewards/rejected": -13.79118537902832, - "step": 10370 + "learning_rate": 6.975396683900874e-08, + "logits/chosen": -2.5959994792938232, + "logits/rejected": -2.6235971450805664, + "logps/chosen": -229.9144287109375, + "logps/rejected": -399.907470703125, + "loss": 0.0293, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2233142852783203, + "rewards/margins": 11.059015274047852, + "rewards/rejected": -14.282330513000488, + "step": 10900 }, { "epoch": 2.62, - "learning_rate": 6.965639921355677e-08, - "logits/chosen": -2.6205244064331055, - "logits/rejected": -2.451359272003174, - "logps/chosen": -301.0621032714844, - "logps/rejected": -327.03765869140625, - "loss": 0.0291, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.4722011089324951, - "rewards/margins": 10.043096542358398, - "rewards/rejected": -11.515296936035156, - "step": 10380 + "eval_logits/chosen": -2.2208776473999023, + "eval_logits/rejected": -2.1572036743164062, + "eval_logps/chosen": -285.0531921386719, + "eval_logps/rejected": -317.3194274902344, + "eval_loss": 0.625907301902771, + "eval_rewards/accuracies": 0.6725000143051147, + "eval_rewards/chosen": -8.909217834472656, + "eval_rewards/margins": 4.383413791656494, + "eval_rewards/rejected": -13.292632102966309, + "eval_runtime": 132.2665, + "eval_samples_per_second": 23.861, + "eval_steps_per_second": 0.378, + "step": 10900 }, { "epoch": 2.63, - "learning_rate": 6.918827825110008e-08, - "logits/chosen": -2.4276673793792725, - "logits/rejected": -2.5138256549835205, - "logps/chosen": -192.591064453125, - "logps/rejected": -331.2646179199219, - "loss": 0.0118, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.2287522852420807, - "rewards/margins": 10.684351921081543, - "rewards/rejected": -10.913103103637695, - "step": 10390 + "learning_rate": 6.930825459083616e-08, + "logits/chosen": -2.6994035243988037, + "logits/rejected": -2.493536949157715, + "logps/chosen": -320.419921875, + "logps/rejected": -303.1698913574219, + "loss": 0.0283, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7820870876312256, + "rewards/margins": 9.871232986450195, + "rewards/rejected": -12.653319358825684, + "step": 10910 }, { "epoch": 2.63, - "learning_rate": 6.872015728864339e-08, - "logits/chosen": -2.5545268058776855, - "logits/rejected": -2.3641724586486816, - "logps/chosen": -262.8212890625, - "logps/rejected": -310.0434265136719, - "loss": 0.0158, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.6818441152572632, - "rewards/margins": 10.600229263305664, - "rewards/rejected": -12.282073020935059, - "step": 10400 + "learning_rate": 6.886254234266357e-08, + "logits/chosen": -2.456472158432007, + "logits/rejected": -2.290017604827881, + "logps/chosen": -294.9557800292969, + "logps/rejected": -299.8589172363281, + "loss": 0.0355, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.400362253189087, + "rewards/margins": 10.303778648376465, + "rewards/rejected": -11.704141616821289, + "step": 10920 }, { "epoch": 2.63, - "learning_rate": 6.825203632618669e-08, - "logits/chosen": -2.5672922134399414, - "logits/rejected": -2.389941453933716, - "logps/chosen": -191.87167358398438, - "logps/rejected": -313.29840087890625, - "loss": 0.014, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.3071268796920776, - "rewards/margins": 10.873403549194336, - "rewards/rejected": -12.18053150177002, - "step": 10410 + "learning_rate": 6.841683009449099e-08, + "logits/chosen": -2.2713801860809326, + "logits/rejected": -2.1981537342071533, + "logps/chosen": -249.8577880859375, + "logps/rejected": -351.7023620605469, + "loss": 0.0192, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.400978088378906, + "rewards/margins": 9.766181945800781, + "rewards/rejected": -14.167160034179688, + "step": 10930 }, { "epoch": 2.63, - "learning_rate": 6.778391536372999e-08, - "logits/chosen": -2.5447535514831543, - "logits/rejected": -2.637056589126587, - "logps/chosen": -276.07684326171875, - "logps/rejected": -379.71685791015625, - "loss": 0.0335, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.1864304542541504, - "rewards/margins": 10.02700424194336, - "rewards/rejected": -11.213434219360352, - "step": 10420 + "learning_rate": 6.797111784631842e-08, + "logits/chosen": -2.391309976577759, + "logits/rejected": -2.4095988273620605, + "logps/chosen": -290.07659912109375, + "logps/rejected": -400.82598876953125, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5592705011367798, + "rewards/margins": 15.656946182250977, + "rewards/rejected": -17.216217041015625, + "step": 10940 }, { "epoch": 2.64, - "learning_rate": 6.731579440127328e-08, - "logits/chosen": -2.62526273727417, - "logits/rejected": -2.5361416339874268, - "logps/chosen": -335.57208251953125, - "logps/rejected": -381.73590087890625, - "loss": 0.0105, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.532575011253357, - "rewards/margins": 11.382027626037598, - "rewards/rejected": -12.914602279663086, - "step": 10430 + "learning_rate": 6.752540559814583e-08, + "logits/chosen": -2.293604612350464, + "logits/rejected": -2.2017250061035156, + "logps/chosen": -223.31436157226562, + "logps/rejected": -304.8475036621094, + "loss": 0.0254, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.7671866416931152, + "rewards/margins": 12.997503280639648, + "rewards/rejected": -15.764691352844238, + "step": 10950 }, { "epoch": 2.64, - "learning_rate": 6.684767343881658e-08, - "logits/chosen": -2.4614133834838867, - "logits/rejected": -2.4959049224853516, - "logps/chosen": -284.344482421875, - "logps/rejected": -406.2197265625, - "loss": 0.0164, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.527146339416504, - "rewards/margins": 12.429636001586914, - "rewards/rejected": -14.95678424835205, - "step": 10440 + "learning_rate": 6.707969334997325e-08, + "logits/chosen": -2.4939675331115723, + "logits/rejected": -2.40246844291687, + "logps/chosen": -265.65313720703125, + "logps/rejected": -362.1214294433594, + "loss": 0.036, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3323516845703125, + "rewards/margins": 11.565495491027832, + "rewards/rejected": -12.897847175598145, + "step": 10960 }, { "epoch": 2.64, - "learning_rate": 6.637955247635989e-08, - "logits/chosen": -2.5398077964782715, - "logits/rejected": -2.6172759532928467, - "logps/chosen": -310.23724365234375, - "logps/rejected": -421.4280700683594, - "loss": 0.0229, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.645061731338501, - "rewards/margins": 11.452204704284668, - "rewards/rejected": -13.097265243530273, - "step": 10450 + "learning_rate": 6.663398110180066e-08, + "logits/chosen": -2.4434008598327637, + "logits/rejected": -2.443864345550537, + "logps/chosen": -199.9407196044922, + "logps/rejected": -323.4927978515625, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.253484010696411, + "rewards/margins": 11.236198425292969, + "rewards/rejected": -14.489680290222168, + "step": 10970 }, { "epoch": 2.64, - "learning_rate": 6.591143151390319e-08, - "logits/chosen": -2.495894193649292, - "logits/rejected": -2.372702121734619, - "logps/chosen": -197.970458984375, - "logps/rejected": -267.6205139160156, - "loss": 0.0129, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.6794233322143555, - "rewards/margins": 7.389477729797363, - "rewards/rejected": -10.068901062011719, - "step": 10460 + "learning_rate": 6.61882688536281e-08, + "logits/chosen": -2.427109956741333, + "logits/rejected": -2.303581476211548, + "logps/chosen": -337.94610595703125, + "logps/rejected": -370.94268798828125, + "loss": 0.0363, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.3679184913635254, + "rewards/margins": 13.027687072753906, + "rewards/rejected": -15.395604133605957, + "step": 10980 }, { "epoch": 2.65, - "learning_rate": 6.544331055144649e-08, - "logits/chosen": -2.656646251678467, - "logits/rejected": -2.4655566215515137, - "logps/chosen": -312.5660705566406, - "logps/rejected": -395.36932373046875, - "loss": 0.0049, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6871867775917053, - "rewards/margins": 12.126971244812012, - "rewards/rejected": -11.439784049987793, - "step": 10470 + "learning_rate": 6.574255660545551e-08, + "logits/chosen": -2.7133941650390625, + "logits/rejected": -2.531919002532959, + "logps/chosen": -392.5488586425781, + "logps/rejected": -340.8443908691406, + "loss": 0.0358, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.8709425926208496, + "rewards/margins": 9.18415355682373, + "rewards/rejected": -12.055096626281738, + "step": 10990 }, { "epoch": 2.65, - "learning_rate": 6.497518958898978e-08, - "logits/chosen": -2.4221789836883545, - "logits/rejected": -2.395259380340576, - "logps/chosen": -283.78118896484375, - "logps/rejected": -339.09893798828125, - "loss": 0.0267, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.4997386932373047, - "rewards/margins": 10.91909408569336, - "rewards/rejected": -11.418832778930664, - "step": 10480 + "learning_rate": 6.529684435728293e-08, + "logits/chosen": -2.572009325027466, + "logits/rejected": -2.3479113578796387, + "logps/chosen": -297.80657958984375, + "logps/rejected": -334.44915771484375, + "loss": 0.0196, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.541450500488281, + "rewards/margins": 9.187665939331055, + "rewards/rejected": -14.729113578796387, + "step": 11000 }, { "epoch": 2.65, - "learning_rate": 6.450706862653308e-08, - "logits/chosen": -2.5740628242492676, - "logits/rejected": -2.4489102363586426, - "logps/chosen": -309.19097900390625, - "logps/rejected": -340.6146545410156, - "loss": 0.0252, + "eval_logits/chosen": -2.2162604331970215, + "eval_logits/rejected": -2.1532843112945557, + "eval_logps/chosen": -287.7436218261719, + "eval_logps/rejected": -320.01043701171875, + "eval_loss": 0.6219382286071777, + "eval_rewards/accuracies": 0.6700000166893005, + "eval_rewards/chosen": -9.178262710571289, + "eval_rewards/margins": 4.3834733963012695, + "eval_rewards/rejected": -13.561734199523926, + "eval_runtime": 132.0293, + "eval_samples_per_second": 23.904, + "eval_steps_per_second": 0.379, + "step": 11000 + }, + { + "epoch": 2.65, + "learning_rate": 6.485113210911034e-08, + "logits/chosen": -2.3823342323303223, + "logits/rejected": -2.2997987270355225, + "logps/chosen": -245.1785125732422, + "logps/rejected": -332.0592041015625, + "loss": 0.0239, "rewards/accuracies": 1.0, - "rewards/chosen": -0.41388407349586487, - "rewards/margins": 11.361598014831543, - "rewards/rejected": -11.775482177734375, - "step": 10490 + "rewards/chosen": -2.745577812194824, + "rewards/margins": 10.425636291503906, + "rewards/rejected": -13.17121410369873, + "step": 11010 }, { "epoch": 2.65, - "learning_rate": 6.40389476640764e-08, - "logits/chosen": -2.3948919773101807, - "logits/rejected": -2.3630425930023193, - "logps/chosen": -253.672607421875, - "logps/rejected": -326.77642822265625, - "loss": 0.0156, + "learning_rate": 6.440541986093779e-08, + "logits/chosen": -2.390684127807617, + "logits/rejected": -2.4325602054595947, + "logps/chosen": -229.12863159179688, + "logps/rejected": -364.1730041503906, + "loss": 0.0199, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.9204368591308594, - "rewards/margins": 9.881596565246582, - "rewards/rejected": -13.802035331726074, - "step": 10500 + "rewards/chosen": -2.2415194511413574, + "rewards/margins": 11.985854148864746, + "rewards/rejected": -14.227374076843262, + "step": 11020 + }, + { + "epoch": 2.65, + "learning_rate": 6.39597076127652e-08, + "logits/chosen": -2.4481539726257324, + "logits/rejected": -2.346932888031006, + "logps/chosen": -251.73776245117188, + "logps/rejected": -350.64288330078125, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9580209255218506, + "rewards/margins": 12.372736930847168, + "rewards/rejected": -14.330757141113281, + "step": 11030 }, { "epoch": 2.66, - "learning_rate": 6.35708267016197e-08, - "logits/chosen": -2.634661912918091, - "logits/rejected": -2.3652424812316895, - "logps/chosen": -318.3630065917969, - "logps/rejected": -486.04620361328125, - "loss": 0.0108, - "rewards/accuracies": 1.0, - "rewards/chosen": 2.4128196239471436, - "rewards/margins": 19.352506637573242, - "rewards/rejected": -16.939685821533203, - "step": 10510 + "learning_rate": 6.351399536459262e-08, + "logits/chosen": -2.6451168060302734, + "logits/rejected": -2.6366941928863525, + "logps/chosen": -290.16204833984375, + "logps/rejected": -376.4173583984375, + "loss": 0.0205, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.347623348236084, + "rewards/margins": 9.883213996887207, + "rewards/rejected": -14.23083782196045, + "step": 11040 }, { "epoch": 2.66, - "learning_rate": 6.3102705739163e-08, - "logits/chosen": -2.475853443145752, - "logits/rejected": -2.535086154937744, - "logps/chosen": -231.3102569580078, - "logps/rejected": -308.42950439453125, - "loss": 0.0158, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7528905868530273, - "rewards/margins": 11.808151245117188, - "rewards/rejected": -13.561042785644531, - "step": 10520 + "learning_rate": 6.306828311642005e-08, + "logits/chosen": -2.5894360542297363, + "logits/rejected": -2.4921929836273193, + "logps/chosen": -256.55108642578125, + "logps/rejected": -337.4603271484375, + "loss": 0.0294, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0477309226989746, + "rewards/margins": 11.119123458862305, + "rewards/rejected": -14.166852951049805, + "step": 11050 }, { "epoch": 2.66, - "learning_rate": 6.26345847767063e-08, - "logits/chosen": -2.5275111198425293, - "logits/rejected": -2.401824474334717, - "logps/chosen": -286.4459533691406, - "logps/rejected": -336.7516784667969, - "loss": 0.0404, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -3.68312406539917, - "rewards/margins": 10.263943672180176, - "rewards/rejected": -13.947067260742188, - "step": 10530 + "learning_rate": 6.262257086824746e-08, + "logits/chosen": -2.4641690254211426, + "logits/rejected": -2.440227746963501, + "logps/chosen": -255.63546752929688, + "logps/rejected": -392.258544921875, + "loss": 0.0297, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3338589668273926, + "rewards/margins": 14.611223220825195, + "rewards/rejected": -16.945083618164062, + "step": 11060 }, { "epoch": 2.66, - "learning_rate": 6.21664638142496e-08, - "logits/chosen": -2.2744011878967285, - "logits/rejected": -2.3522772789001465, - "logps/chosen": -260.71173095703125, - "logps/rejected": -473.5498962402344, - "loss": 0.0144, + "learning_rate": 6.217685862007488e-08, + "logits/chosen": -2.478299856185913, + "logits/rejected": -2.546607255935669, + "logps/chosen": -292.7262268066406, + "logps/rejected": -433.99859619140625, + "loss": 0.0374, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.97296142578125, - "rewards/margins": 15.849021911621094, - "rewards/rejected": -16.821985244750977, - "step": 10540 + "rewards/chosen": -2.87895131111145, + "rewards/margins": 11.304300308227539, + "rewards/rejected": -14.183253288269043, + "step": 11070 }, { "epoch": 2.67, - "learning_rate": 6.16983428517929e-08, - "logits/chosen": -2.4145588874816895, - "logits/rejected": -2.5145926475524902, - "logps/chosen": -253.80517578125, - "logps/rejected": -349.5704040527344, - "loss": 0.0885, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.6969666481018066, - "rewards/margins": 9.819896697998047, - "rewards/rejected": -12.516862869262695, - "step": 10550 + "learning_rate": 6.17311463719023e-08, + "logits/chosen": -2.2551088333129883, + "logits/rejected": -2.2163028717041016, + "logps/chosen": -234.3866424560547, + "logps/rejected": -292.14007568359375, + "loss": 0.0377, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.815051078796387, + "rewards/margins": 8.480108261108398, + "rewards/rejected": -13.295160293579102, + "step": 11080 }, { "epoch": 2.67, - "learning_rate": 6.12302218893362e-08, - "logits/chosen": -2.484348773956299, - "logits/rejected": -2.463733673095703, - "logps/chosen": -348.70855712890625, - "logps/rejected": -386.32208251953125, - "loss": 0.0314, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.07341752201318741, - "rewards/margins": 12.499094009399414, - "rewards/rejected": -12.425674438476562, - "step": 10560 + "learning_rate": 6.128543412372972e-08, + "logits/chosen": -2.4844613075256348, + "logits/rejected": -2.344010829925537, + "logps/chosen": -324.102783203125, + "logps/rejected": -405.00115966796875, + "loss": 0.0212, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9842230081558228, + "rewards/margins": 10.340616226196289, + "rewards/rejected": -12.32483959197998, + "step": 11090 }, { "epoch": 2.67, - "learning_rate": 6.07621009268795e-08, - "logits/chosen": -2.4193224906921387, - "logits/rejected": -2.3769335746765137, - "logps/chosen": -242.24105834960938, - "logps/rejected": -316.8022766113281, - "loss": 0.0094, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.3817594051361084, - "rewards/margins": 10.313627243041992, - "rewards/rejected": -11.695385932922363, - "step": 10570 + "learning_rate": 6.083972187555714e-08, + "logits/chosen": -2.378952980041504, + "logits/rejected": -2.2691707611083984, + "logps/chosen": -217.2647705078125, + "logps/rejected": -268.33837890625, + "loss": 0.0405, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3123905658721924, + "rewards/margins": 10.918035507202148, + "rewards/rejected": -14.230427742004395, + "step": 11100 }, { "epoch": 2.67, - "learning_rate": 6.029397996442281e-08, - "logits/chosen": -2.511824131011963, - "logits/rejected": -2.4816884994506836, - "logps/chosen": -306.25030517578125, - "logps/rejected": -437.5084533691406, - "loss": 0.011, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6340714693069458, - "rewards/margins": 14.742956161499023, - "rewards/rejected": -14.108884811401367, - "step": 10580 + "eval_logits/chosen": -2.201704263687134, + "eval_logits/rejected": -2.137828826904297, + "eval_logps/chosen": -285.8733825683594, + "eval_logps/rejected": -317.4329833984375, + "eval_loss": 0.6208570599555969, + "eval_rewards/accuracies": 0.6700000166893005, + "eval_rewards/chosen": -8.99123764038086, + "eval_rewards/margins": 4.312750339508057, + "eval_rewards/rejected": -13.303988456726074, + "eval_runtime": 132.1661, + "eval_samples_per_second": 23.879, + "eval_steps_per_second": 0.378, + "step": 11100 + }, + { + "epoch": 2.67, + "learning_rate": 6.039400962738456e-08, + "logits/chosen": -2.385667324066162, + "logits/rejected": -2.391662836074829, + "logps/chosen": -460.06781005859375, + "logps/rejected": -389.2989807128906, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.789900779724121, + "rewards/margins": 11.668432235717773, + "rewards/rejected": -14.458333969116211, + "step": 11110 }, { "epoch": 2.68, - "learning_rate": 5.982585900196611e-08, - "logits/chosen": -2.169275999069214, - "logits/rejected": -2.2812275886535645, - "logps/chosen": -256.701416015625, - "logps/rejected": -390.88909912109375, - "loss": 0.0068, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.529537200927734, - "rewards/margins": 11.55712604522705, - "rewards/rejected": -16.0866641998291, - "step": 10590 + "learning_rate": 5.994829737921197e-08, + "logits/chosen": -2.490661382675171, + "logits/rejected": -2.416771411895752, + "logps/chosen": -256.6833801269531, + "logps/rejected": -370.6900939941406, + "loss": 0.0339, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.583309173583984, + "rewards/margins": 10.862531661987305, + "rewards/rejected": -15.445841789245605, + "step": 11120 }, { "epoch": 2.68, - "learning_rate": 5.9357738039509406e-08, - "logits/chosen": -2.545469284057617, - "logits/rejected": -2.611917734146118, - "logps/chosen": -241.7361297607422, - "logps/rejected": -361.86517333984375, - "loss": 0.0256, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.0422825813293457, - "rewards/margins": 10.897112846374512, - "rewards/rejected": -11.939393997192383, - "step": 10600 + "learning_rate": 5.9502585131039395e-08, + "logits/chosen": -2.4973323345184326, + "logits/rejected": -2.233750820159912, + "logps/chosen": -272.9446716308594, + "logps/rejected": -350.979248046875, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4391045570373535, + "rewards/margins": 11.984623908996582, + "rewards/rejected": -15.423727035522461, + "step": 11130 }, { "epoch": 2.68, - "learning_rate": 5.888961707705271e-08, - "logits/chosen": -2.51267409324646, - "logits/rejected": -2.470567226409912, - "logps/chosen": -233.7913055419922, - "logps/rejected": -376.94189453125, - "loss": 0.0156, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.0417325496673584, - "rewards/margins": 12.69041633605957, - "rewards/rejected": -12.732148170471191, - "step": 10610 + "learning_rate": 5.9056872882866825e-08, + "logits/chosen": -2.4382312297821045, + "logits/rejected": -2.423234701156616, + "logps/chosen": -255.75942993164062, + "logps/rejected": -328.5585021972656, + "loss": 0.0417, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.036255836486816, + "rewards/margins": 8.134908676147461, + "rewards/rejected": -14.171163558959961, + "step": 11140 }, { "epoch": 2.68, - "learning_rate": 5.8421496114596004e-08, - "logits/chosen": -2.5030910968780518, - "logits/rejected": -2.6085524559020996, - "logps/chosen": -192.46926879882812, - "logps/rejected": -350.96728515625, - "loss": 0.018, + "learning_rate": 5.861116063469424e-08, + "logits/chosen": -2.3884105682373047, + "logits/rejected": -2.3940882682800293, + "logps/chosen": -214.0800018310547, + "logps/rejected": -318.2631530761719, + "loss": 0.0144, "rewards/accuracies": 1.0, - "rewards/chosen": -0.685454249382019, - "rewards/margins": 10.346867561340332, - "rewards/rejected": -11.03232192993164, - "step": 10620 + "rewards/chosen": -1.377820611000061, + "rewards/margins": 12.905527114868164, + "rewards/rejected": -14.283346176147461, + "step": 11150 }, { "epoch": 2.69, - "learning_rate": 5.7953375152139307e-08, - "logits/chosen": -2.4607090950012207, - "logits/rejected": -2.535057544708252, - "logps/chosen": -340.30914306640625, - "logps/rejected": -384.41802978515625, - "loss": 0.0085, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.945713222026825, - "rewards/margins": 11.22203254699707, - "rewards/rejected": -12.167744636535645, - "step": 10630 + "learning_rate": 5.8165448386521663e-08, + "logits/chosen": -2.463395118713379, + "logits/rejected": -2.3233237266540527, + "logps/chosen": -313.16387939453125, + "logps/rejected": -381.10479736328125, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.902303695678711, + "rewards/margins": 14.464693069458008, + "rewards/rejected": -17.366994857788086, + "step": 11160 }, { "epoch": 2.69, - "learning_rate": 5.7485254189682616e-08, - "logits/chosen": -2.472196578979492, - "logits/rejected": -2.4685492515563965, - "logps/chosen": -281.9466247558594, - "logps/rejected": -399.18682861328125, - "loss": 0.0078, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.288421630859375, - "rewards/margins": 11.785292625427246, - "rewards/rejected": -13.073715209960938, - "step": 10640 + "learning_rate": 5.771973613834908e-08, + "logits/chosen": -2.262241840362549, + "logits/rejected": -2.360203266143799, + "logps/chosen": -264.2096862792969, + "logps/rejected": -478.0140686035156, + "loss": 0.0412, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3736424446105957, + "rewards/margins": 13.551648139953613, + "rewards/rejected": -16.925291061401367, + "step": 11170 }, { "epoch": 2.69, - "learning_rate": 5.701713322722591e-08, - "logits/chosen": -2.378995418548584, - "logits/rejected": -2.4454381465911865, - "logps/chosen": -316.51458740234375, - "logps/rejected": -399.79681396484375, - "loss": 0.0245, + "learning_rate": 5.72740238901765e-08, + "logits/chosen": -2.410750150680542, + "logits/rejected": -2.169556140899658, + "logps/chosen": -297.9613342285156, + "logps/rejected": -356.1993713378906, + "loss": 0.0102, "rewards/accuracies": 1.0, - "rewards/chosen": -2.655643939971924, - "rewards/margins": 12.092453002929688, - "rewards/rejected": -14.748095512390137, - "step": 10650 + "rewards/chosen": -3.014568567276001, + "rewards/margins": 11.262365341186523, + "rewards/rejected": -14.276933670043945, + "step": 11180 }, { "epoch": 2.69, - "learning_rate": 5.6549012264769214e-08, - "logits/chosen": -2.4572110176086426, - "logits/rejected": -2.284625768661499, - "logps/chosen": -237.8368682861328, - "logps/rejected": -315.8360290527344, - "loss": 0.0104, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.2194294929504395, - "rewards/margins": 11.172201156616211, - "rewards/rejected": -12.391630172729492, - "step": 10660 + "learning_rate": 5.682831164200392e-08, + "logits/chosen": -2.681650161743164, + "logits/rejected": -2.485395669937134, + "logps/chosen": -312.2545166015625, + "logps/rejected": -452.75274658203125, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.13000226020813, + "rewards/margins": 15.450372695922852, + "rewards/rejected": -17.58037567138672, + "step": 11190 }, { "epoch": 2.7, - "learning_rate": 5.608089130231251e-08, - "logits/chosen": -2.373015880584717, - "logits/rejected": -2.5167531967163086, - "logps/chosen": -240.7644805908203, - "logps/rejected": -398.7191162109375, - "loss": 0.016, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.427962064743042, - "rewards/margins": 11.068601608276367, - "rewards/rejected": -12.496563911437988, - "step": 10670 + "learning_rate": 5.638259939383134e-08, + "logits/chosen": -2.5486068725585938, + "logits/rejected": -2.4126861095428467, + "logps/chosen": -363.5796203613281, + "logps/rejected": -459.06842041015625, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.07317852973938, + "rewards/margins": 14.759126663208008, + "rewards/rejected": -16.832304000854492, + "step": 11200 }, { "epoch": 2.7, - "learning_rate": 5.561277033985581e-08, - "logits/chosen": -2.5520968437194824, - "logits/rejected": -2.472224235534668, - "logps/chosen": -297.46484375, - "logps/rejected": -420.49755859375, - "loss": 0.0052, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.14769385755062103, - "rewards/margins": 13.067815780639648, - "rewards/rejected": -12.920123100280762, - "step": 10680 + "eval_logits/chosen": -2.1861989498138428, + "eval_logits/rejected": -2.1220171451568604, + "eval_logps/chosen": -294.2787170410156, + "eval_logps/rejected": -327.0770568847656, + "eval_loss": 0.6300050616264343, + "eval_rewards/accuracies": 0.6700000166893005, + "eval_rewards/chosen": -9.831768989562988, + "eval_rewards/margins": 4.4366278648376465, + "eval_rewards/rejected": -14.268396377563477, + "eval_runtime": 132.0909, + "eval_samples_per_second": 23.893, + "eval_steps_per_second": 0.379, + "step": 11200 }, { "epoch": 2.7, - "learning_rate": 5.514464937739912e-08, - "logits/chosen": -2.552321195602417, - "logits/rejected": -2.521683692932129, - "logps/chosen": -324.34600830078125, - "logps/rejected": -350.0135803222656, - "loss": 0.0124, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.8432782888412476, - "rewards/margins": 15.473007202148438, - "rewards/rejected": -13.629728317260742, - "step": 10690 + "learning_rate": 5.593688714565876e-08, + "logits/chosen": -2.55031156539917, + "logits/rejected": -2.4817073345184326, + "logps/chosen": -290.4566955566406, + "logps/rejected": -414.14239501953125, + "loss": 0.0452, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8868212699890137, + "rewards/margins": 12.00037956237793, + "rewards/rejected": -14.887199401855469, + "step": 11210 }, { "epoch": 2.7, - "learning_rate": 5.467652841494242e-08, - "logits/chosen": -2.4410221576690674, - "logits/rejected": -2.261835813522339, - "logps/chosen": -206.8491668701172, - "logps/rejected": -321.8210754394531, - "loss": 0.0236, + "learning_rate": 5.549117489748618e-08, + "logits/chosen": -2.500797986984253, + "logits/rejected": -2.2686524391174316, + "logps/chosen": -423.191162109375, + "logps/rejected": -550.5567626953125, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15317955613136292, + "rewards/margins": 16.616588592529297, + "rewards/rejected": -16.769765853881836, + "step": 11220 + }, + { + "epoch": 2.7, + "learning_rate": 5.50454626493136e-08, + "logits/chosen": -2.3440825939178467, + "logits/rejected": -2.288924217224121, + "logps/chosen": -227.54122924804688, + "logps/rejected": -296.5143737792969, + "loss": 0.0246, "rewards/accuracies": 1.0, - "rewards/chosen": -2.9713339805603027, - "rewards/margins": 10.97213077545166, - "rewards/rejected": -13.943461418151855, - "step": 10700 + "rewards/chosen": -2.2405099868774414, + "rewards/margins": 10.538477897644043, + "rewards/rejected": -12.778987884521484, + "step": 11230 }, { "epoch": 2.71, - "learning_rate": 5.420840745248572e-08, - "logits/chosen": -2.504394054412842, - "logits/rejected": -2.1629796028137207, - "logps/chosen": -270.15753173828125, - "logps/rejected": -349.7294921875, - "loss": 0.0234, + "learning_rate": 5.4599750401141025e-08, + "logits/chosen": -2.452763557434082, + "logits/rejected": -2.2897536754608154, + "logps/chosen": -254.8859100341797, + "logps/rejected": -382.4427795410156, + "loss": 0.0247, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.390135288238525, + "rewards/margins": 10.920894622802734, + "rewards/rejected": -15.311029434204102, + "step": 11240 + }, + { + "epoch": 2.71, + "learning_rate": 5.415403815296844e-08, + "logits/chosen": -2.314424514770508, + "logits/rejected": -2.342744827270508, + "logps/chosen": -281.2926330566406, + "logps/rejected": -362.99212646484375, + "loss": 0.0277, "rewards/accuracies": 1.0, - "rewards/chosen": -1.5076284408569336, - "rewards/margins": 10.655898094177246, - "rewards/rejected": -12.16352653503418, - "step": 10710 + "rewards/chosen": -1.7888396978378296, + "rewards/margins": 13.558688163757324, + "rewards/rejected": -15.347529411315918, + "step": 11250 }, { "epoch": 2.71, - "learning_rate": 5.374028649002902e-08, - "logits/chosen": -2.432774543762207, - "logits/rejected": -2.387944459915161, - "logps/chosen": -270.55169677734375, - "logps/rejected": -308.4688720703125, - "loss": 0.0056, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.218773365020752, - "rewards/margins": 11.117988586425781, - "rewards/rejected": -13.336763381958008, - "step": 10720 + "learning_rate": 5.3708325904795864e-08, + "logits/chosen": -2.4904584884643555, + "logits/rejected": -2.365490436553955, + "logps/chosen": -294.51116943359375, + "logps/rejected": -382.7975769042969, + "loss": 0.0218, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.7524781227111816, + "rewards/margins": 11.13813591003418, + "rewards/rejected": -14.890612602233887, + "step": 11260 }, { "epoch": 2.71, - "learning_rate": 5.327216552757232e-08, - "logits/chosen": -2.429544687271118, - "logits/rejected": -2.3968300819396973, - "logps/chosen": -211.1251983642578, - "logps/rejected": -322.4640808105469, - "loss": 0.0258, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.0799713134765625, - "rewards/margins": 12.16854190826416, - "rewards/rejected": -14.248514175415039, - "step": 10730 + "learning_rate": 5.326261365662328e-08, + "logits/chosen": -2.4797186851501465, + "logits/rejected": -2.3399758338928223, + "logps/chosen": -260.575439453125, + "logps/rejected": -416.10931396484375, + "loss": 0.0344, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0430617332458496, + "rewards/margins": 14.793481826782227, + "rewards/rejected": -16.8365421295166, + "step": 11270 }, { "epoch": 2.71, - "learning_rate": 5.280404456511563e-08, - "logits/chosen": -2.376677989959717, - "logits/rejected": -2.472378969192505, - "logps/chosen": -201.08544921875, - "logps/rejected": -444.22314453125, - "loss": 0.0092, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.6852868795394897, - "rewards/margins": 11.311137199401855, - "rewards/rejected": -12.996424674987793, - "step": 10740 + "learning_rate": 5.28169014084507e-08, + "logits/chosen": -2.6418023109436035, + "logits/rejected": -2.43471097946167, + "logps/chosen": -458.48736572265625, + "logps/rejected": -571.591796875, + "loss": 0.0362, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4694422781467438, + "rewards/margins": 21.418861389160156, + "rewards/rejected": -20.949419021606445, + "step": 11280 }, { "epoch": 2.72, - "learning_rate": 5.2335923602658924e-08, - "logits/chosen": -2.5168099403381348, - "logits/rejected": -2.4923369884490967, - "logps/chosen": -308.19244384765625, - "logps/rejected": -460.25579833984375, - "loss": 0.023, + "learning_rate": 5.237118916027812e-08, + "logits/chosen": -2.4042813777923584, + "logits/rejected": -2.4403462409973145, + "logps/chosen": -244.8955078125, + "logps/rejected": -355.9122314453125, + "loss": 0.0161, "rewards/accuracies": 1.0, - "rewards/chosen": -1.394303560256958, - "rewards/margins": 15.313270568847656, - "rewards/rejected": -16.70757484436035, - "step": 10750 + "rewards/chosen": -2.4056479930877686, + "rewards/margins": 12.673379898071289, + "rewards/rejected": -15.079028129577637, + "step": 11290 }, { "epoch": 2.72, - "learning_rate": 5.186780264020223e-08, - "logits/chosen": -2.423064708709717, - "logits/rejected": -2.377765417098999, - "logps/chosen": -280.87457275390625, - "logps/rejected": -397.24615478515625, - "loss": 0.0355, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.421639919281006, - "rewards/margins": 11.679462432861328, - "rewards/rejected": -15.101102828979492, - "step": 10760 + "learning_rate": 5.192547691210554e-08, + "logits/chosen": -2.5386104583740234, + "logits/rejected": -2.481544017791748, + "logps/chosen": -299.6481628417969, + "logps/rejected": -354.152099609375, + "loss": 0.0307, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.359282493591309, + "rewards/margins": 9.520303726196289, + "rewards/rejected": -14.879585266113281, + "step": 11300 }, { "epoch": 2.72, - "learning_rate": 5.139968167774553e-08, - "logits/chosen": -2.542541742324829, - "logits/rejected": -2.5660011768341064, - "logps/chosen": -311.539306640625, - "logps/rejected": -374.19122314453125, - "loss": 0.0229, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.1220595836639404, - "rewards/margins": 9.83003044128418, - "rewards/rejected": -12.9520902633667, - "step": 10770 + "eval_logits/chosen": -2.1944549083709717, + "eval_logits/rejected": -2.1315855979919434, + "eval_logps/chosen": -292.988037109375, + "eval_logps/rejected": -326.1575622558594, + "eval_loss": 0.635567307472229, + "eval_rewards/accuracies": 0.6700000166893005, + "eval_rewards/chosen": -9.702699661254883, + "eval_rewards/margins": 4.473745822906494, + "eval_rewards/rejected": -14.176446914672852, + "eval_runtime": 132.1934, + "eval_samples_per_second": 23.874, + "eval_steps_per_second": 0.378, + "step": 11300 }, { "epoch": 2.72, - "learning_rate": 5.0931560715288825e-08, - "logits/chosen": -2.3945837020874023, - "logits/rejected": -2.451707601547241, - "logps/chosen": -262.0988464355469, - "logps/rejected": -347.1482238769531, - "loss": 0.0153, + "learning_rate": 5.147976466393296e-08, + "logits/chosen": -2.319854497909546, + "logits/rejected": -2.2335917949676514, + "logps/chosen": -224.5947723388672, + "logps/rejected": -339.7945861816406, + "loss": 0.0248, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.767332553863525, + "rewards/margins": 8.129494667053223, + "rewards/rejected": -14.896825790405273, + "step": 11310 + }, + { + "epoch": 2.72, + "learning_rate": 5.103405241576039e-08, + "logits/chosen": -2.2997264862060547, + "logits/rejected": -2.1724860668182373, + "logps/chosen": -199.0841522216797, + "logps/rejected": -284.02752685546875, + "loss": 0.0471, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.5155863761901855, + "rewards/margins": 11.367011070251465, + "rewards/rejected": -15.882598876953125, + "step": 11320 + }, + { + "epoch": 2.73, + "learning_rate": 5.05883401675878e-08, + "logits/chosen": -2.5776190757751465, + "logits/rejected": -2.463186264038086, + "logps/chosen": -359.964599609375, + "logps/rejected": -505.83135986328125, + "loss": 0.0195, "rewards/accuracies": 1.0, - "rewards/chosen": -2.536090850830078, - "rewards/margins": 11.240296363830566, - "rewards/rejected": -13.776387214660645, - "step": 10780 + "rewards/chosen": -1.4989402294158936, + "rewards/margins": 16.936222076416016, + "rewards/rejected": -18.435163497924805, + "step": 11330 }, { "epoch": 2.73, - "learning_rate": 5.046343975283213e-08, - "logits/chosen": -2.617018222808838, - "logits/rejected": -2.5935912132263184, - "logps/chosen": -279.4792175292969, - "logps/rejected": -379.79638671875, - "loss": 0.015, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.8385303020477295, - "rewards/margins": 9.282624244689941, - "rewards/rejected": -13.12115478515625, - "step": 10790 + "learning_rate": 5.0142627919415226e-08, + "logits/chosen": -2.5486507415771484, + "logits/rejected": -2.3956305980682373, + "logps/chosen": -237.5111541748047, + "logps/rejected": -290.0249328613281, + "loss": 0.0243, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9125919342041016, + "rewards/margins": 11.886371612548828, + "rewards/rejected": -13.79896354675293, + "step": 11340 }, { "epoch": 2.73, - "learning_rate": 4.999531879037543e-08, - "logits/chosen": -2.360745906829834, - "logits/rejected": -2.4422788619995117, - "logps/chosen": -244.77255249023438, - "logps/rejected": -348.60455322265625, - "loss": 0.0138, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.752306342124939, - "rewards/margins": 12.563000679016113, - "rewards/rejected": -14.3153076171875, - "step": 10800 + "learning_rate": 4.969691567124264e-08, + "logits/chosen": -2.4473633766174316, + "logits/rejected": -2.421595573425293, + "logps/chosen": -381.11602783203125, + "logps/rejected": -475.2928161621094, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.312753438949585, + "rewards/margins": 13.693679809570312, + "rewards/rejected": -15.006433486938477, + "step": 11350 }, { "epoch": 2.73, - "learning_rate": 4.952719782791873e-08, - "logits/chosen": -2.506380558013916, - "logits/rejected": -2.4522621631622314, - "logps/chosen": -245.85659790039062, - "logps/rejected": -313.57568359375, - "loss": 0.0214, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.0090579986572266, - "rewards/margins": 8.213888168334961, - "rewards/rejected": -11.222947120666504, - "step": 10810 + "learning_rate": 4.9251203423070065e-08, + "logits/chosen": -2.4241907596588135, + "logits/rejected": -2.341451406478882, + "logps/chosen": -261.43609619140625, + "logps/rejected": -268.0079040527344, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4055161476135254, + "rewards/margins": 9.849943161010742, + "rewards/rejected": -13.255459785461426, + "step": 11360 }, { "epoch": 2.74, - "learning_rate": 4.9059076865462035e-08, - "logits/chosen": -2.6841819286346436, - "logits/rejected": -2.5684549808502197, - "logps/chosen": -318.21685791015625, - "logps/rejected": -471.38177490234375, - "loss": 0.0165, + "learning_rate": 4.880549117489748e-08, + "logits/chosen": -2.6168875694274902, + "logits/rejected": -2.3666248321533203, + "logps/chosen": -360.3699951171875, + "logps/rejected": -351.4325256347656, + "loss": 0.0205, "rewards/accuracies": 1.0, - "rewards/chosen": -1.1088840961456299, - "rewards/margins": 12.368844985961914, - "rewards/rejected": -13.477727890014648, - "step": 10820 + "rewards/chosen": -3.3059535026550293, + "rewards/margins": 11.053776741027832, + "rewards/rejected": -14.35973072052002, + "step": 11370 }, { "epoch": 2.74, - "learning_rate": 4.859095590300533e-08, - "logits/chosen": -2.425753593444824, - "logits/rejected": -2.3956360816955566, - "logps/chosen": -189.13650512695312, - "logps/rejected": -332.4715576171875, - "loss": 0.0065, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3720785677433014, - "rewards/margins": 12.712925910949707, - "rewards/rejected": -13.085004806518555, - "step": 10830 + "learning_rate": 4.8359778926724904e-08, + "logits/chosen": -2.3658604621887207, + "logits/rejected": -2.321112871170044, + "logps/chosen": -288.92864990234375, + "logps/rejected": -374.6175842285156, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9107131958007812, + "rewards/margins": 13.017892837524414, + "rewards/rejected": -15.928606033325195, + "step": 11380 }, { "epoch": 2.74, - "learning_rate": 4.8122834940548634e-08, - "logits/chosen": -2.610264301300049, - "logits/rejected": -2.447481155395508, - "logps/chosen": -342.28509521484375, - "logps/rejected": -377.2023010253906, - "loss": 0.0113, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.163722515106201, - "rewards/margins": 11.125509262084961, - "rewards/rejected": -13.28923225402832, - "step": 10840 + "learning_rate": 4.791406667855232e-08, + "logits/chosen": -2.52622652053833, + "logits/rejected": -2.490788698196411, + "logps/chosen": -324.60223388671875, + "logps/rejected": -403.5930480957031, + "loss": 0.0299, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.902942180633545, + "rewards/margins": 11.955315589904785, + "rewards/rejected": -15.858256340026855, + "step": 11390 }, { "epoch": 2.74, - "learning_rate": 4.765471397809194e-08, - "logits/chosen": -2.4246602058410645, - "logits/rejected": -2.326582670211792, - "logps/chosen": -297.70037841796875, - "logps/rejected": -320.5494079589844, - "loss": 0.0116, + "learning_rate": 4.746835443037975e-08, + "logits/chosen": -2.4016480445861816, + "logits/rejected": -2.228372097015381, + "logps/chosen": -257.6540222167969, + "logps/rejected": -376.7303161621094, + "loss": 0.0242, "rewards/accuracies": 1.0, - "rewards/chosen": 0.0987861379981041, - "rewards/margins": 13.165461540222168, - "rewards/rejected": -13.066675186157227, - "step": 10850 + "rewards/chosen": -2.582202196121216, + "rewards/margins": 14.4175386428833, + "rewards/rejected": -16.999740600585938, + "step": 11400 }, { - "epoch": 2.75, - "learning_rate": 4.718659301563524e-08, - "logits/chosen": -2.4775731563568115, - "logits/rejected": -2.311589002609253, - "logps/chosen": -388.44189453125, - "logps/rejected": -500.1787109375, - "loss": 0.0101, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.110121488571167, - "rewards/margins": 12.782126426696777, - "rewards/rejected": -15.892248153686523, - "step": 10860 + "epoch": 2.74, + "eval_logits/chosen": -2.168041944503784, + "eval_logits/rejected": -2.107154130935669, + "eval_logps/chosen": -294.0464782714844, + "eval_logps/rejected": -326.9674072265625, + "eval_loss": 0.6327061057090759, + "eval_rewards/accuracies": 0.6625000238418579, + "eval_rewards/chosen": -9.808545112609863, + "eval_rewards/margins": 4.448887348175049, + "eval_rewards/rejected": -14.25743293762207, + "eval_runtime": 132.2281, + "eval_samples_per_second": 23.868, + "eval_steps_per_second": 0.378, + "step": 11400 }, { "epoch": 2.75, - "learning_rate": 4.671847205317854e-08, - "logits/chosen": -2.391537666320801, - "logits/rejected": -2.377824306488037, - "logps/chosen": -342.33648681640625, - "logps/rejected": -438.40264892578125, - "loss": 0.0285, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.7127606272697449, - "rewards/margins": 11.989851951599121, - "rewards/rejected": -12.702611923217773, - "step": 10870 + "learning_rate": 4.7022642182207165e-08, + "logits/chosen": -2.578219175338745, + "logits/rejected": -2.581782341003418, + "logps/chosen": -422.07080078125, + "logps/rejected": -463.7003479003906, + "loss": 0.0373, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4582388401031494, + "rewards/margins": 13.07538890838623, + "rewards/rejected": -14.533628463745117, + "step": 11410 }, { "epoch": 2.75, - "learning_rate": 4.625035109072184e-08, - "logits/chosen": -2.449780225753784, - "logits/rejected": -2.3901445865631104, - "logps/chosen": -292.65264892578125, - "logps/rejected": -432.4183044433594, - "loss": 0.0172, + "learning_rate": 4.657692993403459e-08, + "logits/chosen": -2.2454330921173096, + "logits/rejected": -2.206204891204834, + "logps/chosen": -228.2700958251953, + "logps/rejected": -372.06756591796875, + "loss": 0.0252, "rewards/accuracies": 1.0, - "rewards/chosen": -2.945425033569336, - "rewards/margins": 12.054902076721191, - "rewards/rejected": -15.000328063964844, - "step": 10880 + "rewards/chosen": -3.5328586101531982, + "rewards/margins": 12.089221000671387, + "rewards/rejected": -15.622079849243164, + "step": 11420 }, { "epoch": 2.75, - "learning_rate": 4.578223012826514e-08, - "logits/chosen": -2.4154934883117676, - "logits/rejected": -2.4078152179718018, - "logps/chosen": -267.01678466796875, - "logps/rejected": -375.1787109375, - "loss": 0.0268, + "learning_rate": 4.6131217685862004e-08, + "logits/chosen": -2.318665027618408, + "logits/rejected": -2.389437198638916, + "logps/chosen": -224.6763153076172, + "logps/rejected": -352.53729248046875, + "loss": 0.0258, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.3427653312683105, - "rewards/margins": 11.284711837768555, - "rewards/rejected": -14.627476692199707, - "step": 10890 + "rewards/chosen": -3.5910286903381348, + "rewards/margins": 12.367441177368164, + "rewards/rejected": -15.958467483520508, + "step": 11430 + }, + { + "epoch": 2.75, + "learning_rate": 4.5685505437689427e-08, + "logits/chosen": -2.618743658065796, + "logits/rejected": -2.401430606842041, + "logps/chosen": -350.2810974121094, + "logps/rejected": -369.3810729980469, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.773944616317749, + "rewards/margins": 10.302160263061523, + "rewards/rejected": -13.076105117797852, + "step": 11440 }, { "epoch": 2.76, - "learning_rate": 4.531410916580844e-08, - "logits/chosen": -2.5893101692199707, - "logits/rejected": -2.4205315113067627, - "logps/chosen": -273.5794982910156, - "logps/rejected": -325.26898193359375, - "loss": 0.0136, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7947503924369812, - "rewards/margins": 10.781389236450195, - "rewards/rejected": -9.986639976501465, - "step": 10900 + "learning_rate": 4.523979318951684e-08, + "logits/chosen": -2.4543709754943848, + "logits/rejected": -2.285076856613159, + "logps/chosen": -304.74786376953125, + "logps/rejected": -383.43133544921875, + "loss": 0.0343, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9792327880859375, + "rewards/margins": 10.029706001281738, + "rewards/rejected": -14.008938789367676, + "step": 11450 }, { "epoch": 2.76, - "learning_rate": 4.4845988203351745e-08, - "logits/chosen": -2.6362905502319336, - "logits/rejected": -2.4879088401794434, - "logps/chosen": -290.63250732421875, - "logps/rejected": -322.5184326171875, - "loss": 0.0187, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.050183653831482, - "rewards/margins": 9.369373321533203, - "rewards/rejected": -10.419557571411133, - "step": 10910 + "learning_rate": 4.4794080941344265e-08, + "logits/chosen": -2.4451160430908203, + "logits/rejected": -2.3822360038757324, + "logps/chosen": -322.95989990234375, + "logps/rejected": -428.0362243652344, + "loss": 0.0276, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.2576375007629395, + "rewards/margins": 10.91586685180664, + "rewards/rejected": -15.173504829406738, + "step": 11460 }, { "epoch": 2.76, - "learning_rate": 4.437786724089505e-08, - "logits/chosen": -2.4191794395446777, - "logits/rejected": -2.46612548828125, - "logps/chosen": -319.10308837890625, - "logps/rejected": -402.46124267578125, - "loss": 0.0101, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.31841006875038147, - "rewards/margins": 11.323060035705566, - "rewards/rejected": -11.641469955444336, - "step": 10920 + "learning_rate": 4.434836869317168e-08, + "logits/chosen": -2.4058711528778076, + "logits/rejected": -2.3612260818481445, + "logps/chosen": -362.9645690917969, + "logps/rejected": -422.44061279296875, + "loss": 0.034, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.062591552734375, + "rewards/margins": 10.733223915100098, + "rewards/rejected": -15.795814514160156, + "step": 11470 }, { "epoch": 2.76, - "learning_rate": 4.3909746278438344e-08, - "logits/chosen": -2.5394818782806396, - "logits/rejected": -2.576970100402832, - "logps/chosen": -236.96401977539062, - "logps/rejected": -392.5608825683594, - "loss": 0.0169, + "learning_rate": 4.3902656444999104e-08, + "logits/chosen": -2.489445209503174, + "logits/rejected": -2.4200334548950195, + "logps/chosen": -300.0162048339844, + "logps/rejected": -375.11358642578125, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8128858804702759, + "rewards/margins": 12.119314193725586, + "rewards/rejected": -13.932199478149414, + "step": 11480 + }, + { + "epoch": 2.77, + "learning_rate": 4.345694419682653e-08, + "logits/chosen": -2.5976650714874268, + "logits/rejected": -2.4194092750549316, + "logps/chosen": -265.0852355957031, + "logps/rejected": -372.4889221191406, + "loss": 0.0304, "rewards/accuracies": 1.0, - "rewards/chosen": -2.159064769744873, - "rewards/margins": 11.52956771850586, - "rewards/rejected": -13.688632011413574, - "step": 10930 + "rewards/chosen": -1.777126669883728, + "rewards/margins": 12.253846168518066, + "rewards/rejected": -14.030970573425293, + "step": 11490 }, { "epoch": 2.77, - "learning_rate": 4.3441625315981646e-08, - "logits/chosen": -2.5915493965148926, - "logits/rejected": -2.467534303665161, - "logps/chosen": -250.9964599609375, - "logps/rejected": -292.21600341796875, - "loss": 0.0328, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.5387306213378906, - "rewards/margins": 9.883870124816895, - "rewards/rejected": -11.422601699829102, - "step": 10940 + "learning_rate": 4.301123194865395e-08, + "logits/chosen": -2.4793620109558105, + "logits/rejected": -2.42339825630188, + "logps/chosen": -299.32647705078125, + "logps/rejected": -384.76495361328125, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.754706084728241, + "rewards/margins": 12.703972816467285, + "rewards/rejected": -13.45867919921875, + "step": 11500 }, { "epoch": 2.77, - "learning_rate": 4.297350435352495e-08, - "logits/chosen": -2.506401538848877, - "logits/rejected": -2.489722728729248, - "logps/chosen": -351.91064453125, - "logps/rejected": -438.94818115234375, - "loss": 0.0162, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.900360107421875, - "rewards/margins": 14.668928146362305, - "rewards/rejected": -16.569290161132812, - "step": 10950 + "eval_logits/chosen": -2.1882050037384033, + "eval_logits/rejected": -2.1272687911987305, + "eval_logps/chosen": -289.6584777832031, + "eval_logps/rejected": -322.81353759765625, + "eval_loss": 0.6307923793792725, + "eval_rewards/accuracies": 0.6650000214576721, + "eval_rewards/chosen": -9.369746208190918, + "eval_rewards/margins": 4.472295761108398, + "eval_rewards/rejected": -13.84204387664795, + "eval_runtime": 132.0795, + "eval_samples_per_second": 23.895, + "eval_steps_per_second": 0.379, + "step": 11500 }, { "epoch": 2.77, - "learning_rate": 4.250538339106825e-08, - "logits/chosen": -2.2875542640686035, - "logits/rejected": -2.2873737812042236, - "logps/chosen": -345.0179748535156, - "logps/rejected": -385.7926940917969, - "loss": 0.0428, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.6500468254089355, - "rewards/margins": 11.161527633666992, - "rewards/rejected": -14.811576843261719, - "step": 10960 + "learning_rate": 4.2565519700481366e-08, + "logits/chosen": -2.4647128582000732, + "logits/rejected": -2.417418956756592, + "logps/chosen": -241.6924285888672, + "logps/rejected": -435.2928771972656, + "loss": 0.0628, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.0452921390533447, + "rewards/margins": 12.85204792022705, + "rewards/rejected": -14.8973388671875, + "step": 11510 }, { "epoch": 2.77, - "learning_rate": 4.2037262428611554e-08, - "logits/chosen": -2.2688958644866943, - "logits/rejected": -2.2536237239837646, - "logps/chosen": -306.314697265625, - "logps/rejected": -414.4088439941406, - "loss": 0.0198, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5722497701644897, - "rewards/margins": 11.489215850830078, - "rewards/rejected": -13.0614652633667, - "step": 10970 + "learning_rate": 4.211980745230879e-08, + "logits/chosen": -2.409764528274536, + "logits/rejected": -2.4370346069335938, + "logps/chosen": -272.5648498535156, + "logps/rejected": -344.78924560546875, + "loss": 0.0318, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9780356884002686, + "rewards/margins": 12.006660461425781, + "rewards/rejected": -14.984695434570312, + "step": 11520 + }, + { + "epoch": 2.77, + "learning_rate": 4.1674095204136205e-08, + "logits/chosen": -2.5047061443328857, + "logits/rejected": -2.4376213550567627, + "logps/chosen": -273.2869567871094, + "logps/rejected": -384.69293212890625, + "loss": 0.0631, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7339588403701782, + "rewards/margins": 11.270682334899902, + "rewards/rejected": -13.00464153289795, + "step": 11530 }, { "epoch": 2.78, - "learning_rate": 4.156914146615485e-08, - "logits/chosen": -2.456655502319336, - "logits/rejected": -2.4888079166412354, - "logps/chosen": -206.24093627929688, - "logps/rejected": -329.0902404785156, - "loss": 0.007, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.9091620445251465, - "rewards/margins": 9.75031566619873, - "rewards/rejected": -12.659477233886719, - "step": 10980 + "learning_rate": 4.122838295596363e-08, + "logits/chosen": -2.453369617462158, + "logits/rejected": -2.4379312992095947, + "logps/chosen": -221.60073852539062, + "logps/rejected": -381.62921142578125, + "loss": 0.034, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.29557466506958, + "rewards/margins": 12.355402946472168, + "rewards/rejected": -15.650978088378906, + "step": 11540 }, { "epoch": 2.78, - "learning_rate": 4.110102050369815e-08, - "logits/chosen": -2.4782252311706543, - "logits/rejected": -2.437685966491699, - "logps/chosen": -224.4599609375, - "logps/rejected": -357.8743591308594, - "loss": 0.0249, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.37739634513855, - "rewards/margins": 12.426565170288086, - "rewards/rejected": -15.803962707519531, - "step": 10990 + "learning_rate": 4.0782670707791043e-08, + "logits/chosen": -2.3646228313446045, + "logits/rejected": -2.2564029693603516, + "logps/chosen": -251.2555389404297, + "logps/rejected": -406.7834167480469, + "loss": 0.0358, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.391803026199341, + "rewards/margins": 12.854314804077148, + "rewards/rejected": -15.246116638183594, + "step": 11550 }, { "epoch": 2.78, - "learning_rate": 4.0632899541241455e-08, - "logits/chosen": -2.588283061981201, - "logits/rejected": -2.510744094848633, - "logps/chosen": -332.6466369628906, - "logps/rejected": -408.37982177734375, - "loss": 0.0127, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.765860080718994, - "rewards/margins": 11.549457550048828, - "rewards/rejected": -14.315317153930664, - "step": 11000 + "learning_rate": 4.0336958459618466e-08, + "logits/chosen": -2.6218631267547607, + "logits/rejected": -2.465564727783203, + "logps/chosen": -353.37396240234375, + "logps/rejected": -430.240966796875, + "loss": 0.0918, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6435168981552124, + "rewards/margins": 13.525812149047852, + "rewards/rejected": -15.169326782226562, + "step": 11560 }, { "epoch": 2.78, - "learning_rate": 4.016477857878476e-08, - "logits/chosen": -2.4237632751464844, - "logits/rejected": -2.3809571266174316, - "logps/chosen": -360.4668273925781, - "logps/rejected": -399.2850036621094, - "loss": 0.0125, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.020248282700777054, - "rewards/margins": 12.246782302856445, - "rewards/rejected": -12.267030715942383, - "step": 11010 + "learning_rate": 3.989124621144589e-08, + "logits/chosen": -2.489464044570923, + "logits/rejected": -2.4563944339752197, + "logps/chosen": -259.73602294921875, + "logps/rejected": -350.0437927246094, + "loss": 0.0414, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1594955921173096, + "rewards/margins": 12.046110153198242, + "rewards/rejected": -14.205607414245605, + "step": 11570 }, { "epoch": 2.79, - "learning_rate": 3.969665761632806e-08, - "logits/chosen": -2.5303540229797363, - "logits/rejected": -2.4834442138671875, - "logps/chosen": -244.11532592773438, - "logps/rejected": -366.6121520996094, - "loss": 0.0095, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.3335506916046143, - "rewards/margins": 10.373838424682617, - "rewards/rejected": -12.707388877868652, - "step": 11020 + "learning_rate": 3.944553396327331e-08, + "logits/chosen": -2.3765103816986084, + "logits/rejected": -2.348388195037842, + "logps/chosen": -245.48880004882812, + "logps/rejected": -311.7701721191406, + "loss": 0.0548, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.978712558746338, + "rewards/margins": 9.561111450195312, + "rewards/rejected": -15.539823532104492, + "step": 11580 }, { "epoch": 2.79, - "learning_rate": 3.9228536653871356e-08, - "logits/chosen": -2.4338903427124023, - "logits/rejected": -2.4130215644836426, - "logps/chosen": -202.8179168701172, - "logps/rejected": -318.9550476074219, - "loss": 0.0063, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.7410482168197632, - "rewards/margins": 11.074950218200684, - "rewards/rejected": -11.815999031066895, - "step": 11030 + "learning_rate": 3.899982171510073e-08, + "logits/chosen": -2.4896092414855957, + "logits/rejected": -2.5042903423309326, + "logps/chosen": -250.5653839111328, + "logps/rejected": -323.84625244140625, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.221368312835693, + "rewards/margins": 9.85468864440918, + "rewards/rejected": -14.076057434082031, + "step": 11590 }, { "epoch": 2.79, - "learning_rate": 3.876041569141466e-08, - "logits/chosen": -2.6266818046569824, - "logits/rejected": -2.570533275604248, - "logps/chosen": -289.25604248046875, - "logps/rejected": -355.9947814941406, - "loss": 0.0115, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.634669303894043, - "rewards/margins": 10.495134353637695, - "rewards/rejected": -12.129803657531738, - "step": 11040 + "learning_rate": 3.855410946692815e-08, + "logits/chosen": -2.474365711212158, + "logits/rejected": -2.526387929916382, + "logps/chosen": -383.45367431640625, + "logps/rejected": -628.0545043945312, + "loss": 0.0337, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.589602470397949, + "rewards/margins": 14.03071117401123, + "rewards/rejected": -16.620315551757812, + "step": 11600 }, { "epoch": 2.79, - "learning_rate": 3.829229472895796e-08, - "logits/chosen": -2.49495267868042, - "logits/rejected": -2.5380091667175293, - "logps/chosen": -273.33343505859375, - "logps/rejected": -460.66094970703125, - "loss": 0.0109, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.1227961778640747, - "rewards/margins": 16.76388931274414, - "rewards/rejected": -15.641095161437988, - "step": 11050 + "eval_logits/chosen": -2.221506357192993, + "eval_logits/rejected": -2.160006046295166, + "eval_logps/chosen": -288.7711181640625, + "eval_logps/rejected": -322.3100280761719, + "eval_loss": 0.6350419521331787, + "eval_rewards/accuracies": 0.6700000166893005, + "eval_rewards/chosen": -9.281011581420898, + "eval_rewards/margins": 4.510683536529541, + "eval_rewards/rejected": -13.791694641113281, + "eval_runtime": 132.298, + "eval_samples_per_second": 23.855, + "eval_steps_per_second": 0.378, + "step": 11600 }, { - "epoch": 2.8, - "learning_rate": 3.782417376650126e-08, - "logits/chosen": -2.365868330001831, - "logits/rejected": -2.394813060760498, - "logps/chosen": -311.5312194824219, - "logps/rejected": -355.22900390625, - "loss": 0.0101, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.4357383251190186, - "rewards/margins": 9.920340538024902, - "rewards/rejected": -13.3560791015625, - "step": 11060 + "epoch": 2.79, + "learning_rate": 3.8108397218755566e-08, + "logits/chosen": -2.414795398712158, + "logits/rejected": -2.3537979125976562, + "logps/chosen": -263.09429931640625, + "logps/rejected": -358.0596618652344, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2074527740478516, + "rewards/margins": 9.277387619018555, + "rewards/rejected": -11.484840393066406, + "step": 11610 }, { "epoch": 2.8, - "learning_rate": 3.7356052804044567e-08, - "logits/chosen": -2.744568347930908, - "logits/rejected": -2.6600661277770996, - "logps/chosen": -371.8135681152344, - "logps/rejected": -508.0810546875, - "loss": 0.026, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.4500129222869873, - "rewards/margins": 11.732702255249023, - "rewards/rejected": -13.182714462280273, - "step": 11070 + "learning_rate": 3.766268497058299e-08, + "logits/chosen": -2.495978832244873, + "logits/rejected": -2.456198215484619, + "logps/chosen": -357.42193603515625, + "logps/rejected": -405.55780029296875, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0076830387115479, + "rewards/margins": 12.910600662231445, + "rewards/rejected": -13.91828441619873, + "step": 11620 }, { "epoch": 2.8, - "learning_rate": 3.688793184158787e-08, - "logits/chosen": -2.5530102252960205, - "logits/rejected": -2.4516406059265137, - "logps/chosen": -269.38934326171875, - "logps/rejected": -336.32220458984375, - "loss": 0.0244, + "learning_rate": 3.721697272241041e-08, + "logits/chosen": -2.580117702484131, + "logits/rejected": -2.311505079269409, + "logps/chosen": -289.43798828125, + "logps/rejected": -362.5500183105469, + "loss": 0.0205, "rewards/accuracies": 1.0, - "rewards/chosen": -1.0966869592666626, - "rewards/margins": 10.917011260986328, - "rewards/rejected": -12.01369857788086, - "step": 11080 + "rewards/chosen": -1.445685625076294, + "rewards/margins": 13.38715934753418, + "rewards/rejected": -14.832844734191895, + "step": 11630 }, { "epoch": 2.8, - "learning_rate": 3.6419810879131165e-08, - "logits/chosen": -2.6439976692199707, - "logits/rejected": -2.5483946800231934, - "logps/chosen": -311.73126220703125, - "logps/rejected": -396.8368835449219, - "loss": 0.0286, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.8862838745117188, - "rewards/margins": 11.635208129882812, - "rewards/rejected": -14.521492004394531, - "step": 11090 + "learning_rate": 3.677126047423783e-08, + "logits/chosen": -2.606765031814575, + "logits/rejected": -2.6090919971466064, + "logps/chosen": -275.20953369140625, + "logps/rejected": -405.8874816894531, + "loss": 0.0367, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.679659605026245, + "rewards/margins": 10.039040565490723, + "rewards/rejected": -12.718700408935547, + "step": 11640 + }, + { + "epoch": 2.8, + "learning_rate": 3.632554822606525e-08, + "logits/chosen": -2.5372602939605713, + "logits/rejected": -2.552436351776123, + "logps/chosen": -291.2813415527344, + "logps/rejected": -414.78692626953125, + "loss": 0.0625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.434548854827881, + "rewards/margins": 12.363186836242676, + "rewards/rejected": -14.797735214233398, + "step": 11650 }, { "epoch": 2.81, - "learning_rate": 3.595168991667447e-08, - "logits/chosen": -2.478001117706299, - "logits/rejected": -2.47713303565979, - "logps/chosen": -291.5771179199219, - "logps/rejected": -327.6292419433594, - "loss": 0.027, + "learning_rate": 3.5879835977892673e-08, + "logits/chosen": -2.5731091499328613, + "logits/rejected": -2.512305974960327, + "logps/chosen": -293.6867980957031, + "logps/rejected": -501.38250732421875, + "loss": 0.0156, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.647336006164551, - "rewards/margins": 10.578722953796387, - "rewards/rejected": -13.226058959960938, - "step": 11100 + "rewards/chosen": -1.7110779285430908, + "rewards/margins": 13.870793342590332, + "rewards/rejected": -15.581871032714844, + "step": 11660 }, { "epoch": 2.81, - "learning_rate": 3.5483568954217764e-08, - "logits/chosen": -2.380610704421997, - "logits/rejected": -2.281994342803955, - "logps/chosen": -330.2130126953125, - "logps/rejected": -382.11248779296875, - "loss": 0.0167, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.2577191889286041, - "rewards/margins": 13.458898544311523, - "rewards/rejected": -13.716618537902832, - "step": 11110 + "learning_rate": 3.5434123729720096e-08, + "logits/chosen": -2.381565809249878, + "logits/rejected": -2.215604782104492, + "logps/chosen": -291.3077087402344, + "logps/rejected": -447.95220947265625, + "loss": 0.037, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6165575981140137, + "rewards/margins": 12.55855941772461, + "rewards/rejected": -15.175119400024414, + "step": 11670 }, { "epoch": 2.81, - "learning_rate": 3.501544799176107e-08, - "logits/chosen": -2.6637930870056152, - "logits/rejected": -2.6697707176208496, - "logps/chosen": -315.9070739746094, - "logps/rejected": -398.7461242675781, - "loss": 0.0239, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.1222403049468994, - "rewards/margins": 12.726017951965332, - "rewards/rejected": -13.848257064819336, - "step": 11120 + "learning_rate": 3.498841148154751e-08, + "logits/chosen": -2.338855028152466, + "logits/rejected": -2.3192977905273438, + "logps/chosen": -264.0408020019531, + "logps/rejected": -458.1502380371094, + "loss": 0.0473, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.708033084869385, + "rewards/margins": 10.177639961242676, + "rewards/rejected": -16.88567352294922, + "step": 11680 }, { "epoch": 2.81, - "learning_rate": 3.4547327029304375e-08, - "logits/chosen": -2.4287617206573486, - "logits/rejected": -2.3643672466278076, - "logps/chosen": -240.102294921875, - "logps/rejected": -320.3937072753906, - "loss": 0.011, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.3844046592712402, - "rewards/margins": 9.640578269958496, - "rewards/rejected": -13.024984359741211, - "step": 11130 + "learning_rate": 3.4542699233374935e-08, + "logits/chosen": -2.5677645206451416, + "logits/rejected": -2.498251438140869, + "logps/chosen": -363.9451904296875, + "logps/rejected": -439.3435974121094, + "loss": 0.033, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1151933670043945, + "rewards/margins": 11.453530311584473, + "rewards/rejected": -14.568723678588867, + "step": 11690 }, { "epoch": 2.82, - "learning_rate": 3.407920606684767e-08, - "logits/chosen": -2.393781900405884, - "logits/rejected": -2.2155890464782715, - "logps/chosen": -266.0910949707031, - "logps/rejected": -352.65338134765625, - "loss": 0.0098, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.3181581497192383, - "rewards/margins": 13.180933952331543, - "rewards/rejected": -14.499092102050781, - "step": 11140 + "learning_rate": 3.409698698520235e-08, + "logits/chosen": -2.6495633125305176, + "logits/rejected": -2.5539846420288086, + "logps/chosen": -399.671142578125, + "logps/rejected": -453.7972106933594, + "loss": 0.0302, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.049008846282959, + "rewards/margins": 13.715703964233398, + "rewards/rejected": -14.7647123336792, + "step": 11700 + }, + { + "epoch": 2.82, + "eval_logits/chosen": -2.19647216796875, + "eval_logits/rejected": -2.133913993835449, + "eval_logps/chosen": -298.714599609375, + "eval_logps/rejected": -333.9139404296875, + "eval_loss": 0.6450176239013672, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -10.275360107421875, + "eval_rewards/margins": 4.676724910736084, + "eval_rewards/rejected": -14.9520845413208, + "eval_runtime": 132.334, + "eval_samples_per_second": 23.849, + "eval_steps_per_second": 0.378, + "step": 11700 + }, + { + "epoch": 2.82, + "learning_rate": 3.3651274737029774e-08, + "logits/chosen": -2.4400439262390137, + "logits/rejected": -2.4519143104553223, + "logps/chosen": -256.12371826171875, + "logps/rejected": -355.0122985839844, + "loss": 0.0283, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.007424831390381, + "rewards/margins": 10.9447021484375, + "rewards/rejected": -14.952127456665039, + "step": 11710 + }, + { + "epoch": 2.82, + "learning_rate": 3.320556248885719e-08, + "logits/chosen": -2.6294469833374023, + "logits/rejected": -2.5450222492218018, + "logps/chosen": -370.88031005859375, + "logps/rejected": -458.5982360839844, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.074469566345215, + "rewards/margins": 12.876497268676758, + "rewards/rejected": -14.950965881347656, + "step": 11720 }, { "epoch": 2.82, - "learning_rate": 3.3611085104390974e-08, - "logits/chosen": -2.5786240100860596, - "logits/rejected": -2.470693349838257, - "logps/chosen": -399.3895568847656, - "logps/rejected": -467.3301696777344, - "loss": 0.0251, + "learning_rate": 3.275985024068461e-08, + "logits/chosen": -2.6166510581970215, + "logits/rejected": -2.418170928955078, + "logps/chosen": -362.4737243652344, + "logps/rejected": -430.04583740234375, + "loss": 0.0222, "rewards/accuracies": 1.0, - "rewards/chosen": -0.16405799984931946, - "rewards/margins": 13.329595565795898, - "rewards/rejected": -13.49365520477295, - "step": 11150 + "rewards/chosen": -2.231039047241211, + "rewards/margins": 12.742490768432617, + "rewards/rejected": -14.973528861999512, + "step": 11730 }, { - "epoch": 2.82, - "learning_rate": 3.314296414193427e-08, - "logits/chosen": -2.2679240703582764, - "logits/rejected": -2.252685070037842, - "logps/chosen": -187.90406799316406, - "logps/rejected": -356.1705017089844, - "loss": 0.003, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4501420855522156, - "rewards/margins": 13.709848403930664, - "rewards/rejected": -13.25970458984375, - "step": 11160 + "epoch": 2.83, + "learning_rate": 3.2314137992512035e-08, + "logits/chosen": -2.3542892932891846, + "logits/rejected": -2.1669318675994873, + "logps/chosen": -319.84423828125, + "logps/rejected": -377.96417236328125, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.435697555541992, + "rewards/margins": 13.781712532043457, + "rewards/rejected": -17.217411041259766, + "step": 11740 }, { - "epoch": 2.82, - "learning_rate": 3.267484317947757e-08, - "logits/chosen": -2.6549148559570312, - "logits/rejected": -2.5700325965881348, - "logps/chosen": -350.21295166015625, - "logps/rejected": -372.63189697265625, - "loss": 0.0103, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.11017012596130371, - "rewards/margins": 12.352564811706543, - "rewards/rejected": -12.242395401000977, - "step": 11170 + "epoch": 2.83, + "learning_rate": 3.186842574433946e-08, + "logits/chosen": -2.599973678588867, + "logits/rejected": -2.460090160369873, + "logps/chosen": -262.4917907714844, + "logps/rejected": -363.1957092285156, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9405644536018372, + "rewards/margins": 12.90803050994873, + "rewards/rejected": -13.848596572875977, + "step": 11750 }, { "epoch": 2.83, - "learning_rate": 3.220672221702088e-08, - "logits/chosen": -2.405151844024658, - "logits/rejected": -2.482081174850464, - "logps/chosen": -214.7506866455078, - "logps/rejected": -394.0699768066406, - "loss": 0.0192, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.3040611743927, - "rewards/margins": 11.162083625793457, - "rewards/rejected": -14.466143608093262, - "step": 11180 + "learning_rate": 3.1422713496166874e-08, + "logits/chosen": -2.49354887008667, + "logits/rejected": -2.465024471282959, + "logps/chosen": -229.1987762451172, + "logps/rejected": -406.35540771484375, + "loss": 0.0472, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5505380630493164, + "rewards/margins": 14.685193061828613, + "rewards/rejected": -16.235729217529297, + "step": 11760 }, { "epoch": 2.83, - "learning_rate": 3.173860125456418e-08, - "logits/chosen": -2.4875283241271973, - "logits/rejected": -2.5791056156158447, - "logps/chosen": -278.1722412109375, - "logps/rejected": -406.5775451660156, - "loss": 0.0202, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8419070243835449, - "rewards/margins": 12.21147632598877, - "rewards/rejected": -13.053384780883789, - "step": 11190 + "learning_rate": 3.09770012479943e-08, + "logits/chosen": -2.6159446239471436, + "logits/rejected": -2.479465961456299, + "logps/chosen": -319.8511962890625, + "logps/rejected": -575.2542724609375, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04054946452379227, + "rewards/margins": 22.537498474121094, + "rewards/rejected": -22.578044891357422, + "step": 11770 }, { - "epoch": 2.83, - "learning_rate": 3.127048029210748e-08, - "logits/chosen": -2.462008237838745, - "logits/rejected": -2.3970558643341064, - "logps/chosen": -202.25149536132812, - "logps/rejected": -290.50592041015625, - "loss": 0.0419, + "epoch": 2.84, + "learning_rate": 3.053128899982171e-08, + "logits/chosen": -2.416137933731079, + "logits/rejected": -2.2780098915100098, + "logps/chosen": -310.56915283203125, + "logps/rejected": -295.59100341796875, + "loss": 0.043, "rewards/accuracies": 1.0, - "rewards/chosen": -2.1536483764648438, - "rewards/margins": 9.67103385925293, - "rewards/rejected": -11.824682235717773, - "step": 11200 + "rewards/chosen": -3.6455485820770264, + "rewards/margins": 9.938628196716309, + "rewards/rejected": -13.58417797088623, + "step": 11780 }, { - "epoch": 2.83, - "learning_rate": 3.0802359329650776e-08, - "logits/chosen": -2.5316426753997803, - "logits/rejected": -2.4360640048980713, - "logps/chosen": -181.0486602783203, - "logps/rejected": -243.1605682373047, - "loss": 0.0021, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.2299704551696777, - "rewards/margins": 10.221961975097656, - "rewards/rejected": -11.451932907104492, - "step": 11210 + "epoch": 2.84, + "learning_rate": 3.0085576751649136e-08, + "logits/chosen": -2.604844570159912, + "logits/rejected": -2.33046817779541, + "logps/chosen": -427.3414001464844, + "logps/rejected": -408.19769287109375, + "loss": 0.0157, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.988234043121338, + "rewards/margins": 12.347304344177246, + "rewards/rejected": -17.33553695678711, + "step": 11790 }, { "epoch": 2.84, - "learning_rate": 3.0334238367194085e-08, - "logits/chosen": -2.749342441558838, - "logits/rejected": -2.6762137413024902, - "logps/chosen": -331.9028015136719, - "logps/rejected": -490.9769592285156, - "loss": 0.0055, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.8896066546440125, - "rewards/margins": 14.539273262023926, - "rewards/rejected": -13.649667739868164, - "step": 11220 + "learning_rate": 2.963986450347655e-08, + "logits/chosen": -2.666843891143799, + "logits/rejected": -2.4617562294006348, + "logps/chosen": -315.05438232421875, + "logps/rejected": -414.7613830566406, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.952371597290039, + "rewards/margins": 12.860366821289062, + "rewards/rejected": -15.812738418579102, + "step": 11800 }, { "epoch": 2.84, - "learning_rate": 2.986611740473738e-08, - "logits/chosen": -2.4288411140441895, - "logits/rejected": -2.336217164993286, - "logps/chosen": -238.291259765625, - "logps/rejected": -331.3802185058594, - "loss": 0.0064, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.2435986995697021, - "rewards/margins": 10.70294189453125, - "rewards/rejected": -11.946540832519531, - "step": 11230 + "eval_logits/chosen": -2.1674180030822754, + "eval_logits/rejected": -2.104667901992798, + "eval_logps/chosen": -299.696533203125, + "eval_logps/rejected": -335.1365661621094, + "eval_loss": 0.6451202630996704, + "eval_rewards/accuracies": 0.6725000143051147, + "eval_rewards/chosen": -10.373553276062012, + "eval_rewards/margins": 4.70079231262207, + "eval_rewards/rejected": -15.074346542358398, + "eval_runtime": 132.2252, + "eval_samples_per_second": 23.868, + "eval_steps_per_second": 0.378, + "step": 11800 }, { "epoch": 2.84, - "learning_rate": 2.9397996442280687e-08, - "logits/chosen": -2.5155584812164307, - "logits/rejected": -2.2555930614471436, - "logps/chosen": -261.25, - "logps/rejected": -329.39801025390625, - "loss": 0.0203, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.6180357933044434, - "rewards/margins": 10.947762489318848, - "rewards/rejected": -13.56579875946045, - "step": 11240 + "learning_rate": 2.9194152255303974e-08, + "logits/chosen": -2.6123862266540527, + "logits/rejected": -2.3080999851226807, + "logps/chosen": -278.1975402832031, + "logps/rejected": -339.3916320800781, + "loss": 0.032, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.724158525466919, + "rewards/margins": 10.961214065551758, + "rewards/rejected": -13.685373306274414, + "step": 11810 }, { "epoch": 2.84, - "learning_rate": 2.8929875479823986e-08, - "logits/chosen": -2.5518200397491455, - "logits/rejected": -2.547988176345825, - "logps/chosen": -235.08847045898438, - "logps/rejected": -342.1391296386719, - "loss": 0.0255, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9798015356063843, - "rewards/margins": 14.8731050491333, - "rewards/rejected": -13.893304824829102, - "step": 11250 + "learning_rate": 2.8748440007131394e-08, + "logits/chosen": -2.4626944065093994, + "logits/rejected": -2.495983600616455, + "logps/chosen": -241.6798553466797, + "logps/rejected": -368.5050048828125, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.736865758895874, + "rewards/margins": 11.583398818969727, + "rewards/rejected": -14.320263862609863, + "step": 11820 }, { "epoch": 2.85, - "learning_rate": 2.8461754517367285e-08, - "logits/chosen": -2.3784825801849365, - "logits/rejected": -2.3726489543914795, - "logps/chosen": -240.8814239501953, - "logps/rejected": -376.7526550292969, - "loss": 0.0207, + "learning_rate": 2.8302727758958813e-08, + "logits/chosen": -2.470898151397705, + "logits/rejected": -2.24894380569458, + "logps/chosen": -284.7417907714844, + "logps/rejected": -396.5254211425781, + "loss": 0.0272, "rewards/accuracies": 1.0, - "rewards/chosen": 0.18654665350914001, - "rewards/margins": 11.669304847717285, - "rewards/rejected": -11.482756614685059, - "step": 11260 + "rewards/chosen": -2.4196250438690186, + "rewards/margins": 12.243185043334961, + "rewards/rejected": -14.662811279296875, + "step": 11830 }, { "epoch": 2.85, - "learning_rate": 2.7993633554910588e-08, - "logits/chosen": -2.561047077178955, - "logits/rejected": -2.493727207183838, - "logps/chosen": -254.25119018554688, - "logps/rejected": -413.8963317871094, - "loss": 0.0393, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.2032550573349, - "rewards/margins": 15.142125129699707, - "rewards/rejected": -13.938870429992676, - "step": 11270 + "learning_rate": 2.7857015510786233e-08, + "logits/chosen": -2.4472086429595947, + "logits/rejected": -2.48282527923584, + "logps/chosen": -286.9671936035156, + "logps/rejected": -442.0672302246094, + "loss": 0.0349, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.801032304763794, + "rewards/margins": 11.677374839782715, + "rewards/rejected": -14.47840690612793, + "step": 11840 }, { "epoch": 2.85, - "learning_rate": 2.7525512592453887e-08, - "logits/chosen": -2.58831787109375, - "logits/rejected": -2.430523157119751, - "logps/chosen": -289.0402526855469, - "logps/rejected": -370.84027099609375, - "loss": 0.017, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.0527355670928955, - "rewards/margins": 12.028496742248535, - "rewards/rejected": -13.081232070922852, - "step": 11280 + "learning_rate": 2.7411303262613655e-08, + "logits/chosen": -2.1491734981536865, + "logits/rejected": -2.073427438735962, + "logps/chosen": -205.7406463623047, + "logps/rejected": -262.1298828125, + "loss": 0.0333, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.932621002197266, + "rewards/margins": 8.782279968261719, + "rewards/rejected": -13.714900016784668, + "step": 11850 }, { "epoch": 2.85, - "learning_rate": 2.705739162999719e-08, - "logits/chosen": -2.622467279434204, - "logits/rejected": -2.708653211593628, - "logps/chosen": -349.805419921875, - "logps/rejected": -452.0779724121094, - "loss": 0.0013, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3241063952445984, - "rewards/margins": 11.919878005981445, - "rewards/rejected": -12.24398422241211, - "step": 11290 + "learning_rate": 2.6965591014441075e-08, + "logits/chosen": -2.4826626777648926, + "logits/rejected": -2.4393792152404785, + "logps/chosen": -280.3171691894531, + "logps/rejected": -424.0520935058594, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9108145236968994, + "rewards/margins": 11.856431007385254, + "rewards/rejected": -13.767245292663574, + "step": 11860 }, { "epoch": 2.86, - "learning_rate": 2.6589270667540492e-08, - "logits/chosen": -2.526979684829712, - "logits/rejected": -2.485935688018799, - "logps/chosen": -291.6176452636719, - "logps/rejected": -466.741943359375, - "loss": 0.0218, + "learning_rate": 2.6519878766268494e-08, + "logits/chosen": -2.5404858589172363, + "logits/rejected": -2.5333080291748047, + "logps/chosen": -376.43280029296875, + "logps/rejected": -458.4173278808594, + "loss": 0.0229, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.716256856918335, - "rewards/margins": 18.379528045654297, - "rewards/rejected": -20.095787048339844, - "step": 11300 + "rewards/chosen": -2.5944318771362305, + "rewards/margins": 14.112909317016602, + "rewards/rejected": -16.707340240478516, + "step": 11870 }, { "epoch": 2.86, - "learning_rate": 2.6121149705083792e-08, - "logits/chosen": -2.4464025497436523, - "logits/rejected": -2.3842031955718994, - "logps/chosen": -292.5438537597656, - "logps/rejected": -340.6904296875, - "loss": 0.0208, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.9673873782157898, - "rewards/margins": 11.448567390441895, - "rewards/rejected": -12.415953636169434, - "step": 11310 + "learning_rate": 2.6074166518095914e-08, + "logits/chosen": -2.4108517169952393, + "logits/rejected": -2.229393482208252, + "logps/chosen": -309.54583740234375, + "logps/rejected": -414.09100341796875, + "loss": 0.0334, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.199159145355225, + "rewards/margins": 10.307116508483887, + "rewards/rejected": -14.50627613067627, + "step": 11880 }, { "epoch": 2.86, - "learning_rate": 2.5653028742627094e-08, - "logits/chosen": -2.5121302604675293, - "logits/rejected": -2.506096363067627, - "logps/chosen": -249.0, - "logps/rejected": -413.91510009765625, - "loss": 0.0146, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.6290432214736938, - "rewards/margins": 10.821525573730469, - "rewards/rejected": -12.450569152832031, - "step": 11320 + "learning_rate": 2.562845426992334e-08, + "logits/chosen": -2.5318827629089355, + "logits/rejected": -2.585554599761963, + "logps/chosen": -243.3291015625, + "logps/rejected": -393.29376220703125, + "loss": 0.0415, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.028994560241699, + "rewards/margins": 10.366655349731445, + "rewards/rejected": -12.395649909973145, + "step": 11890 }, { "epoch": 2.86, - "learning_rate": 2.5184907780170397e-08, - "logits/chosen": -2.5698561668395996, - "logits/rejected": -2.580622673034668, - "logps/chosen": -253.86483764648438, - "logps/rejected": -340.75372314453125, - "loss": 0.0099, + "learning_rate": 2.518274202175076e-08, + "logits/chosen": -2.29496693611145, + "logits/rejected": -2.265639305114746, + "logps/chosen": -275.6883239746094, + "logps/rejected": -328.33441162109375, + "loss": 0.0153, "rewards/accuracies": 1.0, - "rewards/chosen": -2.206702470779419, - "rewards/margins": 10.514699935913086, - "rewards/rejected": -12.721403121948242, - "step": 11330 + "rewards/chosen": -1.3148354291915894, + "rewards/margins": 13.072629928588867, + "rewards/rejected": -14.387463569641113, + "step": 11900 + }, + { + "epoch": 2.86, + "eval_logits/chosen": -2.172844171524048, + "eval_logits/rejected": -2.1102476119995117, + "eval_logps/chosen": -298.087158203125, + "eval_logps/rejected": -333.5195617675781, + "eval_loss": 0.6419631838798523, + "eval_rewards/accuracies": 0.6700000166893005, + "eval_rewards/chosen": -10.212615013122559, + "eval_rewards/margins": 4.700031280517578, + "eval_rewards/rejected": -14.912646293640137, + "eval_runtime": 132.1347, + "eval_samples_per_second": 23.885, + "eval_steps_per_second": 0.378, + "step": 11900 }, { "epoch": 2.87, - "learning_rate": 2.4716786817713696e-08, - "logits/chosen": -2.641425132751465, - "logits/rejected": -2.6758666038513184, - "logps/chosen": -334.5554504394531, - "logps/rejected": -419.2228088378906, - "loss": 0.025, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.09734637290239334, - "rewards/margins": 13.741844177246094, - "rewards/rejected": -13.839190483093262, - "step": 11340 + "learning_rate": 2.4737029773578178e-08, + "logits/chosen": -2.5085341930389404, + "logits/rejected": -2.565484046936035, + "logps/chosen": -288.0100402832031, + "logps/rejected": -386.5188903808594, + "loss": 0.0297, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.094741106033325, + "rewards/margins": 12.823016166687012, + "rewards/rejected": -14.917757034301758, + "step": 11910 }, { "epoch": 2.87, - "learning_rate": 2.4248665855257e-08, - "logits/chosen": -2.596531391143799, - "logits/rejected": -2.584341526031494, - "logps/chosen": -270.8020935058594, - "logps/rejected": -418.1478576660156, - "loss": 0.0139, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.781860113143921, - "rewards/margins": 10.423555374145508, - "rewards/rejected": -13.205415725708008, - "step": 11350 + "learning_rate": 2.4291317525405598e-08, + "logits/chosen": -2.4081199169158936, + "logits/rejected": -2.3629062175750732, + "logps/chosen": -298.339599609375, + "logps/rejected": -378.9031677246094, + "loss": 0.0242, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.7774710655212402, + "rewards/margins": 11.346016883850098, + "rewards/rejected": -15.12348747253418, + "step": 11920 }, { "epoch": 2.87, - "learning_rate": 2.3780544892800298e-08, - "logits/chosen": -2.4953293800354004, - "logits/rejected": -2.465010166168213, - "logps/chosen": -248.66635131835938, - "logps/rejected": -443.67816162109375, - "loss": 0.0388, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.2700600624084473, - "rewards/margins": 13.418362617492676, - "rewards/rejected": -15.688423156738281, - "step": 11360 + "learning_rate": 2.3845605277233017e-08, + "logits/chosen": -2.59757924079895, + "logits/rejected": -2.408696174621582, + "logps/chosen": -261.7337646484375, + "logps/rejected": -285.12451171875, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2069809436798096, + "rewards/margins": 9.957179069519043, + "rewards/rejected": -13.164159774780273, + "step": 11930 }, { "epoch": 2.87, - "learning_rate": 2.3312423930343597e-08, - "logits/chosen": -2.4786221981048584, - "logits/rejected": -2.429025650024414, - "logps/chosen": -261.0307312011719, - "logps/rejected": -350.3789978027344, - "loss": 0.0191, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.10001619905233383, - "rewards/margins": 14.259042739868164, - "rewards/rejected": -14.359057426452637, - "step": 11370 + "learning_rate": 2.339989302906044e-08, + "logits/chosen": -2.6483657360076904, + "logits/rejected": -2.498955249786377, + "logps/chosen": -278.2874450683594, + "logps/rejected": -327.51995849609375, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2007942199707031, + "rewards/margins": 11.14594554901123, + "rewards/rejected": -12.346739768981934, + "step": 11940 }, { "epoch": 2.88, - "learning_rate": 2.2844302967886903e-08, - "logits/chosen": -2.468181848526001, - "logits/rejected": -2.3002982139587402, - "logps/chosen": -270.9293518066406, - "logps/rejected": -373.20037841796875, - "loss": 0.0146, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.2403790950775146, - "rewards/margins": 12.128232955932617, - "rewards/rejected": -14.368614196777344, - "step": 11380 + "learning_rate": 2.295418078088786e-08, + "logits/chosen": -2.665330410003662, + "logits/rejected": -2.4401326179504395, + "logps/chosen": -364.44989013671875, + "logps/rejected": -394.2289733886719, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3014121055603027, + "rewards/margins": 10.444536209106445, + "rewards/rejected": -13.745948791503906, + "step": 11950 }, { "epoch": 2.88, - "learning_rate": 2.2376182005430202e-08, - "logits/chosen": -2.557896375656128, - "logits/rejected": -2.448179244995117, - "logps/chosen": -329.0253601074219, - "logps/rejected": -477.41351318359375, - "loss": 0.0338, + "learning_rate": 2.250846853271528e-08, + "logits/chosen": -2.2865519523620605, + "logits/rejected": -2.237614154815674, + "logps/chosen": -345.20098876953125, + "logps/rejected": -384.1597900390625, + "loss": 0.0274, "rewards/accuracies": 1.0, - "rewards/chosen": -2.408167600631714, - "rewards/margins": 12.042343139648438, - "rewards/rejected": -14.45051097869873, - "step": 11390 + "rewards/chosen": -1.8927990198135376, + "rewards/margins": 12.456912994384766, + "rewards/rejected": -14.349711418151855, + "step": 11960 }, { "epoch": 2.88, - "learning_rate": 2.19080610429735e-08, - "logits/chosen": -2.376495838165283, - "logits/rejected": -2.2772092819213867, - "logps/chosen": -261.8798828125, - "logps/rejected": -300.6114807128906, - "loss": 0.0145, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.9720168113708496, - "rewards/margins": 9.81820011138916, - "rewards/rejected": -12.790217399597168, - "step": 11400 + "learning_rate": 2.2062756284542698e-08, + "logits/chosen": -2.225372076034546, + "logits/rejected": -2.310943126678467, + "logps/chosen": -397.21484375, + "logps/rejected": -478.82623291015625, + "loss": 0.0366, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1107540130615234, + "rewards/margins": 14.073896408081055, + "rewards/rejected": -17.184650421142578, + "step": 11970 }, { "epoch": 2.88, - "learning_rate": 2.1439940080516804e-08, - "logits/chosen": -2.4556946754455566, - "logits/rejected": -2.4068713188171387, - "logps/chosen": -358.0137634277344, - "logps/rejected": -476.787353515625, - "loss": 0.0021, - "rewards/accuracies": 1.0, - "rewards/chosen": 2.019568920135498, - "rewards/margins": 18.053943634033203, - "rewards/rejected": -16.034372329711914, - "step": 11410 + "learning_rate": 2.161704403637012e-08, + "logits/chosen": -2.6753249168395996, + "logits/rejected": -2.644117832183838, + "logps/chosen": -350.1902770996094, + "logps/rejected": -412.377685546875, + "loss": 0.0249, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8374068140983582, + "rewards/margins": 13.190289497375488, + "rewards/rejected": -14.02769660949707, + "step": 11980 }, { "epoch": 2.89, - "learning_rate": 2.0971819118060107e-08, - "logits/chosen": -2.536691188812256, - "logits/rejected": -2.4123189449310303, - "logps/chosen": -287.26849365234375, - "logps/rejected": -335.825439453125, - "loss": 0.0322, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -4.085879802703857, - "rewards/margins": 10.493057250976562, - "rewards/rejected": -14.578936576843262, - "step": 11420 + "learning_rate": 2.117133178819754e-08, + "logits/chosen": -2.4591317176818848, + "logits/rejected": -2.4213509559631348, + "logps/chosen": -295.6983642578125, + "logps/rejected": -423.86578369140625, + "loss": 0.032, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.302341938018799, + "rewards/margins": 13.132052421569824, + "rewards/rejected": -19.43439292907715, + "step": 11990 }, { "epoch": 2.89, - "learning_rate": 2.050369815560341e-08, - "logits/chosen": -2.3083388805389404, - "logits/rejected": -2.3589773178100586, - "logps/chosen": -225.0576171875, - "logps/rejected": -419.41583251953125, - "loss": 0.0062, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.46240463852882385, - "rewards/margins": 13.146392822265625, - "rewards/rejected": -13.608795166015625, - "step": 11430 + "learning_rate": 2.072561954002496e-08, + "logits/chosen": -2.582056999206543, + "logits/rejected": -2.4382543563842773, + "logps/chosen": -366.23028564453125, + "logps/rejected": -441.44415283203125, + "loss": 0.0388, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6242170333862305, + "rewards/margins": 13.944511413574219, + "rewards/rejected": -16.568729400634766, + "step": 12000 }, { "epoch": 2.89, - "learning_rate": 2.003557719314671e-08, - "logits/chosen": -2.2630105018615723, - "logits/rejected": -2.2091078758239746, - "logps/chosen": -280.4766845703125, - "logps/rejected": -324.72369384765625, - "loss": 0.0695, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.704224109649658, - "rewards/margins": 9.590178489685059, - "rewards/rejected": -12.294401168823242, - "step": 11440 + "eval_logits/chosen": -2.1686856746673584, + "eval_logits/rejected": -2.1058926582336426, + "eval_logps/chosen": -298.0356140136719, + "eval_logps/rejected": -333.47406005859375, + "eval_loss": 0.6407229900360107, + "eval_rewards/accuracies": 0.6725000143051147, + "eval_rewards/chosen": -10.207459449768066, + "eval_rewards/margins": 4.700641632080078, + "eval_rewards/rejected": -14.908100128173828, + "eval_runtime": 132.1265, + "eval_samples_per_second": 23.886, + "eval_steps_per_second": 0.378, + "step": 12000 }, { "epoch": 2.89, - "learning_rate": 1.9567456230690008e-08, - "logits/chosen": -2.5785865783691406, - "logits/rejected": -2.4064249992370605, - "logps/chosen": -270.0608215332031, - "logps/rejected": -344.23199462890625, - "loss": 0.0189, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.2489631175994873, - "rewards/margins": 9.269261360168457, - "rewards/rejected": -11.518223762512207, - "step": 11450 + "learning_rate": 2.027990729185238e-08, + "logits/chosen": -2.417755126953125, + "logits/rejected": -2.4194352626800537, + "logps/chosen": -328.4615173339844, + "logps/rejected": -447.30987548828125, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3127981424331665, + "rewards/margins": 11.52728271484375, + "rewards/rejected": -12.840082168579102, + "step": 12010 + }, + { + "epoch": 2.89, + "learning_rate": 1.9834195043679802e-08, + "logits/chosen": -2.495596170425415, + "logits/rejected": -2.354806661605835, + "logps/chosen": -257.50018310546875, + "logps/rejected": -394.015869140625, + "loss": 0.0407, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.623464107513428, + "rewards/margins": 9.51017951965332, + "rewards/rejected": -15.133644104003906, + "step": 12020 }, { "epoch": 2.9, - "learning_rate": 1.909933526823331e-08, - "logits/chosen": -2.4120113849639893, - "logits/rejected": -2.4648735523223877, - "logps/chosen": -183.39422607421875, - "logps/rejected": -346.17572021484375, - "loss": 0.0335, + "learning_rate": 1.938848279550722e-08, + "logits/chosen": -2.2852301597595215, + "logits/rejected": -2.252769947052002, + "logps/chosen": -237.9971160888672, + "logps/rejected": -308.3721618652344, + "loss": 0.0248, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.9732882976531982, - "rewards/margins": 12.807714462280273, - "rewards/rejected": -14.78100299835205, - "step": 11460 + "rewards/chosen": -2.427917957305908, + "rewards/margins": 11.311907768249512, + "rewards/rejected": -13.739825248718262, + "step": 12030 }, { "epoch": 2.9, - "learning_rate": 1.8631214305776613e-08, - "logits/chosen": -2.3059537410736084, - "logits/rejected": -2.301574945449829, - "logps/chosen": -217.8255615234375, - "logps/rejected": -328.16278076171875, - "loss": 0.0222, + "learning_rate": 1.894277054733464e-08, + "logits/chosen": -2.2045233249664307, + "logits/rejected": -2.185724973678589, + "logps/chosen": -379.5198059082031, + "logps/rejected": -329.015869140625, + "loss": 0.0257, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.422218322753906, + "rewards/margins": 11.833602905273438, + "rewards/rejected": -17.25581932067871, + "step": 12040 + }, + { + "epoch": 2.9, + "learning_rate": 1.849705829916206e-08, + "logits/chosen": -2.4253971576690674, + "logits/rejected": -2.3414931297302246, + "logps/chosen": -296.16192626953125, + "logps/rejected": -360.591064453125, + "loss": 0.019, "rewards/accuracies": 1.0, - "rewards/chosen": -2.0877737998962402, - "rewards/margins": 11.274978637695312, - "rewards/rejected": -13.362752914428711, - "step": 11470 + "rewards/chosen": -2.84879994392395, + "rewards/margins": 11.08890438079834, + "rewards/rejected": -13.937704086303711, + "step": 12050 }, { "epoch": 2.9, - "learning_rate": 1.8163093343319912e-08, - "logits/chosen": -2.6343302726745605, - "logits/rejected": -2.536257266998291, - "logps/chosen": -309.25579833984375, - "logps/rejected": -385.6678771972656, - "loss": 0.0058, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.29305902123451233, - "rewards/margins": 11.193380355834961, - "rewards/rejected": -10.900321960449219, - "step": 11480 + "learning_rate": 1.8051346050989483e-08, + "logits/chosen": -2.4606800079345703, + "logits/rejected": -2.4103195667266846, + "logps/chosen": -309.3631896972656, + "logps/rejected": -428.0711975097656, + "loss": 0.0165, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.601666450500488, + "rewards/margins": 11.53504753112793, + "rewards/rejected": -16.136714935302734, + "step": 12060 }, { "epoch": 2.9, - "learning_rate": 1.7694972380863215e-08, - "logits/chosen": -2.252707004547119, - "logits/rejected": -2.2293648719787598, - "logps/chosen": -253.23196411132812, - "logps/rejected": -451.1348571777344, - "loss": 0.0292, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5220910906791687, - "rewards/margins": 12.508794784545898, - "rewards/rejected": -13.03088665008545, - "step": 11490 + "learning_rate": 1.7605633802816902e-08, + "logits/chosen": -2.394378423690796, + "logits/rejected": -2.2242891788482666, + "logps/chosen": -266.50225830078125, + "logps/rejected": -391.8316650390625, + "loss": 0.0973, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1371548175811768, + "rewards/margins": 13.504987716674805, + "rewards/rejected": -14.642141342163086, + "step": 12070 }, { "epoch": 2.91, - "learning_rate": 1.7226851418406514e-08, - "logits/chosen": -2.458756685256958, - "logits/rejected": -2.4679484367370605, - "logps/chosen": -298.9981689453125, - "logps/rejected": -424.33660888671875, - "loss": 0.0072, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.1835033893585205, - "rewards/margins": 10.601720809936523, - "rewards/rejected": -13.785223007202148, - "step": 11500 + "learning_rate": 1.715992155464432e-08, + "logits/chosen": -2.4952521324157715, + "logits/rejected": -2.4258532524108887, + "logps/chosen": -327.77593994140625, + "logps/rejected": -421.78021240234375, + "loss": 0.0472, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.2891831398010254, + "rewards/margins": 11.675423622131348, + "rewards/rejected": -14.964607238769531, + "step": 12080 }, { "epoch": 2.91, - "learning_rate": 1.675873045594982e-08, - "logits/chosen": -2.4793026447296143, - "logits/rejected": -2.4787607192993164, - "logps/chosen": -251.3925323486328, - "logps/rejected": -372.2521057128906, - "loss": 0.0118, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.5706532001495361, - "rewards/margins": 13.254281997680664, - "rewards/rejected": -13.824934005737305, - "step": 11510 + "learning_rate": 1.671420930647174e-08, + "logits/chosen": -2.613152503967285, + "logits/rejected": -2.4630346298217773, + "logps/chosen": -317.3475341796875, + "logps/rejected": -343.37213134765625, + "loss": 0.05, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.476538062095642, + "rewards/margins": 11.087359428405762, + "rewards/rejected": -12.563897132873535, + "step": 12090 }, { "epoch": 2.91, - "learning_rate": 1.629060949349312e-08, - "logits/chosen": -2.3445253372192383, - "logits/rejected": -2.3248162269592285, - "logps/chosen": -308.88311767578125, - "logps/rejected": -379.0182189941406, - "loss": 0.0148, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.7497143149375916, - "rewards/margins": 12.753561019897461, - "rewards/rejected": -13.503274917602539, - "step": 11520 + "learning_rate": 1.626849705829916e-08, + "logits/chosen": -2.482208728790283, + "logits/rejected": -2.3430240154266357, + "logps/chosen": -242.7880859375, + "logps/rejected": -430.07000732421875, + "loss": 0.0253, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.6110856533050537, + "rewards/margins": 15.177764892578125, + "rewards/rejected": -17.788850784301758, + "step": 12100 }, { "epoch": 2.91, - "learning_rate": 1.5822488531036418e-08, - "logits/chosen": -2.6158080101013184, - "logits/rejected": -2.552791118621826, - "logps/chosen": -260.298583984375, - "logps/rejected": -346.90704345703125, - "loss": 0.03, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.6114883422851562, - "rewards/margins": 11.58378791809082, - "rewards/rejected": -14.195277214050293, - "step": 11530 + "eval_logits/chosen": -2.159430742263794, + "eval_logits/rejected": -2.096773862838745, + "eval_logps/chosen": -296.8029479980469, + "eval_logps/rejected": -331.9907531738281, + "eval_loss": 0.6352577805519104, + "eval_rewards/accuracies": 0.6650000214576721, + "eval_rewards/chosen": -10.084195137023926, + "eval_rewards/margins": 4.675569534301758, + "eval_rewards/rejected": -14.759764671325684, + "eval_runtime": 132.2964, + "eval_samples_per_second": 23.856, + "eval_steps_per_second": 0.378, + "step": 12100 + }, + { + "epoch": 2.91, + "learning_rate": 1.5822784810126583e-08, + "logits/chosen": -2.5923213958740234, + "logits/rejected": -2.408043146133423, + "logps/chosen": -328.0230407714844, + "logps/rejected": -450.17071533203125, + "loss": 0.0407, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.227058410644531, + "rewards/margins": 10.479593276977539, + "rewards/rejected": -14.70665168762207, + "step": 12110 }, { "epoch": 2.92, - "learning_rate": 1.535436756857972e-08, - "logits/chosen": -2.60005259513855, - "logits/rejected": -2.506302833557129, - "logps/chosen": -284.90252685546875, - "logps/rejected": -413.2223205566406, - "loss": 0.0279, + "learning_rate": 1.5377072561954002e-08, + "logits/chosen": -2.3898749351501465, + "logits/rejected": -2.226194143295288, + "logps/chosen": -235.903076171875, + "logps/rejected": -330.5806579589844, + "loss": 0.0122, "rewards/accuracies": 1.0, - "rewards/chosen": -1.1742764711380005, - "rewards/margins": 13.892889022827148, - "rewards/rejected": -15.067166328430176, - "step": 11540 + "rewards/chosen": -2.8054912090301514, + "rewards/margins": 10.795156478881836, + "rewards/rejected": -13.60064697265625, + "step": 12120 }, { "epoch": 2.92, - "learning_rate": 1.4886246606123022e-08, - "logits/chosen": -2.702465295791626, - "logits/rejected": -2.673305034637451, - "logps/chosen": -271.26397705078125, - "logps/rejected": -452.94183349609375, - "loss": 0.0125, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.5330944061279297, - "rewards/margins": 13.213106155395508, - "rewards/rejected": -15.74620246887207, - "step": 11550 + "learning_rate": 1.4931360313781422e-08, + "logits/chosen": -2.5221052169799805, + "logits/rejected": -2.439319372177124, + "logps/chosen": -257.87213134765625, + "logps/rejected": -336.9048767089844, + "loss": 0.0397, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8400112390518188, + "rewards/margins": 10.376493453979492, + "rewards/rejected": -12.21650505065918, + "step": 12130 }, { "epoch": 2.92, - "learning_rate": 1.4418125643666323e-08, - "logits/chosen": -2.5396695137023926, - "logits/rejected": -2.485886335372925, - "logps/chosen": -330.40960693359375, - "logps/rejected": -416.08807373046875, - "loss": 0.0228, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9634650945663452, - "rewards/margins": 10.967055320739746, - "rewards/rejected": -11.930520057678223, - "step": 11560 + "learning_rate": 1.4485648065608843e-08, + "logits/chosen": -2.3921761512756348, + "logits/rejected": -2.2921149730682373, + "logps/chosen": -293.24493408203125, + "logps/rejected": -489.3631896972656, + "loss": 0.0495, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.6178574562072754, + "rewards/margins": 15.063325881958008, + "rewards/rejected": -18.681182861328125, + "step": 12140 }, { "epoch": 2.92, - "learning_rate": 1.3950004681209625e-08, - "logits/chosen": -2.4903059005737305, - "logits/rejected": -2.3227312564849854, - "logps/chosen": -357.42529296875, - "logps/rejected": -345.2140197753906, - "loss": 0.0113, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.0612937211990356, - "rewards/margins": 11.046579360961914, - "rewards/rejected": -12.107873916625977, - "step": 11570 + "learning_rate": 1.4039935817436262e-08, + "logits/chosen": -2.579136610031128, + "logits/rejected": -2.5086379051208496, + "logps/chosen": -261.5919494628906, + "logps/rejected": -399.455322265625, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.437955141067505, + "rewards/margins": 11.588395118713379, + "rewards/rejected": -14.026350021362305, + "step": 12150 }, { "epoch": 2.93, - "learning_rate": 1.3481883718752925e-08, - "logits/chosen": -2.5871903896331787, - "logits/rejected": -2.5559890270233154, - "logps/chosen": -234.2266082763672, - "logps/rejected": -360.0385437011719, - "loss": 0.0098, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.6578261852264404, - "rewards/margins": 10.966938972473145, - "rewards/rejected": -13.624765396118164, - "step": 11580 + "learning_rate": 1.3594223569263683e-08, + "logits/chosen": -2.3314452171325684, + "logits/rejected": -2.219850540161133, + "logps/chosen": -237.8750762939453, + "logps/rejected": -375.03692626953125, + "loss": 0.0479, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.9237544536590576, + "rewards/margins": 10.353140830993652, + "rewards/rejected": -14.276895523071289, + "step": 12160 }, { "epoch": 2.93, - "learning_rate": 1.3013762756296227e-08, - "logits/chosen": -2.605954170227051, - "logits/rejected": -2.6144227981567383, - "logps/chosen": -272.157470703125, - "logps/rejected": -378.93939208984375, - "loss": 0.017, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.1602685451507568, - "rewards/margins": 11.329082489013672, - "rewards/rejected": -12.489351272583008, - "step": 11590 + "learning_rate": 1.3148511321091103e-08, + "logits/chosen": -2.4586215019226074, + "logits/rejected": -2.3522372245788574, + "logps/chosen": -304.6685485839844, + "logps/rejected": -429.9214782714844, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3366432189941406, + "rewards/margins": 11.99636459350586, + "rewards/rejected": -15.333009719848633, + "step": 12170 }, { "epoch": 2.93, - "learning_rate": 1.2545641793839528e-08, - "logits/chosen": -2.4722487926483154, - "logits/rejected": -2.380265712738037, - "logps/chosen": -335.23895263671875, - "logps/rejected": -396.14385986328125, - "loss": 0.0303, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5032278895378113, - "rewards/margins": 14.567062377929688, - "rewards/rejected": -14.063835144042969, - "step": 11600 + "learning_rate": 1.2702799072918524e-08, + "logits/chosen": -2.513718366622925, + "logits/rejected": -2.3902573585510254, + "logps/chosen": -305.97930908203125, + "logps/rejected": -414.04071044921875, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4954414367675781, + "rewards/margins": 13.224698066711426, + "rewards/rejected": -14.720138549804688, + "step": 12180 }, { "epoch": 2.93, - "learning_rate": 1.2077520831382827e-08, - "logits/chosen": -2.3190627098083496, - "logits/rejected": -2.398951530456543, - "logps/chosen": -233.6508026123047, - "logps/rejected": -414.9471130371094, - "loss": 0.0174, + "learning_rate": 1.2257086824745943e-08, + "logits/chosen": -2.555018901824951, + "logits/rejected": -2.444200038909912, + "logps/chosen": -303.7113342285156, + "logps/rejected": -387.5549621582031, + "loss": 0.0239, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.061067581176758, + "rewards/margins": 9.820379257202148, + "rewards/rejected": -14.881446838378906, + "step": 12190 + }, + { + "epoch": 2.94, + "learning_rate": 1.1811374576573364e-08, + "logits/chosen": -2.563249111175537, + "logits/rejected": -2.466925859451294, + "logps/chosen": -313.5289611816406, + "logps/rejected": -338.11700439453125, + "loss": 0.0317, "rewards/accuracies": 1.0, - "rewards/chosen": -1.216247797012329, - "rewards/margins": 13.317113876342773, - "rewards/rejected": -14.533361434936523, - "step": 11610 + "rewards/chosen": -2.8505196571350098, + "rewards/margins": 8.939030647277832, + "rewards/rejected": -11.789548873901367, + "step": 12200 }, { "epoch": 2.94, - "learning_rate": 1.160939986892613e-08, - "logits/chosen": -2.4299168586730957, - "logits/rejected": -2.365037202835083, - "logps/chosen": -373.56597900390625, - "logps/rejected": -381.29498291015625, - "loss": 0.0278, + "eval_logits/chosen": -2.1664927005767822, + "eval_logits/rejected": -2.1042051315307617, + "eval_logps/chosen": -295.9168701171875, + "eval_logps/rejected": -331.2122802734375, + "eval_loss": 0.6351790428161621, + "eval_rewards/accuracies": 0.675000011920929, + "eval_rewards/chosen": -9.995587348937988, + "eval_rewards/margins": 4.686328411102295, + "eval_rewards/rejected": -14.681915283203125, + "eval_runtime": 132.3606, + "eval_samples_per_second": 23.844, + "eval_steps_per_second": 0.378, + "step": 12200 + }, + { + "epoch": 2.94, + "learning_rate": 1.1365662328400784e-08, + "logits/chosen": -2.388841152191162, + "logits/rejected": -2.2465455532073975, + "logps/chosen": -397.5497741699219, + "logps/rejected": -374.1092834472656, + "loss": 0.0169, "rewards/accuracies": 1.0, - "rewards/chosen": -1.5779365301132202, - "rewards/margins": 10.094257354736328, - "rewards/rejected": -11.67219352722168, - "step": 11620 + "rewards/chosen": -1.3950287103652954, + "rewards/margins": 12.57457160949707, + "rewards/rejected": -13.969599723815918, + "step": 12210 }, { "epoch": 2.94, - "learning_rate": 1.114127890646943e-08, - "logits/chosen": -2.496657371520996, - "logits/rejected": -2.456730365753174, - "logps/chosen": -264.4086608886719, - "logps/rejected": -353.732666015625, - "loss": 0.0111, + "learning_rate": 1.0919950080228205e-08, + "logits/chosen": -2.6038150787353516, + "logits/rejected": -2.5172927379608154, + "logps/chosen": -432.30926513671875, + "logps/rejected": -458.5791931152344, + "loss": 0.0311, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.3022593259811401, - "rewards/margins": 11.434806823730469, - "rewards/rejected": -12.737066268920898, - "step": 11630 + "rewards/chosen": -1.0042035579681396, + "rewards/margins": 14.580434799194336, + "rewards/rejected": -15.584637641906738, + "step": 12220 }, { "epoch": 2.94, - "learning_rate": 1.0673157944012733e-08, - "logits/chosen": -2.5562052726745605, - "logits/rejected": -2.61944580078125, - "logps/chosen": -262.9412536621094, - "logps/rejected": -453.12969970703125, - "loss": 0.0113, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.9975343942642212, - "rewards/margins": 13.116531372070312, - "rewards/rejected": -15.114065170288086, - "step": 11640 + "learning_rate": 1.0474237832055624e-08, + "logits/chosen": -2.523059368133545, + "logits/rejected": -2.3870327472686768, + "logps/chosen": -359.78887939453125, + "logps/rejected": -439.7530212402344, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2755260467529297, + "rewards/margins": 11.349427223205566, + "rewards/rejected": -13.62495231628418, + "step": 12230 }, { - "epoch": 2.94, - "learning_rate": 1.0205036981556033e-08, - "logits/chosen": -2.4997823238372803, - "logits/rejected": -2.3326516151428223, - "logps/chosen": -318.5556640625, - "logps/rejected": -383.9226989746094, - "loss": 0.0124, + "epoch": 2.95, + "learning_rate": 1.0028525583883044e-08, + "logits/chosen": -2.464730739593506, + "logits/rejected": -2.280881881713867, + "logps/chosen": -323.5257873535156, + "logps/rejected": -416.1053161621094, + "loss": 0.0266, "rewards/accuracies": 1.0, - "rewards/chosen": -0.2242390215396881, - "rewards/margins": 12.315638542175293, - "rewards/rejected": -12.539877891540527, - "step": 11650 + "rewards/chosen": -2.8695874214172363, + "rewards/margins": 10.034102439880371, + "rewards/rejected": -12.90368938446045, + "step": 12240 }, { "epoch": 2.95, - "learning_rate": 9.736916019099335e-09, - "logits/chosen": -2.6021151542663574, - "logits/rejected": -2.3831443786621094, - "logps/chosen": -246.7200469970703, - "logps/rejected": -266.804931640625, - "loss": 0.0107, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.0942278727889061, - "rewards/margins": 11.007195472717285, - "rewards/rejected": -11.101422309875488, - "step": 11660 + "learning_rate": 9.582813335710465e-09, + "logits/chosen": -2.576092004776001, + "logits/rejected": -2.361546277999878, + "logps/chosen": -313.176513671875, + "logps/rejected": -411.9351501464844, + "loss": 0.0305, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.629349946975708, + "rewards/margins": 10.773197174072266, + "rewards/rejected": -14.402546882629395, + "step": 12250 }, { "epoch": 2.95, - "learning_rate": 9.268795056642636e-09, - "logits/chosen": -2.6219844818115234, - "logits/rejected": -2.498939037322998, - "logps/chosen": -310.24639892578125, - "logps/rejected": -428.74481201171875, - "loss": 0.0217, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.2587666511535645, - "rewards/margins": 11.268743515014648, - "rewards/rejected": -13.527508735656738, - "step": 11670 + "learning_rate": 9.137101087537884e-09, + "logits/chosen": -2.6408421993255615, + "logits/rejected": -2.5849997997283936, + "logps/chosen": -319.0190124511719, + "logps/rejected": -423.1532287597656, + "loss": 0.0371, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1850690841674805, + "rewards/margins": 11.655340194702148, + "rewards/rejected": -12.840408325195312, + "step": 12260 }, { "epoch": 2.95, - "learning_rate": 8.800674094185939e-09, - "logits/chosen": -2.4541478157043457, - "logits/rejected": -2.366241931915283, - "logps/chosen": -258.74945068359375, - "logps/rejected": -385.4808044433594, - "loss": 0.0149, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.320427417755127, - "rewards/margins": 11.502983093261719, - "rewards/rejected": -13.823410034179688, - "step": 11680 + "learning_rate": 8.691388839365305e-09, + "logits/chosen": -2.4322009086608887, + "logits/rejected": -2.39558482170105, + "logps/chosen": -242.736083984375, + "logps/rejected": -418.80474853515625, + "loss": 0.038, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.5634684562683105, + "rewards/margins": 16.998226165771484, + "rewards/rejected": -21.561695098876953, + "step": 12270 }, { "epoch": 2.96, - "learning_rate": 8.332553131729238e-09, - "logits/chosen": -2.3575940132141113, - "logits/rejected": -2.297339916229248, - "logps/chosen": -266.781005859375, - "logps/rejected": -337.98150634765625, - "loss": 0.0142, + "learning_rate": 8.245676591192724e-09, + "logits/chosen": -2.49717378616333, + "logits/rejected": -2.3689796924591064, + "logps/chosen": -352.26507568359375, + "logps/rejected": -471.14886474609375, + "loss": 0.0365, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.519464135169983, - "rewards/margins": 15.5191011428833, - "rewards/rejected": -17.038564682006836, - "step": 11690 + "rewards/chosen": -5.072332382202148, + "rewards/margins": 10.935014724731445, + "rewards/rejected": -16.007347106933594, + "step": 12280 }, { "epoch": 2.96, - "learning_rate": 7.864432169272539e-09, - "logits/chosen": -2.328925371170044, - "logits/rejected": -2.376373052597046, - "logps/chosen": -187.71844482421875, - "logps/rejected": -261.5347900390625, - "loss": 0.0155, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.7654637098312378, - "rewards/margins": 10.228033065795898, - "rewards/rejected": -10.993496894836426, - "step": 11700 + "learning_rate": 7.799964343020146e-09, + "logits/chosen": -2.4430925846099854, + "logits/rejected": -2.2894511222839355, + "logps/chosen": -164.51394653320312, + "logps/rejected": -246.76864624023438, + "loss": 0.0329, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9641733169555664, + "rewards/margins": 9.361350059509277, + "rewards/rejected": -11.325523376464844, + "step": 12290 }, { "epoch": 2.96, - "learning_rate": 7.396311206815841e-09, - "logits/chosen": -2.4738736152648926, - "logits/rejected": -2.4614264965057373, - "logps/chosen": -213.5368194580078, - "logps/rejected": -259.68914794921875, - "loss": 0.0103, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.2424598932266235, - "rewards/margins": 9.811772346496582, - "rewards/rejected": -11.054232597351074, - "step": 11710 + "learning_rate": 7.3542520948475666e-09, + "logits/chosen": -2.6351709365844727, + "logits/rejected": -2.440642833709717, + "logps/chosen": -381.7463684082031, + "logps/rejected": -407.460693359375, + "loss": 0.0431, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.327102184295654, + "rewards/margins": 9.93896770477295, + "rewards/rejected": -15.266069412231445, + "step": 12300 }, { "epoch": 2.96, - "learning_rate": 6.928190244359142e-09, - "logits/chosen": -2.4830522537231445, - "logits/rejected": -2.361623764038086, - "logps/chosen": -292.7064514160156, - "logps/rejected": -451.8169860839844, - "loss": 0.0184, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8870089650154114, - "rewards/margins": 14.211830139160156, - "rewards/rejected": -15.098838806152344, - "step": 11720 + "eval_logits/chosen": -2.166001319885254, + "eval_logits/rejected": -2.1034488677978516, + "eval_logps/chosen": -294.76763916015625, + "eval_logps/rejected": -329.9331970214844, + "eval_loss": 0.6336598992347717, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -9.88066291809082, + "eval_rewards/margins": 4.673349380493164, + "eval_rewards/rejected": -14.554011344909668, + "eval_runtime": 132.0044, + "eval_samples_per_second": 23.908, + "eval_steps_per_second": 0.379, + "step": 12300 + }, + { + "epoch": 2.96, + "learning_rate": 6.908539846674986e-09, + "logits/chosen": -2.499121904373169, + "logits/rejected": -2.2830657958984375, + "logps/chosen": -340.805908203125, + "logps/rejected": -378.74334716796875, + "loss": 0.0209, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.102209091186523, + "rewards/margins": 12.57048225402832, + "rewards/rejected": -16.672691345214844, + "step": 12310 }, { "epoch": 2.97, - "learning_rate": 6.460069281902443e-09, - "logits/chosen": -2.6505773067474365, - "logits/rejected": -2.6242470741271973, - "logps/chosen": -367.317138671875, - "logps/rejected": -419.08953857421875, - "loss": 0.0119, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5306159853935242, - "rewards/margins": 12.968234062194824, - "rewards/rejected": -12.43761920928955, - "step": 11730 + "learning_rate": 6.462827598502406e-09, + "logits/chosen": -2.3793792724609375, + "logits/rejected": -2.337700366973877, + "logps/chosen": -237.06808471679688, + "logps/rejected": -336.19580078125, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.333949327468872, + "rewards/margins": 11.843382835388184, + "rewards/rejected": -14.177332878112793, + "step": 12320 }, { "epoch": 2.97, - "learning_rate": 5.991948319445744e-09, - "logits/chosen": -2.4809062480926514, - "logits/rejected": -2.439175605773926, - "logps/chosen": -190.18470764160156, - "logps/rejected": -313.22698974609375, - "loss": 0.0493, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -3.1452090740203857, - "rewards/margins": 11.021424293518066, - "rewards/rejected": -14.166631698608398, - "step": 11740 + "learning_rate": 6.0171153503298264e-09, + "logits/chosen": -2.494096517562866, + "logits/rejected": -2.4046335220336914, + "logps/chosen": -308.0247497558594, + "logps/rejected": -441.5491638183594, + "loss": 0.0373, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3346580266952515, + "rewards/margins": 13.772607803344727, + "rewards/rejected": -15.107264518737793, + "step": 12330 }, { "epoch": 2.97, - "learning_rate": 5.523827356989046e-09, - "logits/chosen": -2.4870567321777344, - "logits/rejected": -2.391294002532959, - "logps/chosen": -321.2856750488281, - "logps/rejected": -426.3085021972656, - "loss": 0.0057, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.7549960613250732, - "rewards/margins": 15.484231948852539, - "rewards/rejected": -17.239227294921875, - "step": 11750 + "learning_rate": 5.571403102157247e-09, + "logits/chosen": -2.4762885570526123, + "logits/rejected": -2.4520134925842285, + "logps/chosen": -335.8169860839844, + "logps/rejected": -440.61761474609375, + "loss": 0.0329, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.054269790649414, + "rewards/margins": 13.740753173828125, + "rewards/rejected": -16.795024871826172, + "step": 12340 }, { "epoch": 2.97, - "learning_rate": 5.055706394532347e-09, - "logits/chosen": -2.315706491470337, - "logits/rejected": -2.325279712677002, - "logps/chosen": -227.36172485351562, - "logps/rejected": -390.9896545410156, - "loss": 0.0161, + "learning_rate": 5.125690853984667e-09, + "logits/chosen": -2.3965306282043457, + "logits/rejected": -2.3636691570281982, + "logps/chosen": -312.0904846191406, + "logps/rejected": -542.581787109375, + "loss": 0.0376, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.5550304651260376, - "rewards/margins": 12.025665283203125, - "rewards/rejected": -12.580698013305664, - "step": 11760 + "rewards/chosen": -0.9671527743339539, + "rewards/margins": 16.593849182128906, + "rewards/rejected": -17.56100082397461, + "step": 12350 + }, + { + "epoch": 2.97, + "learning_rate": 4.679978605812087e-09, + "logits/chosen": -2.436551809310913, + "logits/rejected": -2.4685463905334473, + "logps/chosen": -200.72666931152344, + "logps/rejected": -409.8763732910156, + "loss": 0.0265, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4209479093551636, + "rewards/margins": 13.043127059936523, + "rewards/rejected": -14.464075088500977, + "step": 12360 }, { "epoch": 2.98, - "learning_rate": 4.5875854320756484e-09, - "logits/chosen": -2.507659435272217, - "logits/rejected": -2.4059300422668457, - "logps/chosen": -261.05743408203125, - "logps/rejected": -381.0530700683594, - "loss": 0.0126, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6033679842948914, - "rewards/margins": 11.23347282409668, - "rewards/rejected": -11.83684253692627, - "step": 11770 + "learning_rate": 4.234266357639507e-09, + "logits/chosen": -2.410393476486206, + "logits/rejected": -2.3812057971954346, + "logps/chosen": -283.7486572265625, + "logps/rejected": -338.99322509765625, + "loss": 0.0272, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3243625164031982, + "rewards/margins": 10.762374877929688, + "rewards/rejected": -14.086736679077148, + "step": 12370 }, { "epoch": 2.98, - "learning_rate": 4.119464469618949e-09, - "logits/chosen": -2.589505672454834, - "logits/rejected": -2.5626962184906006, - "logps/chosen": -358.2600402832031, - "logps/rejected": -409.32379150390625, - "loss": 0.0172, + "learning_rate": 3.788554109466928e-09, + "logits/chosen": -2.359842300415039, + "logits/rejected": -2.133486032485962, + "logps/chosen": -356.2956237792969, + "logps/rejected": -349.1375427246094, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.918581485748291, + "rewards/margins": 11.80932903289795, + "rewards/rejected": -14.727910041809082, + "step": 12380 + }, + { + "epoch": 2.98, + "learning_rate": 3.3428418612943483e-09, + "logits/chosen": -2.4385485649108887, + "logits/rejected": -2.3963093757629395, + "logps/chosen": -227.2484588623047, + "logps/rejected": -494.984130859375, + "loss": 0.0284, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.2080825567245483, - "rewards/margins": 12.394508361816406, - "rewards/rejected": -13.602592468261719, - "step": 11780 + "rewards/chosen": 0.1440451443195343, + "rewards/margins": 16.851680755615234, + "rewards/rejected": -16.7076358795166, + "step": 12390 }, { "epoch": 2.98, - "learning_rate": 3.6513435071622503e-09, - "logits/chosen": -2.5097475051879883, - "logits/rejected": -2.4207422733306885, - "logps/chosen": -289.1883544921875, - "logps/rejected": -337.92327880859375, - "loss": 0.0205, + "learning_rate": 2.8971296131217685e-09, + "logits/chosen": -2.3916614055633545, + "logits/rejected": -2.2580373287200928, + "logps/chosen": -347.2048034667969, + "logps/rejected": -400.259521484375, + "loss": 0.0233, "rewards/accuracies": 1.0, - "rewards/chosen": -1.319321870803833, - "rewards/margins": 9.911137580871582, - "rewards/rejected": -11.23045825958252, - "step": 11790 + "rewards/chosen": -3.204678773880005, + "rewards/margins": 10.869322776794434, + "rewards/rejected": -14.074002265930176, + "step": 12400 }, { "epoch": 2.98, - "learning_rate": 3.1832225447055516e-09, - "logits/chosen": -2.3756625652313232, - "logits/rejected": -2.3374183177948, - "logps/chosen": -259.49835205078125, - "logps/rejected": -432.2469787597656, - "loss": 0.0189, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6284923553466797, - "rewards/margins": 15.795854568481445, - "rewards/rejected": -16.424346923828125, - "step": 11800 + "eval_logits/chosen": -2.165696620941162, + "eval_logits/rejected": -2.1032466888427734, + "eval_logps/chosen": -294.7567443847656, + "eval_logps/rejected": -329.84222412109375, + "eval_loss": 0.632635772228241, + "eval_rewards/accuracies": 0.6675000190734863, + "eval_rewards/chosen": -9.879573822021484, + "eval_rewards/margins": 4.665342330932617, + "eval_rewards/rejected": -14.544916152954102, + "eval_runtime": 132.0816, + "eval_samples_per_second": 23.894, + "eval_steps_per_second": 0.379, + "step": 12400 }, { "epoch": 2.99, - "learning_rate": 2.715101582248853e-09, - "logits/chosen": -2.232285737991333, - "logits/rejected": -2.2487640380859375, - "logps/chosen": -264.79022216796875, - "logps/rejected": -377.00616455078125, - "loss": 0.0285, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.6382107734680176, - "rewards/margins": 11.905860900878906, - "rewards/rejected": -14.54407024383545, - "step": 11810 + "learning_rate": 2.4514173649491887e-09, + "logits/chosen": -2.426769256591797, + "logits/rejected": -2.3483054637908936, + "logps/chosen": -350.297607421875, + "logps/rejected": -370.11981201171875, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.844559907913208, + "rewards/margins": 12.144742965698242, + "rewards/rejected": -14.989303588867188, + "step": 12410 }, { "epoch": 2.99, - "learning_rate": 2.2469806197921542e-09, - "logits/chosen": -2.4739320278167725, - "logits/rejected": -2.522197723388672, - "logps/chosen": -309.99188232421875, - "logps/rejected": -447.0345153808594, - "loss": 0.0257, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.363677740097046, - "rewards/margins": 12.926587104797363, - "rewards/rejected": -14.290265083312988, - "step": 11820 + "learning_rate": 2.005705116776609e-09, + "logits/chosen": -2.516035556793213, + "logits/rejected": -2.287766695022583, + "logps/chosen": -293.1377868652344, + "logps/rejected": -346.6216125488281, + "loss": 0.0323, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2884551286697388, + "rewards/margins": 12.446874618530273, + "rewards/rejected": -13.735328674316406, + "step": 12420 }, { "epoch": 2.99, - "learning_rate": 1.7788596573354553e-09, - "logits/chosen": -2.6093578338623047, - "logits/rejected": -2.5824170112609863, - "logps/chosen": -354.9128112792969, - "logps/rejected": -400.4103088378906, - "loss": 0.1116, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.216612458229065, - "rewards/margins": 10.741962432861328, - "rewards/rejected": -11.958575248718262, - "step": 11830 + "learning_rate": 1.5599928686040292e-09, + "logits/chosen": -2.356520175933838, + "logits/rejected": -2.0958051681518555, + "logps/chosen": -380.82171630859375, + "logps/rejected": -367.86212158203125, + "loss": 0.0223, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.274829149246216, + "rewards/margins": 10.359758377075195, + "rewards/rejected": -13.634587287902832, + "step": 12430 }, { "epoch": 2.99, - "learning_rate": 1.3107386948787567e-09, - "logits/chosen": -2.597052812576294, - "logits/rejected": -2.5173323154449463, - "logps/chosen": -339.32379150390625, - "logps/rejected": -349.84088134765625, - "loss": 0.0097, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -1.9821268320083618, - "rewards/margins": 10.569337844848633, - "rewards/rejected": -12.551465034484863, - "step": 11840 + "learning_rate": 1.1142806204314494e-09, + "logits/chosen": -2.41813325881958, + "logits/rejected": -2.4418747425079346, + "logps/chosen": -319.6686706542969, + "logps/rejected": -421.96978759765625, + "loss": 0.0295, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8550525903701782, + "rewards/margins": 10.559477806091309, + "rewards/rejected": -12.414529800415039, + "step": 12440 }, { "epoch": 3.0, - "learning_rate": 8.426177324220578e-10, - "logits/chosen": -2.583278179168701, - "logits/rejected": -2.4176480770111084, - "logps/chosen": -250.9386444091797, - "logps/rejected": -297.49237060546875, - "loss": 0.0144, + "learning_rate": 6.685683722588697e-10, + "logits/chosen": -2.5191800594329834, + "logits/rejected": -2.3644657135009766, + "logps/chosen": -366.04254150390625, + "logps/rejected": -406.83135986328125, + "loss": 0.0253, "rewards/accuracies": 1.0, - "rewards/chosen": -0.9199004173278809, - "rewards/margins": 10.506296157836914, - "rewards/rejected": -11.426196098327637, - "step": 11850 + "rewards/chosen": -2.2928309440612793, + "rewards/margins": 13.958070755004883, + "rewards/rejected": -16.25090217590332, + "step": 12450 }, { "epoch": 3.0, - "learning_rate": 3.74496769965359e-10, - "logits/chosen": -2.468799114227295, - "logits/rejected": -2.3663735389709473, - "logps/chosen": -318.8310852050781, - "logps/rejected": -367.59356689453125, - "loss": 0.0238, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5627554655075073, - "rewards/margins": 9.739514350891113, - "rewards/rejected": -11.302268981933594, - "step": 11860 + "learning_rate": 2.2285612408628988e-10, + "logits/chosen": -2.5135140419006348, + "logits/rejected": -2.3846256732940674, + "logps/chosen": -339.0476989746094, + "logps/rejected": -336.9720764160156, + "loss": 0.0195, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.9884878396987915, + "rewards/margins": 11.433095932006836, + "rewards/rejected": -13.42158317565918, + "step": 12460 }, { "epoch": 3.0, - "step": 11868, + "step": 12465, "total_flos": 0.0, - "train_loss": 0.2383290374584692, - "train_runtime": 16014.0387, - "train_samples_per_second": 11.855, - "train_steps_per_second": 0.741 + "train_loss": 0.24498563167372717, + "train_runtime": 32981.6015, + "train_samples_per_second": 6.046, + "train_steps_per_second": 0.378 } ], "logging_steps": 10, - "max_steps": 11868, + "max_steps": 12465, "num_input_tokens_seen": 0, "num_train_epochs": 3, - "save_steps": 1187, + "save_steps": 1247, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null,