diff --git "a/checkpoint-160/trainer_state.json" "b/checkpoint-160/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-160/trainer_state.json" @@ -0,0 +1,2901 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.2075471698113207, + "eval_steps": 500, + "global_step": 160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007547169811320755, + "grad_norm": 3.9455888271331787, + "learning_rate": 1.8518518518518518e-07, + "logps/chosen": -28.77263641357422, + "logps/rejected": -33.715965270996094, + "loss": 0.6962, + "losses/dpo": 0.6816703081130981, + "losses/sft": 1.0569090843200684, + "losses/total": 0.6816703081130981, + "ref_logps/chosen": -28.74100112915039, + "ref_logps/rejected": -33.742530822753906, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.0031636161729693413, + "rewards/margins": -0.005820129066705704, + "rewards/rejected": 0.002656512428075075, + "step": 1 + }, + { + "epoch": 0.01509433962264151, + "grad_norm": 4.175387859344482, + "learning_rate": 3.7037037037037036e-07, + "logps/chosen": -27.101844787597656, + "logps/rejected": -33.89026641845703, + "loss": 0.6957, + "losses/dpo": 0.6874121427536011, + "losses/sft": 1.0693237781524658, + "losses/total": 0.6874121427536011, + "ref_logps/chosen": -27.079509735107422, + "ref_logps/rejected": -33.91672134399414, + "rewards/accuracies": 0.4296875, + "rewards/chosen": -0.0022332118824124336, + "rewards/margins": -0.00487890001386404, + "rewards/rejected": 0.002645687432959676, + "step": 2 + }, + { + "epoch": 0.022641509433962263, + "grad_norm": 4.457658290863037, + "learning_rate": 5.555555555555555e-07, + "logps/chosen": -31.50066566467285, + "logps/rejected": -39.910255432128906, + "loss": 0.6943, + "losses/dpo": 0.6945112943649292, + "losses/sft": 1.2076711654663086, + "losses/total": 0.6945112943649292, + "ref_logps/chosen": -31.49291229248047, + "ref_logps/rejected": -39.922569274902344, + "rewards/accuracies": 0.4921875, + "rewards/chosen": -0.0007753549725748599, + "rewards/margins": -0.0020072408951818943, + "rewards/rejected": 0.0012318857479840517, + "step": 3 + }, + { + "epoch": 0.03018867924528302, + "grad_norm": 3.9046316146850586, + "learning_rate": 7.407407407407407e-07, + "logps/chosen": -29.450044631958008, + "logps/rejected": -35.36616516113281, + "loss": 0.6926, + "losses/dpo": 0.6948321461677551, + "losses/sft": 1.0938293933868408, + "losses/total": 0.6948321461677551, + "ref_logps/chosen": -29.46489715576172, + "ref_logps/rejected": -35.368446350097656, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0014853038592264056, + "rewards/margins": 0.0012573779094964266, + "rewards/rejected": 0.00022792589152231812, + "step": 4 + }, + { + "epoch": 0.03773584905660377, + "grad_norm": 4.023809432983398, + "learning_rate": 9.259259259259259e-07, + "logps/chosen": -33.57536697387695, + "logps/rejected": -37.974143981933594, + "loss": 0.6928, + "losses/dpo": 0.6999431848526001, + "losses/sft": 0.9456014633178711, + "losses/total": 0.6999431848526001, + "ref_logps/chosen": -33.59346389770508, + "ref_logps/rejected": -37.9796028137207, + "rewards/accuracies": 0.515625, + "rewards/chosen": 0.0018096657004207373, + "rewards/margins": 0.0012635205639526248, + "rewards/rejected": 0.000546145427506417, + "step": 5 + }, + { + "epoch": 0.045283018867924525, + "grad_norm": 4.098718166351318, + "learning_rate": 1.111111111111111e-06, + "logps/chosen": -27.6701602935791, + "logps/rejected": -33.560577392578125, + "loss": 0.6946, + "losses/dpo": 0.6877298951148987, + "losses/sft": 0.9011062383651733, + "losses/total": 0.6877298951148987, + "ref_logps/chosen": -27.687847137451172, + "ref_logps/rejected": -33.60447692871094, + "rewards/accuracies": 0.4765625, + "rewards/chosen": 0.001768420566804707, + "rewards/margins": -0.002621597610414028, + "rewards/rejected": 0.004390018526464701, + "step": 6 + }, + { + "epoch": 0.052830188679245285, + "grad_norm": 4.2839579582214355, + "learning_rate": 1.2962962962962962e-06, + "logps/chosen": -28.553794860839844, + "logps/rejected": -34.572933197021484, + "loss": 0.6917, + "losses/dpo": 0.6917375326156616, + "losses/sft": 1.1112768650054932, + "losses/total": 0.6917375326156616, + "ref_logps/chosen": -28.584508895874023, + "ref_logps/rejected": -34.5716552734375, + "rewards/accuracies": 0.515625, + "rewards/chosen": 0.0030715037137269974, + "rewards/margins": 0.0031996367033571005, + "rewards/rejected": -0.00012813357170671225, + "step": 7 + }, + { + "epoch": 0.06037735849056604, + "grad_norm": 3.9941976070404053, + "learning_rate": 1.4814814814814815e-06, + "logps/chosen": -34.030635833740234, + "logps/rejected": -34.67448425292969, + "loss": 0.6927, + "losses/dpo": 0.6898777484893799, + "losses/sft": 1.0375126600265503, + "losses/total": 0.6898777484893799, + "ref_logps/chosen": -34.03396224975586, + "ref_logps/rejected": -34.66481399536133, + "rewards/accuracies": 0.4921875, + "rewards/chosen": 0.0003325394354760647, + "rewards/margins": 0.001299483934417367, + "rewards/rejected": -0.000966944731771946, + "step": 8 + }, + { + "epoch": 0.06792452830188679, + "grad_norm": 4.303864479064941, + "learning_rate": 1.6666666666666667e-06, + "logps/chosen": -29.883249282836914, + "logps/rejected": -39.53127670288086, + "loss": 0.6935, + "losses/dpo": 0.6918261051177979, + "losses/sft": 0.8372335433959961, + "losses/total": 0.6918261051177979, + "ref_logps/chosen": -29.827882766723633, + "ref_logps/rejected": -39.478904724121094, + "rewards/accuracies": 0.4765625, + "rewards/chosen": -0.005536716431379318, + "rewards/margins": -0.0002995349932461977, + "rewards/rejected": -0.005237181670963764, + "step": 9 + }, + { + "epoch": 0.07547169811320754, + "grad_norm": 3.851869821548462, + "learning_rate": 1.8518518518518519e-06, + "logps/chosen": -25.46642303466797, + "logps/rejected": -33.54438018798828, + "loss": 0.6865, + "losses/dpo": 0.6891911625862122, + "losses/sft": 0.8832869529724121, + "losses/total": 0.6891911625862122, + "ref_logps/chosen": -25.50303840637207, + "ref_logps/rejected": -33.44293212890625, + "rewards/accuracies": 0.6171875, + "rewards/chosen": 0.003661695634946227, + "rewards/margins": 0.013806111179292202, + "rewards/rejected": -0.010144416242837906, + "step": 10 + }, + { + "epoch": 0.0830188679245283, + "grad_norm": 3.7789742946624756, + "learning_rate": 2.037037037037037e-06, + "logps/chosen": -28.199861526489258, + "logps/rejected": -32.44050598144531, + "loss": 0.6899, + "losses/dpo": 0.6870408058166504, + "losses/sft": 1.1733014583587646, + "losses/total": 0.6870408058166504, + "ref_logps/chosen": -28.221607208251953, + "ref_logps/rejected": -32.392120361328125, + "rewards/accuracies": 0.5546875, + "rewards/chosen": 0.002174636349081993, + "rewards/margins": 0.007013445254415274, + "rewards/rejected": -0.004838809370994568, + "step": 11 + }, + { + "epoch": 0.09056603773584905, + "grad_norm": 3.8337173461914062, + "learning_rate": 2.222222222222222e-06, + "logps/chosen": -30.373441696166992, + "logps/rejected": -33.936431884765625, + "loss": 0.6929, + "losses/dpo": 0.7091586589813232, + "losses/sft": 0.9355933666229248, + "losses/total": 0.7091586589813232, + "ref_logps/chosen": -30.290546417236328, + "ref_logps/rejected": -33.837547302246094, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00828930176794529, + "rewards/margins": 0.0015991395339369774, + "rewards/rejected": -0.009888442233204842, + "step": 12 + }, + { + "epoch": 0.09811320754716982, + "grad_norm": 3.946864366531372, + "learning_rate": 2.4074074074074075e-06, + "logps/chosen": -30.485576629638672, + "logps/rejected": -39.38032531738281, + "loss": 0.6893, + "losses/dpo": 0.7040209174156189, + "losses/sft": 1.1447887420654297, + "losses/total": 0.7040209174156189, + "ref_logps/chosen": -30.43646240234375, + "ref_logps/rejected": -39.23271560668945, + "rewards/accuracies": 0.5078125, + "rewards/chosen": -0.004911163821816444, + "rewards/margins": 0.009850391186773777, + "rewards/rejected": -0.014761554077267647, + "step": 13 + }, + { + "epoch": 0.10566037735849057, + "grad_norm": 3.5996339321136475, + "learning_rate": 2.5925925925925925e-06, + "logps/chosen": -27.093900680541992, + "logps/rejected": -33.41731643676758, + "loss": 0.6858, + "losses/dpo": 0.6817034482955933, + "losses/sft": 0.8694231510162354, + "losses/total": 0.6817034482955933, + "ref_logps/chosen": -27.065704345703125, + "ref_logps/rejected": -33.2171630859375, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.002819519955664873, + "rewards/margins": 0.017195925116539, + "rewards/rejected": -0.020015446469187737, + "step": 14 + }, + { + "epoch": 0.11320754716981132, + "grad_norm": 3.8346712589263916, + "learning_rate": 2.7777777777777783e-06, + "logps/chosen": -28.581281661987305, + "logps/rejected": -34.18381118774414, + "loss": 0.686, + "losses/dpo": 0.7017788290977478, + "losses/sft": 1.0305365324020386, + "losses/total": 0.7017788290977478, + "ref_logps/chosen": -28.47262191772461, + "ref_logps/rejected": -33.8912353515625, + "rewards/accuracies": 0.5703125, + "rewards/chosen": -0.010865979827940464, + "rewards/margins": 0.01839156076312065, + "rewards/rejected": -0.02925753779709339, + "step": 15 + }, + { + "epoch": 0.12075471698113208, + "grad_norm": 3.754934072494507, + "learning_rate": 2.962962962962963e-06, + "logps/chosen": -30.27764892578125, + "logps/rejected": -31.89042854309082, + "loss": 0.6933, + "losses/dpo": 0.6706632375717163, + "losses/sft": 0.9468050599098206, + "losses/total": 0.6706632375717163, + "ref_logps/chosen": -29.9567813873291, + "ref_logps/rejected": -31.522958755493164, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.03208652138710022, + "rewards/margins": 0.004660369828343391, + "rewards/rejected": -0.03674689307808876, + "step": 16 + }, + { + "epoch": 0.12830188679245283, + "grad_norm": 4.00182580947876, + "learning_rate": 3.1481481481481483e-06, + "logps/chosen": -31.08722686767578, + "logps/rejected": -35.48697280883789, + "loss": 0.6834, + "losses/dpo": 0.7330925464630127, + "losses/sft": 0.9602083563804626, + "losses/total": 0.7330925464630127, + "ref_logps/chosen": -30.763084411621094, + "ref_logps/rejected": -34.88302993774414, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.03241410106420517, + "rewards/margins": 0.02798011153936386, + "rewards/rejected": -0.06039421260356903, + "step": 17 + }, + { + "epoch": 0.13584905660377358, + "grad_norm": 3.9149599075317383, + "learning_rate": 3.3333333333333333e-06, + "logps/chosen": -29.620763778686523, + "logps/rejected": -34.89619827270508, + "loss": 0.667, + "losses/dpo": 0.6938140988349915, + "losses/sft": 1.1796362400054932, + "losses/total": 0.6938140988349915, + "ref_logps/chosen": -29.360824584960938, + "ref_logps/rejected": -34.01747512817383, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.02599395252764225, + "rewards/margins": 0.061878398060798645, + "rewards/rejected": -0.08787235617637634, + "step": 18 + }, + { + "epoch": 0.14339622641509434, + "grad_norm": 3.761768341064453, + "learning_rate": 3.5185185185185187e-06, + "logps/chosen": -25.612323760986328, + "logps/rejected": -36.279903411865234, + "loss": 0.6632, + "losses/dpo": 0.7176868915557861, + "losses/sft": 0.962547242641449, + "losses/total": 0.7176868915557861, + "ref_logps/chosen": -25.287181854248047, + "ref_logps/rejected": -35.18791961669922, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.03251434862613678, + "rewards/margins": 0.07668425142765045, + "rewards/rejected": -0.10919859260320663, + "step": 19 + }, + { + "epoch": 0.1509433962264151, + "grad_norm": 3.7553532123565674, + "learning_rate": 3.7037037037037037e-06, + "logps/chosen": -30.204524993896484, + "logps/rejected": -35.833290100097656, + "loss": 0.6655, + "losses/dpo": 0.6513245701789856, + "losses/sft": 0.7598574161529541, + "losses/total": 0.6513245701789856, + "ref_logps/chosen": -29.58572769165039, + "ref_logps/rejected": -34.43686294555664, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.061879415065050125, + "rewards/margins": 0.07776333391666412, + "rewards/rejected": -0.13964274525642395, + "step": 20 + }, + { + "epoch": 0.15849056603773584, + "grad_norm": 4.008821487426758, + "learning_rate": 3.88888888888889e-06, + "logps/chosen": -30.718704223632812, + "logps/rejected": -41.57155990600586, + "loss": 0.6565, + "losses/dpo": 0.6501861810684204, + "losses/sft": 0.9706050157546997, + "losses/total": 0.6501861810684204, + "ref_logps/chosen": -29.69339942932129, + "ref_logps/rejected": -39.531036376953125, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.1025303453207016, + "rewards/margins": 0.10152260214090347, + "rewards/rejected": -0.20405295491218567, + "step": 21 + }, + { + "epoch": 0.1660377358490566, + "grad_norm": 4.064249515533447, + "learning_rate": 4.074074074074074e-06, + "logps/chosen": -29.448623657226562, + "logps/rejected": -37.82110595703125, + "loss": 0.6745, + "losses/dpo": 0.5472462177276611, + "losses/sft": 0.8530066013336182, + "losses/total": 0.5472462177276611, + "ref_logps/chosen": -28.203754425048828, + "ref_logps/rejected": -35.803627014160156, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.12448696047067642, + "rewards/margins": 0.07726091891527176, + "rewards/rejected": -0.20174787938594818, + "step": 22 + }, + { + "epoch": 0.17358490566037735, + "grad_norm": 3.9436683654785156, + "learning_rate": 4.2592592592592596e-06, + "logps/chosen": -29.838685989379883, + "logps/rejected": -41.74559020996094, + "loss": 0.6559, + "losses/dpo": 0.6158649921417236, + "losses/sft": 1.2145159244537354, + "losses/total": 0.6158649921417236, + "ref_logps/chosen": -28.296924591064453, + "ref_logps/rejected": -38.93891525268555, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.15417605638504028, + "rewards/margins": 0.1264912486076355, + "rewards/rejected": -0.2806673049926758, + "step": 23 + }, + { + "epoch": 0.1811320754716981, + "grad_norm": 4.2482686042785645, + "learning_rate": 4.444444444444444e-06, + "logps/chosen": -32.36741638183594, + "logps/rejected": -39.53350067138672, + "loss": 0.6718, + "losses/dpo": 0.7839959859848022, + "losses/sft": 1.165102243423462, + "losses/total": 0.7839959859848022, + "ref_logps/chosen": -30.38999366760254, + "ref_logps/rejected": -36.677555084228516, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.19774213433265686, + "rewards/margins": 0.08785250037908554, + "rewards/rejected": -0.285594642162323, + "step": 24 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 4.061373710632324, + "learning_rate": 4.62962962962963e-06, + "logps/chosen": -30.96800422668457, + "logps/rejected": -36.01205825805664, + "loss": 0.665, + "losses/dpo": 0.5365759134292603, + "losses/sft": 1.1550390720367432, + "losses/total": 0.5365759134292603, + "ref_logps/chosen": -29.243209838867188, + "ref_logps/rejected": -33.078861236572266, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.17247943580150604, + "rewards/margins": 0.12084060907363892, + "rewards/rejected": -0.29332002997398376, + "step": 25 + }, + { + "epoch": 0.19622641509433963, + "grad_norm": 4.22770881652832, + "learning_rate": 4.814814814814815e-06, + "logps/chosen": -30.431867599487305, + "logps/rejected": -40.13795852661133, + "loss": 0.6457, + "losses/dpo": 0.6960878372192383, + "losses/sft": 0.7802775502204895, + "losses/total": 0.6960878372192383, + "ref_logps/chosen": -28.257612228393555, + "ref_logps/rejected": -36.28306579589844, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.21742568910121918, + "rewards/margins": 0.1680639088153839, + "rewards/rejected": -0.3854895830154419, + "step": 26 + }, + { + "epoch": 0.2037735849056604, + "grad_norm": 3.826033592224121, + "learning_rate": 5e-06, + "logps/chosen": -27.43597412109375, + "logps/rejected": -36.42435073852539, + "loss": 0.6108, + "losses/dpo": 0.6934707164764404, + "losses/sft": 0.7890737652778625, + "losses/total": 0.6934707164764404, + "ref_logps/chosen": -26.13240623474121, + "ref_logps/rejected": -32.615989685058594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.13035674393177032, + "rewards/margins": 0.25047942996025085, + "rewards/rejected": -0.38083615899086, + "step": 27 + }, + { + "epoch": 0.21132075471698114, + "grad_norm": 5.319561004638672, + "learning_rate": 4.978902953586498e-06, + "logps/chosen": -35.206329345703125, + "logps/rejected": -38.99248123168945, + "loss": 0.6844, + "losses/dpo": 0.7745039463043213, + "losses/sft": 1.3574930429458618, + "losses/total": 0.7745039463043213, + "ref_logps/chosen": -32.247459411621094, + "ref_logps/rejected": -34.93423080444336, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.29588693380355835, + "rewards/margins": 0.10993809252977371, + "rewards/rejected": -0.40582501888275146, + "step": 28 + }, + { + "epoch": 0.2188679245283019, + "grad_norm": 4.341159343719482, + "learning_rate": 4.957805907172996e-06, + "logps/chosen": -32.983768463134766, + "logps/rejected": -42.1301383972168, + "loss": 0.6272, + "losses/dpo": 0.6987279653549194, + "losses/sft": 1.430372953414917, + "losses/total": 0.6987279653549194, + "ref_logps/chosen": -30.753263473510742, + "ref_logps/rejected": -37.52302551269531, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.22305050492286682, + "rewards/margins": 0.2376612424850464, + "rewards/rejected": -0.4607117772102356, + "step": 29 + }, + { + "epoch": 0.22641509433962265, + "grad_norm": 4.440830230712891, + "learning_rate": 4.936708860759495e-06, + "logps/chosen": -32.3709716796875, + "logps/rejected": -40.325225830078125, + "loss": 0.6576, + "losses/dpo": 0.6192151308059692, + "losses/sft": 1.148033618927002, + "losses/total": 0.6192151308059692, + "ref_logps/chosen": -29.924030303955078, + "ref_logps/rejected": -36.34340286254883, + "rewards/accuracies": 0.5703125, + "rewards/chosen": -0.24469399452209473, + "rewards/margins": 0.153488427400589, + "rewards/rejected": -0.39818239212036133, + "step": 30 + }, + { + "epoch": 0.2339622641509434, + "grad_norm": 4.891458988189697, + "learning_rate": 4.915611814345992e-06, + "logps/chosen": -31.931894302368164, + "logps/rejected": -42.4509391784668, + "loss": 0.6329, + "losses/dpo": 0.6155243515968323, + "losses/sft": 0.8349864482879639, + "losses/total": 0.6155243515968323, + "ref_logps/chosen": -29.68793487548828, + "ref_logps/rejected": -38.151771545410156, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.22439587116241455, + "rewards/margins": 0.20552130043506622, + "rewards/rejected": -0.4299171566963196, + "step": 31 + }, + { + "epoch": 0.24150943396226415, + "grad_norm": 4.204699993133545, + "learning_rate": 4.89451476793249e-06, + "logps/chosen": -29.974943161010742, + "logps/rejected": -39.986690521240234, + "loss": 0.6125, + "losses/dpo": 0.513115644454956, + "losses/sft": 1.2455755472183228, + "losses/total": 0.513115644454956, + "ref_logps/chosen": -28.33128547668457, + "ref_logps/rejected": -36.01262664794922, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.1643659621477127, + "rewards/margins": 0.23304040729999542, + "rewards/rejected": -0.39740633964538574, + "step": 32 + }, + { + "epoch": 0.2490566037735849, + "grad_norm": 3.757606267929077, + "learning_rate": 4.873417721518987e-06, + "logps/chosen": -26.58209228515625, + "logps/rejected": -33.947696685791016, + "loss": 0.6025, + "losses/dpo": 0.560856819152832, + "losses/sft": 0.8093036413192749, + "losses/total": 0.560856819152832, + "ref_logps/chosen": -25.453628540039062, + "ref_logps/rejected": -30.35793685913086, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.11284616589546204, + "rewards/margins": 0.2461298555135727, + "rewards/rejected": -0.35897600650787354, + "step": 33 + }, + { + "epoch": 0.25660377358490566, + "grad_norm": 4.396605968475342, + "learning_rate": 4.852320675105486e-06, + "logps/chosen": -33.36553192138672, + "logps/rejected": -41.59575653076172, + "loss": 0.6381, + "losses/dpo": 0.6220612525939941, + "losses/sft": 1.1687374114990234, + "losses/total": 0.6220612525939941, + "ref_logps/chosen": -31.325790405273438, + "ref_logps/rejected": -37.433998107910156, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.2039741724729538, + "rewards/margins": 0.21220101416110992, + "rewards/rejected": -0.4161751866340637, + "step": 34 + }, + { + "epoch": 0.2641509433962264, + "grad_norm": 4.430360794067383, + "learning_rate": 4.831223628691984e-06, + "logps/chosen": -32.509361267089844, + "logps/rejected": -40.17280578613281, + "loss": 0.6123, + "losses/dpo": 0.7444272041320801, + "losses/sft": 1.3237799406051636, + "losses/total": 0.7444272041320801, + "ref_logps/chosen": -30.152652740478516, + "ref_logps/rejected": -35.37242126464844, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23567090928554535, + "rewards/margins": 0.2443673312664032, + "rewards/rejected": -0.48003822565078735, + "step": 35 + }, + { + "epoch": 0.27169811320754716, + "grad_norm": 4.685129642486572, + "learning_rate": 4.8101265822784815e-06, + "logps/chosen": -33.738468170166016, + "logps/rejected": -43.074119567871094, + "loss": 0.6086, + "losses/dpo": 0.5348072052001953, + "losses/sft": 0.8256391286849976, + "losses/total": 0.5348072052001953, + "ref_logps/chosen": -30.881942749023438, + "ref_logps/rejected": -37.54746627807617, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.2856525182723999, + "rewards/margins": 0.2670130133628845, + "rewards/rejected": -0.5526655316352844, + "step": 36 + }, + { + "epoch": 0.2792452830188679, + "grad_norm": 4.623603343963623, + "learning_rate": 4.789029535864979e-06, + "logps/chosen": -31.62742805480957, + "logps/rejected": -38.05494689941406, + "loss": 0.6021, + "losses/dpo": 0.6446419358253479, + "losses/sft": 0.8820241689682007, + "losses/total": 0.6446419358253479, + "ref_logps/chosen": -29.002288818359375, + "ref_logps/rejected": -32.60838317871094, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.26251420378685, + "rewards/margins": 0.2821422219276428, + "rewards/rejected": -0.5446563959121704, + "step": 37 + }, + { + "epoch": 0.28679245283018867, + "grad_norm": 5.011680603027344, + "learning_rate": 4.767932489451477e-06, + "logps/chosen": -34.585872650146484, + "logps/rejected": -41.604644775390625, + "loss": 0.6674, + "losses/dpo": 0.6461673974990845, + "losses/sft": 1.2532376050949097, + "losses/total": 0.6461673974990845, + "ref_logps/chosen": -30.655323028564453, + "ref_logps/rejected": -35.37353515625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3930549621582031, + "rewards/margins": 0.23005636036396027, + "rewards/rejected": -0.623111367225647, + "step": 38 + }, + { + "epoch": 0.2943396226415094, + "grad_norm": 4.6953020095825195, + "learning_rate": 4.746835443037975e-06, + "logps/chosen": -31.12554931640625, + "logps/rejected": -39.2439079284668, + "loss": 0.6073, + "losses/dpo": 0.5849568843841553, + "losses/sft": 1.0929570198059082, + "losses/total": 0.5849568843841553, + "ref_logps/chosen": -27.928836822509766, + "ref_logps/rejected": -32.81389617919922, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3196712136268616, + "rewards/margins": 0.323330283164978, + "rewards/rejected": -0.6430015563964844, + "step": 39 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 5.162503242492676, + "learning_rate": 4.725738396624473e-06, + "logps/chosen": -30.529884338378906, + "logps/rejected": -39.740753173828125, + "loss": 0.6392, + "losses/dpo": 0.7722287774085999, + "losses/sft": 1.5352623462677002, + "losses/total": 0.7722287774085999, + "ref_logps/chosen": -25.823863983154297, + "ref_logps/rejected": -32.63992691040039, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.4706021547317505, + "rewards/margins": 0.23948083817958832, + "rewards/rejected": -0.7100830078125, + "step": 40 + }, + { + "epoch": 0.30943396226415093, + "grad_norm": 4.303088188171387, + "learning_rate": 4.7046413502109714e-06, + "logps/chosen": -30.205230712890625, + "logps/rejected": -41.375083923339844, + "loss": 0.5456, + "losses/dpo": 0.5303640961647034, + "losses/sft": 1.0894774198532104, + "losses/total": 0.5303640961647034, + "ref_logps/chosen": -26.36581039428711, + "ref_logps/rejected": -32.568206787109375, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.38394197821617126, + "rewards/margins": 0.4967448115348816, + "rewards/rejected": -0.8806868195533752, + "step": 41 + }, + { + "epoch": 0.3169811320754717, + "grad_norm": 4.682536602020264, + "learning_rate": 4.683544303797468e-06, + "logps/chosen": -34.19953155517578, + "logps/rejected": -44.52813720703125, + "loss": 0.5636, + "losses/dpo": 0.7600305676460266, + "losses/sft": 1.3765720129013062, + "losses/total": 0.7600305676460266, + "ref_logps/chosen": -30.037071228027344, + "ref_logps/rejected": -35.797386169433594, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.416246235370636, + "rewards/margins": 0.456828773021698, + "rewards/rejected": -0.873075008392334, + "step": 42 + }, + { + "epoch": 0.32452830188679244, + "grad_norm": 4.896885395050049, + "learning_rate": 4.662447257383967e-06, + "logps/chosen": -34.50700759887695, + "logps/rejected": -43.02534484863281, + "loss": 0.5957, + "losses/dpo": 0.5419960618019104, + "losses/sft": 1.3564319610595703, + "losses/total": 0.5419960618019104, + "ref_logps/chosen": -30.103796005249023, + "ref_logps/rejected": -34.897117614746094, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.44032126665115356, + "rewards/margins": 0.3725017309188843, + "rewards/rejected": -0.8128229975700378, + "step": 43 + }, + { + "epoch": 0.3320754716981132, + "grad_norm": 5.244832992553711, + "learning_rate": 4.641350210970465e-06, + "logps/chosen": -30.753883361816406, + "logps/rejected": -42.095909118652344, + "loss": 0.6235, + "losses/dpo": 0.789696216583252, + "losses/sft": 1.1438733339309692, + "losses/total": 0.789696216583252, + "ref_logps/chosen": -26.017452239990234, + "ref_logps/rejected": -33.98859405517578, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.47364309430122375, + "rewards/margins": 0.33708813786506653, + "rewards/rejected": -0.8107312917709351, + "step": 44 + }, + { + "epoch": 0.33962264150943394, + "grad_norm": 5.0572896003723145, + "learning_rate": 4.620253164556963e-06, + "logps/chosen": -34.20557403564453, + "logps/rejected": -41.09657287597656, + "loss": 0.6262, + "losses/dpo": 0.6448432803153992, + "losses/sft": 0.9824965596199036, + "losses/total": 0.6448432803153992, + "ref_logps/chosen": -28.524147033691406, + "ref_logps/rejected": -31.738750457763672, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.5681423544883728, + "rewards/margins": 0.36763995885849, + "rewards/rejected": -0.9357823133468628, + "step": 45 + }, + { + "epoch": 0.3471698113207547, + "grad_norm": 5.253727912902832, + "learning_rate": 4.5991561181434605e-06, + "logps/chosen": -34.27809143066406, + "logps/rejected": -44.58618927001953, + "loss": 0.5952, + "losses/dpo": 0.7227557897567749, + "losses/sft": 1.337683916091919, + "losses/total": 0.7227557897567749, + "ref_logps/chosen": -28.621898651123047, + "ref_logps/rejected": -34.67859649658203, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.5656192302703857, + "rewards/margins": 0.4251391291618347, + "rewards/rejected": -0.9907584190368652, + "step": 46 + }, + { + "epoch": 0.35471698113207545, + "grad_norm": 4.966336250305176, + "learning_rate": 4.578059071729958e-06, + "logps/chosen": -38.38945007324219, + "logps/rejected": -44.60417175292969, + "loss": 0.5908, + "losses/dpo": 0.6308821439743042, + "losses/sft": 1.1848210096359253, + "losses/total": 0.6308821439743042, + "ref_logps/chosen": -32.108848571777344, + "ref_logps/rejected": -33.745201110839844, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -0.6280601620674133, + "rewards/margins": 0.4578371047973633, + "rewards/rejected": -1.0858973264694214, + "step": 47 + }, + { + "epoch": 0.3622641509433962, + "grad_norm": 4.452719688415527, + "learning_rate": 4.556962025316456e-06, + "logps/chosen": -33.836082458496094, + "logps/rejected": -45.02879333496094, + "loss": 0.5266, + "losses/dpo": 0.44185441732406616, + "losses/sft": 0.9253690242767334, + "losses/total": 0.44185441732406616, + "ref_logps/chosen": -28.681163787841797, + "ref_logps/rejected": -34.032012939453125, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.5154916644096375, + "rewards/margins": 0.5841861367225647, + "rewards/rejected": -1.0996778011322021, + "step": 48 + }, + { + "epoch": 0.36981132075471695, + "grad_norm": 4.610968112945557, + "learning_rate": 4.535864978902954e-06, + "logps/chosen": -29.84048080444336, + "logps/rejected": -43.44829559326172, + "loss": 0.532, + "losses/dpo": 0.501494824886322, + "losses/sft": 1.050083875656128, + "losses/total": 0.501494824886322, + "ref_logps/chosen": -24.668697357177734, + "ref_logps/rejected": -32.59248733520508, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5171782374382019, + "rewards/margins": 0.5684031248092651, + "rewards/rejected": -1.0855813026428223, + "step": 49 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 4.861355304718018, + "learning_rate": 4.514767932489452e-06, + "logps/chosen": -36.48255920410156, + "logps/rejected": -49.0556755065918, + "loss": 0.5485, + "losses/dpo": 0.48369336128234863, + "losses/sft": 1.1876718997955322, + "losses/total": 0.48369336128234863, + "ref_logps/chosen": -29.26421356201172, + "ref_logps/rejected": -35.85875701904297, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.7218344211578369, + "rewards/margins": 0.5978572368621826, + "rewards/rejected": -1.3196916580200195, + "step": 50 + }, + { + "epoch": 0.3849056603773585, + "grad_norm": 5.479549884796143, + "learning_rate": 4.4936708860759495e-06, + "logps/chosen": -37.56206512451172, + "logps/rejected": -47.11095428466797, + "loss": 0.5811, + "losses/dpo": 0.5307995676994324, + "losses/sft": 1.2354857921600342, + "losses/total": 0.5307995676994324, + "ref_logps/chosen": -30.669218063354492, + "ref_logps/rejected": -34.38352966308594, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.6892848014831543, + "rewards/margins": 0.5834579467773438, + "rewards/rejected": -1.272742748260498, + "step": 51 + }, + { + "epoch": 0.39245283018867927, + "grad_norm": 5.13350248336792, + "learning_rate": 4.472573839662447e-06, + "logps/chosen": -32.728759765625, + "logps/rejected": -46.00183868408203, + "loss": 0.5574, + "losses/dpo": 0.6999551057815552, + "losses/sft": 1.6744334697723389, + "losses/total": 0.6999551057815552, + "ref_logps/chosen": -26.183910369873047, + "ref_logps/rejected": -33.9737548828125, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6544848680496216, + "rewards/margins": 0.5483235120773315, + "rewards/rejected": -1.2028083801269531, + "step": 52 + }, + { + "epoch": 0.4, + "grad_norm": 4.932290554046631, + "learning_rate": 4.451476793248945e-06, + "logps/chosen": -32.513145446777344, + "logps/rejected": -42.56912612915039, + "loss": 0.5682, + "losses/dpo": 0.3805396854877472, + "losses/sft": 1.036488652229309, + "losses/total": 0.3805396854877472, + "ref_logps/chosen": -25.970481872558594, + "ref_logps/rejected": -30.01813316345215, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.6542659401893616, + "rewards/margins": 0.6008330583572388, + "rewards/rejected": -1.2550990581512451, + "step": 53 + }, + { + "epoch": 0.4075471698113208, + "grad_norm": 5.151583194732666, + "learning_rate": 4.430379746835443e-06, + "logps/chosen": -37.81098175048828, + "logps/rejected": -44.5388069152832, + "loss": 0.5803, + "losses/dpo": 0.5972741842269897, + "losses/sft": 1.2775373458862305, + "losses/total": 0.5972741842269897, + "ref_logps/chosen": -30.793987274169922, + "ref_logps/rejected": -32.02744674682617, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.701699435710907, + "rewards/margins": 0.5494363903999329, + "rewards/rejected": -1.2511358261108398, + "step": 54 + }, + { + "epoch": 0.41509433962264153, + "grad_norm": 5.815583229064941, + "learning_rate": 4.409282700421942e-06, + "logps/chosen": -35.73405075073242, + "logps/rejected": -45.81892395019531, + "loss": 0.5914, + "losses/dpo": 0.7572274804115295, + "losses/sft": 1.0465750694274902, + "losses/total": 0.7572274804115295, + "ref_logps/chosen": -28.158559799194336, + "ref_logps/rejected": -32.85423278808594, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.7575492262840271, + "rewards/margins": 0.5389198064804077, + "rewards/rejected": -1.29646897315979, + "step": 55 + }, + { + "epoch": 0.4226415094339623, + "grad_norm": 5.257417678833008, + "learning_rate": 4.3881856540084394e-06, + "logps/chosen": -35.593929290771484, + "logps/rejected": -44.64434814453125, + "loss": 0.586, + "losses/dpo": 0.5316880345344543, + "losses/sft": 1.2705625295639038, + "losses/total": 0.5316880345344543, + "ref_logps/chosen": -28.618404388427734, + "ref_logps/rejected": -32.222808837890625, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.6975523829460144, + "rewards/margins": 0.5446016788482666, + "rewards/rejected": -1.2421541213989258, + "step": 56 + }, + { + "epoch": 0.43018867924528303, + "grad_norm": 5.2874603271484375, + "learning_rate": 4.367088607594937e-06, + "logps/chosen": -38.34560775756836, + "logps/rejected": -49.868648529052734, + "loss": 0.5292, + "losses/dpo": 0.4450991749763489, + "losses/sft": 1.307680368423462, + "losses/total": 0.4450991749763489, + "ref_logps/chosen": -30.47826385498047, + "ref_logps/rejected": -34.6351432800293, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -0.786734402179718, + "rewards/margins": 0.7366155982017517, + "rewards/rejected": -1.5233500003814697, + "step": 57 + }, + { + "epoch": 0.4377358490566038, + "grad_norm": 5.186312198638916, + "learning_rate": 4.345991561181435e-06, + "logps/chosen": -35.11970138549805, + "logps/rejected": -45.68661117553711, + "loss": 0.5706, + "losses/dpo": 0.7511149644851685, + "losses/sft": 1.2385737895965576, + "losses/total": 0.7511149644851685, + "ref_logps/chosen": -27.67850685119629, + "ref_logps/rejected": -31.763341903686523, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7441191673278809, + "rewards/margins": 0.6482076644897461, + "rewards/rejected": -1.392326831817627, + "step": 58 + }, + { + "epoch": 0.44528301886792454, + "grad_norm": 5.047269344329834, + "learning_rate": 4.324894514767933e-06, + "logps/chosen": -38.08847427368164, + "logps/rejected": -55.2148551940918, + "loss": 0.4994, + "losses/dpo": 0.6142607927322388, + "losses/sft": 1.288847804069519, + "losses/total": 0.6142607927322388, + "ref_logps/chosen": -30.37006378173828, + "ref_logps/rejected": -38.910037994384766, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.7718411684036255, + "rewards/margins": 0.8586408495903015, + "rewards/rejected": -1.6304820775985718, + "step": 59 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 6.175255298614502, + "learning_rate": 4.303797468354431e-06, + "logps/chosen": -38.292877197265625, + "logps/rejected": -48.04629898071289, + "loss": 0.6101, + "losses/dpo": 0.4248647093772888, + "losses/sft": 1.304377555847168, + "losses/total": 0.4248647093772888, + "ref_logps/chosen": -29.055316925048828, + "ref_logps/rejected": -33.995643615722656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9237565994262695, + "rewards/margins": 0.4813089966773987, + "rewards/rejected": -1.4050655364990234, + "step": 60 + }, + { + "epoch": 0.46037735849056605, + "grad_norm": 5.533387660980225, + "learning_rate": 4.2827004219409285e-06, + "logps/chosen": -36.77204895019531, + "logps/rejected": -52.59957504272461, + "loss": 0.518, + "losses/dpo": 0.552452027797699, + "losses/sft": 1.484251856803894, + "losses/total": 0.552452027797699, + "ref_logps/chosen": -29.142227172851562, + "ref_logps/rejected": -36.99250793457031, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.7629822492599487, + "rewards/margins": 0.7977244853973389, + "rewards/rejected": -1.560706615447998, + "step": 61 + }, + { + "epoch": 0.4679245283018868, + "grad_norm": 5.486879825592041, + "learning_rate": 4.261603375527426e-06, + "logps/chosen": -38.279579162597656, + "logps/rejected": -46.59737014770508, + "loss": 0.5401, + "losses/dpo": 0.6086790561676025, + "losses/sft": 1.462537407875061, + "losses/total": 0.6086790561676025, + "ref_logps/chosen": -30.90871810913086, + "ref_logps/rejected": -32.91778564453125, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.7370861172676086, + "rewards/margins": 0.6308723092079163, + "rewards/rejected": -1.367958426475525, + "step": 62 + }, + { + "epoch": 0.47547169811320755, + "grad_norm": 5.4317240715026855, + "learning_rate": 4.240506329113924e-06, + "logps/chosen": -35.83028030395508, + "logps/rejected": -49.950408935546875, + "loss": 0.5264, + "losses/dpo": 0.6195108294487, + "losses/sft": 1.6638743877410889, + "losses/total": 0.6195108294487, + "ref_logps/chosen": -27.65540313720703, + "ref_logps/rejected": -34.44922637939453, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8174874782562256, + "rewards/margins": 0.732630729675293, + "rewards/rejected": -1.5501182079315186, + "step": 63 + }, + { + "epoch": 0.4830188679245283, + "grad_norm": 5.210587978363037, + "learning_rate": 4.219409282700423e-06, + "logps/chosen": -36.09168243408203, + "logps/rejected": -49.114288330078125, + "loss": 0.5335, + "losses/dpo": 0.39958345890045166, + "losses/sft": 1.4642709493637085, + "losses/total": 0.39958345890045166, + "ref_logps/chosen": -26.870357513427734, + "ref_logps/rejected": -32.58376693725586, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9221324920654297, + "rewards/margins": 0.7309194207191467, + "rewards/rejected": -1.6530518531799316, + "step": 64 + }, + { + "epoch": 0.49056603773584906, + "grad_norm": 6.943666934967041, + "learning_rate": 4.19831223628692e-06, + "logps/chosen": -41.26749801635742, + "logps/rejected": -50.411800384521484, + "loss": 0.5908, + "losses/dpo": 0.6376281380653381, + "losses/sft": 1.720862865447998, + "losses/total": 0.6376281380653381, + "ref_logps/chosen": -31.677553176879883, + "ref_logps/rejected": -34.83952331542969, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9589947462081909, + "rewards/margins": 0.598233163356781, + "rewards/rejected": -1.5572278499603271, + "step": 65 + }, + { + "epoch": 0.4981132075471698, + "grad_norm": 5.59391975402832, + "learning_rate": 4.177215189873418e-06, + "logps/chosen": -40.36487579345703, + "logps/rejected": -57.310428619384766, + "loss": 0.4791, + "losses/dpo": 0.3789316713809967, + "losses/sft": 1.1292752027511597, + "losses/total": 0.3789316713809967, + "ref_logps/chosen": -30.15877342224121, + "ref_logps/rejected": -38.379600524902344, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -1.020609974861145, + "rewards/margins": 0.8724727630615234, + "rewards/rejected": -1.8930827379226685, + "step": 66 + }, + { + "epoch": 0.5056603773584906, + "grad_norm": 5.978224277496338, + "learning_rate": 4.156118143459915e-06, + "logps/chosen": -37.91278076171875, + "logps/rejected": -50.369380950927734, + "loss": 0.529, + "losses/dpo": 0.754231870174408, + "losses/sft": 1.3422857522964478, + "losses/total": 0.754231870174408, + "ref_logps/chosen": -27.58785629272461, + "ref_logps/rejected": -32.66661834716797, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.0324923992156982, + "rewards/margins": 0.7377833127975464, + "rewards/rejected": -1.7702758312225342, + "step": 67 + }, + { + "epoch": 0.5132075471698113, + "grad_norm": 5.108936309814453, + "learning_rate": 4.135021097046414e-06, + "logps/chosen": -38.47068786621094, + "logps/rejected": -53.993675231933594, + "loss": 0.4518, + "losses/dpo": 0.5163459777832031, + "losses/sft": 0.7686138153076172, + "losses/total": 0.5163459777832031, + "ref_logps/chosen": -29.918363571166992, + "ref_logps/rejected": -34.99020004272461, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.8552323579788208, + "rewards/margins": 1.0451147556304932, + "rewards/rejected": -1.9003472328186035, + "step": 68 + }, + { + "epoch": 0.5207547169811321, + "grad_norm": 5.734493255615234, + "learning_rate": 4.113924050632912e-06, + "logps/chosen": -39.34405517578125, + "logps/rejected": -57.34173583984375, + "loss": 0.5283, + "losses/dpo": 0.2836895287036896, + "losses/sft": 1.1457918882369995, + "losses/total": 0.2836895287036896, + "ref_logps/chosen": -28.0538330078125, + "ref_logps/rejected": -37.21113586425781, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -1.1290223598480225, + "rewards/margins": 0.8840377926826477, + "rewards/rejected": -2.0130600929260254, + "step": 69 + }, + { + "epoch": 0.5283018867924528, + "grad_norm": 5.639418601989746, + "learning_rate": 4.09282700421941e-06, + "logps/chosen": -39.54931640625, + "logps/rejected": -51.75471496582031, + "loss": 0.5555, + "losses/dpo": 0.7307843565940857, + "losses/sft": 1.650888442993164, + "losses/total": 0.7307843565940857, + "ref_logps/chosen": -27.878376007080078, + "ref_logps/rejected": -31.94900894165039, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -1.1670942306518555, + "rewards/margins": 0.8134759664535522, + "rewards/rejected": -1.9805700778961182, + "step": 70 + }, + { + "epoch": 0.5358490566037736, + "grad_norm": 6.629848003387451, + "learning_rate": 4.0717299578059074e-06, + "logps/chosen": -39.743858337402344, + "logps/rejected": -54.10401153564453, + "loss": 0.5613, + "losses/dpo": 0.42555686831474304, + "losses/sft": 1.4092556238174438, + "losses/total": 0.42555686831474304, + "ref_logps/chosen": -28.001995086669922, + "ref_logps/rejected": -34.528568267822266, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -1.1741865873336792, + "rewards/margins": 0.7833576798439026, + "rewards/rejected": -1.9575443267822266, + "step": 71 + }, + { + "epoch": 0.5433962264150943, + "grad_norm": 6.291466236114502, + "learning_rate": 4.050632911392405e-06, + "logps/chosen": -42.26633834838867, + "logps/rejected": -59.72451400756836, + "loss": 0.5599, + "losses/dpo": 0.5707880854606628, + "losses/sft": 1.4650211334228516, + "losses/total": 0.5707880854606628, + "ref_logps/chosen": -28.813255310058594, + "ref_logps/rejected": -38.030792236328125, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -1.345308542251587, + "rewards/margins": 0.8240638971328735, + "rewards/rejected": -2.16937255859375, + "step": 72 + }, + { + "epoch": 0.5509433962264151, + "grad_norm": 5.905974864959717, + "learning_rate": 4.029535864978903e-06, + "logps/chosen": -40.998741149902344, + "logps/rejected": -58.57215118408203, + "loss": 0.4874, + "losses/dpo": 0.4569835364818573, + "losses/sft": 1.3307000398635864, + "losses/total": 0.4569835364818573, + "ref_logps/chosen": -28.905885696411133, + "ref_logps/rejected": -36.10292053222656, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -1.2092852592468262, + "rewards/margins": 1.0376380681991577, + "rewards/rejected": -2.2469234466552734, + "step": 73 + }, + { + "epoch": 0.5584905660377358, + "grad_norm": 7.09720516204834, + "learning_rate": 4.008438818565401e-06, + "logps/chosen": -44.92335510253906, + "logps/rejected": -56.01509475708008, + "loss": 0.6831, + "losses/dpo": 1.1340866088867188, + "losses/sft": 1.490488052368164, + "losses/total": 1.1340866088867188, + "ref_logps/chosen": -30.007701873779297, + "ref_logps/rejected": -35.08796691894531, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4915653467178345, + "rewards/margins": 0.6011477708816528, + "rewards/rejected": -2.0927131175994873, + "step": 74 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 5.746171951293945, + "learning_rate": 3.9873417721518995e-06, + "logps/chosen": -41.865482330322266, + "logps/rejected": -59.758544921875, + "loss": 0.4792, + "losses/dpo": 0.5183165669441223, + "losses/sft": 1.4497301578521729, + "losses/total": 0.5183165669441223, + "ref_logps/chosen": -29.29530143737793, + "ref_logps/rejected": -35.658206939697266, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2570182085037231, + "rewards/margins": 1.1530158519744873, + "rewards/rejected": -2.4100341796875, + "step": 75 + }, + { + "epoch": 0.5735849056603773, + "grad_norm": 5.843383312225342, + "learning_rate": 3.9662447257383965e-06, + "logps/chosen": -42.45313262939453, + "logps/rejected": -57.09604263305664, + "loss": 0.5201, + "losses/dpo": 0.44522571563720703, + "losses/sft": 1.3398542404174805, + "losses/total": 0.44522571563720703, + "ref_logps/chosen": -29.47281265258789, + "ref_logps/rejected": -35.2691650390625, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2980321645736694, + "rewards/margins": 0.8846558928489685, + "rewards/rejected": -2.182687997817993, + "step": 76 + }, + { + "epoch": 0.5811320754716981, + "grad_norm": 5.965431213378906, + "learning_rate": 3.945147679324895e-06, + "logps/chosen": -39.575950622558594, + "logps/rejected": -53.393741607666016, + "loss": 0.5362, + "losses/dpo": 0.3466046452522278, + "losses/sft": 1.3638067245483398, + "losses/total": 0.3466046452522278, + "ref_logps/chosen": -28.15513038635254, + "ref_logps/rejected": -34.03931427001953, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -1.1420820951461792, + "rewards/margins": 0.7933610677719116, + "rewards/rejected": -1.9354430437088013, + "step": 77 + }, + { + "epoch": 0.5886792452830188, + "grad_norm": 4.78204345703125, + "learning_rate": 3.924050632911393e-06, + "logps/chosen": -38.114864349365234, + "logps/rejected": -59.80986022949219, + "loss": 0.4152, + "losses/dpo": 0.43964630365371704, + "losses/sft": 1.4384866952896118, + "losses/total": 0.43964630365371704, + "ref_logps/chosen": -28.37373161315918, + "ref_logps/rejected": -37.50841522216797, + "rewards/accuracies": 0.828125, + "rewards/chosen": -0.9741131067276001, + "rewards/margins": 1.2560316324234009, + "rewards/rejected": -2.23014497756958, + "step": 78 + }, + { + "epoch": 0.5962264150943396, + "grad_norm": 4.994002819061279, + "learning_rate": 3.902953586497891e-06, + "logps/chosen": -40.175628662109375, + "logps/rejected": -57.650360107421875, + "loss": 0.4189, + "losses/dpo": 0.22024545073509216, + "losses/sft": 1.0659160614013672, + "losses/total": 0.22024545073509216, + "ref_logps/chosen": -29.503093719482422, + "ref_logps/rejected": -34.744808197021484, + "rewards/accuracies": 0.828125, + "rewards/chosen": -1.0672534704208374, + "rewards/margins": 1.2233017683029175, + "rewards/rejected": -2.290555238723755, + "step": 79 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 6.177035808563232, + "learning_rate": 3.8818565400843886e-06, + "logps/chosen": -43.75751495361328, + "logps/rejected": -60.642181396484375, + "loss": 0.5021, + "losses/dpo": 0.20048275589942932, + "losses/sft": 1.5765597820281982, + "losses/total": 0.20048275589942932, + "ref_logps/chosen": -30.938560485839844, + "ref_logps/rejected": -36.10749816894531, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -1.2818955183029175, + "rewards/margins": 1.1715729236602783, + "rewards/rejected": -2.4534683227539062, + "step": 80 + }, + { + "epoch": 0.6113207547169811, + "grad_norm": 6.305562973022461, + "learning_rate": 3.860759493670886e-06, + "logps/chosen": -46.60446548461914, + "logps/rejected": -58.49646759033203, + "loss": 0.5512, + "losses/dpo": 0.4569854736328125, + "losses/sft": 1.9612996578216553, + "losses/total": 0.4569854736328125, + "ref_logps/chosen": -32.67444610595703, + "ref_logps/rejected": -34.81311798095703, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3930021524429321, + "rewards/margins": 0.9753324389457703, + "rewards/rejected": -2.3683345317840576, + "step": 81 + }, + { + "epoch": 0.6188679245283019, + "grad_norm": 6.709742546081543, + "learning_rate": 3.839662447257384e-06, + "logps/chosen": -41.50311279296875, + "logps/rejected": -53.91865158081055, + "loss": 0.6351, + "losses/dpo": 0.5462090969085693, + "losses/sft": 1.3806183338165283, + "losses/total": 0.5462090969085693, + "ref_logps/chosen": -28.80118751525879, + "ref_logps/rejected": -33.822689056396484, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2701926231384277, + "rewards/margins": 0.7394037842750549, + "rewards/rejected": -2.009596347808838, + "step": 82 + }, + { + "epoch": 0.6264150943396226, + "grad_norm": 6.182176113128662, + "learning_rate": 3.818565400843882e-06, + "logps/chosen": -40.66282653808594, + "logps/rejected": -53.295135498046875, + "loss": 0.554, + "losses/dpo": 0.7928386926651001, + "losses/sft": 1.208125114440918, + "losses/total": 0.7928386926651001, + "ref_logps/chosen": -29.511093139648438, + "ref_logps/rejected": -34.15443801879883, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -1.1151734590530396, + "rewards/margins": 0.7988965511322021, + "rewards/rejected": -1.9140698909759521, + "step": 83 + }, + { + "epoch": 0.6339622641509434, + "grad_norm": 6.570309162139893, + "learning_rate": 3.7974683544303802e-06, + "logps/chosen": -42.49887466430664, + "logps/rejected": -57.260040283203125, + "loss": 0.5188, + "losses/dpo": 0.38705140352249146, + "losses/sft": 1.4572505950927734, + "losses/total": 0.38705140352249146, + "ref_logps/chosen": -30.53290367126465, + "ref_logps/rejected": -35.829559326171875, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.196596622467041, + "rewards/margins": 0.9464513063430786, + "rewards/rejected": -2.143048048019409, + "step": 84 + }, + { + "epoch": 0.6415094339622641, + "grad_norm": 5.353418827056885, + "learning_rate": 3.776371308016878e-06, + "logps/chosen": -39.217124938964844, + "logps/rejected": -57.946510314941406, + "loss": 0.4608, + "losses/dpo": 0.28930217027664185, + "losses/sft": 1.2899055480957031, + "losses/total": 0.28930217027664185, + "ref_logps/chosen": -29.44240951538086, + "ref_logps/rejected": -36.399539947509766, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9774720072746277, + "rewards/margins": 1.1772253513336182, + "rewards/rejected": -2.1546974182128906, + "step": 85 + }, + { + "epoch": 0.6490566037735849, + "grad_norm": 6.3269782066345215, + "learning_rate": 3.755274261603376e-06, + "logps/chosen": -40.942657470703125, + "logps/rejected": -52.808250427246094, + "loss": 0.604, + "losses/dpo": 0.24938051402568817, + "losses/sft": 1.5317809581756592, + "losses/total": 0.24938051402568817, + "ref_logps/chosen": -30.415924072265625, + "ref_logps/rejected": -35.01182174682617, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0526734590530396, + "rewards/margins": 0.7269693613052368, + "rewards/rejected": -1.7796428203582764, + "step": 86 + }, + { + "epoch": 0.6566037735849056, + "grad_norm": 6.511273384094238, + "learning_rate": 3.7341772151898737e-06, + "logps/chosen": -42.028167724609375, + "logps/rejected": -56.50782775878906, + "loss": 0.5652, + "losses/dpo": 0.5143932700157166, + "losses/sft": 0.9819191098213196, + "losses/total": 0.5143932700157166, + "ref_logps/chosen": -30.59110450744629, + "ref_logps/rejected": -36.39942169189453, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -1.143706202507019, + "rewards/margins": 0.8671345114707947, + "rewards/rejected": -2.010840892791748, + "step": 87 + }, + { + "epoch": 0.6641509433962264, + "grad_norm": 5.80403995513916, + "learning_rate": 3.713080168776372e-06, + "logps/chosen": -41.87797546386719, + "logps/rejected": -52.76039123535156, + "loss": 0.5278, + "losses/dpo": 0.4774477481842041, + "losses/sft": 1.3735116720199585, + "losses/total": 0.4774477481842041, + "ref_logps/chosen": -31.67178726196289, + "ref_logps/rejected": -34.80708694458008, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.0206185579299927, + "rewards/margins": 0.7747123837471008, + "rewards/rejected": -1.7953307628631592, + "step": 88 + }, + { + "epoch": 0.6716981132075471, + "grad_norm": 5.385847091674805, + "learning_rate": 3.6919831223628693e-06, + "logps/chosen": -39.39588928222656, + "logps/rejected": -56.48224639892578, + "loss": 0.4817, + "losses/dpo": 0.35498157143592834, + "losses/sft": 1.1201632022857666, + "losses/total": 0.35498157143592834, + "ref_logps/chosen": -29.14954376220703, + "ref_logps/rejected": -36.613067626953125, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -1.0246341228485107, + "rewards/margins": 0.962283730506897, + "rewards/rejected": -1.9869179725646973, + "step": 89 + }, + { + "epoch": 0.6792452830188679, + "grad_norm": 5.900775909423828, + "learning_rate": 3.6708860759493675e-06, + "logps/chosen": -41.41682052612305, + "logps/rejected": -50.79188919067383, + "loss": 0.5399, + "losses/dpo": 0.7219789624214172, + "losses/sft": 1.451216220855713, + "losses/total": 0.7219789624214172, + "ref_logps/chosen": -30.58350372314453, + "ref_logps/rejected": -32.79383850097656, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0833317041397095, + "rewards/margins": 0.7164729833602905, + "rewards/rejected": -1.7998046875, + "step": 90 + }, + { + "epoch": 0.6867924528301886, + "grad_norm": 6.2395548820495605, + "learning_rate": 3.649789029535865e-06, + "logps/chosen": -42.84817123413086, + "logps/rejected": -53.5147705078125, + "loss": 0.5697, + "losses/dpo": 0.3541460931301117, + "losses/sft": 1.4194457530975342, + "losses/total": 0.3541460931301117, + "ref_logps/chosen": -30.911819458007812, + "ref_logps/rejected": -34.3682975769043, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -1.1936352252960205, + "rewards/margins": 0.7210119962692261, + "rewards/rejected": -1.914647102355957, + "step": 91 + }, + { + "epoch": 0.6943396226415094, + "grad_norm": 5.378219127655029, + "learning_rate": 3.628691983122363e-06, + "logps/chosen": -42.4554443359375, + "logps/rejected": -58.785587310791016, + "loss": 0.4765, + "losses/dpo": 0.46248504519462585, + "losses/sft": 1.2167584896087646, + "losses/total": 0.46248504519462585, + "ref_logps/chosen": -32.64524841308594, + "ref_logps/rejected": -39.931739807128906, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -0.9810197949409485, + "rewards/margins": 0.9043647050857544, + "rewards/rejected": -1.8853845596313477, + "step": 92 + }, + { + "epoch": 0.7018867924528301, + "grad_norm": 6.185760498046875, + "learning_rate": 3.607594936708861e-06, + "logps/chosen": -39.256011962890625, + "logps/rejected": -52.359004974365234, + "loss": 0.6134, + "losses/dpo": 0.3328525424003601, + "losses/sft": 1.2595546245574951, + "losses/total": 0.3328525424003601, + "ref_logps/chosen": -27.483736038208008, + "ref_logps/rejected": -34.35957336425781, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -1.1772277355194092, + "rewards/margins": 0.6227158308029175, + "rewards/rejected": -1.799943447113037, + "step": 93 + }, + { + "epoch": 0.7094339622641509, + "grad_norm": 6.525961875915527, + "learning_rate": 3.586497890295359e-06, + "logps/chosen": -42.73849868774414, + "logps/rejected": -53.30558395385742, + "loss": 0.5494, + "losses/dpo": 0.7141259908676147, + "losses/sft": 1.6565158367156982, + "losses/total": 0.7141259908676147, + "ref_logps/chosen": -32.172119140625, + "ref_logps/rejected": -35.31774139404297, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0566380023956299, + "rewards/margins": 0.7421461343765259, + "rewards/rejected": -1.7987840175628662, + "step": 94 + }, + { + "epoch": 0.7169811320754716, + "grad_norm": 5.566583633422852, + "learning_rate": 3.5654008438818566e-06, + "logps/chosen": -42.38603973388672, + "logps/rejected": -55.09327697753906, + "loss": 0.5384, + "losses/dpo": 1.1330491304397583, + "losses/sft": 1.443207859992981, + "losses/total": 1.1330491304397583, + "ref_logps/chosen": -30.967670440673828, + "ref_logps/rejected": -35.5986328125, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -1.1418365240097046, + "rewards/margins": 0.807628333568573, + "rewards/rejected": -1.9494649171829224, + "step": 95 + }, + { + "epoch": 0.7245283018867924, + "grad_norm": 5.464733600616455, + "learning_rate": 3.544303797468355e-06, + "logps/chosen": -38.848716735839844, + "logps/rejected": -54.41267395019531, + "loss": 0.5239, + "losses/dpo": 0.35635316371917725, + "losses/sft": 1.365813970565796, + "losses/total": 0.35635316371917725, + "ref_logps/chosen": -29.416330337524414, + "ref_logps/rejected": -37.32257843017578, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.9432384371757507, + "rewards/margins": 0.7657711505889893, + "rewards/rejected": -1.7090096473693848, + "step": 96 + }, + { + "epoch": 0.7320754716981132, + "grad_norm": 5.017922878265381, + "learning_rate": 3.523206751054853e-06, + "logps/chosen": -39.072479248046875, + "logps/rejected": -57.39777755737305, + "loss": 0.4386, + "losses/dpo": 0.4663291871547699, + "losses/sft": 1.867389440536499, + "losses/total": 0.4663291871547699, + "ref_logps/chosen": -29.59061050415039, + "ref_logps/rejected": -36.96004867553711, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.9481869339942932, + "rewards/margins": 1.0955859422683716, + "rewards/rejected": -2.0437726974487305, + "step": 97 + }, + { + "epoch": 0.7396226415094339, + "grad_norm": 5.6357197761535645, + "learning_rate": 3.5021097046413504e-06, + "logps/chosen": -42.07758331298828, + "logps/rejected": -50.89552307128906, + "loss": 0.5536, + "losses/dpo": 0.5998523235321045, + "losses/sft": 1.1362240314483643, + "losses/total": 0.5998523235321045, + "ref_logps/chosen": -31.295028686523438, + "ref_logps/rejected": -33.88740158081055, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -1.0782551765441895, + "rewards/margins": 0.6225565075874329, + "rewards/rejected": -1.7008116245269775, + "step": 98 + }, + { + "epoch": 0.7471698113207547, + "grad_norm": 5.427838325500488, + "learning_rate": 3.4810126582278487e-06, + "logps/chosen": -38.637672424316406, + "logps/rejected": -50.87923812866211, + "loss": 0.5414, + "losses/dpo": 0.2993618845939636, + "losses/sft": 1.3700653314590454, + "losses/total": 0.2993618845939636, + "ref_logps/chosen": -28.061681747436523, + "ref_logps/rejected": -32.76121139526367, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.0575990676879883, + "rewards/margins": 0.7542036771774292, + "rewards/rejected": -1.811802625656128, + "step": 99 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 6.803074359893799, + "learning_rate": 3.459915611814346e-06, + "logps/chosen": -43.55335235595703, + "logps/rejected": -48.745689392089844, + "loss": 0.6394, + "losses/dpo": 0.29391834139823914, + "losses/sft": 1.1858327388763428, + "losses/total": 0.29391834139823914, + "ref_logps/chosen": -30.87877655029297, + "ref_logps/rejected": -30.499563217163086, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2674579620361328, + "rewards/margins": 0.5571544170379639, + "rewards/rejected": -1.8246122598648071, + "step": 100 + }, + { + "epoch": 0.7622641509433963, + "grad_norm": 5.51361608505249, + "learning_rate": 3.4388185654008443e-06, + "logps/chosen": -37.56087875366211, + "logps/rejected": -54.06119155883789, + "loss": 0.4868, + "losses/dpo": 0.4083021879196167, + "losses/sft": 1.6247344017028809, + "losses/total": 0.4083021879196167, + "ref_logps/chosen": -27.63301658630371, + "ref_logps/rejected": -35.83578872680664, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -0.9927864074707031, + "rewards/margins": 0.8297540545463562, + "rewards/rejected": -1.822540521621704, + "step": 101 + }, + { + "epoch": 0.769811320754717, + "grad_norm": 5.518378734588623, + "learning_rate": 3.417721518987342e-06, + "logps/chosen": -36.31249237060547, + "logps/rejected": -47.149452209472656, + "loss": 0.5743, + "losses/dpo": 0.3533702492713928, + "losses/sft": 1.2526724338531494, + "losses/total": 0.3533702492713928, + "ref_logps/chosen": -26.499692916870117, + "ref_logps/rejected": -30.55165672302246, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.9812799692153931, + "rewards/margins": 0.6784999370574951, + "rewards/rejected": -1.6597799062728882, + "step": 102 + }, + { + "epoch": 0.7773584905660378, + "grad_norm": 5.646484851837158, + "learning_rate": 3.39662447257384e-06, + "logps/chosen": -41.71933364868164, + "logps/rejected": -53.18317413330078, + "loss": 0.5629, + "losses/dpo": 0.3440595269203186, + "losses/sft": 1.5956023931503296, + "losses/total": 0.3440595269203186, + "ref_logps/chosen": -30.516389846801758, + "ref_logps/rejected": -34.345726013183594, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -1.1202945709228516, + "rewards/margins": 0.763449490070343, + "rewards/rejected": -1.8837440013885498, + "step": 103 + }, + { + "epoch": 0.7849056603773585, + "grad_norm": 5.42646598815918, + "learning_rate": 3.3755274261603377e-06, + "logps/chosen": -40.73961639404297, + "logps/rejected": -52.125335693359375, + "loss": 0.5474, + "losses/dpo": 0.4815681278705597, + "losses/sft": 1.5057909488677979, + "losses/total": 0.4815681278705597, + "ref_logps/chosen": -28.968732833862305, + "ref_logps/rejected": -33.72377014160156, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.177088737487793, + "rewards/margins": 0.6630680561065674, + "rewards/rejected": -1.8401566743850708, + "step": 104 + }, + { + "epoch": 0.7924528301886793, + "grad_norm": 4.834980010986328, + "learning_rate": 3.354430379746836e-06, + "logps/chosen": -37.545654296875, + "logps/rejected": -53.264549255371094, + "loss": 0.4882, + "losses/dpo": 0.5576643943786621, + "losses/sft": 1.43105947971344, + "losses/total": 0.5576643943786621, + "ref_logps/chosen": -26.574623107910156, + "ref_logps/rejected": -33.20962142944336, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -1.0971035957336426, + "rewards/margins": 0.9083890914916992, + "rewards/rejected": -2.005492687225342, + "step": 105 + }, + { + "epoch": 0.8, + "grad_norm": 4.880195140838623, + "learning_rate": 3.3333333333333333e-06, + "logps/chosen": -33.03081130981445, + "logps/rejected": -50.53649139404297, + "loss": 0.5356, + "losses/dpo": 0.42436158657073975, + "losses/sft": 1.0997377634048462, + "losses/total": 0.42436158657073975, + "ref_logps/chosen": -22.474742889404297, + "ref_logps/rejected": -32.44505310058594, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.0556070804595947, + "rewards/margins": 0.7535369396209717, + "rewards/rejected": -1.8091439008712769, + "step": 106 + }, + { + "epoch": 0.8075471698113208, + "grad_norm": 6.5177507400512695, + "learning_rate": 3.3122362869198316e-06, + "logps/chosen": -43.58899688720703, + "logps/rejected": -53.82051086425781, + "loss": 0.6438, + "losses/dpo": 0.748566746711731, + "losses/sft": 1.7201393842697144, + "losses/total": 0.748566746711731, + "ref_logps/chosen": -30.728872299194336, + "ref_logps/rejected": -36.30797576904297, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -1.2860126495361328, + "rewards/margins": 0.4652411937713623, + "rewards/rejected": -1.7512538433074951, + "step": 107 + }, + { + "epoch": 0.8150943396226416, + "grad_norm": 5.6698126792907715, + "learning_rate": 3.2911392405063294e-06, + "logps/chosen": -40.143028259277344, + "logps/rejected": -52.1860237121582, + "loss": 0.5459, + "losses/dpo": 0.5774589776992798, + "losses/sft": 1.5638508796691895, + "losses/total": 0.5774589776992798, + "ref_logps/chosen": -28.397808074951172, + "ref_logps/rejected": -33.201290130615234, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -1.1745221614837646, + "rewards/margins": 0.7239515781402588, + "rewards/rejected": -1.8984739780426025, + "step": 108 + }, + { + "epoch": 0.8226415094339623, + "grad_norm": 5.696971893310547, + "learning_rate": 3.270042194092827e-06, + "logps/chosen": -41.59033966064453, + "logps/rejected": -55.59294891357422, + "loss": 0.5428, + "losses/dpo": 0.44945141673088074, + "losses/sft": 1.5387637615203857, + "losses/total": 0.44945141673088074, + "ref_logps/chosen": -30.985074996948242, + "ref_logps/rejected": -35.67661666870117, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.060526728630066, + "rewards/margins": 0.9311071038246155, + "rewards/rejected": -1.9916338920593262, + "step": 109 + }, + { + "epoch": 0.8301886792452831, + "grad_norm": 6.090880870819092, + "learning_rate": 3.248945147679325e-06, + "logps/chosen": -38.453773498535156, + "logps/rejected": -49.97744369506836, + "loss": 0.6335, + "losses/dpo": 0.42103850841522217, + "losses/sft": 1.321776032447815, + "losses/total": 0.42103850841522217, + "ref_logps/chosen": -26.563941955566406, + "ref_logps/rejected": -32.72344207763672, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -1.1889832019805908, + "rewards/margins": 0.5364166498184204, + "rewards/rejected": -1.7253999710083008, + "step": 110 + }, + { + "epoch": 0.8377358490566038, + "grad_norm": 5.761321067810059, + "learning_rate": 3.2278481012658232e-06, + "logps/chosen": -41.03196716308594, + "logps/rejected": -54.82693862915039, + "loss": 0.5514, + "losses/dpo": 0.8645380735397339, + "losses/sft": 1.6444365978240967, + "losses/total": 0.8645380735397339, + "ref_logps/chosen": -28.32830810546875, + "ref_logps/rejected": -34.87389373779297, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2703659534454346, + "rewards/margins": 0.7249387502670288, + "rewards/rejected": -1.995304822921753, + "step": 111 + }, + { + "epoch": 0.8452830188679246, + "grad_norm": 5.467708587646484, + "learning_rate": 3.206751054852321e-06, + "logps/chosen": -42.725914001464844, + "logps/rejected": -55.90550231933594, + "loss": 0.5241, + "losses/dpo": 0.4794267416000366, + "losses/sft": 1.2581182718276978, + "losses/total": 0.4794267416000366, + "ref_logps/chosen": -30.471614837646484, + "ref_logps/rejected": -36.124515533447266, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -1.2254297733306885, + "rewards/margins": 0.7526689171791077, + "rewards/rejected": -1.978098750114441, + "step": 112 + }, + { + "epoch": 0.8528301886792453, + "grad_norm": 6.094525337219238, + "learning_rate": 3.185654008438819e-06, + "logps/chosen": -43.23112869262695, + "logps/rejected": -59.694095611572266, + "loss": 0.5275, + "losses/dpo": 0.36554813385009766, + "losses/sft": 1.4116802215576172, + "losses/total": 0.36554813385009766, + "ref_logps/chosen": -30.045276641845703, + "ref_logps/rejected": -38.06658935546875, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.3185853958129883, + "rewards/margins": 0.844165563583374, + "rewards/rejected": -2.1627509593963623, + "step": 113 + }, + { + "epoch": 0.8603773584905661, + "grad_norm": 5.982193470001221, + "learning_rate": 3.164556962025317e-06, + "logps/chosen": -40.64250564575195, + "logps/rejected": -52.11644744873047, + "loss": 0.6185, + "losses/dpo": 1.1455503702163696, + "losses/sft": 1.6093838214874268, + "losses/total": 1.1455503702163696, + "ref_logps/chosen": -27.430282592773438, + "ref_logps/rejected": -33.03566360473633, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -1.3212223052978516, + "rewards/margins": 0.5868560671806335, + "rewards/rejected": -1.9080784320831299, + "step": 114 + }, + { + "epoch": 0.8679245283018868, + "grad_norm": 5.549319744110107, + "learning_rate": 3.1434599156118145e-06, + "logps/chosen": -41.28040313720703, + "logps/rejected": -55.66100311279297, + "loss": 0.5242, + "losses/dpo": 0.7410661578178406, + "losses/sft": 1.33750581741333, + "losses/total": 0.7410661578178406, + "ref_logps/chosen": -28.972108840942383, + "ref_logps/rejected": -34.85984802246094, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.2308290004730225, + "rewards/margins": 0.8492862582206726, + "rewards/rejected": -2.08011531829834, + "step": 115 + }, + { + "epoch": 0.8754716981132076, + "grad_norm": 5.77667236328125, + "learning_rate": 3.1223628691983127e-06, + "logps/chosen": -45.21639633178711, + "logps/rejected": -54.769466400146484, + "loss": 0.554, + "losses/dpo": 0.7079298496246338, + "losses/sft": 1.7484166622161865, + "losses/total": 0.7079298496246338, + "ref_logps/chosen": -31.872028350830078, + "ref_logps/rejected": -34.47022247314453, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -1.3344368934631348, + "rewards/margins": 0.6954872012138367, + "rewards/rejected": -2.029924154281616, + "step": 116 + }, + { + "epoch": 0.8830188679245283, + "grad_norm": 5.909778594970703, + "learning_rate": 3.10126582278481e-06, + "logps/chosen": -42.068424224853516, + "logps/rejected": -53.47673034667969, + "loss": 0.5374, + "losses/dpo": 0.5232099890708923, + "losses/sft": 1.6901054382324219, + "losses/total": 0.5232099890708923, + "ref_logps/chosen": -29.165977478027344, + "ref_logps/rejected": -32.32148361206055, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -1.2902448177337646, + "rewards/margins": 0.825279712677002, + "rewards/rejected": -2.1155245304107666, + "step": 117 + }, + { + "epoch": 0.8905660377358491, + "grad_norm": 4.659523010253906, + "learning_rate": 3.0801687763713083e-06, + "logps/chosen": -41.64044189453125, + "logps/rejected": -57.89656066894531, + "loss": 0.4151, + "losses/dpo": 0.42053163051605225, + "losses/sft": 1.2252192497253418, + "losses/total": 0.42053163051605225, + "ref_logps/chosen": -30.252826690673828, + "ref_logps/rejected": -35.8792724609375, + "rewards/accuracies": 0.8828125, + "rewards/chosen": -1.1387616395950317, + "rewards/margins": 1.0629674196243286, + "rewards/rejected": -2.2017292976379395, + "step": 118 + }, + { + "epoch": 0.8981132075471698, + "grad_norm": 5.144362926483154, + "learning_rate": 3.059071729957806e-06, + "logps/chosen": -40.072845458984375, + "logps/rejected": -53.409576416015625, + "loss": 0.4796, + "losses/dpo": 0.371336966753006, + "losses/sft": 1.2902876138687134, + "losses/total": 0.371336966753006, + "ref_logps/chosen": -29.023841857910156, + "ref_logps/rejected": -33.788238525390625, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1049001216888428, + "rewards/margins": 0.8572336435317993, + "rewards/rejected": -1.9621338844299316, + "step": 119 + }, + { + "epoch": 0.9056603773584906, + "grad_norm": 5.273873329162598, + "learning_rate": 3.037974683544304e-06, + "logps/chosen": -39.68785858154297, + "logps/rejected": -56.72168731689453, + "loss": 0.5046, + "losses/dpo": 0.6220200061798096, + "losses/sft": 1.4726953506469727, + "losses/total": 0.6220200061798096, + "ref_logps/chosen": -27.100461959838867, + "ref_logps/rejected": -34.93486022949219, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -1.258739709854126, + "rewards/margins": 0.9199427962303162, + "rewards/rejected": -2.178682565689087, + "step": 120 + }, + { + "epoch": 0.9132075471698113, + "grad_norm": 6.11952543258667, + "learning_rate": 3.0168776371308017e-06, + "logps/chosen": -43.74298095703125, + "logps/rejected": -58.74589920043945, + "loss": 0.571, + "losses/dpo": 0.7673947215080261, + "losses/sft": 1.3842945098876953, + "losses/total": 0.7673947215080261, + "ref_logps/chosen": -30.62641716003418, + "ref_logps/rejected": -37.81398010253906, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.3116567134857178, + "rewards/margins": 0.7815347909927368, + "rewards/rejected": -2.093191623687744, + "step": 121 + }, + { + "epoch": 0.9207547169811321, + "grad_norm": 4.754635334014893, + "learning_rate": 2.9957805907173e-06, + "logps/chosen": -43.16095733642578, + "logps/rejected": -60.58992004394531, + "loss": 0.4455, + "losses/dpo": 0.4510895609855652, + "losses/sft": 1.611796498298645, + "losses/total": 0.4510895609855652, + "ref_logps/chosen": -31.162626266479492, + "ref_logps/rejected": -37.329471588134766, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.1998332738876343, + "rewards/margins": 1.126211166381836, + "rewards/rejected": -2.3260445594787598, + "step": 122 + }, + { + "epoch": 0.9283018867924528, + "grad_norm": 6.031177520751953, + "learning_rate": 2.9746835443037974e-06, + "logps/chosen": -41.943580627441406, + "logps/rejected": -58.17346954345703, + "loss": 0.5395, + "losses/dpo": 0.384907066822052, + "losses/sft": 1.5029709339141846, + "losses/total": 0.384907066822052, + "ref_logps/chosen": -28.434606552124023, + "ref_logps/rejected": -35.15749740600586, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.350897192955017, + "rewards/margins": 0.9506996273994446, + "rewards/rejected": -2.3015968799591064, + "step": 123 + }, + { + "epoch": 0.9358490566037736, + "grad_norm": 5.330562114715576, + "learning_rate": 2.9535864978902956e-06, + "logps/chosen": -42.06855010986328, + "logps/rejected": -55.33885192871094, + "loss": 0.4695, + "losses/dpo": 0.37185460329055786, + "losses/sft": 1.442354679107666, + "losses/total": 0.37185460329055786, + "ref_logps/chosen": -29.88962745666504, + "ref_logps/rejected": -32.71107864379883, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -1.2178921699523926, + "rewards/margins": 1.0448851585388184, + "rewards/rejected": -2.262777328491211, + "step": 124 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 6.3546671867370605, + "learning_rate": 2.932489451476794e-06, + "logps/chosen": -46.69574737548828, + "logps/rejected": -57.914039611816406, + "loss": 0.5629, + "losses/dpo": 0.754065752029419, + "losses/sft": 1.678948163986206, + "losses/total": 0.754065752029419, + "ref_logps/chosen": -31.99456787109375, + "ref_logps/rejected": -35.61518096923828, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4701181650161743, + "rewards/margins": 0.7597677707672119, + "rewards/rejected": -2.2298858165740967, + "step": 125 + }, + { + "epoch": 0.9509433962264151, + "grad_norm": 5.110890865325928, + "learning_rate": 2.9113924050632912e-06, + "logps/chosen": -45.18208312988281, + "logps/rejected": -64.22908020019531, + "loss": 0.4167, + "losses/dpo": 0.49617111682891846, + "losses/sft": 1.5784951448440552, + "losses/total": 0.49617111682891846, + "ref_logps/chosen": -31.254486083984375, + "ref_logps/rejected": -37.952110290527344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3927595615386963, + "rewards/margins": 1.2349375486373901, + "rewards/rejected": -2.627696990966797, + "step": 126 + }, + { + "epoch": 0.9584905660377359, + "grad_norm": 5.989363193511963, + "learning_rate": 2.8902953586497895e-06, + "logps/chosen": -46.54087448120117, + "logps/rejected": -61.26066207885742, + "loss": 0.4896, + "losses/dpo": 0.4928218722343445, + "losses/sft": 1.5410984754562378, + "losses/total": 0.4928218722343445, + "ref_logps/chosen": -32.51362609863281, + "ref_logps/rejected": -36.370338439941406, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.4027252197265625, + "rewards/margins": 1.0863069295883179, + "rewards/rejected": -2.489032030105591, + "step": 127 + }, + { + "epoch": 0.9660377358490566, + "grad_norm": 5.725657939910889, + "learning_rate": 2.8691983122362873e-06, + "logps/chosen": -44.60926818847656, + "logps/rejected": -56.73688888549805, + "loss": 0.5024, + "losses/dpo": 0.34672486782073975, + "losses/sft": 1.4699136018753052, + "losses/total": 0.34672486782073975, + "ref_logps/chosen": -30.018938064575195, + "ref_logps/rejected": -32.321044921875, + "rewards/accuracies": 0.734375, + "rewards/chosen": -1.4590332508087158, + "rewards/margins": 0.9825511574745178, + "rewards/rejected": -2.441584587097168, + "step": 128 + }, + { + "epoch": 0.9735849056603774, + "grad_norm": 5.313739776611328, + "learning_rate": 2.848101265822785e-06, + "logps/chosen": -43.46619415283203, + "logps/rejected": -57.948265075683594, + "loss": 0.4697, + "losses/dpo": 0.492125928401947, + "losses/sft": 1.7441303730010986, + "losses/total": 0.492125928401947, + "ref_logps/chosen": -28.644359588623047, + "ref_logps/rejected": -33.97466278076172, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.4821833372116089, + "rewards/margins": 0.9151768088340759, + "rewards/rejected": -2.397360324859619, + "step": 129 + }, + { + "epoch": 0.9811320754716981, + "grad_norm": 5.476495265960693, + "learning_rate": 2.827004219409283e-06, + "logps/chosen": -42.42694854736328, + "logps/rejected": -58.544918060302734, + "loss": 0.4981, + "losses/dpo": 0.5451189279556274, + "losses/sft": 1.419020175933838, + "losses/total": 0.5451189279556274, + "ref_logps/chosen": -26.963315963745117, + "ref_logps/rejected": -33.612648010253906, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -1.5463628768920898, + "rewards/margins": 0.9468642473220825, + "rewards/rejected": -2.493227243423462, + "step": 130 + }, + { + "epoch": 0.9886792452830189, + "grad_norm": 6.449576377868652, + "learning_rate": 2.805907172995781e-06, + "logps/chosen": -45.13196563720703, + "logps/rejected": -56.187950134277344, + "loss": 0.556, + "losses/dpo": 0.832069993019104, + "losses/sft": 2.0745010375976562, + "losses/total": 0.832069993019104, + "ref_logps/chosen": -29.580839157104492, + "ref_logps/rejected": -32.631561279296875, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -1.555112600326538, + "rewards/margins": 0.8005262613296509, + "rewards/rejected": -2.3556389808654785, + "step": 131 + }, + { + "epoch": 0.9962264150943396, + "grad_norm": 5.224597930908203, + "learning_rate": 2.7848101265822785e-06, + "logps/chosen": -43.97776412963867, + "logps/rejected": -62.334651947021484, + "loss": 0.4272, + "losses/dpo": 0.37034112215042114, + "losses/sft": 1.429057240486145, + "losses/total": 0.37034112215042114, + "ref_logps/chosen": -28.489038467407227, + "ref_logps/rejected": -33.802345275878906, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -1.5488728284835815, + "rewards/margins": 1.3043583631515503, + "rewards/rejected": -2.853231191635132, + "step": 132 + }, + { + "epoch": 1.0037735849056604, + "grad_norm": 5.927867412567139, + "learning_rate": 2.7637130801687767e-06, + "logps/chosen": -42.68547058105469, + "logps/rejected": -64.09191131591797, + "loss": 0.4739, + "losses/dpo": 0.8177364468574524, + "losses/sft": 1.34566068649292, + "losses/total": 0.8177364468574524, + "ref_logps/chosen": -27.38077163696289, + "ref_logps/rejected": -35.45868682861328, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -1.5304700136184692, + "rewards/margins": 1.3328523635864258, + "rewards/rejected": -2.8633224964141846, + "step": 133 + }, + { + "epoch": 1.0113207547169811, + "grad_norm": 2.91408371925354, + "learning_rate": 2.742616033755274e-06, + "logps/chosen": -39.36324691772461, + "logps/rejected": -67.83808135986328, + "loss": 0.1975, + "losses/dpo": 0.1529974639415741, + "losses/sft": 1.5831780433654785, + "losses/total": 0.1529974639415741, + "ref_logps/chosen": -29.105531692504883, + "ref_logps/rejected": -34.99441909790039, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -1.0257714986801147, + "rewards/margins": 2.258594274520874, + "rewards/rejected": -3.2843658924102783, + "step": 134 + }, + { + "epoch": 1.0188679245283019, + "grad_norm": 2.8387842178344727, + "learning_rate": 2.7215189873417724e-06, + "logps/chosen": -38.02918243408203, + "logps/rejected": -71.52278137207031, + "loss": 0.2007, + "losses/dpo": 0.2637289762496948, + "losses/sft": 1.3442529439926147, + "losses/total": 0.2637289762496948, + "ref_logps/chosen": -27.55896759033203, + "ref_logps/rejected": -37.357872009277344, + "rewards/accuracies": 0.953125, + "rewards/chosen": -1.0470216274261475, + "rewards/margins": 2.369469165802002, + "rewards/rejected": -3.4164910316467285, + "step": 135 + }, + { + "epoch": 1.0264150943396226, + "grad_norm": 3.233880043029785, + "learning_rate": 2.70042194092827e-06, + "logps/chosen": -43.10190963745117, + "logps/rejected": -65.63934326171875, + "loss": 0.2173, + "losses/dpo": 0.4220733046531677, + "losses/sft": 1.6242269277572632, + "losses/total": 0.4220733046531677, + "ref_logps/chosen": -32.96651077270508, + "ref_logps/rejected": -33.8997802734375, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.0135400295257568, + "rewards/margins": 2.1604163646698, + "rewards/rejected": -3.1739563941955566, + "step": 136 + }, + { + "epoch": 1.0339622641509434, + "grad_norm": 2.9888410568237305, + "learning_rate": 2.679324894514768e-06, + "logps/chosen": -38.224029541015625, + "logps/rejected": -68.30229187011719, + "loss": 0.2026, + "losses/dpo": 0.15418484807014465, + "losses/sft": 1.1837782859802246, + "losses/total": 0.15418484807014465, + "ref_logps/chosen": -30.028034210205078, + "ref_logps/rejected": -35.37809371948242, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8195996284484863, + "rewards/margins": 2.472820281982422, + "rewards/rejected": -3.292419910430908, + "step": 137 + }, + { + "epoch": 1.0415094339622641, + "grad_norm": 2.6056151390075684, + "learning_rate": 2.6582278481012658e-06, + "logps/chosen": -36.850372314453125, + "logps/rejected": -70.21464538574219, + "loss": 0.1806, + "losses/dpo": 0.07243721187114716, + "losses/sft": 1.3981924057006836, + "losses/total": 0.07243721187114716, + "ref_logps/chosen": -28.195514678955078, + "ref_logps/rejected": -35.84675216674805, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.8654859066009521, + "rewards/margins": 2.571302652359009, + "rewards/rejected": -3.436788558959961, + "step": 138 + }, + { + "epoch": 1.049056603773585, + "grad_norm": 3.2604613304138184, + "learning_rate": 2.637130801687764e-06, + "logps/chosen": -35.98524475097656, + "logps/rejected": -67.50647735595703, + "loss": 0.2473, + "losses/dpo": 0.18576228618621826, + "losses/sft": 1.268462061882019, + "losses/total": 0.18576228618621826, + "ref_logps/chosen": -26.85390853881836, + "ref_logps/rejected": -35.17997741699219, + "rewards/accuracies": 0.9140625, + "rewards/chosen": -0.9131335020065308, + "rewards/margins": 2.319516181945801, + "rewards/rejected": -3.232649803161621, + "step": 139 + }, + { + "epoch": 1.0566037735849056, + "grad_norm": 2.6628074645996094, + "learning_rate": 2.6160337552742622e-06, + "logps/chosen": -43.86473083496094, + "logps/rejected": -74.11687469482422, + "loss": 0.178, + "losses/dpo": 0.300728440284729, + "losses/sft": 1.6412304639816284, + "losses/total": 0.300728440284729, + "ref_logps/chosen": -33.60987854003906, + "ref_logps/rejected": -37.86822509765625, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -1.0254850387573242, + "rewards/margins": 2.5993804931640625, + "rewards/rejected": -3.6248652935028076, + "step": 140 + }, + { + "epoch": 1.0641509433962264, + "grad_norm": 2.704582691192627, + "learning_rate": 2.5949367088607596e-06, + "logps/chosen": -37.317047119140625, + "logps/rejected": -66.24585723876953, + "loss": 0.1955, + "losses/dpo": 0.21787673234939575, + "losses/sft": 1.192077875137329, + "losses/total": 0.21787673234939575, + "ref_logps/chosen": -27.87858772277832, + "ref_logps/rejected": -33.62061309814453, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9438458681106567, + "rewards/margins": 2.318678617477417, + "rewards/rejected": -3.2625246047973633, + "step": 141 + }, + { + "epoch": 1.0716981132075472, + "grad_norm": 3.186472177505493, + "learning_rate": 2.573839662447258e-06, + "logps/chosen": -35.49989318847656, + "logps/rejected": -60.943443298339844, + "loss": 0.2183, + "losses/dpo": 0.20879721641540527, + "losses/sft": 1.453169584274292, + "losses/total": 0.20879721641540527, + "ref_logps/chosen": -27.398571014404297, + "ref_logps/rejected": -30.07172966003418, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8101319074630737, + "rewards/margins": 2.2770400047302246, + "rewards/rejected": -3.087171792984009, + "step": 142 + }, + { + "epoch": 1.079245283018868, + "grad_norm": 2.6961019039154053, + "learning_rate": 2.5527426160337553e-06, + "logps/chosen": -41.55631637573242, + "logps/rejected": -73.67892456054688, + "loss": 0.1746, + "losses/dpo": 0.11362100392580032, + "losses/sft": 1.2692244052886963, + "losses/total": 0.11362100392580032, + "ref_logps/chosen": -31.214237213134766, + "ref_logps/rejected": -36.22808837890625, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -1.034208059310913, + "rewards/margins": 2.71087646484375, + "rewards/rejected": -3.745084285736084, + "step": 143 + }, + { + "epoch": 1.0867924528301887, + "grad_norm": 2.721705198287964, + "learning_rate": 2.5316455696202535e-06, + "logps/chosen": -36.01787567138672, + "logps/rejected": -73.40156555175781, + "loss": 0.1744, + "losses/dpo": 0.18423417210578918, + "losses/sft": 1.190388560295105, + "losses/total": 0.18423417210578918, + "ref_logps/chosen": -26.2912540435791, + "ref_logps/rejected": -36.96295928955078, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -0.972662091255188, + "rewards/margins": 2.671198844909668, + "rewards/rejected": -3.6438608169555664, + "step": 144 + }, + { + "epoch": 1.0943396226415094, + "grad_norm": 4.111969947814941, + "learning_rate": 2.5105485232067513e-06, + "logps/chosen": -36.39363098144531, + "logps/rejected": -68.23372650146484, + "loss": 0.1625, + "losses/dpo": 0.16966360807418823, + "losses/sft": 1.3258998394012451, + "losses/total": 0.16966360807418823, + "ref_logps/chosen": -28.285419464111328, + "ref_logps/rejected": -33.52401351928711, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -0.8108214735984802, + "rewards/margins": 2.6601500511169434, + "rewards/rejected": -3.4709715843200684, + "step": 145 + }, + { + "epoch": 1.1018867924528302, + "grad_norm": 2.4594967365264893, + "learning_rate": 2.489451476793249e-06, + "logps/chosen": -38.03108215332031, + "logps/rejected": -74.02571105957031, + "loss": 0.1689, + "losses/dpo": 0.18407484889030457, + "losses/sft": 1.6673762798309326, + "losses/total": 0.18407484889030457, + "ref_logps/chosen": -29.032581329345703, + "ref_logps/rejected": -37.598445892333984, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -0.8998502492904663, + "rewards/margins": 2.7428760528564453, + "rewards/rejected": -3.642726421356201, + "step": 146 + }, + { + "epoch": 1.109433962264151, + "grad_norm": 2.4531755447387695, + "learning_rate": 2.4683544303797473e-06, + "logps/chosen": -42.473106384277344, + "logps/rejected": -81.33661651611328, + "loss": 0.1411, + "losses/dpo": 0.09885497391223907, + "losses/sft": 1.4337823390960693, + "losses/total": 0.09885497391223907, + "ref_logps/chosen": -32.69062042236328, + "ref_logps/rejected": -40.296226501464844, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.9782487154006958, + "rewards/margins": 3.12579083442688, + "rewards/rejected": -4.104039192199707, + "step": 147 + }, + { + "epoch": 1.1169811320754717, + "grad_norm": 2.9463274478912354, + "learning_rate": 2.447257383966245e-06, + "logps/chosen": -33.57645034790039, + "logps/rejected": -64.7162857055664, + "loss": 0.1829, + "losses/dpo": 0.06601670384407043, + "losses/sft": 1.14356529712677, + "losses/total": 0.06601670384407043, + "ref_logps/chosen": -23.990320205688477, + "ref_logps/rejected": -29.673688888549805, + "rewards/accuracies": 0.953125, + "rewards/chosen": -0.9586129784584045, + "rewards/margins": 2.5456466674804688, + "rewards/rejected": -3.5042595863342285, + "step": 148 + }, + { + "epoch": 1.1245283018867924, + "grad_norm": 2.640474319458008, + "learning_rate": 2.426160337552743e-06, + "logps/chosen": -45.765403747558594, + "logps/rejected": -78.10466003417969, + "loss": 0.1328, + "losses/dpo": 0.10652376711368561, + "losses/sft": 1.882682204246521, + "losses/total": 0.10652376711368561, + "ref_logps/chosen": -33.539283752441406, + "ref_logps/rejected": -36.12788391113281, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -1.2226122617721558, + "rewards/margins": 2.9750657081604004, + "rewards/rejected": -4.197678089141846, + "step": 149 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 3.2133562564849854, + "learning_rate": 2.4050632911392408e-06, + "logps/chosen": -40.938926696777344, + "logps/rejected": -73.20862579345703, + "loss": 0.1627, + "losses/dpo": 0.12627126276493073, + "losses/sft": 1.4981681108474731, + "losses/total": 0.12627126276493073, + "ref_logps/chosen": -30.009746551513672, + "ref_logps/rejected": -35.256587982177734, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -1.0929179191589355, + "rewards/margins": 2.702286720275879, + "rewards/rejected": -3.7952044010162354, + "step": 150 + }, + { + "epoch": 1.139622641509434, + "grad_norm": 2.9940009117126465, + "learning_rate": 2.3839662447257386e-06, + "logps/chosen": -40.70891571044922, + "logps/rejected": -78.65632629394531, + "loss": 0.151, + "losses/dpo": 0.36528170108795166, + "losses/sft": 1.4617805480957031, + "losses/total": 0.36528170108795166, + "ref_logps/chosen": -29.670351028442383, + "ref_logps/rejected": -35.98125457763672, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -1.1038565635681152, + "rewards/margins": 3.1636507511138916, + "rewards/rejected": -4.267507553100586, + "step": 151 + }, + { + "epoch": 1.1471698113207547, + "grad_norm": 3.4196298122406006, + "learning_rate": 2.3628691983122364e-06, + "logps/chosen": -39.877235412597656, + "logps/rejected": -74.44210815429688, + "loss": 0.1838, + "losses/dpo": 0.1876736879348755, + "losses/sft": 1.493945837020874, + "losses/total": 0.1876736879348755, + "ref_logps/chosen": -25.441184997558594, + "ref_logps/rejected": -33.24305725097656, + "rewards/accuracies": 0.9296875, + "rewards/chosen": -1.4436049461364746, + "rewards/margins": 2.676300525665283, + "rewards/rejected": -4.119905471801758, + "step": 152 + }, + { + "epoch": 1.1547169811320754, + "grad_norm": 3.2757954597473145, + "learning_rate": 2.341772151898734e-06, + "logps/chosen": -40.69176483154297, + "logps/rejected": -71.23855590820312, + "loss": 0.1904, + "losses/dpo": 0.19736188650131226, + "losses/sft": 1.4453177452087402, + "losses/total": 0.19736188650131226, + "ref_logps/chosen": -26.163129806518555, + "ref_logps/rejected": -31.663166046142578, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -1.4528635740280151, + "rewards/margins": 2.5046753883361816, + "rewards/rejected": -3.9575390815734863, + "step": 153 + }, + { + "epoch": 1.1622641509433962, + "grad_norm": 2.864833116531372, + "learning_rate": 2.3206751054852324e-06, + "logps/chosen": -41.904991149902344, + "logps/rejected": -84.37300109863281, + "loss": 0.1472, + "losses/dpo": 0.09012404829263687, + "losses/sft": 1.942337989807129, + "losses/total": 0.09012404829263687, + "ref_logps/chosen": -28.38837242126465, + "ref_logps/rejected": -38.871055603027344, + "rewards/accuracies": 0.9453125, + "rewards/chosen": -1.3516615629196167, + "rewards/margins": 3.198533535003662, + "rewards/rejected": -4.55019474029541, + "step": 154 + }, + { + "epoch": 1.169811320754717, + "grad_norm": 4.589540481567383, + "learning_rate": 2.2995780590717302e-06, + "logps/chosen": -42.954078674316406, + "logps/rejected": -80.19384765625, + "loss": 0.1453, + "losses/dpo": 0.13800232112407684, + "losses/sft": 1.500653624534607, + "losses/total": 0.13800232112407684, + "ref_logps/chosen": -29.534503936767578, + "ref_logps/rejected": -35.81602096557617, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.3419573307037354, + "rewards/margins": 3.095825433731079, + "rewards/rejected": -4.4377827644348145, + "step": 155 + }, + { + "epoch": 1.1773584905660377, + "grad_norm": 3.4918391704559326, + "learning_rate": 2.278481012658228e-06, + "logps/chosen": -42.988189697265625, + "logps/rejected": -79.981201171875, + "loss": 0.1665, + "losses/dpo": 0.18867120146751404, + "losses/sft": 1.3614583015441895, + "losses/total": 0.18867120146751404, + "ref_logps/chosen": -28.600976943969727, + "ref_logps/rejected": -34.73303985595703, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -1.4387214183807373, + "rewards/margins": 3.0860953330993652, + "rewards/rejected": -4.524816513061523, + "step": 156 + }, + { + "epoch": 1.1849056603773584, + "grad_norm": 2.938596725463867, + "learning_rate": 2.257383966244726e-06, + "logps/chosen": -39.78988265991211, + "logps/rejected": -76.49989318847656, + "loss": 0.1296, + "losses/dpo": 0.24511000514030457, + "losses/sft": 1.592597246170044, + "losses/total": 0.24511000514030457, + "ref_logps/chosen": -25.498641967773438, + "ref_logps/rejected": -30.84796714782715, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.429124116897583, + "rewards/margins": 3.1360692977905273, + "rewards/rejected": -4.5651936531066895, + "step": 157 + }, + { + "epoch": 1.1924528301886792, + "grad_norm": 3.095472574234009, + "learning_rate": 2.2362869198312237e-06, + "logps/chosen": -43.18218231201172, + "logps/rejected": -83.31824493408203, + "loss": 0.1515, + "losses/dpo": 0.14324676990509033, + "losses/sft": 1.613158941268921, + "losses/total": 0.14324676990509033, + "ref_logps/chosen": -29.27816390991211, + "ref_logps/rejected": -35.679847717285156, + "rewards/accuracies": 0.9609375, + "rewards/chosen": -1.3904017210006714, + "rewards/margins": 3.3734383583068848, + "rewards/rejected": -4.7638397216796875, + "step": 158 + }, + { + "epoch": 1.2, + "grad_norm": 3.023688316345215, + "learning_rate": 2.2151898734177215e-06, + "logps/chosen": -45.36747360229492, + "logps/rejected": -84.181884765625, + "loss": 0.1305, + "losses/dpo": 0.06274432688951492, + "losses/sft": 1.7756646871566772, + "losses/total": 0.06274432688951492, + "ref_logps/chosen": -29.679649353027344, + "ref_logps/rejected": -35.470863342285156, + "rewards/accuracies": 0.9765625, + "rewards/chosen": -1.568782091140747, + "rewards/margins": 3.3023202419281006, + "rewards/rejected": -4.871102333068848, + "step": 159 + }, + { + "epoch": 1.2075471698113207, + "grad_norm": 3.6034018993377686, + "learning_rate": 2.1940928270042197e-06, + "logps/chosen": -41.78567886352539, + "logps/rejected": -81.32054138183594, + "loss": 0.1411, + "losses/dpo": 0.21304282546043396, + "losses/sft": 1.4726674556732178, + "losses/total": 0.21304282546043396, + "ref_logps/chosen": -27.83294677734375, + "ref_logps/rejected": -33.414634704589844, + "rewards/accuracies": 0.953125, + "rewards/chosen": -1.395273208618164, + "rewards/margins": 3.395317554473877, + "rewards/rejected": -4.790591239929199, + "step": 160 + } + ], + "logging_steps": 1.0, + "max_steps": 264, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 40, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}