diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 30.983733539891556, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.061967467079783116, + "grad_norm": 0.9636571407318115, + "learning_rate": 4e-05, + "logits/chosen": -2.2784409523010254, + "logits/rejected": -1.8663209676742554, + "logps/chosen": -371.1264953613281, + "logps/rejected": -214.287109375, + "loss": 0.6698, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": 0.04825574904680252, + "rewards/margins": 0.04888930916786194, + "rewards/rejected": -0.0006335576181299984, + "step": 20 + }, + { + "epoch": 0.12393493415956623, + "grad_norm": 0.8874484300613403, + "learning_rate": 7.800000000000001e-05, + "logits/chosen": -2.2781295776367188, + "logits/rejected": -1.883906602859497, + "logps/chosen": -354.96759033203125, + "logps/rejected": -200.45201110839844, + "loss": 0.4393, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": 0.6998767852783203, + "rewards/margins": 0.8133988380432129, + "rewards/rejected": -0.11352206766605377, + "step": 40 + }, + { + "epoch": 0.18590240123934934, + "grad_norm": 0.5087229013442993, + "learning_rate": 0.000118, + "logits/chosen": -2.292691469192505, + "logits/rejected": -1.8518846035003662, + "logps/chosen": -332.63885498046875, + "logps/rejected": -207.2465057373047, + "loss": 0.181, + "rewards/accuracies": 0.9593750238418579, + "rewards/chosen": 1.8739244937896729, + "rewards/margins": 3.126392364501953, + "rewards/rejected": -1.2524681091308594, + "step": 60 + }, + { + "epoch": 0.24786986831913246, + "grad_norm": 0.28147539496421814, + "learning_rate": 0.00015800000000000002, + "logits/chosen": -2.2652933597564697, + "logits/rejected": -1.7805808782577515, + "logps/chosen": -333.56829833984375, + "logps/rejected": -216.029541015625, + "loss": 0.0837, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": 1.9676460027694702, + "rewards/margins": 4.509096145629883, + "rewards/rejected": -2.541449785232544, + "step": 80 + }, + { + "epoch": 0.30983733539891556, + "grad_norm": 0.2960963547229767, + "learning_rate": 0.00019800000000000002, + "logits/chosen": -2.193760871887207, + "logits/rejected": -1.7519451379776, + "logps/chosen": -314.7029113769531, + "logps/rejected": -225.7288055419922, + "loss": 0.0583, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": 1.717433214187622, + "rewards/margins": 5.224194526672363, + "rewards/rejected": -3.506760835647583, + "step": 100 + }, + { + "epoch": 0.3718048024786987, + "grad_norm": 0.5042657852172852, + "learning_rate": 0.00019999818237098496, + "logits/chosen": -2.103884220123291, + "logits/rejected": -1.6459449529647827, + "logps/chosen": -353.2919921875, + "logps/rejected": -271.13140869140625, + "loss": 0.0305, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.9538187980651855, + "rewards/margins": 6.551173210144043, + "rewards/rejected": -5.597354888916016, + "step": 120 + }, + { + "epoch": 0.4337722695584818, + "grad_norm": 0.04080686345696449, + "learning_rate": 0.00019999234186476365, + "logits/chosen": -1.97976815700531, + "logits/rejected": -1.4605042934417725, + "logps/chosen": -360.1809387207031, + "logps/rejected": -283.61322021484375, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14382997155189514, + "rewards/margins": 8.151076316833496, + "rewards/rejected": -8.294907569885254, + "step": 140 + }, + { + "epoch": 0.4957397366382649, + "grad_norm": 0.2120855450630188, + "learning_rate": 0.00019998247368159224, + "logits/chosen": -1.9709131717681885, + "logits/rejected": -1.4924025535583496, + "logps/chosen": -352.43408203125, + "logps/rejected": -283.54193115234375, + "loss": 0.0202, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.14206284284591675, + "rewards/margins": 8.227814674377441, + "rewards/rejected": -8.085752487182617, + "step": 160 + }, + { + "epoch": 0.557707203718048, + "grad_norm": 0.14296187460422516, + "learning_rate": 0.00019996857821895966, + "logits/chosen": -1.9138180017471313, + "logits/rejected": -1.3899542093276978, + "logps/chosen": -349.78985595703125, + "logps/rejected": -291.88946533203125, + "loss": 0.014, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -0.20763222873210907, + "rewards/margins": 9.207175254821777, + "rewards/rejected": -9.414807319641113, + "step": 180 + }, + { + "epoch": 0.6196746707978311, + "grad_norm": 0.15746493637561798, + "learning_rate": 0.00019995065603657316, + "logits/chosen": -1.803430199623108, + "logits/rejected": -1.2661969661712646, + "logps/chosen": -365.71539306640625, + "logps/rejected": -320.00897216796875, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6435772180557251, + "rewards/margins": 9.713006973266602, + "rewards/rejected": -10.356585502624512, + "step": 200 + }, + { + "epoch": 0.6816421378776143, + "grad_norm": 0.03454677388072014, + "learning_rate": 0.00019992870785633563, + "logits/chosen": -1.8543188571929932, + "logits/rejected": -1.3436486721038818, + "logps/chosen": -354.7937316894531, + "logps/rejected": -284.83819580078125, + "loss": 0.014, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.3698399066925049, + "rewards/margins": 8.893186569213867, + "rewards/rejected": -8.523347854614258, + "step": 220 + }, + { + "epoch": 0.7436096049573974, + "grad_norm": 0.6197882294654846, + "learning_rate": 0.0001999027345623165, + "logits/chosen": -1.9029203653335571, + "logits/rejected": -1.3840315341949463, + "logps/chosen": -342.14666748046875, + "logps/rejected": -296.29547119140625, + "loss": 0.0103, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": 0.0756167322397232, + "rewards/margins": 9.7724027633667, + "rewards/rejected": -9.696786880493164, + "step": 240 + }, + { + "epoch": 0.8055770720371804, + "grad_norm": 0.03329584375023842, + "learning_rate": 0.00019987273720071632, + "logits/chosen": -1.856236457824707, + "logits/rejected": -1.350318193435669, + "logps/chosen": -358.27630615234375, + "logps/rejected": -334.4754943847656, + "loss": 0.0112, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -0.7377322912216187, + "rewards/margins": 10.27548599243164, + "rewards/rejected": -11.01321792602539, + "step": 260 + }, + { + "epoch": 0.8675445391169636, + "grad_norm": 0.01427725050598383, + "learning_rate": 0.00019983871697982445, + "logits/chosen": -1.7166755199432373, + "logits/rejected": -1.1573728322982788, + "logps/chosen": -357.41461181640625, + "logps/rejected": -323.04693603515625, + "loss": 0.0072, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -1.0735583305358887, + "rewards/margins": 10.781556129455566, + "rewards/rejected": -11.855114936828613, + "step": 280 + }, + { + "epoch": 0.9295120061967467, + "grad_norm": 0.03855278715491295, + "learning_rate": 0.00019980067526997045, + "logits/chosen": -1.6517670154571533, + "logits/rejected": -1.0374754667282104, + "logps/chosen": -368.36083984375, + "logps/rejected": -319.9913635253906, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.422989010810852, + "rewards/margins": 10.918716430664062, + "rewards/rejected": -12.341705322265625, + "step": 300 + }, + { + "epoch": 0.9914794732765299, + "grad_norm": 0.019272373989224434, + "learning_rate": 0.00019975861360346876, + "logits/chosen": -1.6813256740570068, + "logits/rejected": -1.0804450511932373, + "logps/chosen": -365.1048278808594, + "logps/rejected": -309.0344543457031, + "loss": 0.0082, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -0.9812471270561218, + "rewards/margins": 10.517460823059082, + "rewards/rejected": -11.498706817626953, + "step": 320 + }, + { + "epoch": 1.053446940356313, + "grad_norm": 0.05422881618142128, + "learning_rate": 0.00019971253367455727, + "logits/chosen": -1.667327880859375, + "logits/rejected": -1.058142066001892, + "logps/chosen": -386.6405944824219, + "logps/rejected": -340.94427490234375, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.013697862625122, + "rewards/margins": 11.128829956054688, + "rewards/rejected": -12.142528533935547, + "step": 340 + }, + { + "epoch": 1.115414407436096, + "grad_norm": 0.009864700958132744, + "learning_rate": 0.00019966243733932873, + "logits/chosen": -1.5947678089141846, + "logits/rejected": -0.8976384997367859, + "logps/chosen": -368.97564697265625, + "logps/rejected": -330.3668212890625, + "loss": 0.0036, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.5015220642089844, + "rewards/margins": 12.364274978637695, + "rewards/rejected": -13.865796089172363, + "step": 360 + }, + { + "epoch": 1.1773818745158793, + "grad_norm": 0.0011158271227031946, + "learning_rate": 0.00019960832661565622, + "logits/chosen": -1.644182562828064, + "logits/rejected": -1.0111494064331055, + "logps/chosen": -380.652099609375, + "logps/rejected": -347.1996765136719, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7853857278823853, + "rewards/margins": 12.316856384277344, + "rewards/rejected": -14.102241516113281, + "step": 380 + }, + { + "epoch": 1.2393493415956622, + "grad_norm": 0.003002722980454564, + "learning_rate": 0.00019955020368311183, + "logits/chosen": -1.5604654550552368, + "logits/rejected": -0.855100154876709, + "logps/chosen": -383.905517578125, + "logps/rejected": -356.8383483886719, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.063133478164673, + "rewards/margins": 13.559541702270508, + "rewards/rejected": -15.622674942016602, + "step": 400 + }, + { + "epoch": 1.3013168086754454, + "grad_norm": 0.12509553134441376, + "learning_rate": 0.00019948807088287883, + "logits/chosen": -1.4892700910568237, + "logits/rejected": -0.7589560747146606, + "logps/chosen": -369.834716796875, + "logps/rejected": -347.0176086425781, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5435351133346558, + "rewards/margins": 13.051846504211426, + "rewards/rejected": -14.595380783081055, + "step": 420 + }, + { + "epoch": 1.3632842757552286, + "grad_norm": 0.17950604856014252, + "learning_rate": 0.0001994219307176573, + "logits/chosen": -1.3477447032928467, + "logits/rejected": -0.5739808678627014, + "logps/chosen": -383.2652282714844, + "logps/rejected": -377.4029541015625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7989494800567627, + "rewards/margins": 13.98778247833252, + "rewards/rejected": -16.786731719970703, + "step": 440 + }, + { + "epoch": 1.4252517428350115, + "grad_norm": 0.030621694400906563, + "learning_rate": 0.00019935178585156347, + "logits/chosen": -1.2917251586914062, + "logits/rejected": -0.4901725649833679, + "logps/chosen": -387.3026428222656, + "logps/rejected": -381.1557312011719, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.059861660003662, + "rewards/margins": 14.754425048828125, + "rewards/rejected": -17.814287185668945, + "step": 460 + }, + { + "epoch": 1.4872192099147947, + "grad_norm": 0.018743343651294708, + "learning_rate": 0.00019927763911002232, + "logits/chosen": -1.2943724393844604, + "logits/rejected": -0.4242786765098572, + "logps/chosen": -402.34521484375, + "logps/rejected": -377.7867736816406, + "loss": 0.0055, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -3.333390474319458, + "rewards/margins": 14.473798751831055, + "rewards/rejected": -17.80718994140625, + "step": 480 + }, + { + "epoch": 1.549186676994578, + "grad_norm": 0.9102460741996765, + "learning_rate": 0.0001991994934796538, + "logits/chosen": -1.378178358078003, + "logits/rejected": -0.5917572975158691, + "logps/chosen": -379.13470458984375, + "logps/rejected": -377.18170166015625, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.1097264289855957, + "rewards/margins": 13.761838912963867, + "rewards/rejected": -16.871564865112305, + "step": 500 + }, + { + "epoch": 1.6111541440743609, + "grad_norm": 0.08494719117879868, + "learning_rate": 0.0001991173521081525, + "logits/chosen": -1.6131643056869507, + "logits/rejected": -1.0569688081741333, + "logps/chosen": -353.74285888671875, + "logps/rejected": -293.4755859375, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8248776197433472, + "rewards/margins": 10.194657325744629, + "rewards/rejected": -11.019535064697266, + "step": 520 + }, + { + "epoch": 1.673121611154144, + "grad_norm": 0.08382871747016907, + "learning_rate": 0.00019903121830416084, + "logits/chosen": -1.6320635080337524, + "logits/rejected": -1.0402500629425049, + "logps/chosen": -365.8774719238281, + "logps/rejected": -338.92730712890625, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.775146484375, + "rewards/margins": 11.3680419921875, + "rewards/rejected": -12.143187522888184, + "step": 540 + }, + { + "epoch": 1.7350890782339272, + "grad_norm": 0.2668975293636322, + "learning_rate": 0.00019894109553713596, + "logits/chosen": -1.5704303979873657, + "logits/rejected": -0.9429661631584167, + "logps/chosen": -365.54425048828125, + "logps/rejected": -338.42523193359375, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.945920705795288, + "rewards/margins": 11.87964916229248, + "rewards/rejected": -13.825571060180664, + "step": 560 + }, + { + "epoch": 1.7970565453137102, + "grad_norm": 0.0014168431516736746, + "learning_rate": 0.00019884698743720974, + "logits/chosen": -1.5261198282241821, + "logits/rejected": -0.8152379989624023, + "logps/chosen": -387.27398681640625, + "logps/rejected": -349.8082580566406, + "loss": 0.0054, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.8853065967559814, + "rewards/margins": 12.5154447555542, + "rewards/rejected": -15.400751113891602, + "step": 580 + }, + { + "epoch": 1.8590240123934936, + "grad_norm": 0.060512229800224304, + "learning_rate": 0.00019874889779504274, + "logits/chosen": -1.4643357992172241, + "logits/rejected": -0.7492440938949585, + "logps/chosen": -371.1864013671875, + "logps/rejected": -374.9864196777344, + "loss": 0.0053, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -2.833853244781494, + "rewards/margins": 13.810094833374023, + "rewards/rejected": -16.64394760131836, + "step": 600 + }, + { + "epoch": 1.9209914794732765, + "grad_norm": 0.012925480492413044, + "learning_rate": 0.00019864683056167138, + "logits/chosen": -1.5827527046203613, + "logits/rejected": -0.8596705198287964, + "logps/chosen": -378.899658203125, + "logps/rejected": -334.9442443847656, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0560431480407715, + "rewards/margins": 11.832538604736328, + "rewards/rejected": -13.888582229614258, + "step": 620 + }, + { + "epoch": 1.9829589465530595, + "grad_norm": 0.02249460108578205, + "learning_rate": 0.00019854078984834903, + "logits/chosen": -1.6471761465072632, + "logits/rejected": -1.0331779718399048, + "logps/chosen": -384.35479736328125, + "logps/rejected": -350.6449890136719, + "loss": 0.007, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -1.5288515090942383, + "rewards/margins": 11.604903221130371, + "rewards/rejected": -13.133755683898926, + "step": 640 + }, + { + "epoch": 2.044926413632843, + "grad_norm": 0.08954928070306778, + "learning_rate": 0.00019843077992638008, + "logits/chosen": -1.6173429489135742, + "logits/rejected": -0.8341380953788757, + "logps/chosen": -364.81646728515625, + "logps/rejected": -330.5616149902344, + "loss": 0.007, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -1.3666422367095947, + "rewards/margins": 11.623918533325195, + "rewards/rejected": -12.990560531616211, + "step": 660 + }, + { + "epoch": 2.106893880712626, + "grad_norm": 0.007693038322031498, + "learning_rate": 0.00019831680522694822, + "logits/chosen": -1.5330901145935059, + "logits/rejected": -0.7276937365531921, + "logps/chosen": -367.1166687011719, + "logps/rejected": -353.382080078125, + "loss": 0.0045, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.0102535486221313, + "rewards/margins": 13.941228866577148, + "rewards/rejected": -14.951481819152832, + "step": 680 + }, + { + "epoch": 2.168861347792409, + "grad_norm": 0.04048047587275505, + "learning_rate": 0.00019819887034093768, + "logits/chosen": -1.6976232528686523, + "logits/rejected": -1.0186755657196045, + "logps/chosen": -371.5323791503906, + "logps/rejected": -328.90411376953125, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6277135610580444, + "rewards/margins": 12.13409423828125, + "rewards/rejected": -13.761807441711426, + "step": 700 + }, + { + "epoch": 2.230828814872192, + "grad_norm": 0.0501631423830986, + "learning_rate": 0.00019807698001874846, + "logits/chosen": -1.7097208499908447, + "logits/rejected": -1.064752221107483, + "logps/chosen": -355.98419189453125, + "logps/rejected": -349.1392517089844, + "loss": 0.0034, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -1.9499849081039429, + "rewards/margins": 12.722036361694336, + "rewards/rejected": -14.672018051147461, + "step": 720 + }, + { + "epoch": 2.292796281951975, + "grad_norm": 0.09596577286720276, + "learning_rate": 0.000197951139170105, + "logits/chosen": -1.6241413354873657, + "logits/rejected": -0.8459763526916504, + "logps/chosen": -391.1998596191406, + "logps/rejected": -360.8172607421875, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.3230502605438232, + "rewards/margins": 13.51972770690918, + "rewards/rejected": -15.84277629852295, + "step": 740 + }, + { + "epoch": 2.3547637490317586, + "grad_norm": 0.009040975011885166, + "learning_rate": 0.0001978213528638583, + "logits/chosen": -1.6358083486557007, + "logits/rejected": -0.8858752250671387, + "logps/chosen": -385.79425048828125, + "logps/rejected": -363.9739685058594, + "loss": 0.0058, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -2.426802158355713, + "rewards/margins": 13.022024154663086, + "rewards/rejected": -15.448826789855957, + "step": 760 + }, + { + "epoch": 2.4167312161115415, + "grad_norm": 0.0005452932673506439, + "learning_rate": 0.00019768762632778187, + "logits/chosen": -1.5128395557403564, + "logits/rejected": -0.7464967966079712, + "logps/chosen": -386.6916198730469, + "logps/rejected": -367.53118896484375, + "loss": 0.0043, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.430518627166748, + "rewards/margins": 13.321751594543457, + "rewards/rejected": -15.752270698547363, + "step": 780 + }, + { + "epoch": 2.4786986831913245, + "grad_norm": 0.029041077941656113, + "learning_rate": 0.0001975499649483611, + "logits/chosen": -1.5379225015640259, + "logits/rejected": -0.7502544522285461, + "logps/chosen": -391.7222900390625, + "logps/rejected": -362.2694396972656, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.068117618560791, + "rewards/margins": 13.040313720703125, + "rewards/rejected": -15.108428955078125, + "step": 800 + }, + { + "epoch": 2.5406661502711074, + "grad_norm": 0.19298532605171204, + "learning_rate": 0.00019740837427057625, + "logits/chosen": -1.456314206123352, + "logits/rejected": -0.614872932434082, + "logps/chosen": -399.50787353515625, + "logps/rejected": -373.15673828125, + "loss": 0.0053, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.449636936187744, + "rewards/margins": 13.22521686553955, + "rewards/rejected": -16.674854278564453, + "step": 820 + }, + { + "epoch": 2.602633617350891, + "grad_norm": 0.07452496141195297, + "learning_rate": 0.00019726285999767919, + "logits/chosen": -1.4205154180526733, + "logits/rejected": -0.2992118299007416, + "logps/chosen": -389.38677978515625, + "logps/rejected": -361.76226806640625, + "loss": 0.0045, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -3.5656723976135254, + "rewards/margins": 13.694564819335938, + "rewards/rejected": -17.260236740112305, + "step": 840 + }, + { + "epoch": 2.664601084430674, + "grad_norm": 0.2981736361980438, + "learning_rate": 0.00019711342799096361, + "logits/chosen": -1.3148002624511719, + "logits/rejected": -0.20274639129638672, + "logps/chosen": -397.6435546875, + "logps/rejected": -394.09381103515625, + "loss": 0.0077, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.557480812072754, + "rewards/margins": 14.411791801452637, + "rewards/rejected": -18.96927261352539, + "step": 860 + }, + { + "epoch": 2.726568551510457, + "grad_norm": 0.0045928251929581165, + "learning_rate": 0.00019696008426952897, + "logits/chosen": -1.528051733970642, + "logits/rejected": -0.5431190133094788, + "logps/chosen": -370.94769287109375, + "logps/rejected": -352.85858154296875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.877048134803772, + "rewards/margins": 12.948789596557617, + "rewards/rejected": -14.825838088989258, + "step": 880 + }, + { + "epoch": 2.78853601859024, + "grad_norm": 0.004106991924345493, + "learning_rate": 0.00019680283501003797, + "logits/chosen": -1.4431769847869873, + "logits/rejected": -0.3284229636192322, + "logps/chosen": -385.5152893066406, + "logps/rejected": -374.2150573730469, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3450465202331543, + "rewards/margins": 15.452998161315918, + "rewards/rejected": -17.798046112060547, + "step": 900 + }, + { + "epoch": 2.850503485670023, + "grad_norm": 0.0018571156542748213, + "learning_rate": 0.00019664168654646787, + "logits/chosen": -1.432015299797058, + "logits/rejected": -0.47949114441871643, + "logps/chosen": -373.06402587890625, + "logps/rejected": -391.0716247558594, + "loss": 0.004, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.047811985015869, + "rewards/margins": 14.360496520996094, + "rewards/rejected": -16.408308029174805, + "step": 920 + }, + { + "epoch": 2.9124709527498065, + "grad_norm": 0.014180217869579792, + "learning_rate": 0.00019647664536985536, + "logits/chosen": -1.3931537866592407, + "logits/rejected": -0.2861904501914978, + "logps/chosen": -375.765380859375, + "logps/rejected": -380.55035400390625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.287869453430176, + "rewards/margins": 15.421821594238281, + "rewards/rejected": -17.70969009399414, + "step": 940 + }, + { + "epoch": 2.9744384198295895, + "grad_norm": 0.0018744635162875056, + "learning_rate": 0.00019630771812803482, + "logits/chosen": -1.1787105798721313, + "logits/rejected": -0.12521542608737946, + "logps/chosen": -369.18731689453125, + "logps/rejected": -404.814208984375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6008732318878174, + "rewards/margins": 16.838558197021484, + "rewards/rejected": -19.43943214416504, + "step": 960 + }, + { + "epoch": 3.0364058869093724, + "grad_norm": 0.0062718172557652, + "learning_rate": 0.00019613491162537105, + "logits/chosen": -1.1543958187103271, + "logits/rejected": 0.015751656144857407, + "logps/chosen": -387.29241943359375, + "logps/rejected": -409.869384765625, + "loss": 0.0012, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -3.3296706676483154, + "rewards/margins": 16.757152557373047, + "rewards/rejected": -20.086822509765625, + "step": 980 + }, + { + "epoch": 3.098373353989156, + "grad_norm": 0.0017134093213826418, + "learning_rate": 0.00019595823282248472, + "logits/chosen": -1.1962834596633911, + "logits/rejected": 0.06610006093978882, + "logps/chosen": -384.97509765625, + "logps/rejected": -392.449462890625, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8541882038116455, + "rewards/margins": 16.58295249938965, + "rewards/rejected": -19.437145233154297, + "step": 1000 + }, + { + "epoch": 3.1603408210689388, + "grad_norm": 0.004404888488352299, + "learning_rate": 0.00019577768883597224, + "logits/chosen": -1.120615839958191, + "logits/rejected": 0.19888155162334442, + "logps/chosen": -380.3006591796875, + "logps/rejected": -392.1662292480469, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.439927339553833, + "rewards/margins": 16.68685531616211, + "rewards/rejected": -20.126781463623047, + "step": 1020 + }, + { + "epoch": 3.2223082881487217, + "grad_norm": 0.008267875760793686, + "learning_rate": 0.00019559328693811908, + "logits/chosen": -1.243600845336914, + "logits/rejected": 0.01010244432836771, + "logps/chosen": -402.1722412109375, + "logps/rejected": -400.5585021972656, + "loss": 0.0026, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.8246588706970215, + "rewards/margins": 16.491710662841797, + "rewards/rejected": -19.31637191772461, + "step": 1040 + }, + { + "epoch": 3.284275755228505, + "grad_norm": 0.2958065867424011, + "learning_rate": 0.0001954050345566068, + "logits/chosen": -1.2641799449920654, + "logits/rejected": -0.049269963055849075, + "logps/chosen": -399.22235107421875, + "logps/rejected": -417.200439453125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.203291654586792, + "rewards/margins": 17.573293685913086, + "rewards/rejected": -20.776586532592773, + "step": 1060 + }, + { + "epoch": 3.346243222308288, + "grad_norm": 0.006309076678007841, + "learning_rate": 0.00019521293927421388, + "logits/chosen": -1.3915287256240845, + "logits/rejected": -0.25474995374679565, + "logps/chosen": -377.03460693359375, + "logps/rejected": -377.8516540527344, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6546690464019775, + "rewards/margins": 15.229527473449707, + "rewards/rejected": -17.88419532775879, + "step": 1080 + }, + { + "epoch": 3.4082106893880715, + "grad_norm": 0.017275001853704453, + "learning_rate": 0.0001950170088285103, + "logits/chosen": -1.320058822631836, + "logits/rejected": -0.22610430419445038, + "logps/chosen": -367.7281799316406, + "logps/rejected": -360.3981018066406, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8530807495117188, + "rewards/margins": 14.434481620788574, + "rewards/rejected": -17.28756332397461, + "step": 1100 + }, + { + "epoch": 3.4701781564678544, + "grad_norm": 0.006489979103207588, + "learning_rate": 0.00019481725111154577, + "logits/chosen": -1.1993420124053955, + "logits/rejected": 0.13253316283226013, + "logps/chosen": -399.0036315917969, + "logps/rejected": -387.82672119140625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1873600482940674, + "rewards/margins": 15.84624195098877, + "rewards/rejected": -19.03360366821289, + "step": 1120 + }, + { + "epoch": 3.5321456235476374, + "grad_norm": 0.01082077156752348, + "learning_rate": 0.00019461367416953208, + "logits/chosen": -1.0335127115249634, + "logits/rejected": 0.3021327555179596, + "logps/chosen": -389.12701416015625, + "logps/rejected": -399.442626953125, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1247944831848145, + "rewards/margins": 17.54226303100586, + "rewards/rejected": -21.66705894470215, + "step": 1140 + }, + { + "epoch": 3.5941130906274203, + "grad_norm": 0.0018865488236770034, + "learning_rate": 0.00019440628620251874, + "logits/chosen": -0.9542962312698364, + "logits/rejected": 0.2728312611579895, + "logps/chosen": -408.50225830078125, + "logps/rejected": -458.05157470703125, + "loss": 0.0048, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.655162334442139, + "rewards/margins": 18.82590103149414, + "rewards/rejected": -24.481061935424805, + "step": 1160 + }, + { + "epoch": 3.6560805577072037, + "grad_norm": 0.00045730554847978055, + "learning_rate": 0.00019419509556406285, + "logits/chosen": -1.1230123043060303, + "logits/rejected": 0.33410170674324036, + "logps/chosen": -388.6370849609375, + "logps/rejected": -404.42462158203125, + "loss": 0.0025, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -3.1652071475982666, + "rewards/margins": 18.062408447265625, + "rewards/rejected": -21.227617263793945, + "step": 1180 + }, + { + "epoch": 3.7180480247869867, + "grad_norm": 0.12257017195224762, + "learning_rate": 0.00019398011076089252, + "logits/chosen": -1.0567783117294312, + "logits/rejected": 0.1874198019504547, + "logps/chosen": -391.79998779296875, + "logps/rejected": -427.81524658203125, + "loss": 0.0021, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -3.9393508434295654, + "rewards/margins": 17.934566497802734, + "rewards/rejected": -21.873916625976562, + "step": 1200 + }, + { + "epoch": 3.78001549186677, + "grad_norm": 0.020518837496638298, + "learning_rate": 0.00019376134045256423, + "logits/chosen": -1.6692512035369873, + "logits/rejected": -0.8992973566055298, + "logps/chosen": -377.19891357421875, + "logps/rejected": -354.7894592285156, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.314318060874939, + "rewards/margins": 12.59657096862793, + "rewards/rejected": -13.910888671875, + "step": 1220 + }, + { + "epoch": 3.841982958946553, + "grad_norm": 0.006729189306497574, + "learning_rate": 0.00019353879345111413, + "logits/chosen": -1.650059461593628, + "logits/rejected": -0.9682002067565918, + "logps/chosen": -370.81866455078125, + "logps/rejected": -390.57159423828125, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.371593952178955, + "rewards/margins": 13.772104263305664, + "rewards/rejected": -16.14369773864746, + "step": 1240 + }, + { + "epoch": 3.903950426026336, + "grad_norm": 0.007746795192360878, + "learning_rate": 0.000193312478720703, + "logits/chosen": -1.6948245763778687, + "logits/rejected": -0.8788079023361206, + "logps/chosen": -388.7099304199219, + "logps/rejected": -379.38824462890625, + "loss": 0.0044, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.7807118892669678, + "rewards/margins": 14.142008781433105, + "rewards/rejected": -16.92272186279297, + "step": 1260 + }, + { + "epoch": 3.9659178931061194, + "grad_norm": 0.03579813614487648, + "learning_rate": 0.00019308240537725517, + "logits/chosen": -1.610775351524353, + "logits/rejected": -0.7109208703041077, + "logps/chosen": -386.43792724609375, + "logps/rejected": -369.51416015625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.900850534439087, + "rewards/margins": 14.74780559539795, + "rewards/rejected": -17.64865493774414, + "step": 1280 + }, + { + "epoch": 4.027885360185903, + "grad_norm": 0.038256481289863586, + "learning_rate": 0.00019284858268809137, + "logits/chosen": -1.6177009344100952, + "logits/rejected": -0.7443078756332397, + "logps/chosen": -374.1534423828125, + "logps/rejected": -370.35137939453125, + "loss": 0.0014, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -3.607776641845703, + "rewards/margins": 14.8814697265625, + "rewards/rejected": -18.48924446105957, + "step": 1300 + }, + { + "epoch": 4.089852827265686, + "grad_norm": 0.011043623089790344, + "learning_rate": 0.0001926110200715554, + "logits/chosen": -1.5368865728378296, + "logits/rejected": -0.7745493650436401, + "logps/chosen": -364.08038330078125, + "logps/rejected": -396.889404296875, + "loss": 0.0036, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.033879280090332, + "rewards/margins": 14.89660358428955, + "rewards/rejected": -18.930482864379883, + "step": 1320 + }, + { + "epoch": 4.151820294345469, + "grad_norm": 0.0009816307574510574, + "learning_rate": 0.00019236972709663487, + "logits/chosen": -1.4818181991577148, + "logits/rejected": -0.5778347849845886, + "logps/chosen": -388.94549560546875, + "logps/rejected": -394.2232971191406, + "loss": 0.0025, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -3.4815616607666016, + "rewards/margins": 15.029690742492676, + "rewards/rejected": -18.511253356933594, + "step": 1340 + }, + { + "epoch": 4.213787761425252, + "grad_norm": 0.015701670199632645, + "learning_rate": 0.00019212471348257562, + "logits/chosen": -1.4690208435058594, + "logits/rejected": -0.48099368810653687, + "logps/chosen": -396.7850036621094, + "logps/rejected": -408.1683349609375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.170346975326538, + "rewards/margins": 16.654111862182617, + "rewards/rejected": -19.824459075927734, + "step": 1360 + }, + { + "epoch": 4.275755228505035, + "grad_norm": 0.0036051629576832056, + "learning_rate": 0.0001918759890984902, + "logits/chosen": -1.3399614095687866, + "logits/rejected": -0.3685137629508972, + "logps/chosen": -372.87969970703125, + "logps/rejected": -388.63116455078125, + "loss": 0.0023, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.198568344116211, + "rewards/margins": 15.592170715332031, + "rewards/rejected": -19.79073715209961, + "step": 1380 + }, + { + "epoch": 4.337722695584818, + "grad_norm": 0.01418756041675806, + "learning_rate": 0.00019162356396296067, + "logits/chosen": -1.3985111713409424, + "logits/rejected": -0.25035908818244934, + "logps/chosen": -397.00213623046875, + "logps/rejected": -385.6864318847656, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2335734367370605, + "rewards/margins": 15.834146499633789, + "rewards/rejected": -20.067720413208008, + "step": 1400 + }, + { + "epoch": 4.3996901626646014, + "grad_norm": 0.0017586932517588139, + "learning_rate": 0.0001913674482436346, + "logits/chosen": -1.3915663957595825, + "logits/rejected": -0.2883684039115906, + "logps/chosen": -400.862060546875, + "logps/rejected": -400.7593078613281, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.649229526519775, + "rewards/margins": 16.46903419494629, + "rewards/rejected": -21.118263244628906, + "step": 1420 + }, + { + "epoch": 4.461657629744384, + "grad_norm": 0.0009894509566947818, + "learning_rate": 0.00019110765225681582, + "logits/chosen": -1.2623310089111328, + "logits/rejected": -0.3094506859779358, + "logps/chosen": -405.25714111328125, + "logps/rejected": -450.39678955078125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.272841453552246, + "rewards/margins": 17.283151626586914, + "rewards/rejected": -22.555994033813477, + "step": 1440 + }, + { + "epoch": 4.523625096824167, + "grad_norm": 0.011076156981289387, + "learning_rate": 0.00019084418646704882, + "logits/chosen": -1.3033758401870728, + "logits/rejected": -0.33031535148620605, + "logps/chosen": -394.64715576171875, + "logps/rejected": -434.3880310058594, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.661707401275635, + "rewards/margins": 17.585865020751953, + "rewards/rejected": -22.247570037841797, + "step": 1460 + }, + { + "epoch": 4.58559256390395, + "grad_norm": 0.011492446064949036, + "learning_rate": 0.0001905770614866972, + "logits/chosen": -1.2698830366134644, + "logits/rejected": -0.2575158476829529, + "logps/chosen": -411.17333984375, + "logps/rejected": -460.8190002441406, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.470455646514893, + "rewards/margins": 19.05929946899414, + "rewards/rejected": -23.529756546020508, + "step": 1480 + }, + { + "epoch": 4.647560030983733, + "grad_norm": 0.0024670190177857876, + "learning_rate": 0.0001903062880755162, + "logits/chosen": -1.2591683864593506, + "logits/rejected": -0.018044818192720413, + "logps/chosen": -403.6463317871094, + "logps/rejected": -414.41552734375, + "loss": 0.0012, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.826168060302734, + "rewards/margins": 17.654024124145508, + "rewards/rejected": -22.480192184448242, + "step": 1500 + }, + { + "epoch": 4.709527498063517, + "grad_norm": 0.006593796424567699, + "learning_rate": 0.00019003187714021938, + "logits/chosen": -1.2251088619232178, + "logits/rejected": 0.010517546907067299, + "logps/chosen": -413.086181640625, + "logps/rejected": -438.603515625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.791226387023926, + "rewards/margins": 18.92746925354004, + "rewards/rejected": -23.71869659423828, + "step": 1520 + }, + { + "epoch": 4.7714949651433, + "grad_norm": 0.004145272541791201, + "learning_rate": 0.00018975383973403914, + "logits/chosen": -1.0933794975280762, + "logits/rejected": 0.2759025990962982, + "logps/chosen": -422.998779296875, + "logps/rejected": -437.8479919433594, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.828397274017334, + "rewards/margins": 19.29062843322754, + "rewards/rejected": -25.1190242767334, + "step": 1540 + }, + { + "epoch": 4.833462432223083, + "grad_norm": 0.0056694443337619305, + "learning_rate": 0.00018947218705628167, + "logits/chosen": -1.1124125719070435, + "logits/rejected": 0.17458318173885345, + "logps/chosen": -418.27703857421875, + "logps/rejected": -469.0071716308594, + "loss": 0.0045, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.002425670623779, + "rewards/margins": 20.302865982055664, + "rewards/rejected": -25.3052921295166, + "step": 1560 + }, + { + "epoch": 4.895429899302866, + "grad_norm": 0.00018859546980820596, + "learning_rate": 0.0001891869304518758, + "logits/chosen": -1.0209033489227295, + "logits/rejected": 0.3391306698322296, + "logps/chosen": -400.2538146972656, + "logps/rejected": -430.3617248535156, + "loss": 0.0034, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.619088172912598, + "rewards/margins": 19.616975784301758, + "rewards/rejected": -24.23606300354004, + "step": 1580 + }, + { + "epoch": 4.957397366382649, + "grad_norm": 0.029597284272313118, + "learning_rate": 0.00018889808141091597, + "logits/chosen": -1.0116602182388306, + "logits/rejected": 0.38184159994125366, + "logps/chosen": -412.35205078125, + "logps/rejected": -459.47991943359375, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.358821868896484, + "rewards/margins": 20.45774269104004, + "rewards/rejected": -25.816564559936523, + "step": 1600 + }, + { + "epoch": 5.019364833462432, + "grad_norm": 0.001491004484705627, + "learning_rate": 0.00018860565156819935, + "logits/chosen": -1.278867244720459, + "logits/rejected": -0.06797562539577484, + "logps/chosen": -397.50836181640625, + "logps/rejected": -435.203857421875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6724727153778076, + "rewards/margins": 17.823925018310547, + "rewards/rejected": -21.49639892578125, + "step": 1620 + }, + { + "epoch": 5.081332300542216, + "grad_norm": 0.012196192517876625, + "learning_rate": 0.00018830965270275746, + "logits/chosen": -1.1280790567398071, + "logits/rejected": 0.10533533245325089, + "logps/chosen": -397.92498779296875, + "logps/rejected": -435.8148498535156, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.880277633666992, + "rewards/margins": 18.349157333374023, + "rewards/rejected": -23.229434967041016, + "step": 1640 + }, + { + "epoch": 5.143299767621999, + "grad_norm": 0.0038448043633252382, + "learning_rate": 0.00018801009673738138, + "logits/chosen": -1.1577694416046143, + "logits/rejected": 0.07719329744577408, + "logps/chosen": -417.5320739746094, + "logps/rejected": -470.4189453125, + "loss": 0.0055, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.50655460357666, + "rewards/margins": 19.84600830078125, + "rewards/rejected": -24.352563858032227, + "step": 1660 + }, + { + "epoch": 5.205267234701782, + "grad_norm": 0.0032272750977426767, + "learning_rate": 0.00018770699573814176, + "logits/chosen": -1.0571038722991943, + "logits/rejected": 0.0797622948884964, + "logps/chosen": -390.07745361328125, + "logps/rejected": -450.1996154785156, + "loss": 0.0012, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.879145622253418, + "rewards/margins": 18.845855712890625, + "rewards/rejected": -23.725000381469727, + "step": 1680 + }, + { + "epoch": 5.267234701781565, + "grad_norm": 0.0002681920013856143, + "learning_rate": 0.0001874003619139026, + "logits/chosen": -1.136499047279358, + "logits/rejected": 0.09882185608148575, + "logps/chosen": -404.396240234375, + "logps/rejected": -455.455322265625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.68656587600708, + "rewards/margins": 19.550119400024414, + "rewards/rejected": -23.236684799194336, + "step": 1700 + }, + { + "epoch": 5.329202168861348, + "grad_norm": 0.002475241431966424, + "learning_rate": 0.00018709020761582967, + "logits/chosen": -1.1179392337799072, + "logits/rejected": 0.12788158655166626, + "logps/chosen": -407.6341857910156, + "logps/rejected": -452.46905517578125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.104896068572998, + "rewards/margins": 19.286361694335938, + "rewards/rejected": -24.39125633239746, + "step": 1720 + }, + { + "epoch": 5.3911696359411305, + "grad_norm": 9.136780863627791e-05, + "learning_rate": 0.00018677654533689287, + "logits/chosen": -1.0823495388031006, + "logits/rejected": 0.1217053085565567, + "logps/chosen": -406.08746337890625, + "logps/rejected": -469.70672607421875, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.2551984786987305, + "rewards/margins": 19.903423309326172, + "rewards/rejected": -25.15862274169922, + "step": 1740 + }, + { + "epoch": 5.453137103020914, + "grad_norm": 0.004714645445346832, + "learning_rate": 0.00018645938771136303, + "logits/chosen": -1.1251791715621948, + "logits/rejected": 0.17070087790489197, + "logps/chosen": -389.72161865234375, + "logps/rejected": -451.95098876953125, + "loss": 0.0076, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -4.84260368347168, + "rewards/margins": 20.070592880249023, + "rewards/rejected": -24.91319465637207, + "step": 1760 + }, + { + "epoch": 5.515104570100697, + "grad_norm": 0.0027761622332036495, + "learning_rate": 0.00018613874751430306, + "logits/chosen": -1.0996571779251099, + "logits/rejected": 0.08243855088949203, + "logps/chosen": -397.93212890625, + "logps/rejected": -448.5020446777344, + "loss": 0.0012, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.580557346343994, + "rewards/margins": 19.335094451904297, + "rewards/rejected": -23.915653228759766, + "step": 1780 + }, + { + "epoch": 5.57707203718048, + "grad_norm": 0.0022653560154139996, + "learning_rate": 0.0001858146376610534, + "logits/chosen": -1.1286752223968506, + "logits/rejected": 0.13742080330848694, + "logps/chosen": -411.6807556152344, + "logps/rejected": -451.95098876953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.51253604888916, + "rewards/margins": 19.66611099243164, + "rewards/rejected": -24.178647994995117, + "step": 1800 + }, + { + "epoch": 5.639039504260263, + "grad_norm": 0.003803479950875044, + "learning_rate": 0.0001854870712067116, + "logits/chosen": -1.052321195602417, + "logits/rejected": 0.1815873682498932, + "logps/chosen": -400.9226379394531, + "logps/rejected": -471.00616455078125, + "loss": 0.0012, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.826806545257568, + "rewards/margins": 20.531307220458984, + "rewards/rejected": -25.358118057250977, + "step": 1820 + }, + { + "epoch": 5.701006971340046, + "grad_norm": 0.0006538841407746077, + "learning_rate": 0.00018515606134560675, + "logits/chosen": -0.9289565086364746, + "logits/rejected": 0.3704550266265869, + "logps/chosen": -403.9594421386719, + "logps/rejected": -450.3981018066406, + "loss": 0.0024, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.7688727378845215, + "rewards/margins": 19.25821304321289, + "rewards/rejected": -25.027088165283203, + "step": 1840 + }, + { + "epoch": 5.76297443841983, + "grad_norm": 0.0022948060650378466, + "learning_rate": 0.00018482162141076778, + "logits/chosen": -0.9270216822624207, + "logits/rejected": 0.4516163766384125, + "logps/chosen": -397.4948425292969, + "logps/rejected": -454.2916564941406, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.666428089141846, + "rewards/margins": 20.301498413085938, + "rewards/rejected": -24.967926025390625, + "step": 1860 + }, + { + "epoch": 5.824941905499613, + "grad_norm": 0.006001237779855728, + "learning_rate": 0.00018448376487338646, + "logits/chosen": -0.8780455589294434, + "logits/rejected": 0.6470359563827515, + "logps/chosen": -391.3553161621094, + "logps/rejected": -444.733642578125, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.144345283508301, + "rewards/margins": 21.074970245361328, + "rewards/rejected": -26.219318389892578, + "step": 1880 + }, + { + "epoch": 5.886909372579396, + "grad_norm": 0.005203678738325834, + "learning_rate": 0.00018414250534227485, + "logits/chosen": -0.8152813911437988, + "logits/rejected": 0.5252994894981384, + "logps/chosen": -437.67633056640625, + "logps/rejected": -507.53533935546875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.068148612976074, + "rewards/margins": 21.216217041015625, + "rewards/rejected": -27.28436279296875, + "step": 1900 + }, + { + "epoch": 5.948876839659179, + "grad_norm": 0.0005356409237720072, + "learning_rate": 0.00018379785656331713, + "logits/chosen": -0.9145771265029907, + "logits/rejected": 0.49262484908103943, + "logps/chosen": -414.5752868652344, + "logps/rejected": -481.29443359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.121223449707031, + "rewards/margins": 21.19986343383789, + "rewards/rejected": -26.32108497619629, + "step": 1920 + }, + { + "epoch": 6.010844306738962, + "grad_norm": 0.003898509545251727, + "learning_rate": 0.00018344983241891586, + "logits/chosen": -0.9906449317932129, + "logits/rejected": 0.5498068332672119, + "logps/chosen": -414.0738830566406, + "logps/rejected": -446.76800537109375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.372982978820801, + "rewards/margins": 20.67215347290039, + "rewards/rejected": -26.045135498046875, + "step": 1940 + }, + { + "epoch": 6.072811773818745, + "grad_norm": 0.002160005969926715, + "learning_rate": 0.00018309844692743283, + "logits/chosen": -0.9815686941146851, + "logits/rejected": 0.5956189632415771, + "logps/chosen": -405.01654052734375, + "logps/rejected": -442.0355529785156, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.603247165679932, + "rewards/margins": 20.942607879638672, + "rewards/rejected": -25.545854568481445, + "step": 1960 + }, + { + "epoch": 6.134779240898529, + "grad_norm": 0.000817653548438102, + "learning_rate": 0.0001827437142426244, + "logits/chosen": -0.8936567306518555, + "logits/rejected": 0.5212098360061646, + "logps/chosen": -395.4510498046875, + "logps/rejected": -475.28631591796875, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.184081077575684, + "rewards/margins": 21.514982223510742, + "rewards/rejected": -26.69906234741211, + "step": 1980 + }, + { + "epoch": 6.196746707978312, + "grad_norm": 0.002587874187156558, + "learning_rate": 0.00018238564865307138, + "logits/chosen": -0.8976683616638184, + "logits/rejected": 0.5809212923049927, + "logps/chosen": -413.9544372558594, + "logps/rejected": -460.4183654785156, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.504066467285156, + "rewards/margins": 20.807832717895508, + "rewards/rejected": -26.311901092529297, + "step": 2000 + }, + { + "epoch": 6.258714175058095, + "grad_norm": 0.0015577581943944097, + "learning_rate": 0.00018202426458160354, + "logits/chosen": -0.9174526333808899, + "logits/rejected": 0.5119841694831848, + "logps/chosen": -408.99609375, + "logps/rejected": -474.6453552246094, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.049027919769287, + "rewards/margins": 21.92804718017578, + "rewards/rejected": -26.977075576782227, + "step": 2020 + }, + { + "epoch": 6.3206816421378775, + "grad_norm": 0.0013881104532629251, + "learning_rate": 0.00018165957658471853, + "logits/chosen": -0.8762157559394836, + "logits/rejected": 0.3336823880672455, + "logps/chosen": -424.74237060546875, + "logps/rejected": -481.97198486328125, + "loss": 0.0044, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.531933784484863, + "rewards/margins": 19.711212158203125, + "rewards/rejected": -26.243144989013672, + "step": 2040 + }, + { + "epoch": 6.3826491092176605, + "grad_norm": 0.0015647505642846227, + "learning_rate": 0.00018129159935199572, + "logits/chosen": -0.9118504524230957, + "logits/rejected": 0.40582960844039917, + "logps/chosen": -405.77001953125, + "logps/rejected": -471.5838317871094, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.325821876525879, + "rewards/margins": 20.694194793701172, + "rewards/rejected": -26.02001953125, + "step": 2060 + }, + { + "epoch": 6.4446165762974434, + "grad_norm": 0.00021198451577220112, + "learning_rate": 0.00018092034770550436, + "logits/chosen": -0.9560929536819458, + "logits/rejected": 0.44417256116867065, + "logps/chosen": -407.6698303222656, + "logps/rejected": -475.4552307128906, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.728771209716797, + "rewards/margins": 21.15816879272461, + "rewards/rejected": -26.88694190979004, + "step": 2080 + }, + { + "epoch": 6.506584043377227, + "grad_norm": 0.002910887822508812, + "learning_rate": 0.00018054583659920669, + "logits/chosen": -0.913163959980011, + "logits/rejected": 0.4632677435874939, + "logps/chosen": -397.37841796875, + "logps/rejected": -459.6831970214844, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.117273807525635, + "rewards/margins": 21.222431182861328, + "rewards/rejected": -26.339706420898438, + "step": 2100 + }, + { + "epoch": 6.56855151045701, + "grad_norm": 0.0028291107155382633, + "learning_rate": 0.00018016808111835544, + "logits/chosen": -0.9535878896713257, + "logits/rejected": 0.4175487160682678, + "logps/chosen": -405.0999755859375, + "logps/rejected": -472.16632080078125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.1232757568359375, + "rewards/margins": 20.961214065551758, + "rewards/rejected": -26.084487915039062, + "step": 2120 + }, + { + "epoch": 6.630518977536793, + "grad_norm": 0.00040341372368857265, + "learning_rate": 0.0001797870964788863, + "logits/chosen": -0.9253376126289368, + "logits/rejected": 0.42822474241256714, + "logps/chosen": -415.0838928222656, + "logps/rejected": -483.0498962402344, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.194808006286621, + "rewards/margins": 20.92584228515625, + "rewards/rejected": -27.120647430419922, + "step": 2140 + }, + { + "epoch": 6.692486444616576, + "grad_norm": 0.0016098152846097946, + "learning_rate": 0.0001794028980268049, + "logits/chosen": -0.9861367344856262, + "logits/rejected": 0.6260654330253601, + "logps/chosen": -428.81243896484375, + "logps/rejected": -470.528564453125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.6738715171813965, + "rewards/margins": 21.712966918945312, + "rewards/rejected": -27.386837005615234, + "step": 2160 + }, + { + "epoch": 6.754453911696359, + "grad_norm": 0.00064540357561782, + "learning_rate": 0.00017901550123756906, + "logits/chosen": -0.9794920086860657, + "logits/rejected": 0.5585097074508667, + "logps/chosen": -405.3882141113281, + "logps/rejected": -460.2223205566406, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.186602592468262, + "rewards/margins": 21.48334312438965, + "rewards/rejected": -26.66994285583496, + "step": 2180 + }, + { + "epoch": 6.816421378776143, + "grad_norm": 0.01159485150128603, + "learning_rate": 0.00017862492171546478, + "logits/chosen": -0.8846482038497925, + "logits/rejected": 0.5302290320396423, + "logps/chosen": -418.35211181640625, + "logps/rejected": -480.18597412109375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.644828796386719, + "rewards/margins": 21.466665267944336, + "rewards/rejected": -27.111492156982422, + "step": 2200 + }, + { + "epoch": 6.878388845855926, + "grad_norm": 0.0008106857421807945, + "learning_rate": 0.0001782311751929784, + "logits/chosen": -0.8796793222427368, + "logits/rejected": 0.47619304060935974, + "logps/chosen": -407.5473327636719, + "logps/rejected": -496.4972229003906, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.067633628845215, + "rewards/margins": 22.56241226196289, + "rewards/rejected": -27.63004493713379, + "step": 2220 + }, + { + "epoch": 6.940356312935709, + "grad_norm": 0.00046345905866473913, + "learning_rate": 0.00017783427753016232, + "logits/chosen": -0.837285041809082, + "logits/rejected": 0.547538161277771, + "logps/chosen": -422.29541015625, + "logps/rejected": -488.83734130859375, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.489915370941162, + "rewards/margins": 20.592243194580078, + "rewards/rejected": -27.0821590423584, + "step": 2240 + }, + { + "epoch": 7.002323780015492, + "grad_norm": 0.0002803100214805454, + "learning_rate": 0.00017743424471399662, + "logits/chosen": -0.9276706576347351, + "logits/rejected": 0.4761223793029785, + "logps/chosen": -409.07073974609375, + "logps/rejected": -465.58544921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.659182548522949, + "rewards/margins": 20.956308364868164, + "rewards/rejected": -26.615489959716797, + "step": 2260 + }, + { + "epoch": 7.064291247095275, + "grad_norm": 0.0009606317616999149, + "learning_rate": 0.00017703109285774473, + "logits/chosen": -0.8453294634819031, + "logits/rejected": 0.4921432435512543, + "logps/chosen": -413.75616455078125, + "logps/rejected": -493.93634033203125, + "loss": 0.0033, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -5.530547142028809, + "rewards/margins": 21.9550724029541, + "rewards/rejected": -27.48561668395996, + "step": 2280 + }, + { + "epoch": 7.126258714175058, + "grad_norm": 0.00027114342083223164, + "learning_rate": 0.00017662483820030466, + "logits/chosen": -0.8099842071533203, + "logits/rejected": 0.5511294603347778, + "logps/chosen": -417.60467529296875, + "logps/rejected": -493.1431579589844, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.9566192626953125, + "rewards/margins": 21.779888153076172, + "rewards/rejected": -27.73651123046875, + "step": 2300 + }, + { + "epoch": 7.188226181254842, + "grad_norm": 0.00029379583429545164, + "learning_rate": 0.00017623603720914402, + "logits/chosen": -0.9296634793281555, + "logits/rejected": 0.502611517906189, + "logps/chosen": -408.59228515625, + "logps/rejected": -462.72052001953125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.541842937469482, + "rewards/margins": 21.353099822998047, + "rewards/rejected": -26.894943237304688, + "step": 2320 + }, + { + "epoch": 7.2501936483346245, + "grad_norm": 0.0035946103744208813, + "learning_rate": 0.00017582377926923305, + "logits/chosen": -0.904528021812439, + "logits/rejected": 0.5609920620918274, + "logps/chosen": -410.15753173828125, + "logps/rejected": -459.9115295410156, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.2013936042785645, + "rewards/margins": 21.163597106933594, + "rewards/rejected": -26.364990234375, + "step": 2340 + }, + { + "epoch": 7.3121611154144075, + "grad_norm": 0.001584152108989656, + "learning_rate": 0.00017540846715854923, + "logits/chosen": -0.8879092335700989, + "logits/rejected": 0.5792483687400818, + "logps/chosen": -416.14190673828125, + "logps/rejected": -487.8919372558594, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.094472408294678, + "rewards/margins": 21.977914810180664, + "rewards/rejected": -28.0723876953125, + "step": 2360 + }, + { + "epoch": 7.3741285824941905, + "grad_norm": 0.0013305445900186896, + "learning_rate": 0.00017499011760580376, + "logits/chosen": -0.8160873651504517, + "logits/rejected": 0.6186665296554565, + "logps/chosen": -419.13006591796875, + "logps/rejected": -483.44482421875, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.028205871582031, + "rewards/margins": 21.97576904296875, + "rewards/rejected": -28.003976821899414, + "step": 2380 + }, + { + "epoch": 7.436096049573973, + "grad_norm": 0.0005457611987367272, + "learning_rate": 0.00017456874746205568, + "logits/chosen": -0.8007138967514038, + "logits/rejected": 0.6117135882377625, + "logps/chosen": -416.05438232421875, + "logps/rejected": -488.6453552246094, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.2818403244018555, + "rewards/margins": 22.225994110107422, + "rewards/rejected": -28.507831573486328, + "step": 2400 + }, + { + "epoch": 7.498063516653756, + "grad_norm": 0.0002609234652481973, + "learning_rate": 0.00017414437370003293, + "logits/chosen": -0.885455310344696, + "logits/rejected": 0.7295613288879395, + "logps/chosen": -407.35797119140625, + "logps/rejected": -458.9103088378906, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.998741626739502, + "rewards/margins": 22.125844955444336, + "rewards/rejected": -28.124588012695312, + "step": 2420 + }, + { + "epoch": 7.56003098373354, + "grad_norm": 0.010798891074955463, + "learning_rate": 0.00017371701341344878, + "logits/chosen": -0.8749537467956543, + "logits/rejected": 0.737878680229187, + "logps/chosen": -418.1792907714844, + "logps/rejected": -482.16619873046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.506211280822754, + "rewards/margins": 22.69670295715332, + "rewards/rejected": -28.20291519165039, + "step": 2440 + }, + { + "epoch": 7.621998450813323, + "grad_norm": 0.002587670926004648, + "learning_rate": 0.00017328668381631318, + "logits/chosen": -0.8414995074272156, + "logits/rejected": 0.6650308966636658, + "logps/chosen": -403.06243896484375, + "logps/rejected": -462.6051330566406, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.898090362548828, + "rewards/margins": 21.654659271240234, + "rewards/rejected": -27.552749633789062, + "step": 2460 + }, + { + "epoch": 7.683965917893106, + "grad_norm": 0.0005328520783223212, + "learning_rate": 0.00017285340224223965, + "logits/chosen": -0.8659757375717163, + "logits/rejected": 0.6123751401901245, + "logps/chosen": -417.27764892578125, + "logps/rejected": -472.28228759765625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.232241153717041, + "rewards/margins": 21.851760864257812, + "rewards/rejected": -28.084003448486328, + "step": 2480 + }, + { + "epoch": 7.745933384972889, + "grad_norm": 0.00171385589055717, + "learning_rate": 0.00017241718614374678, + "logits/chosen": -0.7435486912727356, + "logits/rejected": 0.608551025390625, + "logps/chosen": -418.3434143066406, + "logps/rejected": -485.39825439453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.975823402404785, + "rewards/margins": 21.549240112304688, + "rewards/rejected": -27.525060653686523, + "step": 2500 + }, + { + "epoch": 7.807900852052672, + "grad_norm": 0.000463314849184826, + "learning_rate": 0.00017197805309155536, + "logits/chosen": -0.8705068826675415, + "logits/rejected": 0.7182124257087708, + "logps/chosen": -418.2559509277344, + "logps/rejected": -470.73431396484375, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.518788814544678, + "rewards/margins": 21.688777923583984, + "rewards/rejected": -28.207569122314453, + "step": 2520 + }, + { + "epoch": 7.869868319132456, + "grad_norm": 0.002235127380117774, + "learning_rate": 0.0001715360207738808, + "logits/chosen": -0.8467845916748047, + "logits/rejected": 0.6931872367858887, + "logps/chosen": -398.0982360839844, + "logps/rejected": -461.494140625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.6116943359375, + "rewards/margins": 20.939289093017578, + "rewards/rejected": -27.550983428955078, + "step": 2540 + }, + { + "epoch": 7.931835786212239, + "grad_norm": 0.0014148158952593803, + "learning_rate": 0.0001710911069957203, + "logits/chosen": -0.8599146008491516, + "logits/rejected": 0.597332239151001, + "logps/chosen": -415.1485900878906, + "logps/rejected": -478.2962341308594, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.637750148773193, + "rewards/margins": 21.647342681884766, + "rewards/rejected": -27.285091400146484, + "step": 2560 + }, + { + "epoch": 7.993803253292022, + "grad_norm": 0.00026821051142178476, + "learning_rate": 0.00017064332967813605, + "logits/chosen": -0.8094769716262817, + "logits/rejected": 0.5584506988525391, + "logps/chosen": -433.38525390625, + "logps/rejected": -498.05792236328125, + "loss": 0.0065, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.968195915222168, + "rewards/margins": 21.783451080322266, + "rewards/rejected": -27.751644134521484, + "step": 2580 + }, + { + "epoch": 8.055770720371806, + "grad_norm": 0.0008632375975139439, + "learning_rate": 0.0001701927068575331, + "logits/chosen": -0.8063030242919922, + "logits/rejected": 0.681650698184967, + "logps/chosen": -417.40740966796875, + "logps/rejected": -483.35833740234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.037853240966797, + "rewards/margins": 22.328609466552734, + "rewards/rejected": -28.3664608001709, + "step": 2600 + }, + { + "epoch": 8.117738187451588, + "grad_norm": 0.00020725080685224384, + "learning_rate": 0.0001697392566849329, + "logits/chosen": -0.7843044996261597, + "logits/rejected": 0.6513689160346985, + "logps/chosen": -413.47589111328125, + "logps/rejected": -490.56103515625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.002742290496826, + "rewards/margins": 22.975902557373047, + "rewards/rejected": -28.9786434173584, + "step": 2620 + }, + { + "epoch": 8.179705654531372, + "grad_norm": 0.001819239230826497, + "learning_rate": 0.00016928299742524234, + "logits/chosen": -0.8633748292922974, + "logits/rejected": 0.6922792196273804, + "logps/chosen": -412.0850524902344, + "logps/rejected": -474.5027770996094, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.745542526245117, + "rewards/margins": 22.514820098876953, + "rewards/rejected": -28.260364532470703, + "step": 2640 + }, + { + "epoch": 8.241673121611154, + "grad_norm": 0.0030217173043638468, + "learning_rate": 0.00016882394745651783, + "logits/chosen": -0.9476584196090698, + "logits/rejected": 0.6893962025642395, + "logps/chosen": -416.17291259765625, + "logps/rejected": -479.39923095703125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.220405101776123, + "rewards/margins": 22.663036346435547, + "rewards/rejected": -28.883441925048828, + "step": 2660 + }, + { + "epoch": 8.303640588690937, + "grad_norm": 0.0001663499278947711, + "learning_rate": 0.00016836212526922522, + "logits/chosen": -0.8719544410705566, + "logits/rejected": 0.6744921207427979, + "logps/chosen": -417.1148986816406, + "logps/rejected": -475.9832458496094, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.162570953369141, + "rewards/margins": 22.504776000976562, + "rewards/rejected": -27.667346954345703, + "step": 2680 + }, + { + "epoch": 8.36560805577072, + "grad_norm": 0.002916699508205056, + "learning_rate": 0.00016789754946549485, + "logits/chosen": -0.9196687936782837, + "logits/rejected": 0.5761129260063171, + "logps/chosen": -411.28033447265625, + "logps/rejected": -478.9852600097656, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.738530158996582, + "rewards/margins": 22.162540435791016, + "rewards/rejected": -27.901071548461914, + "step": 2700 + }, + { + "epoch": 8.427575522850503, + "grad_norm": 0.00011332995200064033, + "learning_rate": 0.00016743023875837233, + "logits/chosen": -0.7387035489082336, + "logits/rejected": 0.6855649948120117, + "logps/chosen": -426.68487548828125, + "logps/rejected": -502.105712890625, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.9964470863342285, + "rewards/margins": 23.166492462158203, + "rewards/rejected": -29.16294288635254, + "step": 2720 + }, + { + "epoch": 8.489542989930287, + "grad_norm": 0.0009012964437715709, + "learning_rate": 0.00016696021197106487, + "logits/chosen": -0.8611875772476196, + "logits/rejected": 0.7133287191390991, + "logps/chosen": -395.930419921875, + "logps/rejected": -469.5165100097656, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.954014778137207, + "rewards/margins": 22.503925323486328, + "rewards/rejected": -27.45794105529785, + "step": 2740 + }, + { + "epoch": 8.55151045701007, + "grad_norm": 0.00019017455633729696, + "learning_rate": 0.00016648748803618286, + "logits/chosen": -0.8437725901603699, + "logits/rejected": 0.6948248147964478, + "logps/chosen": -429.0943298339844, + "logps/rejected": -488.63507080078125, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.994604110717773, + "rewards/margins": 21.811975479125977, + "rewards/rejected": -27.80657958984375, + "step": 2760 + }, + { + "epoch": 8.613477924089853, + "grad_norm": 0.002179044298827648, + "learning_rate": 0.00016601208599497752, + "logits/chosen": -0.8570048213005066, + "logits/rejected": 0.5581663846969604, + "logps/chosen": -420.24639892578125, + "logps/rejected": -517.4495239257812, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.668288230895996, + "rewards/margins": 22.895244598388672, + "rewards/rejected": -29.563533782958984, + "step": 2780 + }, + { + "epoch": 8.675445391169635, + "grad_norm": 0.0014218884753063321, + "learning_rate": 0.0001655340249965737, + "logits/chosen": -0.8298648595809937, + "logits/rejected": 0.7341376543045044, + "logps/chosen": -417.24017333984375, + "logps/rejected": -486.96197509765625, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.189940929412842, + "rewards/margins": 22.539569854736328, + "rewards/rejected": -28.72951316833496, + "step": 2800 + }, + { + "epoch": 8.737412858249419, + "grad_norm": 0.004287872463464737, + "learning_rate": 0.0001650533242971987, + "logits/chosen": -0.783652126789093, + "logits/rejected": 0.6020525693893433, + "logps/chosen": -433.5499572753906, + "logps/rejected": -506.0953063964844, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.208212852478027, + "rewards/margins": 22.688650131225586, + "rewards/rejected": -28.896865844726562, + "step": 2820 + }, + { + "epoch": 8.799380325329203, + "grad_norm": 0.0011698489543050528, + "learning_rate": 0.00016457000325940667, + "logits/chosen": -0.9040369987487793, + "logits/rejected": 0.7520915269851685, + "logps/chosen": -423.38189697265625, + "logps/rejected": -480.9803161621094, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.842212677001953, + "rewards/margins": 23.215862274169922, + "rewards/rejected": -29.058074951171875, + "step": 2840 + }, + { + "epoch": 8.861347792408985, + "grad_norm": 0.0011277641169726849, + "learning_rate": 0.0001640840813512985, + "logits/chosen": -0.8744897842407227, + "logits/rejected": 0.7434971928596497, + "logps/chosen": -426.23931884765625, + "logps/rejected": -501.59710693359375, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.3598833084106445, + "rewards/margins": 23.147171020507812, + "rewards/rejected": -29.507055282592773, + "step": 2860 + }, + { + "epoch": 8.923315259488769, + "grad_norm": 0.0010242237476632, + "learning_rate": 0.00016359557814573777, + "logits/chosen": -0.7934980392456055, + "logits/rejected": 0.6807147264480591, + "logps/chosen": -406.7119445800781, + "logps/rejected": -484.58831787109375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.454986572265625, + "rewards/margins": 22.421138763427734, + "rewards/rejected": -28.876129150390625, + "step": 2880 + }, + { + "epoch": 8.98528272656855, + "grad_norm": 0.00022442563204094768, + "learning_rate": 0.00016310451331956238, + "logits/chosen": -0.7804977297782898, + "logits/rejected": 0.7946068048477173, + "logps/chosen": -423.38397216796875, + "logps/rejected": -501.4856872558594, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.33301305770874, + "rewards/margins": 23.95859146118164, + "rewards/rejected": -29.29160499572754, + "step": 2900 + }, + { + "epoch": 9.047250193648335, + "grad_norm": 0.0018227125983685255, + "learning_rate": 0.00016261090665279198, + "logits/chosen": -0.8016149401664734, + "logits/rejected": 0.6636785268783569, + "logps/chosen": -426.03240966796875, + "logps/rejected": -527.2731323242188, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.532900810241699, + "rewards/margins": 24.20720863342285, + "rewards/rejected": -30.7401065826416, + "step": 2920 + }, + { + "epoch": 9.109217660728119, + "grad_norm": 0.0025721483398228884, + "learning_rate": 0.00016211477802783103, + "logits/chosen": -0.7957175374031067, + "logits/rejected": 0.7413456439971924, + "logps/chosen": -411.5255432128906, + "logps/rejected": -510.88909912109375, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.42633056640625, + "rewards/margins": 23.930238723754883, + "rewards/rejected": -30.3565673828125, + "step": 2940 + }, + { + "epoch": 9.1711851278079, + "grad_norm": 0.0015712358290329576, + "learning_rate": 0.00016161614742866832, + "logits/chosen": -0.7725690603256226, + "logits/rejected": 0.7799097895622253, + "logps/chosen": -412.74810791015625, + "logps/rejected": -492.89892578125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.049948692321777, + "rewards/margins": 22.938610076904297, + "rewards/rejected": -28.988555908203125, + "step": 2960 + }, + { + "epoch": 9.233152594887684, + "grad_norm": 0.0009299792000092566, + "learning_rate": 0.0001611150349400716, + "logits/chosen": -0.8838273882865906, + "logits/rejected": 0.7425965070724487, + "logps/chosen": -419.3350524902344, + "logps/rejected": -481.666015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.259576320648193, + "rewards/margins": 22.729785919189453, + "rewards/rejected": -28.989360809326172, + "step": 2980 + }, + { + "epoch": 9.295120061967467, + "grad_norm": 0.00034220717498101294, + "learning_rate": 0.00016061146074677885, + "logits/chosen": -0.728831946849823, + "logits/rejected": 0.6241214871406555, + "logps/chosen": -414.3390197753906, + "logps/rejected": -511.9385681152344, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.351578235626221, + "rewards/margins": 22.98415756225586, + "rewards/rejected": -29.335735321044922, + "step": 3000 + }, + { + "epoch": 9.35708752904725, + "grad_norm": 0.002175732748582959, + "learning_rate": 0.00016010544513268515, + "logits/chosen": -0.8456689715385437, + "logits/rejected": 0.7692159414291382, + "logps/chosen": -417.7942810058594, + "logps/rejected": -494.46240234375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.584621429443359, + "rewards/margins": 23.603557586669922, + "rewards/rejected": -29.18817710876465, + "step": 3020 + }, + { + "epoch": 9.419054996127032, + "grad_norm": 0.00041741851600818336, + "learning_rate": 0.00015959700848002567, + "logits/chosen": -0.8197334408760071, + "logits/rejected": 0.8417131304740906, + "logps/chosen": -426.35845947265625, + "logps/rejected": -491.63421630859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.198635578155518, + "rewards/margins": 23.649044036865234, + "rewards/rejected": -28.847681045532227, + "step": 3040 + }, + { + "epoch": 9.481022463206816, + "grad_norm": 0.0018229244742542505, + "learning_rate": 0.00015908617126855466, + "logits/chosen": -0.7269546389579773, + "logits/rejected": 0.8374387621879578, + "logps/chosen": -418.9156799316406, + "logps/rejected": -485.36920166015625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.943127632141113, + "rewards/margins": 22.811771392822266, + "rewards/rejected": -28.754898071289062, + "step": 3060 + }, + { + "epoch": 9.5429899302866, + "grad_norm": 0.0002307717950316146, + "learning_rate": 0.00015857295407472046, + "logits/chosen": -0.7557907700538635, + "logits/rejected": 0.7291213870048523, + "logps/chosen": -418.8255920410156, + "logps/rejected": -495.04010009765625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.5549492835998535, + "rewards/margins": 23.22446632385254, + "rewards/rejected": -29.7794132232666, + "step": 3080 + }, + { + "epoch": 9.604957397366382, + "grad_norm": 5.642590986099094e-05, + "learning_rate": 0.00015805737757083681, + "logits/chosen": -0.7865381836891174, + "logits/rejected": 0.8217270970344543, + "logps/chosen": -432.85711669921875, + "logps/rejected": -503.07293701171875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.972267150878906, + "rewards/margins": 23.414052963256836, + "rewards/rejected": -30.386322021484375, + "step": 3100 + }, + { + "epoch": 9.666924864446166, + "grad_norm": 0.0031394653487950563, + "learning_rate": 0.00015753946252425013, + "logits/chosen": -0.8239234089851379, + "logits/rejected": 0.7479206323623657, + "logps/chosen": -423.5618591308594, + "logps/rejected": -487.6026306152344, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.732801914215088, + "rewards/margins": 22.03606605529785, + "rewards/rejected": -28.768869400024414, + "step": 3120 + }, + { + "epoch": 9.728892331525948, + "grad_norm": 0.0020447850693017244, + "learning_rate": 0.000157019229796503, + "logits/chosen": -0.757154107093811, + "logits/rejected": 0.6788080334663391, + "logps/chosen": -401.03680419921875, + "logps/rejected": -482.5953674316406, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.940583229064941, + "rewards/margins": 22.295238494873047, + "rewards/rejected": -28.235824584960938, + "step": 3140 + }, + { + "epoch": 9.790859798605732, + "grad_norm": 0.0010579255176708102, + "learning_rate": 0.0001564967003424938, + "logits/chosen": -0.7942818403244019, + "logits/rejected": 0.6221517324447632, + "logps/chosen": -429.353515625, + "logps/rejected": -519.2991943359375, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -7.464829444885254, + "rewards/margins": 22.819751739501953, + "rewards/rejected": -30.284582138061523, + "step": 3160 + }, + { + "epoch": 9.852827265685516, + "grad_norm": 0.0005074171931482852, + "learning_rate": 0.00015597189520963277, + "logits/chosen": -0.7147163152694702, + "logits/rejected": 0.7833064794540405, + "logps/chosen": -423.9457092285156, + "logps/rejected": -520.0857543945312, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.090755462646484, + "rewards/margins": 23.77802848815918, + "rewards/rejected": -30.868785858154297, + "step": 3180 + }, + { + "epoch": 9.914794732765298, + "grad_norm": 0.00032806736999191344, + "learning_rate": 0.00015544483553699408, + "logits/chosen": -0.7453028559684753, + "logits/rejected": 0.8783397674560547, + "logps/chosen": -429.57421875, + "logps/rejected": -521.6644897460938, + "loss": 0.0054, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.363633155822754, + "rewards/margins": 24.92776870727539, + "rewards/rejected": -31.291400909423828, + "step": 3200 + }, + { + "epoch": 9.976762199845082, + "grad_norm": 0.0016681504203006625, + "learning_rate": 0.00015491554255446462, + "logits/chosen": -0.800809383392334, + "logits/rejected": 0.8924380540847778, + "logps/chosen": -403.0744323730469, + "logps/rejected": -475.4017028808594, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.697165489196777, + "rewards/margins": 23.721771240234375, + "rewards/rejected": -29.418935775756836, + "step": 3220 + }, + { + "epoch": 10.038729666924864, + "grad_norm": 0.0008030760800465941, + "learning_rate": 0.0001543840375818884, + "logits/chosen": -0.844366729259491, + "logits/rejected": 0.8316100835800171, + "logps/chosen": -439.8392639160156, + "logps/rejected": -524.9827270507812, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.307751178741455, + "rewards/margins": 24.47549819946289, + "rewards/rejected": -30.783245086669922, + "step": 3240 + }, + { + "epoch": 10.100697134004648, + "grad_norm": 0.0015144218923524022, + "learning_rate": 0.0001538503420282083, + "logits/chosen": -0.7335025072097778, + "logits/rejected": 0.744965672492981, + "logps/chosen": -424.39984130859375, + "logps/rejected": -503.18902587890625, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.292525291442871, + "rewards/margins": 23.78156089782715, + "rewards/rejected": -30.074087142944336, + "step": 3260 + }, + { + "epoch": 10.162664601084431, + "grad_norm": 0.0008248073281720281, + "learning_rate": 0.00015331447739060338, + "logits/chosen": -0.7617536187171936, + "logits/rejected": 0.8052036166191101, + "logps/chosen": -410.5335388183594, + "logps/rejected": -484.446044921875, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.2051849365234375, + "rewards/margins": 23.270933151245117, + "rewards/rejected": -29.476116180419922, + "step": 3280 + }, + { + "epoch": 10.224632068164214, + "grad_norm": 0.0004835600557271391, + "learning_rate": 0.0001527764652536231, + "logits/chosen": -0.7629774212837219, + "logits/rejected": 0.8532567024230957, + "logps/chosen": -402.6639709472656, + "logps/rejected": -491.9307556152344, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.240920543670654, + "rewards/margins": 23.61534881591797, + "rewards/rejected": -29.85626792907715, + "step": 3300 + }, + { + "epoch": 10.286599535243997, + "grad_norm": 1.3681114978680853e-05, + "learning_rate": 0.0001522363272883179, + "logits/chosen": -0.6888297200202942, + "logits/rejected": 0.8447307348251343, + "logps/chosen": -411.98431396484375, + "logps/rejected": -501.7259216308594, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.599992275238037, + "rewards/margins": 23.839008331298828, + "rewards/rejected": -30.439001083374023, + "step": 3320 + }, + { + "epoch": 10.34856700232378, + "grad_norm": 0.0015049786306917667, + "learning_rate": 0.0001516940852513663, + "logits/chosen": -0.802474319934845, + "logits/rejected": 0.7312396168708801, + "logps/chosen": -408.78009033203125, + "logps/rejected": -501.54412841796875, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.309834957122803, + "rewards/margins": 23.346172332763672, + "rewards/rejected": -30.656009674072266, + "step": 3340 + }, + { + "epoch": 10.410534469403563, + "grad_norm": 0.0007437937310896814, + "learning_rate": 0.00015114976098419842, + "logits/chosen": -0.7263267636299133, + "logits/rejected": 0.7361682057380676, + "logps/chosen": -429.12945556640625, + "logps/rejected": -530.2430419921875, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.809745788574219, + "rewards/margins": 24.216045379638672, + "rewards/rejected": -31.02579116821289, + "step": 3360 + }, + { + "epoch": 10.472501936483347, + "grad_norm": 0.0008804806275293231, + "learning_rate": 0.00015060337641211637, + "logits/chosen": -0.7912311553955078, + "logits/rejected": 0.9160035848617554, + "logps/chosen": -410.7565002441406, + "logps/rejected": -474.1514587402344, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.874563694000244, + "rewards/margins": 22.452852249145508, + "rewards/rejected": -29.32741355895996, + "step": 3380 + }, + { + "epoch": 10.53446940356313, + "grad_norm": 0.0012436832766979933, + "learning_rate": 0.00015005495354341114, + "logits/chosen": -0.7138643264770508, + "logits/rejected": 0.7492295503616333, + "logps/chosen": -415.385009765625, + "logps/rejected": -515.1976928710938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.435647487640381, + "rewards/margins": 23.116397857666016, + "rewards/rejected": -29.552043914794922, + "step": 3400 + }, + { + "epoch": 10.596436870642913, + "grad_norm": 0.00021300691878423095, + "learning_rate": 0.00014950451446847578, + "logits/chosen": -0.807390034198761, + "logits/rejected": 0.7440930008888245, + "logps/chosen": -427.7305603027344, + "logps/rejected": -520.4072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.44257116317749, + "rewards/margins": 23.892803192138672, + "rewards/rejected": -30.335372924804688, + "step": 3420 + }, + { + "epoch": 10.658404337722695, + "grad_norm": 0.00298711028881371, + "learning_rate": 0.00014895208135891604, + "logits/chosen": -0.8018674850463867, + "logits/rejected": 0.7456072568893433, + "logps/chosen": -418.41131591796875, + "logps/rejected": -519.7051391601562, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.13026237487793, + "rewards/margins": 25.155405044555664, + "rewards/rejected": -30.285669326782227, + "step": 3440 + }, + { + "epoch": 10.720371804802479, + "grad_norm": 0.0018449191702529788, + "learning_rate": 0.000148397676466657, + "logits/chosen": -0.8092568516731262, + "logits/rejected": 0.8564573526382446, + "logps/chosen": -424.4937438964844, + "logps/rejected": -499.78472900390625, + "loss": 0.0043, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.369976043701172, + "rewards/margins": 23.924694061279297, + "rewards/rejected": -30.294673919677734, + "step": 3460 + }, + { + "epoch": 10.782339271882261, + "grad_norm": 0.0006981108454056084, + "learning_rate": 0.00014784132212304694, + "logits/chosen": -0.6652621030807495, + "logits/rejected": 0.7642368078231812, + "logps/chosen": -421.30816650390625, + "logps/rejected": -508.30010986328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.402596473693848, + "rewards/margins": 23.3673038482666, + "rewards/rejected": -29.7699031829834, + "step": 3480 + }, + { + "epoch": 10.844306738962045, + "grad_norm": 0.0020014916080981493, + "learning_rate": 0.00014728304073795764, + "logits/chosen": -0.7477067112922668, + "logits/rejected": 0.9086839556694031, + "logps/chosen": -410.92974853515625, + "logps/rejected": -485.93804931640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.754061698913574, + "rewards/margins": 23.10568618774414, + "rewards/rejected": -29.8597469329834, + "step": 3500 + }, + { + "epoch": 10.906274206041829, + "grad_norm": 0.0002572743396740407, + "learning_rate": 0.0001467228547988819, + "logits/chosen": -0.7481425404548645, + "logits/rejected": 0.9864739179611206, + "logps/chosen": -417.87158203125, + "logps/rejected": -484.71270751953125, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.400439262390137, + "rewards/margins": 23.81901741027832, + "rewards/rejected": -30.219451904296875, + "step": 3520 + }, + { + "epoch": 10.96824167312161, + "grad_norm": 1.1203023859707173e-05, + "learning_rate": 0.0001461607868700276, + "logits/chosen": -0.7098456621170044, + "logits/rejected": 0.8115944862365723, + "logps/chosen": -410.0858459472656, + "logps/rejected": -519.0262451171875, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -6.59292459487915, + "rewards/margins": 24.37098503112793, + "rewards/rejected": -30.963909149169922, + "step": 3540 + }, + { + "epoch": 11.030209140201395, + "grad_norm": 0.0009471502853557467, + "learning_rate": 0.00014559685959140907, + "logits/chosen": -0.7040443420410156, + "logits/rejected": 0.7584648132324219, + "logps/chosen": -409.934814453125, + "logps/rejected": -511.6397399902344, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.900947570800781, + "rewards/margins": 23.620128631591797, + "rewards/rejected": -30.521076202392578, + "step": 3560 + }, + { + "epoch": 11.092176607281177, + "grad_norm": 0.002169826766476035, + "learning_rate": 0.00014503109567793481, + "logits/chosen": -0.7528023719787598, + "logits/rejected": 0.9099094271659851, + "logps/chosen": -408.87664794921875, + "logps/rejected": -497.9203186035156, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.732300281524658, + "rewards/margins": 25.08970069885254, + "rewards/rejected": -30.821996688842773, + "step": 3580 + }, + { + "epoch": 11.15414407436096, + "grad_norm": 0.0010581511305645108, + "learning_rate": 0.00014446351791849276, + "logits/chosen": -0.6851012110710144, + "logits/rejected": 0.8104494214057922, + "logps/chosen": -419.51715087890625, + "logps/rejected": -524.4638671875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.121168613433838, + "rewards/margins": 24.22315788269043, + "rewards/rejected": -31.34432601928711, + "step": 3600 + }, + { + "epoch": 11.216111541440744, + "grad_norm": 0.0006640542997047305, + "learning_rate": 0.0001438941491750323, + "logits/chosen": -0.7948740124702454, + "logits/rejected": 0.8368036150932312, + "logps/chosen": -424.7229919433594, + "logps/rejected": -516.5067749023438, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.916244983673096, + "rewards/margins": 24.990079879760742, + "rewards/rejected": -30.906330108642578, + "step": 3620 + }, + { + "epoch": 11.278079008520526, + "grad_norm": 0.001158738974481821, + "learning_rate": 0.00014332301238164342, + "logits/chosen": -0.7854216694831848, + "logits/rejected": 0.7745558023452759, + "logps/chosen": -416.4905700683594, + "logps/rejected": -498.207763671875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.545945167541504, + "rewards/margins": 23.817821502685547, + "rewards/rejected": -30.36376953125, + "step": 3640 + }, + { + "epoch": 11.34004647560031, + "grad_norm": 0.0008971371571533382, + "learning_rate": 0.00014275013054363287, + "logits/chosen": -0.7516878247261047, + "logits/rejected": 0.8620051145553589, + "logps/chosen": -427.61993408203125, + "logps/rejected": -537.0158081054688, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.415410041809082, + "rewards/margins": 25.761362075805664, + "rewards/rejected": -32.17677307128906, + "step": 3660 + }, + { + "epoch": 11.402013942680092, + "grad_norm": 0.0016706970054656267, + "learning_rate": 0.00014217552673659754, + "logits/chosen": -0.7678354382514954, + "logits/rejected": 0.8684478998184204, + "logps/chosen": -422.351806640625, + "logps/rejected": -499.56201171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.864222526550293, + "rewards/margins": 23.08515167236328, + "rewards/rejected": -29.94937515258789, + "step": 3680 + }, + { + "epoch": 11.463981409759876, + "grad_norm": 0.0001611242478247732, + "learning_rate": 0.00014159922410549497, + "logits/chosen": -0.7930831909179688, + "logits/rejected": 0.7886659502983093, + "logps/chosen": -411.2481994628906, + "logps/rejected": -510.31097412109375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.10078763961792, + "rewards/margins": 24.464405059814453, + "rewards/rejected": -30.565189361572266, + "step": 3700 + }, + { + "epoch": 11.52594887683966, + "grad_norm": 0.0010968918213620782, + "learning_rate": 0.0001410212458637112, + "logits/chosen": -0.750701367855072, + "logits/rejected": 0.8467336893081665, + "logps/chosen": -411.9034729003906, + "logps/rejected": -495.7069396972656, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.123504638671875, + "rewards/margins": 24.326147079467773, + "rewards/rejected": -30.449655532836914, + "step": 3720 + }, + { + "epoch": 11.587916343919442, + "grad_norm": 0.0012211522553116083, + "learning_rate": 0.00014044161529212543, + "logits/chosen": -0.7011710405349731, + "logits/rejected": 0.7984441518783569, + "logps/chosen": -407.7188415527344, + "logps/rejected": -515.7482299804688, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.711037635803223, + "rewards/margins": 24.880985260009766, + "rewards/rejected": -30.59202003479004, + "step": 3740 + }, + { + "epoch": 11.649883810999226, + "grad_norm": 5.050484469393268e-05, + "learning_rate": 0.0001398603557381726, + "logits/chosen": -0.7948409914970398, + "logits/rejected": 0.8654049634933472, + "logps/chosen": -426.082763671875, + "logps/rejected": -484.33380126953125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.525465488433838, + "rewards/margins": 23.004573822021484, + "rewards/rejected": -29.530038833618164, + "step": 3760 + }, + { + "epoch": 11.711851278079008, + "grad_norm": 0.0003607009712141007, + "learning_rate": 0.0001392774906149028, + "logits/chosen": -0.8043268918991089, + "logits/rejected": 0.8279350996017456, + "logps/chosen": -409.0636291503906, + "logps/rejected": -511.4398498535156, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.8972344398498535, + "rewards/margins": 25.10378074645996, + "rewards/rejected": -31.00101661682129, + "step": 3780 + }, + { + "epoch": 11.773818745158792, + "grad_norm": 0.0004310712101869285, + "learning_rate": 0.0001386930434000382, + "logits/chosen": -0.6975358128547668, + "logits/rejected": 0.9005098342895508, + "logps/chosen": -414.76806640625, + "logps/rejected": -520.3231201171875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.5791015625, + "rewards/margins": 24.619598388671875, + "rewards/rejected": -31.198699951171875, + "step": 3800 + }, + { + "epoch": 11.835786212238574, + "grad_norm": 0.0026548670139163733, + "learning_rate": 0.00013810703763502744, + "logits/chosen": -0.7100318074226379, + "logits/rejected": 0.9277878999710083, + "logps/chosen": -429.67669677734375, + "logps/rejected": -511.8397521972656, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.772208213806152, + "rewards/margins": 23.749717712402344, + "rewards/rejected": -30.521926879882812, + "step": 3820 + }, + { + "epoch": 11.897753679318358, + "grad_norm": 0.0004118687065783888, + "learning_rate": 0.00013751949692409718, + "logits/chosen": -0.6234445571899414, + "logits/rejected": 0.8737300634384155, + "logps/chosen": -428.03271484375, + "logps/rejected": -531.5043334960938, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.512963771820068, + "rewards/margins": 24.97269058227539, + "rewards/rejected": -32.485652923583984, + "step": 3840 + }, + { + "epoch": 11.959721146398142, + "grad_norm": 0.0020478537771850824, + "learning_rate": 0.00013693044493330166, + "logits/chosen": -0.7277485132217407, + "logits/rejected": 0.9264734983444214, + "logps/chosen": -431.9356384277344, + "logps/rejected": -531.0416870117188, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.424185752868652, + "rewards/margins": 24.675241470336914, + "rewards/rejected": -32.09942626953125, + "step": 3860 + }, + { + "epoch": 12.021688613477924, + "grad_norm": 0.0008361217333003879, + "learning_rate": 0.0001363399053895692, + "logits/chosen": -0.782467246055603, + "logits/rejected": 0.8881294131278992, + "logps/chosen": -434.98773193359375, + "logps/rejected": -516.9628295898438, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.331326007843018, + "rewards/margins": 24.816944122314453, + "rewards/rejected": -31.148269653320312, + "step": 3880 + }, + { + "epoch": 12.083656080557708, + "grad_norm": 3.445847687544301e-05, + "learning_rate": 0.00013574790207974646, + "logits/chosen": -0.6909776329994202, + "logits/rejected": 0.9060857892036438, + "logps/chosen": -430.9913024902344, + "logps/rejected": -531.0490112304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.471495628356934, + "rewards/margins": 25.3931941986084, + "rewards/rejected": -31.86469078063965, + "step": 3900 + }, + { + "epoch": 12.14562354763749, + "grad_norm": 0.0013186397263780236, + "learning_rate": 0.00013515445884964045, + "logits/chosen": -0.7101693153381348, + "logits/rejected": 0.9304086565971375, + "logps/chosen": -414.54608154296875, + "logps/rejected": -513.8648071289062, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.092409610748291, + "rewards/margins": 24.61441421508789, + "rewards/rejected": -31.70682144165039, + "step": 3920 + }, + { + "epoch": 12.207591014717273, + "grad_norm": 0.0014901352114975452, + "learning_rate": 0.00013455959960305798, + "logits/chosen": -0.6674474477767944, + "logits/rejected": 0.8775323629379272, + "logps/chosen": -437.1656799316406, + "logps/rejected": -539.7037963867188, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.307480812072754, + "rewards/margins": 25.1062068939209, + "rewards/rejected": -32.41368865966797, + "step": 3940 + }, + { + "epoch": 12.269558481797057, + "grad_norm": 0.0019103622762486339, + "learning_rate": 0.0001339633483008427, + "logits/chosen": -0.7112741470336914, + "logits/rejected": 0.9096217155456543, + "logps/chosen": -417.7974548339844, + "logps/rejected": -520.1793212890625, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.133049011230469, + "rewards/margins": 24.84225845336914, + "rewards/rejected": -31.975311279296875, + "step": 3960 + }, + { + "epoch": 12.33152594887684, + "grad_norm": 1.1235245438001584e-05, + "learning_rate": 0.00013336572895991016, + "logits/chosen": -0.6781491637229919, + "logits/rejected": 0.8913220167160034, + "logps/chosen": -417.21246337890625, + "logps/rejected": -522.6915283203125, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.486870765686035, + "rewards/margins": 25.340744018554688, + "rewards/rejected": -31.827617645263672, + "step": 3980 + }, + { + "epoch": 12.393493415956623, + "grad_norm": 0.0010617575608193874, + "learning_rate": 0.00013276676565228027, + "logits/chosen": -0.7145902514457703, + "logits/rejected": 0.9445177316665649, + "logps/chosen": -420.17547607421875, + "logps/rejected": -519.7775268554688, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.825955867767334, + "rewards/margins": 25.554126739501953, + "rewards/rejected": -31.380081176757812, + "step": 4000 + }, + { + "epoch": 12.455460883036405, + "grad_norm": 1.6598849470028654e-05, + "learning_rate": 0.00013216648250410776, + "logits/chosen": -0.6829872131347656, + "logits/rejected": 0.8398195505142212, + "logps/chosen": -432.6788635253906, + "logps/rejected": -534.5023193359375, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.334891319274902, + "rewards/margins": 24.781658172607422, + "rewards/rejected": -31.116552352905273, + "step": 4020 + }, + { + "epoch": 12.51742835011619, + "grad_norm": 0.00028609836590476334, + "learning_rate": 0.00013156490369471027, + "logits/chosen": -0.7433018684387207, + "logits/rejected": 0.9416133761405945, + "logps/chosen": -409.298095703125, + "logps/rejected": -509.50128173828125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.734123229980469, + "rewards/margins": 25.44891357421875, + "rewards/rejected": -31.18303871154785, + "step": 4040 + }, + { + "epoch": 12.579395817195973, + "grad_norm": 0.0008309365948662162, + "learning_rate": 0.00013096205345559448, + "logits/chosen": -0.6434201002120972, + "logits/rejected": 1.0335710048675537, + "logps/chosen": -431.5645446777344, + "logps/rejected": -512.9036254882812, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.22793436050415, + "rewards/margins": 24.197513580322266, + "rewards/rejected": -31.42544937133789, + "step": 4060 + }, + { + "epoch": 12.641363284275755, + "grad_norm": 0.0005798207130283117, + "learning_rate": 0.00013035795606948023, + "logits/chosen": -0.6275348663330078, + "logits/rejected": 0.9419866800308228, + "logps/chosen": -420.60870361328125, + "logps/rejected": -529.2860717773438, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.123082160949707, + "rewards/margins": 25.372949600219727, + "rewards/rejected": -32.49603271484375, + "step": 4080 + }, + { + "epoch": 12.703330751355539, + "grad_norm": 0.0009076519636437297, + "learning_rate": 0.00012975263586932208, + "logits/chosen": -0.7268190979957581, + "logits/rejected": 0.8527728915214539, + "logps/chosen": -420.3800354003906, + "logps/rejected": -530.3096313476562, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.111058712005615, + "rewards/margins": 24.54432487487793, + "rewards/rejected": -31.6553897857666, + "step": 4100 + }, + { + "epoch": 12.765298218435321, + "grad_norm": 0.0008658567676320672, + "learning_rate": 0.00012914611723732942, + "logits/chosen": -0.7163742780685425, + "logits/rejected": 0.9323918223381042, + "logps/chosen": -420.14227294921875, + "logps/rejected": -508.64093017578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.57541036605835, + "rewards/margins": 24.635540008544922, + "rewards/rejected": -31.210952758789062, + "step": 4120 + }, + { + "epoch": 12.827265685515105, + "grad_norm": 0.0006353395874612033, + "learning_rate": 0.00012853842460398428, + "logits/chosen": -0.6529034972190857, + "logits/rejected": 0.9955090284347534, + "logps/chosen": -445.478271484375, + "logps/rejected": -555.215576171875, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.958199977874756, + "rewards/margins": 25.986324310302734, + "rewards/rejected": -33.94452667236328, + "step": 4140 + }, + { + "epoch": 12.889233152594887, + "grad_norm": 0.0007925338577479124, + "learning_rate": 0.00012792958244705745, + "logits/chosen": -0.7437697649002075, + "logits/rejected": 0.945330023765564, + "logps/chosen": -436.5621032714844, + "logps/rejected": -523.8942260742188, + "loss": 0.0033, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -6.880484580993652, + "rewards/margins": 24.760828018188477, + "rewards/rejected": -31.641315460205078, + "step": 4160 + }, + { + "epoch": 12.95120061967467, + "grad_norm": 0.0009758576052263379, + "learning_rate": 0.00012731961529062211, + "logits/chosen": -0.707342267036438, + "logits/rejected": 1.0127270221710205, + "logps/chosen": -429.137451171875, + "logps/rejected": -498.2242736816406, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.349567413330078, + "rewards/margins": 23.53993797302246, + "rewards/rejected": -29.889501571655273, + "step": 4180 + }, + { + "epoch": 13.013168086754455, + "grad_norm": 0.001348630990833044, + "learning_rate": 0.0001267085477040664, + "logits/chosen": -0.710185170173645, + "logits/rejected": 0.9165050387382507, + "logps/chosen": -426.640869140625, + "logps/rejected": -530.394287109375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.763472557067871, + "rewards/margins": 25.113622665405273, + "rewards/rejected": -31.877094268798828, + "step": 4200 + }, + { + "epoch": 13.075135553834237, + "grad_norm": 0.0003159803745802492, + "learning_rate": 0.0001260964043011036, + "logits/chosen": -0.7468986511230469, + "logits/rejected": 0.946589469909668, + "logps/chosen": -449.58062744140625, + "logps/rejected": -552.7725219726562, + "loss": 0.0043, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.443936824798584, + "rewards/margins": 26.3941707611084, + "rewards/rejected": -32.83810806274414, + "step": 4220 + }, + { + "epoch": 13.13710302091402, + "grad_norm": 0.0002802180533763021, + "learning_rate": 0.0001254832097387808, + "logits/chosen": -0.7415999174118042, + "logits/rejected": 0.8606869578361511, + "logps/chosen": -412.0994567871094, + "logps/rejected": -522.6536254882812, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.73651123046875, + "rewards/margins": 25.14139175415039, + "rewards/rejected": -31.877904891967773, + "step": 4240 + }, + { + "epoch": 13.199070487993803, + "grad_norm": 0.0003081171598751098, + "learning_rate": 0.0001248689887164855, + "logits/chosen": -0.6822153329849243, + "logits/rejected": 0.9707147479057312, + "logps/chosen": -426.1729431152344, + "logps/rejected": -523.9768676757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.399308681488037, + "rewards/margins": 25.116762161254883, + "rewards/rejected": -31.516071319580078, + "step": 4260 + }, + { + "epoch": 13.261037955073586, + "grad_norm": 0.0007231299532577395, + "learning_rate": 0.0001242537659749509, + "logits/chosen": -0.6475167274475098, + "logits/rejected": 0.9226775169372559, + "logps/chosen": -428.57855224609375, + "logps/rejected": -549.84228515625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.925917148590088, + "rewards/margins": 25.896963119506836, + "rewards/rejected": -32.82288360595703, + "step": 4280 + }, + { + "epoch": 13.32300542215337, + "grad_norm": 0.0007026797975413501, + "learning_rate": 0.00012363756629525937, + "logits/chosen": -0.6976606845855713, + "logits/rejected": 0.9327031970024109, + "logps/chosen": -425.42449951171875, + "logps/rejected": -524.47509765625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.578268527984619, + "rewards/margins": 25.798770904541016, + "rewards/rejected": -32.377037048339844, + "step": 4300 + }, + { + "epoch": 13.384972889233152, + "grad_norm": 5.637519643642008e-05, + "learning_rate": 0.00012302041449784409, + "logits/chosen": -0.6870549917221069, + "logits/rejected": 0.9386127591133118, + "logps/chosen": -427.32501220703125, + "logps/rejected": -519.8970336914062, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.1073102951049805, + "rewards/margins": 24.731271743774414, + "rewards/rejected": -31.83858299255371, + "step": 4320 + }, + { + "epoch": 13.446940356312936, + "grad_norm": 0.0001799424208002165, + "learning_rate": 0.00012240233544148955, + "logits/chosen": -0.6837178468704224, + "logits/rejected": 0.9757431149482727, + "logps/chosen": -432.7763671875, + "logps/rejected": -525.1026611328125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.646360874176025, + "rewards/margins": 25.124706268310547, + "rewards/rejected": -31.771068572998047, + "step": 4340 + }, + { + "epoch": 13.508907823392718, + "grad_norm": 0.0015866424655541778, + "learning_rate": 0.00012178335402232996, + "logits/chosen": -0.6187258958816528, + "logits/rejected": 0.9561142921447754, + "logps/chosen": -419.75164794921875, + "logps/rejected": -530.4887084960938, + "loss": 0.0043, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.826728820800781, + "rewards/margins": 25.215198516845703, + "rewards/rejected": -32.041927337646484, + "step": 4360 + }, + { + "epoch": 13.570875290472502, + "grad_norm": 0.00038470287108793855, + "learning_rate": 0.00012116349517284665, + "logits/chosen": -0.7070793509483337, + "logits/rejected": 0.9476947784423828, + "logps/chosen": -416.0244140625, + "logps/rejected": -527.4065551757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.205079555511475, + "rewards/margins": 24.985538482666016, + "rewards/rejected": -32.19062042236328, + "step": 4380 + }, + { + "epoch": 13.632842757552286, + "grad_norm": 0.0010880132904276252, + "learning_rate": 0.00012054278386086368, + "logits/chosen": -0.6799092292785645, + "logits/rejected": 0.9883368611335754, + "logps/chosen": -415.4871520996094, + "logps/rejected": -502.75994873046875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.845544338226318, + "rewards/margins": 24.369251251220703, + "rewards/rejected": -31.214794158935547, + "step": 4400 + }, + { + "epoch": 13.694810224632068, + "grad_norm": 0.0005410652374848723, + "learning_rate": 0.0001199523412929886, + "logits/chosen": -0.6802583336830139, + "logits/rejected": 1.0727746486663818, + "logps/chosen": -414.32733154296875, + "logps/rejected": -502.4496154785156, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.712512016296387, + "rewards/margins": 24.851696014404297, + "rewards/rejected": -31.564212799072266, + "step": 4420 + }, + { + "epoch": 13.756777691711852, + "grad_norm": 0.0004551692109089345, + "learning_rate": 0.00011933003962196613, + "logits/chosen": -0.6473695039749146, + "logits/rejected": 0.9640854597091675, + "logps/chosen": -416.96881103515625, + "logps/rejected": -520.4708251953125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.991703987121582, + "rewards/margins": 24.429357528686523, + "rewards/rejected": -31.42106056213379, + "step": 4440 + }, + { + "epoch": 13.818745158791634, + "grad_norm": 1.8331444152863696e-05, + "learning_rate": 0.00011870695933976628, + "logits/chosen": -0.7241955995559692, + "logits/rejected": 0.8486202359199524, + "logps/chosen": -418.4485778808594, + "logps/rejected": -525.0150756835938, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.720297813415527, + "rewards/margins": 24.988901138305664, + "rewards/rejected": -31.709197998046875, + "step": 4460 + }, + { + "epoch": 13.880712625871418, + "grad_norm": 4.146054925513454e-05, + "learning_rate": 0.00011808312554397192, + "logits/chosen": -0.5974981188774109, + "logits/rejected": 0.948889434337616, + "logps/chosen": -439.1353454589844, + "logps/rejected": -556.8853759765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.1817474365234375, + "rewards/margins": 25.7791805267334, + "rewards/rejected": -32.96092987060547, + "step": 4480 + }, + { + "epoch": 13.9426800929512, + "grad_norm": 4.285129398340359e-05, + "learning_rate": 0.00011745856336251742, + "logits/chosen": -0.7139695286750793, + "logits/rejected": 0.9608259201049805, + "logps/chosen": -419.91900634765625, + "logps/rejected": -496.1962890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.308480739593506, + "rewards/margins": 23.839679718017578, + "rewards/rejected": -31.148162841796875, + "step": 4500 + }, + { + "epoch": 14.004647560030984, + "grad_norm": 0.0007021346245892346, + "learning_rate": 0.00011683329795267636, + "logits/chosen": -0.7038004994392395, + "logits/rejected": 1.0250012874603271, + "logps/chosen": -437.5606384277344, + "logps/rejected": -532.3974609375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.6007537841796875, + "rewards/margins": 26.052099227905273, + "rewards/rejected": -32.652854919433594, + "step": 4520 + }, + { + "epoch": 14.066615027110767, + "grad_norm": 0.0006834513042122126, + "learning_rate": 0.00011620735450004829, + "logits/chosen": -0.6422589421272278, + "logits/rejected": 0.9730979800224304, + "logps/chosen": -436.33941650390625, + "logps/rejected": -526.1567993164062, + "loss": 0.0065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -7.394343376159668, + "rewards/margins": 24.573226928710938, + "rewards/rejected": -31.967571258544922, + "step": 4540 + }, + { + "epoch": 14.12858249419055, + "grad_norm": 0.0012112940894439816, + "learning_rate": 0.00011558075821754417, + "logits/chosen": -0.5923640131950378, + "logits/rejected": 1.0607713460922241, + "logps/chosen": -416.2725524902344, + "logps/rejected": -506.9697265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.253162384033203, + "rewards/margins": 24.898513793945312, + "rewards/rejected": -31.15167808532715, + "step": 4560 + }, + { + "epoch": 14.190549961270333, + "grad_norm": 0.00040532436105422676, + "learning_rate": 0.00011495353434437098, + "logits/chosen": -0.5979864001274109, + "logits/rejected": 1.0398799180984497, + "logps/chosen": -425.46356201171875, + "logps/rejected": -538.7312622070312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.892348289489746, + "rewards/margins": 26.227279663085938, + "rewards/rejected": -33.119632720947266, + "step": 4580 + }, + { + "epoch": 14.252517428350115, + "grad_norm": 8.036774670472369e-05, + "learning_rate": 0.00011432570814501478, + "logits/chosen": -0.6628149151802063, + "logits/rejected": 1.0684958696365356, + "logps/chosen": -420.6571350097656, + "logps/rejected": -505.7470703125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.971704959869385, + "rewards/margins": 24.749691009521484, + "rewards/rejected": -31.721399307250977, + "step": 4600 + }, + { + "epoch": 14.3144848954299, + "grad_norm": 2.7621756089502014e-05, + "learning_rate": 0.00011369730490822336, + "logits/chosen": -0.6400030255317688, + "logits/rejected": 1.14774489402771, + "logps/chosen": -435.0557556152344, + "logps/rejected": -527.8553466796875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.974684715270996, + "rewards/margins": 24.852916717529297, + "rewards/rejected": -32.827606201171875, + "step": 4620 + }, + { + "epoch": 14.376452362509683, + "grad_norm": 0.0011475204955786467, + "learning_rate": 0.0001130683499459875, + "logits/chosen": -0.6620621681213379, + "logits/rejected": 1.0378706455230713, + "logps/chosen": -432.5133361816406, + "logps/rejected": -552.24072265625, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.94607400894165, + "rewards/margins": 27.082103729248047, + "rewards/rejected": -34.02817916870117, + "step": 4640 + }, + { + "epoch": 14.438419829589465, + "grad_norm": 0.001489471411332488, + "learning_rate": 0.00011243886859252135, + "logits/chosen": -0.7889328002929688, + "logits/rejected": 1.0310903787612915, + "logps/chosen": -425.72125244140625, + "logps/rejected": -507.666259765625, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.0838117599487305, + "rewards/margins": 24.99704933166504, + "rewards/rejected": -32.08086395263672, + "step": 4660 + }, + { + "epoch": 14.500387296669249, + "grad_norm": 0.00015960348537191749, + "learning_rate": 0.00011180888620324205, + "logits/chosen": -0.6674115061759949, + "logits/rejected": 1.0456359386444092, + "logps/chosen": -418.9027404785156, + "logps/rejected": -499.97003173828125, + "loss": 0.0043, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -6.929097652435303, + "rewards/margins": 24.343183517456055, + "rewards/rejected": -31.272281646728516, + "step": 4680 + }, + { + "epoch": 14.562354763749031, + "grad_norm": 0.0009649236453697085, + "learning_rate": 0.00011117842815374835, + "logits/chosen": -0.6228715181350708, + "logits/rejected": 0.9750884175300598, + "logps/chosen": -421.4877014160156, + "logps/rejected": -532.0631103515625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.449153900146484, + "rewards/margins": 25.748123168945312, + "rewards/rejected": -32.19727325439453, + "step": 4700 + }, + { + "epoch": 14.624322230828815, + "grad_norm": 0.0010970581788569689, + "learning_rate": 0.00011054751983879859, + "logits/chosen": -0.6098747849464417, + "logits/rejected": 1.016177773475647, + "logps/chosen": -426.1904296875, + "logps/rejected": -536.5009765625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.70578670501709, + "rewards/margins": 25.852767944335938, + "rewards/rejected": -32.558555603027344, + "step": 4720 + }, + { + "epoch": 14.686289697908599, + "grad_norm": 0.000131874781800434, + "learning_rate": 0.00010991618667128769, + "logits/chosen": -0.609255850315094, + "logits/rejected": 1.0346933603286743, + "logps/chosen": -423.99566650390625, + "logps/rejected": -517.832275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.696162223815918, + "rewards/margins": 24.211074829101562, + "rewards/rejected": -31.907236099243164, + "step": 4740 + }, + { + "epoch": 14.748257164988381, + "grad_norm": 0.0003271376190241426, + "learning_rate": 0.00010928445408122361, + "logits/chosen": -0.5123564004898071, + "logits/rejected": 0.9423815011978149, + "logps/chosen": -412.5267639160156, + "logps/rejected": -537.0595703125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.50037145614624, + "rewards/margins": 25.577926635742188, + "rewards/rejected": -33.07830047607422, + "step": 4760 + }, + { + "epoch": 14.810224632068165, + "grad_norm": 0.0003270464367233217, + "learning_rate": 0.00010865234751470288, + "logits/chosen": -0.603384792804718, + "logits/rejected": 1.1043418645858765, + "logps/chosen": -415.77313232421875, + "logps/rejected": -513.60986328125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.077917575836182, + "rewards/margins": 25.412960052490234, + "rewards/rejected": -32.49087905883789, + "step": 4780 + }, + { + "epoch": 14.872192099147947, + "grad_norm": 0.0009512171382084489, + "learning_rate": 0.00010801989243288589, + "logits/chosen": -0.7142239809036255, + "logits/rejected": 1.0645368099212646, + "logps/chosen": -416.17230224609375, + "logps/rejected": -509.03082275390625, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.745335578918457, + "rewards/margins": 24.773212432861328, + "rewards/rejected": -31.5185489654541, + "step": 4800 + }, + { + "epoch": 14.93415956622773, + "grad_norm": 0.0008968279580585659, + "learning_rate": 0.00010738711431097112, + "logits/chosen": -0.7670684456825256, + "logits/rejected": 1.1087074279785156, + "logps/chosen": -419.56573486328125, + "logps/rejected": -510.56365966796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.202185153961182, + "rewards/margins": 25.509662628173828, + "rewards/rejected": -31.711849212646484, + "step": 4820 + }, + { + "epoch": 14.996127033307513, + "grad_norm": 0.00017500368994660676, + "learning_rate": 0.00010675403863716907, + "logits/chosen": -0.7169899344444275, + "logits/rejected": 0.8955768346786499, + "logps/chosen": -428.79864501953125, + "logps/rejected": -524.6358642578125, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.424875736236572, + "rewards/margins": 24.901775360107422, + "rewards/rejected": -32.3266487121582, + "step": 4840 + }, + { + "epoch": 15.058094500387297, + "grad_norm": 0.00016677333042025566, + "learning_rate": 0.00010612069091167551, + "logits/chosen": -0.641793966293335, + "logits/rejected": 1.0156922340393066, + "logps/chosen": -432.36248779296875, + "logps/rejected": -538.47998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.848368167877197, + "rewards/margins": 25.29831314086914, + "rewards/rejected": -32.14667892456055, + "step": 4860 + }, + { + "epoch": 15.12006196746708, + "grad_norm": 0.0034633041359484196, + "learning_rate": 0.00010548709664564449, + "logits/chosen": -0.6778856515884399, + "logits/rejected": 0.9707983732223511, + "logps/chosen": -426.789794921875, + "logps/rejected": -546.7899169921875, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.392895698547363, + "rewards/margins": 26.034061431884766, + "rewards/rejected": -33.42695999145508, + "step": 4880 + }, + { + "epoch": 15.182029434546862, + "grad_norm": 0.0001036279572872445, + "learning_rate": 0.00010485328136016071, + "logits/chosen": -0.6301292181015015, + "logits/rejected": 0.9921154975891113, + "logps/chosen": -420.39520263671875, + "logps/rejected": -544.0909423828125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.199660778045654, + "rewards/margins": 26.49783706665039, + "rewards/rejected": -32.69750213623047, + "step": 4900 + }, + { + "epoch": 15.243996901626646, + "grad_norm": 0.0009169202530756593, + "learning_rate": 0.00010421927058521137, + "logits/chosen": -0.6611192226409912, + "logits/rejected": 1.050333023071289, + "logps/chosen": -428.5755920410156, + "logps/rejected": -530.2979125976562, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.6709885597229, + "rewards/margins": 25.375732421875, + "rewards/rejected": -33.04671859741211, + "step": 4920 + }, + { + "epoch": 15.305964368706428, + "grad_norm": 0.0017765266820788383, + "learning_rate": 0.00010358508985865813, + "logits/chosen": -0.6134442687034607, + "logits/rejected": 1.212090253829956, + "logps/chosen": -415.17413330078125, + "logps/rejected": -492.6800231933594, + "loss": 0.0043, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -7.1799468994140625, + "rewards/margins": 24.78128433227539, + "rewards/rejected": -31.961233139038086, + "step": 4940 + }, + { + "epoch": 15.367931835786212, + "grad_norm": 0.000361295067705214, + "learning_rate": 0.00010295076472520812, + "logits/chosen": -0.6132256984710693, + "logits/rejected": 1.1052106618881226, + "logps/chosen": -408.4927673339844, + "logps/rejected": -502.8935546875, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.815627098083496, + "rewards/margins": 24.847381591796875, + "rewards/rejected": -31.663009643554688, + "step": 4960 + }, + { + "epoch": 15.429899302865996, + "grad_norm": 2.1386760636232793e-05, + "learning_rate": 0.00010231632073538522, + "logits/chosen": -0.6892057061195374, + "logits/rejected": 1.0478392839431763, + "logps/chosen": -440.66754150390625, + "logps/rejected": -533.3825073242188, + "loss": 0.0043, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -6.8489556312561035, + "rewards/margins": 25.716054916381836, + "rewards/rejected": -32.56501388549805, + "step": 4980 + }, + { + "epoch": 15.491866769945778, + "grad_norm": 0.0018733438337221742, + "learning_rate": 0.00010168178344450086, + "logits/chosen": -0.5057135820388794, + "logits/rejected": 0.988396167755127, + "logps/chosen": -407.16107177734375, + "logps/rejected": -538.1986083984375, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.880420684814453, + "rewards/margins": 25.703664779663086, + "rewards/rejected": -32.584083557128906, + "step": 5000 + }, + { + "epoch": 15.553834237025562, + "grad_norm": 0.001603521523065865, + "learning_rate": 0.00010104717841162458, + "logits/chosen": -0.679779052734375, + "logits/rejected": 0.9824868440628052, + "logps/chosen": -441.1270446777344, + "logps/rejected": -546.2282104492188, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.309430122375488, + "rewards/margins": 26.078725814819336, + "rewards/rejected": -32.38815689086914, + "step": 5020 + }, + { + "epoch": 15.615801704105344, + "grad_norm": 9.475573460804299e-05, + "learning_rate": 0.0001004125311985546, + "logits/chosen": -0.6173331141471863, + "logits/rejected": 1.0997240543365479, + "logps/chosen": -423.6936950683594, + "logps/rejected": -530.4503173828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.953721046447754, + "rewards/margins": 26.448678970336914, + "rewards/rejected": -33.40239715576172, + "step": 5040 + }, + { + "epoch": 15.677769171185128, + "grad_norm": 0.00013075934839434922, + "learning_rate": 9.977786736878808e-05, + "logits/chosen": -0.6333028674125671, + "logits/rejected": 0.9802857637405396, + "logps/chosen": -437.1517639160156, + "logps/rejected": -530.119384765625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.329917907714844, + "rewards/margins": 24.84552001953125, + "rewards/rejected": -32.17544174194336, + "step": 5060 + }, + { + "epoch": 15.739736638264912, + "grad_norm": 9.669300925452262e-05, + "learning_rate": 9.914321248649153e-05, + "logits/chosen": -0.6871415376663208, + "logits/rejected": 1.0859472751617432, + "logps/chosen": -440.9861755371094, + "logps/rejected": -540.0758056640625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.136324882507324, + "rewards/margins": 26.24369239807129, + "rewards/rejected": -33.3800163269043, + "step": 5080 + }, + { + "epoch": 15.801704105344694, + "grad_norm": 0.0001261440193047747, + "learning_rate": 9.85085921154711e-05, + "logits/chosen": -0.5889202356338501, + "logits/rejected": 1.0418097972869873, + "logps/chosen": -428.19598388671875, + "logps/rejected": -546.845947265625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.5799078941345215, + "rewards/margins": 26.233631134033203, + "rewards/rejected": -33.81353759765625, + "step": 5100 + }, + { + "epoch": 15.863671572424478, + "grad_norm": 0.00021416415984276682, + "learning_rate": 9.787403181814281e-05, + "logits/chosen": -0.6271970868110657, + "logits/rejected": 0.9168373346328735, + "logps/chosen": -425.2923889160156, + "logps/rejected": -527.7469482421875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.666965484619141, + "rewards/margins": 24.424535751342773, + "rewards/rejected": -32.09149932861328, + "step": 5120 + }, + { + "epoch": 15.92563903950426, + "grad_norm": 0.00090818852186203, + "learning_rate": 9.723955715450287e-05, + "logits/chosen": -0.6783554553985596, + "logits/rejected": 0.9546459913253784, + "logps/chosen": -422.9801330566406, + "logps/rejected": -557.9986572265625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.099810600280762, + "rewards/margins": 27.2694091796875, + "rewards/rejected": -34.36921691894531, + "step": 5140 + }, + { + "epoch": 15.987606506584044, + "grad_norm": 0.00033679328043945134, + "learning_rate": 9.660519368109823e-05, + "logits/chosen": -0.6659687757492065, + "logits/rejected": 1.0432156324386597, + "logps/chosen": -430.2832946777344, + "logps/rejected": -544.5806884765625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.264108180999756, + "rewards/margins": 26.98138427734375, + "rewards/rejected": -34.24549102783203, + "step": 5160 + }, + { + "epoch": 16.049573973663826, + "grad_norm": 0.00020042255346197635, + "learning_rate": 9.597096694999715e-05, + "logits/chosen": -0.5894029140472412, + "logits/rejected": 1.1150057315826416, + "logps/chosen": -451.55133056640625, + "logps/rejected": -567.7293701171875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.232612609863281, + "rewards/margins": 26.292133331298828, + "rewards/rejected": -34.524742126464844, + "step": 5180 + }, + { + "epoch": 16.11154144074361, + "grad_norm": 0.00011073077621404082, + "learning_rate": 9.53369025077598e-05, + "logits/chosen": -0.5697144269943237, + "logits/rejected": 1.127539038658142, + "logps/chosen": -430.92901611328125, + "logps/rejected": -548.5370483398438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.460732460021973, + "rewards/margins": 26.866458892822266, + "rewards/rejected": -34.32719039916992, + "step": 5200 + }, + { + "epoch": 16.173508907823393, + "grad_norm": 0.00015952142712194473, + "learning_rate": 9.470302589440952e-05, + "logits/chosen": -0.6166108250617981, + "logits/rejected": 1.1184258460998535, + "logps/chosen": -431.86761474609375, + "logps/rejected": -538.8045043945312, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.1826887130737305, + "rewards/margins": 26.133398056030273, + "rewards/rejected": -33.31608581542969, + "step": 5220 + }, + { + "epoch": 16.235476374903175, + "grad_norm": 0.0005165811162441969, + "learning_rate": 9.406936264240386e-05, + "logits/chosen": -0.5293561816215515, + "logits/rejected": 1.0149633884429932, + "logps/chosen": -411.24774169921875, + "logps/rejected": -552.6751708984375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.941656589508057, + "rewards/margins": 26.677719116210938, + "rewards/rejected": -33.61937713623047, + "step": 5240 + }, + { + "epoch": 16.297443841982957, + "grad_norm": 0.00018194419681094587, + "learning_rate": 9.343593827560617e-05, + "logits/chosen": -0.5177640318870544, + "logits/rejected": 1.0884605646133423, + "logps/chosen": -436.27557373046875, + "logps/rejected": -555.76611328125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.657900333404541, + "rewards/margins": 26.019603729248047, + "rewards/rejected": -33.67750549316406, + "step": 5260 + }, + { + "epoch": 16.359411309062743, + "grad_norm": 1.4249508240027353e-05, + "learning_rate": 9.280277830825763e-05, + "logits/chosen": -0.566566526889801, + "logits/rejected": 1.115505576133728, + "logps/chosen": -420.29864501953125, + "logps/rejected": -530.7322387695312, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.4629082679748535, + "rewards/margins": 26.702518463134766, + "rewards/rejected": -33.165428161621094, + "step": 5280 + }, + { + "epoch": 16.421378776142525, + "grad_norm": 0.00037811213405802846, + "learning_rate": 9.216990824394937e-05, + "logits/chosen": -0.6209192872047424, + "logits/rejected": 1.0690656900405884, + "logps/chosen": -428.49700927734375, + "logps/rejected": -556.0371704101562, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.307267189025879, + "rewards/margins": 28.177288055419922, + "rewards/rejected": -34.484554290771484, + "step": 5300 + }, + { + "epoch": 16.483346243222307, + "grad_norm": 0.0006744434358552098, + "learning_rate": 9.15373535745953e-05, + "logits/chosen": -0.6343203186988831, + "logits/rejected": 1.017704725265503, + "logps/chosen": -426.8873596191406, + "logps/rejected": -555.207275390625, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.941340446472168, + "rewards/margins": 26.214609146118164, + "rewards/rejected": -33.15595245361328, + "step": 5320 + }, + { + "epoch": 16.545313710302093, + "grad_norm": 0.00023284759663511068, + "learning_rate": 9.090513977940532e-05, + "logits/chosen": -0.5838009119033813, + "logits/rejected": 1.173878788948059, + "logps/chosen": -431.891845703125, + "logps/rejected": -531.1908569335938, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.410466194152832, + "rewards/margins": 25.734268188476562, + "rewards/rejected": -33.14473342895508, + "step": 5340 + }, + { + "epoch": 16.607281177381875, + "grad_norm": 0.0001960826339200139, + "learning_rate": 9.027329232385887e-05, + "logits/chosen": -0.7288259267807007, + "logits/rejected": 1.11318039894104, + "logps/chosen": -422.9234313964844, + "logps/rejected": -513.193115234375, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.876204490661621, + "rewards/margins": 25.526058197021484, + "rewards/rejected": -32.40226364135742, + "step": 5360 + }, + { + "epoch": 16.669248644461657, + "grad_norm": 0.00011543634900590405, + "learning_rate": 8.96418366586793e-05, + "logits/chosen": -0.6046707034111023, + "logits/rejected": 1.0723017454147339, + "logps/chosen": -433.06646728515625, + "logps/rejected": -532.0721435546875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.24627685546875, + "rewards/margins": 25.937397003173828, + "rewards/rejected": -33.18367004394531, + "step": 5380 + }, + { + "epoch": 16.73121611154144, + "grad_norm": 0.00011998928675893694, + "learning_rate": 8.901079821880882e-05, + "logits/chosen": -0.5506830811500549, + "logits/rejected": 1.1579818725585938, + "logps/chosen": -437.7366638183594, + "logps/rejected": -554.2669067382812, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.844750881195068, + "rewards/margins": 26.484683990478516, + "rewards/rejected": -34.329429626464844, + "step": 5400 + }, + { + "epoch": 16.793183578621225, + "grad_norm": 0.00036458164686337113, + "learning_rate": 8.838020242238367e-05, + "logits/chosen": -0.5875279903411865, + "logits/rejected": 1.0199635028839111, + "logps/chosen": -430.7936096191406, + "logps/rejected": -550.0620727539062, + "loss": 0.0043, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.758522033691406, + "rewards/margins": 25.7502498626709, + "rewards/rejected": -33.50877380371094, + "step": 5420 + }, + { + "epoch": 16.855151045701007, + "grad_norm": 4.471308056963608e-05, + "learning_rate": 8.775007466971067e-05, + "logits/chosen": -0.5515426397323608, + "logits/rejected": 1.3305522203445435, + "logps/chosen": -421.59027099609375, + "logps/rejected": -505.4217224121094, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.508360385894775, + "rewards/margins": 25.7374210357666, + "rewards/rejected": -32.24578094482422, + "step": 5440 + }, + { + "epoch": 16.91711851278079, + "grad_norm": 0.0005242056213319302, + "learning_rate": 8.712044034224374e-05, + "logits/chosen": -0.5384324789047241, + "logits/rejected": 1.0426101684570312, + "logps/chosen": -419.22314453125, + "logps/rejected": -534.5950317382812, + "loss": 0.0043, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.432672500610352, + "rewards/margins": 25.05026626586914, + "rewards/rejected": -33.48293685913086, + "step": 5460 + }, + { + "epoch": 16.979085979860574, + "grad_norm": 0.00010303401359124109, + "learning_rate": 8.649132480156181e-05, + "logits/chosen": -0.5100408792495728, + "logits/rejected": 1.2015564441680908, + "logps/chosen": -420.74835205078125, + "logps/rejected": -532.5908203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.649355411529541, + "rewards/margins": 26.004283905029297, + "rewards/rejected": -33.65364456176758, + "step": 5480 + }, + { + "epoch": 17.041053446940357, + "grad_norm": 0.00029292888939380646, + "learning_rate": 8.586275338834718e-05, + "logits/chosen": -0.6359528303146362, + "logits/rejected": 1.1911952495574951, + "logps/chosen": -427.14605712890625, + "logps/rejected": -508.2286071777344, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.083102226257324, + "rewards/margins": 24.796707153320312, + "rewards/rejected": -31.879810333251953, + "step": 5500 + }, + { + "epoch": 17.10302091402014, + "grad_norm": 0.00010459234908921644, + "learning_rate": 8.523475142136463e-05, + "logits/chosen": -0.7133889198303223, + "logits/rejected": 1.1848968267440796, + "logps/chosen": -435.04644775390625, + "logps/rejected": -545.0972900390625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.787796974182129, + "rewards/margins": 27.31494140625, + "rewards/rejected": -34.10273742675781, + "step": 5520 + }, + { + "epoch": 17.164988381099924, + "grad_norm": 0.00017392283189110458, + "learning_rate": 8.460734419644185e-05, + "logits/chosen": -0.5804117918014526, + "logits/rejected": 1.1580560207366943, + "logps/chosen": -441.470703125, + "logps/rejected": -547.8115234375, + "loss": 0.0043, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -6.628912448883057, + "rewards/margins": 26.92593765258789, + "rewards/rejected": -33.554847717285156, + "step": 5540 + }, + { + "epoch": 17.226955848179706, + "grad_norm": 0.00011059839744120836, + "learning_rate": 8.398055698545043e-05, + "logits/chosen": -0.6109145879745483, + "logits/rejected": 1.0190441608428955, + "logps/chosen": -426.3067321777344, + "logps/rejected": -557.4554443359375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.038537502288818, + "rewards/margins": 27.249805450439453, + "rewards/rejected": -34.2883415222168, + "step": 5560 + }, + { + "epoch": 17.28892331525949, + "grad_norm": 5.200642044655979e-05, + "learning_rate": 8.33544150352878e-05, + "logits/chosen": -0.6560173034667969, + "logits/rejected": 1.1862332820892334, + "logps/chosen": -432.2843322753906, + "logps/rejected": -529.6439819335938, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.714084625244141, + "rewards/margins": 26.171829223632812, + "rewards/rejected": -32.88591766357422, + "step": 5580 + }, + { + "epoch": 17.35089078233927, + "grad_norm": 0.00016506008978467435, + "learning_rate": 8.272894356686039e-05, + "logits/chosen": -0.660111665725708, + "logits/rejected": 1.1549274921417236, + "logps/chosen": -441.23822021484375, + "logps/rejected": -559.5040893554688, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.083149909973145, + "rewards/margins": 27.01141357421875, + "rewards/rejected": -35.09456253051758, + "step": 5600 + }, + { + "epoch": 17.412858249419056, + "grad_norm": 0.00016628840239718556, + "learning_rate": 8.210416777406774e-05, + "logits/chosen": -0.5330893993377686, + "logits/rejected": 1.0295093059539795, + "logps/chosen": -420.89794921875, + "logps/rejected": -539.91064453125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.301202774047852, + "rewards/margins": 24.926477432250977, + "rewards/rejected": -33.22768020629883, + "step": 5620 + }, + { + "epoch": 17.474825716498838, + "grad_norm": 0.0009920025477185845, + "learning_rate": 8.148011282278772e-05, + "logits/chosen": -0.5239976048469543, + "logits/rejected": 1.067887306213379, + "logps/chosen": -454.75970458984375, + "logps/rejected": -577.5914306640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.137965202331543, + "rewards/margins": 26.359233856201172, + "rewards/rejected": -34.49720001220703, + "step": 5640 + }, + { + "epoch": 17.53679318357862, + "grad_norm": 3.656623812275939e-05, + "learning_rate": 8.085680384986276e-05, + "logits/chosen": -0.716755747795105, + "logits/rejected": 1.1244462728500366, + "logps/chosen": -430.6189880371094, + "logps/rejected": -534.71875, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.90305233001709, + "rewards/margins": 27.014053344726562, + "rewards/rejected": -33.91710662841797, + "step": 5660 + }, + { + "epoch": 17.598760650658406, + "grad_norm": 0.00035754471900872886, + "learning_rate": 8.023426596208739e-05, + "logits/chosen": -0.5729060769081116, + "logits/rejected": 1.239874005317688, + "logps/chosen": -421.12432861328125, + "logps/rejected": -516.0450439453125, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.880531311035156, + "rewards/margins": 25.47823715209961, + "rewards/rejected": -32.35877227783203, + "step": 5680 + }, + { + "epoch": 17.660728117738188, + "grad_norm": 0.00047625869046896696, + "learning_rate": 7.961252423519696e-05, + "logits/chosen": -0.5793383121490479, + "logits/rejected": 1.1896495819091797, + "logps/chosen": -414.25518798828125, + "logps/rejected": -531.8538818359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.382920742034912, + "rewards/margins": 26.68353271484375, + "rewards/rejected": -33.06645584106445, + "step": 5700 + }, + { + "epoch": 17.72269558481797, + "grad_norm": 8.376881305593997e-05, + "learning_rate": 7.899160371285761e-05, + "logits/chosen": -0.5482260584831238, + "logits/rejected": 1.0082305669784546, + "logps/chosen": -432.7474670410156, + "logps/rejected": -554.15478515625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.3407301902771, + "rewards/margins": 25.97622299194336, + "rewards/rejected": -33.31695556640625, + "step": 5720 + }, + { + "epoch": 17.784663051897752, + "grad_norm": 0.00044510714360512793, + "learning_rate": 7.837152940565741e-05, + "logits/chosen": -0.5911335945129395, + "logits/rejected": 1.0798364877700806, + "logps/chosen": -428.51202392578125, + "logps/rejected": -549.1173095703125, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.572705268859863, + "rewards/margins": 26.540054321289062, + "rewards/rejected": -34.11275863647461, + "step": 5740 + }, + { + "epoch": 17.846630518977538, + "grad_norm": 0.00010473801376065239, + "learning_rate": 7.775232629009904e-05, + "logits/chosen": -0.545853316783905, + "logits/rejected": 1.2726647853851318, + "logps/chosen": -423.32196044921875, + "logps/rejected": -526.806640625, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.698158264160156, + "rewards/margins": 26.736377716064453, + "rewards/rejected": -33.434532165527344, + "step": 5760 + }, + { + "epoch": 17.90859798605732, + "grad_norm": 0.0019047368550673127, + "learning_rate": 7.713401930759365e-05, + "logits/chosen": -0.4332125782966614, + "logits/rejected": 1.0440775156021118, + "logps/chosen": -416.0174865722656, + "logps/rejected": -550.7855834960938, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.867218971252441, + "rewards/margins": 26.22536849975586, + "rewards/rejected": -34.09259033203125, + "step": 5780 + }, + { + "epoch": 17.9705654531371, + "grad_norm": 0.00020266433421056718, + "learning_rate": 7.651663336345642e-05, + "logits/chosen": -0.6614434719085693, + "logits/rejected": 1.2425765991210938, + "logps/chosen": -412.87200927734375, + "logps/rejected": -513.8663940429688, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.660252571105957, + "rewards/margins": 27.139822006225586, + "rewards/rejected": -32.80007553100586, + "step": 5800 + }, + { + "epoch": 18.032532920216887, + "grad_norm": 0.0003341589472256601, + "learning_rate": 7.590019332590315e-05, + "logits/chosen": -0.5724080801010132, + "logits/rejected": 1.1166841983795166, + "logps/chosen": -428.62835693359375, + "logps/rejected": -538.9694213867188, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.769497871398926, + "rewards/margins": 25.40299415588379, + "rewards/rejected": -33.17249298095703, + "step": 5820 + }, + { + "epoch": 18.09450038729667, + "grad_norm": 0.0004949842114001513, + "learning_rate": 7.528472402504862e-05, + "logits/chosen": -0.5866089463233948, + "logits/rejected": 1.1626102924346924, + "logps/chosen": -438.97210693359375, + "logps/rejected": -541.1741943359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.18331241607666, + "rewards/margins": 25.09009552001953, + "rewards/rejected": -33.27341079711914, + "step": 5840 + }, + { + "epoch": 18.15646785437645, + "grad_norm": 6.053561810404062e-05, + "learning_rate": 7.467025025190657e-05, + "logits/chosen": -0.5582033395767212, + "logits/rejected": 1.0998704433441162, + "logps/chosen": -408.2658386230469, + "logps/rejected": -540.3822631835938, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.132643222808838, + "rewards/margins": 26.61345863342285, + "rewards/rejected": -33.74610137939453, + "step": 5860 + }, + { + "epoch": 18.218435321456237, + "grad_norm": 0.0005991118378005922, + "learning_rate": 7.405679675739096e-05, + "logits/chosen": -0.56462162733078, + "logits/rejected": 1.2322356700897217, + "logps/chosen": -431.82940673828125, + "logps/rejected": -530.2599487304688, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.95839786529541, + "rewards/margins": 26.209026336669922, + "rewards/rejected": -33.16742706298828, + "step": 5880 + }, + { + "epoch": 18.28040278853602, + "grad_norm": 0.002049060305580497, + "learning_rate": 7.344438825131911e-05, + "logits/chosen": -0.6229193210601807, + "logits/rejected": 1.1154416799545288, + "logps/chosen": -423.47125244140625, + "logps/rejected": -539.2529907226562, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.331400394439697, + "rewards/margins": 26.0443172454834, + "rewards/rejected": -33.3757209777832, + "step": 5900 + }, + { + "epoch": 18.3423702556158, + "grad_norm": 0.0006644345703534782, + "learning_rate": 7.283304940141637e-05, + "logits/chosen": -0.5369777679443359, + "logits/rejected": 1.0778067111968994, + "logps/chosen": -421.04803466796875, + "logps/rejected": -554.5689697265625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.97817325592041, + "rewards/margins": 27.166324615478516, + "rewards/rejected": -34.14449691772461, + "step": 5920 + }, + { + "epoch": 18.404337722695583, + "grad_norm": 0.0002038206730503589, + "learning_rate": 7.222280483232242e-05, + "logits/chosen": -0.6521028876304626, + "logits/rejected": 1.2157747745513916, + "logps/chosen": -426.2845153808594, + "logps/rejected": -528.0088500976562, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.181411266326904, + "rewards/margins": 26.629674911499023, + "rewards/rejected": -32.81108856201172, + "step": 5940 + }, + { + "epoch": 18.46630518977537, + "grad_norm": 9.838932601269335e-05, + "learning_rate": 7.161367912459954e-05, + "logits/chosen": -0.5511382222175598, + "logits/rejected": 1.1784141063690186, + "logps/chosen": -440.888916015625, + "logps/rejected": -546.6519165039062, + "loss": 0.0043, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.140920639038086, + "rewards/margins": 26.043033599853516, + "rewards/rejected": -34.18395233154297, + "step": 5960 + }, + { + "epoch": 18.52827265685515, + "grad_norm": 0.0001888351107481867, + "learning_rate": 7.100569681374245e-05, + "logits/chosen": -0.6422185301780701, + "logits/rejected": 1.0930224657058716, + "logps/chosen": -432.44708251953125, + "logps/rejected": -569.1817626953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.413304328918457, + "rewards/margins": 27.831863403320312, + "rewards/rejected": -35.24516677856445, + "step": 5980 + }, + { + "epoch": 18.590240123934933, + "grad_norm": 0.00025130840367637575, + "learning_rate": 7.039888238918993e-05, + "logits/chosen": -0.6784704327583313, + "logits/rejected": 1.2808759212493896, + "logps/chosen": -424.71820068359375, + "logps/rejected": -530.2955322265625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.33709192276001, + "rewards/margins": 27.75899314880371, + "rewards/rejected": -34.0960807800293, + "step": 6000 + }, + { + "epoch": 18.65220759101472, + "grad_norm": 0.000115082977572456, + "learning_rate": 6.979326029333855e-05, + "logits/chosen": -0.5961139798164368, + "logits/rejected": 1.1904783248901367, + "logps/chosen": -424.91619873046875, + "logps/rejected": -537.8231201171875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.321267127990723, + "rewards/margins": 27.020503997802734, + "rewards/rejected": -33.341773986816406, + "step": 6020 + }, + { + "epoch": 18.7141750580945, + "grad_norm": 0.00029704332700930536, + "learning_rate": 6.918885492055803e-05, + "logits/chosen": -0.5596092939376831, + "logits/rejected": 1.1183770895004272, + "logps/chosen": -401.2093505859375, + "logps/rejected": -517.4161376953125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.887650966644287, + "rewards/margins": 26.075496673583984, + "rewards/rejected": -32.9631462097168, + "step": 6040 + }, + { + "epoch": 18.776142525174283, + "grad_norm": 0.00012532403343357146, + "learning_rate": 6.858569061620862e-05, + "logits/chosen": -0.605311930179596, + "logits/rejected": 1.1743067502975464, + "logps/chosen": -431.15850830078125, + "logps/rejected": -542.1910400390625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.516722202301025, + "rewards/margins": 27.494848251342773, + "rewards/rejected": -34.011573791503906, + "step": 6060 + }, + { + "epoch": 18.838109992254065, + "grad_norm": 0.00032498795189894736, + "learning_rate": 6.798379167566064e-05, + "logits/chosen": -0.5115218162536621, + "logits/rejected": 1.1053214073181152, + "logps/chosen": -429.5455627441406, + "logps/rejected": -554.1275024414062, + "loss": 0.0043, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.813681125640869, + "rewards/margins": 26.788400650024414, + "rewards/rejected": -34.60208511352539, + "step": 6080 + }, + { + "epoch": 18.90007745933385, + "grad_norm": 0.00025639976956881583, + "learning_rate": 6.738318234331554e-05, + "logits/chosen": -0.5659054517745972, + "logits/rejected": 1.2672799825668335, + "logps/chosen": -422.6165466308594, + "logps/rejected": -531.7962646484375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.032742500305176, + "rewards/margins": 26.38532066345215, + "rewards/rejected": -33.418067932128906, + "step": 6100 + }, + { + "epoch": 18.962044926413633, + "grad_norm": 0.00040332350181415677, + "learning_rate": 6.67838868116297e-05, + "logits/chosen": -0.5604509711265564, + "logits/rejected": 1.3309004306793213, + "logps/chosen": -430.5907287597656, + "logps/rejected": -535.790771484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.163422584533691, + "rewards/margins": 26.109851837158203, + "rewards/rejected": -34.27326965332031, + "step": 6120 + }, + { + "epoch": 19.024012393493415, + "grad_norm": 0.0001713493256829679, + "learning_rate": 6.618592922013973e-05, + "logits/chosen": -0.6125264763832092, + "logits/rejected": 1.121382713317871, + "logps/chosen": -435.3624572753906, + "logps/rejected": -564.345947265625, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.341497898101807, + "rewards/margins": 27.505783081054688, + "rewards/rejected": -34.84728240966797, + "step": 6140 + }, + { + "epoch": 19.0859798605732, + "grad_norm": 2.4474145902786404e-05, + "learning_rate": 6.558933365449025e-05, + "logits/chosen": -0.4567294716835022, + "logits/rejected": 1.182701587677002, + "logps/chosen": -430.1796875, + "logps/rejected": -557.1361694335938, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.146208763122559, + "rewards/margins": 26.323129653930664, + "rewards/rejected": -34.469337463378906, + "step": 6160 + }, + { + "epoch": 19.147947327652982, + "grad_norm": 0.0014432374155148864, + "learning_rate": 6.499412414546362e-05, + "logits/chosen": -0.6155918836593628, + "logits/rejected": 1.3073813915252686, + "logps/chosen": -436.5602111816406, + "logps/rejected": -532.6322021484375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.379100799560547, + "rewards/margins": 26.880939483642578, + "rewards/rejected": -34.260040283203125, + "step": 6180 + }, + { + "epoch": 19.209914794732764, + "grad_norm": 4.102818638784811e-05, + "learning_rate": 6.440032466801215e-05, + "logits/chosen": -0.5177820920944214, + "logits/rejected": 1.2410507202148438, + "logps/chosen": -435.9002380371094, + "logps/rejected": -561.0885009765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.766203880310059, + "rewards/margins": 27.36117935180664, + "rewards/rejected": -35.12738037109375, + "step": 6200 + }, + { + "epoch": 19.27188226181255, + "grad_norm": 4.8919737309915945e-05, + "learning_rate": 6.380795914029213e-05, + "logits/chosen": -0.47016972303390503, + "logits/rejected": 1.1552751064300537, + "logps/chosen": -429.572265625, + "logps/rejected": -567.6087646484375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.870247840881348, + "rewards/margins": 27.22457504272461, + "rewards/rejected": -35.094825744628906, + "step": 6220 + }, + { + "epoch": 19.333849728892332, + "grad_norm": 0.0002924731816165149, + "learning_rate": 6.321705142270067e-05, + "logits/chosen": -0.6526008248329163, + "logits/rejected": 1.217797040939331, + "logps/chosen": -428.2090759277344, + "logps/rejected": -534.576904296875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.726796627044678, + "rewards/margins": 26.724050521850586, + "rewards/rejected": -33.450843811035156, + "step": 6240 + }, + { + "epoch": 19.395817195972114, + "grad_norm": 0.00014902207476552576, + "learning_rate": 6.262762531691451e-05, + "logits/chosen": -0.5008414387702942, + "logits/rejected": 1.121274471282959, + "logps/chosen": -416.0611267089844, + "logps/rejected": -548.2586669921875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.260308742523193, + "rewards/margins": 26.569469451904297, + "rewards/rejected": -33.82978057861328, + "step": 6260 + }, + { + "epoch": 19.457784663051896, + "grad_norm": 0.0001254824601346627, + "learning_rate": 6.203970456493118e-05, + "logits/chosen": -0.6224455237388611, + "logits/rejected": 1.1558345556259155, + "logps/chosen": -430.1212463378906, + "logps/rejected": -535.5614013671875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.150260925292969, + "rewards/margins": 26.5768985748291, + "rewards/rejected": -33.72715759277344, + "step": 6280 + }, + { + "epoch": 19.519752130131682, + "grad_norm": 0.00014725126675330102, + "learning_rate": 6.145331284811285e-05, + "logits/chosen": -0.6229298710823059, + "logits/rejected": 1.20758855342865, + "logps/chosen": -448.34716796875, + "logps/rejected": -562.9547119140625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.880606174468994, + "rewards/margins": 28.050073623657227, + "rewards/rejected": -34.93068313598633, + "step": 6300 + }, + { + "epoch": 19.581719597211464, + "grad_norm": 0.00048207101644948125, + "learning_rate": 6.0868473786232395e-05, + "logits/chosen": -0.5494848489761353, + "logits/rejected": 1.18673837184906, + "logps/chosen": -418.2279357910156, + "logps/rejected": -551.411376953125, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.64691686630249, + "rewards/margins": 27.100601196289062, + "rewards/rejected": -34.747520446777344, + "step": 6320 + }, + { + "epoch": 19.643687064291246, + "grad_norm": 0.0007689573685638607, + "learning_rate": 6.0285210936521955e-05, + "logits/chosen": -0.4644032418727875, + "logits/rejected": 1.1573445796966553, + "logps/chosen": -430.5718688964844, + "logps/rejected": -580.4948120117188, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.147204399108887, + "rewards/margins": 27.156152725219727, + "rewards/rejected": -35.3033561706543, + "step": 6340 + }, + { + "epoch": 19.70565453137103, + "grad_norm": 0.00015767944569233805, + "learning_rate": 5.9703547792724045e-05, + "logits/chosen": -0.521681010723114, + "logits/rejected": 1.169802188873291, + "logps/chosen": -419.7161560058594, + "logps/rejected": -550.3419799804688, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.470440864562988, + "rewards/margins": 27.15780258178711, + "rewards/rejected": -34.62824249267578, + "step": 6360 + }, + { + "epoch": 19.767621998450814, + "grad_norm": 0.000223572802497074, + "learning_rate": 5.912350778414531e-05, + "logits/chosen": -0.5011137127876282, + "logits/rejected": 1.0836546421051025, + "logps/chosen": -432.50421142578125, + "logps/rejected": -566.302734375, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.912287712097168, + "rewards/margins": 26.835742950439453, + "rewards/rejected": -34.74802780151367, + "step": 6380 + }, + { + "epoch": 19.829589465530596, + "grad_norm": 0.0003539229219313711, + "learning_rate": 5.8545114274712695e-05, + "logits/chosen": -0.5557172894477844, + "logits/rejected": 1.1703180074691772, + "logps/chosen": -436.09454345703125, + "logps/rejected": -560.4405517578125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.96994161605835, + "rewards/margins": 26.893199920654297, + "rewards/rejected": -34.86314010620117, + "step": 6400 + }, + { + "epoch": 19.891556932610378, + "grad_norm": 0.00028784500318579376, + "learning_rate": 5.796839056203247e-05, + "logits/chosen": -0.6601584553718567, + "logits/rejected": 1.2676100730895996, + "logps/chosen": -423.5615234375, + "logps/rejected": -529.6727294921875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.596652984619141, + "rewards/margins": 26.803829193115234, + "rewards/rejected": -33.400482177734375, + "step": 6420 + }, + { + "epoch": 19.953524399690163, + "grad_norm": 0.0001648878096602857, + "learning_rate": 5.7422070843492734e-05, + "logits/chosen": -0.5620417594909668, + "logits/rejected": 1.2710864543914795, + "logps/chosen": -440.33001708984375, + "logps/rejected": -560.01123046875, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.448406219482422, + "rewards/margins": 28.1207275390625, + "rewards/rejected": -35.56913757324219, + "step": 6440 + }, + { + "epoch": 20.015491866769946, + "grad_norm": 3.79412122128997e-05, + "learning_rate": 5.684866998866316e-05, + "logits/chosen": -0.6099050045013428, + "logits/rejected": 1.330368995666504, + "logps/chosen": -440.4766540527344, + "logps/rejected": -537.293701171875, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.571197509765625, + "rewards/margins": 26.832447052001953, + "rewards/rejected": -34.40364456176758, + "step": 6460 + }, + { + "epoch": 20.077459333849728, + "grad_norm": 0.00026894695474766195, + "learning_rate": 5.6277007263114437e-05, + "logits/chosen": -0.5007272958755493, + "logits/rejected": 1.3822646141052246, + "logps/chosen": -431.9654846191406, + "logps/rejected": -549.35546875, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.299338340759277, + "rewards/margins": 27.777185440063477, + "rewards/rejected": -35.07652282714844, + "step": 6480 + }, + { + "epoch": 20.139426800929513, + "grad_norm": 8.49057687446475e-05, + "learning_rate": 5.570710569333772e-05, + "logits/chosen": -0.6163416504859924, + "logits/rejected": 1.2752020359039307, + "logps/chosen": -447.9937438964844, + "logps/rejected": -553.1744384765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.5520758628845215, + "rewards/margins": 27.005313873291016, + "rewards/rejected": -34.55738830566406, + "step": 6500 + }, + { + "epoch": 20.201394268009295, + "grad_norm": 0.00022929662372916937, + "learning_rate": 5.513898823488528e-05, + "logits/chosen": -0.49778875708580017, + "logits/rejected": 1.2400882244110107, + "logps/chosen": -442.35284423828125, + "logps/rejected": -562.985595703125, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.439814567565918, + "rewards/margins": 27.415929794311523, + "rewards/rejected": -34.85574722290039, + "step": 6520 + }, + { + "epoch": 20.263361735089077, + "grad_norm": 2.7919993954128586e-05, + "learning_rate": 5.4572677771445344e-05, + "logits/chosen": -0.650310754776001, + "logits/rejected": 1.2607206106185913, + "logps/chosen": -437.0475158691406, + "logps/rejected": -534.0076904296875, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.755753517150879, + "rewards/margins": 26.984405517578125, + "rewards/rejected": -33.74016189575195, + "step": 6540 + }, + { + "epoch": 20.325329202168863, + "grad_norm": 3.8151458284119144e-05, + "learning_rate": 5.400819711392091e-05, + "logits/chosen": -0.5207892656326294, + "logits/rejected": 1.2761542797088623, + "logps/chosen": -422.2308654785156, + "logps/rejected": -537.1773071289062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.0580034255981445, + "rewards/margins": 27.111309051513672, + "rewards/rejected": -34.169307708740234, + "step": 6560 + }, + { + "epoch": 20.387296669248645, + "grad_norm": 8.57314735185355e-05, + "learning_rate": 5.344556899951054e-05, + "logits/chosen": -0.5232094526290894, + "logits/rejected": 1.1201660633087158, + "logps/chosen": -445.23309326171875, + "logps/rejected": -583.000732421875, + "loss": 0.0043, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -7.883921146392822, + "rewards/margins": 27.830123901367188, + "rewards/rejected": -35.71404266357422, + "step": 6580 + }, + { + "epoch": 20.449264136328427, + "grad_norm": 0.0002258592430735007, + "learning_rate": 5.288481609079259e-05, + "logits/chosen": -0.5788317322731018, + "logits/rejected": 1.0734844207763672, + "logps/chosen": -428.3194885253906, + "logps/rejected": -547.5687255859375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.71385669708252, + "rewards/margins": 26.13985252380371, + "rewards/rejected": -34.85371398925781, + "step": 6600 + }, + { + "epoch": 20.51123160340821, + "grad_norm": 9.367791790282354e-05, + "learning_rate": 5.232596097481251e-05, + "logits/chosen": -0.5063174366950989, + "logits/rejected": 1.2991888523101807, + "logps/chosen": -428.83673095703125, + "logps/rejected": -536.5294189453125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.142416954040527, + "rewards/margins": 26.242568969726562, + "rewards/rejected": -34.384986877441406, + "step": 6620 + }, + { + "epoch": 20.573199070487995, + "grad_norm": 0.00023494637571275234, + "learning_rate": 5.17690261621729e-05, + "logits/chosen": -0.521760880947113, + "logits/rejected": 1.2555716037750244, + "logps/chosen": -443.06781005859375, + "logps/rejected": -544.702880859375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.644153594970703, + "rewards/margins": 26.8060245513916, + "rewards/rejected": -34.45018005371094, + "step": 6640 + }, + { + "epoch": 20.635166537567777, + "grad_norm": 0.0011836939956992865, + "learning_rate": 5.121403408612672e-05, + "logits/chosen": -0.4721315801143646, + "logits/rejected": 1.1798968315124512, + "logps/chosen": -445.50909423828125, + "logps/rejected": -572.2584838867188, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.873363494873047, + "rewards/margins": 27.153827667236328, + "rewards/rejected": -36.027191162109375, + "step": 6660 + }, + { + "epoch": 20.69713400464756, + "grad_norm": 0.00022585636179428548, + "learning_rate": 5.066100710167401e-05, + "logits/chosen": -0.5391095876693726, + "logits/rejected": 1.2846006155014038, + "logps/chosen": -429.7838439941406, + "logps/rejected": -565.8057861328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.0114898681640625, + "rewards/margins": 28.408946990966797, + "rewards/rejected": -35.420440673828125, + "step": 6680 + }, + { + "epoch": 20.759101471727345, + "grad_norm": 7.968185673234984e-05, + "learning_rate": 5.010996748466088e-05, + "logits/chosen": -0.551699697971344, + "logits/rejected": 1.2516696453094482, + "logps/chosen": -421.5836486816406, + "logps/rejected": -523.4829711914062, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.205231666564941, + "rewards/margins": 26.57697105407715, + "rewards/rejected": -33.782203674316406, + "step": 6700 + }, + { + "epoch": 20.821068938807127, + "grad_norm": 9.455503459321335e-05, + "learning_rate": 4.956093743088291e-05, + "logits/chosen": -0.4987064003944397, + "logits/rejected": 1.2454708814620972, + "logps/chosen": -425.71221923828125, + "logps/rejected": -552.5072021484375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.379124641418457, + "rewards/margins": 26.97328758239746, + "rewards/rejected": -34.352413177490234, + "step": 6720 + }, + { + "epoch": 20.88303640588691, + "grad_norm": 5.707304808311164e-06, + "learning_rate": 4.901393905519055e-05, + "logits/chosen": -0.5764604806900024, + "logits/rejected": 1.3113079071044922, + "logps/chosen": -420.73919677734375, + "logps/rejected": -510.94891357421875, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.187965393066406, + "rewards/margins": 25.822784423828125, + "rewards/rejected": -33.01074981689453, + "step": 6740 + }, + { + "epoch": 20.945003872966694, + "grad_norm": 6.422119622584432e-05, + "learning_rate": 4.8468994390598574e-05, + "logits/chosen": -0.4906904101371765, + "logits/rejected": 1.1637732982635498, + "logps/chosen": -426.24737548828125, + "logps/rejected": -572.7752075195312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.849706172943115, + "rewards/margins": 28.3243350982666, + "rewards/rejected": -35.174041748046875, + "step": 6760 + }, + { + "epoch": 21.006971340046476, + "grad_norm": 0.0005418303771875799, + "learning_rate": 4.79261253873987e-05, + "logits/chosen": -0.4840869903564453, + "logits/rejected": 1.2838385105133057, + "logps/chosen": -414.9949645996094, + "logps/rejected": -528.3868408203125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.632540225982666, + "rewards/margins": 26.313039779663086, + "rewards/rejected": -33.945579528808594, + "step": 6780 + }, + { + "epoch": 21.06893880712626, + "grad_norm": 0.00022662655101157725, + "learning_rate": 4.7385353912275165e-05, + "logits/chosen": -0.5232284665107727, + "logits/rejected": 1.2743021249771118, + "logps/chosen": -430.8934631347656, + "logps/rejected": -557.8607177734375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.5154900550842285, + "rewards/margins": 27.52066993713379, + "rewards/rejected": -35.036163330078125, + "step": 6800 + }, + { + "epoch": 21.13090627420604, + "grad_norm": 6.364914497680729e-06, + "learning_rate": 4.684670174742412e-05, + "logits/chosen": -0.5547454953193665, + "logits/rejected": 1.209750771522522, + "logps/chosen": -437.41033935546875, + "logps/rejected": -568.2105712890625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.9095869064331055, + "rewards/margins": 27.975051879882812, + "rewards/rejected": -35.88463592529297, + "step": 6820 + }, + { + "epoch": 21.192873741285826, + "grad_norm": 8.028039883356541e-05, + "learning_rate": 4.631019058967627e-05, + "logits/chosen": -0.5240232348442078, + "logits/rejected": 1.104143738746643, + "logps/chosen": -407.6142272949219, + "logps/rejected": -549.146484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.8381028175354, + "rewards/margins": 27.074167251586914, + "rewards/rejected": -34.912269592285156, + "step": 6840 + }, + { + "epoch": 21.254841208365608, + "grad_norm": 4.430773697094992e-05, + "learning_rate": 4.5775842049622806e-05, + "logits/chosen": -0.4717990756034851, + "logits/rejected": 1.254495620727539, + "logps/chosen": -435.498779296875, + "logps/rejected": -542.4292602539062, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.805765151977539, + "rewards/margins": 25.804275512695312, + "rewards/rejected": -34.61003875732422, + "step": 6860 + }, + { + "epoch": 21.31680867544539, + "grad_norm": 0.00010798404400702566, + "learning_rate": 4.524367765074499e-05, + "logits/chosen": -0.5588334798812866, + "logits/rejected": 1.3098465204238892, + "logps/chosen": -430.4671325683594, + "logps/rejected": -547.7396850585938, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.4153337478637695, + "rewards/margins": 27.558120727539062, + "rewards/rejected": -34.973453521728516, + "step": 6880 + }, + { + "epoch": 21.378776142525176, + "grad_norm": 0.00025497484602965415, + "learning_rate": 4.471371882854723e-05, + "logits/chosen": -0.5086442232131958, + "logits/rejected": 1.2061560153961182, + "logps/chosen": -423.62109375, + "logps/rejected": -562.035888671875, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.9457902908325195, + "rewards/margins": 27.93740463256836, + "rewards/rejected": -34.88319778442383, + "step": 6900 + }, + { + "epoch": 21.440743609604958, + "grad_norm": 0.00021185963123571128, + "learning_rate": 4.4185986929693546e-05, + "logits/chosen": -0.5642815828323364, + "logits/rejected": 1.322967767715454, + "logps/chosen": -430.9812927246094, + "logps/rejected": -532.7042846679688, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.864518642425537, + "rewards/margins": 26.29730796813965, + "rewards/rejected": -34.161827087402344, + "step": 6920 + }, + { + "epoch": 21.50271107668474, + "grad_norm": 5.743455403717235e-05, + "learning_rate": 4.366050321114796e-05, + "logits/chosen": -0.530922532081604, + "logits/rejected": 1.3586127758026123, + "logps/chosen": -413.8951110839844, + "logps/rejected": -528.0460205078125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.085272789001465, + "rewards/margins": 27.594009399414062, + "rewards/rejected": -34.679283142089844, + "step": 6940 + }, + { + "epoch": 21.564678543764522, + "grad_norm": 0.00015524946502409875, + "learning_rate": 4.3137288839318014e-05, + "logits/chosen": -0.5405689477920532, + "logits/rejected": 1.2518033981323242, + "logps/chosen": -430.952392578125, + "logps/rejected": -577.5679931640625, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.740090370178223, + "rewards/margins": 28.604055404663086, + "rewards/rejected": -36.34414291381836, + "step": 6960 + }, + { + "epoch": 21.626646010844308, + "grad_norm": 0.0003134405706077814, + "learning_rate": 4.2616364889202254e-05, + "logits/chosen": -0.5319267511367798, + "logits/rejected": 1.2375494241714478, + "logps/chosen": -430.71209716796875, + "logps/rejected": -563.7471923828125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.725900173187256, + "rewards/margins": 27.626745223999023, + "rewards/rejected": -35.3526496887207, + "step": 6980 + }, + { + "epoch": 21.68861347792409, + "grad_norm": 0.0003931570390705019, + "learning_rate": 4.209775234354151e-05, + "logits/chosen": -0.5283772349357605, + "logits/rejected": 1.2156752347946167, + "logps/chosen": -417.3208923339844, + "logps/rejected": -555.2176513671875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.691795349121094, + "rewards/margins": 28.186809539794922, + "rewards/rejected": -35.87860870361328, + "step": 7000 + }, + { + "epoch": 21.750580945003872, + "grad_norm": 0.0001691762008704245, + "learning_rate": 4.158147209197347e-05, + "logits/chosen": -0.49120578169822693, + "logits/rejected": 1.30716073513031, + "logps/chosen": -423.93121337890625, + "logps/rejected": -562.0618286132812, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.96550989151001, + "rewards/margins": 28.568927764892578, + "rewards/rejected": -35.53443908691406, + "step": 7020 + }, + { + "epoch": 21.812548412083657, + "grad_norm": 0.0004458031035028398, + "learning_rate": 4.106754493019138e-05, + "logits/chosen": -0.5782488584518433, + "logits/rejected": 1.2537811994552612, + "logps/chosen": -440.19927978515625, + "logps/rejected": -559.7462158203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.946485996246338, + "rewards/margins": 27.713253021240234, + "rewards/rejected": -35.65974044799805, + "step": 7040 + }, + { + "epoch": 21.87451587916344, + "grad_norm": 0.0034644545521587133, + "learning_rate": 4.055599155910639e-05, + "logits/chosen": -0.48173093795776367, + "logits/rejected": 1.1414930820465088, + "logps/chosen": -455.94866943359375, + "logps/rejected": -615.8961181640625, + "loss": 0.0054, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.730755805969238, + "rewards/margins": 28.979084014892578, + "rewards/rejected": -37.709835052490234, + "step": 7060 + }, + { + "epoch": 21.93648334624322, + "grad_norm": 3.83302649424877e-05, + "learning_rate": 4.004683258401366e-05, + "logits/chosen": -0.4850381314754486, + "logits/rejected": 1.4388806819915771, + "logps/chosen": -431.77947998046875, + "logps/rejected": -547.6139526367188, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.851006507873535, + "rewards/margins": 27.237091064453125, + "rewards/rejected": -35.088096618652344, + "step": 7080 + }, + { + "epoch": 21.998450813323004, + "grad_norm": 6.421873695217073e-05, + "learning_rate": 3.954008851376252e-05, + "logits/chosen": -0.5726882219314575, + "logits/rejected": 1.2820627689361572, + "logps/chosen": -424.2509765625, + "logps/rejected": -561.9562377929688, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.327461242675781, + "rewards/margins": 28.40390968322754, + "rewards/rejected": -35.73137283325195, + "step": 7100 + }, + { + "epoch": 22.06041828040279, + "grad_norm": 0.0006721434183418751, + "learning_rate": 3.903577975993021e-05, + "logits/chosen": -0.410485178232193, + "logits/rejected": 1.2668616771697998, + "logps/chosen": -414.65509033203125, + "logps/rejected": -551.499755859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.70891809463501, + "rewards/margins": 27.578258514404297, + "rewards/rejected": -35.28717803955078, + "step": 7120 + }, + { + "epoch": 22.12238574748257, + "grad_norm": 0.00025342078879475594, + "learning_rate": 3.853392663599976e-05, + "logits/chosen": -0.5337496995925903, + "logits/rejected": 1.3907458782196045, + "logps/chosen": -442.95892333984375, + "logps/rejected": -538.0328369140625, + "loss": 0.0043, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -8.646308898925781, + "rewards/margins": 26.30740737915039, + "rewards/rejected": -34.95371627807617, + "step": 7140 + }, + { + "epoch": 22.184353214562353, + "grad_norm": 0.0006203025695867836, + "learning_rate": 3.8034549356541894e-05, + "logits/chosen": -0.6515469551086426, + "logits/rejected": 1.425569772720337, + "logps/chosen": -416.54864501953125, + "logps/rejected": -507.20306396484375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.088509559631348, + "rewards/margins": 26.99295997619629, + "rewards/rejected": -33.08147048950195, + "step": 7160 + }, + { + "epoch": 22.24632068164214, + "grad_norm": 0.001270798034965992, + "learning_rate": 3.7537668036400574e-05, + "logits/chosen": -0.4973440170288086, + "logits/rejected": 1.3225786685943604, + "logps/chosen": -438.9662170410156, + "logps/rejected": -548.2359619140625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.235987186431885, + "rewards/margins": 27.249414443969727, + "rewards/rejected": -34.48540115356445, + "step": 7180 + }, + { + "epoch": 22.30828814872192, + "grad_norm": 0.0001219348851009272, + "learning_rate": 3.704330268988293e-05, + "logits/chosen": -0.553697943687439, + "logits/rejected": 1.298783779144287, + "logps/chosen": -437.8828125, + "logps/rejected": -566.4822998046875, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.940356254577637, + "rewards/margins": 27.971059799194336, + "rewards/rejected": -35.911415100097656, + "step": 7200 + }, + { + "epoch": 22.370255615801703, + "grad_norm": 0.0001257601979887113, + "learning_rate": 3.6551473229953037e-05, + "logits/chosen": -0.4942377209663391, + "logits/rejected": 1.3459523916244507, + "logps/chosen": -437.99273681640625, + "logps/rejected": -557.7435913085938, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.5271759033203125, + "rewards/margins": 27.564788818359375, + "rewards/rejected": -35.09196853637695, + "step": 7220 + }, + { + "epoch": 22.43222308288149, + "grad_norm": 5.536680873774458e-06, + "learning_rate": 3.606219946742978e-05, + "logits/chosen": -0.5135564804077148, + "logits/rejected": 1.2007367610931396, + "logps/chosen": -429.4864807128906, + "logps/rejected": -593.4481201171875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.349099159240723, + "rewards/margins": 29.2182674407959, + "rewards/rejected": -36.56736755371094, + "step": 7240 + }, + { + "epoch": 22.49419054996127, + "grad_norm": 0.0003757201775442809, + "learning_rate": 3.557550111018906e-05, + "logits/chosen": -0.4902525544166565, + "logits/rejected": 1.1989456415176392, + "logps/chosen": -445.7784118652344, + "logps/rejected": -589.0730590820312, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.687643527984619, + "rewards/margins": 28.44379234313965, + "rewards/rejected": -36.131439208984375, + "step": 7260 + }, + { + "epoch": 22.556158017041053, + "grad_norm": 0.0004273348895367235, + "learning_rate": 3.509139776236967e-05, + "logits/chosen": -0.5242363810539246, + "logits/rejected": 1.3806064128875732, + "logps/chosen": -443.4256286621094, + "logps/rejected": -554.4573364257812, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.07096004486084, + "rewards/margins": 27.879369735717773, + "rewards/rejected": -34.95033645629883, + "step": 7280 + }, + { + "epoch": 22.618125484120835, + "grad_norm": 0.00013076326285954565, + "learning_rate": 3.460990892358388e-05, + "logits/chosen": -0.46005791425704956, + "logits/rejected": 1.3477107286453247, + "logps/chosen": -448.58856201171875, + "logps/rejected": -565.8875122070312, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.28325366973877, + "rewards/margins": 27.60245704650879, + "rewards/rejected": -35.88571548461914, + "step": 7300 + }, + { + "epoch": 22.68009295120062, + "grad_norm": 4.240042107994668e-05, + "learning_rate": 3.413105398813195e-05, + "logits/chosen": -0.4579412043094635, + "logits/rejected": 1.342012643814087, + "logps/chosen": -438.57684326171875, + "logps/rejected": -577.7442626953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.141871452331543, + "rewards/margins": 28.370080947875977, + "rewards/rejected": -36.5119514465332, + "step": 7320 + }, + { + "epoch": 22.742060418280403, + "grad_norm": 7.035260750853922e-06, + "learning_rate": 3.3654852244220826e-05, + "logits/chosen": -0.4461139738559723, + "logits/rejected": 1.2642186880111694, + "logps/chosen": -411.8309631347656, + "logps/rejected": -534.2032470703125, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.252596855163574, + "rewards/margins": 26.30681800842285, + "rewards/rejected": -34.55941390991211, + "step": 7340 + }, + { + "epoch": 22.804027885360185, + "grad_norm": 0.0002788783167488873, + "learning_rate": 3.3181322873187326e-05, + "logits/chosen": -0.5451353192329407, + "logits/rejected": 1.40205979347229, + "logps/chosen": -433.80523681640625, + "logps/rejected": -542.520263671875, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.425281524658203, + "rewards/margins": 28.070571899414062, + "rewards/rejected": -34.495853424072266, + "step": 7360 + }, + { + "epoch": 22.86599535243997, + "grad_norm": 0.00026678614085540175, + "learning_rate": 3.271048494872546e-05, + "logits/chosen": -0.47595128417015076, + "logits/rejected": 1.2741992473602295, + "logps/chosen": -418.16845703125, + "logps/rejected": -567.8311767578125, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.957074165344238, + "rewards/margins": 28.70499610900879, + "rewards/rejected": -35.662071228027344, + "step": 7380 + }, + { + "epoch": 22.927962819519752, + "grad_norm": 0.0001355899585178122, + "learning_rate": 3.224235743611814e-05, + "logits/chosen": -0.4618222713470459, + "logits/rejected": 1.414473295211792, + "logps/chosen": -431.6698303222656, + "logps/rejected": -572.1229248046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.292036533355713, + "rewards/margins": 28.899133682250977, + "rewards/rejected": -36.19116973876953, + "step": 7400 + }, + { + "epoch": 22.989930286599535, + "grad_norm": 0.00026807800168171525, + "learning_rate": 3.177695919147339e-05, + "logits/chosen": -0.4957195222377777, + "logits/rejected": 1.3398468494415283, + "logps/chosen": -426.016845703125, + "logps/rejected": -548.435302734375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.988835334777832, + "rewards/margins": 28.052658081054688, + "rewards/rejected": -35.0414924621582, + "step": 7420 + }, + { + "epoch": 23.051897753679317, + "grad_norm": 0.0001755996490828693, + "learning_rate": 3.131430896096459e-05, + "logits/chosen": -0.4614803194999695, + "logits/rejected": 1.3167946338653564, + "logps/chosen": -420.3673400878906, + "logps/rejected": -540.8953857421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.986320495605469, + "rewards/margins": 26.317516326904297, + "rewards/rejected": -34.30384063720703, + "step": 7440 + }, + { + "epoch": 23.113865220759102, + "grad_norm": 8.663154585519806e-05, + "learning_rate": 3.0854425380075544e-05, + "logits/chosen": -0.4759630262851715, + "logits/rejected": 1.354256272315979, + "logps/chosen": -444.06231689453125, + "logps/rejected": -582.34619140625, + "loss": 0.0065, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.985074520111084, + "rewards/margins": 28.706161499023438, + "rewards/rejected": -36.69123840332031, + "step": 7460 + }, + { + "epoch": 23.175832687838884, + "grad_norm": 0.00045451842015609145, + "learning_rate": 3.0397326972849892e-05, + "logits/chosen": -0.5278170108795166, + "logits/rejected": 1.2997605800628662, + "logps/chosen": -415.7640686035156, + "logps/rejected": -550.7852783203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.4368720054626465, + "rewards/margins": 27.63351821899414, + "rewards/rejected": -35.07038497924805, + "step": 7480 + }, + { + "epoch": 23.237800154918666, + "grad_norm": 8.681453618919477e-05, + "learning_rate": 2.9943032151144812e-05, + "logits/chosen": -0.5060716867446899, + "logits/rejected": 1.314741849899292, + "logps/chosen": -424.48583984375, + "logps/rejected": -550.7124633789062, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.339421272277832, + "rewards/margins": 27.69765281677246, + "rewards/rejected": -35.03707504272461, + "step": 7500 + }, + { + "epoch": 23.299767621998452, + "grad_norm": 2.901791231124662e-05, + "learning_rate": 2.949155921388943e-05, + "logits/chosen": -0.48493748903274536, + "logits/rejected": 1.361489176750183, + "logps/chosen": -439.04864501953125, + "logps/rejected": -573.9337158203125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.827279567718506, + "rewards/margins": 28.83859634399414, + "rewards/rejected": -36.665870666503906, + "step": 7520 + }, + { + "epoch": 23.361735089078234, + "grad_norm": 0.00018417155661154538, + "learning_rate": 2.904292634634793e-05, + "logits/chosen": -0.5873640775680542, + "logits/rejected": 1.3690621852874756, + "logps/chosen": -431.43994140625, + "logps/rejected": -537.2473754882812, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.988741397857666, + "rewards/margins": 27.92215919494629, + "rewards/rejected": -34.91089630126953, + "step": 7540 + }, + { + "epoch": 23.423702556158016, + "grad_norm": 0.00022559291392099112, + "learning_rate": 2.8597151619386707e-05, + "logits/chosen": -0.5808820128440857, + "logits/rejected": 1.3238177299499512, + "logps/chosen": -447.241455078125, + "logps/rejected": -549.6907958984375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.945500373840332, + "rewards/margins": 26.819957733154297, + "rewards/rejected": -33.76545715332031, + "step": 7560 + }, + { + "epoch": 23.4856700232378, + "grad_norm": 0.0003778359678108245, + "learning_rate": 2.8154252988746755e-05, + "logits/chosen": -0.3938234746456146, + "logits/rejected": 1.2302014827728271, + "logps/chosen": -422.442626953125, + "logps/rejected": -562.1705932617188, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.191490173339844, + "rewards/margins": 27.9207763671875, + "rewards/rejected": -35.11227035522461, + "step": 7580 + }, + { + "epoch": 23.547637490317584, + "grad_norm": 0.00015092053217813373, + "learning_rate": 2.771424829432041e-05, + "logits/chosen": -0.5172373056411743, + "logits/rejected": 1.2900029420852661, + "logps/chosen": -445.7874450683594, + "logps/rejected": -598.3099365234375, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.032974243164062, + "rewards/margins": 29.89381980895996, + "rewards/rejected": -37.926795959472656, + "step": 7600 + }, + { + "epoch": 23.609604957397366, + "grad_norm": 0.00026129186153411865, + "learning_rate": 2.727715525943253e-05, + "logits/chosen": -0.5197489857673645, + "logits/rejected": 1.264127492904663, + "logps/chosen": -439.525146484375, + "logps/rejected": -574.5367431640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.128179550170898, + "rewards/margins": 27.236980438232422, + "rewards/rejected": -35.36515426635742, + "step": 7620 + }, + { + "epoch": 23.671572424477148, + "grad_norm": 4.1022536606760696e-05, + "learning_rate": 2.68429914901269e-05, + "logits/chosen": -0.532270073890686, + "logits/rejected": 1.3862292766571045, + "logps/chosen": -425.9048767089844, + "logps/rejected": -557.3917236328125, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.1133575439453125, + "rewards/margins": 28.244802474975586, + "rewards/rejected": -35.35816192626953, + "step": 7640 + }, + { + "epoch": 23.733539891556934, + "grad_norm": 9.765337745193392e-05, + "learning_rate": 2.6411774474456797e-05, + "logits/chosen": -0.5974343419075012, + "logits/rejected": 1.36226487159729, + "logps/chosen": -439.45269775390625, + "logps/rejected": -552.4537963867188, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.1931352615356445, + "rewards/margins": 27.47402572631836, + "rewards/rejected": -34.66716384887695, + "step": 7660 + }, + { + "epoch": 23.795507358636716, + "grad_norm": 0.0002056274825008586, + "learning_rate": 2.5983521581780724e-05, + "logits/chosen": -0.46689167618751526, + "logits/rejected": 1.3515841960906982, + "logps/chosen": -442.60284423828125, + "logps/rejected": -573.5643310546875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.276591300964355, + "rewards/margins": 28.31182861328125, + "rewards/rejected": -36.588417053222656, + "step": 7680 + }, + { + "epoch": 23.857474825716498, + "grad_norm": 0.00024225719971582294, + "learning_rate": 2.5558250062062828e-05, + "logits/chosen": -0.4331149160861969, + "logits/rejected": 1.337519884109497, + "logps/chosen": -453.8560485839844, + "logps/rejected": -565.5623779296875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.228255271911621, + "rewards/margins": 26.08279800415039, + "rewards/rejected": -35.311058044433594, + "step": 7700 + }, + { + "epoch": 23.919442292796283, + "grad_norm": 0.00014199658471625298, + "learning_rate": 2.5135977045177815e-05, + "logits/chosen": -0.43462008237838745, + "logits/rejected": 1.2873212099075317, + "logps/chosen": -420.39898681640625, + "logps/rejected": -550.4713134765625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.8913254737854, + "rewards/margins": 26.91733741760254, + "rewards/rejected": -34.80866241455078, + "step": 7720 + }, + { + "epoch": 23.981409759876065, + "grad_norm": 0.0002433314803056419, + "learning_rate": 2.4716719540221268e-05, + "logits/chosen": -0.45984095335006714, + "logits/rejected": 1.445708990097046, + "logps/chosen": -409.4692687988281, + "logps/rejected": -525.4560546875, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.901988983154297, + "rewards/margins": 27.886280059814453, + "rewards/rejected": -34.78826904296875, + "step": 7740 + }, + { + "epoch": 24.043377226955847, + "grad_norm": 0.0001660619891481474, + "learning_rate": 2.4300494434824373e-05, + "logits/chosen": -0.4552191197872162, + "logits/rejected": 1.3290296792984009, + "logps/chosen": -426.6686096191406, + "logps/rejected": -570.5438232421875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.440762519836426, + "rewards/margins": 28.601394653320312, + "rewards/rejected": -36.042152404785156, + "step": 7760 + }, + { + "epoch": 24.10534469403563, + "grad_norm": 8.915072248782963e-05, + "learning_rate": 2.3887318494473677e-05, + "logits/chosen": -0.539429783821106, + "logits/rejected": 1.2853329181671143, + "logps/chosen": -416.55908203125, + "logps/rejected": -535.9031982421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.548221588134766, + "rewards/margins": 26.924367904663086, + "rewards/rejected": -34.47259521484375, + "step": 7780 + }, + { + "epoch": 24.167312161115415, + "grad_norm": 0.0002355161268496886, + "learning_rate": 2.347720836183578e-05, + "logits/chosen": -0.4840586185455322, + "logits/rejected": 1.3917993307113647, + "logps/chosen": -424.24969482421875, + "logps/rejected": -553.1238403320312, + "loss": 0.0043, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.6523895263671875, + "rewards/margins": 28.274377822875977, + "rewards/rejected": -34.92676544189453, + "step": 7800 + }, + { + "epoch": 24.229279628195197, + "grad_norm": 0.0002348150301259011, + "learning_rate": 2.3070180556087074e-05, + "logits/chosen": -0.4440614581108093, + "logits/rejected": 1.27870512008667, + "logps/chosen": -444.35772705078125, + "logps/rejected": -572.5586547851562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.005529403686523, + "rewards/margins": 27.384714126586914, + "rewards/rejected": -35.39024353027344, + "step": 7820 + }, + { + "epoch": 24.29124709527498, + "grad_norm": 4.529371653916314e-05, + "learning_rate": 2.266625147224817e-05, + "logits/chosen": -0.5220253467559814, + "logits/rejected": 1.4514058828353882, + "logps/chosen": -432.4845275878906, + "logps/rejected": -534.2028198242188, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.848171234130859, + "rewards/margins": 27.818805694580078, + "rewards/rejected": -34.66697692871094, + "step": 7840 + }, + { + "epoch": 24.353214562354765, + "grad_norm": 0.0005656908615492284, + "learning_rate": 2.2265437380523734e-05, + "logits/chosen": -0.5128262639045715, + "logits/rejected": 1.2830040454864502, + "logps/chosen": -459.4671325683594, + "logps/rejected": -589.1271362304688, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.426397323608398, + "rewards/margins": 28.289302825927734, + "rewards/rejected": -36.715702056884766, + "step": 7860 + }, + { + "epoch": 24.415182029434547, + "grad_norm": 0.00024748279247432947, + "learning_rate": 2.1867754425646926e-05, + "logits/chosen": -0.5120356678962708, + "logits/rejected": 1.4352920055389404, + "logps/chosen": -449.92620849609375, + "logps/rejected": -586.4793090820312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.046315670013428, + "rewards/margins": 29.50286293029785, + "rewards/rejected": -36.54917907714844, + "step": 7880 + }, + { + "epoch": 24.47714949651433, + "grad_norm": 0.0029017701745033264, + "learning_rate": 2.1473218626229095e-05, + "logits/chosen": -0.49962282180786133, + "logits/rejected": 1.3568048477172852, + "logps/chosen": -440.7109375, + "logps/rejected": -566.8717651367188, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.135717868804932, + "rewards/margins": 28.588165283203125, + "rewards/rejected": -35.723880767822266, + "step": 7900 + }, + { + "epoch": 24.539116963594115, + "grad_norm": 0.0006671111332252622, + "learning_rate": 2.1081845874114815e-05, + "logits/chosen": -0.49870556592941284, + "logits/rejected": 1.3455699682235718, + "logps/chosen": -427.1560974121094, + "logps/rejected": -567.7340087890625, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.85535192489624, + "rewards/margins": 29.06607437133789, + "rewards/rejected": -35.921424865722656, + "step": 7920 + }, + { + "epoch": 24.601084430673897, + "grad_norm": 0.0003407177282497287, + "learning_rate": 2.069365193374142e-05, + "logits/chosen": -0.4392669200897217, + "logits/rejected": 1.4311549663543701, + "logps/chosen": -438.2574768066406, + "logps/rejected": -559.4137573242188, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.767866611480713, + "rewards/margins": 27.89835548400879, + "rewards/rejected": -35.66621780395508, + "step": 7940 + }, + { + "epoch": 24.66305189775368, + "grad_norm": 9.20661841519177e-05, + "learning_rate": 2.0308652441504217e-05, + "logits/chosen": -0.4642259478569031, + "logits/rejected": 1.3726909160614014, + "logps/chosen": -434.41571044921875, + "logps/rejected": -561.6458129882812, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.912638187408447, + "rewards/margins": 28.36699867248535, + "rewards/rejected": -36.27963638305664, + "step": 7960 + }, + { + "epoch": 24.72501936483346, + "grad_norm": 0.00026214818353764713, + "learning_rate": 1.9926862905126665e-05, + "logits/chosen": -0.4355667531490326, + "logits/rejected": 1.2842066287994385, + "logps/chosen": -454.08746337890625, + "logps/rejected": -588.8795166015625, + "loss": 0.0043, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.499858856201172, + "rewards/margins": 27.742816925048828, + "rewards/rejected": -36.24267578125, + "step": 7980 + }, + { + "epoch": 24.786986831913246, + "grad_norm": 1.4072214980842546e-05, + "learning_rate": 1.954829870303555e-05, + "logits/chosen": -0.4878915250301361, + "logits/rejected": 1.3544992208480835, + "logps/chosen": -441.7566833496094, + "logps/rejected": -572.3592529296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.753843307495117, + "rewards/margins": 27.906024932861328, + "rewards/rejected": -36.65986251831055, + "step": 8000 + }, + { + "epoch": 24.84895429899303, + "grad_norm": 4.5072305510984734e-05, + "learning_rate": 1.9172975083741817e-05, + "logits/chosen": -0.5160379409790039, + "logits/rejected": 1.3292560577392578, + "logps/chosen": -430.331787109375, + "logps/rejected": -547.330322265625, + "loss": 0.0043, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.6765336990356445, + "rewards/margins": 26.8677921295166, + "rewards/rejected": -34.54432678222656, + "step": 8020 + }, + { + "epoch": 24.91092176607281, + "grad_norm": 0.00028837990248575807, + "learning_rate": 1.8800907165226066e-05, + "logits/chosen": -0.5141741633415222, + "logits/rejected": 1.4077479839324951, + "logps/chosen": -430.82916259765625, + "logps/rejected": -552.1541748046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.073647499084473, + "rewards/margins": 28.02901268005371, + "rewards/rejected": -36.102657318115234, + "step": 8040 + }, + { + "epoch": 24.972889233152596, + "grad_norm": 0.0027730674482882023, + "learning_rate": 1.8432109934329834e-05, + "logits/chosen": -0.4535338878631592, + "logits/rejected": 1.4890105724334717, + "logps/chosen": -427.42724609375, + "logps/rejected": -543.154052734375, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.028363227844238, + "rewards/margins": 27.05509376525879, + "rewards/rejected": -35.08345413208008, + "step": 8060 + }, + { + "epoch": 25.03485670023238, + "grad_norm": 0.00010149605077458546, + "learning_rate": 1.8066598246151768e-05, + "logits/chosen": -0.5200473070144653, + "logits/rejected": 1.3415261507034302, + "logps/chosen": -436.22723388671875, + "logps/rejected": -571.4678344726562, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.299592971801758, + "rewards/margins": 28.236587524414062, + "rewards/rejected": -36.53618240356445, + "step": 8080 + }, + { + "epoch": 25.09682416731216, + "grad_norm": 4.2563999159028754e-05, + "learning_rate": 1.7704386823449403e-05, + "logits/chosen": -0.5422636866569519, + "logits/rejected": 1.3826935291290283, + "logps/chosen": -439.9010314941406, + "logps/rejected": -581.65380859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.662468910217285, + "rewards/margins": 29.301517486572266, + "rewards/rejected": -36.963985443115234, + "step": 8100 + }, + { + "epoch": 25.158791634391946, + "grad_norm": 6.259313522605225e-05, + "learning_rate": 1.7345490256045993e-05, + "logits/chosen": -0.4815981984138489, + "logits/rejected": 1.3883564472198486, + "logps/chosen": -438.5204162597656, + "logps/rejected": -578.703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.347479343414307, + "rewards/margins": 28.74386215209961, + "rewards/rejected": -36.09134292602539, + "step": 8120 + }, + { + "epoch": 25.220759101471728, + "grad_norm": 6.804763688705862e-05, + "learning_rate": 1.6989923000243e-05, + "logits/chosen": -0.43593135476112366, + "logits/rejected": 1.4665887355804443, + "logps/chosen": -415.4696350097656, + "logps/rejected": -555.9053955078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.401350498199463, + "rewards/margins": 28.641956329345703, + "rewards/rejected": -36.043304443359375, + "step": 8140 + }, + { + "epoch": 25.28272656855151, + "grad_norm": 0.00014422877575270832, + "learning_rate": 1.6637699378237605e-05, + "logits/chosen": -0.5304734110832214, + "logits/rejected": 1.378281831741333, + "logps/chosen": -435.180419921875, + "logps/rejected": -561.174560546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.14901876449585, + "rewards/margins": 28.46111488342285, + "rewards/rejected": -35.610137939453125, + "step": 8160 + }, + { + "epoch": 25.344694035631292, + "grad_norm": 0.00013896219024900347, + "learning_rate": 1.6288833577545914e-05, + "logits/chosen": -0.4343484938144684, + "logits/rejected": 1.2599773406982422, + "logps/chosen": -425.02606201171875, + "logps/rejected": -573.0264282226562, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.175013542175293, + "rewards/margins": 27.731231689453125, + "rewards/rejected": -35.90624237060547, + "step": 8180 + }, + { + "epoch": 25.406661502711078, + "grad_norm": 4.538395660347305e-05, + "learning_rate": 1.5943339650431576e-05, + "logits/chosen": -0.5071940422058105, + "logits/rejected": 1.3921631574630737, + "logps/chosen": -432.5198669433594, + "logps/rejected": -560.261962890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.497560024261475, + "rewards/margins": 28.007415771484375, + "rewards/rejected": -35.50497817993164, + "step": 8200 + }, + { + "epoch": 25.46862896979086, + "grad_norm": 0.00019486816017888486, + "learning_rate": 1.5601231513339565e-05, + "logits/chosen": -0.47233065962791443, + "logits/rejected": 1.3630597591400146, + "logps/chosen": -425.60736083984375, + "logps/rejected": -546.03662109375, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.1140923500061035, + "rewards/margins": 27.629613876342773, + "rewards/rejected": -34.74370574951172, + "step": 8220 + }, + { + "epoch": 25.530596436870642, + "grad_norm": 0.0008970848866738379, + "learning_rate": 1.5262522946335755e-05, + "logits/chosen": -0.5086788535118103, + "logits/rejected": 1.389817476272583, + "logps/chosen": -428.55328369140625, + "logps/rejected": -537.9722900390625, + "loss": 0.0054, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.565539360046387, + "rewards/margins": 27.265399932861328, + "rewards/rejected": -34.83094024658203, + "step": 8240 + }, + { + "epoch": 25.592563903950428, + "grad_norm": 0.0003016830887645483, + "learning_rate": 1.492722759255184e-05, + "logits/chosen": -0.4972669184207916, + "logits/rejected": 1.4621047973632812, + "logps/chosen": -442.19903564453125, + "logps/rejected": -546.1878051757812, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.644277095794678, + "rewards/margins": 27.799856185913086, + "rewards/rejected": -35.44413375854492, + "step": 8260 + }, + { + "epoch": 25.65453137103021, + "grad_norm": 0.00022154908219818026, + "learning_rate": 1.4595358957635763e-05, + "logits/chosen": -0.5789279341697693, + "logits/rejected": 1.2437044382095337, + "logps/chosen": -430.11151123046875, + "logps/rejected": -562.6528930664062, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.121301174163818, + "rewards/margins": 28.626384735107422, + "rewards/rejected": -35.74768829345703, + "step": 8280 + }, + { + "epoch": 25.71649883810999, + "grad_norm": 1.7708864106680267e-05, + "learning_rate": 1.4266930409207791e-05, + "logits/chosen": -0.5174465775489807, + "logits/rejected": 1.299397587776184, + "logps/chosen": -445.3390197753906, + "logps/rejected": -575.9884643554688, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.392850875854492, + "rewards/margins": 28.103023529052734, + "rewards/rejected": -36.495872497558594, + "step": 8300 + }, + { + "epoch": 25.778466305189774, + "grad_norm": 0.00012290375889278948, + "learning_rate": 1.394195517632193e-05, + "logits/chosen": -0.44883760809898376, + "logits/rejected": 1.4809213876724243, + "logps/chosen": -451.4324645996094, + "logps/rejected": -562.3369140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.809348106384277, + "rewards/margins": 27.419443130493164, + "rewards/rejected": -35.22879409790039, + "step": 8320 + }, + { + "epoch": 25.84043377226956, + "grad_norm": 0.00036960910074412823, + "learning_rate": 1.362044634893318e-05, + "logits/chosen": -0.4777800440788269, + "logits/rejected": 1.4690001010894775, + "logps/chosen": -441.289306640625, + "logps/rejected": -547.408935546875, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.73660135269165, + "rewards/margins": 27.180755615234375, + "rewards/rejected": -34.917354583740234, + "step": 8340 + }, + { + "epoch": 25.90240123934934, + "grad_norm": 2.864907764887903e-05, + "learning_rate": 1.3302416877370239e-05, + "logits/chosen": -0.5380562543869019, + "logits/rejected": 1.4958761930465698, + "logps/chosen": -438.1744079589844, + "logps/rejected": -547.0137939453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.9420647621154785, + "rewards/margins": 28.122699737548828, + "rewards/rejected": -35.064762115478516, + "step": 8360 + }, + { + "epoch": 25.964368706429124, + "grad_norm": 9.71209374256432e-05, + "learning_rate": 1.2987879571813854e-05, + "logits/chosen": -0.43615514039993286, + "logits/rejected": 1.3821794986724854, + "logps/chosen": -438.2367248535156, + "logps/rejected": -562.9356689453125, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.997640609741211, + "rewards/margins": 27.08341407775879, + "rewards/rejected": -36.081050872802734, + "step": 8380 + }, + { + "epoch": 26.02633617350891, + "grad_norm": 0.0003731061005964875, + "learning_rate": 1.267684710178081e-05, + "logits/chosen": -0.5250933766365051, + "logits/rejected": 1.2111034393310547, + "logps/chosen": -434.3421936035156, + "logps/rejected": -571.15673828125, + "loss": 0.0033, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -8.725934982299805, + "rewards/margins": 27.260334014892578, + "rewards/rejected": -35.98626708984375, + "step": 8400 + }, + { + "epoch": 26.08830364058869, + "grad_norm": 0.00014083593850955367, + "learning_rate": 1.2369331995613665e-05, + "logits/chosen": -0.4025232195854187, + "logits/rejected": 1.4967281818389893, + "logps/chosen": -442.66424560546875, + "logps/rejected": -540.0578002929688, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.629788398742676, + "rewards/margins": 27.342418670654297, + "rewards/rejected": -34.97220993041992, + "step": 8420 + }, + { + "epoch": 26.150271107668473, + "grad_norm": 7.524704415118322e-05, + "learning_rate": 1.2065346639976016e-05, + "logits/chosen": -0.4743157923221588, + "logits/rejected": 1.304748296737671, + "logps/chosen": -420.94073486328125, + "logps/rejected": -565.73388671875, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.074350357055664, + "rewards/margins": 27.66106605529785, + "rewards/rejected": -35.735416412353516, + "step": 8440 + }, + { + "epoch": 26.21223857474826, + "grad_norm": 0.0002636277349665761, + "learning_rate": 1.177984113760211e-05, + "logits/chosen": -0.47151750326156616, + "logits/rejected": 1.3237098455429077, + "logps/chosen": -424.85430908203125, + "logps/rejected": -548.9588012695312, + "loss": 0.0043, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.068964004516602, + "rewards/margins": 27.008281707763672, + "rewards/rejected": -35.077247619628906, + "step": 8460 + }, + { + "epoch": 26.27420604182804, + "grad_norm": 0.0002253088023280725, + "learning_rate": 1.1482773883758357e-05, + "logits/chosen": -0.47794610261917114, + "logits/rejected": 1.3321702480316162, + "logps/chosen": -433.79241943359375, + "logps/rejected": -567.6560668945312, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.62246036529541, + "rewards/margins": 28.215002059936523, + "rewards/rejected": -35.837459564208984, + "step": 8480 + }, + { + "epoch": 26.336173508907823, + "grad_norm": 2.4017164832912385e-05, + "learning_rate": 1.1189272090875591e-05, + "logits/chosen": -0.4993225932121277, + "logits/rejected": 1.416684865951538, + "logps/chosen": -424.10797119140625, + "logps/rejected": -537.0582275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.3983659744262695, + "rewards/margins": 26.852685928344727, + "rewards/rejected": -34.25105285644531, + "step": 8500 + }, + { + "epoch": 26.398140975987605, + "grad_norm": 1.272676968255837e-06, + "learning_rate": 1.0899347581163221e-05, + "logits/chosen": -0.4834226965904236, + "logits/rejected": 1.3799736499786377, + "logps/chosen": -426.93487548828125, + "logps/rejected": -559.1065673828125, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.2085771560668945, + "rewards/margins": 28.53904151916504, + "rewards/rejected": -35.747623443603516, + "step": 8520 + }, + { + "epoch": 26.46010844306739, + "grad_norm": 0.0005902862176299095, + "learning_rate": 1.0613012032738268e-05, + "logits/chosen": -0.49945202469825745, + "logits/rejected": 1.369750738143921, + "logps/chosen": -423.49951171875, + "logps/rejected": -546.5079345703125, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.095788478851318, + "rewards/margins": 28.046178817749023, + "rewards/rejected": -35.1419677734375, + "step": 8540 + }, + { + "epoch": 26.522075910147173, + "grad_norm": 0.0001648461475269869, + "learning_rate": 1.033027697915483e-05, + "logits/chosen": -0.4533432424068451, + "logits/rejected": 1.3663042783737183, + "logps/chosen": -447.10919189453125, + "logps/rejected": -590.3201293945312, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.335497856140137, + "rewards/margins": 28.570659637451172, + "rewards/rejected": -36.90616226196289, + "step": 8560 + }, + { + "epoch": 26.584043377226955, + "grad_norm": 4.678579443861963e-06, + "learning_rate": 1.0051153808939685e-05, + "logits/chosen": -0.4500574469566345, + "logits/rejected": 1.3406521081924438, + "logps/chosen": -409.4193115234375, + "logps/rejected": -528.8170166015625, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.115273952484131, + "rewards/margins": 26.795047760009766, + "rewards/rejected": -33.91032028198242, + "step": 8580 + }, + { + "epoch": 26.64601084430674, + "grad_norm": 0.00027692707953974605, + "learning_rate": 9.775653765133396e-06, + "logits/chosen": -0.3562534749507904, + "logits/rejected": 1.4658071994781494, + "logps/chosen": -424.7490234375, + "logps/rejected": -549.2362670898438, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.8106536865234375, + "rewards/margins": 27.75030517578125, + "rewards/rejected": -35.56095504760742, + "step": 8600 + }, + { + "epoch": 26.707978311386523, + "grad_norm": 0.000177569585503079, + "learning_rate": 9.503787944837561e-06, + "logits/chosen": -0.48818501830101013, + "logits/rejected": 1.3173562288284302, + "logps/chosen": -436.6263732910156, + "logps/rejected": -579.1359252929688, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.437201499938965, + "rewards/margins": 28.580413818359375, + "rewards/rejected": -37.017616271972656, + "step": 8620 + }, + { + "epoch": 26.769945778466305, + "grad_norm": 0.0019391351379454136, + "learning_rate": 9.23556729876781e-06, + "logits/chosen": -0.3800424039363861, + "logits/rejected": 1.3795052766799927, + "logps/chosen": -428.7498474121094, + "logps/rejected": -563.37451171875, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.988425254821777, + "rewards/margins": 28.296335220336914, + "rewards/rejected": -35.284767150878906, + "step": 8640 + }, + { + "epoch": 26.831913245546087, + "grad_norm": 0.0002963369188364595, + "learning_rate": 8.971002630812619e-06, + "logits/chosen": -0.4745435118675232, + "logits/rejected": 1.407877802848816, + "logps/chosen": -427.7135314941406, + "logps/rejected": -580.1387939453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.42059850692749, + "rewards/margins": 30.40887451171875, + "rewards/rejected": -36.829471588134766, + "step": 8660 + }, + { + "epoch": 26.893880712625872, + "grad_norm": 7.189660391304642e-05, + "learning_rate": 8.710104597598223e-06, + "logits/chosen": -0.4752906262874603, + "logits/rejected": 1.3506954908370972, + "logps/chosen": -424.6866760253906, + "logps/rejected": -528.3801879882812, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.252419948577881, + "rewards/margins": 26.98067855834961, + "rewards/rejected": -34.23310089111328, + "step": 8680 + }, + { + "epoch": 26.955848179705654, + "grad_norm": 0.00013338649296201766, + "learning_rate": 8.4528837080594e-06, + "logits/chosen": -0.527729332447052, + "logits/rejected": 1.4104167222976685, + "logps/chosen": -425.31817626953125, + "logps/rejected": -548.75732421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.509429931640625, + "rewards/margins": 27.011505126953125, + "rewards/rejected": -35.52093505859375, + "step": 8700 + }, + { + "epoch": 27.017815646785436, + "grad_norm": 7.765422924421728e-05, + "learning_rate": 8.199350323016041e-06, + "logits/chosen": -0.44741296768188477, + "logits/rejected": 1.3625432252883911, + "logps/chosen": -463.06219482421875, + "logps/rejected": -607.8698120117188, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.488401412963867, + "rewards/margins": 29.808670043945312, + "rewards/rejected": -38.29706954956055, + "step": 8720 + }, + { + "epoch": 27.079783113865222, + "grad_norm": 3.868131898343563e-05, + "learning_rate": 7.949514654755962e-06, + "logits/chosen": -0.4678593575954437, + "logits/rejected": 1.44140625, + "logps/chosen": -429.23345947265625, + "logps/rejected": -552.1541748046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.506895542144775, + "rewards/margins": 28.108739852905273, + "rewards/rejected": -35.615638732910156, + "step": 8740 + }, + { + "epoch": 27.141750580945004, + "grad_norm": 0.0002340103528695181, + "learning_rate": 7.703386766623444e-06, + "logits/chosen": -0.3554527759552002, + "logits/rejected": 1.4482638835906982, + "logps/chosen": -430.29913330078125, + "logps/rejected": -555.3551025390625, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.764983177185059, + "rewards/margins": 27.69220542907715, + "rewards/rejected": -35.45718765258789, + "step": 8760 + }, + { + "epoch": 27.203718048024786, + "grad_norm": 3.8839676562929526e-05, + "learning_rate": 7.460976572613887e-06, + "logits/chosen": -0.4137742519378662, + "logits/rejected": 1.3642592430114746, + "logps/chosen": -432.981689453125, + "logps/rejected": -575.1232299804688, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.254365921020508, + "rewards/margins": 28.477890014648438, + "rewards/rejected": -36.73225784301758, + "step": 8780 + }, + { + "epoch": 27.265685515104572, + "grad_norm": 0.00010233109060209244, + "learning_rate": 7.222293836974614e-06, + "logits/chosen": -0.5344318151473999, + "logits/rejected": 1.3696272373199463, + "logps/chosen": -458.417236328125, + "logps/rejected": -578.0775146484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.462766647338867, + "rewards/margins": 28.498706817626953, + "rewards/rejected": -36.96147155761719, + "step": 8800 + }, + { + "epoch": 27.327652982184354, + "grad_norm": 6.865251634735614e-05, + "learning_rate": 6.9873481738114145e-06, + "logits/chosen": -0.5033076405525208, + "logits/rejected": 1.3286387920379639, + "logps/chosen": -400.2259826660156, + "logps/rejected": -537.0637817382812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.391934394836426, + "rewards/margins": 27.038738250732422, + "rewards/rejected": -34.43067169189453, + "step": 8820 + }, + { + "epoch": 27.389620449264136, + "grad_norm": 5.756524478783831e-05, + "learning_rate": 6.756149046701277e-06, + "logits/chosen": -0.557861864566803, + "logits/rejected": 1.485015869140625, + "logps/chosen": -428.1537170410156, + "logps/rejected": -548.026611328125, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.231484889984131, + "rewards/margins": 28.245208740234375, + "rewards/rejected": -35.47669219970703, + "step": 8840 + }, + { + "epoch": 27.451587916343918, + "grad_norm": 8.74618417583406e-05, + "learning_rate": 6.528705768311394e-06, + "logits/chosen": -0.343317449092865, + "logits/rejected": 1.4624199867248535, + "logps/chosen": -429.8575134277344, + "logps/rejected": -577.4666137695312, + "loss": 0.0054, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.366799831390381, + "rewards/margins": 29.25629234313965, + "rewards/rejected": -36.62308883666992, + "step": 8860 + }, + { + "epoch": 27.513555383423704, + "grad_norm": 3.0331759262480773e-05, + "learning_rate": 6.3050275000238414e-06, + "logits/chosen": -0.5099083781242371, + "logits/rejected": 1.3164933919906616, + "logps/chosen": -431.3134765625, + "logps/rejected": -566.6702880859375, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.827919960021973, + "rewards/margins": 28.29727554321289, + "rewards/rejected": -36.12519454956055, + "step": 8880 + }, + { + "epoch": 27.575522850503486, + "grad_norm": 0.00020247708016540855, + "learning_rate": 6.085123251566616e-06, + "logits/chosen": -0.3177579343318939, + "logits/rejected": 1.4168663024902344, + "logps/chosen": -422.2972717285156, + "logps/rejected": -565.1768188476562, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.462930679321289, + "rewards/margins": 27.940074920654297, + "rewards/rejected": -36.40300750732422, + "step": 8900 + }, + { + "epoch": 27.637490317583268, + "grad_norm": 0.00026213927776552737, + "learning_rate": 5.869001880650826e-06, + "logits/chosen": -0.47557058930397034, + "logits/rejected": 1.302455186843872, + "logps/chosen": -437.5243225097656, + "logps/rejected": -567.4195556640625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.532797813415527, + "rewards/margins": 27.39797592163086, + "rewards/rejected": -35.93077850341797, + "step": 8920 + }, + { + "epoch": 27.699457784663053, + "grad_norm": 0.00015056796837598085, + "learning_rate": 5.656672092613757e-06, + "logits/chosen": -0.42232632637023926, + "logits/rejected": 1.44803786277771, + "logps/chosen": -444.7637634277344, + "logps/rejected": -585.7415771484375, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -7.886309623718262, + "rewards/margins": 28.971731185913086, + "rewards/rejected": -36.85803985595703, + "step": 8940 + }, + { + "epoch": 27.761425251742835, + "grad_norm": 0.0001097571657737717, + "learning_rate": 5.448142440068316e-06, + "logits/chosen": -0.43719226121902466, + "logits/rejected": 1.4402527809143066, + "logps/chosen": -435.96868896484375, + "logps/rejected": -554.9208374023438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.246077060699463, + "rewards/margins": 28.056320190429688, + "rewards/rejected": -35.302391052246094, + "step": 8960 + }, + { + "epoch": 27.823392718822618, + "grad_norm": 0.00012969023373443633, + "learning_rate": 5.243421322558506e-06, + "logits/chosen": -0.36826613545417786, + "logits/rejected": 1.3211771249771118, + "logps/chosen": -431.38653564453125, + "logps/rejected": -585.7420654296875, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.941071510314941, + "rewards/margins": 28.0720272064209, + "rewards/rejected": -37.013099670410156, + "step": 8980 + }, + { + "epoch": 27.8853601859024, + "grad_norm": 5.7663233747007325e-05, + "learning_rate": 5.04251698622108e-06, + "logits/chosen": -0.4926798343658447, + "logits/rejected": 1.2853095531463623, + "logps/chosen": -423.64349365234375, + "logps/rejected": -575.9889526367188, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.923436641693115, + "rewards/margins": 28.138051986694336, + "rewards/rejected": -36.06148910522461, + "step": 9000 + }, + { + "epoch": 27.947327652982185, + "grad_norm": 0.00023578341642860323, + "learning_rate": 4.845437523453411e-06, + "logits/chosen": -0.4551617503166199, + "logits/rejected": 1.5024217367172241, + "logps/chosen": -427.65850830078125, + "logps/rejected": -548.6030883789062, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.91290283203125, + "rewards/margins": 28.266826629638672, + "rewards/rejected": -35.17972946166992, + "step": 9020 + }, + { + "epoch": 28.009295120061967, + "grad_norm": 0.00039731161086820066, + "learning_rate": 4.652190872587525e-06, + "logits/chosen": -0.5274960994720459, + "logits/rejected": 1.2422288656234741, + "logps/chosen": -433.1166076660156, + "logps/rejected": -576.1898193359375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.720697402954102, + "rewards/margins": 27.77066993713379, + "rewards/rejected": -36.491371154785156, + "step": 9040 + }, + { + "epoch": 28.07126258714175, + "grad_norm": 1.7103820937336423e-05, + "learning_rate": 4.462784817570331e-06, + "logits/chosen": -0.4241279065608978, + "logits/rejected": 1.3979580402374268, + "logps/chosen": -421.704345703125, + "logps/rejected": -548.5910034179688, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.8423662185668945, + "rewards/margins": 27.527713775634766, + "rewards/rejected": -35.370079040527344, + "step": 9060 + }, + { + "epoch": 28.133230054221535, + "grad_norm": 8.69917930685915e-05, + "learning_rate": 4.277226987650129e-06, + "logits/chosen": -0.5255895256996155, + "logits/rejected": 1.4158531427383423, + "logps/chosen": -457.18829345703125, + "logps/rejected": -573.0018310546875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.396566390991211, + "rewards/margins": 28.05706214904785, + "rewards/rejected": -36.4536247253418, + "step": 9080 + }, + { + "epoch": 28.195197521301317, + "grad_norm": 9.612823487259448e-05, + "learning_rate": 4.095524857069244e-06, + "logits/chosen": -0.5023406744003296, + "logits/rejected": 1.4314396381378174, + "logps/chosen": -434.41015625, + "logps/rejected": -562.998291015625, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.083227157592773, + "rewards/margins": 28.252487182617188, + "rewards/rejected": -36.33571243286133, + "step": 9100 + }, + { + "epoch": 28.2571649883811, + "grad_norm": 2.338832382520195e-05, + "learning_rate": 3.917685744762989e-06, + "logits/chosen": -0.37236514687538147, + "logits/rejected": 1.355520486831665, + "logps/chosen": -429.93804931640625, + "logps/rejected": -610.6384887695312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.0256195068359375, + "rewards/margins": 30.985666275024414, + "rewards/rejected": -38.01128387451172, + "step": 9120 + }, + { + "epoch": 28.319132455460885, + "grad_norm": 0.00021635735174641013, + "learning_rate": 3.7437168140648904e-06, + "logits/chosen": -0.46186137199401855, + "logits/rejected": 1.3692152500152588, + "logps/chosen": -432.89642333984375, + "logps/rejected": -550.060791015625, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.183605194091797, + "rewards/margins": 27.7097110748291, + "rewards/rejected": -34.89331817626953, + "step": 9140 + }, + { + "epoch": 28.381099922540667, + "grad_norm": 0.0004849826218560338, + "learning_rate": 3.5736250724180966e-06, + "logits/chosen": -0.47364211082458496, + "logits/rejected": 1.4335612058639526, + "logps/chosen": -423.085693359375, + "logps/rejected": -547.94970703125, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.032341957092285, + "rewards/margins": 27.98506736755371, + "rewards/rejected": -36.01740646362305, + "step": 9160 + }, + { + "epoch": 28.44306738962045, + "grad_norm": 6.062612010282464e-05, + "learning_rate": 3.40741737109318e-06, + "logits/chosen": -0.407947838306427, + "logits/rejected": 1.195319414138794, + "logps/chosen": -422.4693908691406, + "logps/rejected": -581.656982421875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.4915771484375, + "rewards/margins": 28.65443992614746, + "rewards/rejected": -37.146018981933594, + "step": 9180 + }, + { + "epoch": 28.50503485670023, + "grad_norm": 0.00024373046471737325, + "learning_rate": 3.245100404912094e-06, + "logits/chosen": -0.42624807357788086, + "logits/rejected": 1.5248339176177979, + "logps/chosen": -416.85845947265625, + "logps/rejected": -533.588623046875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.168668746948242, + "rewards/margins": 26.794754028320312, + "rewards/rejected": -34.96342086791992, + "step": 9200 + }, + { + "epoch": 28.567002323780017, + "grad_norm": 0.00022582674864679575, + "learning_rate": 3.0866807119785734e-06, + "logits/chosen": -0.47749605774879456, + "logits/rejected": 1.391265869140625, + "logps/chosen": -432.8900451660156, + "logps/rejected": -567.29931640625, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.252971649169922, + "rewards/margins": 28.681339263916016, + "rewards/rejected": -36.93431091308594, + "step": 9220 + }, + { + "epoch": 28.6289697908598, + "grad_norm": 0.00010398898302810267, + "learning_rate": 2.9321646734147502e-06, + "logits/chosen": -0.5295883417129517, + "logits/rejected": 1.4302924871444702, + "logps/chosen": -418.13165283203125, + "logps/rejected": -574.0758666992188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.945282936096191, + "rewards/margins": 30.516979217529297, + "rewards/rejected": -36.46226119995117, + "step": 9240 + }, + { + "epoch": 28.69093725793958, + "grad_norm": 9.844397573033348e-05, + "learning_rate": 2.7815585131041435e-06, + "logits/chosen": -0.5464714765548706, + "logits/rejected": 1.3297626972198486, + "logps/chosen": -429.76580810546875, + "logps/rejected": -579.6892700195312, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.043060302734375, + "rewards/margins": 29.36098861694336, + "rewards/rejected": -36.40404510498047, + "step": 9260 + }, + { + "epoch": 28.752904725019366, + "grad_norm": 0.0001132589895860292, + "learning_rate": 2.6348682974408955e-06, + "logits/chosen": -0.40210598707199097, + "logits/rejected": 1.3564558029174805, + "logps/chosen": -441.2831115722656, + "logps/rejected": -580.2747802734375, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.844902038574219, + "rewards/margins": 28.26337242126465, + "rewards/rejected": -36.1082763671875, + "step": 9280 + }, + { + "epoch": 28.81487219209915, + "grad_norm": 0.00019120110664516687, + "learning_rate": 2.4920999350855458e-06, + "logits/chosen": -0.4944397509098053, + "logits/rejected": 1.402043104171753, + "logps/chosen": -432.44671630859375, + "logps/rejected": -566.1397705078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.532393455505371, + "rewards/margins": 27.875173568725586, + "rewards/rejected": -36.407569885253906, + "step": 9300 + }, + { + "epoch": 28.87683965917893, + "grad_norm": 0.00020868379215244204, + "learning_rate": 2.3532591767268853e-06, + "logits/chosen": -0.5052774548530579, + "logits/rejected": 1.4609028100967407, + "logps/chosen": -440.171875, + "logps/rejected": -546.31640625, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.388496398925781, + "rewards/margins": 27.529205322265625, + "rewards/rejected": -35.917701721191406, + "step": 9320 + }, + { + "epoch": 28.938807126258713, + "grad_norm": 0.00011665547935990617, + "learning_rate": 2.2183516148504226e-06, + "logits/chosen": -0.5089203119277954, + "logits/rejected": 1.4505350589752197, + "logps/chosen": -444.054931640625, + "logps/rejected": -563.5677490234375, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.200953483581543, + "rewards/margins": 27.984363555908203, + "rewards/rejected": -36.18532180786133, + "step": 9340 + }, + { + "epoch": 29.000774593338498, + "grad_norm": 5.405825504567474e-05, + "learning_rate": 2.0938375055220893e-06, + "logits/chosen": -0.4666138291358948, + "logits/rejected": 1.410650610923767, + "logps/chosen": -427.109375, + "logps/rejected": -549.4266967773438, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.324958801269531, + "rewards/margins": 27.69424819946289, + "rewards/rejected": -35.019203186035156, + "step": 9360 + }, + { + "epoch": 29.06274206041828, + "grad_norm": 2.820672671077773e-05, + "learning_rate": 1.966615161996477e-06, + "logits/chosen": -0.38536205887794495, + "logits/rejected": 1.5962440967559814, + "logps/chosen": -445.6573181152344, + "logps/rejected": -557.0641479492188, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.192480087280273, + "rewards/margins": 27.905410766601562, + "rewards/rejected": -36.0978889465332, + "step": 9380 + }, + { + "epoch": 29.124709527498062, + "grad_norm": 0.00012238779163453728, + "learning_rate": 1.8433415889175799e-06, + "logits/chosen": -0.5433516502380371, + "logits/rejected": 1.3412362337112427, + "logps/chosen": -448.1932067871094, + "logps/rejected": -580.395751953125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.69677734375, + "rewards/margins": 28.863109588623047, + "rewards/rejected": -36.55989074707031, + "step": 9400 + }, + { + "epoch": 29.186676994577848, + "grad_norm": 5.539653648156673e-05, + "learning_rate": 1.7240217517269897e-06, + "logits/chosen": -0.4289434850215912, + "logits/rejected": 1.394278645515442, + "logps/chosen": -429.510009765625, + "logps/rejected": -581.1126708984375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.776337623596191, + "rewards/margins": 28.806468963623047, + "rewards/rejected": -36.58280944824219, + "step": 9420 + }, + { + "epoch": 29.24864446165763, + "grad_norm": 8.6743557403679e-06, + "learning_rate": 1.6086604566103002e-06, + "logits/chosen": -0.44304054975509644, + "logits/rejected": 1.4444400072097778, + "logps/chosen": -419.0517578125, + "logps/rejected": -557.58251953125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.534919738769531, + "rewards/margins": 29.064559936523438, + "rewards/rejected": -36.59947967529297, + "step": 9440 + }, + { + "epoch": 29.310611928737412, + "grad_norm": 0.0001106134441215545, + "learning_rate": 1.4972623503036965e-06, + "logits/chosen": -0.5489664673805237, + "logits/rejected": 1.2551854848861694, + "logps/chosen": -442.162841796875, + "logps/rejected": -578.1027221679688, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.913488388061523, + "rewards/margins": 27.9173641204834, + "rewards/rejected": -36.83085250854492, + "step": 9460 + }, + { + "epoch": 29.372579395817198, + "grad_norm": 0.00013442269118968397, + "learning_rate": 1.3898319199066478e-06, + "logits/chosen": -0.4975252151489258, + "logits/rejected": 1.331676721572876, + "logps/chosen": -433.96630859375, + "logps/rejected": -583.5892944335938, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.272103786468506, + "rewards/margins": 29.543193817138672, + "rewards/rejected": -36.81529998779297, + "step": 9480 + }, + { + "epoch": 29.43454686289698, + "grad_norm": 0.000179157592356205, + "learning_rate": 1.2863734927012095e-06, + "logits/chosen": -0.5022043585777283, + "logits/rejected": 1.4060289859771729, + "logps/chosen": -444.994140625, + "logps/rejected": -577.6284790039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.224766731262207, + "rewards/margins": 28.887136459350586, + "rewards/rejected": -36.111900329589844, + "step": 9500 + }, + { + "epoch": 29.496514329976762, + "grad_norm": 4.570256351144053e-05, + "learning_rate": 1.1868912359777607e-06, + "logits/chosen": -0.45927444100379944, + "logits/rejected": 1.3402315378189087, + "logps/chosen": -428.650390625, + "logps/rejected": -558.9736328125, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.128926753997803, + "rewards/margins": 28.86312484741211, + "rewards/rejected": -35.99205780029297, + "step": 9520 + }, + { + "epoch": 29.558481797056544, + "grad_norm": 7.603448466397822e-05, + "learning_rate": 1.0913891568670842e-06, + "logits/chosen": -0.5526672601699829, + "logits/rejected": 1.4398711919784546, + "logps/chosen": -442.75152587890625, + "logps/rejected": -563.4345092773438, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.6889238357543945, + "rewards/margins": 28.326879501342773, + "rewards/rejected": -36.01580810546875, + "step": 9540 + }, + { + "epoch": 29.62044926413633, + "grad_norm": 0.00014126779569778591, + "learning_rate": 9.998711021790174e-07, + "logits/chosen": -0.50676029920578, + "logits/rejected": 1.4971421957015991, + "logps/chosen": -433.22235107421875, + "logps/rejected": -527.7393188476562, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.903284549713135, + "rewards/margins": 26.93197250366211, + "rewards/rejected": -34.83525466918945, + "step": 9560 + }, + { + "epoch": 29.68241673121611, + "grad_norm": 1.3805640264763497e-05, + "learning_rate": 9.123407582474541e-07, + "logits/chosen": -0.43605098128318787, + "logits/rejected": 1.3629835844039917, + "logps/chosen": -442.35345458984375, + "logps/rejected": -579.7109985351562, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.504144668579102, + "rewards/margins": 28.617279052734375, + "rewards/rejected": -37.121421813964844, + "step": 9580 + }, + { + "epoch": 29.744384198295894, + "grad_norm": 0.00011257326696068048, + "learning_rate": 8.288016507818742e-07, + "logits/chosen": -0.5420119762420654, + "logits/rejected": 1.3263146877288818, + "logps/chosen": -431.49005126953125, + "logps/rejected": -569.9481811523438, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.66623067855835, + "rewards/margins": 28.865182876586914, + "rewards/rejected": -36.53141403198242, + "step": 9600 + }, + { + "epoch": 29.80635166537568, + "grad_norm": 3.732260665856302e-05, + "learning_rate": 7.49257144725346e-07, + "logits/chosen": -0.3866724669933319, + "logits/rejected": 1.3957892656326294, + "logps/chosen": -431.8949279785156, + "logps/rejected": -569.178955078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.590885162353516, + "rewards/margins": 27.871734619140625, + "rewards/rejected": -36.462623596191406, + "step": 9620 + }, + { + "epoch": 29.86831913245546, + "grad_norm": 8.747459651203826e-05, + "learning_rate": 6.737104441189801e-07, + "logits/chosen": -0.5281975865364075, + "logits/rejected": 1.3896856307983398, + "logps/chosen": -411.46221923828125, + "logps/rejected": -531.2066650390625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.190832614898682, + "rewards/margins": 27.5278263092041, + "rewards/rejected": -34.71866226196289, + "step": 9640 + }, + { + "epoch": 29.930286599535243, + "grad_norm": 0.0035836484748870134, + "learning_rate": 6.021645919728647e-07, + "logits/chosen": -0.38994961977005005, + "logits/rejected": 1.36453115940094, + "logps/chosen": -410.65118408203125, + "logps/rejected": -549.8154296875, + "loss": 0.0076, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -8.187182426452637, + "rewards/margins": 27.47507667541504, + "rewards/rejected": -35.662261962890625, + "step": 9660 + }, + { + "epoch": 29.992254066615025, + "grad_norm": 4.6857512643327937e-05, + "learning_rate": 5.346224701434866e-07, + "logits/chosen": -0.35250693559646606, + "logits/rejected": 1.5060350894927979, + "logps/chosen": -443.78369140625, + "logps/rejected": -572.0866088867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.185739517211914, + "rewards/margins": 28.152313232421875, + "rewards/rejected": -36.33805465698242, + "step": 9680 + }, + { + "epoch": 30.05422153369481, + "grad_norm": 0.00014773959992453456, + "learning_rate": 4.710867992176682e-07, + "logits/chosen": -0.4862252175807953, + "logits/rejected": 1.4310047626495361, + "logps/chosen": -430.75177001953125, + "logps/rejected": -556.7779541015625, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -6.786073207855225, + "rewards/margins": 28.293575286865234, + "rewards/rejected": -35.07965087890625, + "step": 9700 + }, + { + "epoch": 30.116189000774593, + "grad_norm": 0.00040511396946385503, + "learning_rate": 4.115601384029666e-07, + "logits/chosen": -0.4759834408760071, + "logits/rejected": 1.455631971359253, + "logps/chosen": -437.209716796875, + "logps/rejected": -545.3187866210938, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.022417068481445, + "rewards/margins": 26.92848777770996, + "rewards/rejected": -34.950904846191406, + "step": 9720 + }, + { + "epoch": 30.178156467854375, + "grad_norm": 0.0002189109945902601, + "learning_rate": 3.5604488542460014e-07, + "logits/chosen": -0.5051871538162231, + "logits/rejected": 1.4911781549453735, + "logps/chosen": -438.0755920410156, + "logps/rejected": -570.4459228515625, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.649161338806152, + "rewards/margins": 29.3557186126709, + "rewards/rejected": -37.004878997802734, + "step": 9740 + }, + { + "epoch": 30.24012393493416, + "grad_norm": 0.00017708781524561346, + "learning_rate": 3.045432764288703e-07, + "logits/chosen": -0.44464248418807983, + "logits/rejected": 1.4907373189926147, + "logps/chosen": -435.34344482421875, + "logps/rejected": -544.2034301757812, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.806088447570801, + "rewards/margins": 27.583263397216797, + "rewards/rejected": -35.38935089111328, + "step": 9760 + }, + { + "epoch": 30.302091402013943, + "grad_norm": 0.00015091463865246624, + "learning_rate": 2.5705738589306696e-07, + "logits/chosen": -0.4208938181400299, + "logits/rejected": 1.4723773002624512, + "logps/chosen": -440.129150390625, + "logps/rejected": -565.9328002929688, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.044323921203613, + "rewards/margins": 27.99143409729004, + "rewards/rejected": -36.03575897216797, + "step": 9780 + }, + { + "epoch": 30.364058869093725, + "grad_norm": 0.00013884674990549684, + "learning_rate": 2.135891265419465e-07, + "logits/chosen": -0.4527045786380768, + "logits/rejected": 1.4194512367248535, + "logps/chosen": -418.86083984375, + "logps/rejected": -554.153564453125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.422151565551758, + "rewards/margins": 27.6077880859375, + "rewards/rejected": -36.029937744140625, + "step": 9800 + }, + { + "epoch": 30.42602633617351, + "grad_norm": 4.7392735723406076e-05, + "learning_rate": 1.7414024927064897e-07, + "logits/chosen": -0.4850187301635742, + "logits/rejected": 1.3569281101226807, + "logps/chosen": -432.75164794921875, + "logps/rejected": -571.4185791015625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.542023658752441, + "rewards/margins": 27.974933624267578, + "rewards/rejected": -36.5169563293457, + "step": 9820 + }, + { + "epoch": 30.487993803253293, + "grad_norm": 0.00015379580145236105, + "learning_rate": 1.3871234307420989e-07, + "logits/chosen": -0.4856896996498108, + "logits/rejected": 1.3765947818756104, + "logps/chosen": -438.67718505859375, + "logps/rejected": -581.4616088867188, + "loss": 0.0043, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -8.13099479675293, + "rewards/margins": 28.83926010131836, + "rewards/rejected": -36.970252990722656, + "step": 9840 + }, + { + "epoch": 30.549961270333075, + "grad_norm": 0.00021822135022375733, + "learning_rate": 1.0730683498351157e-07, + "logits/chosen": -0.4343891143798828, + "logits/rejected": 1.4583574533462524, + "logps/chosen": -434.26239013671875, + "logps/rejected": -559.0338134765625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -7.727663516998291, + "rewards/margins": 28.047531127929688, + "rewards/rejected": -35.77519607543945, + "step": 9860 + }, + { + "epoch": 30.611928737412857, + "grad_norm": 5.923645221628249e-05, + "learning_rate": 7.992499000785136e-08, + "logits/chosen": -0.39348846673965454, + "logits/rejected": 1.4489144086837769, + "logps/chosen": -441.802734375, + "logps/rejected": -564.3275146484375, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.21603012084961, + "rewards/margins": 27.678537368774414, + "rewards/rejected": -35.894569396972656, + "step": 9880 + }, + { + "epoch": 30.673896204492642, + "grad_norm": 2.2205487766768783e-05, + "learning_rate": 5.6567911083937883e-08, + "logits/chosen": -0.4316628575325012, + "logits/rejected": 1.3483152389526367, + "logps/chosen": -420.97381591796875, + "logps/rejected": -556.51953125, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -8.481675148010254, + "rewards/margins": 27.316696166992188, + "rewards/rejected": -35.79837417602539, + "step": 9900 + }, + { + "epoch": 30.735863671572424, + "grad_norm": 9.596312884241343e-05, + "learning_rate": 3.723653903152657e-08, + "logits/chosen": -0.3819994032382965, + "logits/rejected": 1.4576747417449951, + "logps/chosen": -435.2261657714844, + "logps/rejected": -543.2717895507812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.418352127075195, + "rewards/margins": 26.706283569335938, + "rewards/rejected": -35.1246337890625, + "step": 9920 + }, + { + "epoch": 30.797831138652207, + "grad_norm": 3.857334831991466e-06, + "learning_rate": 2.193165251545004e-08, + "logits/chosen": -0.4508894979953766, + "logits/rejected": 1.3289421796798706, + "logps/chosen": -435.36102294921875, + "logps/rejected": -582.7653198242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.7185211181640625, + "rewards/margins": 29.5615177154541, + "rewards/rejected": -37.28003692626953, + "step": 9940 + }, + { + "epoch": 30.859798605731992, + "grad_norm": 8.236696157837287e-05, + "learning_rate": 1.0653868014309786e-08, + "logits/chosen": -0.46718326210975647, + "logits/rejected": 1.4237308502197266, + "logps/chosen": -429.4341735839844, + "logps/rejected": -575.0633544921875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.886075496673584, + "rewards/margins": 29.133136749267578, + "rewards/rejected": -37.01920700073242, + "step": 9960 + }, + { + "epoch": 30.921766072811774, + "grad_norm": 0.0001442828943254426, + "learning_rate": 3.4036397956183076e-09, + "logits/chosen": -0.4402598440647125, + "logits/rejected": 1.5141351222991943, + "logps/chosen": -416.0702209472656, + "logps/rejected": -538.6110229492188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.578369140625, + "rewards/margins": 28.705799102783203, + "rewards/rejected": -35.28417205810547, + "step": 9980 + }, + { + "epoch": 30.983733539891556, + "grad_norm": 2.7997641154797748e-05, + "learning_rate": 1.812598975137192e-10, + "logits/chosen": -0.41933926939964294, + "logits/rejected": 1.3799235820770264, + "logps/chosen": -449.55023193359375, + "logps/rejected": -607.9024658203125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.158994674682617, + "rewards/margins": 30.011821746826172, + "rewards/rejected": -38.170814514160156, + "step": 10000 + } + ], + "logging_steps": 20, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 32, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}