{ "best_metric": null, "best_model_checkpoint": null, "epoch": 30.983733539891556, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.061967467079783116, "grad_norm": 0.9636571407318115, "learning_rate": 4e-05, "logits/chosen": -2.2784409523010254, "logits/rejected": -1.8663209676742554, "logps/chosen": -371.1264953613281, "logps/rejected": -214.287109375, "loss": 0.6698, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": 0.04825574904680252, "rewards/margins": 0.04888930916786194, "rewards/rejected": -0.0006335576181299984, "step": 20 }, { "epoch": 0.12393493415956623, "grad_norm": 0.8874484300613403, "learning_rate": 7.800000000000001e-05, "logits/chosen": -2.2781295776367188, "logits/rejected": -1.883906602859497, "logps/chosen": -354.96759033203125, "logps/rejected": -200.45201110839844, "loss": 0.4393, "rewards/accuracies": 0.934374988079071, "rewards/chosen": 0.6998767852783203, "rewards/margins": 0.8133988380432129, "rewards/rejected": -0.11352206766605377, "step": 40 }, { "epoch": 0.18590240123934934, "grad_norm": 0.5087229013442993, "learning_rate": 0.000118, "logits/chosen": -2.292691469192505, "logits/rejected": -1.8518846035003662, "logps/chosen": -332.63885498046875, "logps/rejected": -207.2465057373047, "loss": 0.181, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": 1.8739244937896729, "rewards/margins": 3.126392364501953, "rewards/rejected": -1.2524681091308594, "step": 60 }, { "epoch": 0.24786986831913246, "grad_norm": 0.28147539496421814, "learning_rate": 0.00015800000000000002, "logits/chosen": -2.2652933597564697, "logits/rejected": -1.7805808782577515, "logps/chosen": -333.56829833984375, "logps/rejected": -216.029541015625, "loss": 0.0837, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": 1.9676460027694702, "rewards/margins": 4.509096145629883, "rewards/rejected": -2.541449785232544, "step": 80 }, { "epoch": 0.30983733539891556, "grad_norm": 0.2960963547229767, "learning_rate": 0.00019800000000000002, "logits/chosen": -2.193760871887207, "logits/rejected": -1.7519451379776, "logps/chosen": -314.7029113769531, "logps/rejected": -225.7288055419922, "loss": 0.0583, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": 1.717433214187622, "rewards/margins": 5.224194526672363, "rewards/rejected": -3.506760835647583, "step": 100 }, { "epoch": 0.3718048024786987, "grad_norm": 0.5042657852172852, "learning_rate": 0.00019999818237098496, "logits/chosen": -2.103884220123291, "logits/rejected": -1.6459449529647827, "logps/chosen": -353.2919921875, "logps/rejected": -271.13140869140625, "loss": 0.0305, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.9538187980651855, "rewards/margins": 6.551173210144043, "rewards/rejected": -5.597354888916016, "step": 120 }, { "epoch": 0.4337722695584818, "grad_norm": 0.04080686345696449, "learning_rate": 0.00019999234186476365, "logits/chosen": -1.97976815700531, "logits/rejected": -1.4605042934417725, "logps/chosen": -360.1809387207031, "logps/rejected": -283.61322021484375, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -0.14382997155189514, "rewards/margins": 8.151076316833496, "rewards/rejected": -8.294907569885254, "step": 140 }, { "epoch": 0.4957397366382649, "grad_norm": 0.2120855450630188, "learning_rate": 0.00019998247368159224, "logits/chosen": -1.9709131717681885, "logits/rejected": -1.4924025535583496, "logps/chosen": -352.43408203125, "logps/rejected": -283.54193115234375, "loss": 0.0202, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.14206284284591675, "rewards/margins": 8.227814674377441, "rewards/rejected": -8.085752487182617, "step": 160 }, { "epoch": 0.557707203718048, "grad_norm": 0.14296187460422516, "learning_rate": 0.00019996857821895966, "logits/chosen": -1.9138180017471313, "logits/rejected": -1.3899542093276978, "logps/chosen": -349.78985595703125, "logps/rejected": -291.88946533203125, "loss": 0.014, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -0.20763222873210907, "rewards/margins": 9.207175254821777, "rewards/rejected": -9.414807319641113, "step": 180 }, { "epoch": 0.6196746707978311, "grad_norm": 0.15746493637561798, "learning_rate": 0.00019995065603657316, "logits/chosen": -1.803430199623108, "logits/rejected": -1.2661969661712646, "logps/chosen": -365.71539306640625, "logps/rejected": -320.00897216796875, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -0.6435772180557251, "rewards/margins": 9.713006973266602, "rewards/rejected": -10.356585502624512, "step": 200 }, { "epoch": 0.6816421378776143, "grad_norm": 0.03454677388072014, "learning_rate": 0.00019992870785633563, "logits/chosen": -1.8543188571929932, "logits/rejected": -1.3436486721038818, "logps/chosen": -354.7937316894531, "logps/rejected": -284.83819580078125, "loss": 0.014, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.3698399066925049, "rewards/margins": 8.893186569213867, "rewards/rejected": -8.523347854614258, "step": 220 }, { "epoch": 0.7436096049573974, "grad_norm": 0.6197882294654846, "learning_rate": 0.0001999027345623165, "logits/chosen": -1.9029203653335571, "logits/rejected": -1.3840315341949463, "logps/chosen": -342.14666748046875, "logps/rejected": -296.29547119140625, "loss": 0.0103, "rewards/accuracies": 0.996874988079071, "rewards/chosen": 0.0756167322397232, "rewards/margins": 9.7724027633667, "rewards/rejected": -9.696786880493164, "step": 240 }, { "epoch": 0.8055770720371804, "grad_norm": 0.03329584375023842, "learning_rate": 0.00019987273720071632, "logits/chosen": -1.856236457824707, "logits/rejected": -1.350318193435669, "logps/chosen": -358.27630615234375, "logps/rejected": -334.4754943847656, "loss": 0.0112, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.7377322912216187, "rewards/margins": 10.27548599243164, "rewards/rejected": -11.01321792602539, "step": 260 }, { "epoch": 0.8675445391169636, "grad_norm": 0.01427725050598383, "learning_rate": 0.00019983871697982445, "logits/chosen": -1.7166755199432373, "logits/rejected": -1.1573728322982788, "logps/chosen": -357.41461181640625, "logps/rejected": -323.04693603515625, "loss": 0.0072, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.0735583305358887, "rewards/margins": 10.781556129455566, "rewards/rejected": -11.855114936828613, "step": 280 }, { "epoch": 0.9295120061967467, "grad_norm": 0.03855278715491295, "learning_rate": 0.00019980067526997045, "logits/chosen": -1.6517670154571533, "logits/rejected": -1.0374754667282104, "logps/chosen": -368.36083984375, "logps/rejected": -319.9913635253906, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -1.422989010810852, "rewards/margins": 10.918716430664062, "rewards/rejected": -12.341705322265625, "step": 300 }, { "epoch": 0.9914794732765299, "grad_norm": 0.019272373989224434, "learning_rate": 0.00019975861360346876, "logits/chosen": -1.6813256740570068, "logits/rejected": -1.0804450511932373, "logps/chosen": -365.1048278808594, "logps/rejected": -309.0344543457031, "loss": 0.0082, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -0.9812471270561218, "rewards/margins": 10.517460823059082, "rewards/rejected": -11.498706817626953, "step": 320 }, { "epoch": 1.053446940356313, "grad_norm": 0.05422881618142128, "learning_rate": 0.00019971253367455727, "logits/chosen": -1.667327880859375, "logits/rejected": -1.058142066001892, "logps/chosen": -386.6405944824219, "logps/rejected": -340.94427490234375, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -1.013697862625122, "rewards/margins": 11.128829956054688, "rewards/rejected": -12.142528533935547, "step": 340 }, { "epoch": 1.115414407436096, "grad_norm": 0.009864700958132744, "learning_rate": 0.00019966243733932873, "logits/chosen": -1.5947678089141846, "logits/rejected": -0.8976384997367859, "logps/chosen": -368.97564697265625, "logps/rejected": -330.3668212890625, "loss": 0.0036, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5015220642089844, "rewards/margins": 12.364274978637695, "rewards/rejected": -13.865796089172363, "step": 360 }, { "epoch": 1.1773818745158793, "grad_norm": 0.0011158271227031946, "learning_rate": 0.00019960832661565622, "logits/chosen": -1.644182562828064, "logits/rejected": -1.0111494064331055, "logps/chosen": -380.652099609375, "logps/rejected": -347.1996765136719, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -1.7853857278823853, "rewards/margins": 12.316856384277344, "rewards/rejected": -14.102241516113281, "step": 380 }, { "epoch": 1.2393493415956622, "grad_norm": 0.003002722980454564, "learning_rate": 0.00019955020368311183, "logits/chosen": -1.5604654550552368, "logits/rejected": -0.855100154876709, "logps/chosen": -383.905517578125, "logps/rejected": -356.8383483886719, "loss": 0.005, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.063133478164673, "rewards/margins": 13.559541702270508, "rewards/rejected": -15.622674942016602, "step": 400 }, { "epoch": 1.3013168086754454, "grad_norm": 0.12509553134441376, "learning_rate": 0.00019948807088287883, "logits/chosen": -1.4892700910568237, "logits/rejected": -0.7589560747146606, "logps/chosen": -369.834716796875, "logps/rejected": -347.0176086425781, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -1.5435351133346558, "rewards/margins": 13.051846504211426, "rewards/rejected": -14.595380783081055, "step": 420 }, { "epoch": 1.3632842757552286, "grad_norm": 0.17950604856014252, "learning_rate": 0.0001994219307176573, "logits/chosen": -1.3477447032928467, "logits/rejected": -0.5739808678627014, "logps/chosen": -383.2652282714844, "logps/rejected": -377.4029541015625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -2.7989494800567627, "rewards/margins": 13.98778247833252, "rewards/rejected": -16.786731719970703, "step": 440 }, { "epoch": 1.4252517428350115, "grad_norm": 0.030621694400906563, "learning_rate": 0.00019935178585156347, "logits/chosen": -1.2917251586914062, "logits/rejected": -0.4901725649833679, "logps/chosen": -387.3026428222656, "logps/rejected": -381.1557312011719, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -3.059861660003662, "rewards/margins": 14.754425048828125, "rewards/rejected": -17.814287185668945, "step": 460 }, { "epoch": 1.4872192099147947, "grad_norm": 0.018743343651294708, "learning_rate": 0.00019927763911002232, "logits/chosen": -1.2943724393844604, "logits/rejected": -0.4242786765098572, "logps/chosen": -402.34521484375, "logps/rejected": -377.7867736816406, "loss": 0.0055, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.333390474319458, "rewards/margins": 14.473798751831055, "rewards/rejected": -17.80718994140625, "step": 480 }, { "epoch": 1.549186676994578, "grad_norm": 0.9102460741996765, "learning_rate": 0.0001991994934796538, "logits/chosen": -1.378178358078003, "logits/rejected": -0.5917572975158691, "logps/chosen": -379.13470458984375, "logps/rejected": -377.18170166015625, "loss": 0.0092, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1097264289855957, "rewards/margins": 13.761838912963867, "rewards/rejected": -16.871564865112305, "step": 500 }, { "epoch": 1.6111541440743609, "grad_norm": 0.08494719117879868, "learning_rate": 0.0001991173521081525, "logits/chosen": -1.6131643056869507, "logits/rejected": -1.0569688081741333, "logps/chosen": -353.74285888671875, "logps/rejected": -293.4755859375, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -0.8248776197433472, "rewards/margins": 10.194657325744629, "rewards/rejected": -11.019535064697266, "step": 520 }, { "epoch": 1.673121611154144, "grad_norm": 0.08382871747016907, "learning_rate": 0.00019903121830416084, "logits/chosen": -1.6320635080337524, "logits/rejected": -1.0402500629425049, "logps/chosen": -365.8774719238281, "logps/rejected": -338.92730712890625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.775146484375, "rewards/margins": 11.3680419921875, "rewards/rejected": -12.143187522888184, "step": 540 }, { "epoch": 1.7350890782339272, "grad_norm": 0.2668975293636322, "learning_rate": 0.00019894109553713596, "logits/chosen": -1.5704303979873657, "logits/rejected": -0.9429661631584167, "logps/chosen": -365.54425048828125, "logps/rejected": -338.42523193359375, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -1.945920705795288, "rewards/margins": 11.87964916229248, "rewards/rejected": -13.825571060180664, "step": 560 }, { "epoch": 1.7970565453137102, "grad_norm": 0.0014168431516736746, "learning_rate": 0.00019884698743720974, "logits/chosen": -1.5261198282241821, "logits/rejected": -0.8152379989624023, "logps/chosen": -387.27398681640625, "logps/rejected": -349.8082580566406, "loss": 0.0054, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.8853065967559814, "rewards/margins": 12.5154447555542, "rewards/rejected": -15.400751113891602, "step": 580 }, { "epoch": 1.8590240123934936, "grad_norm": 0.060512229800224304, "learning_rate": 0.00019874889779504274, "logits/chosen": -1.4643357992172241, "logits/rejected": -0.7492440938949585, "logps/chosen": -371.1864013671875, "logps/rejected": -374.9864196777344, "loss": 0.0053, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.833853244781494, "rewards/margins": 13.810094833374023, "rewards/rejected": -16.64394760131836, "step": 600 }, { "epoch": 1.9209914794732765, "grad_norm": 0.012925480492413044, "learning_rate": 0.00019864683056167138, "logits/chosen": -1.5827527046203613, "logits/rejected": -0.8596705198287964, "logps/chosen": -378.899658203125, "logps/rejected": -334.9442443847656, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -2.0560431480407715, "rewards/margins": 11.832538604736328, "rewards/rejected": -13.888582229614258, "step": 620 }, { "epoch": 1.9829589465530595, "grad_norm": 0.02249460108578205, "learning_rate": 0.00019854078984834903, "logits/chosen": -1.6471761465072632, "logits/rejected": -1.0331779718399048, "logps/chosen": -384.35479736328125, "logps/rejected": -350.6449890136719, "loss": 0.007, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.5288515090942383, "rewards/margins": 11.604903221130371, "rewards/rejected": -13.133755683898926, "step": 640 }, { "epoch": 2.044926413632843, "grad_norm": 0.08954928070306778, "learning_rate": 0.00019843077992638008, "logits/chosen": -1.6173429489135742, "logits/rejected": -0.8341380953788757, "logps/chosen": -364.81646728515625, "logps/rejected": -330.5616149902344, "loss": 0.007, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.3666422367095947, "rewards/margins": 11.623918533325195, "rewards/rejected": -12.990560531616211, "step": 660 }, { "epoch": 2.106893880712626, "grad_norm": 0.007693038322031498, "learning_rate": 0.00019831680522694822, "logits/chosen": -1.5330901145935059, "logits/rejected": -0.7276937365531921, "logps/chosen": -367.1166687011719, "logps/rejected": -353.382080078125, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0102535486221313, "rewards/margins": 13.941228866577148, "rewards/rejected": -14.951481819152832, "step": 680 }, { "epoch": 2.168861347792409, "grad_norm": 0.04048047587275505, "learning_rate": 0.00019819887034093768, "logits/chosen": -1.6976232528686523, "logits/rejected": -1.0186755657196045, "logps/chosen": -371.5323791503906, "logps/rejected": -328.90411376953125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -1.6277135610580444, "rewards/margins": 12.13409423828125, "rewards/rejected": -13.761807441711426, "step": 700 }, { "epoch": 2.230828814872192, "grad_norm": 0.0501631423830986, "learning_rate": 0.00019807698001874846, "logits/chosen": -1.7097208499908447, "logits/rejected": -1.064752221107483, "logps/chosen": -355.98419189453125, "logps/rejected": -349.1392517089844, "loss": 0.0034, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.9499849081039429, "rewards/margins": 12.722036361694336, "rewards/rejected": -14.672018051147461, "step": 720 }, { "epoch": 2.292796281951975, "grad_norm": 0.09596577286720276, "learning_rate": 0.000197951139170105, "logits/chosen": -1.6241413354873657, "logits/rejected": -0.8459763526916504, "logps/chosen": -391.1998596191406, "logps/rejected": -360.8172607421875, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.3230502605438232, "rewards/margins": 13.51972770690918, "rewards/rejected": -15.84277629852295, "step": 740 }, { "epoch": 2.3547637490317586, "grad_norm": 0.009040975011885166, "learning_rate": 0.0001978213528638583, "logits/chosen": -1.6358083486557007, "logits/rejected": -0.8858752250671387, "logps/chosen": -385.79425048828125, "logps/rejected": -363.9739685058594, "loss": 0.0058, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.426802158355713, "rewards/margins": 13.022024154663086, "rewards/rejected": -15.448826789855957, "step": 760 }, { "epoch": 2.4167312161115415, "grad_norm": 0.0005452932673506439, "learning_rate": 0.00019768762632778187, "logits/chosen": -1.5128395557403564, "logits/rejected": -0.7464967966079712, "logps/chosen": -386.6916198730469, "logps/rejected": -367.53118896484375, "loss": 0.0043, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.430518627166748, "rewards/margins": 13.321751594543457, "rewards/rejected": -15.752270698547363, "step": 780 }, { "epoch": 2.4786986831913245, "grad_norm": 0.029041077941656113, "learning_rate": 0.0001975499649483611, "logits/chosen": -1.5379225015640259, "logits/rejected": -0.7502544522285461, "logps/chosen": -391.7222900390625, "logps/rejected": -362.2694396972656, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.068117618560791, "rewards/margins": 13.040313720703125, "rewards/rejected": -15.108428955078125, "step": 800 }, { "epoch": 2.5406661502711074, "grad_norm": 0.19298532605171204, "learning_rate": 0.00019740837427057625, "logits/chosen": -1.456314206123352, "logits/rejected": -0.614872932434082, "logps/chosen": -399.50787353515625, "logps/rejected": -373.15673828125, "loss": 0.0053, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.449636936187744, "rewards/margins": 13.22521686553955, "rewards/rejected": -16.674854278564453, "step": 820 }, { "epoch": 2.602633617350891, "grad_norm": 0.07452496141195297, "learning_rate": 0.00019726285999767919, "logits/chosen": -1.4205154180526733, "logits/rejected": -0.2992118299007416, "logps/chosen": -389.38677978515625, "logps/rejected": -361.76226806640625, "loss": 0.0045, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.5656723976135254, "rewards/margins": 13.694564819335938, "rewards/rejected": -17.260236740112305, "step": 840 }, { "epoch": 2.664601084430674, "grad_norm": 0.2981736361980438, "learning_rate": 0.00019711342799096361, "logits/chosen": -1.3148002624511719, "logits/rejected": -0.20274639129638672, "logps/chosen": -397.6435546875, "logps/rejected": -394.09381103515625, "loss": 0.0077, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.557480812072754, "rewards/margins": 14.411791801452637, "rewards/rejected": -18.96927261352539, "step": 860 }, { "epoch": 2.726568551510457, "grad_norm": 0.0045928251929581165, "learning_rate": 0.00019696008426952897, "logits/chosen": -1.528051733970642, "logits/rejected": -0.5431190133094788, "logps/chosen": -370.94769287109375, "logps/rejected": -352.85858154296875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -1.877048134803772, "rewards/margins": 12.948789596557617, "rewards/rejected": -14.825838088989258, "step": 880 }, { "epoch": 2.78853601859024, "grad_norm": 0.004106991924345493, "learning_rate": 0.00019680283501003797, "logits/chosen": -1.4431769847869873, "logits/rejected": -0.3284229636192322, "logps/chosen": -385.5152893066406, "logps/rejected": -374.2150573730469, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -2.3450465202331543, "rewards/margins": 15.452998161315918, "rewards/rejected": -17.798046112060547, "step": 900 }, { "epoch": 2.850503485670023, "grad_norm": 0.0018571156542748213, "learning_rate": 0.00019664168654646787, "logits/chosen": -1.432015299797058, "logits/rejected": -0.47949114441871643, "logps/chosen": -373.06402587890625, "logps/rejected": -391.0716247558594, "loss": 0.004, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.047811985015869, "rewards/margins": 14.360496520996094, "rewards/rejected": -16.408308029174805, "step": 920 }, { "epoch": 2.9124709527498065, "grad_norm": 0.014180217869579792, "learning_rate": 0.00019647664536985536, "logits/chosen": -1.3931537866592407, "logits/rejected": -0.2861904501914978, "logps/chosen": -375.765380859375, "logps/rejected": -380.55035400390625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.287869453430176, "rewards/margins": 15.421821594238281, "rewards/rejected": -17.70969009399414, "step": 940 }, { "epoch": 2.9744384198295895, "grad_norm": 0.0018744635162875056, "learning_rate": 0.00019630771812803482, "logits/chosen": -1.1787105798721313, "logits/rejected": -0.12521542608737946, "logps/chosen": -369.18731689453125, "logps/rejected": -404.814208984375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.6008732318878174, "rewards/margins": 16.838558197021484, "rewards/rejected": -19.43943214416504, "step": 960 }, { "epoch": 3.0364058869093724, "grad_norm": 0.0062718172557652, "learning_rate": 0.00019613491162537105, "logits/chosen": -1.1543958187103271, "logits/rejected": 0.015751656144857407, "logps/chosen": -387.29241943359375, "logps/rejected": -409.869384765625, "loss": 0.0012, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.3296706676483154, "rewards/margins": 16.757152557373047, "rewards/rejected": -20.086822509765625, "step": 980 }, { "epoch": 3.098373353989156, "grad_norm": 0.0017134093213826418, "learning_rate": 0.00019595823282248472, "logits/chosen": -1.1962834596633911, "logits/rejected": 0.06610006093978882, "logps/chosen": -384.97509765625, "logps/rejected": -392.449462890625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -2.8541882038116455, "rewards/margins": 16.58295249938965, "rewards/rejected": -19.437145233154297, "step": 1000 }, { "epoch": 3.1603408210689388, "grad_norm": 0.004404888488352299, "learning_rate": 0.00019577768883597224, "logits/chosen": -1.120615839958191, "logits/rejected": 0.19888155162334442, "logps/chosen": -380.3006591796875, "logps/rejected": -392.1662292480469, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.439927339553833, "rewards/margins": 16.68685531616211, "rewards/rejected": -20.126781463623047, "step": 1020 }, { "epoch": 3.2223082881487217, "grad_norm": 0.008267875760793686, "learning_rate": 0.00019559328693811908, "logits/chosen": -1.243600845336914, "logits/rejected": 0.01010244432836771, "logps/chosen": -402.1722412109375, "logps/rejected": -400.5585021972656, "loss": 0.0026, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.8246588706970215, "rewards/margins": 16.491710662841797, "rewards/rejected": -19.31637191772461, "step": 1040 }, { "epoch": 3.284275755228505, "grad_norm": 0.2958065867424011, "learning_rate": 0.0001954050345566068, "logits/chosen": -1.2641799449920654, "logits/rejected": -0.049269963055849075, "logps/chosen": -399.22235107421875, "logps/rejected": -417.200439453125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -3.203291654586792, "rewards/margins": 17.573293685913086, "rewards/rejected": -20.776586532592773, "step": 1060 }, { "epoch": 3.346243222308288, "grad_norm": 0.006309076678007841, "learning_rate": 0.00019521293927421388, "logits/chosen": -1.3915287256240845, "logits/rejected": -0.25474995374679565, "logps/chosen": -377.03460693359375, "logps/rejected": -377.8516540527344, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.6546690464019775, "rewards/margins": 15.229527473449707, "rewards/rejected": -17.88419532775879, "step": 1080 }, { "epoch": 3.4082106893880715, "grad_norm": 0.017275001853704453, "learning_rate": 0.0001950170088285103, "logits/chosen": -1.320058822631836, "logits/rejected": -0.22610430419445038, "logps/chosen": -367.7281799316406, "logps/rejected": -360.3981018066406, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -2.8530807495117188, "rewards/margins": 14.434481620788574, "rewards/rejected": -17.28756332397461, "step": 1100 }, { "epoch": 3.4701781564678544, "grad_norm": 0.006489979103207588, "learning_rate": 0.00019481725111154577, "logits/chosen": -1.1993420124053955, "logits/rejected": 0.13253316283226013, "logps/chosen": -399.0036315917969, "logps/rejected": -387.82672119140625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.1873600482940674, "rewards/margins": 15.84624195098877, "rewards/rejected": -19.03360366821289, "step": 1120 }, { "epoch": 3.5321456235476374, "grad_norm": 0.01082077156752348, "learning_rate": 0.00019461367416953208, "logits/chosen": -1.0335127115249634, "logits/rejected": 0.3021327555179596, "logps/chosen": -389.12701416015625, "logps/rejected": -399.442626953125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -4.1247944831848145, "rewards/margins": 17.54226303100586, "rewards/rejected": -21.66705894470215, "step": 1140 }, { "epoch": 3.5941130906274203, "grad_norm": 0.0018865488236770034, "learning_rate": 0.00019440628620251874, "logits/chosen": -0.9542962312698364, "logits/rejected": 0.2728312611579895, "logps/chosen": -408.50225830078125, "logps/rejected": -458.05157470703125, "loss": 0.0048, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.655162334442139, "rewards/margins": 18.82590103149414, "rewards/rejected": -24.481061935424805, "step": 1160 }, { "epoch": 3.6560805577072037, "grad_norm": 0.00045730554847978055, "learning_rate": 0.00019419509556406285, "logits/chosen": -1.1230123043060303, "logits/rejected": 0.33410170674324036, "logps/chosen": -388.6370849609375, "logps/rejected": -404.42462158203125, "loss": 0.0025, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.1652071475982666, "rewards/margins": 18.062408447265625, "rewards/rejected": -21.227617263793945, "step": 1180 }, { "epoch": 3.7180480247869867, "grad_norm": 0.12257017195224762, "learning_rate": 0.00019398011076089252, "logits/chosen": -1.0567783117294312, "logits/rejected": 0.1874198019504547, "logps/chosen": -391.79998779296875, "logps/rejected": -427.81524658203125, "loss": 0.0021, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.9393508434295654, "rewards/margins": 17.934566497802734, "rewards/rejected": -21.873916625976562, "step": 1200 }, { "epoch": 3.78001549186677, "grad_norm": 0.020518837496638298, "learning_rate": 0.00019376134045256423, "logits/chosen": -1.6692512035369873, "logits/rejected": -0.8992973566055298, "logps/chosen": -377.19891357421875, "logps/rejected": -354.7894592285156, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.314318060874939, "rewards/margins": 12.59657096862793, "rewards/rejected": -13.910888671875, "step": 1220 }, { "epoch": 3.841982958946553, "grad_norm": 0.006729189306497574, "learning_rate": 0.00019353879345111413, "logits/chosen": -1.650059461593628, "logits/rejected": -0.9682002067565918, "logps/chosen": -370.81866455078125, "logps/rejected": -390.57159423828125, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -2.371593952178955, "rewards/margins": 13.772104263305664, "rewards/rejected": -16.14369773864746, "step": 1240 }, { "epoch": 3.903950426026336, "grad_norm": 0.007746795192360878, "learning_rate": 0.000193312478720703, "logits/chosen": -1.6948245763778687, "logits/rejected": -0.8788079023361206, "logps/chosen": -388.7099304199219, "logps/rejected": -379.38824462890625, "loss": 0.0044, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.7807118892669678, "rewards/margins": 14.142008781433105, "rewards/rejected": -16.92272186279297, "step": 1260 }, { "epoch": 3.9659178931061194, "grad_norm": 0.03579813614487648, "learning_rate": 0.00019308240537725517, "logits/chosen": -1.610775351524353, "logits/rejected": -0.7109208703041077, "logps/chosen": -386.43792724609375, "logps/rejected": -369.51416015625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.900850534439087, "rewards/margins": 14.74780559539795, "rewards/rejected": -17.64865493774414, "step": 1280 }, { "epoch": 4.027885360185903, "grad_norm": 0.038256481289863586, "learning_rate": 0.00019284858268809137, "logits/chosen": -1.6177009344100952, "logits/rejected": -0.7443078756332397, "logps/chosen": -374.1534423828125, "logps/rejected": -370.35137939453125, "loss": 0.0014, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.607776641845703, "rewards/margins": 14.8814697265625, "rewards/rejected": -18.48924446105957, "step": 1300 }, { "epoch": 4.089852827265686, "grad_norm": 0.011043623089790344, "learning_rate": 0.0001926110200715554, "logits/chosen": -1.5368865728378296, "logits/rejected": -0.7745493650436401, "logps/chosen": -364.08038330078125, "logps/rejected": -396.889404296875, "loss": 0.0036, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.033879280090332, "rewards/margins": 14.89660358428955, "rewards/rejected": -18.930482864379883, "step": 1320 }, { "epoch": 4.151820294345469, "grad_norm": 0.0009816307574510574, "learning_rate": 0.00019236972709663487, "logits/chosen": -1.4818181991577148, "logits/rejected": -0.5778347849845886, "logps/chosen": -388.94549560546875, "logps/rejected": -394.2232971191406, "loss": 0.0025, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.4815616607666016, "rewards/margins": 15.029690742492676, "rewards/rejected": -18.511253356933594, "step": 1340 }, { "epoch": 4.213787761425252, "grad_norm": 0.015701670199632645, "learning_rate": 0.00019212471348257562, "logits/chosen": -1.4690208435058594, "logits/rejected": -0.48099368810653687, "logps/chosen": -396.7850036621094, "logps/rejected": -408.1683349609375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.170346975326538, "rewards/margins": 16.654111862182617, "rewards/rejected": -19.824459075927734, "step": 1360 }, { "epoch": 4.275755228505035, "grad_norm": 0.0036051629576832056, "learning_rate": 0.0001918759890984902, "logits/chosen": -1.3399614095687866, "logits/rejected": -0.3685137629508972, "logps/chosen": -372.87969970703125, "logps/rejected": -388.63116455078125, "loss": 0.0023, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.198568344116211, "rewards/margins": 15.592170715332031, "rewards/rejected": -19.79073715209961, "step": 1380 }, { "epoch": 4.337722695584818, "grad_norm": 0.01418756041675806, "learning_rate": 0.00019162356396296067, "logits/chosen": -1.3985111713409424, "logits/rejected": -0.25035908818244934, "logps/chosen": -397.00213623046875, "logps/rejected": -385.6864318847656, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -4.2335734367370605, "rewards/margins": 15.834146499633789, "rewards/rejected": -20.067720413208008, "step": 1400 }, { "epoch": 4.3996901626646014, "grad_norm": 0.0017586932517588139, "learning_rate": 0.0001913674482436346, "logits/chosen": -1.3915663957595825, "logits/rejected": -0.2883684039115906, "logps/chosen": -400.862060546875, "logps/rejected": -400.7593078613281, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -4.649229526519775, "rewards/margins": 16.46903419494629, "rewards/rejected": -21.118263244628906, "step": 1420 }, { "epoch": 4.461657629744384, "grad_norm": 0.0009894509566947818, "learning_rate": 0.00019110765225681582, "logits/chosen": -1.2623310089111328, "logits/rejected": -0.3094506859779358, "logps/chosen": -405.25714111328125, "logps/rejected": -450.39678955078125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -5.272841453552246, "rewards/margins": 17.283151626586914, "rewards/rejected": -22.555994033813477, "step": 1440 }, { "epoch": 4.523625096824167, "grad_norm": 0.011076156981289387, "learning_rate": 0.00019084418646704882, "logits/chosen": -1.3033758401870728, "logits/rejected": -0.33031535148620605, "logps/chosen": -394.64715576171875, "logps/rejected": -434.3880310058594, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -4.661707401275635, "rewards/margins": 17.585865020751953, "rewards/rejected": -22.247570037841797, "step": 1460 }, { "epoch": 4.58559256390395, "grad_norm": 0.011492446064949036, "learning_rate": 0.0001905770614866972, "logits/chosen": -1.2698830366134644, "logits/rejected": -0.2575158476829529, "logps/chosen": -411.17333984375, "logps/rejected": -460.8190002441406, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -4.470455646514893, "rewards/margins": 19.05929946899414, "rewards/rejected": -23.529756546020508, "step": 1480 }, { "epoch": 4.647560030983733, "grad_norm": 0.0024670190177857876, "learning_rate": 0.0001903062880755162, "logits/chosen": -1.2591683864593506, "logits/rejected": -0.018044818192720413, "logps/chosen": -403.6463317871094, "logps/rejected": -414.41552734375, "loss": 0.0012, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.826168060302734, "rewards/margins": 17.654024124145508, "rewards/rejected": -22.480192184448242, "step": 1500 }, { "epoch": 4.709527498063517, "grad_norm": 0.006593796424567699, "learning_rate": 0.00019003187714021938, "logits/chosen": -1.2251088619232178, "logits/rejected": 0.010517546907067299, "logps/chosen": -413.086181640625, "logps/rejected": -438.603515625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.791226387023926, "rewards/margins": 18.92746925354004, "rewards/rejected": -23.71869659423828, "step": 1520 }, { "epoch": 4.7714949651433, "grad_norm": 0.004145272541791201, "learning_rate": 0.00018975383973403914, "logits/chosen": -1.0933794975280762, "logits/rejected": 0.2759025990962982, "logps/chosen": -422.998779296875, "logps/rejected": -437.8479919433594, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -5.828397274017334, "rewards/margins": 19.29062843322754, "rewards/rejected": -25.1190242767334, "step": 1540 }, { "epoch": 4.833462432223083, "grad_norm": 0.0056694443337619305, "learning_rate": 0.00018947218705628167, "logits/chosen": -1.1124125719070435, "logits/rejected": 0.17458318173885345, "logps/chosen": -418.27703857421875, "logps/rejected": -469.0071716308594, "loss": 0.0045, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.002425670623779, "rewards/margins": 20.302865982055664, "rewards/rejected": -25.3052921295166, "step": 1560 }, { "epoch": 4.895429899302866, "grad_norm": 0.00018859546980820596, "learning_rate": 0.0001891869304518758, "logits/chosen": -1.0209033489227295, "logits/rejected": 0.3391306698322296, "logps/chosen": -400.2538146972656, "logps/rejected": -430.3617248535156, "loss": 0.0034, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.619088172912598, "rewards/margins": 19.616975784301758, "rewards/rejected": -24.23606300354004, "step": 1580 }, { "epoch": 4.957397366382649, "grad_norm": 0.029597284272313118, "learning_rate": 0.00018889808141091597, "logits/chosen": -1.0116602182388306, "logits/rejected": 0.38184159994125366, "logps/chosen": -412.35205078125, "logps/rejected": -459.47991943359375, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -5.358821868896484, "rewards/margins": 20.45774269104004, "rewards/rejected": -25.816564559936523, "step": 1600 }, { "epoch": 5.019364833462432, "grad_norm": 0.001491004484705627, "learning_rate": 0.00018860565156819935, "logits/chosen": -1.278867244720459, "logits/rejected": -0.06797562539577484, "logps/chosen": -397.50836181640625, "logps/rejected": -435.203857421875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.6724727153778076, "rewards/margins": 17.823925018310547, "rewards/rejected": -21.49639892578125, "step": 1620 }, { "epoch": 5.081332300542216, "grad_norm": 0.012196192517876625, "learning_rate": 0.00018830965270275746, "logits/chosen": -1.1280790567398071, "logits/rejected": 0.10533533245325089, "logps/chosen": -397.92498779296875, "logps/rejected": -435.8148498535156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.880277633666992, "rewards/margins": 18.349157333374023, "rewards/rejected": -23.229434967041016, "step": 1640 }, { "epoch": 5.143299767621999, "grad_norm": 0.0038448043633252382, "learning_rate": 0.00018801009673738138, "logits/chosen": -1.1577694416046143, "logits/rejected": 0.07719329744577408, "logps/chosen": -417.5320739746094, "logps/rejected": -470.4189453125, "loss": 0.0055, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.50655460357666, "rewards/margins": 19.84600830078125, "rewards/rejected": -24.352563858032227, "step": 1660 }, { "epoch": 5.205267234701782, "grad_norm": 0.0032272750977426767, "learning_rate": 0.00018770699573814176, "logits/chosen": -1.0571038722991943, "logits/rejected": 0.0797622948884964, "logps/chosen": -390.07745361328125, "logps/rejected": -450.1996154785156, "loss": 0.0012, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.879145622253418, "rewards/margins": 18.845855712890625, "rewards/rejected": -23.725000381469727, "step": 1680 }, { "epoch": 5.267234701781565, "grad_norm": 0.0002681920013856143, "learning_rate": 0.0001874003619139026, "logits/chosen": -1.136499047279358, "logits/rejected": 0.09882185608148575, "logps/chosen": -404.396240234375, "logps/rejected": -455.455322265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.68656587600708, "rewards/margins": 19.550119400024414, "rewards/rejected": -23.236684799194336, "step": 1700 }, { "epoch": 5.329202168861348, "grad_norm": 0.002475241431966424, "learning_rate": 0.00018709020761582967, "logits/chosen": -1.1179392337799072, "logits/rejected": 0.12788158655166626, "logps/chosen": -407.6341857910156, "logps/rejected": -452.46905517578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.104896068572998, "rewards/margins": 19.286361694335938, "rewards/rejected": -24.39125633239746, "step": 1720 }, { "epoch": 5.3911696359411305, "grad_norm": 9.136780863627791e-05, "learning_rate": 0.00018677654533689287, "logits/chosen": -1.0823495388031006, "logits/rejected": 0.1217053085565567, "logps/chosen": -406.08746337890625, "logps/rejected": -469.70672607421875, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.2551984786987305, "rewards/margins": 19.903423309326172, "rewards/rejected": -25.15862274169922, "step": 1740 }, { "epoch": 5.453137103020914, "grad_norm": 0.004714645445346832, "learning_rate": 0.00018645938771136303, "logits/chosen": -1.1251791715621948, "logits/rejected": 0.17070087790489197, "logps/chosen": -389.72161865234375, "logps/rejected": -451.95098876953125, "loss": 0.0076, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.84260368347168, "rewards/margins": 20.070592880249023, "rewards/rejected": -24.91319465637207, "step": 1760 }, { "epoch": 5.515104570100697, "grad_norm": 0.0027761622332036495, "learning_rate": 0.00018613874751430306, "logits/chosen": -1.0996571779251099, "logits/rejected": 0.08243855088949203, "logps/chosen": -397.93212890625, "logps/rejected": -448.5020446777344, "loss": 0.0012, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.580557346343994, "rewards/margins": 19.335094451904297, "rewards/rejected": -23.915653228759766, "step": 1780 }, { "epoch": 5.57707203718048, "grad_norm": 0.0022653560154139996, "learning_rate": 0.0001858146376610534, "logits/chosen": -1.1286752223968506, "logits/rejected": 0.13742080330848694, "logps/chosen": -411.6807556152344, "logps/rejected": -451.95098876953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.51253604888916, "rewards/margins": 19.66611099243164, "rewards/rejected": -24.178647994995117, "step": 1800 }, { "epoch": 5.639039504260263, "grad_norm": 0.003803479950875044, "learning_rate": 0.0001854870712067116, "logits/chosen": -1.052321195602417, "logits/rejected": 0.1815873682498932, "logps/chosen": -400.9226379394531, "logps/rejected": -471.00616455078125, "loss": 0.0012, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.826806545257568, "rewards/margins": 20.531307220458984, "rewards/rejected": -25.358118057250977, "step": 1820 }, { "epoch": 5.701006971340046, "grad_norm": 0.0006538841407746077, "learning_rate": 0.00018515606134560675, "logits/chosen": -0.9289565086364746, "logits/rejected": 0.3704550266265869, "logps/chosen": -403.9594421386719, "logps/rejected": -450.3981018066406, "loss": 0.0024, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.7688727378845215, "rewards/margins": 19.25821304321289, "rewards/rejected": -25.027088165283203, "step": 1840 }, { "epoch": 5.76297443841983, "grad_norm": 0.0022948060650378466, "learning_rate": 0.00018482162141076778, "logits/chosen": -0.9270216822624207, "logits/rejected": 0.4516163766384125, "logps/chosen": -397.4948425292969, "logps/rejected": -454.2916564941406, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.666428089141846, "rewards/margins": 20.301498413085938, "rewards/rejected": -24.967926025390625, "step": 1860 }, { "epoch": 5.824941905499613, "grad_norm": 0.006001237779855728, "learning_rate": 0.00018448376487338646, "logits/chosen": -0.8780455589294434, "logits/rejected": 0.6470359563827515, "logps/chosen": -391.3553161621094, "logps/rejected": -444.733642578125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.144345283508301, "rewards/margins": 21.074970245361328, "rewards/rejected": -26.219318389892578, "step": 1880 }, { "epoch": 5.886909372579396, "grad_norm": 0.005203678738325834, "learning_rate": 0.00018414250534227485, "logits/chosen": -0.8152813911437988, "logits/rejected": 0.5252994894981384, "logps/chosen": -437.67633056640625, "logps/rejected": -507.53533935546875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.068148612976074, "rewards/margins": 21.216217041015625, "rewards/rejected": -27.28436279296875, "step": 1900 }, { "epoch": 5.948876839659179, "grad_norm": 0.0005356409237720072, "learning_rate": 0.00018379785656331713, "logits/chosen": -0.9145771265029907, "logits/rejected": 0.49262484908103943, "logps/chosen": -414.5752868652344, "logps/rejected": -481.29443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.121223449707031, "rewards/margins": 21.19986343383789, "rewards/rejected": -26.32108497619629, "step": 1920 }, { "epoch": 6.010844306738962, "grad_norm": 0.003898509545251727, "learning_rate": 0.00018344983241891586, "logits/chosen": -0.9906449317932129, "logits/rejected": 0.5498068332672119, "logps/chosen": -414.0738830566406, "logps/rejected": -446.76800537109375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.372982978820801, "rewards/margins": 20.67215347290039, "rewards/rejected": -26.045135498046875, "step": 1940 }, { "epoch": 6.072811773818745, "grad_norm": 0.002160005969926715, "learning_rate": 0.00018309844692743283, "logits/chosen": -0.9815686941146851, "logits/rejected": 0.5956189632415771, "logps/chosen": -405.01654052734375, "logps/rejected": -442.0355529785156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.603247165679932, "rewards/margins": 20.942607879638672, "rewards/rejected": -25.545854568481445, "step": 1960 }, { "epoch": 6.134779240898529, "grad_norm": 0.000817653548438102, "learning_rate": 0.0001827437142426244, "logits/chosen": -0.8936567306518555, "logits/rejected": 0.5212098360061646, "logps/chosen": -395.4510498046875, "logps/rejected": -475.28631591796875, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.184081077575684, "rewards/margins": 21.514982223510742, "rewards/rejected": -26.69906234741211, "step": 1980 }, { "epoch": 6.196746707978312, "grad_norm": 0.002587874187156558, "learning_rate": 0.00018238564865307138, "logits/chosen": -0.8976683616638184, "logits/rejected": 0.5809212923049927, "logps/chosen": -413.9544372558594, "logps/rejected": -460.4183654785156, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.504066467285156, "rewards/margins": 20.807832717895508, "rewards/rejected": -26.311901092529297, "step": 2000 }, { "epoch": 6.258714175058095, "grad_norm": 0.0015577581943944097, "learning_rate": 0.00018202426458160354, "logits/chosen": -0.9174526333808899, "logits/rejected": 0.5119841694831848, "logps/chosen": -408.99609375, "logps/rejected": -474.6453552246094, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.049027919769287, "rewards/margins": 21.92804718017578, "rewards/rejected": -26.977075576782227, "step": 2020 }, { "epoch": 6.3206816421378775, "grad_norm": 0.0013881104532629251, "learning_rate": 0.00018165957658471853, "logits/chosen": -0.8762157559394836, "logits/rejected": 0.3336823880672455, "logps/chosen": -424.74237060546875, "logps/rejected": -481.97198486328125, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.531933784484863, "rewards/margins": 19.711212158203125, "rewards/rejected": -26.243144989013672, "step": 2040 }, { "epoch": 6.3826491092176605, "grad_norm": 0.0015647505642846227, "learning_rate": 0.00018129159935199572, "logits/chosen": -0.9118504524230957, "logits/rejected": 0.40582960844039917, "logps/chosen": -405.77001953125, "logps/rejected": -471.5838317871094, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -5.325821876525879, "rewards/margins": 20.694194793701172, "rewards/rejected": -26.02001953125, "step": 2060 }, { "epoch": 6.4446165762974434, "grad_norm": 0.00021198451577220112, "learning_rate": 0.00018092034770550436, "logits/chosen": -0.9560929536819458, "logits/rejected": 0.44417256116867065, "logps/chosen": -407.6698303222656, "logps/rejected": -475.4552307128906, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.728771209716797, "rewards/margins": 21.15816879272461, "rewards/rejected": -26.88694190979004, "step": 2080 }, { "epoch": 6.506584043377227, "grad_norm": 0.002910887822508812, "learning_rate": 0.00018054583659920669, "logits/chosen": -0.913163959980011, "logits/rejected": 0.4632677435874939, "logps/chosen": -397.37841796875, "logps/rejected": -459.6831970214844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.117273807525635, "rewards/margins": 21.222431182861328, "rewards/rejected": -26.339706420898438, "step": 2100 }, { "epoch": 6.56855151045701, "grad_norm": 0.0028291107155382633, "learning_rate": 0.00018016808111835544, "logits/chosen": -0.9535878896713257, "logits/rejected": 0.4175487160682678, "logps/chosen": -405.0999755859375, "logps/rejected": -472.16632080078125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.1232757568359375, "rewards/margins": 20.961214065551758, "rewards/rejected": -26.084487915039062, "step": 2120 }, { "epoch": 6.630518977536793, "grad_norm": 0.00040341372368857265, "learning_rate": 0.0001797870964788863, "logits/chosen": -0.9253376126289368, "logits/rejected": 0.42822474241256714, "logps/chosen": -415.0838928222656, "logps/rejected": -483.0498962402344, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.194808006286621, "rewards/margins": 20.92584228515625, "rewards/rejected": -27.120647430419922, "step": 2140 }, { "epoch": 6.692486444616576, "grad_norm": 0.0016098152846097946, "learning_rate": 0.0001794028980268049, "logits/chosen": -0.9861367344856262, "logits/rejected": 0.6260654330253601, "logps/chosen": -428.81243896484375, "logps/rejected": -470.528564453125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -5.6738715171813965, "rewards/margins": 21.712966918945312, "rewards/rejected": -27.386837005615234, "step": 2160 }, { "epoch": 6.754453911696359, "grad_norm": 0.00064540357561782, "learning_rate": 0.00017901550123756906, "logits/chosen": -0.9794920086860657, "logits/rejected": 0.5585097074508667, "logps/chosen": -405.3882141113281, "logps/rejected": -460.2223205566406, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -5.186602592468262, "rewards/margins": 21.48334312438965, "rewards/rejected": -26.66994285583496, "step": 2180 }, { "epoch": 6.816421378776143, "grad_norm": 0.01159485150128603, "learning_rate": 0.00017862492171546478, "logits/chosen": -0.8846482038497925, "logits/rejected": 0.5302290320396423, "logps/chosen": -418.35211181640625, "logps/rejected": -480.18597412109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.644828796386719, "rewards/margins": 21.466665267944336, "rewards/rejected": -27.111492156982422, "step": 2200 }, { "epoch": 6.878388845855926, "grad_norm": 0.0008106857421807945, "learning_rate": 0.0001782311751929784, "logits/chosen": -0.8796793222427368, "logits/rejected": 0.47619304060935974, "logps/chosen": -407.5473327636719, "logps/rejected": -496.4972229003906, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.067633628845215, "rewards/margins": 22.56241226196289, "rewards/rejected": -27.63004493713379, "step": 2220 }, { "epoch": 6.940356312935709, "grad_norm": 0.00046345905866473913, "learning_rate": 0.00017783427753016232, "logits/chosen": -0.837285041809082, "logits/rejected": 0.547538161277771, "logps/chosen": -422.29541015625, "logps/rejected": -488.83734130859375, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.489915370941162, "rewards/margins": 20.592243194580078, "rewards/rejected": -27.0821590423584, "step": 2240 }, { "epoch": 7.002323780015492, "grad_norm": 0.0002803100214805454, "learning_rate": 0.00017743424471399662, "logits/chosen": -0.9276706576347351, "logits/rejected": 0.4761223793029785, "logps/chosen": -409.07073974609375, "logps/rejected": -465.58544921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.659182548522949, "rewards/margins": 20.956308364868164, "rewards/rejected": -26.615489959716797, "step": 2260 }, { "epoch": 7.064291247095275, "grad_norm": 0.0009606317616999149, "learning_rate": 0.00017703109285774473, "logits/chosen": -0.8453294634819031, "logits/rejected": 0.4921432435512543, "logps/chosen": -413.75616455078125, "logps/rejected": -493.93634033203125, "loss": 0.0033, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.530547142028809, "rewards/margins": 21.9550724029541, "rewards/rejected": -27.48561668395996, "step": 2280 }, { "epoch": 7.126258714175058, "grad_norm": 0.00027114342083223164, "learning_rate": 0.00017662483820030466, "logits/chosen": -0.8099842071533203, "logits/rejected": 0.5511294603347778, "logps/chosen": -417.60467529296875, "logps/rejected": -493.1431579589844, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.9566192626953125, "rewards/margins": 21.779888153076172, "rewards/rejected": -27.73651123046875, "step": 2300 }, { "epoch": 7.188226181254842, "grad_norm": 0.00029379583429545164, "learning_rate": 0.00017623603720914402, "logits/chosen": -0.9296634793281555, "logits/rejected": 0.502611517906189, "logps/chosen": -408.59228515625, "logps/rejected": -462.72052001953125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.541842937469482, "rewards/margins": 21.353099822998047, "rewards/rejected": -26.894943237304688, "step": 2320 }, { "epoch": 7.2501936483346245, "grad_norm": 0.0035946103744208813, "learning_rate": 0.00017582377926923305, "logits/chosen": -0.904528021812439, "logits/rejected": 0.5609920620918274, "logps/chosen": -410.15753173828125, "logps/rejected": -459.9115295410156, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.2013936042785645, "rewards/margins": 21.163597106933594, "rewards/rejected": -26.364990234375, "step": 2340 }, { "epoch": 7.3121611154144075, "grad_norm": 0.001584152108989656, "learning_rate": 0.00017540846715854923, "logits/chosen": -0.8879092335700989, "logits/rejected": 0.5792483687400818, "logps/chosen": -416.14190673828125, "logps/rejected": -487.8919372558594, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.094472408294678, "rewards/margins": 21.977914810180664, "rewards/rejected": -28.0723876953125, "step": 2360 }, { "epoch": 7.3741285824941905, "grad_norm": 0.0013305445900186896, "learning_rate": 0.00017499011760580376, "logits/chosen": -0.8160873651504517, "logits/rejected": 0.6186665296554565, "logps/chosen": -419.13006591796875, "logps/rejected": -483.44482421875, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.028205871582031, "rewards/margins": 21.97576904296875, "rewards/rejected": -28.003976821899414, "step": 2380 }, { "epoch": 7.436096049573973, "grad_norm": 0.0005457611987367272, "learning_rate": 0.00017456874746205568, "logits/chosen": -0.8007138967514038, "logits/rejected": 0.6117135882377625, "logps/chosen": -416.05438232421875, "logps/rejected": -488.6453552246094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.2818403244018555, "rewards/margins": 22.225994110107422, "rewards/rejected": -28.507831573486328, "step": 2400 }, { "epoch": 7.498063516653756, "grad_norm": 0.0002609234652481973, "learning_rate": 0.00017414437370003293, "logits/chosen": -0.885455310344696, "logits/rejected": 0.7295613288879395, "logps/chosen": -407.35797119140625, "logps/rejected": -458.9103088378906, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.998741626739502, "rewards/margins": 22.125844955444336, "rewards/rejected": -28.124588012695312, "step": 2420 }, { "epoch": 7.56003098373354, "grad_norm": 0.010798891074955463, "learning_rate": 0.00017371701341344878, "logits/chosen": -0.8749537467956543, "logits/rejected": 0.737878680229187, "logps/chosen": -418.1792907714844, "logps/rejected": -482.16619873046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.506211280822754, "rewards/margins": 22.69670295715332, "rewards/rejected": -28.20291519165039, "step": 2440 }, { "epoch": 7.621998450813323, "grad_norm": 0.002587670926004648, "learning_rate": 0.00017328668381631318, "logits/chosen": -0.8414995074272156, "logits/rejected": 0.6650308966636658, "logps/chosen": -403.06243896484375, "logps/rejected": -462.6051330566406, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.898090362548828, "rewards/margins": 21.654659271240234, "rewards/rejected": -27.552749633789062, "step": 2460 }, { "epoch": 7.683965917893106, "grad_norm": 0.0005328520783223212, "learning_rate": 0.00017285340224223965, "logits/chosen": -0.8659757375717163, "logits/rejected": 0.6123751401901245, "logps/chosen": -417.27764892578125, "logps/rejected": -472.28228759765625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.232241153717041, "rewards/margins": 21.851760864257812, "rewards/rejected": -28.084003448486328, "step": 2480 }, { "epoch": 7.745933384972889, "grad_norm": 0.00171385589055717, "learning_rate": 0.00017241718614374678, "logits/chosen": -0.7435486912727356, "logits/rejected": 0.608551025390625, "logps/chosen": -418.3434143066406, "logps/rejected": -485.39825439453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.975823402404785, "rewards/margins": 21.549240112304688, "rewards/rejected": -27.525060653686523, "step": 2500 }, { "epoch": 7.807900852052672, "grad_norm": 0.000463314849184826, "learning_rate": 0.00017197805309155536, "logits/chosen": -0.8705068826675415, "logits/rejected": 0.7182124257087708, "logps/chosen": -418.2559509277344, "logps/rejected": -470.73431396484375, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.518788814544678, "rewards/margins": 21.688777923583984, "rewards/rejected": -28.207569122314453, "step": 2520 }, { "epoch": 7.869868319132456, "grad_norm": 0.002235127380117774, "learning_rate": 0.0001715360207738808, "logits/chosen": -0.8467845916748047, "logits/rejected": 0.6931872367858887, "logps/chosen": -398.0982360839844, "logps/rejected": -461.494140625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.6116943359375, "rewards/margins": 20.939289093017578, "rewards/rejected": -27.550983428955078, "step": 2540 }, { "epoch": 7.931835786212239, "grad_norm": 0.0014148158952593803, "learning_rate": 0.0001710911069957203, "logits/chosen": -0.8599146008491516, "logits/rejected": 0.597332239151001, "logps/chosen": -415.1485900878906, "logps/rejected": -478.2962341308594, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.637750148773193, "rewards/margins": 21.647342681884766, "rewards/rejected": -27.285091400146484, "step": 2560 }, { "epoch": 7.993803253292022, "grad_norm": 0.00026821051142178476, "learning_rate": 0.00017064332967813605, "logits/chosen": -0.8094769716262817, "logits/rejected": 0.5584506988525391, "logps/chosen": -433.38525390625, "logps/rejected": -498.05792236328125, "loss": 0.0065, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.968195915222168, "rewards/margins": 21.783451080322266, "rewards/rejected": -27.751644134521484, "step": 2580 }, { "epoch": 8.055770720371806, "grad_norm": 0.0008632375975139439, "learning_rate": 0.0001701927068575331, "logits/chosen": -0.8063030242919922, "logits/rejected": 0.681650698184967, "logps/chosen": -417.40740966796875, "logps/rejected": -483.35833740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.037853240966797, "rewards/margins": 22.328609466552734, "rewards/rejected": -28.3664608001709, "step": 2600 }, { "epoch": 8.117738187451588, "grad_norm": 0.00020725080685224384, "learning_rate": 0.0001697392566849329, "logits/chosen": -0.7843044996261597, "logits/rejected": 0.6513689160346985, "logps/chosen": -413.47589111328125, "logps/rejected": -490.56103515625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.002742290496826, "rewards/margins": 22.975902557373047, "rewards/rejected": -28.9786434173584, "step": 2620 }, { "epoch": 8.179705654531372, "grad_norm": 0.001819239230826497, "learning_rate": 0.00016928299742524234, "logits/chosen": -0.8633748292922974, "logits/rejected": 0.6922792196273804, "logps/chosen": -412.0850524902344, "logps/rejected": -474.5027770996094, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.745542526245117, "rewards/margins": 22.514820098876953, "rewards/rejected": -28.260364532470703, "step": 2640 }, { "epoch": 8.241673121611154, "grad_norm": 0.0030217173043638468, "learning_rate": 0.00016882394745651783, "logits/chosen": -0.9476584196090698, "logits/rejected": 0.6893962025642395, "logps/chosen": -416.17291259765625, "logps/rejected": -479.39923095703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.220405101776123, "rewards/margins": 22.663036346435547, "rewards/rejected": -28.883441925048828, "step": 2660 }, { "epoch": 8.303640588690937, "grad_norm": 0.0001663499278947711, "learning_rate": 0.00016836212526922522, "logits/chosen": -0.8719544410705566, "logits/rejected": 0.6744921207427979, "logps/chosen": -417.1148986816406, "logps/rejected": -475.9832458496094, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.162570953369141, "rewards/margins": 22.504776000976562, "rewards/rejected": -27.667346954345703, "step": 2680 }, { "epoch": 8.36560805577072, "grad_norm": 0.002916699508205056, "learning_rate": 0.00016789754946549485, "logits/chosen": -0.9196687936782837, "logits/rejected": 0.5761129260063171, "logps/chosen": -411.28033447265625, "logps/rejected": -478.9852600097656, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.738530158996582, "rewards/margins": 22.162540435791016, "rewards/rejected": -27.901071548461914, "step": 2700 }, { "epoch": 8.427575522850503, "grad_norm": 0.00011332995200064033, "learning_rate": 0.00016743023875837233, "logits/chosen": -0.7387035489082336, "logits/rejected": 0.6855649948120117, "logps/chosen": -426.68487548828125, "logps/rejected": -502.105712890625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.9964470863342285, "rewards/margins": 23.166492462158203, "rewards/rejected": -29.16294288635254, "step": 2720 }, { "epoch": 8.489542989930287, "grad_norm": 0.0009012964437715709, "learning_rate": 0.00016696021197106487, "logits/chosen": -0.8611875772476196, "logits/rejected": 0.7133287191390991, "logps/chosen": -395.930419921875, "logps/rejected": -469.5165100097656, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.954014778137207, "rewards/margins": 22.503925323486328, "rewards/rejected": -27.45794105529785, "step": 2740 }, { "epoch": 8.55151045701007, "grad_norm": 0.00019017455633729696, "learning_rate": 0.00016648748803618286, "logits/chosen": -0.8437725901603699, "logits/rejected": 0.6948248147964478, "logps/chosen": -429.0943298339844, "logps/rejected": -488.63507080078125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.994604110717773, "rewards/margins": 21.811975479125977, "rewards/rejected": -27.80657958984375, "step": 2760 }, { "epoch": 8.613477924089853, "grad_norm": 0.002179044298827648, "learning_rate": 0.00016601208599497752, "logits/chosen": -0.8570048213005066, "logits/rejected": 0.5581663846969604, "logps/chosen": -420.24639892578125, "logps/rejected": -517.4495239257812, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.668288230895996, "rewards/margins": 22.895244598388672, "rewards/rejected": -29.563533782958984, "step": 2780 }, { "epoch": 8.675445391169635, "grad_norm": 0.0014218884753063321, "learning_rate": 0.0001655340249965737, "logits/chosen": -0.8298648595809937, "logits/rejected": 0.7341376543045044, "logps/chosen": -417.24017333984375, "logps/rejected": -486.96197509765625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.189940929412842, "rewards/margins": 22.539569854736328, "rewards/rejected": -28.72951316833496, "step": 2800 }, { "epoch": 8.737412858249419, "grad_norm": 0.004287872463464737, "learning_rate": 0.0001650533242971987, "logits/chosen": -0.783652126789093, "logits/rejected": 0.6020525693893433, "logps/chosen": -433.5499572753906, "logps/rejected": -506.0953063964844, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.208212852478027, "rewards/margins": 22.688650131225586, "rewards/rejected": -28.896865844726562, "step": 2820 }, { "epoch": 8.799380325329203, "grad_norm": 0.0011698489543050528, "learning_rate": 0.00016457000325940667, "logits/chosen": -0.9040369987487793, "logits/rejected": 0.7520915269851685, "logps/chosen": -423.38189697265625, "logps/rejected": -480.9803161621094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.842212677001953, "rewards/margins": 23.215862274169922, "rewards/rejected": -29.058074951171875, "step": 2840 }, { "epoch": 8.861347792408985, "grad_norm": 0.0011277641169726849, "learning_rate": 0.0001640840813512985, "logits/chosen": -0.8744897842407227, "logits/rejected": 0.7434971928596497, "logps/chosen": -426.23931884765625, "logps/rejected": -501.59710693359375, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.3598833084106445, "rewards/margins": 23.147171020507812, "rewards/rejected": -29.507055282592773, "step": 2860 }, { "epoch": 8.923315259488769, "grad_norm": 0.0010242237476632, "learning_rate": 0.00016359557814573777, "logits/chosen": -0.7934980392456055, "logits/rejected": 0.6807147264480591, "logps/chosen": -406.7119445800781, "logps/rejected": -484.58831787109375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.454986572265625, "rewards/margins": 22.421138763427734, "rewards/rejected": -28.876129150390625, "step": 2880 }, { "epoch": 8.98528272656855, "grad_norm": 0.00022442563204094768, "learning_rate": 0.00016310451331956238, "logits/chosen": -0.7804977297782898, "logits/rejected": 0.7946068048477173, "logps/chosen": -423.38397216796875, "logps/rejected": -501.4856872558594, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -5.33301305770874, "rewards/margins": 23.95859146118164, "rewards/rejected": -29.29160499572754, "step": 2900 }, { "epoch": 9.047250193648335, "grad_norm": 0.0018227125983685255, "learning_rate": 0.00016261090665279198, "logits/chosen": -0.8016149401664734, "logits/rejected": 0.6636785268783569, "logps/chosen": -426.03240966796875, "logps/rejected": -527.2731323242188, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.532900810241699, "rewards/margins": 24.20720863342285, "rewards/rejected": -30.7401065826416, "step": 2920 }, { "epoch": 9.109217660728119, "grad_norm": 0.0025721483398228884, "learning_rate": 0.00016211477802783103, "logits/chosen": -0.7957175374031067, "logits/rejected": 0.7413456439971924, "logps/chosen": -411.5255432128906, "logps/rejected": -510.88909912109375, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.42633056640625, "rewards/margins": 23.930238723754883, "rewards/rejected": -30.3565673828125, "step": 2940 }, { "epoch": 9.1711851278079, "grad_norm": 0.0015712358290329576, "learning_rate": 0.00016161614742866832, "logits/chosen": -0.7725690603256226, "logits/rejected": 0.7799097895622253, "logps/chosen": -412.74810791015625, "logps/rejected": -492.89892578125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.049948692321777, "rewards/margins": 22.938610076904297, "rewards/rejected": -28.988555908203125, "step": 2960 }, { "epoch": 9.233152594887684, "grad_norm": 0.0009299792000092566, "learning_rate": 0.0001611150349400716, "logits/chosen": -0.8838273882865906, "logits/rejected": 0.7425965070724487, "logps/chosen": -419.3350524902344, "logps/rejected": -481.666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.259576320648193, "rewards/margins": 22.729785919189453, "rewards/rejected": -28.989360809326172, "step": 2980 }, { "epoch": 9.295120061967467, "grad_norm": 0.00034220717498101294, "learning_rate": 0.00016061146074677885, "logits/chosen": -0.728831946849823, "logits/rejected": 0.6241214871406555, "logps/chosen": -414.3390197753906, "logps/rejected": -511.9385681152344, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.351578235626221, "rewards/margins": 22.98415756225586, "rewards/rejected": -29.335735321044922, "step": 3000 }, { "epoch": 9.35708752904725, "grad_norm": 0.002175732748582959, "learning_rate": 0.00016010544513268515, "logits/chosen": -0.8456689715385437, "logits/rejected": 0.7692159414291382, "logps/chosen": -417.7942810058594, "logps/rejected": -494.46240234375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -5.584621429443359, "rewards/margins": 23.603557586669922, "rewards/rejected": -29.18817710876465, "step": 3020 }, { "epoch": 9.419054996127032, "grad_norm": 0.00041741851600818336, "learning_rate": 0.00015959700848002567, "logits/chosen": -0.8197334408760071, "logits/rejected": 0.8417131304740906, "logps/chosen": -426.35845947265625, "logps/rejected": -491.63421630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.198635578155518, "rewards/margins": 23.649044036865234, "rewards/rejected": -28.847681045532227, "step": 3040 }, { "epoch": 9.481022463206816, "grad_norm": 0.0018229244742542505, "learning_rate": 0.00015908617126855466, "logits/chosen": -0.7269546389579773, "logits/rejected": 0.8374387621879578, "logps/chosen": -418.9156799316406, "logps/rejected": -485.36920166015625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.943127632141113, "rewards/margins": 22.811771392822266, "rewards/rejected": -28.754898071289062, "step": 3060 }, { "epoch": 9.5429899302866, "grad_norm": 0.0002307717950316146, "learning_rate": 0.00015857295407472046, "logits/chosen": -0.7557907700538635, "logits/rejected": 0.7291213870048523, "logps/chosen": -418.8255920410156, "logps/rejected": -495.04010009765625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.5549492835998535, "rewards/margins": 23.22446632385254, "rewards/rejected": -29.7794132232666, "step": 3080 }, { "epoch": 9.604957397366382, "grad_norm": 5.642590986099094e-05, "learning_rate": 0.00015805737757083681, "logits/chosen": -0.7865381836891174, "logits/rejected": 0.8217270970344543, "logps/chosen": -432.85711669921875, "logps/rejected": -503.07293701171875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.972267150878906, "rewards/margins": 23.414052963256836, "rewards/rejected": -30.386322021484375, "step": 3100 }, { "epoch": 9.666924864446166, "grad_norm": 0.0031394653487950563, "learning_rate": 0.00015753946252425013, "logits/chosen": -0.8239234089851379, "logits/rejected": 0.7479206323623657, "logps/chosen": -423.5618591308594, "logps/rejected": -487.6026306152344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.732801914215088, "rewards/margins": 22.03606605529785, "rewards/rejected": -28.768869400024414, "step": 3120 }, { "epoch": 9.728892331525948, "grad_norm": 0.0020447850693017244, "learning_rate": 0.000157019229796503, "logits/chosen": -0.757154107093811, "logits/rejected": 0.6788080334663391, "logps/chosen": -401.03680419921875, "logps/rejected": -482.5953674316406, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.940583229064941, "rewards/margins": 22.295238494873047, "rewards/rejected": -28.235824584960938, "step": 3140 }, { "epoch": 9.790859798605732, "grad_norm": 0.0010579255176708102, "learning_rate": 0.0001564967003424938, "logits/chosen": -0.7942818403244019, "logits/rejected": 0.6221517324447632, "logps/chosen": -429.353515625, "logps/rejected": -519.2991943359375, "loss": 0.0054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.464829444885254, "rewards/margins": 22.819751739501953, "rewards/rejected": -30.284582138061523, "step": 3160 }, { "epoch": 9.852827265685516, "grad_norm": 0.0005074171931482852, "learning_rate": 0.00015597189520963277, "logits/chosen": -0.7147163152694702, "logits/rejected": 0.7833064794540405, "logps/chosen": -423.9457092285156, "logps/rejected": -520.0857543945312, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.090755462646484, "rewards/margins": 23.77802848815918, "rewards/rejected": -30.868785858154297, "step": 3180 }, { "epoch": 9.914794732765298, "grad_norm": 0.00032806736999191344, "learning_rate": 0.00015544483553699408, "logits/chosen": -0.7453028559684753, "logits/rejected": 0.8783397674560547, "logps/chosen": -429.57421875, "logps/rejected": -521.6644897460938, "loss": 0.0054, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.363633155822754, "rewards/margins": 24.92776870727539, "rewards/rejected": -31.291400909423828, "step": 3200 }, { "epoch": 9.976762199845082, "grad_norm": 0.0016681504203006625, "learning_rate": 0.00015491554255446462, "logits/chosen": -0.800809383392334, "logits/rejected": 0.8924380540847778, "logps/chosen": -403.0744323730469, "logps/rejected": -475.4017028808594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.697165489196777, "rewards/margins": 23.721771240234375, "rewards/rejected": -29.418935775756836, "step": 3220 }, { "epoch": 10.038729666924864, "grad_norm": 0.0008030760800465941, "learning_rate": 0.0001543840375818884, "logits/chosen": -0.844366729259491, "logits/rejected": 0.8316100835800171, "logps/chosen": -439.8392639160156, "logps/rejected": -524.9827270507812, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.307751178741455, "rewards/margins": 24.47549819946289, "rewards/rejected": -30.783245086669922, "step": 3240 }, { "epoch": 10.100697134004648, "grad_norm": 0.0015144218923524022, "learning_rate": 0.0001538503420282083, "logits/chosen": -0.7335025072097778, "logits/rejected": 0.744965672492981, "logps/chosen": -424.39984130859375, "logps/rejected": -503.18902587890625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.292525291442871, "rewards/margins": 23.78156089782715, "rewards/rejected": -30.074087142944336, "step": 3260 }, { "epoch": 10.162664601084431, "grad_norm": 0.0008248073281720281, "learning_rate": 0.00015331447739060338, "logits/chosen": -0.7617536187171936, "logits/rejected": 0.8052036166191101, "logps/chosen": -410.5335388183594, "logps/rejected": -484.446044921875, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.2051849365234375, "rewards/margins": 23.270933151245117, "rewards/rejected": -29.476116180419922, "step": 3280 }, { "epoch": 10.224632068164214, "grad_norm": 0.0004835600557271391, "learning_rate": 0.0001527764652536231, "logits/chosen": -0.7629774212837219, "logits/rejected": 0.8532567024230957, "logps/chosen": -402.6639709472656, "logps/rejected": -491.9307556152344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.240920543670654, "rewards/margins": 23.61534881591797, "rewards/rejected": -29.85626792907715, "step": 3300 }, { "epoch": 10.286599535243997, "grad_norm": 1.3681114978680853e-05, "learning_rate": 0.0001522363272883179, "logits/chosen": -0.6888297200202942, "logits/rejected": 0.8447307348251343, "logps/chosen": -411.98431396484375, "logps/rejected": -501.7259216308594, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.599992275238037, "rewards/margins": 23.839008331298828, "rewards/rejected": -30.439001083374023, "step": 3320 }, { "epoch": 10.34856700232378, "grad_norm": 0.0015049786306917667, "learning_rate": 0.0001516940852513663, "logits/chosen": -0.802474319934845, "logits/rejected": 0.7312396168708801, "logps/chosen": -408.78009033203125, "logps/rejected": -501.54412841796875, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.309834957122803, "rewards/margins": 23.346172332763672, "rewards/rejected": -30.656009674072266, "step": 3340 }, { "epoch": 10.410534469403563, "grad_norm": 0.0007437937310896814, "learning_rate": 0.00015114976098419842, "logits/chosen": -0.7263267636299133, "logits/rejected": 0.7361682057380676, "logps/chosen": -429.12945556640625, "logps/rejected": -530.2430419921875, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.809745788574219, "rewards/margins": 24.216045379638672, "rewards/rejected": -31.02579116821289, "step": 3360 }, { "epoch": 10.472501936483347, "grad_norm": 0.0008804806275293231, "learning_rate": 0.00015060337641211637, "logits/chosen": -0.7912311553955078, "logits/rejected": 0.9160035848617554, "logps/chosen": -410.7565002441406, "logps/rejected": -474.1514587402344, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.874563694000244, "rewards/margins": 22.452852249145508, "rewards/rejected": -29.32741355895996, "step": 3380 }, { "epoch": 10.53446940356313, "grad_norm": 0.0012436832766979933, "learning_rate": 0.00015005495354341114, "logits/chosen": -0.7138643264770508, "logits/rejected": 0.7492295503616333, "logps/chosen": -415.385009765625, "logps/rejected": -515.1976928710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.435647487640381, "rewards/margins": 23.116397857666016, "rewards/rejected": -29.552043914794922, "step": 3400 }, { "epoch": 10.596436870642913, "grad_norm": 0.00021300691878423095, "learning_rate": 0.00014950451446847578, "logits/chosen": -0.807390034198761, "logits/rejected": 0.7440930008888245, "logps/chosen": -427.7305603027344, "logps/rejected": -520.4072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.44257116317749, "rewards/margins": 23.892803192138672, "rewards/rejected": -30.335372924804688, "step": 3420 }, { "epoch": 10.658404337722695, "grad_norm": 0.00298711028881371, "learning_rate": 0.00014895208135891604, "logits/chosen": -0.8018674850463867, "logits/rejected": 0.7456072568893433, "logps/chosen": -418.41131591796875, "logps/rejected": -519.7051391601562, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.13026237487793, "rewards/margins": 25.155405044555664, "rewards/rejected": -30.285669326782227, "step": 3440 }, { "epoch": 10.720371804802479, "grad_norm": 0.0018449191702529788, "learning_rate": 0.000148397676466657, "logits/chosen": -0.8092568516731262, "logits/rejected": 0.8564573526382446, "logps/chosen": -424.4937438964844, "logps/rejected": -499.78472900390625, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.369976043701172, "rewards/margins": 23.924694061279297, "rewards/rejected": -30.294673919677734, "step": 3460 }, { "epoch": 10.782339271882261, "grad_norm": 0.0006981108454056084, "learning_rate": 0.00014784132212304694, "logits/chosen": -0.6652621030807495, "logits/rejected": 0.7642368078231812, "logps/chosen": -421.30816650390625, "logps/rejected": -508.30010986328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.402596473693848, "rewards/margins": 23.3673038482666, "rewards/rejected": -29.7699031829834, "step": 3480 }, { "epoch": 10.844306738962045, "grad_norm": 0.0020014916080981493, "learning_rate": 0.00014728304073795764, "logits/chosen": -0.7477067112922668, "logits/rejected": 0.9086839556694031, "logps/chosen": -410.92974853515625, "logps/rejected": -485.93804931640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.754061698913574, "rewards/margins": 23.10568618774414, "rewards/rejected": -29.8597469329834, "step": 3500 }, { "epoch": 10.906274206041829, "grad_norm": 0.0002572743396740407, "learning_rate": 0.0001467228547988819, "logits/chosen": -0.7481425404548645, "logits/rejected": 0.9864739179611206, "logps/chosen": -417.87158203125, "logps/rejected": -484.71270751953125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.400439262390137, "rewards/margins": 23.81901741027832, "rewards/rejected": -30.219451904296875, "step": 3520 }, { "epoch": 10.96824167312161, "grad_norm": 1.1203023859707173e-05, "learning_rate": 0.0001461607868700276, "logits/chosen": -0.7098456621170044, "logits/rejected": 0.8115944862365723, "logps/chosen": -410.0858459472656, "logps/rejected": -519.0262451171875, "loss": 0.0054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.59292459487915, "rewards/margins": 24.37098503112793, "rewards/rejected": -30.963909149169922, "step": 3540 }, { "epoch": 11.030209140201395, "grad_norm": 0.0009471502853557467, "learning_rate": 0.00014559685959140907, "logits/chosen": -0.7040443420410156, "logits/rejected": 0.7584648132324219, "logps/chosen": -409.934814453125, "logps/rejected": -511.6397399902344, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.900947570800781, "rewards/margins": 23.620128631591797, "rewards/rejected": -30.521076202392578, "step": 3560 }, { "epoch": 11.092176607281177, "grad_norm": 0.002169826766476035, "learning_rate": 0.00014503109567793481, "logits/chosen": -0.7528023719787598, "logits/rejected": 0.9099094271659851, "logps/chosen": -408.87664794921875, "logps/rejected": -497.9203186035156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.732300281524658, "rewards/margins": 25.08970069885254, "rewards/rejected": -30.821996688842773, "step": 3580 }, { "epoch": 11.15414407436096, "grad_norm": 0.0010581511305645108, "learning_rate": 0.00014446351791849276, "logits/chosen": -0.6851012110710144, "logits/rejected": 0.8104494214057922, "logps/chosen": -419.51715087890625, "logps/rejected": -524.4638671875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.121168613433838, "rewards/margins": 24.22315788269043, "rewards/rejected": -31.34432601928711, "step": 3600 }, { "epoch": 11.216111541440744, "grad_norm": 0.0006640542997047305, "learning_rate": 0.0001438941491750323, "logits/chosen": -0.7948740124702454, "logits/rejected": 0.8368036150932312, "logps/chosen": -424.7229919433594, "logps/rejected": -516.5067749023438, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.916244983673096, "rewards/margins": 24.990079879760742, "rewards/rejected": -30.906330108642578, "step": 3620 }, { "epoch": 11.278079008520526, "grad_norm": 0.001158738974481821, "learning_rate": 0.00014332301238164342, "logits/chosen": -0.7854216694831848, "logits/rejected": 0.7745558023452759, "logps/chosen": -416.4905700683594, "logps/rejected": -498.207763671875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -6.545945167541504, "rewards/margins": 23.817821502685547, "rewards/rejected": -30.36376953125, "step": 3640 }, { "epoch": 11.34004647560031, "grad_norm": 0.0008971371571533382, "learning_rate": 0.00014275013054363287, "logits/chosen": -0.7516878247261047, "logits/rejected": 0.8620051145553589, "logps/chosen": -427.61993408203125, "logps/rejected": -537.0158081054688, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.415410041809082, "rewards/margins": 25.761362075805664, "rewards/rejected": -32.17677307128906, "step": 3660 }, { "epoch": 11.402013942680092, "grad_norm": 0.0016706970054656267, "learning_rate": 0.00014217552673659754, "logits/chosen": -0.7678354382514954, "logits/rejected": 0.8684478998184204, "logps/chosen": -422.351806640625, "logps/rejected": -499.56201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.864222526550293, "rewards/margins": 23.08515167236328, "rewards/rejected": -29.94937515258789, "step": 3680 }, { "epoch": 11.463981409759876, "grad_norm": 0.0001611242478247732, "learning_rate": 0.00014159922410549497, "logits/chosen": -0.7930831909179688, "logits/rejected": 0.7886659502983093, "logps/chosen": -411.2481994628906, "logps/rejected": -510.31097412109375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.10078763961792, "rewards/margins": 24.464405059814453, "rewards/rejected": -30.565189361572266, "step": 3700 }, { "epoch": 11.52594887683966, "grad_norm": 0.0010968918213620782, "learning_rate": 0.0001410212458637112, "logits/chosen": -0.750701367855072, "logits/rejected": 0.8467336893081665, "logps/chosen": -411.9034729003906, "logps/rejected": -495.7069396972656, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.123504638671875, "rewards/margins": 24.326147079467773, "rewards/rejected": -30.449655532836914, "step": 3720 }, { "epoch": 11.587916343919442, "grad_norm": 0.0012211522553116083, "learning_rate": 0.00014044161529212543, "logits/chosen": -0.7011710405349731, "logits/rejected": 0.7984441518783569, "logps/chosen": -407.7188415527344, "logps/rejected": -515.7482299804688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.711037635803223, "rewards/margins": 24.880985260009766, "rewards/rejected": -30.59202003479004, "step": 3740 }, { "epoch": 11.649883810999226, "grad_norm": 5.050484469393268e-05, "learning_rate": 0.0001398603557381726, "logits/chosen": -0.7948409914970398, "logits/rejected": 0.8654049634933472, "logps/chosen": -426.082763671875, "logps/rejected": -484.33380126953125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.525465488433838, "rewards/margins": 23.004573822021484, "rewards/rejected": -29.530038833618164, "step": 3760 }, { "epoch": 11.711851278079008, "grad_norm": 0.0003607009712141007, "learning_rate": 0.0001392774906149028, "logits/chosen": -0.8043268918991089, "logits/rejected": 0.8279350996017456, "logps/chosen": -409.0636291503906, "logps/rejected": -511.4398498535156, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.8972344398498535, "rewards/margins": 25.10378074645996, "rewards/rejected": -31.00101661682129, "step": 3780 }, { "epoch": 11.773818745158792, "grad_norm": 0.0004310712101869285, "learning_rate": 0.0001386930434000382, "logits/chosen": -0.6975358128547668, "logits/rejected": 0.9005098342895508, "logps/chosen": -414.76806640625, "logps/rejected": -520.3231201171875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -6.5791015625, "rewards/margins": 24.619598388671875, "rewards/rejected": -31.198699951171875, "step": 3800 }, { "epoch": 11.835786212238574, "grad_norm": 0.0026548670139163733, "learning_rate": 0.00013810703763502744, "logits/chosen": -0.7100318074226379, "logits/rejected": 0.9277878999710083, "logps/chosen": -429.67669677734375, "logps/rejected": -511.8397521972656, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.772208213806152, "rewards/margins": 23.749717712402344, "rewards/rejected": -30.521926879882812, "step": 3820 }, { "epoch": 11.897753679318358, "grad_norm": 0.0004118687065783888, "learning_rate": 0.00013751949692409718, "logits/chosen": -0.6234445571899414, "logits/rejected": 0.8737300634384155, "logps/chosen": -428.03271484375, "logps/rejected": -531.5043334960938, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.512963771820068, "rewards/margins": 24.97269058227539, "rewards/rejected": -32.485652923583984, "step": 3840 }, { "epoch": 11.959721146398142, "grad_norm": 0.0020478537771850824, "learning_rate": 0.00013693044493330166, "logits/chosen": -0.7277485132217407, "logits/rejected": 0.9264734983444214, "logps/chosen": -431.9356384277344, "logps/rejected": -531.0416870117188, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.424185752868652, "rewards/margins": 24.675241470336914, "rewards/rejected": -32.09942626953125, "step": 3860 }, { "epoch": 12.021688613477924, "grad_norm": 0.0008361217333003879, "learning_rate": 0.0001363399053895692, "logits/chosen": -0.782467246055603, "logits/rejected": 0.8881294131278992, "logps/chosen": -434.98773193359375, "logps/rejected": -516.9628295898438, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.331326007843018, "rewards/margins": 24.816944122314453, "rewards/rejected": -31.148269653320312, "step": 3880 }, { "epoch": 12.083656080557708, "grad_norm": 3.445847687544301e-05, "learning_rate": 0.00013574790207974646, "logits/chosen": -0.6909776329994202, "logits/rejected": 0.9060857892036438, "logps/chosen": -430.9913024902344, "logps/rejected": -531.0490112304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.471495628356934, "rewards/margins": 25.3931941986084, "rewards/rejected": -31.86469078063965, "step": 3900 }, { "epoch": 12.14562354763749, "grad_norm": 0.0013186397263780236, "learning_rate": 0.00013515445884964045, "logits/chosen": -0.7101693153381348, "logits/rejected": 0.9304086565971375, "logps/chosen": -414.54608154296875, "logps/rejected": -513.8648071289062, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.092409610748291, "rewards/margins": 24.61441421508789, "rewards/rejected": -31.70682144165039, "step": 3920 }, { "epoch": 12.207591014717273, "grad_norm": 0.0014901352114975452, "learning_rate": 0.00013455959960305798, "logits/chosen": -0.6674474477767944, "logits/rejected": 0.8775323629379272, "logps/chosen": -437.1656799316406, "logps/rejected": -539.7037963867188, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.307480812072754, "rewards/margins": 25.1062068939209, "rewards/rejected": -32.41368865966797, "step": 3940 }, { "epoch": 12.269558481797057, "grad_norm": 0.0019103622762486339, "learning_rate": 0.0001339633483008427, "logits/chosen": -0.7112741470336914, "logits/rejected": 0.9096217155456543, "logps/chosen": -417.7974548339844, "logps/rejected": -520.1793212890625, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.133049011230469, "rewards/margins": 24.84225845336914, "rewards/rejected": -31.975311279296875, "step": 3960 }, { "epoch": 12.33152594887684, "grad_norm": 1.1235245438001584e-05, "learning_rate": 0.00013336572895991016, "logits/chosen": -0.6781491637229919, "logits/rejected": 0.8913220167160034, "logps/chosen": -417.21246337890625, "logps/rejected": -522.6915283203125, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.486870765686035, "rewards/margins": 25.340744018554688, "rewards/rejected": -31.827617645263672, "step": 3980 }, { "epoch": 12.393493415956623, "grad_norm": 0.0010617575608193874, "learning_rate": 0.00013276676565228027, "logits/chosen": -0.7145902514457703, "logits/rejected": 0.9445177316665649, "logps/chosen": -420.17547607421875, "logps/rejected": -519.7775268554688, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.825955867767334, "rewards/margins": 25.554126739501953, "rewards/rejected": -31.380081176757812, "step": 4000 }, { "epoch": 12.455460883036405, "grad_norm": 1.6598849470028654e-05, "learning_rate": 0.00013216648250410776, "logits/chosen": -0.6829872131347656, "logits/rejected": 0.8398195505142212, "logps/chosen": -432.6788635253906, "logps/rejected": -534.5023193359375, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.334891319274902, "rewards/margins": 24.781658172607422, "rewards/rejected": -31.116552352905273, "step": 4020 }, { "epoch": 12.51742835011619, "grad_norm": 0.00028609836590476334, "learning_rate": 0.00013156490369471027, "logits/chosen": -0.7433018684387207, "logits/rejected": 0.9416133761405945, "logps/chosen": -409.298095703125, "logps/rejected": -509.50128173828125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.734123229980469, "rewards/margins": 25.44891357421875, "rewards/rejected": -31.18303871154785, "step": 4040 }, { "epoch": 12.579395817195973, "grad_norm": 0.0008309365948662162, "learning_rate": 0.00013096205345559448, "logits/chosen": -0.6434201002120972, "logits/rejected": 1.0335710048675537, "logps/chosen": -431.5645446777344, "logps/rejected": -512.9036254882812, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.22793436050415, "rewards/margins": 24.197513580322266, "rewards/rejected": -31.42544937133789, "step": 4060 }, { "epoch": 12.641363284275755, "grad_norm": 0.0005798207130283117, "learning_rate": 0.00013035795606948023, "logits/chosen": -0.6275348663330078, "logits/rejected": 0.9419866800308228, "logps/chosen": -420.60870361328125, "logps/rejected": -529.2860717773438, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.123082160949707, "rewards/margins": 25.372949600219727, "rewards/rejected": -32.49603271484375, "step": 4080 }, { "epoch": 12.703330751355539, "grad_norm": 0.0009076519636437297, "learning_rate": 0.00012975263586932208, "logits/chosen": -0.7268190979957581, "logits/rejected": 0.8527728915214539, "logps/chosen": -420.3800354003906, "logps/rejected": -530.3096313476562, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.111058712005615, "rewards/margins": 24.54432487487793, "rewards/rejected": -31.6553897857666, "step": 4100 }, { "epoch": 12.765298218435321, "grad_norm": 0.0008658567676320672, "learning_rate": 0.00012914611723732942, "logits/chosen": -0.7163742780685425, "logits/rejected": 0.9323918223381042, "logps/chosen": -420.14227294921875, "logps/rejected": -508.64093017578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.57541036605835, "rewards/margins": 24.635540008544922, "rewards/rejected": -31.210952758789062, "step": 4120 }, { "epoch": 12.827265685515105, "grad_norm": 0.0006353395874612033, "learning_rate": 0.00012853842460398428, "logits/chosen": -0.6529034972190857, "logits/rejected": 0.9955090284347534, "logps/chosen": -445.478271484375, "logps/rejected": -555.215576171875, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -7.958199977874756, "rewards/margins": 25.986324310302734, "rewards/rejected": -33.94452667236328, "step": 4140 }, { "epoch": 12.889233152594887, "grad_norm": 0.0007925338577479124, "learning_rate": 0.00012792958244705745, "logits/chosen": -0.7437697649002075, "logits/rejected": 0.945330023765564, "logps/chosen": -436.5621032714844, "logps/rejected": -523.8942260742188, "loss": 0.0033, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -6.880484580993652, "rewards/margins": 24.760828018188477, "rewards/rejected": -31.641315460205078, "step": 4160 }, { "epoch": 12.95120061967467, "grad_norm": 0.0009758576052263379, "learning_rate": 0.00012731961529062211, "logits/chosen": -0.707342267036438, "logits/rejected": 1.0127270221710205, "logps/chosen": -429.137451171875, "logps/rejected": -498.2242736816406, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.349567413330078, "rewards/margins": 23.53993797302246, "rewards/rejected": -29.889501571655273, "step": 4180 }, { "epoch": 13.013168086754455, "grad_norm": 0.001348630990833044, "learning_rate": 0.0001267085477040664, "logits/chosen": -0.710185170173645, "logits/rejected": 0.9165050387382507, "logps/chosen": -426.640869140625, "logps/rejected": -530.394287109375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.763472557067871, "rewards/margins": 25.113622665405273, "rewards/rejected": -31.877094268798828, "step": 4200 }, { "epoch": 13.075135553834237, "grad_norm": 0.0003159803745802492, "learning_rate": 0.0001260964043011036, "logits/chosen": -0.7468986511230469, "logits/rejected": 0.946589469909668, "logps/chosen": -449.58062744140625, "logps/rejected": -552.7725219726562, "loss": 0.0043, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.443936824798584, "rewards/margins": 26.3941707611084, "rewards/rejected": -32.83810806274414, "step": 4220 }, { "epoch": 13.13710302091402, "grad_norm": 0.0002802180533763021, "learning_rate": 0.0001254832097387808, "logits/chosen": -0.7415999174118042, "logits/rejected": 0.8606869578361511, "logps/chosen": -412.0994567871094, "logps/rejected": -522.6536254882812, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -6.73651123046875, "rewards/margins": 25.14139175415039, "rewards/rejected": -31.877904891967773, "step": 4240 }, { "epoch": 13.199070487993803, "grad_norm": 0.0003081171598751098, "learning_rate": 0.0001248689887164855, "logits/chosen": -0.6822153329849243, "logits/rejected": 0.9707147479057312, "logps/chosen": -426.1729431152344, "logps/rejected": -523.9768676757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.399308681488037, "rewards/margins": 25.116762161254883, "rewards/rejected": -31.516071319580078, "step": 4260 }, { "epoch": 13.261037955073586, "grad_norm": 0.0007231299532577395, "learning_rate": 0.0001242537659749509, "logits/chosen": -0.6475167274475098, "logits/rejected": 0.9226775169372559, "logps/chosen": -428.57855224609375, "logps/rejected": -549.84228515625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -6.925917148590088, "rewards/margins": 25.896963119506836, "rewards/rejected": -32.82288360595703, "step": 4280 }, { "epoch": 13.32300542215337, "grad_norm": 0.0007026797975413501, "learning_rate": 0.00012363756629525937, "logits/chosen": -0.6976606845855713, "logits/rejected": 0.9327031970024109, "logps/chosen": -425.42449951171875, "logps/rejected": -524.47509765625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.578268527984619, "rewards/margins": 25.798770904541016, "rewards/rejected": -32.377037048339844, "step": 4300 }, { "epoch": 13.384972889233152, "grad_norm": 5.637519643642008e-05, "learning_rate": 0.00012302041449784409, "logits/chosen": -0.6870549917221069, "logits/rejected": 0.9386127591133118, "logps/chosen": -427.32501220703125, "logps/rejected": -519.8970336914062, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.1073102951049805, "rewards/margins": 24.731271743774414, "rewards/rejected": -31.83858299255371, "step": 4320 }, { "epoch": 13.446940356312936, "grad_norm": 0.0001799424208002165, "learning_rate": 0.00012240233544148955, "logits/chosen": -0.6837178468704224, "logits/rejected": 0.9757431149482727, "logps/chosen": -432.7763671875, "logps/rejected": -525.1026611328125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.646360874176025, "rewards/margins": 25.124706268310547, "rewards/rejected": -31.771068572998047, "step": 4340 }, { "epoch": 13.508907823392718, "grad_norm": 0.0015866424655541778, "learning_rate": 0.00012178335402232996, "logits/chosen": -0.6187258958816528, "logits/rejected": 0.9561142921447754, "logps/chosen": -419.75164794921875, "logps/rejected": -530.4887084960938, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.826728820800781, "rewards/margins": 25.215198516845703, "rewards/rejected": -32.041927337646484, "step": 4360 }, { "epoch": 13.570875290472502, "grad_norm": 0.00038470287108793855, "learning_rate": 0.00012116349517284665, "logits/chosen": -0.7070793509483337, "logits/rejected": 0.9476947784423828, "logps/chosen": -416.0244140625, "logps/rejected": -527.4065551757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.205079555511475, "rewards/margins": 24.985538482666016, "rewards/rejected": -32.19062042236328, "step": 4380 }, { "epoch": 13.632842757552286, "grad_norm": 0.0010880132904276252, "learning_rate": 0.00012054278386086368, "logits/chosen": -0.6799092292785645, "logits/rejected": 0.9883368611335754, "logps/chosen": -415.4871520996094, "logps/rejected": -502.75994873046875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.845544338226318, "rewards/margins": 24.369251251220703, "rewards/rejected": -31.214794158935547, "step": 4400 }, { "epoch": 13.694810224632068, "grad_norm": 0.0005410652374848723, "learning_rate": 0.0001199523412929886, "logits/chosen": -0.6802583336830139, "logits/rejected": 1.0727746486663818, "logps/chosen": -414.32733154296875, "logps/rejected": -502.4496154785156, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -6.712512016296387, "rewards/margins": 24.851696014404297, "rewards/rejected": -31.564212799072266, "step": 4420 }, { "epoch": 13.756777691711852, "grad_norm": 0.0004551692109089345, "learning_rate": 0.00011933003962196613, "logits/chosen": -0.6473695039749146, "logits/rejected": 0.9640854597091675, "logps/chosen": -416.96881103515625, "logps/rejected": -520.4708251953125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.991703987121582, "rewards/margins": 24.429357528686523, "rewards/rejected": -31.42106056213379, "step": 4440 }, { "epoch": 13.818745158791634, "grad_norm": 1.8331444152863696e-05, "learning_rate": 0.00011870695933976628, "logits/chosen": -0.7241955995559692, "logits/rejected": 0.8486202359199524, "logps/chosen": -418.4485778808594, "logps/rejected": -525.0150756835938, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.720297813415527, "rewards/margins": 24.988901138305664, "rewards/rejected": -31.709197998046875, "step": 4460 }, { "epoch": 13.880712625871418, "grad_norm": 4.146054925513454e-05, "learning_rate": 0.00011808312554397192, "logits/chosen": -0.5974981188774109, "logits/rejected": 0.948889434337616, "logps/chosen": -439.1353454589844, "logps/rejected": -556.8853759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.1817474365234375, "rewards/margins": 25.7791805267334, "rewards/rejected": -32.96092987060547, "step": 4480 }, { "epoch": 13.9426800929512, "grad_norm": 4.285129398340359e-05, "learning_rate": 0.00011745856336251742, "logits/chosen": -0.7139695286750793, "logits/rejected": 0.9608259201049805, "logps/chosen": -419.91900634765625, "logps/rejected": -496.1962890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.308480739593506, "rewards/margins": 23.839679718017578, "rewards/rejected": -31.148162841796875, "step": 4500 }, { "epoch": 14.004647560030984, "grad_norm": 0.0007021346245892346, "learning_rate": 0.00011683329795267636, "logits/chosen": -0.7038004994392395, "logits/rejected": 1.0250012874603271, "logps/chosen": -437.5606384277344, "logps/rejected": -532.3974609375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.6007537841796875, "rewards/margins": 26.052099227905273, "rewards/rejected": -32.652854919433594, "step": 4520 }, { "epoch": 14.066615027110767, "grad_norm": 0.0006834513042122126, "learning_rate": 0.00011620735450004829, "logits/chosen": -0.6422589421272278, "logits/rejected": 0.9730979800224304, "logps/chosen": -436.33941650390625, "logps/rejected": -526.1567993164062, "loss": 0.0065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.394343376159668, "rewards/margins": 24.573226928710938, "rewards/rejected": -31.967571258544922, "step": 4540 }, { "epoch": 14.12858249419055, "grad_norm": 0.0012112940894439816, "learning_rate": 0.00011558075821754417, "logits/chosen": -0.5923640131950378, "logits/rejected": 1.0607713460922241, "logps/chosen": -416.2725524902344, "logps/rejected": -506.9697265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.253162384033203, "rewards/margins": 24.898513793945312, "rewards/rejected": -31.15167808532715, "step": 4560 }, { "epoch": 14.190549961270333, "grad_norm": 0.00040532436105422676, "learning_rate": 0.00011495353434437098, "logits/chosen": -0.5979864001274109, "logits/rejected": 1.0398799180984497, "logps/chosen": -425.46356201171875, "logps/rejected": -538.7312622070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.892348289489746, "rewards/margins": 26.227279663085938, "rewards/rejected": -33.119632720947266, "step": 4580 }, { "epoch": 14.252517428350115, "grad_norm": 8.036774670472369e-05, "learning_rate": 0.00011432570814501478, "logits/chosen": -0.6628149151802063, "logits/rejected": 1.0684958696365356, "logps/chosen": -420.6571350097656, "logps/rejected": -505.7470703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.971704959869385, "rewards/margins": 24.749691009521484, "rewards/rejected": -31.721399307250977, "step": 4600 }, { "epoch": 14.3144848954299, "grad_norm": 2.7621756089502014e-05, "learning_rate": 0.00011369730490822336, "logits/chosen": -0.6400030255317688, "logits/rejected": 1.14774489402771, "logps/chosen": -435.0557556152344, "logps/rejected": -527.8553466796875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.974684715270996, "rewards/margins": 24.852916717529297, "rewards/rejected": -32.827606201171875, "step": 4620 }, { "epoch": 14.376452362509683, "grad_norm": 0.0011475204955786467, "learning_rate": 0.0001130683499459875, "logits/chosen": -0.6620621681213379, "logits/rejected": 1.0378706455230713, "logps/chosen": -432.5133361816406, "logps/rejected": -552.24072265625, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.94607400894165, "rewards/margins": 27.082103729248047, "rewards/rejected": -34.02817916870117, "step": 4640 }, { "epoch": 14.438419829589465, "grad_norm": 0.001489471411332488, "learning_rate": 0.00011243886859252135, "logits/chosen": -0.7889328002929688, "logits/rejected": 1.0310903787612915, "logps/chosen": -425.72125244140625, "logps/rejected": -507.666259765625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.0838117599487305, "rewards/margins": 24.99704933166504, "rewards/rejected": -32.08086395263672, "step": 4660 }, { "epoch": 14.500387296669249, "grad_norm": 0.00015960348537191749, "learning_rate": 0.00011180888620324205, "logits/chosen": -0.6674115061759949, "logits/rejected": 1.0456359386444092, "logps/chosen": -418.9027404785156, "logps/rejected": -499.97003173828125, "loss": 0.0043, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -6.929097652435303, "rewards/margins": 24.343183517456055, "rewards/rejected": -31.272281646728516, "step": 4680 }, { "epoch": 14.562354763749031, "grad_norm": 0.0009649236453697085, "learning_rate": 0.00011117842815374835, "logits/chosen": -0.6228715181350708, "logits/rejected": 0.9750884175300598, "logps/chosen": -421.4877014160156, "logps/rejected": -532.0631103515625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.449153900146484, "rewards/margins": 25.748123168945312, "rewards/rejected": -32.19727325439453, "step": 4700 }, { "epoch": 14.624322230828815, "grad_norm": 0.0010970581788569689, "learning_rate": 0.00011054751983879859, "logits/chosen": -0.6098747849464417, "logits/rejected": 1.016177773475647, "logps/chosen": -426.1904296875, "logps/rejected": -536.5009765625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.70578670501709, "rewards/margins": 25.852767944335938, "rewards/rejected": -32.558555603027344, "step": 4720 }, { "epoch": 14.686289697908599, "grad_norm": 0.000131874781800434, "learning_rate": 0.00010991618667128769, "logits/chosen": -0.609255850315094, "logits/rejected": 1.0346933603286743, "logps/chosen": -423.99566650390625, "logps/rejected": -517.832275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.696162223815918, "rewards/margins": 24.211074829101562, "rewards/rejected": -31.907236099243164, "step": 4740 }, { "epoch": 14.748257164988381, "grad_norm": 0.0003271376190241426, "learning_rate": 0.00010928445408122361, "logits/chosen": -0.5123564004898071, "logits/rejected": 0.9423815011978149, "logps/chosen": -412.5267639160156, "logps/rejected": -537.0595703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.50037145614624, "rewards/margins": 25.577926635742188, "rewards/rejected": -33.07830047607422, "step": 4760 }, { "epoch": 14.810224632068165, "grad_norm": 0.0003270464367233217, "learning_rate": 0.00010865234751470288, "logits/chosen": -0.603384792804718, "logits/rejected": 1.1043418645858765, "logps/chosen": -415.77313232421875, "logps/rejected": -513.60986328125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.077917575836182, "rewards/margins": 25.412960052490234, "rewards/rejected": -32.49087905883789, "step": 4780 }, { "epoch": 14.872192099147947, "grad_norm": 0.0009512171382084489, "learning_rate": 0.00010801989243288589, "logits/chosen": -0.7142239809036255, "logits/rejected": 1.0645368099212646, "logps/chosen": -416.17230224609375, "logps/rejected": -509.03082275390625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -6.745335578918457, "rewards/margins": 24.773212432861328, "rewards/rejected": -31.5185489654541, "step": 4800 }, { "epoch": 14.93415956622773, "grad_norm": 0.0008968279580585659, "learning_rate": 0.00010738711431097112, "logits/chosen": -0.7670684456825256, "logits/rejected": 1.1087074279785156, "logps/chosen": -419.56573486328125, "logps/rejected": -510.56365966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.202185153961182, "rewards/margins": 25.509662628173828, "rewards/rejected": -31.711849212646484, "step": 4820 }, { "epoch": 14.996127033307513, "grad_norm": 0.00017500368994660676, "learning_rate": 0.00010675403863716907, "logits/chosen": -0.7169899344444275, "logits/rejected": 0.8955768346786499, "logps/chosen": -428.79864501953125, "logps/rejected": -524.6358642578125, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.424875736236572, "rewards/margins": 24.901775360107422, "rewards/rejected": -32.3266487121582, "step": 4840 }, { "epoch": 15.058094500387297, "grad_norm": 0.00016677333042025566, "learning_rate": 0.00010612069091167551, "logits/chosen": -0.641793966293335, "logits/rejected": 1.0156922340393066, "logps/chosen": -432.36248779296875, "logps/rejected": -538.47998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.848368167877197, "rewards/margins": 25.29831314086914, "rewards/rejected": -32.14667892456055, "step": 4860 }, { "epoch": 15.12006196746708, "grad_norm": 0.0034633041359484196, "learning_rate": 0.00010548709664564449, "logits/chosen": -0.6778856515884399, "logits/rejected": 0.9707983732223511, "logps/chosen": -426.789794921875, "logps/rejected": -546.7899169921875, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.392895698547363, "rewards/margins": 26.034061431884766, "rewards/rejected": -33.42695999145508, "step": 4880 }, { "epoch": 15.182029434546862, "grad_norm": 0.0001036279572872445, "learning_rate": 0.00010485328136016071, "logits/chosen": -0.6301292181015015, "logits/rejected": 0.9921154975891113, "logps/chosen": -420.39520263671875, "logps/rejected": -544.0909423828125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.199660778045654, "rewards/margins": 26.49783706665039, "rewards/rejected": -32.69750213623047, "step": 4900 }, { "epoch": 15.243996901626646, "grad_norm": 0.0009169202530756593, "learning_rate": 0.00010421927058521137, "logits/chosen": -0.6611192226409912, "logits/rejected": 1.050333023071289, "logps/chosen": -428.5755920410156, "logps/rejected": -530.2979125976562, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.6709885597229, "rewards/margins": 25.375732421875, "rewards/rejected": -33.04671859741211, "step": 4920 }, { "epoch": 15.305964368706428, "grad_norm": 0.0017765266820788383, "learning_rate": 0.00010358508985865813, "logits/chosen": -0.6134442687034607, "logits/rejected": 1.212090253829956, "logps/chosen": -415.17413330078125, "logps/rejected": -492.6800231933594, "loss": 0.0043, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -7.1799468994140625, "rewards/margins": 24.78128433227539, "rewards/rejected": -31.961233139038086, "step": 4940 }, { "epoch": 15.367931835786212, "grad_norm": 0.000361295067705214, "learning_rate": 0.00010295076472520812, "logits/chosen": -0.6132256984710693, "logits/rejected": 1.1052106618881226, "logps/chosen": -408.4927673339844, "logps/rejected": -502.8935546875, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.815627098083496, "rewards/margins": 24.847381591796875, "rewards/rejected": -31.663009643554688, "step": 4960 }, { "epoch": 15.429899302865996, "grad_norm": 2.1386760636232793e-05, "learning_rate": 0.00010231632073538522, "logits/chosen": -0.6892057061195374, "logits/rejected": 1.0478392839431763, "logps/chosen": -440.66754150390625, "logps/rejected": -533.3825073242188, "loss": 0.0043, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -6.8489556312561035, "rewards/margins": 25.716054916381836, "rewards/rejected": -32.56501388549805, "step": 4980 }, { "epoch": 15.491866769945778, "grad_norm": 0.0018733438337221742, "learning_rate": 0.00010168178344450086, "logits/chosen": -0.5057135820388794, "logits/rejected": 0.988396167755127, "logps/chosen": -407.16107177734375, "logps/rejected": -538.1986083984375, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.880420684814453, "rewards/margins": 25.703664779663086, "rewards/rejected": -32.584083557128906, "step": 5000 }, { "epoch": 15.553834237025562, "grad_norm": 0.001603521523065865, "learning_rate": 0.00010104717841162458, "logits/chosen": -0.679779052734375, "logits/rejected": 0.9824868440628052, "logps/chosen": -441.1270446777344, "logps/rejected": -546.2282104492188, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.309430122375488, "rewards/margins": 26.078725814819336, "rewards/rejected": -32.38815689086914, "step": 5020 }, { "epoch": 15.615801704105344, "grad_norm": 9.475573460804299e-05, "learning_rate": 0.0001004125311985546, "logits/chosen": -0.6173331141471863, "logits/rejected": 1.0997240543365479, "logps/chosen": -423.6936950683594, "logps/rejected": -530.4503173828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.953721046447754, "rewards/margins": 26.448678970336914, "rewards/rejected": -33.40239715576172, "step": 5040 }, { "epoch": 15.677769171185128, "grad_norm": 0.00013075934839434922, "learning_rate": 9.977786736878808e-05, "logits/chosen": -0.6333028674125671, "logits/rejected": 0.9802857637405396, "logps/chosen": -437.1517639160156, "logps/rejected": -530.119384765625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.329917907714844, "rewards/margins": 24.84552001953125, "rewards/rejected": -32.17544174194336, "step": 5060 }, { "epoch": 15.739736638264912, "grad_norm": 9.669300925452262e-05, "learning_rate": 9.914321248649153e-05, "logits/chosen": -0.6871415376663208, "logits/rejected": 1.0859472751617432, "logps/chosen": -440.9861755371094, "logps/rejected": -540.0758056640625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.136324882507324, "rewards/margins": 26.24369239807129, "rewards/rejected": -33.3800163269043, "step": 5080 }, { "epoch": 15.801704105344694, "grad_norm": 0.0001261440193047747, "learning_rate": 9.85085921154711e-05, "logits/chosen": -0.5889202356338501, "logits/rejected": 1.0418097972869873, "logps/chosen": -428.19598388671875, "logps/rejected": -546.845947265625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.5799078941345215, "rewards/margins": 26.233631134033203, "rewards/rejected": -33.81353759765625, "step": 5100 }, { "epoch": 15.863671572424478, "grad_norm": 0.00021416415984276682, "learning_rate": 9.787403181814281e-05, "logits/chosen": -0.6271970868110657, "logits/rejected": 0.9168373346328735, "logps/chosen": -425.2923889160156, "logps/rejected": -527.7469482421875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.666965484619141, "rewards/margins": 24.424535751342773, "rewards/rejected": -32.09149932861328, "step": 5120 }, { "epoch": 15.92563903950426, "grad_norm": 0.00090818852186203, "learning_rate": 9.723955715450287e-05, "logits/chosen": -0.6783554553985596, "logits/rejected": 0.9546459913253784, "logps/chosen": -422.9801330566406, "logps/rejected": -557.9986572265625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.099810600280762, "rewards/margins": 27.2694091796875, "rewards/rejected": -34.36921691894531, "step": 5140 }, { "epoch": 15.987606506584044, "grad_norm": 0.00033679328043945134, "learning_rate": 9.660519368109823e-05, "logits/chosen": -0.6659687757492065, "logits/rejected": 1.0432156324386597, "logps/chosen": -430.2832946777344, "logps/rejected": -544.5806884765625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.264108180999756, "rewards/margins": 26.98138427734375, "rewards/rejected": -34.24549102783203, "step": 5160 }, { "epoch": 16.049573973663826, "grad_norm": 0.00020042255346197635, "learning_rate": 9.597096694999715e-05, "logits/chosen": -0.5894029140472412, "logits/rejected": 1.1150057315826416, "logps/chosen": -451.55133056640625, "logps/rejected": -567.7293701171875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.232612609863281, "rewards/margins": 26.292133331298828, "rewards/rejected": -34.524742126464844, "step": 5180 }, { "epoch": 16.11154144074361, "grad_norm": 0.00011073077621404082, "learning_rate": 9.53369025077598e-05, "logits/chosen": -0.5697144269943237, "logits/rejected": 1.127539038658142, "logps/chosen": -430.92901611328125, "logps/rejected": -548.5370483398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.460732460021973, "rewards/margins": 26.866458892822266, "rewards/rejected": -34.32719039916992, "step": 5200 }, { "epoch": 16.173508907823393, "grad_norm": 0.00015952142712194473, "learning_rate": 9.470302589440952e-05, "logits/chosen": -0.6166108250617981, "logits/rejected": 1.1184258460998535, "logps/chosen": -431.86761474609375, "logps/rejected": -538.8045043945312, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.1826887130737305, "rewards/margins": 26.133398056030273, "rewards/rejected": -33.31608581542969, "step": 5220 }, { "epoch": 16.235476374903175, "grad_norm": 0.0005165811162441969, "learning_rate": 9.406936264240386e-05, "logits/chosen": -0.5293561816215515, "logits/rejected": 1.0149633884429932, "logps/chosen": -411.24774169921875, "logps/rejected": -552.6751708984375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.941656589508057, "rewards/margins": 26.677719116210938, "rewards/rejected": -33.61937713623047, "step": 5240 }, { "epoch": 16.297443841982957, "grad_norm": 0.00018194419681094587, "learning_rate": 9.343593827560617e-05, "logits/chosen": -0.5177640318870544, "logits/rejected": 1.0884605646133423, "logps/chosen": -436.27557373046875, "logps/rejected": -555.76611328125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.657900333404541, "rewards/margins": 26.019603729248047, "rewards/rejected": -33.67750549316406, "step": 5260 }, { "epoch": 16.359411309062743, "grad_norm": 1.4249508240027353e-05, "learning_rate": 9.280277830825763e-05, "logits/chosen": -0.566566526889801, "logits/rejected": 1.115505576133728, "logps/chosen": -420.29864501953125, "logps/rejected": -530.7322387695312, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.4629082679748535, "rewards/margins": 26.702518463134766, "rewards/rejected": -33.165428161621094, "step": 5280 }, { "epoch": 16.421378776142525, "grad_norm": 0.00037811213405802846, "learning_rate": 9.216990824394937e-05, "logits/chosen": -0.6209192872047424, "logits/rejected": 1.0690656900405884, "logps/chosen": -428.49700927734375, "logps/rejected": -556.0371704101562, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -6.307267189025879, "rewards/margins": 28.177288055419922, "rewards/rejected": -34.484554290771484, "step": 5300 }, { "epoch": 16.483346243222307, "grad_norm": 0.0006744434358552098, "learning_rate": 9.15373535745953e-05, "logits/chosen": -0.6343203186988831, "logits/rejected": 1.017704725265503, "logps/chosen": -426.8873596191406, "logps/rejected": -555.207275390625, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.941340446472168, "rewards/margins": 26.214609146118164, "rewards/rejected": -33.15595245361328, "step": 5320 }, { "epoch": 16.545313710302093, "grad_norm": 0.00023284759663511068, "learning_rate": 9.090513977940532e-05, "logits/chosen": -0.5838009119033813, "logits/rejected": 1.173878788948059, "logps/chosen": -431.891845703125, "logps/rejected": -531.1908569335938, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.410466194152832, "rewards/margins": 25.734268188476562, "rewards/rejected": -33.14473342895508, "step": 5340 }, { "epoch": 16.607281177381875, "grad_norm": 0.0001960826339200139, "learning_rate": 9.027329232385887e-05, "logits/chosen": -0.7288259267807007, "logits/rejected": 1.11318039894104, "logps/chosen": -422.9234313964844, "logps/rejected": -513.193115234375, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.876204490661621, "rewards/margins": 25.526058197021484, "rewards/rejected": -32.40226364135742, "step": 5360 }, { "epoch": 16.669248644461657, "grad_norm": 0.00011543634900590405, "learning_rate": 8.96418366586793e-05, "logits/chosen": -0.6046707034111023, "logits/rejected": 1.0723017454147339, "logps/chosen": -433.06646728515625, "logps/rejected": -532.0721435546875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.24627685546875, "rewards/margins": 25.937397003173828, "rewards/rejected": -33.18367004394531, "step": 5380 }, { "epoch": 16.73121611154144, "grad_norm": 0.00011998928675893694, "learning_rate": 8.901079821880882e-05, "logits/chosen": -0.5506830811500549, "logits/rejected": 1.1579818725585938, "logps/chosen": -437.7366638183594, "logps/rejected": -554.2669067382812, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.844750881195068, "rewards/margins": 26.484683990478516, "rewards/rejected": -34.329429626464844, "step": 5400 }, { "epoch": 16.793183578621225, "grad_norm": 0.00036458164686337113, "learning_rate": 8.838020242238367e-05, "logits/chosen": -0.5875279903411865, "logits/rejected": 1.0199635028839111, "logps/chosen": -430.7936096191406, "logps/rejected": -550.0620727539062, "loss": 0.0043, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.758522033691406, "rewards/margins": 25.7502498626709, "rewards/rejected": -33.50877380371094, "step": 5420 }, { "epoch": 16.855151045701007, "grad_norm": 4.471308056963608e-05, "learning_rate": 8.775007466971067e-05, "logits/chosen": -0.5515426397323608, "logits/rejected": 1.3305522203445435, "logps/chosen": -421.59027099609375, "logps/rejected": -505.4217224121094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.508360385894775, "rewards/margins": 25.7374210357666, "rewards/rejected": -32.24578094482422, "step": 5440 }, { "epoch": 16.91711851278079, "grad_norm": 0.0005242056213319302, "learning_rate": 8.712044034224374e-05, "logits/chosen": -0.5384324789047241, "logits/rejected": 1.0426101684570312, "logps/chosen": -419.22314453125, "logps/rejected": -534.5950317382812, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.432672500610352, "rewards/margins": 25.05026626586914, "rewards/rejected": -33.48293685913086, "step": 5460 }, { "epoch": 16.979085979860574, "grad_norm": 0.00010303401359124109, "learning_rate": 8.649132480156181e-05, "logits/chosen": -0.5100408792495728, "logits/rejected": 1.2015564441680908, "logps/chosen": -420.74835205078125, "logps/rejected": -532.5908203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.649355411529541, "rewards/margins": 26.004283905029297, "rewards/rejected": -33.65364456176758, "step": 5480 }, { "epoch": 17.041053446940357, "grad_norm": 0.00029292888939380646, "learning_rate": 8.586275338834718e-05, "logits/chosen": -0.6359528303146362, "logits/rejected": 1.1911952495574951, "logps/chosen": -427.14605712890625, "logps/rejected": -508.2286071777344, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.083102226257324, "rewards/margins": 24.796707153320312, "rewards/rejected": -31.879810333251953, "step": 5500 }, { "epoch": 17.10302091402014, "grad_norm": 0.00010459234908921644, "learning_rate": 8.523475142136463e-05, "logits/chosen": -0.7133889198303223, "logits/rejected": 1.1848968267440796, "logps/chosen": -435.04644775390625, "logps/rejected": -545.0972900390625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.787796974182129, "rewards/margins": 27.31494140625, "rewards/rejected": -34.10273742675781, "step": 5520 }, { "epoch": 17.164988381099924, "grad_norm": 0.00017392283189110458, "learning_rate": 8.460734419644185e-05, "logits/chosen": -0.5804117918014526, "logits/rejected": 1.1580560207366943, "logps/chosen": -441.470703125, "logps/rejected": -547.8115234375, "loss": 0.0043, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -6.628912448883057, "rewards/margins": 26.92593765258789, "rewards/rejected": -33.554847717285156, "step": 5540 }, { "epoch": 17.226955848179706, "grad_norm": 0.00011059839744120836, "learning_rate": 8.398055698545043e-05, "logits/chosen": -0.6109145879745483, "logits/rejected": 1.0190441608428955, "logps/chosen": -426.3067321777344, "logps/rejected": -557.4554443359375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.038537502288818, "rewards/margins": 27.249805450439453, "rewards/rejected": -34.2883415222168, "step": 5560 }, { "epoch": 17.28892331525949, "grad_norm": 5.200642044655979e-05, "learning_rate": 8.33544150352878e-05, "logits/chosen": -0.6560173034667969, "logits/rejected": 1.1862332820892334, "logps/chosen": -432.2843322753906, "logps/rejected": -529.6439819335938, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.714084625244141, "rewards/margins": 26.171829223632812, "rewards/rejected": -32.88591766357422, "step": 5580 }, { "epoch": 17.35089078233927, "grad_norm": 0.00016506008978467435, "learning_rate": 8.272894356686039e-05, "logits/chosen": -0.660111665725708, "logits/rejected": 1.1549274921417236, "logps/chosen": -441.23822021484375, "logps/rejected": -559.5040893554688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.083149909973145, "rewards/margins": 27.01141357421875, "rewards/rejected": -35.09456253051758, "step": 5600 }, { "epoch": 17.412858249419056, "grad_norm": 0.00016628840239718556, "learning_rate": 8.210416777406774e-05, "logits/chosen": -0.5330893993377686, "logits/rejected": 1.0295093059539795, "logps/chosen": -420.89794921875, "logps/rejected": -539.91064453125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.301202774047852, "rewards/margins": 24.926477432250977, "rewards/rejected": -33.22768020629883, "step": 5620 }, { "epoch": 17.474825716498838, "grad_norm": 0.0009920025477185845, "learning_rate": 8.148011282278772e-05, "logits/chosen": -0.5239976048469543, "logits/rejected": 1.067887306213379, "logps/chosen": -454.75970458984375, "logps/rejected": -577.5914306640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.137965202331543, "rewards/margins": 26.359233856201172, "rewards/rejected": -34.49720001220703, "step": 5640 }, { "epoch": 17.53679318357862, "grad_norm": 3.656623812275939e-05, "learning_rate": 8.085680384986276e-05, "logits/chosen": -0.716755747795105, "logits/rejected": 1.1244462728500366, "logps/chosen": -430.6189880371094, "logps/rejected": -534.71875, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.90305233001709, "rewards/margins": 27.014053344726562, "rewards/rejected": -33.91710662841797, "step": 5660 }, { "epoch": 17.598760650658406, "grad_norm": 0.00035754471900872886, "learning_rate": 8.023426596208739e-05, "logits/chosen": -0.5729060769081116, "logits/rejected": 1.239874005317688, "logps/chosen": -421.12432861328125, "logps/rejected": -516.0450439453125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.880531311035156, "rewards/margins": 25.47823715209961, "rewards/rejected": -32.35877227783203, "step": 5680 }, { "epoch": 17.660728117738188, "grad_norm": 0.00047625869046896696, "learning_rate": 7.961252423519696e-05, "logits/chosen": -0.5793383121490479, "logits/rejected": 1.1896495819091797, "logps/chosen": -414.25518798828125, "logps/rejected": -531.8538818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.382920742034912, "rewards/margins": 26.68353271484375, "rewards/rejected": -33.06645584106445, "step": 5700 }, { "epoch": 17.72269558481797, "grad_norm": 8.376881305593997e-05, "learning_rate": 7.899160371285761e-05, "logits/chosen": -0.5482260584831238, "logits/rejected": 1.0082305669784546, "logps/chosen": -432.7474670410156, "logps/rejected": -554.15478515625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.3407301902771, "rewards/margins": 25.97622299194336, "rewards/rejected": -33.31695556640625, "step": 5720 }, { "epoch": 17.784663051897752, "grad_norm": 0.00044510714360512793, "learning_rate": 7.837152940565741e-05, "logits/chosen": -0.5911335945129395, "logits/rejected": 1.0798364877700806, "logps/chosen": -428.51202392578125, "logps/rejected": -549.1173095703125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.572705268859863, "rewards/margins": 26.540054321289062, "rewards/rejected": -34.11275863647461, "step": 5740 }, { "epoch": 17.846630518977538, "grad_norm": 0.00010473801376065239, "learning_rate": 7.775232629009904e-05, "logits/chosen": -0.545853316783905, "logits/rejected": 1.2726647853851318, "logps/chosen": -423.32196044921875, "logps/rejected": -526.806640625, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.698158264160156, "rewards/margins": 26.736377716064453, "rewards/rejected": -33.434532165527344, "step": 5760 }, { "epoch": 17.90859798605732, "grad_norm": 0.0019047368550673127, "learning_rate": 7.713401930759365e-05, "logits/chosen": -0.4332125782966614, "logits/rejected": 1.0440775156021118, "logps/chosen": -416.0174865722656, "logps/rejected": -550.7855834960938, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.867218971252441, "rewards/margins": 26.22536849975586, "rewards/rejected": -34.09259033203125, "step": 5780 }, { "epoch": 17.9705654531371, "grad_norm": 0.00020266433421056718, "learning_rate": 7.651663336345642e-05, "logits/chosen": -0.6614434719085693, "logits/rejected": 1.2425765991210938, "logps/chosen": -412.87200927734375, "logps/rejected": -513.8663940429688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.660252571105957, "rewards/margins": 27.139822006225586, "rewards/rejected": -32.80007553100586, "step": 5800 }, { "epoch": 18.032532920216887, "grad_norm": 0.0003341589472256601, "learning_rate": 7.590019332590315e-05, "logits/chosen": -0.5724080801010132, "logits/rejected": 1.1166841983795166, "logps/chosen": -428.62835693359375, "logps/rejected": -538.9694213867188, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.769497871398926, "rewards/margins": 25.40299415588379, "rewards/rejected": -33.17249298095703, "step": 5820 }, { "epoch": 18.09450038729667, "grad_norm": 0.0004949842114001513, "learning_rate": 7.528472402504862e-05, "logits/chosen": -0.5866089463233948, "logits/rejected": 1.1626102924346924, "logps/chosen": -438.97210693359375, "logps/rejected": -541.1741943359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.18331241607666, "rewards/margins": 25.09009552001953, "rewards/rejected": -33.27341079711914, "step": 5840 }, { "epoch": 18.15646785437645, "grad_norm": 6.053561810404062e-05, "learning_rate": 7.467025025190657e-05, "logits/chosen": -0.5582033395767212, "logits/rejected": 1.0998704433441162, "logps/chosen": -408.2658386230469, "logps/rejected": -540.3822631835938, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.132643222808838, "rewards/margins": 26.61345863342285, "rewards/rejected": -33.74610137939453, "step": 5860 }, { "epoch": 18.218435321456237, "grad_norm": 0.0005991118378005922, "learning_rate": 7.405679675739096e-05, "logits/chosen": -0.56462162733078, "logits/rejected": 1.2322356700897217, "logps/chosen": -431.82940673828125, "logps/rejected": -530.2599487304688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.95839786529541, "rewards/margins": 26.209026336669922, "rewards/rejected": -33.16742706298828, "step": 5880 }, { "epoch": 18.28040278853602, "grad_norm": 0.002049060305580497, "learning_rate": 7.344438825131911e-05, "logits/chosen": -0.6229193210601807, "logits/rejected": 1.1154416799545288, "logps/chosen": -423.47125244140625, "logps/rejected": -539.2529907226562, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.331400394439697, "rewards/margins": 26.0443172454834, "rewards/rejected": -33.3757209777832, "step": 5900 }, { "epoch": 18.3423702556158, "grad_norm": 0.0006644345703534782, "learning_rate": 7.283304940141637e-05, "logits/chosen": -0.5369777679443359, "logits/rejected": 1.0778067111968994, "logps/chosen": -421.04803466796875, "logps/rejected": -554.5689697265625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.97817325592041, "rewards/margins": 27.166324615478516, "rewards/rejected": -34.14449691772461, "step": 5920 }, { "epoch": 18.404337722695583, "grad_norm": 0.0002038206730503589, "learning_rate": 7.222280483232242e-05, "logits/chosen": -0.6521028876304626, "logits/rejected": 1.2157747745513916, "logps/chosen": -426.2845153808594, "logps/rejected": -528.0088500976562, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -6.181411266326904, "rewards/margins": 26.629674911499023, "rewards/rejected": -32.81108856201172, "step": 5940 }, { "epoch": 18.46630518977537, "grad_norm": 9.838932601269335e-05, "learning_rate": 7.161367912459954e-05, "logits/chosen": -0.5511382222175598, "logits/rejected": 1.1784141063690186, "logps/chosen": -440.888916015625, "logps/rejected": -546.6519165039062, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.140920639038086, "rewards/margins": 26.043033599853516, "rewards/rejected": -34.18395233154297, "step": 5960 }, { "epoch": 18.52827265685515, "grad_norm": 0.0001888351107481867, "learning_rate": 7.100569681374245e-05, "logits/chosen": -0.6422185301780701, "logits/rejected": 1.0930224657058716, "logps/chosen": -432.44708251953125, "logps/rejected": -569.1817626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.413304328918457, "rewards/margins": 27.831863403320312, "rewards/rejected": -35.24516677856445, "step": 5980 }, { "epoch": 18.590240123934933, "grad_norm": 0.00025130840367637575, "learning_rate": 7.039888238918993e-05, "logits/chosen": -0.6784704327583313, "logits/rejected": 1.2808759212493896, "logps/chosen": -424.71820068359375, "logps/rejected": -530.2955322265625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.33709192276001, "rewards/margins": 27.75899314880371, "rewards/rejected": -34.0960807800293, "step": 6000 }, { "epoch": 18.65220759101472, "grad_norm": 0.000115082977572456, "learning_rate": 6.979326029333855e-05, "logits/chosen": -0.5961139798164368, "logits/rejected": 1.1904783248901367, "logps/chosen": -424.91619873046875, "logps/rejected": -537.8231201171875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.321267127990723, "rewards/margins": 27.020503997802734, "rewards/rejected": -33.341773986816406, "step": 6020 }, { "epoch": 18.7141750580945, "grad_norm": 0.00029704332700930536, "learning_rate": 6.918885492055803e-05, "logits/chosen": -0.5596092939376831, "logits/rejected": 1.1183770895004272, "logps/chosen": -401.2093505859375, "logps/rejected": -517.4161376953125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.887650966644287, "rewards/margins": 26.075496673583984, "rewards/rejected": -32.9631462097168, "step": 6040 }, { "epoch": 18.776142525174283, "grad_norm": 0.00012532403343357146, "learning_rate": 6.858569061620862e-05, "logits/chosen": -0.605311930179596, "logits/rejected": 1.1743067502975464, "logps/chosen": -431.15850830078125, "logps/rejected": -542.1910400390625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.516722202301025, "rewards/margins": 27.494848251342773, "rewards/rejected": -34.011573791503906, "step": 6060 }, { "epoch": 18.838109992254065, "grad_norm": 0.00032498795189894736, "learning_rate": 6.798379167566064e-05, "logits/chosen": -0.5115218162536621, "logits/rejected": 1.1053214073181152, "logps/chosen": -429.5455627441406, "logps/rejected": -554.1275024414062, "loss": 0.0043, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.813681125640869, "rewards/margins": 26.788400650024414, "rewards/rejected": -34.60208511352539, "step": 6080 }, { "epoch": 18.90007745933385, "grad_norm": 0.00025639976956881583, "learning_rate": 6.738318234331554e-05, "logits/chosen": -0.5659054517745972, "logits/rejected": 1.2672799825668335, "logps/chosen": -422.6165466308594, "logps/rejected": -531.7962646484375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.032742500305176, "rewards/margins": 26.38532066345215, "rewards/rejected": -33.418067932128906, "step": 6100 }, { "epoch": 18.962044926413633, "grad_norm": 0.00040332350181415677, "learning_rate": 6.67838868116297e-05, "logits/chosen": -0.5604509711265564, "logits/rejected": 1.3309004306793213, "logps/chosen": -430.5907287597656, "logps/rejected": -535.790771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.163422584533691, "rewards/margins": 26.109851837158203, "rewards/rejected": -34.27326965332031, "step": 6120 }, { "epoch": 19.024012393493415, "grad_norm": 0.0001713493256829679, "learning_rate": 6.618592922013973e-05, "logits/chosen": -0.6125264763832092, "logits/rejected": 1.121382713317871, "logps/chosen": -435.3624572753906, "logps/rejected": -564.345947265625, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.341497898101807, "rewards/margins": 27.505783081054688, "rewards/rejected": -34.84728240966797, "step": 6140 }, { "epoch": 19.0859798605732, "grad_norm": 2.4474145902786404e-05, "learning_rate": 6.558933365449025e-05, "logits/chosen": -0.4567294716835022, "logits/rejected": 1.182701587677002, "logps/chosen": -430.1796875, "logps/rejected": -557.1361694335938, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -8.146208763122559, "rewards/margins": 26.323129653930664, "rewards/rejected": -34.469337463378906, "step": 6160 }, { "epoch": 19.147947327652982, "grad_norm": 0.0014432374155148864, "learning_rate": 6.499412414546362e-05, "logits/chosen": -0.6155918836593628, "logits/rejected": 1.3073813915252686, "logps/chosen": -436.5602111816406, "logps/rejected": -532.6322021484375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.379100799560547, "rewards/margins": 26.880939483642578, "rewards/rejected": -34.260040283203125, "step": 6180 }, { "epoch": 19.209914794732764, "grad_norm": 4.102818638784811e-05, "learning_rate": 6.440032466801215e-05, "logits/chosen": -0.5177820920944214, "logits/rejected": 1.2410507202148438, "logps/chosen": -435.9002380371094, "logps/rejected": -561.0885009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.766203880310059, "rewards/margins": 27.36117935180664, "rewards/rejected": -35.12738037109375, "step": 6200 }, { "epoch": 19.27188226181255, "grad_norm": 4.8919737309915945e-05, "learning_rate": 6.380795914029213e-05, "logits/chosen": -0.47016972303390503, "logits/rejected": 1.1552751064300537, "logps/chosen": -429.572265625, "logps/rejected": -567.6087646484375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.870247840881348, "rewards/margins": 27.22457504272461, "rewards/rejected": -35.094825744628906, "step": 6220 }, { "epoch": 19.333849728892332, "grad_norm": 0.0002924731816165149, "learning_rate": 6.321705142270067e-05, "logits/chosen": -0.6526008248329163, "logits/rejected": 1.217797040939331, "logps/chosen": -428.2090759277344, "logps/rejected": -534.576904296875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.726796627044678, "rewards/margins": 26.724050521850586, "rewards/rejected": -33.450843811035156, "step": 6240 }, { "epoch": 19.395817195972114, "grad_norm": 0.00014902207476552576, "learning_rate": 6.262762531691451e-05, "logits/chosen": -0.5008414387702942, "logits/rejected": 1.121274471282959, "logps/chosen": -416.0611267089844, "logps/rejected": -548.2586669921875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.260308742523193, "rewards/margins": 26.569469451904297, "rewards/rejected": -33.82978057861328, "step": 6260 }, { "epoch": 19.457784663051896, "grad_norm": 0.0001254824601346627, "learning_rate": 6.203970456493118e-05, "logits/chosen": -0.6224455237388611, "logits/rejected": 1.1558345556259155, "logps/chosen": -430.1212463378906, "logps/rejected": -535.5614013671875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.150260925292969, "rewards/margins": 26.5768985748291, "rewards/rejected": -33.72715759277344, "step": 6280 }, { "epoch": 19.519752130131682, "grad_norm": 0.00014725126675330102, "learning_rate": 6.145331284811285e-05, "logits/chosen": -0.6229298710823059, "logits/rejected": 1.20758855342865, "logps/chosen": -448.34716796875, "logps/rejected": -562.9547119140625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.880606174468994, "rewards/margins": 28.050073623657227, "rewards/rejected": -34.93068313598633, "step": 6300 }, { "epoch": 19.581719597211464, "grad_norm": 0.00048207101644948125, "learning_rate": 6.0868473786232395e-05, "logits/chosen": -0.5494848489761353, "logits/rejected": 1.18673837184906, "logps/chosen": -418.2279357910156, "logps/rejected": -551.411376953125, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.64691686630249, "rewards/margins": 27.100601196289062, "rewards/rejected": -34.747520446777344, "step": 6320 }, { "epoch": 19.643687064291246, "grad_norm": 0.0007689573685638607, "learning_rate": 6.0285210936521955e-05, "logits/chosen": -0.4644032418727875, "logits/rejected": 1.1573445796966553, "logps/chosen": -430.5718688964844, "logps/rejected": -580.4948120117188, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.147204399108887, "rewards/margins": 27.156152725219727, "rewards/rejected": -35.3033561706543, "step": 6340 }, { "epoch": 19.70565453137103, "grad_norm": 0.00015767944569233805, "learning_rate": 5.9703547792724045e-05, "logits/chosen": -0.521681010723114, "logits/rejected": 1.169802188873291, "logps/chosen": -419.7161560058594, "logps/rejected": -550.3419799804688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.470440864562988, "rewards/margins": 27.15780258178711, "rewards/rejected": -34.62824249267578, "step": 6360 }, { "epoch": 19.767621998450814, "grad_norm": 0.000223572802497074, "learning_rate": 5.912350778414531e-05, "logits/chosen": -0.5011137127876282, "logits/rejected": 1.0836546421051025, "logps/chosen": -432.50421142578125, "logps/rejected": -566.302734375, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.912287712097168, "rewards/margins": 26.835742950439453, "rewards/rejected": -34.74802780151367, "step": 6380 }, { "epoch": 19.829589465530596, "grad_norm": 0.0003539229219313711, "learning_rate": 5.8545114274712695e-05, "logits/chosen": -0.5557172894477844, "logits/rejected": 1.1703180074691772, "logps/chosen": -436.09454345703125, "logps/rejected": -560.4405517578125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.96994161605835, "rewards/margins": 26.893199920654297, "rewards/rejected": -34.86314010620117, "step": 6400 }, { "epoch": 19.891556932610378, "grad_norm": 0.00028784500318579376, "learning_rate": 5.796839056203247e-05, "logits/chosen": -0.6601584553718567, "logits/rejected": 1.2676100730895996, "logps/chosen": -423.5615234375, "logps/rejected": -529.6727294921875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.596652984619141, "rewards/margins": 26.803829193115234, "rewards/rejected": -33.400482177734375, "step": 6420 }, { "epoch": 19.953524399690163, "grad_norm": 0.0001648878096602857, "learning_rate": 5.7422070843492734e-05, "logits/chosen": -0.5620417594909668, "logits/rejected": 1.2710864543914795, "logps/chosen": -440.33001708984375, "logps/rejected": -560.01123046875, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.448406219482422, "rewards/margins": 28.1207275390625, "rewards/rejected": -35.56913757324219, "step": 6440 }, { "epoch": 20.015491866769946, "grad_norm": 3.79412122128997e-05, "learning_rate": 5.684866998866316e-05, "logits/chosen": -0.6099050045013428, "logits/rejected": 1.330368995666504, "logps/chosen": -440.4766540527344, "logps/rejected": -537.293701171875, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -7.571197509765625, "rewards/margins": 26.832447052001953, "rewards/rejected": -34.40364456176758, "step": 6460 }, { "epoch": 20.077459333849728, "grad_norm": 0.00026894695474766195, "learning_rate": 5.6277007263114437e-05, "logits/chosen": -0.5007272958755493, "logits/rejected": 1.3822646141052246, "logps/chosen": -431.9654846191406, "logps/rejected": -549.35546875, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.299338340759277, "rewards/margins": 27.777185440063477, "rewards/rejected": -35.07652282714844, "step": 6480 }, { "epoch": 20.139426800929513, "grad_norm": 8.49057687446475e-05, "learning_rate": 5.570710569333772e-05, "logits/chosen": -0.6163416504859924, "logits/rejected": 1.2752020359039307, "logps/chosen": -447.9937438964844, "logps/rejected": -553.1744384765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.5520758628845215, "rewards/margins": 27.005313873291016, "rewards/rejected": -34.55738830566406, "step": 6500 }, { "epoch": 20.201394268009295, "grad_norm": 0.00022929662372916937, "learning_rate": 5.513898823488528e-05, "logits/chosen": -0.49778875708580017, "logits/rejected": 1.2400882244110107, "logps/chosen": -442.35284423828125, "logps/rejected": -562.985595703125, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.439814567565918, "rewards/margins": 27.415929794311523, "rewards/rejected": -34.85574722290039, "step": 6520 }, { "epoch": 20.263361735089077, "grad_norm": 2.7919993954128586e-05, "learning_rate": 5.4572677771445344e-05, "logits/chosen": -0.650310754776001, "logits/rejected": 1.2607206106185913, "logps/chosen": -437.0475158691406, "logps/rejected": -534.0076904296875, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.755753517150879, "rewards/margins": 26.984405517578125, "rewards/rejected": -33.74016189575195, "step": 6540 }, { "epoch": 20.325329202168863, "grad_norm": 3.8151458284119144e-05, "learning_rate": 5.400819711392091e-05, "logits/chosen": -0.5207892656326294, "logits/rejected": 1.2761542797088623, "logps/chosen": -422.2308654785156, "logps/rejected": -537.1773071289062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.0580034255981445, "rewards/margins": 27.111309051513672, "rewards/rejected": -34.169307708740234, "step": 6560 }, { "epoch": 20.387296669248645, "grad_norm": 8.57314735185355e-05, "learning_rate": 5.344556899951054e-05, "logits/chosen": -0.5232094526290894, "logits/rejected": 1.1201660633087158, "logps/chosen": -445.23309326171875, "logps/rejected": -583.000732421875, "loss": 0.0043, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -7.883921146392822, "rewards/margins": 27.830123901367188, "rewards/rejected": -35.71404266357422, "step": 6580 }, { "epoch": 20.449264136328427, "grad_norm": 0.0002258592430735007, "learning_rate": 5.288481609079259e-05, "logits/chosen": -0.5788317322731018, "logits/rejected": 1.0734844207763672, "logps/chosen": -428.3194885253906, "logps/rejected": -547.5687255859375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.71385669708252, "rewards/margins": 26.13985252380371, "rewards/rejected": -34.85371398925781, "step": 6600 }, { "epoch": 20.51123160340821, "grad_norm": 9.367791790282354e-05, "learning_rate": 5.232596097481251e-05, "logits/chosen": -0.5063174366950989, "logits/rejected": 1.2991888523101807, "logps/chosen": -428.83673095703125, "logps/rejected": -536.5294189453125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.142416954040527, "rewards/margins": 26.242568969726562, "rewards/rejected": -34.384986877441406, "step": 6620 }, { "epoch": 20.573199070487995, "grad_norm": 0.00023494637571275234, "learning_rate": 5.17690261621729e-05, "logits/chosen": -0.521760880947113, "logits/rejected": 1.2555716037750244, "logps/chosen": -443.06781005859375, "logps/rejected": -544.702880859375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.644153594970703, "rewards/margins": 26.8060245513916, "rewards/rejected": -34.45018005371094, "step": 6640 }, { "epoch": 20.635166537567777, "grad_norm": 0.0011836939956992865, "learning_rate": 5.121403408612672e-05, "logits/chosen": -0.4721315801143646, "logits/rejected": 1.1798968315124512, "logps/chosen": -445.50909423828125, "logps/rejected": -572.2584838867188, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.873363494873047, "rewards/margins": 27.153827667236328, "rewards/rejected": -36.027191162109375, "step": 6660 }, { "epoch": 20.69713400464756, "grad_norm": 0.00022585636179428548, "learning_rate": 5.066100710167401e-05, "logits/chosen": -0.5391095876693726, "logits/rejected": 1.2846006155014038, "logps/chosen": -429.7838439941406, "logps/rejected": -565.8057861328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.0114898681640625, "rewards/margins": 28.408946990966797, "rewards/rejected": -35.420440673828125, "step": 6680 }, { "epoch": 20.759101471727345, "grad_norm": 7.968185673234984e-05, "learning_rate": 5.010996748466088e-05, "logits/chosen": -0.551699697971344, "logits/rejected": 1.2516696453094482, "logps/chosen": -421.5836486816406, "logps/rejected": -523.4829711914062, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.205231666564941, "rewards/margins": 26.57697105407715, "rewards/rejected": -33.782203674316406, "step": 6700 }, { "epoch": 20.821068938807127, "grad_norm": 9.455503459321335e-05, "learning_rate": 4.956093743088291e-05, "logits/chosen": -0.4987064003944397, "logits/rejected": 1.2454708814620972, "logps/chosen": -425.71221923828125, "logps/rejected": -552.5072021484375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.379124641418457, "rewards/margins": 26.97328758239746, "rewards/rejected": -34.352413177490234, "step": 6720 }, { "epoch": 20.88303640588691, "grad_norm": 5.707304808311164e-06, "learning_rate": 4.901393905519055e-05, "logits/chosen": -0.5764604806900024, "logits/rejected": 1.3113079071044922, "logps/chosen": -420.73919677734375, "logps/rejected": -510.94891357421875, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.187965393066406, "rewards/margins": 25.822784423828125, "rewards/rejected": -33.01074981689453, "step": 6740 }, { "epoch": 20.945003872966694, "grad_norm": 6.422119622584432e-05, "learning_rate": 4.8468994390598574e-05, "logits/chosen": -0.4906904101371765, "logits/rejected": 1.1637732982635498, "logps/chosen": -426.24737548828125, "logps/rejected": -572.7752075195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.849706172943115, "rewards/margins": 28.3243350982666, "rewards/rejected": -35.174041748046875, "step": 6760 }, { "epoch": 21.006971340046476, "grad_norm": 0.0005418303771875799, "learning_rate": 4.79261253873987e-05, "logits/chosen": -0.4840869903564453, "logits/rejected": 1.2838385105133057, "logps/chosen": -414.9949645996094, "logps/rejected": -528.3868408203125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.632540225982666, "rewards/margins": 26.313039779663086, "rewards/rejected": -33.945579528808594, "step": 6780 }, { "epoch": 21.06893880712626, "grad_norm": 0.00022662655101157725, "learning_rate": 4.7385353912275165e-05, "logits/chosen": -0.5232284665107727, "logits/rejected": 1.2743021249771118, "logps/chosen": -430.8934631347656, "logps/rejected": -557.8607177734375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.5154900550842285, "rewards/margins": 27.52066993713379, "rewards/rejected": -35.036163330078125, "step": 6800 }, { "epoch": 21.13090627420604, "grad_norm": 6.364914497680729e-06, "learning_rate": 4.684670174742412e-05, "logits/chosen": -0.5547454953193665, "logits/rejected": 1.209750771522522, "logps/chosen": -437.41033935546875, "logps/rejected": -568.2105712890625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.9095869064331055, "rewards/margins": 27.975051879882812, "rewards/rejected": -35.88463592529297, "step": 6820 }, { "epoch": 21.192873741285826, "grad_norm": 8.028039883356541e-05, "learning_rate": 4.631019058967627e-05, "logits/chosen": -0.5240232348442078, "logits/rejected": 1.104143738746643, "logps/chosen": -407.6142272949219, "logps/rejected": -549.146484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.8381028175354, "rewards/margins": 27.074167251586914, "rewards/rejected": -34.912269592285156, "step": 6840 }, { "epoch": 21.254841208365608, "grad_norm": 4.430773697094992e-05, "learning_rate": 4.5775842049622806e-05, "logits/chosen": -0.4717990756034851, "logits/rejected": 1.254495620727539, "logps/chosen": -435.498779296875, "logps/rejected": -542.4292602539062, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.805765151977539, "rewards/margins": 25.804275512695312, "rewards/rejected": -34.61003875732422, "step": 6860 }, { "epoch": 21.31680867544539, "grad_norm": 0.00010798404400702566, "learning_rate": 4.524367765074499e-05, "logits/chosen": -0.5588334798812866, "logits/rejected": 1.3098465204238892, "logps/chosen": -430.4671325683594, "logps/rejected": -547.7396850585938, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.4153337478637695, "rewards/margins": 27.558120727539062, "rewards/rejected": -34.973453521728516, "step": 6880 }, { "epoch": 21.378776142525176, "grad_norm": 0.00025497484602965415, "learning_rate": 4.471371882854723e-05, "logits/chosen": -0.5086442232131958, "logits/rejected": 1.2061560153961182, "logps/chosen": -423.62109375, "logps/rejected": -562.035888671875, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.9457902908325195, "rewards/margins": 27.93740463256836, "rewards/rejected": -34.88319778442383, "step": 6900 }, { "epoch": 21.440743609604958, "grad_norm": 0.00021185963123571128, "learning_rate": 4.4185986929693546e-05, "logits/chosen": -0.5642815828323364, "logits/rejected": 1.322967767715454, "logps/chosen": -430.9812927246094, "logps/rejected": -532.7042846679688, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.864518642425537, "rewards/margins": 26.29730796813965, "rewards/rejected": -34.161827087402344, "step": 6920 }, { "epoch": 21.50271107668474, "grad_norm": 5.743455403717235e-05, "learning_rate": 4.366050321114796e-05, "logits/chosen": -0.530922532081604, "logits/rejected": 1.3586127758026123, "logps/chosen": -413.8951110839844, "logps/rejected": -528.0460205078125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.085272789001465, "rewards/margins": 27.594009399414062, "rewards/rejected": -34.679283142089844, "step": 6940 }, { "epoch": 21.564678543764522, "grad_norm": 0.00015524946502409875, "learning_rate": 4.3137288839318014e-05, "logits/chosen": -0.5405689477920532, "logits/rejected": 1.2518033981323242, "logps/chosen": -430.952392578125, "logps/rejected": -577.5679931640625, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.740090370178223, "rewards/margins": 28.604055404663086, "rewards/rejected": -36.34414291381836, "step": 6960 }, { "epoch": 21.626646010844308, "grad_norm": 0.0003134405706077814, "learning_rate": 4.2616364889202254e-05, "logits/chosen": -0.5319267511367798, "logits/rejected": 1.2375494241714478, "logps/chosen": -430.71209716796875, "logps/rejected": -563.7471923828125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.725900173187256, "rewards/margins": 27.626745223999023, "rewards/rejected": -35.3526496887207, "step": 6980 }, { "epoch": 21.68861347792409, "grad_norm": 0.0003931570390705019, "learning_rate": 4.209775234354151e-05, "logits/chosen": -0.5283772349357605, "logits/rejected": 1.2156752347946167, "logps/chosen": -417.3208923339844, "logps/rejected": -555.2176513671875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.691795349121094, "rewards/margins": 28.186809539794922, "rewards/rejected": -35.87860870361328, "step": 7000 }, { "epoch": 21.750580945003872, "grad_norm": 0.0001691762008704245, "learning_rate": 4.158147209197347e-05, "logits/chosen": -0.49120578169822693, "logits/rejected": 1.30716073513031, "logps/chosen": -423.93121337890625, "logps/rejected": -562.0618286132812, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.96550989151001, "rewards/margins": 28.568927764892578, "rewards/rejected": -35.53443908691406, "step": 7020 }, { "epoch": 21.812548412083657, "grad_norm": 0.0004458031035028398, "learning_rate": 4.106754493019138e-05, "logits/chosen": -0.5782488584518433, "logits/rejected": 1.2537811994552612, "logps/chosen": -440.19927978515625, "logps/rejected": -559.7462158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.946485996246338, "rewards/margins": 27.713253021240234, "rewards/rejected": -35.65974044799805, "step": 7040 }, { "epoch": 21.87451587916344, "grad_norm": 0.0034644545521587133, "learning_rate": 4.055599155910639e-05, "logits/chosen": -0.48173093795776367, "logits/rejected": 1.1414930820465088, "logps/chosen": -455.94866943359375, "logps/rejected": -615.8961181640625, "loss": 0.0054, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.730755805969238, "rewards/margins": 28.979084014892578, "rewards/rejected": -37.709835052490234, "step": 7060 }, { "epoch": 21.93648334624322, "grad_norm": 3.83302649424877e-05, "learning_rate": 4.004683258401366e-05, "logits/chosen": -0.4850381314754486, "logits/rejected": 1.4388806819915771, "logps/chosen": -431.77947998046875, "logps/rejected": -547.6139526367188, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.851006507873535, "rewards/margins": 27.237091064453125, "rewards/rejected": -35.088096618652344, "step": 7080 }, { "epoch": 21.998450813323004, "grad_norm": 6.421873695217073e-05, "learning_rate": 3.954008851376252e-05, "logits/chosen": -0.5726882219314575, "logits/rejected": 1.2820627689361572, "logps/chosen": -424.2509765625, "logps/rejected": -561.9562377929688, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.327461242675781, "rewards/margins": 28.40390968322754, "rewards/rejected": -35.73137283325195, "step": 7100 }, { "epoch": 22.06041828040279, "grad_norm": 0.0006721434183418751, "learning_rate": 3.903577975993021e-05, "logits/chosen": -0.410485178232193, "logits/rejected": 1.2668616771697998, "logps/chosen": -414.65509033203125, "logps/rejected": -551.499755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.70891809463501, "rewards/margins": 27.578258514404297, "rewards/rejected": -35.28717803955078, "step": 7120 }, { "epoch": 22.12238574748257, "grad_norm": 0.00025342078879475594, "learning_rate": 3.853392663599976e-05, "logits/chosen": -0.5337496995925903, "logits/rejected": 1.3907458782196045, "logps/chosen": -442.95892333984375, "logps/rejected": -538.0328369140625, "loss": 0.0043, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -8.646308898925781, "rewards/margins": 26.30740737915039, "rewards/rejected": -34.95371627807617, "step": 7140 }, { "epoch": 22.184353214562353, "grad_norm": 0.0006203025695867836, "learning_rate": 3.8034549356541894e-05, "logits/chosen": -0.6515469551086426, "logits/rejected": 1.425569772720337, "logps/chosen": -416.54864501953125, "logps/rejected": -507.20306396484375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -6.088509559631348, "rewards/margins": 26.99295997619629, "rewards/rejected": -33.08147048950195, "step": 7160 }, { "epoch": 22.24632068164214, "grad_norm": 0.001270798034965992, "learning_rate": 3.7537668036400574e-05, "logits/chosen": -0.4973440170288086, "logits/rejected": 1.3225786685943604, "logps/chosen": -438.9662170410156, "logps/rejected": -548.2359619140625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.235987186431885, "rewards/margins": 27.249414443969727, "rewards/rejected": -34.48540115356445, "step": 7180 }, { "epoch": 22.30828814872192, "grad_norm": 0.0001219348851009272, "learning_rate": 3.704330268988293e-05, "logits/chosen": -0.553697943687439, "logits/rejected": 1.298783779144287, "logps/chosen": -437.8828125, "logps/rejected": -566.4822998046875, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.940356254577637, "rewards/margins": 27.971059799194336, "rewards/rejected": -35.911415100097656, "step": 7200 }, { "epoch": 22.370255615801703, "grad_norm": 0.0001257601979887113, "learning_rate": 3.6551473229953037e-05, "logits/chosen": -0.4942377209663391, "logits/rejected": 1.3459523916244507, "logps/chosen": -437.99273681640625, "logps/rejected": -557.7435913085938, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.5271759033203125, "rewards/margins": 27.564788818359375, "rewards/rejected": -35.09196853637695, "step": 7220 }, { "epoch": 22.43222308288149, "grad_norm": 5.536680873774458e-06, "learning_rate": 3.606219946742978e-05, "logits/chosen": -0.5135564804077148, "logits/rejected": 1.2007367610931396, "logps/chosen": -429.4864807128906, "logps/rejected": -593.4481201171875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.349099159240723, "rewards/margins": 29.2182674407959, "rewards/rejected": -36.56736755371094, "step": 7240 }, { "epoch": 22.49419054996127, "grad_norm": 0.0003757201775442809, "learning_rate": 3.557550111018906e-05, "logits/chosen": -0.4902525544166565, "logits/rejected": 1.1989456415176392, "logps/chosen": -445.7784118652344, "logps/rejected": -589.0730590820312, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.687643527984619, "rewards/margins": 28.44379234313965, "rewards/rejected": -36.131439208984375, "step": 7260 }, { "epoch": 22.556158017041053, "grad_norm": 0.0004273348895367235, "learning_rate": 3.509139776236967e-05, "logits/chosen": -0.5242363810539246, "logits/rejected": 1.3806064128875732, "logps/chosen": -443.4256286621094, "logps/rejected": -554.4573364257812, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.07096004486084, "rewards/margins": 27.879369735717773, "rewards/rejected": -34.95033645629883, "step": 7280 }, { "epoch": 22.618125484120835, "grad_norm": 0.00013076326285954565, "learning_rate": 3.460990892358388e-05, "logits/chosen": -0.46005791425704956, "logits/rejected": 1.3477107286453247, "logps/chosen": -448.58856201171875, "logps/rejected": -565.8875122070312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.28325366973877, "rewards/margins": 27.60245704650879, "rewards/rejected": -35.88571548461914, "step": 7300 }, { "epoch": 22.68009295120062, "grad_norm": 4.240042107994668e-05, "learning_rate": 3.413105398813195e-05, "logits/chosen": -0.4579412043094635, "logits/rejected": 1.342012643814087, "logps/chosen": -438.57684326171875, "logps/rejected": -577.7442626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.141871452331543, "rewards/margins": 28.370080947875977, "rewards/rejected": -36.5119514465332, "step": 7320 }, { "epoch": 22.742060418280403, "grad_norm": 7.035260750853922e-06, "learning_rate": 3.3654852244220826e-05, "logits/chosen": -0.4461139738559723, "logits/rejected": 1.2642186880111694, "logps/chosen": -411.8309631347656, "logps/rejected": -534.2032470703125, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.252596855163574, "rewards/margins": 26.30681800842285, "rewards/rejected": -34.55941390991211, "step": 7340 }, { "epoch": 22.804027885360185, "grad_norm": 0.0002788783167488873, "learning_rate": 3.3181322873187326e-05, "logits/chosen": -0.5451353192329407, "logits/rejected": 1.40205979347229, "logps/chosen": -433.80523681640625, "logps/rejected": -542.520263671875, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.425281524658203, "rewards/margins": 28.070571899414062, "rewards/rejected": -34.495853424072266, "step": 7360 }, { "epoch": 22.86599535243997, "grad_norm": 0.00026678614085540175, "learning_rate": 3.271048494872546e-05, "logits/chosen": -0.47595128417015076, "logits/rejected": 1.2741992473602295, "logps/chosen": -418.16845703125, "logps/rejected": -567.8311767578125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.957074165344238, "rewards/margins": 28.70499610900879, "rewards/rejected": -35.662071228027344, "step": 7380 }, { "epoch": 22.927962819519752, "grad_norm": 0.0001355899585178122, "learning_rate": 3.224235743611814e-05, "logits/chosen": -0.4618222713470459, "logits/rejected": 1.414473295211792, "logps/chosen": -431.6698303222656, "logps/rejected": -572.1229248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.292036533355713, "rewards/margins": 28.899133682250977, "rewards/rejected": -36.19116973876953, "step": 7400 }, { "epoch": 22.989930286599535, "grad_norm": 0.00026807800168171525, "learning_rate": 3.177695919147339e-05, "logits/chosen": -0.4957195222377777, "logits/rejected": 1.3398468494415283, "logps/chosen": -426.016845703125, "logps/rejected": -548.435302734375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.988835334777832, "rewards/margins": 28.052658081054688, "rewards/rejected": -35.0414924621582, "step": 7420 }, { "epoch": 23.051897753679317, "grad_norm": 0.0001755996490828693, "learning_rate": 3.131430896096459e-05, "logits/chosen": -0.4614803194999695, "logits/rejected": 1.3167946338653564, "logps/chosen": -420.3673400878906, "logps/rejected": -540.8953857421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.986320495605469, "rewards/margins": 26.317516326904297, "rewards/rejected": -34.30384063720703, "step": 7440 }, { "epoch": 23.113865220759102, "grad_norm": 8.663154585519806e-05, "learning_rate": 3.0854425380075544e-05, "logits/chosen": -0.4759630262851715, "logits/rejected": 1.354256272315979, "logps/chosen": -444.06231689453125, "logps/rejected": -582.34619140625, "loss": 0.0065, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.985074520111084, "rewards/margins": 28.706161499023438, "rewards/rejected": -36.69123840332031, "step": 7460 }, { "epoch": 23.175832687838884, "grad_norm": 0.00045451842015609145, "learning_rate": 3.0397326972849892e-05, "logits/chosen": -0.5278170108795166, "logits/rejected": 1.2997605800628662, "logps/chosen": -415.7640686035156, "logps/rejected": -550.7852783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.4368720054626465, "rewards/margins": 27.63351821899414, "rewards/rejected": -35.07038497924805, "step": 7480 }, { "epoch": 23.237800154918666, "grad_norm": 8.681453618919477e-05, "learning_rate": 2.9943032151144812e-05, "logits/chosen": -0.5060716867446899, "logits/rejected": 1.314741849899292, "logps/chosen": -424.48583984375, "logps/rejected": -550.7124633789062, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.339421272277832, "rewards/margins": 27.69765281677246, "rewards/rejected": -35.03707504272461, "step": 7500 }, { "epoch": 23.299767621998452, "grad_norm": 2.901791231124662e-05, "learning_rate": 2.949155921388943e-05, "logits/chosen": -0.48493748903274536, "logits/rejected": 1.361489176750183, "logps/chosen": -439.04864501953125, "logps/rejected": -573.9337158203125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.827279567718506, "rewards/margins": 28.83859634399414, "rewards/rejected": -36.665870666503906, "step": 7520 }, { "epoch": 23.361735089078234, "grad_norm": 0.00018417155661154538, "learning_rate": 2.904292634634793e-05, "logits/chosen": -0.5873640775680542, "logits/rejected": 1.3690621852874756, "logps/chosen": -431.43994140625, "logps/rejected": -537.2473754882812, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.988741397857666, "rewards/margins": 27.92215919494629, "rewards/rejected": -34.91089630126953, "step": 7540 }, { "epoch": 23.423702556158016, "grad_norm": 0.00022559291392099112, "learning_rate": 2.8597151619386707e-05, "logits/chosen": -0.5808820128440857, "logits/rejected": 1.3238177299499512, "logps/chosen": -447.241455078125, "logps/rejected": -549.6907958984375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.945500373840332, "rewards/margins": 26.819957733154297, "rewards/rejected": -33.76545715332031, "step": 7560 }, { "epoch": 23.4856700232378, "grad_norm": 0.0003778359678108245, "learning_rate": 2.8154252988746755e-05, "logits/chosen": -0.3938234746456146, "logits/rejected": 1.2302014827728271, "logps/chosen": -422.442626953125, "logps/rejected": -562.1705932617188, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.191490173339844, "rewards/margins": 27.9207763671875, "rewards/rejected": -35.11227035522461, "step": 7580 }, { "epoch": 23.547637490317584, "grad_norm": 0.00015092053217813373, "learning_rate": 2.771424829432041e-05, "logits/chosen": -0.5172373056411743, "logits/rejected": 1.2900029420852661, "logps/chosen": -445.7874450683594, "logps/rejected": -598.3099365234375, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.032974243164062, "rewards/margins": 29.89381980895996, "rewards/rejected": -37.926795959472656, "step": 7600 }, { "epoch": 23.609604957397366, "grad_norm": 0.00026129186153411865, "learning_rate": 2.727715525943253e-05, "logits/chosen": -0.5197489857673645, "logits/rejected": 1.264127492904663, "logps/chosen": -439.525146484375, "logps/rejected": -574.5367431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.128179550170898, "rewards/margins": 27.236980438232422, "rewards/rejected": -35.36515426635742, "step": 7620 }, { "epoch": 23.671572424477148, "grad_norm": 4.1022536606760696e-05, "learning_rate": 2.68429914901269e-05, "logits/chosen": -0.532270073890686, "logits/rejected": 1.3862292766571045, "logps/chosen": -425.9048767089844, "logps/rejected": -557.3917236328125, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.1133575439453125, "rewards/margins": 28.244802474975586, "rewards/rejected": -35.35816192626953, "step": 7640 }, { "epoch": 23.733539891556934, "grad_norm": 9.765337745193392e-05, "learning_rate": 2.6411774474456797e-05, "logits/chosen": -0.5974343419075012, "logits/rejected": 1.36226487159729, "logps/chosen": -439.45269775390625, "logps/rejected": -552.4537963867188, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.1931352615356445, "rewards/margins": 27.47402572631836, "rewards/rejected": -34.66716384887695, "step": 7660 }, { "epoch": 23.795507358636716, "grad_norm": 0.0002056274825008586, "learning_rate": 2.5983521581780724e-05, "logits/chosen": -0.46689167618751526, "logits/rejected": 1.3515841960906982, "logps/chosen": -442.60284423828125, "logps/rejected": -573.5643310546875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.276591300964355, "rewards/margins": 28.31182861328125, "rewards/rejected": -36.588417053222656, "step": 7680 }, { "epoch": 23.857474825716498, "grad_norm": 0.00024225719971582294, "learning_rate": 2.5558250062062828e-05, "logits/chosen": -0.4331149160861969, "logits/rejected": 1.337519884109497, "logps/chosen": -453.8560485839844, "logps/rejected": -565.5623779296875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -9.228255271911621, "rewards/margins": 26.08279800415039, "rewards/rejected": -35.311058044433594, "step": 7700 }, { "epoch": 23.919442292796283, "grad_norm": 0.00014199658471625298, "learning_rate": 2.5135977045177815e-05, "logits/chosen": -0.43462008237838745, "logits/rejected": 1.2873212099075317, "logps/chosen": -420.39898681640625, "logps/rejected": -550.4713134765625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.8913254737854, "rewards/margins": 26.91733741760254, "rewards/rejected": -34.80866241455078, "step": 7720 }, { "epoch": 23.981409759876065, "grad_norm": 0.0002433314803056419, "learning_rate": 2.4716719540221268e-05, "logits/chosen": -0.45984095335006714, "logits/rejected": 1.445708990097046, "logps/chosen": -409.4692687988281, "logps/rejected": -525.4560546875, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.901988983154297, "rewards/margins": 27.886280059814453, "rewards/rejected": -34.78826904296875, "step": 7740 }, { "epoch": 24.043377226955847, "grad_norm": 0.0001660619891481474, "learning_rate": 2.4300494434824373e-05, "logits/chosen": -0.4552191197872162, "logits/rejected": 1.3290296792984009, "logps/chosen": -426.6686096191406, "logps/rejected": -570.5438232421875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.440762519836426, "rewards/margins": 28.601394653320312, "rewards/rejected": -36.042152404785156, "step": 7760 }, { "epoch": 24.10534469403563, "grad_norm": 8.915072248782963e-05, "learning_rate": 2.3887318494473677e-05, "logits/chosen": -0.539429783821106, "logits/rejected": 1.2853329181671143, "logps/chosen": -416.55908203125, "logps/rejected": -535.9031982421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.548221588134766, "rewards/margins": 26.924367904663086, "rewards/rejected": -34.47259521484375, "step": 7780 }, { "epoch": 24.167312161115415, "grad_norm": 0.0002355161268496886, "learning_rate": 2.347720836183578e-05, "logits/chosen": -0.4840586185455322, "logits/rejected": 1.3917993307113647, "logps/chosen": -424.24969482421875, "logps/rejected": -553.1238403320312, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.6523895263671875, "rewards/margins": 28.274377822875977, "rewards/rejected": -34.92676544189453, "step": 7800 }, { "epoch": 24.229279628195197, "grad_norm": 0.0002348150301259011, "learning_rate": 2.3070180556087074e-05, "logits/chosen": -0.4440614581108093, "logits/rejected": 1.27870512008667, "logps/chosen": -444.35772705078125, "logps/rejected": -572.5586547851562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.005529403686523, "rewards/margins": 27.384714126586914, "rewards/rejected": -35.39024353027344, "step": 7820 }, { "epoch": 24.29124709527498, "grad_norm": 4.529371653916314e-05, "learning_rate": 2.266625147224817e-05, "logits/chosen": -0.5220253467559814, "logits/rejected": 1.4514058828353882, "logps/chosen": -432.4845275878906, "logps/rejected": -534.2028198242188, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.848171234130859, "rewards/margins": 27.818805694580078, "rewards/rejected": -34.66697692871094, "step": 7840 }, { "epoch": 24.353214562354765, "grad_norm": 0.0005656908615492284, "learning_rate": 2.2265437380523734e-05, "logits/chosen": -0.5128262639045715, "logits/rejected": 1.2830040454864502, "logps/chosen": -459.4671325683594, "logps/rejected": -589.1271362304688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.426397323608398, "rewards/margins": 28.289302825927734, "rewards/rejected": -36.715702056884766, "step": 7860 }, { "epoch": 24.415182029434547, "grad_norm": 0.00024748279247432947, "learning_rate": 2.1867754425646926e-05, "logits/chosen": -0.5120356678962708, "logits/rejected": 1.4352920055389404, "logps/chosen": -449.92620849609375, "logps/rejected": -586.4793090820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.046315670013428, "rewards/margins": 29.50286293029785, "rewards/rejected": -36.54917907714844, "step": 7880 }, { "epoch": 24.47714949651433, "grad_norm": 0.0029017701745033264, "learning_rate": 2.1473218626229095e-05, "logits/chosen": -0.49962282180786133, "logits/rejected": 1.3568048477172852, "logps/chosen": -440.7109375, "logps/rejected": -566.8717651367188, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -7.135717868804932, "rewards/margins": 28.588165283203125, "rewards/rejected": -35.723880767822266, "step": 7900 }, { "epoch": 24.539116963594115, "grad_norm": 0.0006671111332252622, "learning_rate": 2.1081845874114815e-05, "logits/chosen": -0.49870556592941284, "logits/rejected": 1.3455699682235718, "logps/chosen": -427.1560974121094, "logps/rejected": -567.7340087890625, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.85535192489624, "rewards/margins": 29.06607437133789, "rewards/rejected": -35.921424865722656, "step": 7920 }, { "epoch": 24.601084430673897, "grad_norm": 0.0003407177282497287, "learning_rate": 2.069365193374142e-05, "logits/chosen": -0.4392669200897217, "logits/rejected": 1.4311549663543701, "logps/chosen": -438.2574768066406, "logps/rejected": -559.4137573242188, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.767866611480713, "rewards/margins": 27.89835548400879, "rewards/rejected": -35.66621780395508, "step": 7940 }, { "epoch": 24.66305189775368, "grad_norm": 9.20661841519177e-05, "learning_rate": 2.0308652441504217e-05, "logits/chosen": -0.4642259478569031, "logits/rejected": 1.3726909160614014, "logps/chosen": -434.41571044921875, "logps/rejected": -561.6458129882812, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.912638187408447, "rewards/margins": 28.36699867248535, "rewards/rejected": -36.27963638305664, "step": 7960 }, { "epoch": 24.72501936483346, "grad_norm": 0.00026214818353764713, "learning_rate": 1.9926862905126665e-05, "logits/chosen": -0.4355667531490326, "logits/rejected": 1.2842066287994385, "logps/chosen": -454.08746337890625, "logps/rejected": -588.8795166015625, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.499858856201172, "rewards/margins": 27.742816925048828, "rewards/rejected": -36.24267578125, "step": 7980 }, { "epoch": 24.786986831913246, "grad_norm": 1.4072214980842546e-05, "learning_rate": 1.954829870303555e-05, "logits/chosen": -0.4878915250301361, "logits/rejected": 1.3544992208480835, "logps/chosen": -441.7566833496094, "logps/rejected": -572.3592529296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.753843307495117, "rewards/margins": 27.906024932861328, "rewards/rejected": -36.65986251831055, "step": 8000 }, { "epoch": 24.84895429899303, "grad_norm": 4.5072305510984734e-05, "learning_rate": 1.9172975083741817e-05, "logits/chosen": -0.5160379409790039, "logits/rejected": 1.3292560577392578, "logps/chosen": -430.331787109375, "logps/rejected": -547.330322265625, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.6765336990356445, "rewards/margins": 26.8677921295166, "rewards/rejected": -34.54432678222656, "step": 8020 }, { "epoch": 24.91092176607281, "grad_norm": 0.00028837990248575807, "learning_rate": 1.8800907165226066e-05, "logits/chosen": -0.5141741633415222, "logits/rejected": 1.4077479839324951, "logps/chosen": -430.82916259765625, "logps/rejected": -552.1541748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.073647499084473, "rewards/margins": 28.02901268005371, "rewards/rejected": -36.102657318115234, "step": 8040 }, { "epoch": 24.972889233152596, "grad_norm": 0.0027730674482882023, "learning_rate": 1.8432109934329834e-05, "logits/chosen": -0.4535338878631592, "logits/rejected": 1.4890105724334717, "logps/chosen": -427.42724609375, "logps/rejected": -543.154052734375, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.028363227844238, "rewards/margins": 27.05509376525879, "rewards/rejected": -35.08345413208008, "step": 8060 }, { "epoch": 25.03485670023238, "grad_norm": 0.00010149605077458546, "learning_rate": 1.8066598246151768e-05, "logits/chosen": -0.5200473070144653, "logits/rejected": 1.3415261507034302, "logps/chosen": -436.22723388671875, "logps/rejected": -571.4678344726562, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.299592971801758, "rewards/margins": 28.236587524414062, "rewards/rejected": -36.53618240356445, "step": 8080 }, { "epoch": 25.09682416731216, "grad_norm": 4.2563999159028754e-05, "learning_rate": 1.7704386823449403e-05, "logits/chosen": -0.5422636866569519, "logits/rejected": 1.3826935291290283, "logps/chosen": -439.9010314941406, "logps/rejected": -581.65380859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.662468910217285, "rewards/margins": 29.301517486572266, "rewards/rejected": -36.963985443115234, "step": 8100 }, { "epoch": 25.158791634391946, "grad_norm": 6.259313522605225e-05, "learning_rate": 1.7345490256045993e-05, "logits/chosen": -0.4815981984138489, "logits/rejected": 1.3883564472198486, "logps/chosen": -438.5204162597656, "logps/rejected": -578.703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.347479343414307, "rewards/margins": 28.74386215209961, "rewards/rejected": -36.09134292602539, "step": 8120 }, { "epoch": 25.220759101471728, "grad_norm": 6.804763688705862e-05, "learning_rate": 1.6989923000243e-05, "logits/chosen": -0.43593135476112366, "logits/rejected": 1.4665887355804443, "logps/chosen": -415.4696350097656, "logps/rejected": -555.9053955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.401350498199463, "rewards/margins": 28.641956329345703, "rewards/rejected": -36.043304443359375, "step": 8140 }, { "epoch": 25.28272656855151, "grad_norm": 0.00014422877575270832, "learning_rate": 1.6637699378237605e-05, "logits/chosen": -0.5304734110832214, "logits/rejected": 1.378281831741333, "logps/chosen": -435.180419921875, "logps/rejected": -561.174560546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.14901876449585, "rewards/margins": 28.46111488342285, "rewards/rejected": -35.610137939453125, "step": 8160 }, { "epoch": 25.344694035631292, "grad_norm": 0.00013896219024900347, "learning_rate": 1.6288833577545914e-05, "logits/chosen": -0.4343484938144684, "logits/rejected": 1.2599773406982422, "logps/chosen": -425.02606201171875, "logps/rejected": -573.0264282226562, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.175013542175293, "rewards/margins": 27.731231689453125, "rewards/rejected": -35.90624237060547, "step": 8180 }, { "epoch": 25.406661502711078, "grad_norm": 4.538395660347305e-05, "learning_rate": 1.5943339650431576e-05, "logits/chosen": -0.5071940422058105, "logits/rejected": 1.3921631574630737, "logps/chosen": -432.5198669433594, "logps/rejected": -560.261962890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.497560024261475, "rewards/margins": 28.007415771484375, "rewards/rejected": -35.50497817993164, "step": 8200 }, { "epoch": 25.46862896979086, "grad_norm": 0.00019486816017888486, "learning_rate": 1.5601231513339565e-05, "logits/chosen": -0.47233065962791443, "logits/rejected": 1.3630597591400146, "logps/chosen": -425.60736083984375, "logps/rejected": -546.03662109375, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.1140923500061035, "rewards/margins": 27.629613876342773, "rewards/rejected": -34.74370574951172, "step": 8220 }, { "epoch": 25.530596436870642, "grad_norm": 0.0008970848866738379, "learning_rate": 1.5262522946335755e-05, "logits/chosen": -0.5086788535118103, "logits/rejected": 1.389817476272583, "logps/chosen": -428.55328369140625, "logps/rejected": -537.9722900390625, "loss": 0.0054, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.565539360046387, "rewards/margins": 27.265399932861328, "rewards/rejected": -34.83094024658203, "step": 8240 }, { "epoch": 25.592563903950428, "grad_norm": 0.0003016830887645483, "learning_rate": 1.492722759255184e-05, "logits/chosen": -0.4972669184207916, "logits/rejected": 1.4621047973632812, "logps/chosen": -442.19903564453125, "logps/rejected": -546.1878051757812, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.644277095794678, "rewards/margins": 27.799856185913086, "rewards/rejected": -35.44413375854492, "step": 8260 }, { "epoch": 25.65453137103021, "grad_norm": 0.00022154908219818026, "learning_rate": 1.4595358957635763e-05, "logits/chosen": -0.5789279341697693, "logits/rejected": 1.2437044382095337, "logps/chosen": -430.11151123046875, "logps/rejected": -562.6528930664062, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -7.121301174163818, "rewards/margins": 28.626384735107422, "rewards/rejected": -35.74768829345703, "step": 8280 }, { "epoch": 25.71649883810999, "grad_norm": 1.7708864106680267e-05, "learning_rate": 1.4266930409207791e-05, "logits/chosen": -0.5174465775489807, "logits/rejected": 1.299397587776184, "logps/chosen": -445.3390197753906, "logps/rejected": -575.9884643554688, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.392850875854492, "rewards/margins": 28.103023529052734, "rewards/rejected": -36.495872497558594, "step": 8300 }, { "epoch": 25.778466305189774, "grad_norm": 0.00012290375889278948, "learning_rate": 1.394195517632193e-05, "logits/chosen": -0.44883760809898376, "logits/rejected": 1.4809213876724243, "logps/chosen": -451.4324645996094, "logps/rejected": -562.3369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.809348106384277, "rewards/margins": 27.419443130493164, "rewards/rejected": -35.22879409790039, "step": 8320 }, { "epoch": 25.84043377226956, "grad_norm": 0.00036960910074412823, "learning_rate": 1.362044634893318e-05, "logits/chosen": -0.4777800440788269, "logits/rejected": 1.4690001010894775, "logps/chosen": -441.289306640625, "logps/rejected": -547.408935546875, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.73660135269165, "rewards/margins": 27.180755615234375, "rewards/rejected": -34.917354583740234, "step": 8340 }, { "epoch": 25.90240123934934, "grad_norm": 2.864907764887903e-05, "learning_rate": 1.3302416877370239e-05, "logits/chosen": -0.5380562543869019, "logits/rejected": 1.4958761930465698, "logps/chosen": -438.1744079589844, "logps/rejected": -547.0137939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.9420647621154785, "rewards/margins": 28.122699737548828, "rewards/rejected": -35.064762115478516, "step": 8360 }, { "epoch": 25.964368706429124, "grad_norm": 9.71209374256432e-05, "learning_rate": 1.2987879571813854e-05, "logits/chosen": -0.43615514039993286, "logits/rejected": 1.3821794986724854, "logps/chosen": -438.2367248535156, "logps/rejected": -562.9356689453125, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.997640609741211, "rewards/margins": 27.08341407775879, "rewards/rejected": -36.081050872802734, "step": 8380 }, { "epoch": 26.02633617350891, "grad_norm": 0.0003731061005964875, "learning_rate": 1.267684710178081e-05, "logits/chosen": -0.5250933766365051, "logits/rejected": 1.2111034393310547, "logps/chosen": -434.3421936035156, "logps/rejected": -571.15673828125, "loss": 0.0033, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -8.725934982299805, "rewards/margins": 27.260334014892578, "rewards/rejected": -35.98626708984375, "step": 8400 }, { "epoch": 26.08830364058869, "grad_norm": 0.00014083593850955367, "learning_rate": 1.2369331995613665e-05, "logits/chosen": -0.4025232195854187, "logits/rejected": 1.4967281818389893, "logps/chosen": -442.66424560546875, "logps/rejected": -540.0578002929688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.629788398742676, "rewards/margins": 27.342418670654297, "rewards/rejected": -34.97220993041992, "step": 8420 }, { "epoch": 26.150271107668473, "grad_norm": 7.524704415118322e-05, "learning_rate": 1.2065346639976016e-05, "logits/chosen": -0.4743157923221588, "logits/rejected": 1.304748296737671, "logps/chosen": -420.94073486328125, "logps/rejected": -565.73388671875, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.074350357055664, "rewards/margins": 27.66106605529785, "rewards/rejected": -35.735416412353516, "step": 8440 }, { "epoch": 26.21223857474826, "grad_norm": 0.0002636277349665761, "learning_rate": 1.177984113760211e-05, "logits/chosen": -0.47151750326156616, "logits/rejected": 1.3237098455429077, "logps/chosen": -424.85430908203125, "logps/rejected": -548.9588012695312, "loss": 0.0043, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.068964004516602, "rewards/margins": 27.008281707763672, "rewards/rejected": -35.077247619628906, "step": 8460 }, { "epoch": 26.27420604182804, "grad_norm": 0.0002253088023280725, "learning_rate": 1.1482773883758357e-05, "logits/chosen": -0.47794610261917114, "logits/rejected": 1.3321702480316162, "logps/chosen": -433.79241943359375, "logps/rejected": -567.6560668945312, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.62246036529541, "rewards/margins": 28.215002059936523, "rewards/rejected": -35.837459564208984, "step": 8480 }, { "epoch": 26.336173508907823, "grad_norm": 2.4017164832912385e-05, "learning_rate": 1.1189272090875591e-05, "logits/chosen": -0.4993225932121277, "logits/rejected": 1.416684865951538, "logps/chosen": -424.10797119140625, "logps/rejected": -537.0582275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.3983659744262695, "rewards/margins": 26.852685928344727, "rewards/rejected": -34.25105285644531, "step": 8500 }, { "epoch": 26.398140975987605, "grad_norm": 1.272676968255837e-06, "learning_rate": 1.0899347581163221e-05, "logits/chosen": -0.4834226965904236, "logits/rejected": 1.3799736499786377, "logps/chosen": -426.93487548828125, "logps/rejected": -559.1065673828125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.2085771560668945, "rewards/margins": 28.53904151916504, "rewards/rejected": -35.747623443603516, "step": 8520 }, { "epoch": 26.46010844306739, "grad_norm": 0.0005902862176299095, "learning_rate": 1.0613012032738268e-05, "logits/chosen": -0.49945202469825745, "logits/rejected": 1.369750738143921, "logps/chosen": -423.49951171875, "logps/rejected": -546.5079345703125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.095788478851318, "rewards/margins": 28.046178817749023, "rewards/rejected": -35.1419677734375, "step": 8540 }, { "epoch": 26.522075910147173, "grad_norm": 0.0001648461475269869, "learning_rate": 1.033027697915483e-05, "logits/chosen": -0.4533432424068451, "logits/rejected": 1.3663042783737183, "logps/chosen": -447.10919189453125, "logps/rejected": -590.3201293945312, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.335497856140137, "rewards/margins": 28.570659637451172, "rewards/rejected": -36.90616226196289, "step": 8560 }, { "epoch": 26.584043377226955, "grad_norm": 4.678579443861963e-06, "learning_rate": 1.0051153808939685e-05, "logits/chosen": -0.4500574469566345, "logits/rejected": 1.3406521081924438, "logps/chosen": -409.4193115234375, "logps/rejected": -528.8170166015625, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.115273952484131, "rewards/margins": 26.795047760009766, "rewards/rejected": -33.91032028198242, "step": 8580 }, { "epoch": 26.64601084430674, "grad_norm": 0.00027692707953974605, "learning_rate": 9.775653765133396e-06, "logits/chosen": -0.3562534749507904, "logits/rejected": 1.4658071994781494, "logps/chosen": -424.7490234375, "logps/rejected": -549.2362670898438, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.8106536865234375, "rewards/margins": 27.75030517578125, "rewards/rejected": -35.56095504760742, "step": 8600 }, { "epoch": 26.707978311386523, "grad_norm": 0.000177569585503079, "learning_rate": 9.503787944837561e-06, "logits/chosen": -0.48818501830101013, "logits/rejected": 1.3173562288284302, "logps/chosen": -436.6263732910156, "logps/rejected": -579.1359252929688, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.437201499938965, "rewards/margins": 28.580413818359375, "rewards/rejected": -37.017616271972656, "step": 8620 }, { "epoch": 26.769945778466305, "grad_norm": 0.0019391351379454136, "learning_rate": 9.23556729876781e-06, "logits/chosen": -0.3800424039363861, "logits/rejected": 1.3795052766799927, "logps/chosen": -428.7498474121094, "logps/rejected": -563.37451171875, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.988425254821777, "rewards/margins": 28.296335220336914, "rewards/rejected": -35.284767150878906, "step": 8640 }, { "epoch": 26.831913245546087, "grad_norm": 0.0002963369188364595, "learning_rate": 8.971002630812619e-06, "logits/chosen": -0.4745435118675232, "logits/rejected": 1.407877802848816, "logps/chosen": -427.7135314941406, "logps/rejected": -580.1387939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.42059850692749, "rewards/margins": 30.40887451171875, "rewards/rejected": -36.829471588134766, "step": 8660 }, { "epoch": 26.893880712625872, "grad_norm": 7.189660391304642e-05, "learning_rate": 8.710104597598223e-06, "logits/chosen": -0.4752906262874603, "logits/rejected": 1.3506954908370972, "logps/chosen": -424.6866760253906, "logps/rejected": -528.3801879882812, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.252419948577881, "rewards/margins": 26.98067855834961, "rewards/rejected": -34.23310089111328, "step": 8680 }, { "epoch": 26.955848179705654, "grad_norm": 0.00013338649296201766, "learning_rate": 8.4528837080594e-06, "logits/chosen": -0.527729332447052, "logits/rejected": 1.4104167222976685, "logps/chosen": -425.31817626953125, "logps/rejected": -548.75732421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.509429931640625, "rewards/margins": 27.011505126953125, "rewards/rejected": -35.52093505859375, "step": 8700 }, { "epoch": 27.017815646785436, "grad_norm": 7.765422924421728e-05, "learning_rate": 8.199350323016041e-06, "logits/chosen": -0.44741296768188477, "logits/rejected": 1.3625432252883911, "logps/chosen": -463.06219482421875, "logps/rejected": -607.8698120117188, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.488401412963867, "rewards/margins": 29.808670043945312, "rewards/rejected": -38.29706954956055, "step": 8720 }, { "epoch": 27.079783113865222, "grad_norm": 3.868131898343563e-05, "learning_rate": 7.949514654755962e-06, "logits/chosen": -0.4678593575954437, "logits/rejected": 1.44140625, "logps/chosen": -429.23345947265625, "logps/rejected": -552.1541748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.506895542144775, "rewards/margins": 28.108739852905273, "rewards/rejected": -35.615638732910156, "step": 8740 }, { "epoch": 27.141750580945004, "grad_norm": 0.0002340103528695181, "learning_rate": 7.703386766623444e-06, "logits/chosen": -0.3554527759552002, "logits/rejected": 1.4482638835906982, "logps/chosen": -430.29913330078125, "logps/rejected": -555.3551025390625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.764983177185059, "rewards/margins": 27.69220542907715, "rewards/rejected": -35.45718765258789, "step": 8760 }, { "epoch": 27.203718048024786, "grad_norm": 3.8839676562929526e-05, "learning_rate": 7.460976572613887e-06, "logits/chosen": -0.4137742519378662, "logits/rejected": 1.3642592430114746, "logps/chosen": -432.981689453125, "logps/rejected": -575.1232299804688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.254365921020508, "rewards/margins": 28.477890014648438, "rewards/rejected": -36.73225784301758, "step": 8780 }, { "epoch": 27.265685515104572, "grad_norm": 0.00010233109060209244, "learning_rate": 7.222293836974614e-06, "logits/chosen": -0.5344318151473999, "logits/rejected": 1.3696272373199463, "logps/chosen": -458.417236328125, "logps/rejected": -578.0775146484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.462766647338867, "rewards/margins": 28.498706817626953, "rewards/rejected": -36.96147155761719, "step": 8800 }, { "epoch": 27.327652982184354, "grad_norm": 6.865251634735614e-05, "learning_rate": 6.9873481738114145e-06, "logits/chosen": -0.5033076405525208, "logits/rejected": 1.3286387920379639, "logps/chosen": -400.2259826660156, "logps/rejected": -537.0637817382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.391934394836426, "rewards/margins": 27.038738250732422, "rewards/rejected": -34.43067169189453, "step": 8820 }, { "epoch": 27.389620449264136, "grad_norm": 5.756524478783831e-05, "learning_rate": 6.756149046701277e-06, "logits/chosen": -0.557861864566803, "logits/rejected": 1.485015869140625, "logps/chosen": -428.1537170410156, "logps/rejected": -548.026611328125, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.231484889984131, "rewards/margins": 28.245208740234375, "rewards/rejected": -35.47669219970703, "step": 8840 }, { "epoch": 27.451587916343918, "grad_norm": 8.74618417583406e-05, "learning_rate": 6.528705768311394e-06, "logits/chosen": -0.343317449092865, "logits/rejected": 1.4624199867248535, "logps/chosen": -429.8575134277344, "logps/rejected": -577.4666137695312, "loss": 0.0054, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.366799831390381, "rewards/margins": 29.25629234313965, "rewards/rejected": -36.62308883666992, "step": 8860 }, { "epoch": 27.513555383423704, "grad_norm": 3.0331759262480773e-05, "learning_rate": 6.3050275000238414e-06, "logits/chosen": -0.5099083781242371, "logits/rejected": 1.3164933919906616, "logps/chosen": -431.3134765625, "logps/rejected": -566.6702880859375, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.827919960021973, "rewards/margins": 28.29727554321289, "rewards/rejected": -36.12519454956055, "step": 8880 }, { "epoch": 27.575522850503486, "grad_norm": 0.00020247708016540855, "learning_rate": 6.085123251566616e-06, "logits/chosen": -0.3177579343318939, "logits/rejected": 1.4168663024902344, "logps/chosen": -422.2972717285156, "logps/rejected": -565.1768188476562, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.462930679321289, "rewards/margins": 27.940074920654297, "rewards/rejected": -36.40300750732422, "step": 8900 }, { "epoch": 27.637490317583268, "grad_norm": 0.00026213927776552737, "learning_rate": 5.869001880650826e-06, "logits/chosen": -0.47557058930397034, "logits/rejected": 1.302455186843872, "logps/chosen": -437.5243225097656, "logps/rejected": -567.4195556640625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.532797813415527, "rewards/margins": 27.39797592163086, "rewards/rejected": -35.93077850341797, "step": 8920 }, { "epoch": 27.699457784663053, "grad_norm": 0.00015056796837598085, "learning_rate": 5.656672092613757e-06, "logits/chosen": -0.42232632637023926, "logits/rejected": 1.44803786277771, "logps/chosen": -444.7637634277344, "logps/rejected": -585.7415771484375, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -7.886309623718262, "rewards/margins": 28.971731185913086, "rewards/rejected": -36.85803985595703, "step": 8940 }, { "epoch": 27.761425251742835, "grad_norm": 0.0001097571657737717, "learning_rate": 5.448142440068316e-06, "logits/chosen": -0.43719226121902466, "logits/rejected": 1.4402527809143066, "logps/chosen": -435.96868896484375, "logps/rejected": -554.9208374023438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.246077060699463, "rewards/margins": 28.056320190429688, "rewards/rejected": -35.302391052246094, "step": 8960 }, { "epoch": 27.823392718822618, "grad_norm": 0.00012969023373443633, "learning_rate": 5.243421322558506e-06, "logits/chosen": -0.36826613545417786, "logits/rejected": 1.3211771249771118, "logps/chosen": -431.38653564453125, "logps/rejected": -585.7420654296875, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.941071510314941, "rewards/margins": 28.0720272064209, "rewards/rejected": -37.013099670410156, "step": 8980 }, { "epoch": 27.8853601859024, "grad_norm": 5.7663233747007325e-05, "learning_rate": 5.04251698622108e-06, "logits/chosen": -0.4926798343658447, "logits/rejected": 1.2853095531463623, "logps/chosen": -423.64349365234375, "logps/rejected": -575.9889526367188, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.923436641693115, "rewards/margins": 28.138051986694336, "rewards/rejected": -36.06148910522461, "step": 9000 }, { "epoch": 27.947327652982185, "grad_norm": 0.00023578341642860323, "learning_rate": 4.845437523453411e-06, "logits/chosen": -0.4551617503166199, "logits/rejected": 1.5024217367172241, "logps/chosen": -427.65850830078125, "logps/rejected": -548.6030883789062, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -6.91290283203125, "rewards/margins": 28.266826629638672, "rewards/rejected": -35.17972946166992, "step": 9020 }, { "epoch": 28.009295120061967, "grad_norm": 0.00039731161086820066, "learning_rate": 4.652190872587525e-06, "logits/chosen": -0.5274960994720459, "logits/rejected": 1.2422288656234741, "logps/chosen": -433.1166076660156, "logps/rejected": -576.1898193359375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.720697402954102, "rewards/margins": 27.77066993713379, "rewards/rejected": -36.491371154785156, "step": 9040 }, { "epoch": 28.07126258714175, "grad_norm": 1.7103820937336423e-05, "learning_rate": 4.462784817570331e-06, "logits/chosen": -0.4241279065608978, "logits/rejected": 1.3979580402374268, "logps/chosen": -421.704345703125, "logps/rejected": -548.5910034179688, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.8423662185668945, "rewards/margins": 27.527713775634766, "rewards/rejected": -35.370079040527344, "step": 9060 }, { "epoch": 28.133230054221535, "grad_norm": 8.69917930685915e-05, "learning_rate": 4.277226987650129e-06, "logits/chosen": -0.5255895256996155, "logits/rejected": 1.4158531427383423, "logps/chosen": -457.18829345703125, "logps/rejected": -573.0018310546875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.396566390991211, "rewards/margins": 28.05706214904785, "rewards/rejected": -36.4536247253418, "step": 9080 }, { "epoch": 28.195197521301317, "grad_norm": 9.612823487259448e-05, "learning_rate": 4.095524857069244e-06, "logits/chosen": -0.5023406744003296, "logits/rejected": 1.4314396381378174, "logps/chosen": -434.41015625, "logps/rejected": -562.998291015625, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.083227157592773, "rewards/margins": 28.252487182617188, "rewards/rejected": -36.33571243286133, "step": 9100 }, { "epoch": 28.2571649883811, "grad_norm": 2.338832382520195e-05, "learning_rate": 3.917685744762989e-06, "logits/chosen": -0.37236514687538147, "logits/rejected": 1.355520486831665, "logps/chosen": -429.93804931640625, "logps/rejected": -610.6384887695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.0256195068359375, "rewards/margins": 30.985666275024414, "rewards/rejected": -38.01128387451172, "step": 9120 }, { "epoch": 28.319132455460885, "grad_norm": 0.00021635735174641013, "learning_rate": 3.7437168140648904e-06, "logits/chosen": -0.46186137199401855, "logits/rejected": 1.3692152500152588, "logps/chosen": -432.89642333984375, "logps/rejected": -550.060791015625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.183605194091797, "rewards/margins": 27.7097110748291, "rewards/rejected": -34.89331817626953, "step": 9140 }, { "epoch": 28.381099922540667, "grad_norm": 0.0004849826218560338, "learning_rate": 3.5736250724180966e-06, "logits/chosen": -0.47364211082458496, "logits/rejected": 1.4335612058639526, "logps/chosen": -423.085693359375, "logps/rejected": -547.94970703125, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.032341957092285, "rewards/margins": 27.98506736755371, "rewards/rejected": -36.01740646362305, "step": 9160 }, { "epoch": 28.44306738962045, "grad_norm": 6.062612010282464e-05, "learning_rate": 3.40741737109318e-06, "logits/chosen": -0.407947838306427, "logits/rejected": 1.195319414138794, "logps/chosen": -422.4693908691406, "logps/rejected": -581.656982421875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.4915771484375, "rewards/margins": 28.65443992614746, "rewards/rejected": -37.146018981933594, "step": 9180 }, { "epoch": 28.50503485670023, "grad_norm": 0.00024373046471737325, "learning_rate": 3.245100404912094e-06, "logits/chosen": -0.42624807357788086, "logits/rejected": 1.5248339176177979, "logps/chosen": -416.85845947265625, "logps/rejected": -533.588623046875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -8.168668746948242, "rewards/margins": 26.794754028320312, "rewards/rejected": -34.96342086791992, "step": 9200 }, { "epoch": 28.567002323780017, "grad_norm": 0.00022582674864679575, "learning_rate": 3.0866807119785734e-06, "logits/chosen": -0.47749605774879456, "logits/rejected": 1.391265869140625, "logps/chosen": -432.8900451660156, "logps/rejected": -567.29931640625, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.252971649169922, "rewards/margins": 28.681339263916016, "rewards/rejected": -36.93431091308594, "step": 9220 }, { "epoch": 28.6289697908598, "grad_norm": 0.00010398898302810267, "learning_rate": 2.9321646734147502e-06, "logits/chosen": -0.5295883417129517, "logits/rejected": 1.4302924871444702, "logps/chosen": -418.13165283203125, "logps/rejected": -574.0758666992188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.945282936096191, "rewards/margins": 30.516979217529297, "rewards/rejected": -36.46226119995117, "step": 9240 }, { "epoch": 28.69093725793958, "grad_norm": 9.844397573033348e-05, "learning_rate": 2.7815585131041435e-06, "logits/chosen": -0.5464714765548706, "logits/rejected": 1.3297626972198486, "logps/chosen": -429.76580810546875, "logps/rejected": -579.6892700195312, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.043060302734375, "rewards/margins": 29.36098861694336, "rewards/rejected": -36.40404510498047, "step": 9260 }, { "epoch": 28.752904725019366, "grad_norm": 0.0001132589895860292, "learning_rate": 2.6348682974408955e-06, "logits/chosen": -0.40210598707199097, "logits/rejected": 1.3564558029174805, "logps/chosen": -441.2831115722656, "logps/rejected": -580.2747802734375, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.844902038574219, "rewards/margins": 28.26337242126465, "rewards/rejected": -36.1082763671875, "step": 9280 }, { "epoch": 28.81487219209915, "grad_norm": 0.00019120110664516687, "learning_rate": 2.4920999350855458e-06, "logits/chosen": -0.4944397509098053, "logits/rejected": 1.402043104171753, "logps/chosen": -432.44671630859375, "logps/rejected": -566.1397705078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.532393455505371, "rewards/margins": 27.875173568725586, "rewards/rejected": -36.407569885253906, "step": 9300 }, { "epoch": 28.87683965917893, "grad_norm": 0.00020868379215244204, "learning_rate": 2.3532591767268853e-06, "logits/chosen": -0.5052774548530579, "logits/rejected": 1.4609028100967407, "logps/chosen": -440.171875, "logps/rejected": -546.31640625, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.388496398925781, "rewards/margins": 27.529205322265625, "rewards/rejected": -35.917701721191406, "step": 9320 }, { "epoch": 28.938807126258713, "grad_norm": 0.00011665547935990617, "learning_rate": 2.2183516148504226e-06, "logits/chosen": -0.5089203119277954, "logits/rejected": 1.4505350589752197, "logps/chosen": -444.054931640625, "logps/rejected": -563.5677490234375, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.200953483581543, "rewards/margins": 27.984363555908203, "rewards/rejected": -36.18532180786133, "step": 9340 }, { "epoch": 29.000774593338498, "grad_norm": 5.405825504567474e-05, "learning_rate": 2.0938375055220893e-06, "logits/chosen": -0.4666138291358948, "logits/rejected": 1.410650610923767, "logps/chosen": -427.109375, "logps/rejected": -549.4266967773438, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.324958801269531, "rewards/margins": 27.69424819946289, "rewards/rejected": -35.019203186035156, "step": 9360 }, { "epoch": 29.06274206041828, "grad_norm": 2.820672671077773e-05, "learning_rate": 1.966615161996477e-06, "logits/chosen": -0.38536205887794495, "logits/rejected": 1.5962440967559814, "logps/chosen": -445.6573181152344, "logps/rejected": -557.0641479492188, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.192480087280273, "rewards/margins": 27.905410766601562, "rewards/rejected": -36.0978889465332, "step": 9380 }, { "epoch": 29.124709527498062, "grad_norm": 0.00012238779163453728, "learning_rate": 1.8433415889175799e-06, "logits/chosen": -0.5433516502380371, "logits/rejected": 1.3412362337112427, "logps/chosen": -448.1932067871094, "logps/rejected": -580.395751953125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.69677734375, "rewards/margins": 28.863109588623047, "rewards/rejected": -36.55989074707031, "step": 9400 }, { "epoch": 29.186676994577848, "grad_norm": 5.539653648156673e-05, "learning_rate": 1.7240217517269897e-06, "logits/chosen": -0.4289434850215912, "logits/rejected": 1.394278645515442, "logps/chosen": -429.510009765625, "logps/rejected": -581.1126708984375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.776337623596191, "rewards/margins": 28.806468963623047, "rewards/rejected": -36.58280944824219, "step": 9420 }, { "epoch": 29.24864446165763, "grad_norm": 8.6743557403679e-06, "learning_rate": 1.6086604566103002e-06, "logits/chosen": -0.44304054975509644, "logits/rejected": 1.4444400072097778, "logps/chosen": -419.0517578125, "logps/rejected": -557.58251953125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.534919738769531, "rewards/margins": 29.064559936523438, "rewards/rejected": -36.59947967529297, "step": 9440 }, { "epoch": 29.310611928737412, "grad_norm": 0.0001106134441215545, "learning_rate": 1.4972623503036965e-06, "logits/chosen": -0.5489664673805237, "logits/rejected": 1.2551854848861694, "logps/chosen": -442.162841796875, "logps/rejected": -578.1027221679688, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.913488388061523, "rewards/margins": 27.9173641204834, "rewards/rejected": -36.83085250854492, "step": 9460 }, { "epoch": 29.372579395817198, "grad_norm": 0.00013442269118968397, "learning_rate": 1.3898319199066478e-06, "logits/chosen": -0.4975252151489258, "logits/rejected": 1.331676721572876, "logps/chosen": -433.96630859375, "logps/rejected": -583.5892944335938, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.272103786468506, "rewards/margins": 29.543193817138672, "rewards/rejected": -36.81529998779297, "step": 9480 }, { "epoch": 29.43454686289698, "grad_norm": 0.000179157592356205, "learning_rate": 1.2863734927012095e-06, "logits/chosen": -0.5022043585777283, "logits/rejected": 1.4060289859771729, "logps/chosen": -444.994140625, "logps/rejected": -577.6284790039062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.224766731262207, "rewards/margins": 28.887136459350586, "rewards/rejected": -36.111900329589844, "step": 9500 }, { "epoch": 29.496514329976762, "grad_norm": 4.570256351144053e-05, "learning_rate": 1.1868912359777607e-06, "logits/chosen": -0.45927444100379944, "logits/rejected": 1.3402315378189087, "logps/chosen": -428.650390625, "logps/rejected": -558.9736328125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.128926753997803, "rewards/margins": 28.86312484741211, "rewards/rejected": -35.99205780029297, "step": 9520 }, { "epoch": 29.558481797056544, "grad_norm": 7.603448466397822e-05, "learning_rate": 1.0913891568670842e-06, "logits/chosen": -0.5526672601699829, "logits/rejected": 1.4398711919784546, "logps/chosen": -442.75152587890625, "logps/rejected": -563.4345092773438, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.6889238357543945, "rewards/margins": 28.326879501342773, "rewards/rejected": -36.01580810546875, "step": 9540 }, { "epoch": 29.62044926413633, "grad_norm": 0.00014126779569778591, "learning_rate": 9.998711021790174e-07, "logits/chosen": -0.50676029920578, "logits/rejected": 1.4971421957015991, "logps/chosen": -433.22235107421875, "logps/rejected": -527.7393188476562, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.903284549713135, "rewards/margins": 26.93197250366211, "rewards/rejected": -34.83525466918945, "step": 9560 }, { "epoch": 29.68241673121611, "grad_norm": 1.3805640264763497e-05, "learning_rate": 9.123407582474541e-07, "logits/chosen": -0.43605098128318787, "logits/rejected": 1.3629835844039917, "logps/chosen": -442.35345458984375, "logps/rejected": -579.7109985351562, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.504144668579102, "rewards/margins": 28.617279052734375, "rewards/rejected": -37.121421813964844, "step": 9580 }, { "epoch": 29.744384198295894, "grad_norm": 0.00011257326696068048, "learning_rate": 8.288016507818742e-07, "logits/chosen": -0.5420119762420654, "logits/rejected": 1.3263146877288818, "logps/chosen": -431.49005126953125, "logps/rejected": -569.9481811523438, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.66623067855835, "rewards/margins": 28.865182876586914, "rewards/rejected": -36.53141403198242, "step": 9600 }, { "epoch": 29.80635166537568, "grad_norm": 3.732260665856302e-05, "learning_rate": 7.49257144725346e-07, "logits/chosen": -0.3866724669933319, "logits/rejected": 1.3957892656326294, "logps/chosen": -431.8949279785156, "logps/rejected": -569.178955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.590885162353516, "rewards/margins": 27.871734619140625, "rewards/rejected": -36.462623596191406, "step": 9620 }, { "epoch": 29.86831913245546, "grad_norm": 8.747459651203826e-05, "learning_rate": 6.737104441189801e-07, "logits/chosen": -0.5281975865364075, "logits/rejected": 1.3896856307983398, "logps/chosen": -411.46221923828125, "logps/rejected": -531.2066650390625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.190832614898682, "rewards/margins": 27.5278263092041, "rewards/rejected": -34.71866226196289, "step": 9640 }, { "epoch": 29.930286599535243, "grad_norm": 0.0035836484748870134, "learning_rate": 6.021645919728647e-07, "logits/chosen": -0.38994961977005005, "logits/rejected": 1.36453115940094, "logps/chosen": -410.65118408203125, "logps/rejected": -549.8154296875, "loss": 0.0076, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -8.187182426452637, "rewards/margins": 27.47507667541504, "rewards/rejected": -35.662261962890625, "step": 9660 }, { "epoch": 29.992254066615025, "grad_norm": 4.6857512643327937e-05, "learning_rate": 5.346224701434866e-07, "logits/chosen": -0.35250693559646606, "logits/rejected": 1.5060350894927979, "logps/chosen": -443.78369140625, "logps/rejected": -572.0866088867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.185739517211914, "rewards/margins": 28.152313232421875, "rewards/rejected": -36.33805465698242, "step": 9680 }, { "epoch": 30.05422153369481, "grad_norm": 0.00014773959992453456, "learning_rate": 4.710867992176682e-07, "logits/chosen": -0.4862252175807953, "logits/rejected": 1.4310047626495361, "logps/chosen": -430.75177001953125, "logps/rejected": -556.7779541015625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -6.786073207855225, "rewards/margins": 28.293575286865234, "rewards/rejected": -35.07965087890625, "step": 9700 }, { "epoch": 30.116189000774593, "grad_norm": 0.00040511396946385503, "learning_rate": 4.115601384029666e-07, "logits/chosen": -0.4759834408760071, "logits/rejected": 1.455631971359253, "logps/chosen": -437.209716796875, "logps/rejected": -545.3187866210938, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.022417068481445, "rewards/margins": 26.92848777770996, "rewards/rejected": -34.950904846191406, "step": 9720 }, { "epoch": 30.178156467854375, "grad_norm": 0.0002189109945902601, "learning_rate": 3.5604488542460014e-07, "logits/chosen": -0.5051871538162231, "logits/rejected": 1.4911781549453735, "logps/chosen": -438.0755920410156, "logps/rejected": -570.4459228515625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -7.649161338806152, "rewards/margins": 29.3557186126709, "rewards/rejected": -37.004878997802734, "step": 9740 }, { "epoch": 30.24012393493416, "grad_norm": 0.00017708781524561346, "learning_rate": 3.045432764288703e-07, "logits/chosen": -0.44464248418807983, "logits/rejected": 1.4907373189926147, "logps/chosen": -435.34344482421875, "logps/rejected": -544.2034301757812, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.806088447570801, "rewards/margins": 27.583263397216797, "rewards/rejected": -35.38935089111328, "step": 9760 }, { "epoch": 30.302091402013943, "grad_norm": 0.00015091463865246624, "learning_rate": 2.5705738589306696e-07, "logits/chosen": -0.4208938181400299, "logits/rejected": 1.4723773002624512, "logps/chosen": -440.129150390625, "logps/rejected": -565.9328002929688, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -8.044323921203613, "rewards/margins": 27.99143409729004, "rewards/rejected": -36.03575897216797, "step": 9780 }, { "epoch": 30.364058869093725, "grad_norm": 0.00013884674990549684, "learning_rate": 2.135891265419465e-07, "logits/chosen": -0.4527045786380768, "logits/rejected": 1.4194512367248535, "logps/chosen": -418.86083984375, "logps/rejected": -554.153564453125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -8.422151565551758, "rewards/margins": 27.6077880859375, "rewards/rejected": -36.029937744140625, "step": 9800 }, { "epoch": 30.42602633617351, "grad_norm": 4.7392735723406076e-05, "learning_rate": 1.7414024927064897e-07, "logits/chosen": -0.4850187301635742, "logits/rejected": 1.3569281101226807, "logps/chosen": -432.75164794921875, "logps/rejected": -571.4185791015625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.542023658752441, "rewards/margins": 27.974933624267578, "rewards/rejected": -36.5169563293457, "step": 9820 }, { "epoch": 30.487993803253293, "grad_norm": 0.00015379580145236105, "learning_rate": 1.3871234307420989e-07, "logits/chosen": -0.4856896996498108, "logits/rejected": 1.3765947818756104, "logps/chosen": -438.67718505859375, "logps/rejected": -581.4616088867188, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.13099479675293, "rewards/margins": 28.83926010131836, "rewards/rejected": -36.970252990722656, "step": 9840 }, { "epoch": 30.549961270333075, "grad_norm": 0.00021822135022375733, "learning_rate": 1.0730683498351157e-07, "logits/chosen": -0.4343891143798828, "logits/rejected": 1.4583574533462524, "logps/chosen": -434.26239013671875, "logps/rejected": -559.0338134765625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.727663516998291, "rewards/margins": 28.047531127929688, "rewards/rejected": -35.77519607543945, "step": 9860 }, { "epoch": 30.611928737412857, "grad_norm": 5.923645221628249e-05, "learning_rate": 7.992499000785136e-08, "logits/chosen": -0.39348846673965454, "logits/rejected": 1.4489144086837769, "logps/chosen": -441.802734375, "logps/rejected": -564.3275146484375, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.21603012084961, "rewards/margins": 27.678537368774414, "rewards/rejected": -35.894569396972656, "step": 9880 }, { "epoch": 30.673896204492642, "grad_norm": 2.2205487766768783e-05, "learning_rate": 5.6567911083937883e-08, "logits/chosen": -0.4316628575325012, "logits/rejected": 1.3483152389526367, "logps/chosen": -420.97381591796875, "logps/rejected": -556.51953125, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -8.481675148010254, "rewards/margins": 27.316696166992188, "rewards/rejected": -35.79837417602539, "step": 9900 }, { "epoch": 30.735863671572424, "grad_norm": 9.596312884241343e-05, "learning_rate": 3.723653903152657e-08, "logits/chosen": -0.3819994032382965, "logits/rejected": 1.4576747417449951, "logps/chosen": -435.2261657714844, "logps/rejected": -543.2717895507812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.418352127075195, "rewards/margins": 26.706283569335938, "rewards/rejected": -35.1246337890625, "step": 9920 }, { "epoch": 30.797831138652207, "grad_norm": 3.857334831991466e-06, "learning_rate": 2.193165251545004e-08, "logits/chosen": -0.4508894979953766, "logits/rejected": 1.3289421796798706, "logps/chosen": -435.36102294921875, "logps/rejected": -582.7653198242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.7185211181640625, "rewards/margins": 29.5615177154541, "rewards/rejected": -37.28003692626953, "step": 9940 }, { "epoch": 30.859798605731992, "grad_norm": 8.236696157837287e-05, "learning_rate": 1.0653868014309786e-08, "logits/chosen": -0.46718326210975647, "logits/rejected": 1.4237308502197266, "logps/chosen": -429.4341735839844, "logps/rejected": -575.0633544921875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.886075496673584, "rewards/margins": 29.133136749267578, "rewards/rejected": -37.01920700073242, "step": 9960 }, { "epoch": 30.921766072811774, "grad_norm": 0.0001442828943254426, "learning_rate": 3.4036397956183076e-09, "logits/chosen": -0.4402598440647125, "logits/rejected": 1.5141351222991943, "logps/chosen": -416.0702209472656, "logps/rejected": -538.6110229492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.578369140625, "rewards/margins": 28.705799102783203, "rewards/rejected": -35.28417205810547, "step": 9980 }, { "epoch": 30.983733539891556, "grad_norm": 2.7997641154797748e-05, "learning_rate": 1.812598975137192e-10, "logits/chosen": -0.41933926939964294, "logits/rejected": 1.3799235820770264, "logps/chosen": -449.55023193359375, "logps/rejected": -607.9024658203125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -8.158994674682617, "rewards/margins": 30.011821746826172, "rewards/rejected": -38.170814514160156, "step": 10000 } ], "logging_steps": 20, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 32, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }