{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.491866769945778, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.061967467079783116, "grad_norm": 0.6070870757102966, "learning_rate": 4e-05, "logits/chosen": -2.0001754760742188, "logits/rejected": -1.449440598487854, "logps/chosen": -374.65521240234375, "logps/rejected": -215.3085479736328, "loss": 1.007, "rewards/accuracies": 0.44062501192092896, "rewards/chosen": -0.3046182096004486, "rewards/margins": -0.20184263586997986, "rewards/rejected": -0.10277555137872696, "step": 20 }, { "epoch": 0.12393493415956623, "grad_norm": 0.5136411190032959, "learning_rate": 8e-05, "logits/chosen": -2.083824872970581, "logits/rejected": -1.584017038345337, "logps/chosen": -341.329833984375, "logps/rejected": -208.3067169189453, "loss": 0.1907, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.0636544227600098, "rewards/margins": 2.9626474380493164, "rewards/rejected": -0.8989933133125305, "step": 40 }, { "epoch": 0.18590240123934934, "grad_norm": 0.18788862228393555, "learning_rate": 0.00012, "logits/chosen": -2.0708529949188232, "logits/rejected": -1.5524569749832153, "logps/chosen": -329.73193359375, "logps/rejected": -221.080078125, "loss": 0.0732, "rewards/accuracies": 0.984375, "rewards/chosen": 2.1646170616149902, "rewards/margins": 4.800443649291992, "rewards/rejected": -2.635826826095581, "step": 60 }, { "epoch": 0.24786986831913246, "grad_norm": 0.2149907350540161, "learning_rate": 0.00016, "logits/chosen": -1.964525580406189, "logits/rejected": -1.425443172454834, "logps/chosen": -337.01165771484375, "logps/rejected": -236.92935180664062, "loss": 0.0384, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 1.6233104467391968, "rewards/margins": 6.25473690032959, "rewards/rejected": -4.631426811218262, "step": 80 }, { "epoch": 0.30983733539891556, "grad_norm": 0.13132674992084503, "learning_rate": 0.0002, "logits/chosen": -1.8194172382354736, "logits/rejected": -1.3340699672698975, "logps/chosen": -329.0172424316406, "logps/rejected": -260.6822814941406, "loss": 0.024, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.2860015332698822, "rewards/margins": 7.288111686706543, "rewards/rejected": -7.002110958099365, "step": 100 }, { "epoch": 0.3718048024786987, "grad_norm": 0.06768889725208282, "learning_rate": 0.00019999177886783194, "logits/chosen": -1.818981409072876, "logits/rejected": -1.3484697341918945, "logps/chosen": -359.87005615234375, "logps/rejected": -294.05047607421875, "loss": 0.021, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.2960149049758911, "rewards/margins": 8.185277938842773, "rewards/rejected": -7.889264106750488, "step": 120 }, { "epoch": 0.4337722695584818, "grad_norm": 0.00373012013733387, "learning_rate": 0.000199967116823068, "logits/chosen": -1.747314453125, "logits/rejected": -1.209826946258545, "logps/chosen": -356.72686767578125, "logps/rejected": -287.92205810546875, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 0.20157980918884277, "rewards/margins": 8.92736530303955, "rewards/rejected": -8.725785255432129, "step": 140 }, { "epoch": 0.4957397366382649, "grad_norm": 0.08832018822431564, "learning_rate": 0.00019992601792070679, "logits/chosen": -1.760593056678772, "logits/rejected": -1.227081060409546, "logps/chosen": -359.7059326171875, "logps/rejected": -307.3652648925781, "loss": 0.0121, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.5851167440414429, "rewards/margins": 9.88296890258789, "rewards/rejected": -10.468085289001465, "step": 160 }, { "epoch": 0.557707203718048, "grad_norm": 0.12635135650634766, "learning_rate": 0.00019986848891833845, "logits/chosen": -1.6951453685760498, "logits/rejected": -1.1247837543487549, "logps/chosen": -369.36383056640625, "logps/rejected": -313.21380615234375, "loss": 0.0159, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.165026903152466, "rewards/margins": 9.382209777832031, "rewards/rejected": -11.547235488891602, "step": 180 }, { "epoch": 0.6196746707978311, "grad_norm": 0.5119428038597107, "learning_rate": 0.00019979453927503364, "logits/chosen": -1.5557712316513062, "logits/rejected": -0.9883753657341003, "logps/chosen": -378.3529357910156, "logps/rejected": -338.2301330566406, "loss": 0.0109, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.9073299169540405, "rewards/margins": 10.27137565612793, "rewards/rejected": -12.178706169128418, "step": 200 }, { "epoch": 0.6816421378776143, "grad_norm": 0.012499742209911346, "learning_rate": 0.0001997041811497882, "logits/chosen": -1.639301061630249, "logits/rejected": -1.059734582901001, "logps/chosen": -403.56439208984375, "logps/rejected": -362.4933776855469, "loss": 0.0113, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.50722599029541, "rewards/margins": 11.781638145446777, "rewards/rejected": -16.288862228393555, "step": 220 }, { "epoch": 0.7436096049573974, "grad_norm": 0.015822602435946465, "learning_rate": 0.00019959742939952392, "logits/chosen": -1.801640510559082, "logits/rejected": -1.2558636665344238, "logps/chosen": -358.8158264160156, "logps/rejected": -329.281494140625, "loss": 0.0085, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.591296911239624, "rewards/margins": 11.404090881347656, "rewards/rejected": -12.995388984680176, "step": 240 }, { "epoch": 0.8055770720371804, "grad_norm": 0.06576687842607498, "learning_rate": 0.00019947430157664576, "logits/chosen": -1.816361427307129, "logits/rejected": -1.3142831325531006, "logps/chosen": -375.107421875, "logps/rejected": -361.25567626953125, "loss": 0.0121, "rewards/accuracies": 0.984375, "rewards/chosen": -2.420842409133911, "rewards/margins": 11.270395278930664, "rewards/rejected": -13.691238403320312, "step": 260 }, { "epoch": 0.8675445391169636, "grad_norm": 0.01211523823440075, "learning_rate": 0.00019933481792615583, "logits/chosen": -1.7951005697250366, "logits/rejected": -1.256089448928833, "logps/chosen": -363.334228515625, "logps/rejected": -335.49615478515625, "loss": 0.0069, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6655162572860718, "rewards/margins": 11.434516906738281, "rewards/rejected": -13.1000337600708, "step": 280 }, { "epoch": 0.9295120061967467, "grad_norm": 0.005867226514965296, "learning_rate": 0.0001991790013823246, "logits/chosen": -1.8247705698013306, "logits/rejected": -1.2836697101593018, "logps/chosen": -373.73175048828125, "logps/rejected": -328.99371337890625, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -1.960078239440918, "rewards/margins": 11.281866073608398, "rewards/rejected": -13.241943359375, "step": 300 }, { "epoch": 0.9914794732765299, "grad_norm": 0.11168529838323593, "learning_rate": 0.0001990068775649202, "logits/chosen": -1.8314838409423828, "logits/rejected": -1.3281538486480713, "logps/chosen": -362.94549560546875, "logps/rejected": -310.90692138671875, "loss": 0.0109, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.7653158903121948, "rewards/margins": 10.92064094543457, "rewards/rejected": -11.685956001281738, "step": 320 }, { "epoch": 1.053446940356313, "grad_norm": 0.053166139870882034, "learning_rate": 0.00019881847477499557, "logits/chosen": -1.8288739919662476, "logits/rejected": -1.2687069177627563, "logps/chosen": -379.93914794921875, "logps/rejected": -346.6662902832031, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -0.3435510993003845, "rewards/margins": 12.371174812316895, "rewards/rejected": -12.714726448059082, "step": 340 }, { "epoch": 1.115414407436096, "grad_norm": 0.007846315391361713, "learning_rate": 0.0001986138239902355, "logits/chosen": -1.8146957159042358, "logits/rejected": -1.1931467056274414, "logps/chosen": -361.128173828125, "logps/rejected": -333.5379333496094, "loss": 0.0035, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.7167718410491943, "rewards/margins": 13.46613597869873, "rewards/rejected": -14.182907104492188, "step": 360 }, { "epoch": 1.1773818745158793, "grad_norm": 0.0029342020861804485, "learning_rate": 0.00019839295885986296, "logits/chosen": -1.8402125835418701, "logits/rejected": -1.3026095628738403, "logps/chosen": -367.6770935058594, "logps/rejected": -334.61505126953125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.48788753151893616, "rewards/margins": 12.355894088745117, "rewards/rejected": -12.843780517578125, "step": 380 }, { "epoch": 1.2393493415956622, "grad_norm": 0.0005422068061307073, "learning_rate": 0.00019815591569910654, "logits/chosen": -1.781711220741272, "logits/rejected": -1.2187694311141968, "logps/chosen": -368.02130126953125, "logps/rejected": -336.0605163574219, "loss": 0.004, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -0.474712073802948, "rewards/margins": 13.070175170898438, "rewards/rejected": -13.544886589050293, "step": 400 }, { "epoch": 1.3013168086754454, "grad_norm": 0.004247570876032114, "learning_rate": 0.0001979027334832293, "logits/chosen": -1.729142189025879, "logits/rejected": -1.1420295238494873, "logps/chosen": -363.62261962890625, "logps/rejected": -350.509765625, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.9223267436027527, "rewards/margins": 14.022272109985352, "rewards/rejected": -14.944600105285645, "step": 420 }, { "epoch": 1.3632842757552286, "grad_norm": 0.025411546230316162, "learning_rate": 0.00019763345384112043, "logits/chosen": -1.6916519403457642, "logits/rejected": -1.1293952465057373, "logps/chosen": -368.69122314453125, "logps/rejected": -357.363037109375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.3415491580963135, "rewards/margins": 13.441192626953125, "rewards/rejected": -14.782742500305176, "step": 440 }, { "epoch": 1.4252517428350115, "grad_norm": 0.023552559316158295, "learning_rate": 0.00019734812104845047, "logits/chosen": -1.6404588222503662, "logits/rejected": -1.0976492166519165, "logps/chosen": -358.5830993652344, "logps/rejected": -323.82977294921875, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -0.1879071146249771, "rewards/margins": 11.893779754638672, "rewards/rejected": -12.081686019897461, "step": 460 }, { "epoch": 1.4872192099147947, "grad_norm": 0.04839726537466049, "learning_rate": 0.0001970467820203915, "logits/chosen": -1.4514319896697998, "logits/rejected": -0.7945712208747864, "logps/chosen": -395.62109375, "logps/rejected": -361.99224853515625, "loss": 0.0052, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.660977602005005, "rewards/margins": 13.56675910949707, "rewards/rejected": -16.227737426757812, "step": 480 }, { "epoch": 1.549186676994578, "grad_norm": 0.04717102646827698, "learning_rate": 0.00019672948630390294, "logits/chosen": -1.6030662059783936, "logits/rejected": -1.008603811264038, "logps/chosen": -382.2178955078125, "logps/rejected": -384.981201171875, "loss": 0.0185, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.418046474456787, "rewards/margins": 14.233471870422363, "rewards/rejected": -17.65151596069336, "step": 500 }, { "epoch": 1.6111541440743609, "grad_norm": 0.022282173857092857, "learning_rate": 0.00019639628606958533, "logits/chosen": -1.943267822265625, "logits/rejected": -1.5064051151275635, "logps/chosen": -350.5743408203125, "logps/rejected": -292.48321533203125, "loss": 0.0043, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -0.508022129535675, "rewards/margins": 10.412274360656738, "rewards/rejected": -10.920295715332031, "step": 520 }, { "epoch": 1.673121611154144, "grad_norm": 0.009392939507961273, "learning_rate": 0.00019604723610310194, "logits/chosen": -1.932124376296997, "logits/rejected": -1.507216215133667, "logps/chosen": -366.7988586425781, "logps/rejected": -342.846923828125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.8672822713851929, "rewards/margins": 11.667869567871094, "rewards/rejected": -12.535151481628418, "step": 540 }, { "epoch": 1.7350890782339272, "grad_norm": 0.008884243667125702, "learning_rate": 0.00019568239379617088, "logits/chosen": -1.8822323083877563, "logits/rejected": -1.4790470600128174, "logps/chosen": -364.321044921875, "logps/rejected": -341.40081787109375, "loss": 0.0035, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.8236000537872314, "rewards/margins": 12.299530982971191, "rewards/rejected": -14.123130798339844, "step": 560 }, { "epoch": 1.7970565453137102, "grad_norm": 0.0044061969965696335, "learning_rate": 0.00019530181913712872, "logits/chosen": -1.926490068435669, "logits/rejected": -1.4624470472335815, "logps/chosen": -372.48468017578125, "logps/rejected": -331.5034484863281, "loss": 0.0055, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.4063794612884521, "rewards/margins": 12.16389274597168, "rewards/rejected": -13.570272445678711, "step": 580 }, { "epoch": 1.8590240123934936, "grad_norm": 0.028566114604473114, "learning_rate": 0.00019490557470106686, "logits/chosen": -1.92436945438385, "logits/rejected": -1.499299168586731, "logps/chosen": -355.2225646972656, "logps/rejected": -351.27313232421875, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2374690771102905, "rewards/margins": 13.03515338897705, "rewards/rejected": -14.272623062133789, "step": 600 }, { "epoch": 1.9209914794732765, "grad_norm": 0.006185224745422602, "learning_rate": 0.00019449372563954293, "logits/chosen": -1.9587417840957642, "logits/rejected": -1.4495702981948853, "logps/chosen": -383.0813903808594, "logps/rejected": -355.744873046875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.4742207527160645, "rewards/margins": 13.494425773620605, "rewards/rejected": -15.968646049499512, "step": 620 }, { "epoch": 1.9829589465530595, "grad_norm": 0.006004327442497015, "learning_rate": 0.00019406633966986828, "logits/chosen": -1.9453758001327515, "logits/rejected": -1.512027621269226, "logps/chosen": -392.6808166503906, "logps/rejected": -378.18316650390625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.3614554405212402, "rewards/margins": 13.526113510131836, "rewards/rejected": -15.88757038116455, "step": 640 }, { "epoch": 2.044926413632843, "grad_norm": 0.013266593217849731, "learning_rate": 0.00019362348706397373, "logits/chosen": -1.9494597911834717, "logits/rejected": -1.4765260219573975, "logps/chosen": -373.5834045410156, "logps/rejected": -355.810546875, "loss": 0.0021, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.2433362007141113, "rewards/margins": 13.272119522094727, "rewards/rejected": -15.51545524597168, "step": 660 }, { "epoch": 2.106893880712626, "grad_norm": 0.0013421621406450868, "learning_rate": 0.0001931652406368554, "logits/chosen": -1.879929542541504, "logits/rejected": -1.4265925884246826, "logps/chosen": -377.5626220703125, "logps/rejected": -365.1024475097656, "loss": 0.0016, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.054849624633789, "rewards/margins": 14.068676948547363, "rewards/rejected": -16.123525619506836, "step": 680 }, { "epoch": 2.168861347792409, "grad_norm": 0.0016059954650700092, "learning_rate": 0.0001926916757346022, "logits/chosen": -1.8783481121063232, "logits/rejected": -1.4017314910888672, "logps/chosen": -375.7680969238281, "logps/rejected": -356.9335021972656, "loss": 0.0024, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.0512871742248535, "rewards/margins": 14.513456344604492, "rewards/rejected": -16.564743041992188, "step": 700 }, { "epoch": 2.230828814872192, "grad_norm": 0.0020687805954366922, "learning_rate": 0.00019220287022200707, "logits/chosen": -1.8722127676010132, "logits/rejected": -1.4170135259628296, "logps/chosen": -360.9228515625, "logps/rejected": -376.93304443359375, "loss": 0.0024, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.443851947784424, "rewards/margins": 15.007545471191406, "rewards/rejected": -17.451396942138672, "step": 720 }, { "epoch": 2.292796281951975, "grad_norm": 0.03182324767112732, "learning_rate": 0.00019169890446976454, "logits/chosen": -1.8520162105560303, "logits/rejected": -1.316450834274292, "logps/chosen": -392.74285888671875, "logps/rejected": -379.98138427734375, "loss": 0.0013, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.4773547649383545, "rewards/margins": 15.281835556030273, "rewards/rejected": -17.75918960571289, "step": 740 }, { "epoch": 2.3547637490317586, "grad_norm": 0.015935391187667847, "learning_rate": 0.0001911798613412557, "logits/chosen": -1.8732004165649414, "logits/rejected": -1.374529480934143, "logps/chosen": -386.89178466796875, "logps/rejected": -386.22894287109375, "loss": 0.0034, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.536558151245117, "rewards/margins": 15.137763977050781, "rewards/rejected": -17.6743221282959, "step": 760 }, { "epoch": 2.4167312161115415, "grad_norm": 0.00028358056442812085, "learning_rate": 0.0001906458261789238, "logits/chosen": -1.8395631313323975, "logits/rejected": -1.3308550119400024, "logps/chosen": -388.93792724609375, "logps/rejected": -391.17559814453125, "loss": 0.0018, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.6551461219787598, "rewards/margins": 15.461560249328613, "rewards/rejected": -18.116708755493164, "step": 780 }, { "epoch": 2.4786986831913245, "grad_norm": 0.001103501650504768, "learning_rate": 0.0001900968867902419, "logits/chosen": -1.8540499210357666, "logits/rejected": -1.3438807725906372, "logps/chosen": -397.89093017578125, "logps/rejected": -393.6608581542969, "loss": 0.0015, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.684976100921631, "rewards/margins": 15.562596321105957, "rewards/rejected": -18.247573852539062, "step": 800 }, { "epoch": 2.5406661502711074, "grad_norm": 0.05029486119747162, "learning_rate": 0.0001895331334332753, "logits/chosen": -1.8151705265045166, "logits/rejected": -1.3103126287460327, "logps/chosen": -396.3746643066406, "logps/rejected": -391.5860900878906, "loss": 0.0037, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.1363155841827393, "rewards/margins": 15.38147258758545, "rewards/rejected": -18.51778793334961, "step": 820 }, { "epoch": 2.602633617350891, "grad_norm": 0.0015266811242327094, "learning_rate": 0.0001889546588018412, "logits/chosen": -1.850388765335083, "logits/rejected": -1.3118959665298462, "logps/chosen": -381.0390319824219, "logps/rejected": -371.218505859375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.7308974266052246, "rewards/margins": 15.474958419799805, "rewards/rejected": -18.205854415893555, "step": 840 }, { "epoch": 2.664601084430674, "grad_norm": 0.010239909403026104, "learning_rate": 0.00018836155801026753, "logits/chosen": -1.8376766443252563, "logits/rejected": -1.337482213973999, "logps/chosen": -380.15032958984375, "logps/rejected": -385.6625061035156, "loss": 0.0059, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.8081612586975098, "rewards/margins": 15.317975997924805, "rewards/rejected": -18.12613868713379, "step": 860 }, { "epoch": 2.726568551510457, "grad_norm": 0.005239796359091997, "learning_rate": 0.00018775392857775432, "logits/chosen": -1.8260116577148438, "logits/rejected": -1.3371708393096924, "logps/chosen": -386.72052001953125, "logps/rejected": -393.1973571777344, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.4543259143829346, "rewards/margins": 15.405393600463867, "rewards/rejected": -18.859722137451172, "step": 880 }, { "epoch": 2.78853601859024, "grad_norm": 0.0014312748098745942, "learning_rate": 0.00018713187041233896, "logits/chosen": -1.8437349796295166, "logits/rejected": -1.295083999633789, "logps/chosen": -396.12713623046875, "logps/rejected": -400.5750427246094, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.4062328338623047, "rewards/margins": 17.027809143066406, "rewards/rejected": -20.434043884277344, "step": 900 }, { "epoch": 2.850503485670023, "grad_norm": 0.03151211887598038, "learning_rate": 0.00018649548579446936, "logits/chosen": -1.8418632745742798, "logits/rejected": -1.3832991123199463, "logps/chosen": -387.4415588378906, "logps/rejected": -418.4268493652344, "loss": 0.0036, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.485564708709717, "rewards/margins": 15.658266067504883, "rewards/rejected": -19.14383316040039, "step": 920 }, { "epoch": 2.9124709527498065, "grad_norm": 0.003437014762312174, "learning_rate": 0.00018584487936018661, "logits/chosen": -1.957241415977478, "logits/rejected": -1.4707096815109253, "logps/chosen": -370.52734375, "logps/rejected": -367.0068054199219, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.7640680074691772, "rewards/margins": 14.591270446777344, "rewards/rejected": -16.3553409576416, "step": 940 }, { "epoch": 2.9744384198295895, "grad_norm": 0.0018515066476538777, "learning_rate": 0.00018518015808392045, "logits/chosen": -1.8616878986358643, "logits/rejected": -1.3850669860839844, "logps/chosen": -370.74847412109375, "logps/rejected": -395.7770690917969, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -2.756985664367676, "rewards/margins": 15.77873420715332, "rewards/rejected": -18.53571891784668, "step": 960 }, { "epoch": 3.0364058869093724, "grad_norm": 0.0055403695441782475, "learning_rate": 0.00018450143126090015, "logits/chosen": -1.9129266738891602, "logits/rejected": -1.4352341890335083, "logps/chosen": -378.54547119140625, "logps/rejected": -389.22955322265625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.454970359802246, "rewards/margins": 15.567869186401367, "rewards/rejected": -18.022838592529297, "step": 980 }, { "epoch": 3.098373353989156, "grad_norm": 0.0003845282772090286, "learning_rate": 0.00018380881048918405, "logits/chosen": -1.955512285232544, "logits/rejected": -1.4428436756134033, "logps/chosen": -375.7381286621094, "logps/rejected": -373.1043701171875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.9304916858673096, "rewards/margins": 15.572137832641602, "rewards/rejected": -17.502628326416016, "step": 1000 }, { "epoch": 3.1603408210689388, "grad_norm": 0.000813652528449893, "learning_rate": 0.00018310240965131041, "logits/chosen": -1.9499313831329346, "logits/rejected": -1.4106732606887817, "logps/chosen": -363.78314208984375, "logps/rejected": -364.62835693359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.788172721862793, "rewards/margins": 15.584823608398438, "rewards/rejected": -17.372997283935547, "step": 1020 }, { "epoch": 3.2223082881487217, "grad_norm": 0.0015642641810700297, "learning_rate": 0.00018238234489557215, "logits/chosen": -1.9376710653305054, "logits/rejected": -1.4058828353881836, "logps/chosen": -391.0188903808594, "logps/rejected": -384.52716064453125, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.709324598312378, "rewards/margins": 16.003910064697266, "rewards/rejected": -17.713237762451172, "step": 1040 }, { "epoch": 3.284275755228505, "grad_norm": 0.013190961442887783, "learning_rate": 0.00018164873461691986, "logits/chosen": -1.9225285053253174, "logits/rejected": -1.4039231538772583, "logps/chosen": -389.7248840332031, "logps/rejected": -403.44891357421875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.2535457611083984, "rewards/margins": 17.14788818359375, "rewards/rejected": -19.401432037353516, "step": 1060 }, { "epoch": 3.346243222308288, "grad_norm": 0.0009441258735023439, "learning_rate": 0.00018090169943749476, "logits/chosen": -1.9266620874404907, "logits/rejected": -1.3820419311523438, "logps/chosen": -377.3229064941406, "logps/rejected": -394.3813171386719, "loss": 0.0012, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.6834962368011475, "rewards/margins": 16.853666305541992, "rewards/rejected": -19.537160873413086, "step": 1080 }, { "epoch": 3.4082106893880715, "grad_norm": 0.000891213770955801, "learning_rate": 0.00018014136218679567, "logits/chosen": -1.8898261785507202, "logits/rejected": -1.3582581281661987, "logps/chosen": -367.8475341796875, "logps/rejected": -381.94219970703125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -2.8650197982788086, "rewards/margins": 16.576953887939453, "rewards/rejected": -19.441974639892578, "step": 1100 }, { "epoch": 3.4701781564678544, "grad_norm": 0.0021270292345434427, "learning_rate": 0.00017936784788148328, "logits/chosen": -1.9054046869277954, "logits/rejected": -1.3137685060501099, "logps/chosen": -396.55718994140625, "logps/rejected": -399.8603515625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.9427146911621094, "rewards/margins": 17.294252395629883, "rewards/rejected": -20.236968994140625, "step": 1120 }, { "epoch": 3.5321456235476374, "grad_norm": 0.0006443614838644862, "learning_rate": 0.00017858128370482426, "logits/chosen": -1.8784294128417969, "logits/rejected": -1.3266098499298096, "logps/chosen": -376.5830993652344, "logps/rejected": -384.6981506347656, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.870404005050659, "rewards/margins": 17.322202682495117, "rewards/rejected": -20.192609786987305, "step": 1140 }, { "epoch": 3.5941130906274203, "grad_norm": 0.0011427829740568995, "learning_rate": 0.00017778179898577973, "logits/chosen": -1.8605209589004517, "logits/rejected": -1.3551753759384155, "logps/chosen": -393.83099365234375, "logps/rejected": -431.01824951171875, "loss": 0.0044, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.188037872314453, "rewards/margins": 17.58969497680664, "rewards/rejected": -21.77773094177246, "step": 1160 }, { "epoch": 3.6560805577072037, "grad_norm": 0.00015023932792246342, "learning_rate": 0.00017696952517774062, "logits/chosen": -1.8713442087173462, "logits/rejected": -1.2884734869003296, "logps/chosen": -389.5274658203125, "logps/rejected": -406.44696044921875, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.2542433738708496, "rewards/margins": 18.175609588623047, "rewards/rejected": -21.429855346679688, "step": 1180 }, { "epoch": 3.7180480247869867, "grad_norm": 0.0034171934239566326, "learning_rate": 0.00017614459583691346, "logits/chosen": -1.8342435359954834, "logits/rejected": -1.33168625831604, "logps/chosen": -392.7457275390625, "logps/rejected": -424.7430725097656, "loss": 0.0012, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.033926963806152, "rewards/margins": 17.532773971557617, "rewards/rejected": -21.566701889038086, "step": 1200 }, { "epoch": 3.78001549186677, "grad_norm": 0.00014497939264401793, "learning_rate": 0.00017530714660036112, "logits/chosen": -1.8120412826538086, "logits/rejected": -1.2837426662445068, "logps/chosen": -400.38055419921875, "logps/rejected": -432.98175048828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.632486343383789, "rewards/margins": 18.09763526916504, "rewards/rejected": -21.730119705200195, "step": 1220 }, { "epoch": 3.841982958946553, "grad_norm": 0.00035277256392873824, "learning_rate": 0.0001744573151637007, "logits/chosen": -1.7961149215698242, "logits/rejected": -1.2880661487579346, "logps/chosen": -389.3721618652344, "logps/rejected": -458.435546875, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -4.226949214935303, "rewards/margins": 18.70314598083496, "rewards/rejected": -22.930095672607422, "step": 1240 }, { "epoch": 3.903950426026336, "grad_norm": 0.0018203147919848561, "learning_rate": 0.0001735952412584635, "logits/chosen": -1.8189284801483154, "logits/rejected": -1.2755413055419922, "logps/chosen": -403.92608642578125, "logps/rejected": -437.57470703125, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.302323818206787, "rewards/margins": 18.439044952392578, "rewards/rejected": -22.741369247436523, "step": 1260 }, { "epoch": 3.9659178931061194, "grad_norm": 0.000810753321275115, "learning_rate": 0.00017272106662911973, "logits/chosen": -1.8001739978790283, "logits/rejected": -1.2190439701080322, "logps/chosen": -392.6038513183594, "logps/rejected": -409.79754638671875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.5174388885498047, "rewards/margins": 18.15955924987793, "rewards/rejected": -21.676998138427734, "step": 1280 }, { "epoch": 4.027885360185903, "grad_norm": 0.0008877617656253278, "learning_rate": 0.00017183493500977278, "logits/chosen": -1.7996867895126343, "logits/rejected": -1.2403078079223633, "logps/chosen": -376.8688659667969, "logps/rejected": -401.3122863769531, "loss": 0.0012, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.8793225288391113, "rewards/margins": 17.706012725830078, "rewards/rejected": -21.58533477783203, "step": 1300 }, { "epoch": 4.089852827265686, "grad_norm": 0.0007201443077065051, "learning_rate": 0.0001709369921005258, "logits/chosen": -1.7817294597625732, "logits/rejected": -1.3144575357437134, "logps/chosen": -362.8156433105469, "logps/rejected": -421.5276794433594, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.907405376434326, "rewards/margins": 17.486907958984375, "rewards/rejected": -21.394317626953125, "step": 1320 }, { "epoch": 4.151820294345469, "grad_norm": 0.0004134229675401002, "learning_rate": 0.00017002738554352552, "logits/chosen": -1.7647602558135986, "logits/rejected": -1.2397964000701904, "logps/chosen": -400.63525390625, "logps/rejected": -434.27734375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.650538444519043, "rewards/margins": 17.86612319946289, "rewards/rejected": -22.516660690307617, "step": 1340 }, { "epoch": 4.213787761425252, "grad_norm": 0.0018414207734167576, "learning_rate": 0.00016910626489868649, "logits/chosen": -1.8098886013031006, "logits/rejected": -1.2557048797607422, "logps/chosen": -403.9068908691406, "logps/rejected": -441.5738220214844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.8825366497039795, "rewards/margins": 19.2824764251709, "rewards/rejected": -23.165014266967773, "step": 1360 }, { "epoch": 4.275755228505035, "grad_norm": 0.000604189292062074, "learning_rate": 0.00016817378161909996, "logits/chosen": -1.7331501245498657, "logits/rejected": -1.1988348960876465, "logps/chosen": -379.48004150390625, "logps/rejected": -416.23504638671875, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.858603477478027, "rewards/margins": 17.692523956298828, "rewards/rejected": -22.551128387451172, "step": 1380 }, { "epoch": 4.337722695584818, "grad_norm": 0.0018184883520007133, "learning_rate": 0.0001672300890261317, "logits/chosen": -1.786969780921936, "logits/rejected": -1.1631317138671875, "logps/chosen": -399.63836669921875, "logps/rejected": -406.0413513183594, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.497194766998291, "rewards/margins": 17.606014251708984, "rewards/rejected": -22.103206634521484, "step": 1400 }, { "epoch": 4.3996901626646014, "grad_norm": 0.0004817396984435618, "learning_rate": 0.0001662753422842123, "logits/chosen": -1.803607702255249, "logits/rejected": -1.2023392915725708, "logps/chosen": -397.8926086425781, "logps/rejected": -415.9464416503906, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.3522844314575195, "rewards/margins": 18.28469467163086, "rewards/rejected": -22.636978149414062, "step": 1420 }, { "epoch": 4.461657629744384, "grad_norm": 0.0003521572216413915, "learning_rate": 0.00016530969837532487, "logits/chosen": -1.745550513267517, "logits/rejected": -1.2345880270004272, "logps/chosen": -398.3353271484375, "logps/rejected": -455.84991455078125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.580657005310059, "rewards/margins": 18.520645141601562, "rewards/rejected": -23.101301193237305, "step": 1440 }, { "epoch": 4.523625096824167, "grad_norm": 0.001398236840032041, "learning_rate": 0.00016433331607319343, "logits/chosen": -1.7653003931045532, "logits/rejected": -1.2409374713897705, "logps/chosen": -390.4782409667969, "logps/rejected": -445.02203369140625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.244819641113281, "rewards/margins": 19.066150665283203, "rewards/rejected": -23.31096839904785, "step": 1460 }, { "epoch": 4.58559256390395, "grad_norm": 0.0006393153453245759, "learning_rate": 0.00016334635591717703, "logits/chosen": -1.7738897800445557, "logits/rejected": -1.2459341287612915, "logps/chosen": -405.1599426269531, "logps/rejected": -465.34796142578125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.869115114212036, "rewards/margins": 20.113529205322266, "rewards/rejected": -23.98264503479004, "step": 1480 }, { "epoch": 4.647560030983733, "grad_norm": 0.0002729636325966567, "learning_rate": 0.00016234898018587337, "logits/chosen": -1.7716586589813232, "logits/rejected": -1.156842589378357, "logps/chosen": -400.9200439453125, "logps/rejected": -419.4234924316406, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.553537368774414, "rewards/margins": 18.427448272705078, "rewards/rejected": -22.980987548828125, "step": 1500 }, { "epoch": 4.709527498063517, "grad_norm": 0.0016045222291722894, "learning_rate": 0.00016134135287043669, "logits/chosen": -1.7796188592910767, "logits/rejected": -1.1779518127441406, "logps/chosen": -407.48773193359375, "logps/rejected": -439.03143310546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.231381416320801, "rewards/margins": 19.530107498168945, "rewards/rejected": -23.761486053466797, "step": 1520 }, { "epoch": 4.7714949651433, "grad_norm": 0.0001898371265269816, "learning_rate": 0.00016032363964761363, "logits/chosen": -1.7506084442138672, "logits/rejected": -1.1158758401870728, "logps/chosen": -412.0704650878906, "logps/rejected": -419.58477783203125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.735566139221191, "rewards/margins": 18.557144165039062, "rewards/rejected": -23.292709350585938, "step": 1540 }, { "epoch": 4.833462432223083, "grad_norm": 0.0011102559510618448, "learning_rate": 0.00015929600785250257, "logits/chosen": -1.772351861000061, "logits/rejected": -1.199371576309204, "logps/chosen": -411.6983337402344, "logps/rejected": -456.08526611328125, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -4.344552516937256, "rewards/margins": 19.66854476928711, "rewards/rejected": -24.01309585571289, "step": 1560 }, { "epoch": 4.895429899302866, "grad_norm": 0.0002147419872926548, "learning_rate": 0.0001582586264510396, "logits/chosen": -1.7624610662460327, "logits/rejected": -1.1555306911468506, "logps/chosen": -392.86846923828125, "logps/rejected": -411.6356506347656, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -3.8805503845214844, "rewards/margins": 18.482906341552734, "rewards/rejected": -22.36345672607422, "step": 1580 }, { "epoch": 4.957397366382649, "grad_norm": 0.00014843855751678348, "learning_rate": 0.00015721166601221698, "logits/chosen": -1.7433449029922485, "logits/rejected": -1.1605427265167236, "logps/chosen": -402.5615539550781, "logps/rejected": -437.72601318359375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.379772186279297, "rewards/margins": 19.26140022277832, "rewards/rejected": -23.641170501708984, "step": 1600 }, { "epoch": 5.019364833462432, "grad_norm": 9.896748815663159e-05, "learning_rate": 0.0001561552986800375, "logits/chosen": -1.7666635513305664, "logits/rejected": -1.2081592082977295, "logps/chosen": -409.02685546875, "logps/rejected": -462.6644592285156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.824324607849121, "rewards/margins": 19.418132781982422, "rewards/rejected": -24.242456436157227, "step": 1620 }, { "epoch": 5.081332300542216, "grad_norm": 6.193404988152906e-05, "learning_rate": 0.00015508969814521025, "logits/chosen": -1.7530428171157837, "logits/rejected": -1.2155699729919434, "logps/chosen": -396.701171875, "logps/rejected": -438.2998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.757896423339844, "rewards/margins": 18.720035552978516, "rewards/rejected": -23.47793197631836, "step": 1640 }, { "epoch": 5.143299767621999, "grad_norm": 0.0005012938636355102, "learning_rate": 0.00015401503961659204, "logits/chosen": -1.76808762550354, "logits/rejected": -1.2039562463760376, "logps/chosen": -416.18133544921875, "logps/rejected": -471.65032958984375, "loss": 0.0055, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.3714799880981445, "rewards/margins": 20.104217529296875, "rewards/rejected": -24.475696563720703, "step": 1660 }, { "epoch": 5.205267234701782, "grad_norm": 0.0007204354042187333, "learning_rate": 0.00015293149979237876, "logits/chosen": -1.700727105140686, "logits/rejected": -1.1688693761825562, "logps/chosen": -395.04620361328125, "logps/rejected": -459.3890686035156, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.376019477844238, "rewards/margins": 19.267929077148438, "rewards/rejected": -24.643945693969727, "step": 1680 }, { "epoch": 5.267234701781565, "grad_norm": 0.00012067196075804532, "learning_rate": 0.00015183925683105254, "logits/chosen": -1.7348114252090454, "logits/rejected": -1.1479172706604004, "logps/chosen": -411.1114807128906, "logps/rejected": -467.02777099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.358091354370117, "rewards/margins": 20.035839080810547, "rewards/rejected": -24.393932342529297, "step": 1700 }, { "epoch": 5.329202168861348, "grad_norm": 0.0015901889419183135, "learning_rate": 0.00015073849032208822, "logits/chosen": -1.7161178588867188, "logits/rejected": -1.1550828218460083, "logps/chosen": -408.5069885253906, "logps/rejected": -455.2245178222656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.192176342010498, "rewards/margins": 19.474624633789062, "rewards/rejected": -24.66680145263672, "step": 1720 }, { "epoch": 5.3911696359411305, "grad_norm": 2.9804143196088262e-05, "learning_rate": 0.00014962938125642503, "logits/chosen": -1.7266225814819336, "logits/rejected": -1.1720420122146606, "logps/chosen": -404.70721435546875, "logps/rejected": -468.11956787109375, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.117176532745361, "rewards/margins": 19.882728576660156, "rewards/rejected": -24.99990463256836, "step": 1740 }, { "epoch": 5.453137103020914, "grad_norm": 0.001581120421178639, "learning_rate": 0.00014851211199670721, "logits/chosen": -1.7630701065063477, "logits/rejected": -1.1630027294158936, "logps/chosen": -387.80364990234375, "logps/rejected": -445.5340270996094, "loss": 0.0076, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.650803565979004, "rewards/margins": 19.620697021484375, "rewards/rejected": -24.271501541137695, "step": 1760 }, { "epoch": 5.515104570100697, "grad_norm": 7.492147415177897e-05, "learning_rate": 0.00014738686624729986, "logits/chosen": -1.7199184894561768, "logits/rejected": -1.1519477367401123, "logps/chosen": -398.6278991699219, "logps/rejected": -449.28826904296875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.650136947631836, "rewards/margins": 19.344139099121094, "rewards/rejected": -23.99427604675293, "step": 1780 }, { "epoch": 5.57707203718048, "grad_norm": 0.0007189544849097729, "learning_rate": 0.00014625382902408356, "logits/chosen": -1.7485740184783936, "logits/rejected": -1.15171217918396, "logps/chosen": -413.4642639160156, "logps/rejected": -454.82623291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.690885543823242, "rewards/margins": 19.775279998779297, "rewards/rejected": -24.466161727905273, "step": 1800 }, { "epoch": 5.639039504260263, "grad_norm": 9.353666246170178e-05, "learning_rate": 0.00014511318662403347, "logits/chosen": -1.7578392028808594, "logits/rejected": -1.1830543279647827, "logps/chosen": -395.25433349609375, "logps/rejected": -461.00128173828125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.259980201721191, "rewards/margins": 20.097646713256836, "rewards/rejected": -24.35762596130371, "step": 1820 }, { "epoch": 5.701006971340046, "grad_norm": 0.00011017426731996238, "learning_rate": 0.00014396512659458824, "logits/chosen": -1.718340277671814, "logits/rejected": -1.1603585481643677, "logps/chosen": -397.50201416015625, "logps/rejected": -441.17120361328125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.123129844665527, "rewards/margins": 18.981271743774414, "rewards/rejected": -24.104402542114258, "step": 1840 }, { "epoch": 5.76297443841983, "grad_norm": 0.0007490446441806853, "learning_rate": 0.0001428098377028126, "logits/chosen": -1.7352231740951538, "logits/rejected": -1.1633882522583008, "logps/chosen": -395.93719482421875, "logps/rejected": -450.5420837402344, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.510663032531738, "rewards/margins": 20.08230972290039, "rewards/rejected": -24.59296989440918, "step": 1860 }, { "epoch": 5.824941905499613, "grad_norm": 0.002562998328357935, "learning_rate": 0.0001416475099043599, "logits/chosen": -1.7280263900756836, "logits/rejected": -1.0888252258300781, "logps/chosen": -383.5231628417969, "logps/rejected": -423.22735595703125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.361128330230713, "rewards/margins": 19.707561492919922, "rewards/rejected": -24.06869125366211, "step": 1880 }, { "epoch": 5.886909372579396, "grad_norm": 0.0003409655182622373, "learning_rate": 0.00014047833431223938, "logits/chosen": -1.7228466272354126, "logits/rejected": -1.1678210496902466, "logps/chosen": -427.7156677246094, "logps/rejected": -484.9002990722656, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.072082996368408, "rewards/margins": 19.94878387451172, "rewards/rejected": -25.0208683013916, "step": 1900 }, { "epoch": 5.948876839659179, "grad_norm": 3.485321212792769e-05, "learning_rate": 0.00013930250316539238, "logits/chosen": -1.7439708709716797, "logits/rejected": -1.1591265201568604, "logps/chosen": -409.28485107421875, "logps/rejected": -464.5729064941406, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.592177867889404, "rewards/margins": 20.056758880615234, "rewards/rejected": -24.64893913269043, "step": 1920 }, { "epoch": 6.010844306738962, "grad_norm": 0.0024052930530160666, "learning_rate": 0.00013812020979708418, "logits/chosen": -1.766571044921875, "logits/rejected": -1.1335632801055908, "logps/chosen": -409.98095703125, "logps/rejected": -432.7437438964844, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.963695526123047, "rewards/margins": 19.679019927978516, "rewards/rejected": -24.642715454101562, "step": 1940 }, { "epoch": 6.072811773818745, "grad_norm": 7.735176041023806e-05, "learning_rate": 0.00013693164860311565, "logits/chosen": -1.7631984949111938, "logits/rejected": -1.1198147535324097, "logps/chosen": -398.9923400878906, "logps/rejected": -429.88861083984375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.000827312469482, "rewards/margins": 20.33033561706543, "rewards/rejected": -24.331165313720703, "step": 1960 }, { "epoch": 6.134779240898529, "grad_norm": 0.0003688503638841212, "learning_rate": 0.0001357370150098601, "logits/chosen": -1.7265870571136475, "logits/rejected": -1.1435579061508179, "logps/chosen": -390.2747497558594, "logps/rejected": -457.9873962402344, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.666455268859863, "rewards/margins": 20.30272102355957, "rewards/rejected": -24.969173431396484, "step": 1980 }, { "epoch": 6.196746707978312, "grad_norm": 0.0016685057198628783, "learning_rate": 0.00013453650544213076, "logits/chosen": -1.7364275455474854, "logits/rejected": -1.1212728023529053, "logps/chosen": -404.72869873046875, "logps/rejected": -440.9786071777344, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.581490993499756, "rewards/margins": 19.78643035888672, "rewards/rejected": -24.367919921875, "step": 2000 }, { "epoch": 6.258714175058095, "grad_norm": 0.00023198116105049849, "learning_rate": 0.00013333031729088419, "logits/chosen": -1.7448314428329468, "logits/rejected": -1.1462557315826416, "logps/chosen": -401.00048828125, "logps/rejected": -452.0621032714844, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.24946928024292, "rewards/margins": 20.46927833557129, "rewards/rejected": -24.718748092651367, "step": 2020 }, { "epoch": 6.3206816421378775, "grad_norm": 0.00022464637004304677, "learning_rate": 0.00013211864888076457, "logits/chosen": -1.691931962966919, "logits/rejected": -1.16156005859375, "logps/chosen": -417.93585205078125, "logps/rejected": -468.42791748046875, "loss": 0.0044, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.851279258728027, "rewards/margins": 19.037456512451172, "rewards/rejected": -24.888734817504883, "step": 2040 }, { "epoch": 6.3826491092176605, "grad_norm": 0.0001370076060993597, "learning_rate": 0.00013090169943749476, "logits/chosen": -1.7306629419326782, "logits/rejected": -1.16789972782135, "logps/chosen": -400.44989013671875, "logps/rejected": -461.5997009277344, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.793812274932861, "rewards/margins": 20.2277889251709, "rewards/rejected": -25.02159881591797, "step": 2060 }, { "epoch": 6.4446165762974434, "grad_norm": 0.0007584911654703319, "learning_rate": 0.00012967966905511906, "logits/chosen": -1.7538254261016846, "logits/rejected": -1.1523357629776, "logps/chosen": -400.55078125, "logps/rejected": -457.19439697265625, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.016867637634277, "rewards/margins": 20.043991088867188, "rewards/rejected": -25.06085777282715, "step": 2080 }, { "epoch": 6.506584043377227, "grad_norm": 0.00025258222012780607, "learning_rate": 0.00012845275866310324, "logits/chosen": -1.709283471107483, "logits/rejected": -1.1272356510162354, "logps/chosen": -393.4644775390625, "logps/rejected": -445.11932373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.725881576538086, "rewards/margins": 20.157442092895508, "rewards/rejected": -24.88332176208496, "step": 2100 }, { "epoch": 6.56855151045701, "grad_norm": 0.0005373629392124712, "learning_rate": 0.00012722116999329712, "logits/chosen": -1.7319450378417969, "logits/rejected": -1.146323323249817, "logps/chosen": -400.94219970703125, "logps/rejected": -457.70294189453125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.707498073577881, "rewards/margins": 19.930648803710938, "rewards/rejected": -24.638147354125977, "step": 2120 }, { "epoch": 6.630518977536793, "grad_norm": 3.2575491786701605e-05, "learning_rate": 0.0001259851055467653, "logits/chosen": -1.7204310894012451, "logits/rejected": -1.1470435857772827, "logps/chosen": -407.14794921875, "logps/rejected": -463.16937255859375, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.401209831237793, "rewards/margins": 19.731382369995117, "rewards/rejected": -25.132593154907227, "step": 2140 }, { "epoch": 6.692486444616576, "grad_norm": 4.120891753700562e-05, "learning_rate": 0.00012474476856049144, "logits/chosen": -1.758186936378479, "logits/rejected": -1.0516242980957031, "logps/chosen": -422.578125, "logps/rejected": -450.13360595703125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -5.05043888092041, "rewards/margins": 20.296903610229492, "rewards/rejected": -25.347341537475586, "step": 2160 }, { "epoch": 6.754453911696359, "grad_norm": 0.0018112401012331247, "learning_rate": 0.00012350036297396154, "logits/chosen": -1.7569530010223389, "logits/rejected": -1.1236534118652344, "logps/chosen": -398.9664001464844, "logps/rejected": -440.2588806152344, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.544419288635254, "rewards/margins": 20.12918472290039, "rewards/rejected": -24.673603057861328, "step": 2180 }, { "epoch": 6.816421378776143, "grad_norm": 0.0009737831423990428, "learning_rate": 0.00012225209339563145, "logits/chosen": -1.709917664527893, "logits/rejected": -1.1064178943634033, "logps/chosen": -414.5459899902344, "logps/rejected": -465.4837341308594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.264222145080566, "rewards/margins": 20.37704849243164, "rewards/rejected": -25.64126968383789, "step": 2200 }, { "epoch": 6.878388845855926, "grad_norm": 0.000668133026920259, "learning_rate": 0.00012100016506928493, "logits/chosen": -1.733787178993225, "logits/rejected": -1.1450860500335693, "logps/chosen": -403.2812805175781, "logps/rejected": -477.0782165527344, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.641029357910156, "rewards/margins": 21.0471134185791, "rewards/rejected": -25.68814468383789, "step": 2220 }, { "epoch": 6.940356312935709, "grad_norm": 0.00028338556876406074, "learning_rate": 0.00011974478384028672, "logits/chosen": -1.703685998916626, "logits/rejected": -1.0926717519760132, "logps/chosen": -415.73248291015625, "logps/rejected": -474.7493591308594, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.833617210388184, "rewards/margins": 19.839744567871094, "rewards/rejected": -25.67336082458496, "step": 2240 }, { "epoch": 7.002323780015492, "grad_norm": 9.248249261872843e-05, "learning_rate": 0.00011848615612173688, "logits/chosen": -1.727691888809204, "logits/rejected": -1.1385018825531006, "logps/chosen": -404.37158203125, "logps/rejected": -455.1560974121094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.189269065856934, "rewards/margins": 20.383289337158203, "rewards/rejected": -25.572555541992188, "step": 2260 }, { "epoch": 7.064291247095275, "grad_norm": 1.9335082470206544e-05, "learning_rate": 0.0001172244888605319, "logits/chosen": -1.687378168106079, "logits/rejected": -1.1057562828063965, "logps/chosen": -406.32733154296875, "logps/rejected": -474.8482360839844, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.787657737731934, "rewards/margins": 20.789146423339844, "rewards/rejected": -25.576807022094727, "step": 2280 }, { "epoch": 7.126258714175058, "grad_norm": 8.403878018725663e-05, "learning_rate": 0.00011595998950333793, "logits/chosen": -1.6789989471435547, "logits/rejected": -1.1095144748687744, "logps/chosen": -409.31524658203125, "logps/rejected": -472.5364685058594, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.127674579620361, "rewards/margins": 20.548160552978516, "rewards/rejected": -25.675832748413086, "step": 2300 }, { "epoch": 7.188226181254842, "grad_norm": 0.0001840272598201409, "learning_rate": 0.00011469286596248181, "logits/chosen": -1.7186450958251953, "logits/rejected": -1.0815023183822632, "logps/chosen": -402.4718322753906, "logps/rejected": -446.8160095214844, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.929797172546387, "rewards/margins": 20.37470245361328, "rewards/rejected": -25.304500579833984, "step": 2320 }, { "epoch": 7.2501936483346245, "grad_norm": 0.00030283021624200046, "learning_rate": 0.00011342332658176555, "logits/chosen": -1.7267248630523682, "logits/rejected": -1.1029185056686401, "logps/chosen": -407.1277160644531, "logps/rejected": -443.208251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.898409843444824, "rewards/margins": 19.7962589263916, "rewards/rejected": -24.69466781616211, "step": 2340 }, { "epoch": 7.3121611154144075, "grad_norm": 0.000179938884684816, "learning_rate": 0.00011221521661813197, "logits/chosen": -1.7125059366226196, "logits/rejected": -1.107881784439087, "logps/chosen": -411.54571533203125, "logps/rejected": -468.47821044921875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.634856700897217, "rewards/margins": 20.49616050720215, "rewards/rejected": -26.131017684936523, "step": 2360 }, { "epoch": 7.3741285824941905, "grad_norm": 0.00018190982518717647, "learning_rate": 0.0001109415670719721, "logits/chosen": -1.6849457025527954, "logits/rejected": -1.0680724382400513, "logps/chosen": -408.02587890625, "logps/rejected": -460.41015625, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.917786598205566, "rewards/margins": 20.782718658447266, "rewards/rejected": -25.700504302978516, "step": 2380 }, { "epoch": 7.436096049573973, "grad_norm": 0.00010547572310315445, "learning_rate": 0.00010966611848443176, "logits/chosen": -1.6835496425628662, "logits/rejected": -1.0897111892700195, "logps/chosen": -407.20318603515625, "logps/rejected": -464.83935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.396719932556152, "rewards/margins": 20.730510711669922, "rewards/rejected": -26.127233505249023, "step": 2400 }, { "epoch": 7.498063516653756, "grad_norm": 0.0002746889949776232, "learning_rate": 0.00010838908056813919, "logits/chosen": -1.7222875356674194, "logits/rejected": -1.0569690465927124, "logps/chosen": -397.06500244140625, "logps/rejected": -429.73663330078125, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.969448566436768, "rewards/margins": 20.237773895263672, "rewards/rejected": -25.20722007751465, "step": 2420 }, { "epoch": 7.56003098373354, "grad_norm": 0.0010378537699580193, "learning_rate": 0.00010711066329704423, "logits/chosen": -1.7328182458877563, "logits/rejected": -1.0489845275878906, "logps/chosen": -410.6394958496094, "logps/rejected": -457.23126220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.752233505249023, "rewards/margins": 20.957183837890625, "rewards/rejected": -25.70941734313965, "step": 2440 }, { "epoch": 7.621998450813323, "grad_norm": 0.00035315402783453465, "learning_rate": 0.00010583107687189388, "logits/chosen": -1.7303959131240845, "logits/rejected": -1.0627490282058716, "logps/chosen": -394.2586364746094, "logps/rejected": -438.1336975097656, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.017716407775879, "rewards/margins": 20.087886810302734, "rewards/rejected": -25.105602264404297, "step": 2460 }, { "epoch": 7.683965917893106, "grad_norm": 5.2913201216142625e-05, "learning_rate": 0.00010455053168567064, "logits/chosen": -1.701934814453125, "logits/rejected": -1.0837266445159912, "logps/chosen": -411.44390869140625, "logps/rejected": -451.9497985839844, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.648865699768066, "rewards/margins": 20.401885986328125, "rewards/rejected": -26.050750732421875, "step": 2480 }, { "epoch": 7.745933384972889, "grad_norm": 0.0004144099075347185, "learning_rate": 0.00010326923828899894, "logits/chosen": -1.66423761844635, "logits/rejected": -1.0931271314620972, "logps/chosen": -413.04266357421875, "logps/rejected": -468.1424255371094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.445749282836914, "rewards/margins": 20.35373306274414, "rewards/rejected": -25.799480438232422, "step": 2500 }, { "epoch": 7.807900852052672, "grad_norm": 0.0005614625406451523, "learning_rate": 0.00010198740735552596, "logits/chosen": -1.7007503509521484, "logits/rejected": -1.0203969478607178, "logps/chosen": -409.26434326171875, "logps/rejected": -450.35284423828125, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.619626522064209, "rewards/margins": 20.54979133605957, "rewards/rejected": -26.169414520263672, "step": 2520 }, { "epoch": 7.869868319132456, "grad_norm": 0.00046529798419214785, "learning_rate": 0.00010070524964728218, "logits/chosen": -1.6950366497039795, "logits/rejected": -1.0599762201309204, "logps/chosen": -388.9576416015625, "logps/rejected": -438.4559020996094, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.69763708114624, "rewards/margins": 19.549518585205078, "rewards/rejected": -25.247156143188477, "step": 2540 }, { "epoch": 7.931835786212239, "grad_norm": 0.0005010979948565364, "learning_rate": 9.942297598002714e-05, "logits/chosen": -1.6910135746002197, "logits/rejected": -1.088746190071106, "logps/chosen": -409.673583984375, "logps/rejected": -460.9344177246094, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.090248107910156, "rewards/margins": 20.458660125732422, "rewards/rejected": -25.548908233642578, "step": 2560 }, { "epoch": 7.993803253292022, "grad_norm": 2.1018489860580303e-05, "learning_rate": 9.814079718858677e-05, "logits/chosen": -1.6951793432235718, "logits/rejected": -1.1038161516189575, "logps/chosen": -427.29669189453125, "logps/rejected": -482.02362060546875, "loss": 0.0065, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.359341621398926, "rewards/margins": 20.788881301879883, "rewards/rejected": -26.148223876953125, "step": 2580 }, { "epoch": 8.055770720371806, "grad_norm": 0.00020114157814532518, "learning_rate": 9.685892409218717e-05, "logits/chosen": -1.702978491783142, "logits/rejected": -1.0864311456680298, "logps/chosen": -405.50567626953125, "logps/rejected": -455.3516540527344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.847678184509277, "rewards/margins": 20.718107223510742, "rewards/rejected": -25.565786361694336, "step": 2600 }, { "epoch": 8.117738187451588, "grad_norm": 0.00014650092634838074, "learning_rate": 9.557756745979138e-05, "logits/chosen": -1.692112922668457, "logits/rejected": -1.106385588645935, "logps/chosen": -400.7706298828125, "logps/rejected": -458.6825256347656, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.732221603393555, "rewards/margins": 21.058570861816406, "rewards/rejected": -25.79079246520996, "step": 2620 }, { "epoch": 8.179705654531372, "grad_norm": 0.0003632131847552955, "learning_rate": 9.429693797544388e-05, "logits/chosen": -1.727189302444458, "logits/rejected": -1.0760419368743896, "logps/chosen": -401.86767578125, "logps/rejected": -446.3102111816406, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.723801612854004, "rewards/margins": 20.717304229736328, "rewards/rejected": -25.441104888916016, "step": 2640 }, { "epoch": 8.241673121611154, "grad_norm": 0.00047560204984620214, "learning_rate": 9.301724620362973e-05, "logits/chosen": -1.7449928522109985, "logits/rejected": -1.0541192293167114, "logps/chosen": -409.01959228515625, "logps/rejected": -449.57666015625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.505074501037598, "rewards/margins": 20.396114349365234, "rewards/rejected": -25.901187896728516, "step": 2660 }, { "epoch": 8.303640588690937, "grad_norm": 0.0010067891562357545, "learning_rate": 9.173870255465275e-05, "logits/chosen": -1.7413511276245117, "logits/rejected": -1.073628544807434, "logps/chosen": -413.9063415527344, "logps/rejected": -457.25042724609375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.8417158126831055, "rewards/margins": 20.952346801757812, "rewards/rejected": -25.7940616607666, "step": 2680 }, { "epoch": 8.36560805577072, "grad_norm": 0.0007608987507410347, "learning_rate": 9.046151725003931e-05, "logits/chosen": -1.738470435142517, "logits/rejected": -1.118428111076355, "logps/chosen": -406.96368408203125, "logps/rejected": -458.2310485839844, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.3068695068359375, "rewards/margins": 20.518783569335938, "rewards/rejected": -25.825653076171875, "step": 2700 }, { "epoch": 8.427575522850503, "grad_norm": 0.00037170801078900695, "learning_rate": 8.918590028797327e-05, "logits/chosen": -1.6667039394378662, "logits/rejected": -1.076485276222229, "logps/chosen": -417.1942443847656, "logps/rejected": -475.34478759765625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.047384262084961, "rewards/margins": 21.4394588470459, "rewards/rejected": -26.48684310913086, "step": 2720 }, { "epoch": 8.489542989930287, "grad_norm": 0.00017155329987872392, "learning_rate": 8.791206140876746e-05, "logits/chosen": -1.6952327489852905, "logits/rejected": -1.0440196990966797, "logps/chosen": -390.47991943359375, "logps/rejected": -446.51611328125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.408968448638916, "rewards/margins": 20.748926162719727, "rewards/rejected": -25.157894134521484, "step": 2740 }, { "epoch": 8.55151045701007, "grad_norm": 4.225455268169753e-05, "learning_rate": 8.664021006037762e-05, "logits/chosen": -1.7128692865371704, "logits/rejected": -1.0821470022201538, "logps/chosen": -424.44549560546875, "logps/rejected": -469.12652587890625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -5.529724597930908, "rewards/margins": 20.326000213623047, "rewards/rejected": -25.855722427368164, "step": 2760 }, { "epoch": 8.613477924089853, "grad_norm": 0.0004146189312450588, "learning_rate": 8.537055536396439e-05, "logits/chosen": -1.7189327478408813, "logits/rejected": -1.1234623193740845, "logps/chosen": -413.88092041015625, "logps/rejected": -489.74432373046875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.031737327575684, "rewards/margins": 20.76127815246582, "rewards/rejected": -26.793010711669922, "step": 2780 }, { "epoch": 8.675445391169635, "grad_norm": 0.0011191857047379017, "learning_rate": 8.410330607950913e-05, "logits/chosen": -1.6889803409576416, "logits/rejected": -1.0510902404785156, "logps/chosen": -409.9695739746094, "logps/rejected": -461.45257568359375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.462882041931152, "rewards/margins": 20.715688705444336, "rewards/rejected": -26.178569793701172, "step": 2800 }, { "epoch": 8.737412858249419, "grad_norm": 0.0015039819991216063, "learning_rate": 8.283867057148902e-05, "logits/chosen": -1.6871960163116455, "logits/rejected": -1.1272326707839966, "logps/chosen": -424.3963928222656, "logps/rejected": -478.30535888671875, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.292850971221924, "rewards/margins": 20.825016021728516, "rewards/rejected": -26.117868423461914, "step": 2820 }, { "epoch": 8.799380325329203, "grad_norm": 0.00024371009203605354, "learning_rate": 8.157685677461708e-05, "logits/chosen": -1.7314860820770264, "logits/rejected": -1.0632710456848145, "logps/chosen": -411.5020446777344, "logps/rejected": -450.3389587402344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.654230117797852, "rewards/margins": 21.339710235595703, "rewards/rejected": -25.993938446044922, "step": 2840 }, { "epoch": 8.861347792408985, "grad_norm": 0.0004402414197102189, "learning_rate": 8.031807215965337e-05, "logits/chosen": -1.7364399433135986, "logits/rejected": -1.0983723402023315, "logps/chosen": -417.08746337890625, "logps/rejected": -472.83984375, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.4446940422058105, "rewards/margins": 21.18663215637207, "rewards/rejected": -26.63132667541504, "step": 2860 }, { "epoch": 8.923315259488769, "grad_norm": 0.00047181983245536685, "learning_rate": 7.906252369929154e-05, "logits/chosen": -1.6905673742294312, "logits/rejected": -1.084665060043335, "logps/chosen": -393.9977111816406, "logps/rejected": -455.0557556152344, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.183560371398926, "rewards/margins": 20.739307403564453, "rewards/rejected": -25.922870635986328, "step": 2880 }, { "epoch": 8.98528272656855, "grad_norm": 0.0003129359392914921, "learning_rate": 7.781041783412845e-05, "logits/chosen": -1.6950937509536743, "logits/rejected": -1.0535084009170532, "logps/chosen": -418.62701416015625, "logps/rejected": -476.28387451171875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -4.857310771942139, "rewards/margins": 21.914113998413086, "rewards/rejected": -26.771427154541016, "step": 2900 }, { "epoch": 9.047250193648335, "grad_norm": 0.0004019307089038193, "learning_rate": 7.656196043872012e-05, "logits/chosen": -1.7096707820892334, "logits/rejected": -1.1031239032745361, "logps/chosen": -416.05206298828125, "logps/rejected": -494.614990234375, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.534869194030762, "rewards/margins": 21.93942642211914, "rewards/rejected": -27.474294662475586, "step": 2920 }, { "epoch": 9.109217660728119, "grad_norm": 0.0007387935766018927, "learning_rate": 7.531735678773171e-05, "logits/chosen": -1.7090095281600952, "logits/rejected": -1.0878323316574097, "logps/chosen": -400.01513671875, "logps/rejected": -477.05535888671875, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.275289058685303, "rewards/margins": 21.69790267944336, "rewards/rejected": -26.973194122314453, "step": 2940 }, { "epoch": 9.1711851278079, "grad_norm": 0.00027141955797560513, "learning_rate": 7.407681152218535e-05, "logits/chosen": -1.6808192729949951, "logits/rejected": -1.0295798778533936, "logps/chosen": -404.32513427734375, "logps/rejected": -460.8975524902344, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.207651615142822, "rewards/margins": 20.58077049255371, "rewards/rejected": -25.788421630859375, "step": 2960 }, { "epoch": 9.233152594887684, "grad_norm": 0.0005088089383207262, "learning_rate": 7.284052861581288e-05, "logits/chosen": -1.7368125915527344, "logits/rejected": -1.0655357837677002, "logps/chosen": -410.697021484375, "logps/rejected": -453.0840759277344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.395773410797119, "rewards/margins": 20.73539924621582, "rewards/rejected": -26.13117027282715, "step": 2980 }, { "epoch": 9.295120061967467, "grad_norm": 0.0002143807359971106, "learning_rate": 7.160871134151775e-05, "logits/chosen": -1.6661646366119385, "logits/rejected": -1.092222809791565, "logps/chosen": -405.39154052734375, "logps/rejected": -485.67578125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.456831455230713, "rewards/margins": 21.252620697021484, "rewards/rejected": -26.70945167541504, "step": 3000 }, { "epoch": 9.35708752904725, "grad_norm": 8.41324872453697e-05, "learning_rate": 7.038156223795224e-05, "logits/chosen": -1.7362842559814453, "logits/rejected": -1.082162857055664, "logps/chosen": -410.0975646972656, "logps/rejected": -466.8894958496094, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.814949989318848, "rewards/margins": 21.61594009399414, "rewards/rejected": -26.430889129638672, "step": 3020 }, { "epoch": 9.419054996127032, "grad_norm": 2.4985982236103155e-05, "learning_rate": 6.915928307621584e-05, "logits/chosen": -1.7000200748443604, "logits/rejected": -1.0128730535507202, "logps/chosen": -417.96405029296875, "logps/rejected": -461.15362548828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.359194278717041, "rewards/margins": 21.4404296875, "rewards/rejected": -25.79962158203125, "step": 3040 }, { "epoch": 9.481022463206816, "grad_norm": 0.0002187406353186816, "learning_rate": 6.794207482667918e-05, "logits/chosen": -1.6875083446502686, "logits/rejected": -1.0425808429718018, "logps/chosen": -409.68170166015625, "logps/rejected": -456.98114013671875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.01973295211792, "rewards/margins": 20.8963623046875, "rewards/rejected": -25.916095733642578, "step": 3060 }, { "epoch": 9.5429899302866, "grad_norm": 0.0001037058827932924, "learning_rate": 6.673013762594022e-05, "logits/chosen": -1.6812347173690796, "logits/rejected": -1.0920425653457642, "logps/chosen": -409.3445129394531, "logps/rejected": -463.01702880859375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.606844425201416, "rewards/margins": 20.97027015686035, "rewards/rejected": -26.57711410522461, "step": 3080 }, { "epoch": 9.604957397366382, "grad_norm": 6.546611984958872e-05, "learning_rate": 6.552367074391708e-05, "logits/chosen": -1.6708405017852783, "logits/rejected": -1.0272510051727295, "logps/chosen": -421.3130798339844, "logps/rejected": -468.8424377441406, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.817858695983887, "rewards/margins": 21.14541244506836, "rewards/rejected": -26.963272094726562, "step": 3100 }, { "epoch": 9.666924864446166, "grad_norm": 0.0009899769211187959, "learning_rate": 6.432287255108363e-05, "logits/chosen": -1.7139580249786377, "logits/rejected": -1.0682191848754883, "logps/chosen": -415.08154296875, "logps/rejected": -463.1947326660156, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.88477087020874, "rewards/margins": 20.44330596923828, "rewards/rejected": -26.328075408935547, "step": 3120 }, { "epoch": 9.728892331525948, "grad_norm": 0.0010677826358005404, "learning_rate": 6.312794048585286e-05, "logits/chosen": -1.6608006954193115, "logits/rejected": -1.0799270868301392, "logps/chosen": -393.5787353515625, "logps/rejected": -458.1851501464844, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.194777488708496, "rewards/margins": 20.60002899169922, "rewards/rejected": -25.7948055267334, "step": 3140 }, { "epoch": 9.790859798605732, "grad_norm": 0.00037055814755149186, "learning_rate": 6.193907102211358e-05, "logits/chosen": -1.700254201889038, "logits/rejected": -1.149086594581604, "logps/chosen": -414.83575439453125, "logps/rejected": -480.109375, "loss": 0.0054, "rewards/accuracies": 0.984375, "rewards/chosen": -6.013056755065918, "rewards/margins": 20.352540969848633, "rewards/rejected": -26.3655948638916, "step": 3160 }, { "epoch": 9.852827265685516, "grad_norm": 0.00012906199845019728, "learning_rate": 6.075645963692567e-05, "logits/chosen": -1.6764156818389893, "logits/rejected": -1.0942738056182861, "logps/chosen": -410.2710876464844, "logps/rejected": -480.7608337402344, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.723294734954834, "rewards/margins": 21.212993621826172, "rewards/rejected": -26.936288833618164, "step": 3180 }, { "epoch": 9.914794732765298, "grad_norm": 9.71817207755521e-05, "learning_rate": 5.9580300778379087e-05, "logits/chosen": -1.6972318887710571, "logits/rejected": -1.06034255027771, "logps/chosen": -414.45697021484375, "logps/rejected": -478.67608642578125, "loss": 0.0054, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.851905822753906, "rewards/margins": 22.140657424926758, "rewards/rejected": -26.992563247680664, "step": 3200 }, { "epoch": 9.976762199845082, "grad_norm": 0.0005355001194402575, "learning_rate": 5.8410787833622414e-05, "logits/chosen": -1.701051950454712, "logits/rejected": -1.0390212535858154, "logps/chosen": -392.62689208984375, "logps/rejected": -438.70660400390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.652411937713623, "rewards/margins": 21.09701156616211, "rewards/rejected": -25.749425888061523, "step": 3220 }, { "epoch": 10.038729666924864, "grad_norm": 0.0007227555033750832, "learning_rate": 5.724811309706547e-05, "logits/chosen": -1.7204704284667969, "logits/rejected": -1.0700039863586426, "logps/chosen": -430.43206787109375, "logps/rejected": -488.071044921875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -5.367037296295166, "rewards/margins": 21.72504425048828, "rewards/rejected": -27.092077255249023, "step": 3240 }, { "epoch": 10.100697134004648, "grad_norm": 0.00017314284923486412, "learning_rate": 5.6092467738761776e-05, "logits/chosen": -1.6834897994995117, "logits/rejected": -1.0887248516082764, "logps/chosen": -416.51348876953125, "logps/rejected": -469.4505920410156, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.5038862228393555, "rewards/margins": 21.196359634399414, "rewards/rejected": -26.700244903564453, "step": 3260 }, { "epoch": 10.162664601084431, "grad_norm": 0.00027020045672543347, "learning_rate": 5.494404177297595e-05, "logits/chosen": -1.696730613708496, "logits/rejected": -1.0611952543258667, "logps/chosen": -399.0355529785156, "logps/rejected": -449.93646240234375, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.055383682250977, "rewards/margins": 20.96977996826172, "rewards/rejected": -26.025165557861328, "step": 3280 }, { "epoch": 10.224632068164214, "grad_norm": 0.0003596362948883325, "learning_rate": 5.380302402694104e-05, "logits/chosen": -1.7198495864868164, "logits/rejected": -1.0654425621032715, "logps/chosen": -390.9352722167969, "logps/rejected": -453.2206115722656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.068055629730225, "rewards/margins": 20.917200088500977, "rewards/rejected": -25.98525619506836, "step": 3300 }, { "epoch": 10.286599535243997, "grad_norm": 2.4758495783316903e-05, "learning_rate": 5.266960210981089e-05, "logits/chosen": -1.664912462234497, "logits/rejected": -1.0661206245422363, "logps/chosen": -402.9308166503906, "logps/rejected": -467.4169921875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.694643020629883, "rewards/margins": 21.313457489013672, "rewards/rejected": -27.008098602294922, "step": 3320 }, { "epoch": 10.34856700232378, "grad_norm": 0.00036736109177581966, "learning_rate": 5.15439623818132e-05, "logits/chosen": -1.7021472454071045, "logits/rejected": -1.1036940813064575, "logps/chosen": -395.59149169921875, "logps/rejected": -463.43316650390625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.990979194641113, "rewards/margins": 20.853925704956055, "rewards/rejected": -26.84490394592285, "step": 3340 }, { "epoch": 10.410534469403563, "grad_norm": 0.00021753676992375404, "learning_rate": 5.042628992360755e-05, "logits/chosen": -1.6948877573013306, "logits/rejected": -1.0948389768600464, "logps/chosen": -417.33160400390625, "logps/rejected": -491.01483154296875, "loss": 0.0033, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.629961967468262, "rewards/margins": 21.473012924194336, "rewards/rejected": -27.102975845336914, "step": 3360 }, { "epoch": 10.472501936483347, "grad_norm": 0.0005015567876398563, "learning_rate": 4.9316768505853864e-05, "logits/chosen": -1.7080516815185547, "logits/rejected": -1.0318862199783325, "logps/chosen": -397.1073913574219, "logps/rejected": -439.6314392089844, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.5096540451049805, "rewards/margins": 20.36575698852539, "rewards/rejected": -25.875408172607422, "step": 3380 }, { "epoch": 10.53446940356313, "grad_norm": 0.000426275102654472, "learning_rate": 4.8215580558996546e-05, "logits/chosen": -1.6764377355575562, "logits/rejected": -1.0771383047103882, "logps/chosen": -404.91937255859375, "logps/rejected": -485.12548828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.389082908630371, "rewards/margins": 21.155742645263672, "rewards/rejected": -26.54482650756836, "step": 3400 }, { "epoch": 10.596436870642913, "grad_norm": 0.00011274849384790286, "learning_rate": 4.7122907143268645e-05, "logits/chosen": -1.7037220001220703, "logits/rejected": -1.0873366594314575, "logps/chosen": -417.3395080566406, "logps/rejected": -485.4212951660156, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.40346622467041, "rewards/margins": 21.43330955505371, "rewards/rejected": -26.836772918701172, "step": 3420 }, { "epoch": 10.658404337722695, "grad_norm": 0.0008545616874471307, "learning_rate": 4.603892791892157e-05, "logits/chosen": -1.7251865863800049, "logits/rejected": -1.1108168363571167, "logps/chosen": -409.8521423339844, "logps/rejected": -483.19329833984375, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.274342060089111, "rewards/margins": 22.360143661499023, "rewards/rejected": -26.634485244750977, "step": 3440 }, { "epoch": 10.720371804802479, "grad_norm": 0.0002442661498207599, "learning_rate": 4.4963821116684645e-05, "logits/chosen": -1.7168834209442139, "logits/rejected": -1.0469696521759033, "logps/chosen": -410.9766540527344, "logps/rejected": -462.96759033203125, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.01826286315918, "rewards/margins": 21.594696044921875, "rewards/rejected": -26.612957000732422, "step": 3460 }, { "epoch": 10.782339271882261, "grad_norm": 2.5067949536605738e-05, "learning_rate": 4.3897763508460235e-05, "logits/chosen": -1.6555604934692383, "logits/rejected": -1.067326307296753, "logps/chosen": -411.1241149902344, "logps/rejected": -471.122314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.384194374084473, "rewards/margins": 20.667926788330078, "rewards/rejected": -26.052120208740234, "step": 3480 }, { "epoch": 10.844306738962045, "grad_norm": 9.07514404389076e-05, "learning_rate": 4.284093037825829e-05, "logits/chosen": -1.7002710103988647, "logits/rejected": -1.0244972705841064, "logps/chosen": -396.713623046875, "logps/rejected": -450.4693298339844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.3324480056762695, "rewards/margins": 20.980426788330078, "rewards/rejected": -26.312875747680664, "step": 3500 }, { "epoch": 10.906274206041829, "grad_norm": 0.0001592998596606776, "learning_rate": 4.179349549337557e-05, "logits/chosen": -1.704119086265564, "logits/rejected": -1.0116019248962402, "logps/chosen": -402.82666015625, "logps/rejected": -443.30157470703125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.895948886871338, "rewards/margins": 21.18239402770996, "rewards/rejected": -26.07834243774414, "step": 3520 }, { "epoch": 10.96824167312161, "grad_norm": 1.9538027117960155e-05, "learning_rate": 4.075563107582472e-05, "logits/chosen": -1.668092966079712, "logits/rejected": -1.065983533859253, "logps/chosen": -398.3217468261719, "logps/rejected": -477.6726989746094, "loss": 0.0054, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.416517734527588, "rewards/margins": 21.412036895751953, "rewards/rejected": -26.82855224609375, "step": 3540 }, { "epoch": 11.030209140201395, "grad_norm": 5.915413566981442e-05, "learning_rate": 3.9727507774016635e-05, "logits/chosen": -1.6671562194824219, "logits/rejected": -1.0572084188461304, "logps/chosen": -400.4344177246094, "logps/rejected": -474.96038818359375, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.950907230377197, "rewards/margins": 20.902238845825195, "rewards/rejected": -26.8531494140625, "step": 3560 }, { "epoch": 11.092176607281177, "grad_norm": 0.0006108521483838558, "learning_rate": 3.8709294634702376e-05, "logits/chosen": -1.7030471563339233, "logits/rejected": -1.0317370891571045, "logps/chosen": -398.74090576171875, "logps/rejected": -459.75, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.71872615814209, "rewards/margins": 22.286239624023438, "rewards/rejected": -27.00496482849121, "step": 3580 }, { "epoch": 11.15414407436096, "grad_norm": 0.000467544246930629, "learning_rate": 3.770115907517773e-05, "logits/chosen": -1.6686887741088867, "logits/rejected": -1.0782063007354736, "logps/chosen": -406.98138427734375, "logps/rejected": -482.86572265625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.86759090423584, "rewards/margins": 21.316923141479492, "rewards/rejected": -27.184513092041016, "step": 3600 }, { "epoch": 11.216111541440744, "grad_norm": 0.0004900813801214099, "learning_rate": 3.670326685575632e-05, "logits/chosen": -1.7124903202056885, "logits/rejected": -1.0398648977279663, "logps/chosen": -415.08648681640625, "logps/rejected": -477.70709228515625, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.952596187591553, "rewards/margins": 22.07376480102539, "rewards/rejected": -27.026357650756836, "step": 3620 }, { "epoch": 11.278079008520526, "grad_norm": 0.0002428332227282226, "learning_rate": 3.571578205251459e-05, "logits/chosen": -1.7211148738861084, "logits/rejected": -1.1097770929336548, "logps/chosen": -406.6622009277344, "logps/rejected": -460.78643798828125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -5.563107490539551, "rewards/margins": 21.05852699279785, "rewards/rejected": -26.621633529663086, "step": 3640 }, { "epoch": 11.34004647560031, "grad_norm": 0.0004079696664121002, "learning_rate": 3.4738867030314235e-05, "logits/chosen": -1.7017863988876343, "logits/rejected": -1.0735719203948975, "logps/chosen": -414.16339111328125, "logps/rejected": -490.61944580078125, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.069756507873535, "rewards/margins": 22.46738052368164, "rewards/rejected": -27.53713607788086, "step": 3660 }, { "epoch": 11.402013942680092, "grad_norm": 0.0001673255901550874, "learning_rate": 3.377268241610555e-05, "logits/chosen": -1.692521095275879, "logits/rejected": -1.0149263143539429, "logps/chosen": -412.38507080078125, "logps/rejected": -467.0577697753906, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.867552280426025, "rewards/margins": 20.83139991760254, "rewards/rejected": -26.698949813842773, "step": 3680 }, { "epoch": 11.463981409759876, "grad_norm": 0.00012532217078842223, "learning_rate": 3.2817387072516726e-05, "logits/chosen": -1.7133913040161133, "logits/rejected": -1.1119440793991089, "logps/chosen": -401.7035217285156, "logps/rejected": -476.5845642089844, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.1463212966918945, "rewards/margins": 22.046228408813477, "rewards/rejected": -27.192550659179688, "step": 3700 }, { "epoch": 11.52594887683966, "grad_norm": 0.0002491988998372108, "learning_rate": 3.18731380717334e-05, "logits/chosen": -1.6776504516601562, "logits/rejected": -1.0443401336669922, "logps/chosen": -402.75933837890625, "logps/rejected": -455.70068359375, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.209097385406494, "rewards/margins": 21.239925384521484, "rewards/rejected": -26.449024200439453, "step": 3720 }, { "epoch": 11.587916343919442, "grad_norm": 0.0005044552381150424, "learning_rate": 3.0940090669672215e-05, "logits/chosen": -1.6772470474243164, "logits/rejected": -1.0744705200195312, "logps/chosen": -400.09912109375, "logps/rejected": -477.5372619628906, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.949058532714844, "rewards/margins": 21.821866989135742, "rewards/rejected": -26.770925521850586, "step": 3740 }, { "epoch": 11.649883810999226, "grad_norm": 4.5204073103377596e-05, "learning_rate": 3.001839828045342e-05, "logits/chosen": -1.7325446605682373, "logits/rejected": -1.063987135887146, "logps/chosen": -415.75592041015625, "logps/rejected": -452.0940856933594, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.492778778076172, "rewards/margins": 20.81328582763672, "rewards/rejected": -26.30606460571289, "step": 3760 }, { "epoch": 11.711851278079008, "grad_norm": 0.0002700432378333062, "learning_rate": 2.9108212451176033e-05, "logits/chosen": -1.7305303812026978, "logits/rejected": -1.083184003829956, "logps/chosen": -400.70635986328125, "logps/rejected": -472.36114501953125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.0615034103393555, "rewards/margins": 22.031635284423828, "rewards/rejected": -27.093135833740234, "step": 3780 }, { "epoch": 11.773818745158792, "grad_norm": 0.00013194057100918144, "learning_rate": 2.8209682837000072e-05, "logits/chosen": -1.6789268255233765, "logits/rejected": -1.0528620481491089, "logps/chosen": -403.6865539550781, "logps/rejected": -479.7601623535156, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.470952033996582, "rewards/margins": 21.67144775390625, "rewards/rejected": -27.14239501953125, "step": 3800 }, { "epoch": 11.835786212238574, "grad_norm": 0.0002364068350289017, "learning_rate": 2.7322957176539777e-05, "logits/chosen": -1.6753734350204468, "logits/rejected": -1.0195820331573486, "logps/chosen": -417.6498107910156, "logps/rejected": -472.09844970703125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.569521903991699, "rewards/margins": 20.978273391723633, "rewards/rejected": -26.54779624938965, "step": 3820 }, { "epoch": 11.897753679318358, "grad_norm": 0.00013174403284210712, "learning_rate": 2.6448181267572226e-05, "logits/chosen": -1.6455790996551514, "logits/rejected": -1.046744465827942, "logps/chosen": -410.19134521484375, "logps/rejected": -483.3438415527344, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.728828430175781, "rewards/margins": 21.940776824951172, "rewards/rejected": -27.669601440429688, "step": 3840 }, { "epoch": 11.959721146398142, "grad_norm": 0.00042892919736914337, "learning_rate": 2.5585498943064724e-05, "logits/chosen": -1.6926710605621338, "logits/rejected": -1.0491944551467896, "logps/chosen": -415.20550537109375, "logps/rejected": -482.228271484375, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.751172065734863, "rewards/margins": 21.466909408569336, "rewards/rejected": -27.21807861328125, "step": 3860 }, { "epoch": 12.021688613477924, "grad_norm": 8.727656677365303e-05, "learning_rate": 2.4735052047525398e-05, "logits/chosen": -1.7163196802139282, "logits/rejected": -1.059697151184082, "logps/chosen": -422.93359375, "logps/rejected": -472.23583984375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.125914573669434, "rewards/margins": 21.549646377563477, "rewards/rejected": -26.675561904907227, "step": 3880 }, { "epoch": 12.083656080557708, "grad_norm": 5.139048516866751e-05, "learning_rate": 2.389698041368089e-05, "logits/chosen": -1.682549238204956, "logits/rejected": -1.0410518646240234, "logps/chosen": -419.48529052734375, "logps/rejected": -488.83154296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.320895195007324, "rewards/margins": 22.32204246520996, "rewards/rejected": -27.6429386138916, "step": 3900 }, { "epoch": 12.14562354763749, "grad_norm": 0.00013814242265652865, "learning_rate": 2.3071421839484554e-05, "logits/chosen": -1.6900997161865234, "logits/rejected": -1.0404036045074463, "logps/chosen": -399.94854736328125, "logps/rejected": -466.58642578125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.632657051086426, "rewards/margins": 21.346328735351562, "rewards/rejected": -26.978984832763672, "step": 3920 }, { "epoch": 12.207591014717273, "grad_norm": 0.0001951899757841602, "learning_rate": 2.2258512065459448e-05, "logits/chosen": -1.6699708700180054, "logits/rejected": -1.058363437652588, "logps/chosen": -421.36419677734375, "logps/rejected": -490.47100830078125, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.72733211517334, "rewards/margins": 21.7630672454834, "rewards/rejected": -27.490398406982422, "step": 3940 }, { "epoch": 12.269558481797057, "grad_norm": 0.001167879207059741, "learning_rate": 2.1458384752379357e-05, "logits/chosen": -1.6963287591934204, "logits/rejected": -1.078595757484436, "logps/chosen": -400.4660339355469, "logps/rejected": -470.71710205078125, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.399907112121582, "rewards/margins": 21.62917709350586, "rewards/rejected": -27.02908706665039, "step": 3960 }, { "epoch": 12.33152594887684, "grad_norm": 9.643881639931351e-06, "learning_rate": 2.067117145929216e-05, "logits/chosen": -1.688515305519104, "logits/rejected": -1.08303964138031, "logps/chosen": -402.33795166015625, "logps/rejected": -477.7525329589844, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.999421119689941, "rewards/margins": 22.334285736083984, "rewards/rejected": -27.333709716796875, "step": 3980 }, { "epoch": 12.393493415956623, "grad_norm": 0.0006664241082035005, "learning_rate": 1.9897001621888434e-05, "logits/chosen": -1.7171924114227295, "logits/rejected": -1.0485467910766602, "logps/chosen": -409.967529296875, "logps/rejected": -477.21551513671875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.805159568786621, "rewards/margins": 22.3187198638916, "rewards/rejected": -27.123876571655273, "step": 4000 }, { "epoch": 12.455460883036405, "grad_norm": 5.3627591114491224e-06, "learning_rate": 1.913600253121919e-05, "logits/chosen": -1.677496314048767, "logits/rejected": -1.0768311023712158, "logps/chosen": -421.8292541503906, "logps/rejected": -494.90606689453125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.249929904937744, "rewards/margins": 21.906986236572266, "rewards/rejected": -27.15691566467285, "step": 4020 }, { "epoch": 12.51742835011619, "grad_norm": 3.554378781700507e-05, "learning_rate": 1.838829931276653e-05, "logits/chosen": -1.6907306909561157, "logits/rejected": -1.0432696342468262, "logps/chosen": -398.9062805175781, "logps/rejected": -465.7071228027344, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.694939136505127, "rewards/margins": 22.108684539794922, "rewards/rejected": -26.80362319946289, "step": 4040 }, { "epoch": 12.579395817195973, "grad_norm": 6.133209535619244e-05, "learning_rate": 1.7654014905870098e-05, "logits/chosen": -1.6698366403579712, "logits/rejected": -1.0069531202316284, "logps/chosen": -417.49237060546875, "logps/rejected": -470.18902587890625, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.820713520050049, "rewards/margins": 21.33327865600586, "rewards/rejected": -27.15399169921875, "step": 4060 }, { "epoch": 12.641363284275755, "grad_norm": 0.00020697916625067592, "learning_rate": 1.6933270043513083e-05, "logits/chosen": -1.677680253982544, "logits/rejected": -1.0464431047439575, "logps/chosen": -408.2115478515625, "logps/rejected": -478.3711853027344, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.883364200592041, "rewards/margins": 21.521183013916016, "rewards/rejected": -27.404544830322266, "step": 4080 }, { "epoch": 12.703330751355539, "grad_norm": 0.00018397132225800306, "learning_rate": 1.622618323247087e-05, "logits/chosen": -1.6993494033813477, "logits/rejected": -1.0857021808624268, "logps/chosen": -405.2132873535156, "logps/rejected": -485.60321044921875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.594387531280518, "rewards/margins": 21.590347290039062, "rewards/rejected": -27.184734344482422, "step": 4100 }, { "epoch": 12.765298218435321, "grad_norm": 0.00029773233109153807, "learning_rate": 1.553287073382609e-05, "logits/chosen": -1.7119516134262085, "logits/rejected": -1.0656880140304565, "logps/chosen": -405.5570373535156, "logps/rejected": -462.2611389160156, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.116886615753174, "rewards/margins": 21.456085205078125, "rewards/rejected": -26.57297134399414, "step": 4120 }, { "epoch": 12.827265685515105, "grad_norm": 0.0001080308502423577, "learning_rate": 1.485344654385239e-05, "logits/chosen": -1.6709296703338623, "logits/rejected": -1.053741693496704, "logps/chosen": -428.66839599609375, "logps/rejected": -500.01092529296875, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -6.277214050292969, "rewards/margins": 22.146846771240234, "rewards/rejected": -28.424060821533203, "step": 4140 }, { "epoch": 12.889233152594887, "grad_norm": 6.432453665183857e-05, "learning_rate": 1.418802237527106e-05, "logits/chosen": -1.68827223777771, "logits/rejected": -1.0494086742401123, "logps/chosen": -424.75286865234375, "logps/rejected": -481.1043395996094, "loss": 0.0033, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.699560165405273, "rewards/margins": 21.662763595581055, "rewards/rejected": -27.362323760986328, "step": 4160 }, { "epoch": 12.95120061967467, "grad_norm": 0.0004029480624012649, "learning_rate": 1.3536707638882872e-05, "logits/chosen": -1.6849908828735352, "logits/rejected": -1.0281345844268799, "logps/chosen": -419.80010986328125, "logps/rejected": -460.801025390625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.415833950042725, "rewards/margins": 20.73134422302246, "rewards/rejected": -26.147180557250977, "step": 4180 }, { "epoch": 13.013168086754455, "grad_norm": 0.0002039131213678047, "learning_rate": 1.289960942557844e-05, "logits/chosen": -1.686678171157837, "logits/rejected": -1.041481852531433, "logps/chosen": -418.22686767578125, "logps/rejected": -488.3094787597656, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.922072410583496, "rewards/margins": 21.746536254882812, "rewards/rejected": -27.66861343383789, "step": 4200 }, { "epoch": 13.075135553834237, "grad_norm": 0.00016347317432519048, "learning_rate": 1.2276832488730094e-05, "logits/chosen": -1.7182451486587524, "logits/rejected": -1.0532605648040771, "logps/chosen": -441.8271484375, "logps/rejected": -510.87628173828125, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.668587684631348, "rewards/margins": 22.97989273071289, "rewards/rejected": -28.648479461669922, "step": 4220 }, { "epoch": 13.13710302091402, "grad_norm": 0.00020034710178151727, "learning_rate": 1.1668479226967965e-05, "logits/chosen": -1.6925156116485596, "logits/rejected": -1.0687302350997925, "logps/chosen": -399.3315124511719, "logps/rejected": -474.7539978027344, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.459714889526367, "rewards/margins": 21.628223419189453, "rewards/rejected": -27.087936401367188, "step": 4240 }, { "epoch": 13.199070487993803, "grad_norm": 0.00026680485461838543, "learning_rate": 1.1074649667343506e-05, "logits/chosen": -1.6791460514068604, "logits/rejected": -1.0547727346420288, "logps/chosen": -412.1854553222656, "logps/rejected": -474.461181640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.000552654266357, "rewards/margins": 21.563953399658203, "rewards/rejected": -26.564502716064453, "step": 4260 }, { "epoch": 13.261037955073586, "grad_norm": 9.416981629328802e-05, "learning_rate": 1.0495441448882571e-05, "logits/chosen": -1.6752477884292603, "logits/rejected": -1.0648829936981201, "logps/chosen": -413.24609375, "logps/rejected": -496.79327392578125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -5.392674922943115, "rewards/margins": 22.125301361083984, "rewards/rejected": -27.51797866821289, "step": 4280 }, { "epoch": 13.32300542215337, "grad_norm": 0.00027022938593290746, "learning_rate": 9.930949806531509e-06, "logits/chosen": -1.6898155212402344, "logits/rejected": -1.0595139265060425, "logps/chosen": -410.2594299316406, "logps/rejected": -469.86883544921875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.0617547035217285, "rewards/margins": 21.85466194152832, "rewards/rejected": -26.916418075561523, "step": 4300 }, { "epoch": 13.384972889233152, "grad_norm": 5.3291834774427116e-05, "learning_rate": 9.38126755549832e-06, "logits/chosen": -1.6853482723236084, "logits/rejected": -1.0476603507995605, "logps/chosen": -411.350830078125, "logps/rejected": -470.8251037597656, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.509891510009766, "rewards/margins": 21.421506881713867, "rewards/rejected": -26.931400299072266, "step": 4320 }, { "epoch": 13.446940356312936, "grad_norm": 8.903396519599482e-05, "learning_rate": 8.846485075991728e-06, "logits/chosen": -1.6736446619033813, "logits/rejected": -1.0330798625946045, "logps/chosen": -417.89044189453125, "logps/rejected": -477.1280822753906, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.157768726348877, "rewards/margins": 21.815839767456055, "rewards/rejected": -26.973608016967773, "step": 4340 }, { "epoch": 13.508907823392718, "grad_norm": 0.0006522313342429698, "learning_rate": 8.326690298360639e-06, "logits/chosen": -1.679149866104126, "logits/rejected": -1.0622096061706543, "logps/chosen": -403.9975891113281, "logps/rejected": -478.72174072265625, "loss": 0.0043, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.251322269439697, "rewards/margins": 21.613903045654297, "rewards/rejected": -26.8652286529541, "step": 4360 }, { "epoch": 13.570875290472502, "grad_norm": 0.0001527480490040034, "learning_rate": 7.821968688636383e-06, "logits/chosen": -1.7000373601913452, "logits/rejected": -1.0500789880752563, "logps/chosen": -400.9742431640625, "logps/rejected": -477.05450439453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.700057506561279, "rewards/margins": 21.45535659790039, "rewards/rejected": -27.155414581298828, "step": 4380 }, { "epoch": 13.632842757552286, "grad_norm": 0.0005368488491512835, "learning_rate": 7.332403234480223e-06, "logits/chosen": -1.683445692062378, "logits/rejected": -1.0166078805923462, "logps/chosen": -401.72607421875, "logps/rejected": -456.4202575683594, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.469435691833496, "rewards/margins": 21.11139488220215, "rewards/rejected": -26.580829620361328, "step": 4400 }, { "epoch": 13.694810224632068, "grad_norm": 0.0005580181023105979, "learning_rate": 6.858074431538164e-06, "logits/chosen": -1.6824891567230225, "logits/rejected": -1.0271477699279785, "logps/chosen": -399.6391296386719, "logps/rejected": -451.330078125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -5.243688583374023, "rewards/margins": 21.208574295043945, "rewards/rejected": -26.452260971069336, "step": 4420 }, { "epoch": 13.756777691711852, "grad_norm": NaN, "learning_rate": 6.421646080196197e-06, "logits/chosen": -1.6686054468154907, "logits/rejected": -1.0693179368972778, "logps/chosen": -401.59844970703125, "logps/rejected": -474.7311096191406, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.454672336578369, "rewards/margins": 21.39242172241211, "rewards/rejected": -26.847095489501953, "step": 4440 }, { "epoch": 13.818745158791634, "grad_norm": 1.7149226550827734e-05, "learning_rate": 5.9772507736462145e-06, "logits/chosen": -1.710008978843689, "logits/rejected": -1.0888980627059937, "logps/chosen": -407.61260986328125, "logps/rejected": -481.07550048828125, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.6366987228393555, "rewards/margins": 21.678539276123047, "rewards/rejected": -27.315237045288086, "step": 4460 }, { "epoch": 13.880712625871418, "grad_norm": 2.4136075808200985e-05, "learning_rate": 5.54831493606015e-06, "logits/chosen": -1.6713101863861084, "logits/rejected": -1.0732184648513794, "logps/chosen": -424.976806640625, "logps/rejected": -506.0423889160156, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.765892028808594, "rewards/margins": 22.11074447631836, "rewards/rejected": -27.876636505126953, "step": 4480 }, { "epoch": 13.9426800929512, "grad_norm": 7.025560626061633e-05, "learning_rate": 5.134909094202267e-06, "logits/chosen": -1.699441909790039, "logits/rejected": -1.0467607975006104, "logps/chosen": -401.03375244140625, "logps/rejected": -447.85308837890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.419959545135498, "rewards/margins": 20.893884658813477, "rewards/rejected": -26.313846588134766, "step": 4500 }, { "epoch": 14.004647560030984, "grad_norm": 0.0002559265703894198, "learning_rate": 4.7371012213538235e-06, "logits/chosen": -1.6893657445907593, "logits/rejected": -1.0456167459487915, "logps/chosen": -425.73895263671875, "logps/rejected": -486.43890380859375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.418589115142822, "rewards/margins": 22.638408660888672, "rewards/rejected": -28.0570011138916, "step": 4520 }, { "epoch": 14.066615027110767, "grad_norm": 0.00043519827886484563, "learning_rate": 4.35495672613685e-06, "logits/chosen": -1.6840267181396484, "logits/rejected": -1.0660759210586548, "logps/chosen": -420.65692138671875, "logps/rejected": -481.805419921875, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.826098442077637, "rewards/margins": 21.706336975097656, "rewards/rejected": -27.53243637084961, "step": 4540 }, { "epoch": 14.12858249419055, "grad_norm": 0.0004038415208924562, "learning_rate": 3.988538441759382e-06, "logits/chosen": -1.673048973083496, "logits/rejected": -1.0200636386871338, "logps/chosen": -403.9557189941406, "logps/rejected": -461.65179443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.021474361419678, "rewards/margins": 21.59840965270996, "rewards/rejected": -26.619884490966797, "step": 4560 }, { "epoch": 14.190549961270333, "grad_norm": 0.00038054597098380327, "learning_rate": 3.637906615684328e-06, "logits/chosen": -1.6679537296295166, "logits/rejected": -1.0269415378570557, "logps/chosen": -410.174072265625, "logps/rejected": -484.68865966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.3633928298950195, "rewards/margins": 22.351978302001953, "rewards/rejected": -27.715368270874023, "step": 4580 }, { "epoch": 14.252517428350115, "grad_norm": 5.562596925301477e-05, "learning_rate": 3.3031188997233676e-06, "logits/chosen": -1.6873247623443604, "logits/rejected": -1.0105091333389282, "logps/chosen": -405.04132080078125, "logps/rejected": -454.36920166015625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.410122871398926, "rewards/margins": 21.17348289489746, "rewards/rejected": -26.583606719970703, "step": 4600 }, { "epoch": 14.3144848954299, "grad_norm": 4.7735171392560005e-05, "learning_rate": 2.9842303405577366e-06, "logits/chosen": -1.6932716369628906, "logits/rejected": -1.026926040649414, "logps/chosen": -416.610595703125, "logps/rejected": -469.50335693359375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.13016414642334, "rewards/margins": 20.862241744995117, "rewards/rejected": -26.99240493774414, "step": 4620 }, { "epoch": 14.376452362509683, "grad_norm": 0.00047004391672089696, "learning_rate": 2.6812933706872545e-06, "logits/chosen": -1.6934292316436768, "logits/rejected": -1.063394546508789, "logps/chosen": -415.4750061035156, "logps/rejected": -489.5491638183594, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.2422404289245605, "rewards/margins": 22.516773223876953, "rewards/rejected": -27.759014129638672, "step": 4640 }, { "epoch": 14.438419829589465, "grad_norm": 0.0008643981418572366, "learning_rate": 2.394357799809277e-06, "logits/chosen": -1.735192894935608, "logits/rejected": -1.069784164428711, "logps/chosen": -409.0735168457031, "logps/rejected": -455.7366638183594, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.419035911560059, "rewards/margins": 21.468860626220703, "rewards/rejected": -26.887897491455078, "step": 4660 }, { "epoch": 14.500387296669249, "grad_norm": 0.0002557814004831016, "learning_rate": 2.123470806628858e-06, "logits/chosen": -1.6932361125946045, "logits/rejected": -1.03562331199646, "logps/chosen": -404.10223388671875, "logps/rejected": -452.8517150878906, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.449051856994629, "rewards/margins": 21.111392974853516, "rewards/rejected": -26.560443878173828, "step": 4680 }, { "epoch": 14.562354763749031, "grad_norm": 0.00017765916709322482, "learning_rate": 1.868676931101465e-06, "logits/chosen": -1.6715888977050781, "logits/rejected": -1.057328462600708, "logps/chosen": -411.4977111816406, "logps/rejected": -486.6917419433594, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.450153827667236, "rewards/margins": 22.20999526977539, "rewards/rejected": -27.6601505279541, "step": 4700 }, { "epoch": 14.624322230828815, "grad_norm": 0.0006002355949021876, "learning_rate": 1.6300180671096288e-06, "logits/chosen": -1.6742595434188843, "logits/rejected": -1.0468966960906982, "logps/chosen": -414.0707092285156, "logps/rejected": -482.42657470703125, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.493812561035156, "rewards/margins": 21.657306671142578, "rewards/rejected": -27.151119232177734, "step": 4720 }, { "epoch": 14.686289697908599, "grad_norm": 0.00020658239373005927, "learning_rate": 1.4075334555746055e-06, "logits/chosen": -1.662987470626831, "logits/rejected": -1.016445279121399, "logps/chosen": -407.02423095703125, "logps/rejected": -467.1194763183594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.999020576477051, "rewards/margins": 20.836938858032227, "rewards/rejected": -26.83595848083496, "step": 4740 }, { "epoch": 14.748257164988381, "grad_norm": 6.777382805012167e-05, "learning_rate": 1.2012596780043627e-06, "logits/chosen": -1.6404949426651, "logits/rejected": -1.0619919300079346, "logps/chosen": -394.98443603515625, "logps/rejected": -479.7742614746094, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.746143341064453, "rewards/margins": 21.60362434387207, "rewards/rejected": -27.349767684936523, "step": 4760 }, { "epoch": 14.810224632068165, "grad_norm": 0.00017278394079767168, "learning_rate": 1.011230650478634e-06, "logits/chosen": -1.6573286056518555, "logits/rejected": -1.0122966766357422, "logps/chosen": -396.2731018066406, "logps/rejected": -456.626220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.127909183502197, "rewards/margins": 21.664600372314453, "rewards/rejected": -26.79250717163086, "step": 4780 }, { "epoch": 14.872192099147947, "grad_norm": 0.00017635834228713065, "learning_rate": 8.374776180724575e-07, "logits/chosen": -1.7095073461532593, "logits/rejected": -1.0201966762542725, "logps/chosen": -402.76763916015625, "logps/rejected": -461.19903564453125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -5.404868125915527, "rewards/margins": 21.330501556396484, "rewards/rejected": -26.735370635986328, "step": 4800 }, { "epoch": 14.93415956622773, "grad_norm": 0.0006217029877007008, "learning_rate": 6.800291497187083e-07, "logits/chosen": -1.7389657497406006, "logits/rejected": -1.0253870487213135, "logps/chosen": -406.7480163574219, "logps/rejected": -461.8179626464844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.920414447784424, "rewards/margins": 21.916866302490234, "rewards/rejected": -26.8372802734375, "step": 4820 }, { "epoch": 14.996127033307513, "grad_norm": 0.0001935044419951737, "learning_rate": 5.389111335107556e-07, "logits/chosen": -1.696392297744751, "logits/rejected": -1.0922819375991821, "logps/chosen": -414.5367736816406, "logps/rejected": -476.94012451171875, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.998685359954834, "rewards/margins": 21.558393478393555, "rewards/rejected": -27.557079315185547, "step": 4840 }, { "epoch": 15.058094500387297, "grad_norm": 4.989042645320296e-05, "learning_rate": 4.1414677244584477e-07, "logits/chosen": -1.690422773361206, "logits/rejected": -1.0694575309753418, "logps/chosen": -417.68487548828125, "logps/rejected": -490.20989990234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.380603313446045, "rewards/margins": 21.939071655273438, "rewards/rejected": -27.31967544555664, "step": 4860 }, { "epoch": 15.12006196746708, "grad_norm": 0.0008857127977535129, "learning_rate": 3.0575658061001713e-07, "logits/chosen": -1.692728042602539, "logits/rejected": -1.0653448104858398, "logps/chosen": -414.1552734375, "logps/rejected": -490.3134765625, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.1294355392456055, "rewards/margins": 21.649871826171875, "rewards/rejected": -27.779308319091797, "step": 4880 }, { "epoch": 15.182029434546862, "grad_norm": 7.71297054598108e-05, "learning_rate": 2.1375837980512904e-07, "logits/chosen": -1.687190294265747, "logits/rejected": -1.0721074342727661, "logps/chosen": -410.22161865234375, "logps/rejected": -491.24835205078125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.182304859161377, "rewards/margins": 22.23093032836914, "rewards/rejected": -27.41323471069336, "step": 4900 }, { "epoch": 15.243996901626646, "grad_norm": 0.00017248830408789217, "learning_rate": 1.38167296618541e-07, "logits/chosen": -1.682885766029358, "logits/rejected": -1.0524094104766846, "logps/chosen": -410.17681884765625, "logps/rejected": -472.13885498046875, "loss": 0.0011, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.831109046936035, "rewards/margins": 21.399702072143555, "rewards/rejected": -27.230810165405273, "step": 4920 }, { "epoch": 15.305964368706428, "grad_norm": 0.0008164289756678045, "learning_rate": 7.899575993597363e-08, "logits/chosen": -1.6627308130264282, "logits/rejected": -0.9520984888076782, "logps/chosen": -395.6473693847656, "logps/rejected": -433.9269104003906, "loss": 0.0043, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.2272748947143555, "rewards/margins": 20.858642578125, "rewards/rejected": -26.08591651916504, "step": 4940 }, { "epoch": 15.367931835786212, "grad_norm": 0.00019182954565621912, "learning_rate": 3.6253498897886873e-08, "logits/chosen": -1.6554197072982788, "logits/rejected": -1.0059171915054321, "logps/chosen": -394.91973876953125, "logps/rejected": -451.76312255859375, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.4583234786987305, "rewards/margins": 21.091644287109375, "rewards/rejected": -26.549968719482422, "step": 4960 }, { "epoch": 15.429899302865996, "grad_norm": 0.00014239229494705796, "learning_rate": 9.947541299837327e-09, "logits/chosen": -1.7060569524765015, "logits/rejected": -1.0418967008590698, "logps/chosen": -427.88525390625, "logps/rejected": -482.782958984375, "loss": 0.0043, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.570733070373535, "rewards/margins": 21.934314727783203, "rewards/rejected": -27.505046844482422, "step": 4980 }, { "epoch": 15.491866769945778, "grad_norm": 0.0005336150643415749, "learning_rate": 8.221243689154889e-11, "logits/chosen": -1.6255543231964111, "logits/rejected": -1.027090311050415, "logps/chosen": -393.7467956542969, "logps/rejected": -484.93804931640625, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.538996696472168, "rewards/margins": 21.719022750854492, "rewards/rejected": -27.25801658630371, "step": 5000 } ], "logging_steps": 20, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 16, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }