diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 15.491866769945778, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.061967467079783116, + "grad_norm": 0.6070870757102966, + "learning_rate": 4e-05, + "logits/chosen": -2.0001754760742188, + "logits/rejected": -1.449440598487854, + "logps/chosen": -374.65521240234375, + "logps/rejected": -215.3085479736328, + "loss": 1.007, + "rewards/accuracies": 0.44062501192092896, + "rewards/chosen": -0.3046182096004486, + "rewards/margins": -0.20184263586997986, + "rewards/rejected": -0.10277555137872696, + "step": 20 + }, + { + "epoch": 0.12393493415956623, + "grad_norm": 0.5136411190032959, + "learning_rate": 8e-05, + "logits/chosen": -2.083824872970581, + "logits/rejected": -1.584017038345337, + "logps/chosen": -341.329833984375, + "logps/rejected": -208.3067169189453, + "loss": 0.1907, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.0636544227600098, + "rewards/margins": 2.9626474380493164, + "rewards/rejected": -0.8989933133125305, + "step": 40 + }, + { + "epoch": 0.18590240123934934, + "grad_norm": 0.18788862228393555, + "learning_rate": 0.00012, + "logits/chosen": -2.0708529949188232, + "logits/rejected": -1.5524569749832153, + "logps/chosen": -329.73193359375, + "logps/rejected": -221.080078125, + "loss": 0.0732, + "rewards/accuracies": 0.984375, + "rewards/chosen": 2.1646170616149902, + "rewards/margins": 4.800443649291992, + "rewards/rejected": -2.635826826095581, + "step": 60 + }, + { + "epoch": 0.24786986831913246, + "grad_norm": 0.2149907350540161, + "learning_rate": 0.00016, + "logits/chosen": -1.964525580406189, + "logits/rejected": -1.425443172454834, + "logps/chosen": -337.01165771484375, + "logps/rejected": -236.92935180664062, + "loss": 0.0384, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 1.6233104467391968, + "rewards/margins": 6.25473690032959, + "rewards/rejected": -4.631426811218262, + "step": 80 + }, + { + "epoch": 0.30983733539891556, + "grad_norm": 0.13132674992084503, + "learning_rate": 0.0002, + "logits/chosen": -1.8194172382354736, + "logits/rejected": -1.3340699672698975, + "logps/chosen": -329.0172424316406, + "logps/rejected": -260.6822814941406, + "loss": 0.024, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.2860015332698822, + "rewards/margins": 7.288111686706543, + "rewards/rejected": -7.002110958099365, + "step": 100 + }, + { + "epoch": 0.3718048024786987, + "grad_norm": 0.06768889725208282, + "learning_rate": 0.00019999177886783194, + "logits/chosen": -1.818981409072876, + "logits/rejected": -1.3484697341918945, + "logps/chosen": -359.87005615234375, + "logps/rejected": -294.05047607421875, + "loss": 0.021, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.2960149049758911, + "rewards/margins": 8.185277938842773, + "rewards/rejected": -7.889264106750488, + "step": 120 + }, + { + "epoch": 0.4337722695584818, + "grad_norm": 0.00373012013733387, + "learning_rate": 0.000199967116823068, + "logits/chosen": -1.747314453125, + "logits/rejected": -1.209826946258545, + "logps/chosen": -356.72686767578125, + "logps/rejected": -287.92205810546875, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20157980918884277, + "rewards/margins": 8.92736530303955, + "rewards/rejected": -8.725785255432129, + "step": 140 + }, + { + "epoch": 0.4957397366382649, + "grad_norm": 0.08832018822431564, + "learning_rate": 0.00019992601792070679, + "logits/chosen": -1.760593056678772, + "logits/rejected": -1.227081060409546, + "logps/chosen": -359.7059326171875, + "logps/rejected": -307.3652648925781, + "loss": 0.0121, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.5851167440414429, + "rewards/margins": 9.88296890258789, + "rewards/rejected": -10.468085289001465, + "step": 160 + }, + { + "epoch": 0.557707203718048, + "grad_norm": 0.12635135650634766, + "learning_rate": 0.00019986848891833845, + "logits/chosen": -1.6951453685760498, + "logits/rejected": -1.1247837543487549, + "logps/chosen": -369.36383056640625, + "logps/rejected": -313.21380615234375, + "loss": 0.0159, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.165026903152466, + "rewards/margins": 9.382209777832031, + "rewards/rejected": -11.547235488891602, + "step": 180 + }, + { + "epoch": 0.6196746707978311, + "grad_norm": 0.5119428038597107, + "learning_rate": 0.00019979453927503364, + "logits/chosen": -1.5557712316513062, + "logits/rejected": -0.9883753657341003, + "logps/chosen": -378.3529357910156, + "logps/rejected": -338.2301330566406, + "loss": 0.0109, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -1.9073299169540405, + "rewards/margins": 10.27137565612793, + "rewards/rejected": -12.178706169128418, + "step": 200 + }, + { + "epoch": 0.6816421378776143, + "grad_norm": 0.012499742209911346, + "learning_rate": 0.0001997041811497882, + "logits/chosen": -1.639301061630249, + "logits/rejected": -1.059734582901001, + "logps/chosen": -403.56439208984375, + "logps/rejected": -362.4933776855469, + "loss": 0.0113, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.50722599029541, + "rewards/margins": 11.781638145446777, + "rewards/rejected": -16.288862228393555, + "step": 220 + }, + { + "epoch": 0.7436096049573974, + "grad_norm": 0.015822602435946465, + "learning_rate": 0.00019959742939952392, + "logits/chosen": -1.801640510559082, + "logits/rejected": -1.2558636665344238, + "logps/chosen": -358.8158264160156, + "logps/rejected": -329.281494140625, + "loss": 0.0085, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -1.591296911239624, + "rewards/margins": 11.404090881347656, + "rewards/rejected": -12.995388984680176, + "step": 240 + }, + { + "epoch": 0.8055770720371804, + "grad_norm": 0.06576687842607498, + "learning_rate": 0.00019947430157664576, + "logits/chosen": -1.816361427307129, + "logits/rejected": -1.3142831325531006, + "logps/chosen": -375.107421875, + "logps/rejected": -361.25567626953125, + "loss": 0.0121, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.420842409133911, + "rewards/margins": 11.270395278930664, + "rewards/rejected": -13.691238403320312, + "step": 260 + }, + { + "epoch": 0.8675445391169636, + "grad_norm": 0.01211523823440075, + "learning_rate": 0.00019933481792615583, + "logits/chosen": -1.7951005697250366, + "logits/rejected": -1.256089448928833, + "logps/chosen": -363.334228515625, + "logps/rejected": -335.49615478515625, + "loss": 0.0069, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.6655162572860718, + "rewards/margins": 11.434516906738281, + "rewards/rejected": -13.1000337600708, + "step": 280 + }, + { + "epoch": 0.9295120061967467, + "grad_norm": 0.005867226514965296, + "learning_rate": 0.0001991790013823246, + "logits/chosen": -1.8247705698013306, + "logits/rejected": -1.2836697101593018, + "logps/chosen": -373.73175048828125, + "logps/rejected": -328.99371337890625, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.960078239440918, + "rewards/margins": 11.281866073608398, + "rewards/rejected": -13.241943359375, + "step": 300 + }, + { + "epoch": 0.9914794732765299, + "grad_norm": 0.11168529838323593, + "learning_rate": 0.0001990068775649202, + "logits/chosen": -1.8314838409423828, + "logits/rejected": -1.3281538486480713, + "logps/chosen": -362.94549560546875, + "logps/rejected": -310.90692138671875, + "loss": 0.0109, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.7653158903121948, + "rewards/margins": 10.92064094543457, + "rewards/rejected": -11.685956001281738, + "step": 320 + }, + { + "epoch": 1.053446940356313, + "grad_norm": 0.053166139870882034, + "learning_rate": 0.00019881847477499557, + "logits/chosen": -1.8288739919662476, + "logits/rejected": -1.2687069177627563, + "logps/chosen": -379.93914794921875, + "logps/rejected": -346.6662902832031, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3435510993003845, + "rewards/margins": 12.371174812316895, + "rewards/rejected": -12.714726448059082, + "step": 340 + }, + { + "epoch": 1.115414407436096, + "grad_norm": 0.007846315391361713, + "learning_rate": 0.0001986138239902355, + "logits/chosen": -1.8146957159042358, + "logits/rejected": -1.1931467056274414, + "logps/chosen": -361.128173828125, + "logps/rejected": -333.5379333496094, + "loss": 0.0035, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.7167718410491943, + "rewards/margins": 13.46613597869873, + "rewards/rejected": -14.182907104492188, + "step": 360 + }, + { + "epoch": 1.1773818745158793, + "grad_norm": 0.0029342020861804485, + "learning_rate": 0.00019839295885986296, + "logits/chosen": -1.8402125835418701, + "logits/rejected": -1.3026095628738403, + "logps/chosen": -367.6770935058594, + "logps/rejected": -334.61505126953125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48788753151893616, + "rewards/margins": 12.355894088745117, + "rewards/rejected": -12.843780517578125, + "step": 380 + }, + { + "epoch": 1.2393493415956622, + "grad_norm": 0.0005422068061307073, + "learning_rate": 0.00019815591569910654, + "logits/chosen": -1.781711220741272, + "logits/rejected": -1.2187694311141968, + "logps/chosen": -368.02130126953125, + "logps/rejected": -336.0605163574219, + "loss": 0.004, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -0.474712073802948, + "rewards/margins": 13.070175170898438, + "rewards/rejected": -13.544886589050293, + "step": 400 + }, + { + "epoch": 1.3013168086754454, + "grad_norm": 0.004247570876032114, + "learning_rate": 0.0001979027334832293, + "logits/chosen": -1.729142189025879, + "logits/rejected": -1.1420295238494873, + "logps/chosen": -363.62261962890625, + "logps/rejected": -350.509765625, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9223267436027527, + "rewards/margins": 14.022272109985352, + "rewards/rejected": -14.944600105285645, + "step": 420 + }, + { + "epoch": 1.3632842757552286, + "grad_norm": 0.025411546230316162, + "learning_rate": 0.00019763345384112043, + "logits/chosen": -1.6916519403457642, + "logits/rejected": -1.1293952465057373, + "logps/chosen": -368.69122314453125, + "logps/rejected": -357.363037109375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3415491580963135, + "rewards/margins": 13.441192626953125, + "rewards/rejected": -14.782742500305176, + "step": 440 + }, + { + "epoch": 1.4252517428350115, + "grad_norm": 0.023552559316158295, + "learning_rate": 0.00019734812104845047, + "logits/chosen": -1.6404588222503662, + "logits/rejected": -1.0976492166519165, + "logps/chosen": -358.5830993652344, + "logps/rejected": -323.82977294921875, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1879071146249771, + "rewards/margins": 11.893779754638672, + "rewards/rejected": -12.081686019897461, + "step": 460 + }, + { + "epoch": 1.4872192099147947, + "grad_norm": 0.04839726537466049, + "learning_rate": 0.0001970467820203915, + "logits/chosen": -1.4514319896697998, + "logits/rejected": -0.7945712208747864, + "logps/chosen": -395.62109375, + "logps/rejected": -361.99224853515625, + "loss": 0.0052, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -2.660977602005005, + "rewards/margins": 13.56675910949707, + "rewards/rejected": -16.227737426757812, + "step": 480 + }, + { + "epoch": 1.549186676994578, + "grad_norm": 0.04717102646827698, + "learning_rate": 0.00019672948630390294, + "logits/chosen": -1.6030662059783936, + "logits/rejected": -1.008603811264038, + "logps/chosen": -382.2178955078125, + "logps/rejected": -384.981201171875, + "loss": 0.0185, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.418046474456787, + "rewards/margins": 14.233471870422363, + "rewards/rejected": -17.65151596069336, + "step": 500 + }, + { + "epoch": 1.6111541440743609, + "grad_norm": 0.022282173857092857, + "learning_rate": 0.00019639628606958533, + "logits/chosen": -1.943267822265625, + "logits/rejected": -1.5064051151275635, + "logps/chosen": -350.5743408203125, + "logps/rejected": -292.48321533203125, + "loss": 0.0043, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -0.508022129535675, + "rewards/margins": 10.412274360656738, + "rewards/rejected": -10.920295715332031, + "step": 520 + }, + { + "epoch": 1.673121611154144, + "grad_norm": 0.009392939507961273, + "learning_rate": 0.00019604723610310194, + "logits/chosen": -1.932124376296997, + "logits/rejected": -1.507216215133667, + "logps/chosen": -366.7988586425781, + "logps/rejected": -342.846923828125, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8672822713851929, + "rewards/margins": 11.667869567871094, + "rewards/rejected": -12.535151481628418, + "step": 540 + }, + { + "epoch": 1.7350890782339272, + "grad_norm": 0.008884243667125702, + "learning_rate": 0.00019568239379617088, + "logits/chosen": -1.8822323083877563, + "logits/rejected": -1.4790470600128174, + "logps/chosen": -364.321044921875, + "logps/rejected": -341.40081787109375, + "loss": 0.0035, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -1.8236000537872314, + "rewards/margins": 12.299530982971191, + "rewards/rejected": -14.123130798339844, + "step": 560 + }, + { + "epoch": 1.7970565453137102, + "grad_norm": 0.0044061969965696335, + "learning_rate": 0.00019530181913712872, + "logits/chosen": -1.926490068435669, + "logits/rejected": -1.4624470472335815, + "logps/chosen": -372.48468017578125, + "logps/rejected": -331.5034484863281, + "loss": 0.0055, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -1.4063794612884521, + "rewards/margins": 12.16389274597168, + "rewards/rejected": -13.570272445678711, + "step": 580 + }, + { + "epoch": 1.8590240123934936, + "grad_norm": 0.028566114604473114, + "learning_rate": 0.00019490557470106686, + "logits/chosen": -1.92436945438385, + "logits/rejected": -1.499299168586731, + "logps/chosen": -355.2225646972656, + "logps/rejected": -351.27313232421875, + "loss": 0.0043, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.2374690771102905, + "rewards/margins": 13.03515338897705, + "rewards/rejected": -14.272623062133789, + "step": 600 + }, + { + "epoch": 1.9209914794732765, + "grad_norm": 0.006185224745422602, + "learning_rate": 0.00019449372563954293, + "logits/chosen": -1.9587417840957642, + "logits/rejected": -1.4495702981948853, + "logps/chosen": -383.0813903808594, + "logps/rejected": -355.744873046875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4742207527160645, + "rewards/margins": 13.494425773620605, + "rewards/rejected": -15.968646049499512, + "step": 620 + }, + { + "epoch": 1.9829589465530595, + "grad_norm": 0.006004327442497015, + "learning_rate": 0.00019406633966986828, + "logits/chosen": -1.9453758001327515, + "logits/rejected": -1.512027621269226, + "logps/chosen": -392.6808166503906, + "logps/rejected": -378.18316650390625, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3614554405212402, + "rewards/margins": 13.526113510131836, + "rewards/rejected": -15.88757038116455, + "step": 640 + }, + { + "epoch": 2.044926413632843, + "grad_norm": 0.013266593217849731, + "learning_rate": 0.00019362348706397373, + "logits/chosen": -1.9494597911834717, + "logits/rejected": -1.4765260219573975, + "logps/chosen": -373.5834045410156, + "logps/rejected": -355.810546875, + "loss": 0.0021, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.2433362007141113, + "rewards/margins": 13.272119522094727, + "rewards/rejected": -15.51545524597168, + "step": 660 + }, + { + "epoch": 2.106893880712626, + "grad_norm": 0.0013421621406450868, + "learning_rate": 0.0001931652406368554, + "logits/chosen": -1.879929542541504, + "logits/rejected": -1.4265925884246826, + "logps/chosen": -377.5626220703125, + "logps/rejected": -365.1024475097656, + "loss": 0.0016, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.054849624633789, + "rewards/margins": 14.068676948547363, + "rewards/rejected": -16.123525619506836, + "step": 680 + }, + { + "epoch": 2.168861347792409, + "grad_norm": 0.0016059954650700092, + "learning_rate": 0.0001926916757346022, + "logits/chosen": -1.8783481121063232, + "logits/rejected": -1.4017314910888672, + "logps/chosen": -375.7680969238281, + "logps/rejected": -356.9335021972656, + "loss": 0.0024, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.0512871742248535, + "rewards/margins": 14.513456344604492, + "rewards/rejected": -16.564743041992188, + "step": 700 + }, + { + "epoch": 2.230828814872192, + "grad_norm": 0.0020687805954366922, + "learning_rate": 0.00019220287022200707, + "logits/chosen": -1.8722127676010132, + "logits/rejected": -1.4170135259628296, + "logps/chosen": -360.9228515625, + "logps/rejected": -376.93304443359375, + "loss": 0.0024, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.443851947784424, + "rewards/margins": 15.007545471191406, + "rewards/rejected": -17.451396942138672, + "step": 720 + }, + { + "epoch": 2.292796281951975, + "grad_norm": 0.03182324767112732, + "learning_rate": 0.00019169890446976454, + "logits/chosen": -1.8520162105560303, + "logits/rejected": -1.316450834274292, + "logps/chosen": -392.74285888671875, + "logps/rejected": -379.98138427734375, + "loss": 0.0013, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.4773547649383545, + "rewards/margins": 15.281835556030273, + "rewards/rejected": -17.75918960571289, + "step": 740 + }, + { + "epoch": 2.3547637490317586, + "grad_norm": 0.015935391187667847, + "learning_rate": 0.0001911798613412557, + "logits/chosen": -1.8732004165649414, + "logits/rejected": -1.374529480934143, + "logps/chosen": -386.89178466796875, + "logps/rejected": -386.22894287109375, + "loss": 0.0034, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -2.536558151245117, + "rewards/margins": 15.137763977050781, + "rewards/rejected": -17.6743221282959, + "step": 760 + }, + { + "epoch": 2.4167312161115415, + "grad_norm": 0.00028358056442812085, + "learning_rate": 0.0001906458261789238, + "logits/chosen": -1.8395631313323975, + "logits/rejected": -1.3308550119400024, + "logps/chosen": -388.93792724609375, + "logps/rejected": -391.17559814453125, + "loss": 0.0018, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.6551461219787598, + "rewards/margins": 15.461560249328613, + "rewards/rejected": -18.116708755493164, + "step": 780 + }, + { + "epoch": 2.4786986831913245, + "grad_norm": 0.001103501650504768, + "learning_rate": 0.0001900968867902419, + "logits/chosen": -1.8540499210357666, + "logits/rejected": -1.3438807725906372, + "logps/chosen": -397.89093017578125, + "logps/rejected": -393.6608581542969, + "loss": 0.0015, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.684976100921631, + "rewards/margins": 15.562596321105957, + "rewards/rejected": -18.247573852539062, + "step": 800 + }, + { + "epoch": 2.5406661502711074, + "grad_norm": 0.05029486119747162, + "learning_rate": 0.0001895331334332753, + "logits/chosen": -1.8151705265045166, + "logits/rejected": -1.3103126287460327, + "logps/chosen": -396.3746643066406, + "logps/rejected": -391.5860900878906, + "loss": 0.0037, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.1363155841827393, + "rewards/margins": 15.38147258758545, + "rewards/rejected": -18.51778793334961, + "step": 820 + }, + { + "epoch": 2.602633617350891, + "grad_norm": 0.0015266811242327094, + "learning_rate": 0.0001889546588018412, + "logits/chosen": -1.850388765335083, + "logits/rejected": -1.3118959665298462, + "logps/chosen": -381.0390319824219, + "logps/rejected": -371.218505859375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7308974266052246, + "rewards/margins": 15.474958419799805, + "rewards/rejected": -18.205854415893555, + "step": 840 + }, + { + "epoch": 2.664601084430674, + "grad_norm": 0.010239909403026104, + "learning_rate": 0.00018836155801026753, + "logits/chosen": -1.8376766443252563, + "logits/rejected": -1.337482213973999, + "logps/chosen": -380.15032958984375, + "logps/rejected": -385.6625061035156, + "loss": 0.0059, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.8081612586975098, + "rewards/margins": 15.317975997924805, + "rewards/rejected": -18.12613868713379, + "step": 860 + }, + { + "epoch": 2.726568551510457, + "grad_norm": 0.005239796359091997, + "learning_rate": 0.00018775392857775432, + "logits/chosen": -1.8260116577148438, + "logits/rejected": -1.3371708393096924, + "logps/chosen": -386.72052001953125, + "logps/rejected": -393.1973571777344, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4543259143829346, + "rewards/margins": 15.405393600463867, + "rewards/rejected": -18.859722137451172, + "step": 880 + }, + { + "epoch": 2.78853601859024, + "grad_norm": 0.0014312748098745942, + "learning_rate": 0.00018713187041233896, + "logits/chosen": -1.8437349796295166, + "logits/rejected": -1.295083999633789, + "logps/chosen": -396.12713623046875, + "logps/rejected": -400.5750427246094, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4062328338623047, + "rewards/margins": 17.027809143066406, + "rewards/rejected": -20.434043884277344, + "step": 900 + }, + { + "epoch": 2.850503485670023, + "grad_norm": 0.03151211887598038, + "learning_rate": 0.00018649548579446936, + "logits/chosen": -1.8418632745742798, + "logits/rejected": -1.3832991123199463, + "logps/chosen": -387.4415588378906, + "logps/rejected": -418.4268493652344, + "loss": 0.0036, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -3.485564708709717, + "rewards/margins": 15.658266067504883, + "rewards/rejected": -19.14383316040039, + "step": 920 + }, + { + "epoch": 2.9124709527498065, + "grad_norm": 0.003437014762312174, + "learning_rate": 0.00018584487936018661, + "logits/chosen": -1.957241415977478, + "logits/rejected": -1.4707096815109253, + "logps/chosen": -370.52734375, + "logps/rejected": -367.0068054199219, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7640680074691772, + "rewards/margins": 14.591270446777344, + "rewards/rejected": -16.3553409576416, + "step": 940 + }, + { + "epoch": 2.9744384198295895, + "grad_norm": 0.0018515066476538777, + "learning_rate": 0.00018518015808392045, + "logits/chosen": -1.8616878986358643, + "logits/rejected": -1.3850669860839844, + "logps/chosen": -370.74847412109375, + "logps/rejected": -395.7770690917969, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.756985664367676, + "rewards/margins": 15.77873420715332, + "rewards/rejected": -18.53571891784668, + "step": 960 + }, + { + "epoch": 3.0364058869093724, + "grad_norm": 0.0055403695441782475, + "learning_rate": 0.00018450143126090015, + "logits/chosen": -1.9129266738891602, + "logits/rejected": -1.4352341890335083, + "logps/chosen": -378.54547119140625, + "logps/rejected": -389.22955322265625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.454970359802246, + "rewards/margins": 15.567869186401367, + "rewards/rejected": -18.022838592529297, + "step": 980 + }, + { + "epoch": 3.098373353989156, + "grad_norm": 0.0003845282772090286, + "learning_rate": 0.00018380881048918405, + "logits/chosen": -1.955512285232544, + "logits/rejected": -1.4428436756134033, + "logps/chosen": -375.7381286621094, + "logps/rejected": -373.1043701171875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9304916858673096, + "rewards/margins": 15.572137832641602, + "rewards/rejected": -17.502628326416016, + "step": 1000 + }, + { + "epoch": 3.1603408210689388, + "grad_norm": 0.000813652528449893, + "learning_rate": 0.00018310240965131041, + "logits/chosen": -1.9499313831329346, + "logits/rejected": -1.4106732606887817, + "logps/chosen": -363.78314208984375, + "logps/rejected": -364.62835693359375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.788172721862793, + "rewards/margins": 15.584823608398438, + "rewards/rejected": -17.372997283935547, + "step": 1020 + }, + { + "epoch": 3.2223082881487217, + "grad_norm": 0.0015642641810700297, + "learning_rate": 0.00018238234489557215, + "logits/chosen": -1.9376710653305054, + "logits/rejected": -1.4058828353881836, + "logps/chosen": -391.0188903808594, + "logps/rejected": -384.52716064453125, + "loss": 0.0023, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -1.709324598312378, + "rewards/margins": 16.003910064697266, + "rewards/rejected": -17.713237762451172, + "step": 1040 + }, + { + "epoch": 3.284275755228505, + "grad_norm": 0.013190961442887783, + "learning_rate": 0.00018164873461691986, + "logits/chosen": -1.9225285053253174, + "logits/rejected": -1.4039231538772583, + "logps/chosen": -389.7248840332031, + "logps/rejected": -403.44891357421875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2535457611083984, + "rewards/margins": 17.14788818359375, + "rewards/rejected": -19.401432037353516, + "step": 1060 + }, + { + "epoch": 3.346243222308288, + "grad_norm": 0.0009441258735023439, + "learning_rate": 0.00018090169943749476, + "logits/chosen": -1.9266620874404907, + "logits/rejected": -1.3820419311523438, + "logps/chosen": -377.3229064941406, + "logps/rejected": -394.3813171386719, + "loss": 0.0012, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.6834962368011475, + "rewards/margins": 16.853666305541992, + "rewards/rejected": -19.537160873413086, + "step": 1080 + }, + { + "epoch": 3.4082106893880715, + "grad_norm": 0.000891213770955801, + "learning_rate": 0.00018014136218679567, + "logits/chosen": -1.8898261785507202, + "logits/rejected": -1.3582581281661987, + "logps/chosen": -367.8475341796875, + "logps/rejected": -381.94219970703125, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8650197982788086, + "rewards/margins": 16.576953887939453, + "rewards/rejected": -19.441974639892578, + "step": 1100 + }, + { + "epoch": 3.4701781564678544, + "grad_norm": 0.0021270292345434427, + "learning_rate": 0.00017936784788148328, + "logits/chosen": -1.9054046869277954, + "logits/rejected": -1.3137685060501099, + "logps/chosen": -396.55718994140625, + "logps/rejected": -399.8603515625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9427146911621094, + "rewards/margins": 17.294252395629883, + "rewards/rejected": -20.236968994140625, + "step": 1120 + }, + { + "epoch": 3.5321456235476374, + "grad_norm": 0.0006443614838644862, + "learning_rate": 0.00017858128370482426, + "logits/chosen": -1.8784294128417969, + "logits/rejected": -1.3266098499298096, + "logps/chosen": -376.5830993652344, + "logps/rejected": -384.6981506347656, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.870404005050659, + "rewards/margins": 17.322202682495117, + "rewards/rejected": -20.192609786987305, + "step": 1140 + }, + { + "epoch": 3.5941130906274203, + "grad_norm": 0.0011427829740568995, + "learning_rate": 0.00017778179898577973, + "logits/chosen": -1.8605209589004517, + "logits/rejected": -1.3551753759384155, + "logps/chosen": -393.83099365234375, + "logps/rejected": -431.01824951171875, + "loss": 0.0044, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.188037872314453, + "rewards/margins": 17.58969497680664, + "rewards/rejected": -21.77773094177246, + "step": 1160 + }, + { + "epoch": 3.6560805577072037, + "grad_norm": 0.00015023932792246342, + "learning_rate": 0.00017696952517774062, + "logits/chosen": -1.8713442087173462, + "logits/rejected": -1.2884734869003296, + "logps/chosen": -389.5274658203125, + "logps/rejected": -406.44696044921875, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -3.2542433738708496, + "rewards/margins": 18.175609588623047, + "rewards/rejected": -21.429855346679688, + "step": 1180 + }, + { + "epoch": 3.7180480247869867, + "grad_norm": 0.0034171934239566326, + "learning_rate": 0.00017614459583691346, + "logits/chosen": -1.8342435359954834, + "logits/rejected": -1.33168625831604, + "logps/chosen": -392.7457275390625, + "logps/rejected": -424.7430725097656, + "loss": 0.0012, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.033926963806152, + "rewards/margins": 17.532773971557617, + "rewards/rejected": -21.566701889038086, + "step": 1200 + }, + { + "epoch": 3.78001549186677, + "grad_norm": 0.00014497939264401793, + "learning_rate": 0.00017530714660036112, + "logits/chosen": -1.8120412826538086, + "logits/rejected": -1.2837426662445068, + "logps/chosen": -400.38055419921875, + "logps/rejected": -432.98175048828125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.632486343383789, + "rewards/margins": 18.09763526916504, + "rewards/rejected": -21.730119705200195, + "step": 1220 + }, + { + "epoch": 3.841982958946553, + "grad_norm": 0.00035277256392873824, + "learning_rate": 0.0001744573151637007, + "logits/chosen": -1.7961149215698242, + "logits/rejected": -1.2880661487579346, + "logps/chosen": -389.3721618652344, + "logps/rejected": -458.435546875, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.226949214935303, + "rewards/margins": 18.70314598083496, + "rewards/rejected": -22.930095672607422, + "step": 1240 + }, + { + "epoch": 3.903950426026336, + "grad_norm": 0.0018203147919848561, + "learning_rate": 0.0001735952412584635, + "logits/chosen": -1.8189284801483154, + "logits/rejected": -1.2755413055419922, + "logps/chosen": -403.92608642578125, + "logps/rejected": -437.57470703125, + "loss": 0.0023, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.302323818206787, + "rewards/margins": 18.439044952392578, + "rewards/rejected": -22.741369247436523, + "step": 1260 + }, + { + "epoch": 3.9659178931061194, + "grad_norm": 0.000810753321275115, + "learning_rate": 0.00017272106662911973, + "logits/chosen": -1.8001739978790283, + "logits/rejected": -1.2190439701080322, + "logps/chosen": -392.6038513183594, + "logps/rejected": -409.79754638671875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5174388885498047, + "rewards/margins": 18.15955924987793, + "rewards/rejected": -21.676998138427734, + "step": 1280 + }, + { + "epoch": 4.027885360185903, + "grad_norm": 0.0008877617656253278, + "learning_rate": 0.00017183493500977278, + "logits/chosen": -1.7996867895126343, + "logits/rejected": -1.2403078079223633, + "logps/chosen": -376.8688659667969, + "logps/rejected": -401.3122863769531, + "loss": 0.0012, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -3.8793225288391113, + "rewards/margins": 17.706012725830078, + "rewards/rejected": -21.58533477783203, + "step": 1300 + }, + { + "epoch": 4.089852827265686, + "grad_norm": 0.0007201443077065051, + "learning_rate": 0.0001709369921005258, + "logits/chosen": -1.7817294597625732, + "logits/rejected": -1.3144575357437134, + "logps/chosen": -362.8156433105469, + "logps/rejected": -421.5276794433594, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -3.907405376434326, + "rewards/margins": 17.486907958984375, + "rewards/rejected": -21.394317626953125, + "step": 1320 + }, + { + "epoch": 4.151820294345469, + "grad_norm": 0.0004134229675401002, + "learning_rate": 0.00017002738554352552, + "logits/chosen": -1.7647602558135986, + "logits/rejected": -1.2397964000701904, + "logps/chosen": -400.63525390625, + "logps/rejected": -434.27734375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.650538444519043, + "rewards/margins": 17.86612319946289, + "rewards/rejected": -22.516660690307617, + "step": 1340 + }, + { + "epoch": 4.213787761425252, + "grad_norm": 0.0018414207734167576, + "learning_rate": 0.00016910626489868649, + "logits/chosen": -1.8098886013031006, + "logits/rejected": -1.2557048797607422, + "logps/chosen": -403.9068908691406, + "logps/rejected": -441.5738220214844, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8825366497039795, + "rewards/margins": 19.2824764251709, + "rewards/rejected": -23.165014266967773, + "step": 1360 + }, + { + "epoch": 4.275755228505035, + "grad_norm": 0.000604189292062074, + "learning_rate": 0.00016817378161909996, + "logits/chosen": -1.7331501245498657, + "logits/rejected": -1.1988348960876465, + "logps/chosen": -379.48004150390625, + "logps/rejected": -416.23504638671875, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.858603477478027, + "rewards/margins": 17.692523956298828, + "rewards/rejected": -22.551128387451172, + "step": 1380 + }, + { + "epoch": 4.337722695584818, + "grad_norm": 0.0018184883520007133, + "learning_rate": 0.0001672300890261317, + "logits/chosen": -1.786969780921936, + "logits/rejected": -1.1631317138671875, + "logps/chosen": -399.63836669921875, + "logps/rejected": -406.0413513183594, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.497194766998291, + "rewards/margins": 17.606014251708984, + "rewards/rejected": -22.103206634521484, + "step": 1400 + }, + { + "epoch": 4.3996901626646014, + "grad_norm": 0.0004817396984435618, + "learning_rate": 0.0001662753422842123, + "logits/chosen": -1.803607702255249, + "logits/rejected": -1.2023392915725708, + "logps/chosen": -397.8926086425781, + "logps/rejected": -415.9464416503906, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.3522844314575195, + "rewards/margins": 18.28469467163086, + "rewards/rejected": -22.636978149414062, + "step": 1420 + }, + { + "epoch": 4.461657629744384, + "grad_norm": 0.0003521572216413915, + "learning_rate": 0.00016530969837532487, + "logits/chosen": -1.745550513267517, + "logits/rejected": -1.2345880270004272, + "logps/chosen": -398.3353271484375, + "logps/rejected": -455.84991455078125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.580657005310059, + "rewards/margins": 18.520645141601562, + "rewards/rejected": -23.101301193237305, + "step": 1440 + }, + { + "epoch": 4.523625096824167, + "grad_norm": 0.001398236840032041, + "learning_rate": 0.00016433331607319343, + "logits/chosen": -1.7653003931045532, + "logits/rejected": -1.2409374713897705, + "logps/chosen": -390.4782409667969, + "logps/rejected": -445.02203369140625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.244819641113281, + "rewards/margins": 19.066150665283203, + "rewards/rejected": -23.31096839904785, + "step": 1460 + }, + { + "epoch": 4.58559256390395, + "grad_norm": 0.0006393153453245759, + "learning_rate": 0.00016334635591717703, + "logits/chosen": -1.7738897800445557, + "logits/rejected": -1.2459341287612915, + "logps/chosen": -405.1599426269531, + "logps/rejected": -465.34796142578125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.869115114212036, + "rewards/margins": 20.113529205322266, + "rewards/rejected": -23.98264503479004, + "step": 1480 + }, + { + "epoch": 4.647560030983733, + "grad_norm": 0.0002729636325966567, + "learning_rate": 0.00016234898018587337, + "logits/chosen": -1.7716586589813232, + "logits/rejected": -1.156842589378357, + "logps/chosen": -400.9200439453125, + "logps/rejected": -419.4234924316406, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.553537368774414, + "rewards/margins": 18.427448272705078, + "rewards/rejected": -22.980987548828125, + "step": 1500 + }, + { + "epoch": 4.709527498063517, + "grad_norm": 0.0016045222291722894, + "learning_rate": 0.00016134135287043669, + "logits/chosen": -1.7796188592910767, + "logits/rejected": -1.1779518127441406, + "logps/chosen": -407.48773193359375, + "logps/rejected": -439.03143310546875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.231381416320801, + "rewards/margins": 19.530107498168945, + "rewards/rejected": -23.761486053466797, + "step": 1520 + }, + { + "epoch": 4.7714949651433, + "grad_norm": 0.0001898371265269816, + "learning_rate": 0.00016032363964761363, + "logits/chosen": -1.7506084442138672, + "logits/rejected": -1.1158758401870728, + "logps/chosen": -412.0704650878906, + "logps/rejected": -419.58477783203125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.735566139221191, + "rewards/margins": 18.557144165039062, + "rewards/rejected": -23.292709350585938, + "step": 1540 + }, + { + "epoch": 4.833462432223083, + "grad_norm": 0.0011102559510618448, + "learning_rate": 0.00015929600785250257, + "logits/chosen": -1.772351861000061, + "logits/rejected": -1.199371576309204, + "logps/chosen": -411.6983337402344, + "logps/rejected": -456.08526611328125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.344552516937256, + "rewards/margins": 19.66854476928711, + "rewards/rejected": -24.01309585571289, + "step": 1560 + }, + { + "epoch": 4.895429899302866, + "grad_norm": 0.0002147419872926548, + "learning_rate": 0.0001582586264510396, + "logits/chosen": -1.7624610662460327, + "logits/rejected": -1.1555306911468506, + "logps/chosen": -392.86846923828125, + "logps/rejected": -411.6356506347656, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8805503845214844, + "rewards/margins": 18.482906341552734, + "rewards/rejected": -22.36345672607422, + "step": 1580 + }, + { + "epoch": 4.957397366382649, + "grad_norm": 0.00014843855751678348, + "learning_rate": 0.00015721166601221698, + "logits/chosen": -1.7433449029922485, + "logits/rejected": -1.1605427265167236, + "logps/chosen": -402.5615539550781, + "logps/rejected": -437.72601318359375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.379772186279297, + "rewards/margins": 19.26140022277832, + "rewards/rejected": -23.641170501708984, + "step": 1600 + }, + { + "epoch": 5.019364833462432, + "grad_norm": 9.896748815663159e-05, + "learning_rate": 0.0001561552986800375, + "logits/chosen": -1.7666635513305664, + "logits/rejected": -1.2081592082977295, + "logps/chosen": -409.02685546875, + "logps/rejected": -462.6644592285156, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.824324607849121, + "rewards/margins": 19.418132781982422, + "rewards/rejected": -24.242456436157227, + "step": 1620 + }, + { + "epoch": 5.081332300542216, + "grad_norm": 6.193404988152906e-05, + "learning_rate": 0.00015508969814521025, + "logits/chosen": -1.7530428171157837, + "logits/rejected": -1.2155699729919434, + "logps/chosen": -396.701171875, + "logps/rejected": -438.2998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.757896423339844, + "rewards/margins": 18.720035552978516, + "rewards/rejected": -23.47793197631836, + "step": 1640 + }, + { + "epoch": 5.143299767621999, + "grad_norm": 0.0005012938636355102, + "learning_rate": 0.00015401503961659204, + "logits/chosen": -1.76808762550354, + "logits/rejected": -1.2039562463760376, + "logps/chosen": -416.18133544921875, + "logps/rejected": -471.65032958984375, + "loss": 0.0055, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.3714799880981445, + "rewards/margins": 20.104217529296875, + "rewards/rejected": -24.475696563720703, + "step": 1660 + }, + { + "epoch": 5.205267234701782, + "grad_norm": 0.0007204354042187333, + "learning_rate": 0.00015293149979237876, + "logits/chosen": -1.700727105140686, + "logits/rejected": -1.1688693761825562, + "logps/chosen": -395.04620361328125, + "logps/rejected": -459.3890686035156, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.376019477844238, + "rewards/margins": 19.267929077148438, + "rewards/rejected": -24.643945693969727, + "step": 1680 + }, + { + "epoch": 5.267234701781565, + "grad_norm": 0.00012067196075804532, + "learning_rate": 0.00015183925683105254, + "logits/chosen": -1.7348114252090454, + "logits/rejected": -1.1479172706604004, + "logps/chosen": -411.1114807128906, + "logps/rejected": -467.02777099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.358091354370117, + "rewards/margins": 20.035839080810547, + "rewards/rejected": -24.393932342529297, + "step": 1700 + }, + { + "epoch": 5.329202168861348, + "grad_norm": 0.0015901889419183135, + "learning_rate": 0.00015073849032208822, + "logits/chosen": -1.7161178588867188, + "logits/rejected": -1.1550828218460083, + "logps/chosen": -408.5069885253906, + "logps/rejected": -455.2245178222656, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.192176342010498, + "rewards/margins": 19.474624633789062, + "rewards/rejected": -24.66680145263672, + "step": 1720 + }, + { + "epoch": 5.3911696359411305, + "grad_norm": 2.9804143196088262e-05, + "learning_rate": 0.00014962938125642503, + "logits/chosen": -1.7266225814819336, + "logits/rejected": -1.1720420122146606, + "logps/chosen": -404.70721435546875, + "logps/rejected": -468.11956787109375, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.117176532745361, + "rewards/margins": 19.882728576660156, + "rewards/rejected": -24.99990463256836, + "step": 1740 + }, + { + "epoch": 5.453137103020914, + "grad_norm": 0.001581120421178639, + "learning_rate": 0.00014851211199670721, + "logits/chosen": -1.7630701065063477, + "logits/rejected": -1.1630027294158936, + "logps/chosen": -387.80364990234375, + "logps/rejected": -445.5340270996094, + "loss": 0.0076, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -4.650803565979004, + "rewards/margins": 19.620697021484375, + "rewards/rejected": -24.271501541137695, + "step": 1760 + }, + { + "epoch": 5.515104570100697, + "grad_norm": 7.492147415177897e-05, + "learning_rate": 0.00014738686624729986, + "logits/chosen": -1.7199184894561768, + "logits/rejected": -1.1519477367401123, + "logps/chosen": -398.6278991699219, + "logps/rejected": -449.28826904296875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.650136947631836, + "rewards/margins": 19.344139099121094, + "rewards/rejected": -23.99427604675293, + "step": 1780 + }, + { + "epoch": 5.57707203718048, + "grad_norm": 0.0007189544849097729, + "learning_rate": 0.00014625382902408356, + "logits/chosen": -1.7485740184783936, + "logits/rejected": -1.15171217918396, + "logps/chosen": -413.4642639160156, + "logps/rejected": -454.82623291015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.690885543823242, + "rewards/margins": 19.775279998779297, + "rewards/rejected": -24.466161727905273, + "step": 1800 + }, + { + "epoch": 5.639039504260263, + "grad_norm": 9.353666246170178e-05, + "learning_rate": 0.00014511318662403347, + "logits/chosen": -1.7578392028808594, + "logits/rejected": -1.1830543279647827, + "logps/chosen": -395.25433349609375, + "logps/rejected": -461.00128173828125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.259980201721191, + "rewards/margins": 20.097646713256836, + "rewards/rejected": -24.35762596130371, + "step": 1820 + }, + { + "epoch": 5.701006971340046, + "grad_norm": 0.00011017426731996238, + "learning_rate": 0.00014396512659458824, + "logits/chosen": -1.718340277671814, + "logits/rejected": -1.1603585481643677, + "logps/chosen": -397.50201416015625, + "logps/rejected": -441.17120361328125, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.123129844665527, + "rewards/margins": 18.981271743774414, + "rewards/rejected": -24.104402542114258, + "step": 1840 + }, + { + "epoch": 5.76297443841983, + "grad_norm": 0.0007490446441806853, + "learning_rate": 0.0001428098377028126, + "logits/chosen": -1.7352231740951538, + "logits/rejected": -1.1633882522583008, + "logps/chosen": -395.93719482421875, + "logps/rejected": -450.5420837402344, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.510663032531738, + "rewards/margins": 20.08230972290039, + "rewards/rejected": -24.59296989440918, + "step": 1860 + }, + { + "epoch": 5.824941905499613, + "grad_norm": 0.002562998328357935, + "learning_rate": 0.0001416475099043599, + "logits/chosen": -1.7280263900756836, + "logits/rejected": -1.0888252258300781, + "logps/chosen": -383.5231628417969, + "logps/rejected": -423.22735595703125, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.361128330230713, + "rewards/margins": 19.707561492919922, + "rewards/rejected": -24.06869125366211, + "step": 1880 + }, + { + "epoch": 5.886909372579396, + "grad_norm": 0.0003409655182622373, + "learning_rate": 0.00014047833431223938, + "logits/chosen": -1.7228466272354126, + "logits/rejected": -1.1678210496902466, + "logps/chosen": -427.7156677246094, + "logps/rejected": -484.9002990722656, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.072082996368408, + "rewards/margins": 19.94878387451172, + "rewards/rejected": -25.0208683013916, + "step": 1900 + }, + { + "epoch": 5.948876839659179, + "grad_norm": 3.485321212792769e-05, + "learning_rate": 0.00013930250316539238, + "logits/chosen": -1.7439708709716797, + "logits/rejected": -1.1591265201568604, + "logps/chosen": -409.28485107421875, + "logps/rejected": -464.5729064941406, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.592177867889404, + "rewards/margins": 20.056758880615234, + "rewards/rejected": -24.64893913269043, + "step": 1920 + }, + { + "epoch": 6.010844306738962, + "grad_norm": 0.0024052930530160666, + "learning_rate": 0.00013812020979708418, + "logits/chosen": -1.766571044921875, + "logits/rejected": -1.1335632801055908, + "logps/chosen": -409.98095703125, + "logps/rejected": -432.7437438964844, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.963695526123047, + "rewards/margins": 19.679019927978516, + "rewards/rejected": -24.642715454101562, + "step": 1940 + }, + { + "epoch": 6.072811773818745, + "grad_norm": 7.735176041023806e-05, + "learning_rate": 0.00013693164860311565, + "logits/chosen": -1.7631984949111938, + "logits/rejected": -1.1198147535324097, + "logps/chosen": -398.9923400878906, + "logps/rejected": -429.88861083984375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.000827312469482, + "rewards/margins": 20.33033561706543, + "rewards/rejected": -24.331165313720703, + "step": 1960 + }, + { + "epoch": 6.134779240898529, + "grad_norm": 0.0003688503638841212, + "learning_rate": 0.0001357370150098601, + "logits/chosen": -1.7265870571136475, + "logits/rejected": -1.1435579061508179, + "logps/chosen": -390.2747497558594, + "logps/rejected": -457.9873962402344, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.666455268859863, + "rewards/margins": 20.30272102355957, + "rewards/rejected": -24.969173431396484, + "step": 1980 + }, + { + "epoch": 6.196746707978312, + "grad_norm": 0.0016685057198628783, + "learning_rate": 0.00013453650544213076, + "logits/chosen": -1.7364275455474854, + "logits/rejected": -1.1212728023529053, + "logps/chosen": -404.72869873046875, + "logps/rejected": -440.9786071777344, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.581490993499756, + "rewards/margins": 19.78643035888672, + "rewards/rejected": -24.367919921875, + "step": 2000 + }, + { + "epoch": 6.258714175058095, + "grad_norm": 0.00023198116105049849, + "learning_rate": 0.00013333031729088419, + "logits/chosen": -1.7448314428329468, + "logits/rejected": -1.1462557315826416, + "logps/chosen": -401.00048828125, + "logps/rejected": -452.0621032714844, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.24946928024292, + "rewards/margins": 20.46927833557129, + "rewards/rejected": -24.718748092651367, + "step": 2020 + }, + { + "epoch": 6.3206816421378775, + "grad_norm": 0.00022464637004304677, + "learning_rate": 0.00013211864888076457, + "logits/chosen": -1.691931962966919, + "logits/rejected": -1.16156005859375, + "logps/chosen": -417.93585205078125, + "logps/rejected": -468.42791748046875, + "loss": 0.0044, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -5.851279258728027, + "rewards/margins": 19.037456512451172, + "rewards/rejected": -24.888734817504883, + "step": 2040 + }, + { + "epoch": 6.3826491092176605, + "grad_norm": 0.0001370076060993597, + "learning_rate": 0.00013090169943749476, + "logits/chosen": -1.7306629419326782, + "logits/rejected": -1.16789972782135, + "logps/chosen": -400.44989013671875, + "logps/rejected": -461.5997009277344, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.793812274932861, + "rewards/margins": 20.2277889251709, + "rewards/rejected": -25.02159881591797, + "step": 2060 + }, + { + "epoch": 6.4446165762974434, + "grad_norm": 0.0007584911654703319, + "learning_rate": 0.00012967966905511906, + "logits/chosen": -1.7538254261016846, + "logits/rejected": -1.1523357629776, + "logps/chosen": -400.55078125, + "logps/rejected": -457.19439697265625, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.016867637634277, + "rewards/margins": 20.043991088867188, + "rewards/rejected": -25.06085777282715, + "step": 2080 + }, + { + "epoch": 6.506584043377227, + "grad_norm": 0.00025258222012780607, + "learning_rate": 0.00012845275866310324, + "logits/chosen": -1.709283471107483, + "logits/rejected": -1.1272356510162354, + "logps/chosen": -393.4644775390625, + "logps/rejected": -445.11932373046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.725881576538086, + "rewards/margins": 20.157442092895508, + "rewards/rejected": -24.88332176208496, + "step": 2100 + }, + { + "epoch": 6.56855151045701, + "grad_norm": 0.0005373629392124712, + "learning_rate": 0.00012722116999329712, + "logits/chosen": -1.7319450378417969, + "logits/rejected": -1.146323323249817, + "logps/chosen": -400.94219970703125, + "logps/rejected": -457.70294189453125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.707498073577881, + "rewards/margins": 19.930648803710938, + "rewards/rejected": -24.638147354125977, + "step": 2120 + }, + { + "epoch": 6.630518977536793, + "grad_norm": 3.2575491786701605e-05, + "learning_rate": 0.0001259851055467653, + "logits/chosen": -1.7204310894012451, + "logits/rejected": -1.1470435857772827, + "logps/chosen": -407.14794921875, + "logps/rejected": -463.16937255859375, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.401209831237793, + "rewards/margins": 19.731382369995117, + "rewards/rejected": -25.132593154907227, + "step": 2140 + }, + { + "epoch": 6.692486444616576, + "grad_norm": 4.120891753700562e-05, + "learning_rate": 0.00012474476856049144, + "logits/chosen": -1.758186936378479, + "logits/rejected": -1.0516242980957031, + "logps/chosen": -422.578125, + "logps/rejected": -450.13360595703125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.05043888092041, + "rewards/margins": 20.296903610229492, + "rewards/rejected": -25.347341537475586, + "step": 2160 + }, + { + "epoch": 6.754453911696359, + "grad_norm": 0.0018112401012331247, + "learning_rate": 0.00012350036297396154, + "logits/chosen": -1.7569530010223389, + "logits/rejected": -1.1236534118652344, + "logps/chosen": -398.9664001464844, + "logps/rejected": -440.2588806152344, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.544419288635254, + "rewards/margins": 20.12918472290039, + "rewards/rejected": -24.673603057861328, + "step": 2180 + }, + { + "epoch": 6.816421378776143, + "grad_norm": 0.0009737831423990428, + "learning_rate": 0.00012225209339563145, + "logits/chosen": -1.709917664527893, + "logits/rejected": -1.1064178943634033, + "logps/chosen": -414.5459899902344, + "logps/rejected": -465.4837341308594, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.264222145080566, + "rewards/margins": 20.37704849243164, + "rewards/rejected": -25.64126968383789, + "step": 2200 + }, + { + "epoch": 6.878388845855926, + "grad_norm": 0.000668133026920259, + "learning_rate": 0.00012100016506928493, + "logits/chosen": -1.733787178993225, + "logits/rejected": -1.1450860500335693, + "logps/chosen": -403.2812805175781, + "logps/rejected": -477.0782165527344, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.641029357910156, + "rewards/margins": 21.0471134185791, + "rewards/rejected": -25.68814468383789, + "step": 2220 + }, + { + "epoch": 6.940356312935709, + "grad_norm": 0.00028338556876406074, + "learning_rate": 0.00011974478384028672, + "logits/chosen": -1.703685998916626, + "logits/rejected": -1.0926717519760132, + "logps/chosen": -415.73248291015625, + "logps/rejected": -474.7493591308594, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.833617210388184, + "rewards/margins": 19.839744567871094, + "rewards/rejected": -25.67336082458496, + "step": 2240 + }, + { + "epoch": 7.002323780015492, + "grad_norm": 9.248249261872843e-05, + "learning_rate": 0.00011848615612173688, + "logits/chosen": -1.727691888809204, + "logits/rejected": -1.1385018825531006, + "logps/chosen": -404.37158203125, + "logps/rejected": -455.1560974121094, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.189269065856934, + "rewards/margins": 20.383289337158203, + "rewards/rejected": -25.572555541992188, + "step": 2260 + }, + { + "epoch": 7.064291247095275, + "grad_norm": 1.9335082470206544e-05, + "learning_rate": 0.0001172244888605319, + "logits/chosen": -1.687378168106079, + "logits/rejected": -1.1057562828063965, + "logps/chosen": -406.32733154296875, + "logps/rejected": -474.8482360839844, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.787657737731934, + "rewards/margins": 20.789146423339844, + "rewards/rejected": -25.576807022094727, + "step": 2280 + }, + { + "epoch": 7.126258714175058, + "grad_norm": 8.403878018725663e-05, + "learning_rate": 0.00011595998950333793, + "logits/chosen": -1.6789989471435547, + "logits/rejected": -1.1095144748687744, + "logps/chosen": -409.31524658203125, + "logps/rejected": -472.5364685058594, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.127674579620361, + "rewards/margins": 20.548160552978516, + "rewards/rejected": -25.675832748413086, + "step": 2300 + }, + { + "epoch": 7.188226181254842, + "grad_norm": 0.0001840272598201409, + "learning_rate": 0.00011469286596248181, + "logits/chosen": -1.7186450958251953, + "logits/rejected": -1.0815023183822632, + "logps/chosen": -402.4718322753906, + "logps/rejected": -446.8160095214844, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.929797172546387, + "rewards/margins": 20.37470245361328, + "rewards/rejected": -25.304500579833984, + "step": 2320 + }, + { + "epoch": 7.2501936483346245, + "grad_norm": 0.00030283021624200046, + "learning_rate": 0.00011342332658176555, + "logits/chosen": -1.7267248630523682, + "logits/rejected": -1.1029185056686401, + "logps/chosen": -407.1277160644531, + "logps/rejected": -443.208251953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.898409843444824, + "rewards/margins": 19.7962589263916, + "rewards/rejected": -24.69466781616211, + "step": 2340 + }, + { + "epoch": 7.3121611154144075, + "grad_norm": 0.000179938884684816, + "learning_rate": 0.00011221521661813197, + "logits/chosen": -1.7125059366226196, + "logits/rejected": -1.107881784439087, + "logps/chosen": -411.54571533203125, + "logps/rejected": -468.47821044921875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.634856700897217, + "rewards/margins": 20.49616050720215, + "rewards/rejected": -26.131017684936523, + "step": 2360 + }, + { + "epoch": 7.3741285824941905, + "grad_norm": 0.00018190982518717647, + "learning_rate": 0.0001109415670719721, + "logits/chosen": -1.6849457025527954, + "logits/rejected": -1.0680724382400513, + "logps/chosen": -408.02587890625, + "logps/rejected": -460.41015625, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.917786598205566, + "rewards/margins": 20.782718658447266, + "rewards/rejected": -25.700504302978516, + "step": 2380 + }, + { + "epoch": 7.436096049573973, + "grad_norm": 0.00010547572310315445, + "learning_rate": 0.00010966611848443176, + "logits/chosen": -1.6835496425628662, + "logits/rejected": -1.0897111892700195, + "logps/chosen": -407.20318603515625, + "logps/rejected": -464.83935546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.396719932556152, + "rewards/margins": 20.730510711669922, + "rewards/rejected": -26.127233505249023, + "step": 2400 + }, + { + "epoch": 7.498063516653756, + "grad_norm": 0.0002746889949776232, + "learning_rate": 0.00010838908056813919, + "logits/chosen": -1.7222875356674194, + "logits/rejected": -1.0569690465927124, + "logps/chosen": -397.06500244140625, + "logps/rejected": -429.73663330078125, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.969448566436768, + "rewards/margins": 20.237773895263672, + "rewards/rejected": -25.20722007751465, + "step": 2420 + }, + { + "epoch": 7.56003098373354, + "grad_norm": 0.0010378537699580193, + "learning_rate": 0.00010711066329704423, + "logits/chosen": -1.7328182458877563, + "logits/rejected": -1.0489845275878906, + "logps/chosen": -410.6394958496094, + "logps/rejected": -457.23126220703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.752233505249023, + "rewards/margins": 20.957183837890625, + "rewards/rejected": -25.70941734313965, + "step": 2440 + }, + { + "epoch": 7.621998450813323, + "grad_norm": 0.00035315402783453465, + "learning_rate": 0.00010583107687189388, + "logits/chosen": -1.7303959131240845, + "logits/rejected": -1.0627490282058716, + "logps/chosen": -394.2586364746094, + "logps/rejected": -438.1336975097656, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.017716407775879, + "rewards/margins": 20.087886810302734, + "rewards/rejected": -25.105602264404297, + "step": 2460 + }, + { + "epoch": 7.683965917893106, + "grad_norm": 5.2913201216142625e-05, + "learning_rate": 0.00010455053168567064, + "logits/chosen": -1.701934814453125, + "logits/rejected": -1.0837266445159912, + "logps/chosen": -411.44390869140625, + "logps/rejected": -451.9497985839844, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.648865699768066, + "rewards/margins": 20.401885986328125, + "rewards/rejected": -26.050750732421875, + "step": 2480 + }, + { + "epoch": 7.745933384972889, + "grad_norm": 0.0004144099075347185, + "learning_rate": 0.00010326923828899894, + "logits/chosen": -1.66423761844635, + "logits/rejected": -1.0931271314620972, + "logps/chosen": -413.04266357421875, + "logps/rejected": -468.1424255371094, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.445749282836914, + "rewards/margins": 20.35373306274414, + "rewards/rejected": -25.799480438232422, + "step": 2500 + }, + { + "epoch": 7.807900852052672, + "grad_norm": 0.0005614625406451523, + "learning_rate": 0.00010198740735552596, + "logits/chosen": -1.7007503509521484, + "logits/rejected": -1.0203969478607178, + "logps/chosen": -409.26434326171875, + "logps/rejected": -450.35284423828125, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.619626522064209, + "rewards/margins": 20.54979133605957, + "rewards/rejected": -26.169414520263672, + "step": 2520 + }, + { + "epoch": 7.869868319132456, + "grad_norm": 0.00046529798419214785, + "learning_rate": 0.00010070524964728218, + "logits/chosen": -1.6950366497039795, + "logits/rejected": -1.0599762201309204, + "logps/chosen": -388.9576416015625, + "logps/rejected": -438.4559020996094, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.69763708114624, + "rewards/margins": 19.549518585205078, + "rewards/rejected": -25.247156143188477, + "step": 2540 + }, + { + "epoch": 7.931835786212239, + "grad_norm": 0.0005010979948565364, + "learning_rate": 9.942297598002714e-05, + "logits/chosen": -1.6910135746002197, + "logits/rejected": -1.088746190071106, + "logps/chosen": -409.673583984375, + "logps/rejected": -460.9344177246094, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.090248107910156, + "rewards/margins": 20.458660125732422, + "rewards/rejected": -25.548908233642578, + "step": 2560 + }, + { + "epoch": 7.993803253292022, + "grad_norm": 2.1018489860580303e-05, + "learning_rate": 9.814079718858677e-05, + "logits/chosen": -1.6951793432235718, + "logits/rejected": -1.1038161516189575, + "logps/chosen": -427.29669189453125, + "logps/rejected": -482.02362060546875, + "loss": 0.0065, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.359341621398926, + "rewards/margins": 20.788881301879883, + "rewards/rejected": -26.148223876953125, + "step": 2580 + }, + { + "epoch": 8.055770720371806, + "grad_norm": 0.00020114157814532518, + "learning_rate": 9.685892409218717e-05, + "logits/chosen": -1.702978491783142, + "logits/rejected": -1.0864311456680298, + "logps/chosen": -405.50567626953125, + "logps/rejected": -455.3516540527344, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.847678184509277, + "rewards/margins": 20.718107223510742, + "rewards/rejected": -25.565786361694336, + "step": 2600 + }, + { + "epoch": 8.117738187451588, + "grad_norm": 0.00014650092634838074, + "learning_rate": 9.557756745979138e-05, + "logits/chosen": -1.692112922668457, + "logits/rejected": -1.106385588645935, + "logps/chosen": -400.7706298828125, + "logps/rejected": -458.6825256347656, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.732221603393555, + "rewards/margins": 21.058570861816406, + "rewards/rejected": -25.79079246520996, + "step": 2620 + }, + { + "epoch": 8.179705654531372, + "grad_norm": 0.0003632131847552955, + "learning_rate": 9.429693797544388e-05, + "logits/chosen": -1.727189302444458, + "logits/rejected": -1.0760419368743896, + "logps/chosen": -401.86767578125, + "logps/rejected": -446.3102111816406, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.723801612854004, + "rewards/margins": 20.717304229736328, + "rewards/rejected": -25.441104888916016, + "step": 2640 + }, + { + "epoch": 8.241673121611154, + "grad_norm": 0.00047560204984620214, + "learning_rate": 9.301724620362973e-05, + "logits/chosen": -1.7449928522109985, + "logits/rejected": -1.0541192293167114, + "logps/chosen": -409.01959228515625, + "logps/rejected": -449.57666015625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.505074501037598, + "rewards/margins": 20.396114349365234, + "rewards/rejected": -25.901187896728516, + "step": 2660 + }, + { + "epoch": 8.303640588690937, + "grad_norm": 0.0010067891562357545, + "learning_rate": 9.173870255465275e-05, + "logits/chosen": -1.7413511276245117, + "logits/rejected": -1.073628544807434, + "logps/chosen": -413.9063415527344, + "logps/rejected": -457.25042724609375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.8417158126831055, + "rewards/margins": 20.952346801757812, + "rewards/rejected": -25.7940616607666, + "step": 2680 + }, + { + "epoch": 8.36560805577072, + "grad_norm": 0.0007608987507410347, + "learning_rate": 9.046151725003931e-05, + "logits/chosen": -1.738470435142517, + "logits/rejected": -1.118428111076355, + "logps/chosen": -406.96368408203125, + "logps/rejected": -458.2310485839844, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.3068695068359375, + "rewards/margins": 20.518783569335938, + "rewards/rejected": -25.825653076171875, + "step": 2700 + }, + { + "epoch": 8.427575522850503, + "grad_norm": 0.00037170801078900695, + "learning_rate": 8.918590028797327e-05, + "logits/chosen": -1.6667039394378662, + "logits/rejected": -1.076485276222229, + "logps/chosen": -417.1942443847656, + "logps/rejected": -475.34478759765625, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.047384262084961, + "rewards/margins": 21.4394588470459, + "rewards/rejected": -26.48684310913086, + "step": 2720 + }, + { + "epoch": 8.489542989930287, + "grad_norm": 0.00017155329987872392, + "learning_rate": 8.791206140876746e-05, + "logits/chosen": -1.6952327489852905, + "logits/rejected": -1.0440196990966797, + "logps/chosen": -390.47991943359375, + "logps/rejected": -446.51611328125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.408968448638916, + "rewards/margins": 20.748926162719727, + "rewards/rejected": -25.157894134521484, + "step": 2740 + }, + { + "epoch": 8.55151045701007, + "grad_norm": 4.225455268169753e-05, + "learning_rate": 8.664021006037762e-05, + "logits/chosen": -1.7128692865371704, + "logits/rejected": -1.0821470022201538, + "logps/chosen": -424.44549560546875, + "logps/rejected": -469.12652587890625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.529724597930908, + "rewards/margins": 20.326000213623047, + "rewards/rejected": -25.855722427368164, + "step": 2760 + }, + { + "epoch": 8.613477924089853, + "grad_norm": 0.0004146189312450588, + "learning_rate": 8.537055536396439e-05, + "logits/chosen": -1.7189327478408813, + "logits/rejected": -1.1234623193740845, + "logps/chosen": -413.88092041015625, + "logps/rejected": -489.74432373046875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.031737327575684, + "rewards/margins": 20.76127815246582, + "rewards/rejected": -26.793010711669922, + "step": 2780 + }, + { + "epoch": 8.675445391169635, + "grad_norm": 0.0011191857047379017, + "learning_rate": 8.410330607950913e-05, + "logits/chosen": -1.6889803409576416, + "logits/rejected": -1.0510902404785156, + "logps/chosen": -409.9695739746094, + "logps/rejected": -461.45257568359375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.462882041931152, + "rewards/margins": 20.715688705444336, + "rewards/rejected": -26.178569793701172, + "step": 2800 + }, + { + "epoch": 8.737412858249419, + "grad_norm": 0.0015039819991216063, + "learning_rate": 8.283867057148902e-05, + "logits/chosen": -1.6871960163116455, + "logits/rejected": -1.1272326707839966, + "logps/chosen": -424.3963928222656, + "logps/rejected": -478.30535888671875, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.292850971221924, + "rewards/margins": 20.825016021728516, + "rewards/rejected": -26.117868423461914, + "step": 2820 + }, + { + "epoch": 8.799380325329203, + "grad_norm": 0.00024371009203605354, + "learning_rate": 8.157685677461708e-05, + "logits/chosen": -1.7314860820770264, + "logits/rejected": -1.0632710456848145, + "logps/chosen": -411.5020446777344, + "logps/rejected": -450.3389587402344, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.654230117797852, + "rewards/margins": 21.339710235595703, + "rewards/rejected": -25.993938446044922, + "step": 2840 + }, + { + "epoch": 8.861347792408985, + "grad_norm": 0.0004402414197102189, + "learning_rate": 8.031807215965337e-05, + "logits/chosen": -1.7364399433135986, + "logits/rejected": -1.0983723402023315, + "logps/chosen": -417.08746337890625, + "logps/rejected": -472.83984375, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.4446940422058105, + "rewards/margins": 21.18663215637207, + "rewards/rejected": -26.63132667541504, + "step": 2860 + }, + { + "epoch": 8.923315259488769, + "grad_norm": 0.00047181983245536685, + "learning_rate": 7.906252369929154e-05, + "logits/chosen": -1.6905673742294312, + "logits/rejected": -1.084665060043335, + "logps/chosen": -393.9977111816406, + "logps/rejected": -455.0557556152344, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.183560371398926, + "rewards/margins": 20.739307403564453, + "rewards/rejected": -25.922870635986328, + "step": 2880 + }, + { + "epoch": 8.98528272656855, + "grad_norm": 0.0003129359392914921, + "learning_rate": 7.781041783412845e-05, + "logits/chosen": -1.6950937509536743, + "logits/rejected": -1.0535084009170532, + "logps/chosen": -418.62701416015625, + "logps/rejected": -476.28387451171875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.857310771942139, + "rewards/margins": 21.914113998413086, + "rewards/rejected": -26.771427154541016, + "step": 2900 + }, + { + "epoch": 9.047250193648335, + "grad_norm": 0.0004019307089038193, + "learning_rate": 7.656196043872012e-05, + "logits/chosen": -1.7096707820892334, + "logits/rejected": -1.1031239032745361, + "logps/chosen": -416.05206298828125, + "logps/rejected": -494.614990234375, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.534869194030762, + "rewards/margins": 21.93942642211914, + "rewards/rejected": -27.474294662475586, + "step": 2920 + }, + { + "epoch": 9.109217660728119, + "grad_norm": 0.0007387935766018927, + "learning_rate": 7.531735678773171e-05, + "logits/chosen": -1.7090095281600952, + "logits/rejected": -1.0878323316574097, + "logps/chosen": -400.01513671875, + "logps/rejected": -477.05535888671875, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.275289058685303, + "rewards/margins": 21.69790267944336, + "rewards/rejected": -26.973194122314453, + "step": 2940 + }, + { + "epoch": 9.1711851278079, + "grad_norm": 0.00027141955797560513, + "learning_rate": 7.407681152218535e-05, + "logits/chosen": -1.6808192729949951, + "logits/rejected": -1.0295798778533936, + "logps/chosen": -404.32513427734375, + "logps/rejected": -460.8975524902344, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.207651615142822, + "rewards/margins": 20.58077049255371, + "rewards/rejected": -25.788421630859375, + "step": 2960 + }, + { + "epoch": 9.233152594887684, + "grad_norm": 0.0005088089383207262, + "learning_rate": 7.284052861581288e-05, + "logits/chosen": -1.7368125915527344, + "logits/rejected": -1.0655357837677002, + "logps/chosen": -410.697021484375, + "logps/rejected": -453.0840759277344, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.395773410797119, + "rewards/margins": 20.73539924621582, + "rewards/rejected": -26.13117027282715, + "step": 2980 + }, + { + "epoch": 9.295120061967467, + "grad_norm": 0.0002143807359971106, + "learning_rate": 7.160871134151775e-05, + "logits/chosen": -1.6661646366119385, + "logits/rejected": -1.092222809791565, + "logps/chosen": -405.39154052734375, + "logps/rejected": -485.67578125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.456831455230713, + "rewards/margins": 21.252620697021484, + "rewards/rejected": -26.70945167541504, + "step": 3000 + }, + { + "epoch": 9.35708752904725, + "grad_norm": 8.41324872453697e-05, + "learning_rate": 7.038156223795224e-05, + "logits/chosen": -1.7362842559814453, + "logits/rejected": -1.082162857055664, + "logps/chosen": -410.0975646972656, + "logps/rejected": -466.8894958496094, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.814949989318848, + "rewards/margins": 21.61594009399414, + "rewards/rejected": -26.430889129638672, + "step": 3020 + }, + { + "epoch": 9.419054996127032, + "grad_norm": 2.4985982236103155e-05, + "learning_rate": 6.915928307621584e-05, + "logits/chosen": -1.7000200748443604, + "logits/rejected": -1.0128730535507202, + "logps/chosen": -417.96405029296875, + "logps/rejected": -461.15362548828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.359194278717041, + "rewards/margins": 21.4404296875, + "rewards/rejected": -25.79962158203125, + "step": 3040 + }, + { + "epoch": 9.481022463206816, + "grad_norm": 0.0002187406353186816, + "learning_rate": 6.794207482667918e-05, + "logits/chosen": -1.6875083446502686, + "logits/rejected": -1.0425808429718018, + "logps/chosen": -409.68170166015625, + "logps/rejected": -456.98114013671875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.01973295211792, + "rewards/margins": 20.8963623046875, + "rewards/rejected": -25.916095733642578, + "step": 3060 + }, + { + "epoch": 9.5429899302866, + "grad_norm": 0.0001037058827932924, + "learning_rate": 6.673013762594022e-05, + "logits/chosen": -1.6812347173690796, + "logits/rejected": -1.0920425653457642, + "logps/chosen": -409.3445129394531, + "logps/rejected": -463.01702880859375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.606844425201416, + "rewards/margins": 20.97027015686035, + "rewards/rejected": -26.57711410522461, + "step": 3080 + }, + { + "epoch": 9.604957397366382, + "grad_norm": 6.546611984958872e-05, + "learning_rate": 6.552367074391708e-05, + "logits/chosen": -1.6708405017852783, + "logits/rejected": -1.0272510051727295, + "logps/chosen": -421.3130798339844, + "logps/rejected": -468.8424377441406, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.817858695983887, + "rewards/margins": 21.14541244506836, + "rewards/rejected": -26.963272094726562, + "step": 3100 + }, + { + "epoch": 9.666924864446166, + "grad_norm": 0.0009899769211187959, + "learning_rate": 6.432287255108363e-05, + "logits/chosen": -1.7139580249786377, + "logits/rejected": -1.0682191848754883, + "logps/chosen": -415.08154296875, + "logps/rejected": -463.1947326660156, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.88477087020874, + "rewards/margins": 20.44330596923828, + "rewards/rejected": -26.328075408935547, + "step": 3120 + }, + { + "epoch": 9.728892331525948, + "grad_norm": 0.0010677826358005404, + "learning_rate": 6.312794048585286e-05, + "logits/chosen": -1.6608006954193115, + "logits/rejected": -1.0799270868301392, + "logps/chosen": -393.5787353515625, + "logps/rejected": -458.1851501464844, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.194777488708496, + "rewards/margins": 20.60002899169922, + "rewards/rejected": -25.7948055267334, + "step": 3140 + }, + { + "epoch": 9.790859798605732, + "grad_norm": 0.00037055814755149186, + "learning_rate": 6.193907102211358e-05, + "logits/chosen": -1.700254201889038, + "logits/rejected": -1.149086594581604, + "logps/chosen": -414.83575439453125, + "logps/rejected": -480.109375, + "loss": 0.0054, + "rewards/accuracies": 0.984375, + "rewards/chosen": -6.013056755065918, + "rewards/margins": 20.352540969848633, + "rewards/rejected": -26.3655948638916, + "step": 3160 + }, + { + "epoch": 9.852827265685516, + "grad_norm": 0.00012906199845019728, + "learning_rate": 6.075645963692567e-05, + "logits/chosen": -1.6764156818389893, + "logits/rejected": -1.0942738056182861, + "logps/chosen": -410.2710876464844, + "logps/rejected": -480.7608337402344, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.723294734954834, + "rewards/margins": 21.212993621826172, + "rewards/rejected": -26.936288833618164, + "step": 3180 + }, + { + "epoch": 9.914794732765298, + "grad_norm": 9.71817207755521e-05, + "learning_rate": 5.9580300778379087e-05, + "logits/chosen": -1.6972318887710571, + "logits/rejected": -1.06034255027771, + "logps/chosen": -414.45697021484375, + "logps/rejected": -478.67608642578125, + "loss": 0.0054, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.851905822753906, + "rewards/margins": 22.140657424926758, + "rewards/rejected": -26.992563247680664, + "step": 3200 + }, + { + "epoch": 9.976762199845082, + "grad_norm": 0.0005355001194402575, + "learning_rate": 5.8410787833622414e-05, + "logits/chosen": -1.701051950454712, + "logits/rejected": -1.0390212535858154, + "logps/chosen": -392.62689208984375, + "logps/rejected": -438.70660400390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.652411937713623, + "rewards/margins": 21.09701156616211, + "rewards/rejected": -25.749425888061523, + "step": 3220 + }, + { + "epoch": 10.038729666924864, + "grad_norm": 0.0007227555033750832, + "learning_rate": 5.724811309706547e-05, + "logits/chosen": -1.7204704284667969, + "logits/rejected": -1.0700039863586426, + "logps/chosen": -430.43206787109375, + "logps/rejected": -488.071044921875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.367037296295166, + "rewards/margins": 21.72504425048828, + "rewards/rejected": -27.092077255249023, + "step": 3240 + }, + { + "epoch": 10.100697134004648, + "grad_norm": 0.00017314284923486412, + "learning_rate": 5.6092467738761776e-05, + "logits/chosen": -1.6834897994995117, + "logits/rejected": -1.0887248516082764, + "logps/chosen": -416.51348876953125, + "logps/rejected": -469.4505920410156, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.5038862228393555, + "rewards/margins": 21.196359634399414, + "rewards/rejected": -26.700244903564453, + "step": 3260 + }, + { + "epoch": 10.162664601084431, + "grad_norm": 0.00027020045672543347, + "learning_rate": 5.494404177297595e-05, + "logits/chosen": -1.696730613708496, + "logits/rejected": -1.0611952543258667, + "logps/chosen": -399.0355529785156, + "logps/rejected": -449.93646240234375, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.055383682250977, + "rewards/margins": 20.96977996826172, + "rewards/rejected": -26.025165557861328, + "step": 3280 + }, + { + "epoch": 10.224632068164214, + "grad_norm": 0.0003596362948883325, + "learning_rate": 5.380302402694104e-05, + "logits/chosen": -1.7198495864868164, + "logits/rejected": -1.0654425621032715, + "logps/chosen": -390.9352722167969, + "logps/rejected": -453.2206115722656, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.068055629730225, + "rewards/margins": 20.917200088500977, + "rewards/rejected": -25.98525619506836, + "step": 3300 + }, + { + "epoch": 10.286599535243997, + "grad_norm": 2.4758495783316903e-05, + "learning_rate": 5.266960210981089e-05, + "logits/chosen": -1.664912462234497, + "logits/rejected": -1.0661206245422363, + "logps/chosen": -402.9308166503906, + "logps/rejected": -467.4169921875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.694643020629883, + "rewards/margins": 21.313457489013672, + "rewards/rejected": -27.008098602294922, + "step": 3320 + }, + { + "epoch": 10.34856700232378, + "grad_norm": 0.00036736109177581966, + "learning_rate": 5.15439623818132e-05, + "logits/chosen": -1.7021472454071045, + "logits/rejected": -1.1036940813064575, + "logps/chosen": -395.59149169921875, + "logps/rejected": -463.43316650390625, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.990979194641113, + "rewards/margins": 20.853925704956055, + "rewards/rejected": -26.84490394592285, + "step": 3340 + }, + { + "epoch": 10.410534469403563, + "grad_norm": 0.00021753676992375404, + "learning_rate": 5.042628992360755e-05, + "logits/chosen": -1.6948877573013306, + "logits/rejected": -1.0948389768600464, + "logps/chosen": -417.33160400390625, + "logps/rejected": -491.01483154296875, + "loss": 0.0033, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -5.629961967468262, + "rewards/margins": 21.473012924194336, + "rewards/rejected": -27.102975845336914, + "step": 3360 + }, + { + "epoch": 10.472501936483347, + "grad_norm": 0.0005015567876398563, + "learning_rate": 4.9316768505853864e-05, + "logits/chosen": -1.7080516815185547, + "logits/rejected": -1.0318862199783325, + "logps/chosen": -397.1073913574219, + "logps/rejected": -439.6314392089844, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.5096540451049805, + "rewards/margins": 20.36575698852539, + "rewards/rejected": -25.875408172607422, + "step": 3380 + }, + { + "epoch": 10.53446940356313, + "grad_norm": 0.000426275102654472, + "learning_rate": 4.8215580558996546e-05, + "logits/chosen": -1.6764377355575562, + "logits/rejected": -1.0771383047103882, + "logps/chosen": -404.91937255859375, + "logps/rejected": -485.12548828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.389082908630371, + "rewards/margins": 21.155742645263672, + "rewards/rejected": -26.54482650756836, + "step": 3400 + }, + { + "epoch": 10.596436870642913, + "grad_norm": 0.00011274849384790286, + "learning_rate": 4.7122907143268645e-05, + "logits/chosen": -1.7037220001220703, + "logits/rejected": -1.0873366594314575, + "logps/chosen": -417.3395080566406, + "logps/rejected": -485.4212951660156, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.40346622467041, + "rewards/margins": 21.43330955505371, + "rewards/rejected": -26.836772918701172, + "step": 3420 + }, + { + "epoch": 10.658404337722695, + "grad_norm": 0.0008545616874471307, + "learning_rate": 4.603892791892157e-05, + "logits/chosen": -1.7251865863800049, + "logits/rejected": -1.1108168363571167, + "logps/chosen": -409.8521423339844, + "logps/rejected": -483.19329833984375, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.274342060089111, + "rewards/margins": 22.360143661499023, + "rewards/rejected": -26.634485244750977, + "step": 3440 + }, + { + "epoch": 10.720371804802479, + "grad_norm": 0.0002442661498207599, + "learning_rate": 4.4963821116684645e-05, + "logits/chosen": -1.7168834209442139, + "logits/rejected": -1.0469696521759033, + "logps/chosen": -410.9766540527344, + "logps/rejected": -462.96759033203125, + "loss": 0.0043, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.01826286315918, + "rewards/margins": 21.594696044921875, + "rewards/rejected": -26.612957000732422, + "step": 3460 + }, + { + "epoch": 10.782339271882261, + "grad_norm": 2.5067949536605738e-05, + "learning_rate": 4.3897763508460235e-05, + "logits/chosen": -1.6555604934692383, + "logits/rejected": -1.067326307296753, + "logps/chosen": -411.1241149902344, + "logps/rejected": -471.122314453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.384194374084473, + "rewards/margins": 20.667926788330078, + "rewards/rejected": -26.052120208740234, + "step": 3480 + }, + { + "epoch": 10.844306738962045, + "grad_norm": 9.07514404389076e-05, + "learning_rate": 4.284093037825829e-05, + "logits/chosen": -1.7002710103988647, + "logits/rejected": -1.0244972705841064, + "logps/chosen": -396.713623046875, + "logps/rejected": -450.4693298339844, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.3324480056762695, + "rewards/margins": 20.980426788330078, + "rewards/rejected": -26.312875747680664, + "step": 3500 + }, + { + "epoch": 10.906274206041829, + "grad_norm": 0.0001592998596606776, + "learning_rate": 4.179349549337557e-05, + "logits/chosen": -1.704119086265564, + "logits/rejected": -1.0116019248962402, + "logps/chosen": -402.82666015625, + "logps/rejected": -443.30157470703125, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.895948886871338, + "rewards/margins": 21.18239402770996, + "rewards/rejected": -26.07834243774414, + "step": 3520 + }, + { + "epoch": 10.96824167312161, + "grad_norm": 1.9538027117960155e-05, + "learning_rate": 4.075563107582472e-05, + "logits/chosen": -1.668092966079712, + "logits/rejected": -1.065983533859253, + "logps/chosen": -398.3217468261719, + "logps/rejected": -477.6726989746094, + "loss": 0.0054, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -5.416517734527588, + "rewards/margins": 21.412036895751953, + "rewards/rejected": -26.82855224609375, + "step": 3540 + }, + { + "epoch": 11.030209140201395, + "grad_norm": 5.915413566981442e-05, + "learning_rate": 3.9727507774016635e-05, + "logits/chosen": -1.6671562194824219, + "logits/rejected": -1.0572084188461304, + "logps/chosen": -400.4344177246094, + "logps/rejected": -474.96038818359375, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.950907230377197, + "rewards/margins": 20.902238845825195, + "rewards/rejected": -26.8531494140625, + "step": 3560 + }, + { + "epoch": 11.092176607281177, + "grad_norm": 0.0006108521483838558, + "learning_rate": 3.8709294634702376e-05, + "logits/chosen": -1.7030471563339233, + "logits/rejected": -1.0317370891571045, + "logps/chosen": -398.74090576171875, + "logps/rejected": -459.75, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.71872615814209, + "rewards/margins": 22.286239624023438, + "rewards/rejected": -27.00496482849121, + "step": 3580 + }, + { + "epoch": 11.15414407436096, + "grad_norm": 0.000467544246930629, + "learning_rate": 3.770115907517773e-05, + "logits/chosen": -1.6686887741088867, + "logits/rejected": -1.0782063007354736, + "logps/chosen": -406.98138427734375, + "logps/rejected": -482.86572265625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.86759090423584, + "rewards/margins": 21.316923141479492, + "rewards/rejected": -27.184513092041016, + "step": 3600 + }, + { + "epoch": 11.216111541440744, + "grad_norm": 0.0004900813801214099, + "learning_rate": 3.670326685575632e-05, + "logits/chosen": -1.7124903202056885, + "logits/rejected": -1.0398648977279663, + "logps/chosen": -415.08648681640625, + "logps/rejected": -477.70709228515625, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.952596187591553, + "rewards/margins": 22.07376480102539, + "rewards/rejected": -27.026357650756836, + "step": 3620 + }, + { + "epoch": 11.278079008520526, + "grad_norm": 0.0002428332227282226, + "learning_rate": 3.571578205251459e-05, + "logits/chosen": -1.7211148738861084, + "logits/rejected": -1.1097770929336548, + "logps/chosen": -406.6622009277344, + "logps/rejected": -460.78643798828125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.563107490539551, + "rewards/margins": 21.05852699279785, + "rewards/rejected": -26.621633529663086, + "step": 3640 + }, + { + "epoch": 11.34004647560031, + "grad_norm": 0.0004079696664121002, + "learning_rate": 3.4738867030314235e-05, + "logits/chosen": -1.7017863988876343, + "logits/rejected": -1.0735719203948975, + "logps/chosen": -414.16339111328125, + "logps/rejected": -490.61944580078125, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.069756507873535, + "rewards/margins": 22.46738052368164, + "rewards/rejected": -27.53713607788086, + "step": 3660 + }, + { + "epoch": 11.402013942680092, + "grad_norm": 0.0001673255901550874, + "learning_rate": 3.377268241610555e-05, + "logits/chosen": -1.692521095275879, + "logits/rejected": -1.0149263143539429, + "logps/chosen": -412.38507080078125, + "logps/rejected": -467.0577697753906, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.867552280426025, + "rewards/margins": 20.83139991760254, + "rewards/rejected": -26.698949813842773, + "step": 3680 + }, + { + "epoch": 11.463981409759876, + "grad_norm": 0.00012532217078842223, + "learning_rate": 3.2817387072516726e-05, + "logits/chosen": -1.7133913040161133, + "logits/rejected": -1.1119440793991089, + "logps/chosen": -401.7035217285156, + "logps/rejected": -476.5845642089844, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.1463212966918945, + "rewards/margins": 22.046228408813477, + "rewards/rejected": -27.192550659179688, + "step": 3700 + }, + { + "epoch": 11.52594887683966, + "grad_norm": 0.0002491988998372108, + "learning_rate": 3.18731380717334e-05, + "logits/chosen": -1.6776504516601562, + "logits/rejected": -1.0443401336669922, + "logps/chosen": -402.75933837890625, + "logps/rejected": -455.70068359375, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.209097385406494, + "rewards/margins": 21.239925384521484, + "rewards/rejected": -26.449024200439453, + "step": 3720 + }, + { + "epoch": 11.587916343919442, + "grad_norm": 0.0005044552381150424, + "learning_rate": 3.0940090669672215e-05, + "logits/chosen": -1.6772470474243164, + "logits/rejected": -1.0744705200195312, + "logps/chosen": -400.09912109375, + "logps/rejected": -477.5372619628906, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.949058532714844, + "rewards/margins": 21.821866989135742, + "rewards/rejected": -26.770925521850586, + "step": 3740 + }, + { + "epoch": 11.649883810999226, + "grad_norm": 4.5204073103377596e-05, + "learning_rate": 3.001839828045342e-05, + "logits/chosen": -1.7325446605682373, + "logits/rejected": -1.063987135887146, + "logps/chosen": -415.75592041015625, + "logps/rejected": -452.0940856933594, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.492778778076172, + "rewards/margins": 20.81328582763672, + "rewards/rejected": -26.30606460571289, + "step": 3760 + }, + { + "epoch": 11.711851278079008, + "grad_norm": 0.0002700432378333062, + "learning_rate": 2.9108212451176033e-05, + "logits/chosen": -1.7305303812026978, + "logits/rejected": -1.083184003829956, + "logps/chosen": -400.70635986328125, + "logps/rejected": -472.36114501953125, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.0615034103393555, + "rewards/margins": 22.031635284423828, + "rewards/rejected": -27.093135833740234, + "step": 3780 + }, + { + "epoch": 11.773818745158792, + "grad_norm": 0.00013194057100918144, + "learning_rate": 2.8209682837000072e-05, + "logits/chosen": -1.6789268255233765, + "logits/rejected": -1.0528620481491089, + "logps/chosen": -403.6865539550781, + "logps/rejected": -479.7601623535156, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.470952033996582, + "rewards/margins": 21.67144775390625, + "rewards/rejected": -27.14239501953125, + "step": 3800 + }, + { + "epoch": 11.835786212238574, + "grad_norm": 0.0002364068350289017, + "learning_rate": 2.7322957176539777e-05, + "logits/chosen": -1.6753734350204468, + "logits/rejected": -1.0195820331573486, + "logps/chosen": -417.6498107910156, + "logps/rejected": -472.09844970703125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.569521903991699, + "rewards/margins": 20.978273391723633, + "rewards/rejected": -26.54779624938965, + "step": 3820 + }, + { + "epoch": 11.897753679318358, + "grad_norm": 0.00013174403284210712, + "learning_rate": 2.6448181267572226e-05, + "logits/chosen": -1.6455790996551514, + "logits/rejected": -1.046744465827942, + "logps/chosen": -410.19134521484375, + "logps/rejected": -483.3438415527344, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.728828430175781, + "rewards/margins": 21.940776824951172, + "rewards/rejected": -27.669601440429688, + "step": 3840 + }, + { + "epoch": 11.959721146398142, + "grad_norm": 0.00042892919736914337, + "learning_rate": 2.5585498943064724e-05, + "logits/chosen": -1.6926710605621338, + "logits/rejected": -1.0491944551467896, + "logps/chosen": -415.20550537109375, + "logps/rejected": -482.228271484375, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.751172065734863, + "rewards/margins": 21.466909408569336, + "rewards/rejected": -27.21807861328125, + "step": 3860 + }, + { + "epoch": 12.021688613477924, + "grad_norm": 8.727656677365303e-05, + "learning_rate": 2.4735052047525398e-05, + "logits/chosen": -1.7163196802139282, + "logits/rejected": -1.059697151184082, + "logps/chosen": -422.93359375, + "logps/rejected": -472.23583984375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.125914573669434, + "rewards/margins": 21.549646377563477, + "rewards/rejected": -26.675561904907227, + "step": 3880 + }, + { + "epoch": 12.083656080557708, + "grad_norm": 5.139048516866751e-05, + "learning_rate": 2.389698041368089e-05, + "logits/chosen": -1.682549238204956, + "logits/rejected": -1.0410518646240234, + "logps/chosen": -419.48529052734375, + "logps/rejected": -488.83154296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.320895195007324, + "rewards/margins": 22.32204246520996, + "rewards/rejected": -27.6429386138916, + "step": 3900 + }, + { + "epoch": 12.14562354763749, + "grad_norm": 0.00013814242265652865, + "learning_rate": 2.3071421839484554e-05, + "logits/chosen": -1.6900997161865234, + "logits/rejected": -1.0404036045074463, + "logps/chosen": -399.94854736328125, + "logps/rejected": -466.58642578125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.632657051086426, + "rewards/margins": 21.346328735351562, + "rewards/rejected": -26.978984832763672, + "step": 3920 + }, + { + "epoch": 12.207591014717273, + "grad_norm": 0.0001951899757841602, + "learning_rate": 2.2258512065459448e-05, + "logits/chosen": -1.6699708700180054, + "logits/rejected": -1.058363437652588, + "logps/chosen": -421.36419677734375, + "logps/rejected": -490.47100830078125, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.72733211517334, + "rewards/margins": 21.7630672454834, + "rewards/rejected": -27.490398406982422, + "step": 3940 + }, + { + "epoch": 12.269558481797057, + "grad_norm": 0.001167879207059741, + "learning_rate": 2.1458384752379357e-05, + "logits/chosen": -1.6963287591934204, + "logits/rejected": -1.078595757484436, + "logps/chosen": -400.4660339355469, + "logps/rejected": -470.71710205078125, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.399907112121582, + "rewards/margins": 21.62917709350586, + "rewards/rejected": -27.02908706665039, + "step": 3960 + }, + { + "epoch": 12.33152594887684, + "grad_norm": 9.643881639931351e-06, + "learning_rate": 2.067117145929216e-05, + "logits/chosen": -1.688515305519104, + "logits/rejected": -1.08303964138031, + "logps/chosen": -402.33795166015625, + "logps/rejected": -477.7525329589844, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -4.999421119689941, + "rewards/margins": 22.334285736083984, + "rewards/rejected": -27.333709716796875, + "step": 3980 + }, + { + "epoch": 12.393493415956623, + "grad_norm": 0.0006664241082035005, + "learning_rate": 1.9897001621888434e-05, + "logits/chosen": -1.7171924114227295, + "logits/rejected": -1.0485467910766602, + "logps/chosen": -409.967529296875, + "logps/rejected": -477.21551513671875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.805159568786621, + "rewards/margins": 22.3187198638916, + "rewards/rejected": -27.123876571655273, + "step": 4000 + }, + { + "epoch": 12.455460883036405, + "grad_norm": 5.3627591114491224e-06, + "learning_rate": 1.913600253121919e-05, + "logits/chosen": -1.677496314048767, + "logits/rejected": -1.0768311023712158, + "logps/chosen": -421.8292541503906, + "logps/rejected": -494.90606689453125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.249929904937744, + "rewards/margins": 21.906986236572266, + "rewards/rejected": -27.15691566467285, + "step": 4020 + }, + { + "epoch": 12.51742835011619, + "grad_norm": 3.554378781700507e-05, + "learning_rate": 1.838829931276653e-05, + "logits/chosen": -1.6907306909561157, + "logits/rejected": -1.0432696342468262, + "logps/chosen": -398.9062805175781, + "logps/rejected": -465.7071228027344, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -4.694939136505127, + "rewards/margins": 22.108684539794922, + "rewards/rejected": -26.80362319946289, + "step": 4040 + }, + { + "epoch": 12.579395817195973, + "grad_norm": 6.133209535619244e-05, + "learning_rate": 1.7654014905870098e-05, + "logits/chosen": -1.6698366403579712, + "logits/rejected": -1.0069531202316284, + "logps/chosen": -417.49237060546875, + "logps/rejected": -470.18902587890625, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.820713520050049, + "rewards/margins": 21.33327865600586, + "rewards/rejected": -27.15399169921875, + "step": 4060 + }, + { + "epoch": 12.641363284275755, + "grad_norm": 0.00020697916625067592, + "learning_rate": 1.6933270043513083e-05, + "logits/chosen": -1.677680253982544, + "logits/rejected": -1.0464431047439575, + "logps/chosen": -408.2115478515625, + "logps/rejected": -478.3711853027344, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.883364200592041, + "rewards/margins": 21.521183013916016, + "rewards/rejected": -27.404544830322266, + "step": 4080 + }, + { + "epoch": 12.703330751355539, + "grad_norm": 0.00018397132225800306, + "learning_rate": 1.622618323247087e-05, + "logits/chosen": -1.6993494033813477, + "logits/rejected": -1.0857021808624268, + "logps/chosen": -405.2132873535156, + "logps/rejected": -485.60321044921875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.594387531280518, + "rewards/margins": 21.590347290039062, + "rewards/rejected": -27.184734344482422, + "step": 4100 + }, + { + "epoch": 12.765298218435321, + "grad_norm": 0.00029773233109153807, + "learning_rate": 1.553287073382609e-05, + "logits/chosen": -1.7119516134262085, + "logits/rejected": -1.0656880140304565, + "logps/chosen": -405.5570373535156, + "logps/rejected": -462.2611389160156, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.116886615753174, + "rewards/margins": 21.456085205078125, + "rewards/rejected": -26.57297134399414, + "step": 4120 + }, + { + "epoch": 12.827265685515105, + "grad_norm": 0.0001080308502423577, + "learning_rate": 1.485344654385239e-05, + "logits/chosen": -1.6709296703338623, + "logits/rejected": -1.053741693496704, + "logps/chosen": -428.66839599609375, + "logps/rejected": -500.01092529296875, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.277214050292969, + "rewards/margins": 22.146846771240234, + "rewards/rejected": -28.424060821533203, + "step": 4140 + }, + { + "epoch": 12.889233152594887, + "grad_norm": 6.432453665183857e-05, + "learning_rate": 1.418802237527106e-05, + "logits/chosen": -1.68827223777771, + "logits/rejected": -1.0494086742401123, + "logps/chosen": -424.75286865234375, + "logps/rejected": -481.1043395996094, + "loss": 0.0033, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.699560165405273, + "rewards/margins": 21.662763595581055, + "rewards/rejected": -27.362323760986328, + "step": 4160 + }, + { + "epoch": 12.95120061967467, + "grad_norm": 0.0004029480624012649, + "learning_rate": 1.3536707638882872e-05, + "logits/chosen": -1.6849908828735352, + "logits/rejected": -1.0281345844268799, + "logps/chosen": -419.80010986328125, + "logps/rejected": -460.801025390625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.415833950042725, + "rewards/margins": 20.73134422302246, + "rewards/rejected": -26.147180557250977, + "step": 4180 + }, + { + "epoch": 13.013168086754455, + "grad_norm": 0.0002039131213678047, + "learning_rate": 1.289960942557844e-05, + "logits/chosen": -1.686678171157837, + "logits/rejected": -1.041481852531433, + "logps/chosen": -418.22686767578125, + "logps/rejected": -488.3094787597656, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.922072410583496, + "rewards/margins": 21.746536254882812, + "rewards/rejected": -27.66861343383789, + "step": 4200 + }, + { + "epoch": 13.075135553834237, + "grad_norm": 0.00016347317432519048, + "learning_rate": 1.2276832488730094e-05, + "logits/chosen": -1.7182451486587524, + "logits/rejected": -1.0532605648040771, + "logps/chosen": -441.8271484375, + "logps/rejected": -510.87628173828125, + "loss": 0.0043, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.668587684631348, + "rewards/margins": 22.97989273071289, + "rewards/rejected": -28.648479461669922, + "step": 4220 + }, + { + "epoch": 13.13710302091402, + "grad_norm": 0.00020034710178151727, + "learning_rate": 1.1668479226967965e-05, + "logits/chosen": -1.6925156116485596, + "logits/rejected": -1.0687302350997925, + "logps/chosen": -399.3315124511719, + "logps/rejected": -474.7539978027344, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.459714889526367, + "rewards/margins": 21.628223419189453, + "rewards/rejected": -27.087936401367188, + "step": 4240 + }, + { + "epoch": 13.199070487993803, + "grad_norm": 0.00026680485461838543, + "learning_rate": 1.1074649667343506e-05, + "logits/chosen": -1.6791460514068604, + "logits/rejected": -1.0547727346420288, + "logps/chosen": -412.1854553222656, + "logps/rejected": -474.461181640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.000552654266357, + "rewards/margins": 21.563953399658203, + "rewards/rejected": -26.564502716064453, + "step": 4260 + }, + { + "epoch": 13.261037955073586, + "grad_norm": 9.416981629328802e-05, + "learning_rate": 1.0495441448882571e-05, + "logits/chosen": -1.6752477884292603, + "logits/rejected": -1.0648829936981201, + "logps/chosen": -413.24609375, + "logps/rejected": -496.79327392578125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.392674922943115, + "rewards/margins": 22.125301361083984, + "rewards/rejected": -27.51797866821289, + "step": 4280 + }, + { + "epoch": 13.32300542215337, + "grad_norm": 0.00027022938593290746, + "learning_rate": 9.930949806531509e-06, + "logits/chosen": -1.6898155212402344, + "logits/rejected": -1.0595139265060425, + "logps/chosen": -410.2594299316406, + "logps/rejected": -469.86883544921875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0617547035217285, + "rewards/margins": 21.85466194152832, + "rewards/rejected": -26.916418075561523, + "step": 4300 + }, + { + "epoch": 13.384972889233152, + "grad_norm": 5.3291834774427116e-05, + "learning_rate": 9.38126755549832e-06, + "logits/chosen": -1.6853482723236084, + "logits/rejected": -1.0476603507995605, + "logps/chosen": -411.350830078125, + "logps/rejected": -470.8251037597656, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.509891510009766, + "rewards/margins": 21.421506881713867, + "rewards/rejected": -26.931400299072266, + "step": 4320 + }, + { + "epoch": 13.446940356312936, + "grad_norm": 8.903396519599482e-05, + "learning_rate": 8.846485075991728e-06, + "logits/chosen": -1.6736446619033813, + "logits/rejected": -1.0330798625946045, + "logps/chosen": -417.89044189453125, + "logps/rejected": -477.1280822753906, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.157768726348877, + "rewards/margins": 21.815839767456055, + "rewards/rejected": -26.973608016967773, + "step": 4340 + }, + { + "epoch": 13.508907823392718, + "grad_norm": 0.0006522313342429698, + "learning_rate": 8.326690298360639e-06, + "logits/chosen": -1.679149866104126, + "logits/rejected": -1.0622096061706543, + "logps/chosen": -403.9975891113281, + "logps/rejected": -478.72174072265625, + "loss": 0.0043, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.251322269439697, + "rewards/margins": 21.613903045654297, + "rewards/rejected": -26.8652286529541, + "step": 4360 + }, + { + "epoch": 13.570875290472502, + "grad_norm": 0.0001527480490040034, + "learning_rate": 7.821968688636383e-06, + "logits/chosen": -1.7000373601913452, + "logits/rejected": -1.0500789880752563, + "logps/chosen": -400.9742431640625, + "logps/rejected": -477.05450439453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.700057506561279, + "rewards/margins": 21.45535659790039, + "rewards/rejected": -27.155414581298828, + "step": 4380 + }, + { + "epoch": 13.632842757552286, + "grad_norm": 0.0005368488491512835, + "learning_rate": 7.332403234480223e-06, + "logits/chosen": -1.683445692062378, + "logits/rejected": -1.0166078805923462, + "logps/chosen": -401.72607421875, + "logps/rejected": -456.4202575683594, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.469435691833496, + "rewards/margins": 21.11139488220215, + "rewards/rejected": -26.580829620361328, + "step": 4400 + }, + { + "epoch": 13.694810224632068, + "grad_norm": 0.0005580181023105979, + "learning_rate": 6.858074431538164e-06, + "logits/chosen": -1.6824891567230225, + "logits/rejected": -1.0271477699279785, + "logps/chosen": -399.6391296386719, + "logps/rejected": -451.330078125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.243688583374023, + "rewards/margins": 21.208574295043945, + "rewards/rejected": -26.452260971069336, + "step": 4420 + }, + { + "epoch": 13.756777691711852, + "grad_norm": NaN, + "learning_rate": 6.421646080196197e-06, + "logits/chosen": -1.6686054468154907, + "logits/rejected": -1.0693179368972778, + "logps/chosen": -401.59844970703125, + "logps/rejected": -474.7311096191406, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.454672336578369, + "rewards/margins": 21.39242172241211, + "rewards/rejected": -26.847095489501953, + "step": 4440 + }, + { + "epoch": 13.818745158791634, + "grad_norm": 1.7149226550827734e-05, + "learning_rate": 5.9772507736462145e-06, + "logits/chosen": -1.710008978843689, + "logits/rejected": -1.0888980627059937, + "logps/chosen": -407.61260986328125, + "logps/rejected": -481.07550048828125, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.6366987228393555, + "rewards/margins": 21.678539276123047, + "rewards/rejected": -27.315237045288086, + "step": 4460 + }, + { + "epoch": 13.880712625871418, + "grad_norm": 2.4136075808200985e-05, + "learning_rate": 5.54831493606015e-06, + "logits/chosen": -1.6713101863861084, + "logits/rejected": -1.0732184648513794, + "logps/chosen": -424.976806640625, + "logps/rejected": -506.0423889160156, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.765892028808594, + "rewards/margins": 22.11074447631836, + "rewards/rejected": -27.876636505126953, + "step": 4480 + }, + { + "epoch": 13.9426800929512, + "grad_norm": 7.025560626061633e-05, + "learning_rate": 5.134909094202267e-06, + "logits/chosen": -1.699441909790039, + "logits/rejected": -1.0467607975006104, + "logps/chosen": -401.03375244140625, + "logps/rejected": -447.85308837890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.419959545135498, + "rewards/margins": 20.893884658813477, + "rewards/rejected": -26.313846588134766, + "step": 4500 + }, + { + "epoch": 14.004647560030984, + "grad_norm": 0.0002559265703894198, + "learning_rate": 4.7371012213538235e-06, + "logits/chosen": -1.6893657445907593, + "logits/rejected": -1.0456167459487915, + "logps/chosen": -425.73895263671875, + "logps/rejected": -486.43890380859375, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.418589115142822, + "rewards/margins": 22.638408660888672, + "rewards/rejected": -28.0570011138916, + "step": 4520 + }, + { + "epoch": 14.066615027110767, + "grad_norm": 0.00043519827886484563, + "learning_rate": 4.35495672613685e-06, + "logits/chosen": -1.6840267181396484, + "logits/rejected": -1.0660759210586548, + "logps/chosen": -420.65692138671875, + "logps/rejected": -481.805419921875, + "loss": 0.0065, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -5.826098442077637, + "rewards/margins": 21.706336975097656, + "rewards/rejected": -27.53243637084961, + "step": 4540 + }, + { + "epoch": 14.12858249419055, + "grad_norm": 0.0004038415208924562, + "learning_rate": 3.988538441759382e-06, + "logits/chosen": -1.673048973083496, + "logits/rejected": -1.0200636386871338, + "logps/chosen": -403.9557189941406, + "logps/rejected": -461.65179443359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.021474361419678, + "rewards/margins": 21.59840965270996, + "rewards/rejected": -26.619884490966797, + "step": 4560 + }, + { + "epoch": 14.190549961270333, + "grad_norm": 0.00038054597098380327, + "learning_rate": 3.637906615684328e-06, + "logits/chosen": -1.6679537296295166, + "logits/rejected": -1.0269415378570557, + "logps/chosen": -410.174072265625, + "logps/rejected": -484.68865966796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.3633928298950195, + "rewards/margins": 22.351978302001953, + "rewards/rejected": -27.715368270874023, + "step": 4580 + }, + { + "epoch": 14.252517428350115, + "grad_norm": 5.562596925301477e-05, + "learning_rate": 3.3031188997233676e-06, + "logits/chosen": -1.6873247623443604, + "logits/rejected": -1.0105091333389282, + "logps/chosen": -405.04132080078125, + "logps/rejected": -454.36920166015625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.410122871398926, + "rewards/margins": 21.17348289489746, + "rewards/rejected": -26.583606719970703, + "step": 4600 + }, + { + "epoch": 14.3144848954299, + "grad_norm": 4.7735171392560005e-05, + "learning_rate": 2.9842303405577366e-06, + "logits/chosen": -1.6932716369628906, + "logits/rejected": -1.026926040649414, + "logps/chosen": -416.610595703125, + "logps/rejected": -469.50335693359375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.13016414642334, + "rewards/margins": 20.862241744995117, + "rewards/rejected": -26.99240493774414, + "step": 4620 + }, + { + "epoch": 14.376452362509683, + "grad_norm": 0.00047004391672089696, + "learning_rate": 2.6812933706872545e-06, + "logits/chosen": -1.6934292316436768, + "logits/rejected": -1.063394546508789, + "logps/chosen": -415.4750061035156, + "logps/rejected": -489.5491638183594, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.2422404289245605, + "rewards/margins": 22.516773223876953, + "rewards/rejected": -27.759014129638672, + "step": 4640 + }, + { + "epoch": 14.438419829589465, + "grad_norm": 0.0008643981418572366, + "learning_rate": 2.394357799809277e-06, + "logits/chosen": -1.735192894935608, + "logits/rejected": -1.069784164428711, + "logps/chosen": -409.0735168457031, + "logps/rejected": -455.7366638183594, + "loss": 0.0022, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.419035911560059, + "rewards/margins": 21.468860626220703, + "rewards/rejected": -26.887897491455078, + "step": 4660 + }, + { + "epoch": 14.500387296669249, + "grad_norm": 0.0002557814004831016, + "learning_rate": 2.123470806628858e-06, + "logits/chosen": -1.6932361125946045, + "logits/rejected": -1.03562331199646, + "logps/chosen": -404.10223388671875, + "logps/rejected": -452.8517150878906, + "loss": 0.0043, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.449051856994629, + "rewards/margins": 21.111392974853516, + "rewards/rejected": -26.560443878173828, + "step": 4680 + }, + { + "epoch": 14.562354763749031, + "grad_norm": 0.00017765916709322482, + "learning_rate": 1.868676931101465e-06, + "logits/chosen": -1.6715888977050781, + "logits/rejected": -1.057328462600708, + "logps/chosen": -411.4977111816406, + "logps/rejected": -486.6917419433594, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.450153827667236, + "rewards/margins": 22.20999526977539, + "rewards/rejected": -27.6601505279541, + "step": 4700 + }, + { + "epoch": 14.624322230828815, + "grad_norm": 0.0006002355949021876, + "learning_rate": 1.6300180671096288e-06, + "logits/chosen": -1.6742595434188843, + "logits/rejected": -1.0468966960906982, + "logps/chosen": -414.0707092285156, + "logps/rejected": -482.42657470703125, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.493812561035156, + "rewards/margins": 21.657306671142578, + "rewards/rejected": -27.151119232177734, + "step": 4720 + }, + { + "epoch": 14.686289697908599, + "grad_norm": 0.00020658239373005927, + "learning_rate": 1.4075334555746055e-06, + "logits/chosen": -1.662987470626831, + "logits/rejected": -1.016445279121399, + "logps/chosen": -407.02423095703125, + "logps/rejected": -467.1194763183594, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.999020576477051, + "rewards/margins": 20.836938858032227, + "rewards/rejected": -26.83595848083496, + "step": 4740 + }, + { + "epoch": 14.748257164988381, + "grad_norm": 6.777382805012167e-05, + "learning_rate": 1.2012596780043627e-06, + "logits/chosen": -1.6404949426651, + "logits/rejected": -1.0619919300079346, + "logps/chosen": -394.98443603515625, + "logps/rejected": -479.7742614746094, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.746143341064453, + "rewards/margins": 21.60362434387207, + "rewards/rejected": -27.349767684936523, + "step": 4760 + }, + { + "epoch": 14.810224632068165, + "grad_norm": 0.00017278394079767168, + "learning_rate": 1.011230650478634e-06, + "logits/chosen": -1.6573286056518555, + "logits/rejected": -1.0122966766357422, + "logps/chosen": -396.2731018066406, + "logps/rejected": -456.626220703125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.127909183502197, + "rewards/margins": 21.664600372314453, + "rewards/rejected": -26.79250717163086, + "step": 4780 + }, + { + "epoch": 14.872192099147947, + "grad_norm": 0.00017635834228713065, + "learning_rate": 8.374776180724575e-07, + "logits/chosen": -1.7095073461532593, + "logits/rejected": -1.0201966762542725, + "logps/chosen": -402.76763916015625, + "logps/rejected": -461.19903564453125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.404868125915527, + "rewards/margins": 21.330501556396484, + "rewards/rejected": -26.735370635986328, + "step": 4800 + }, + { + "epoch": 14.93415956622773, + "grad_norm": 0.0006217029877007008, + "learning_rate": 6.800291497187083e-07, + "logits/chosen": -1.7389657497406006, + "logits/rejected": -1.0253870487213135, + "logps/chosen": -406.7480163574219, + "logps/rejected": -461.8179626464844, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.920414447784424, + "rewards/margins": 21.916866302490234, + "rewards/rejected": -26.8372802734375, + "step": 4820 + }, + { + "epoch": 14.996127033307513, + "grad_norm": 0.0001935044419951737, + "learning_rate": 5.389111335107556e-07, + "logits/chosen": -1.696392297744751, + "logits/rejected": -1.0922819375991821, + "logps/chosen": -414.5367736816406, + "logps/rejected": -476.94012451171875, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.998685359954834, + "rewards/margins": 21.558393478393555, + "rewards/rejected": -27.557079315185547, + "step": 4840 + }, + { + "epoch": 15.058094500387297, + "grad_norm": 4.989042645320296e-05, + "learning_rate": 4.1414677244584477e-07, + "logits/chosen": -1.690422773361206, + "logits/rejected": -1.0694575309753418, + "logps/chosen": -417.68487548828125, + "logps/rejected": -490.20989990234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.380603313446045, + "rewards/margins": 21.939071655273438, + "rewards/rejected": -27.31967544555664, + "step": 4860 + }, + { + "epoch": 15.12006196746708, + "grad_norm": 0.0008857127977535129, + "learning_rate": 3.0575658061001713e-07, + "logits/chosen": -1.692728042602539, + "logits/rejected": -1.0653448104858398, + "logps/chosen": -414.1552734375, + "logps/rejected": -490.3134765625, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -6.1294355392456055, + "rewards/margins": 21.649871826171875, + "rewards/rejected": -27.779308319091797, + "step": 4880 + }, + { + "epoch": 15.182029434546862, + "grad_norm": 7.71297054598108e-05, + "learning_rate": 2.1375837980512904e-07, + "logits/chosen": -1.687190294265747, + "logits/rejected": -1.0721074342727661, + "logps/chosen": -410.22161865234375, + "logps/rejected": -491.24835205078125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.182304859161377, + "rewards/margins": 22.23093032836914, + "rewards/rejected": -27.41323471069336, + "step": 4900 + }, + { + "epoch": 15.243996901626646, + "grad_norm": 0.00017248830408789217, + "learning_rate": 1.38167296618541e-07, + "logits/chosen": -1.682885766029358, + "logits/rejected": -1.0524094104766846, + "logps/chosen": -410.17681884765625, + "logps/rejected": -472.13885498046875, + "loss": 0.0011, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.831109046936035, + "rewards/margins": 21.399702072143555, + "rewards/rejected": -27.230810165405273, + "step": 4920 + }, + { + "epoch": 15.305964368706428, + "grad_norm": 0.0008164289756678045, + "learning_rate": 7.899575993597363e-08, + "logits/chosen": -1.6627308130264282, + "logits/rejected": -0.9520984888076782, + "logps/chosen": -395.6473693847656, + "logps/rejected": -433.9269104003906, + "loss": 0.0043, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -5.2272748947143555, + "rewards/margins": 20.858642578125, + "rewards/rejected": -26.08591651916504, + "step": 4940 + }, + { + "epoch": 15.367931835786212, + "grad_norm": 0.00019182954565621912, + "learning_rate": 3.6253498897886873e-08, + "logits/chosen": -1.6554197072982788, + "logits/rejected": -1.0059171915054321, + "logps/chosen": -394.91973876953125, + "logps/rejected": -451.76312255859375, + "loss": 0.0022, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -5.4583234786987305, + "rewards/margins": 21.091644287109375, + "rewards/rejected": -26.549968719482422, + "step": 4960 + }, + { + "epoch": 15.429899302865996, + "grad_norm": 0.00014239229494705796, + "learning_rate": 9.947541299837327e-09, + "logits/chosen": -1.7060569524765015, + "logits/rejected": -1.0418967008590698, + "logps/chosen": -427.88525390625, + "logps/rejected": -482.782958984375, + "loss": 0.0043, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -5.570733070373535, + "rewards/margins": 21.934314727783203, + "rewards/rejected": -27.505046844482422, + "step": 4980 + }, + { + "epoch": 15.491866769945778, + "grad_norm": 0.0005336150643415749, + "learning_rate": 8.221243689154889e-11, + "logits/chosen": -1.6255543231964111, + "logits/rejected": -1.027090311050415, + "logps/chosen": -393.7467956542969, + "logps/rejected": -484.93804931640625, + "loss": 0.0033, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -5.538996696472168, + "rewards/margins": 21.719022750854492, + "rewards/rejected": -27.25801658630371, + "step": 5000 + } + ], + "logging_steps": 20, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 16, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}