diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6672 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997172745264349, + "eval_steps": 500, + "global_step": 442, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0022618037885213456, + "grad_norm": 131.355215075171, + "learning_rate": 1.7777777777777777e-08, + "logits/chosen": -11.149957656860352, + "logits/rejected": -11.106039047241211, + "logps/chosen": -0.4639046788215637, + "logps/rejected": -0.459951788187027, + "loss": 4.9394, + "rewards/accuracies": 0.375, + "rewards/chosen": -4.639046669006348, + "rewards/margins": -0.039528995752334595, + "rewards/rejected": -4.599517822265625, + "step": 1 + }, + { + "epoch": 0.004523607577042691, + "grad_norm": 68.75410369813167, + "learning_rate": 3.5555555555555554e-08, + "logits/chosen": -10.890952110290527, + "logits/rejected": -10.69871711730957, + "logps/chosen": -0.5820316672325134, + "logps/rejected": -0.5644893646240234, + "loss": 5.3979, + "rewards/accuracies": 0.3125, + "rewards/chosen": -5.820316314697266, + "rewards/margins": -0.17542320489883423, + "rewards/rejected": -5.644893646240234, + "step": 2 + }, + { + "epoch": 0.006785411365564037, + "grad_norm": 98.10181248017503, + "learning_rate": 5.333333333333333e-08, + "logits/chosen": -10.386991500854492, + "logits/rejected": -10.3389892578125, + "logps/chosen": -0.7467580437660217, + "logps/rejected": -0.7350905537605286, + "loss": 5.3645, + "rewards/accuracies": 0.59375, + "rewards/chosen": -7.467580318450928, + "rewards/margins": -0.11667439341545105, + "rewards/rejected": -7.350905418395996, + "step": 3 + }, + { + "epoch": 0.009047215154085382, + "grad_norm": 85.65000023027866, + "learning_rate": 7.111111111111111e-08, + "logits/chosen": -10.73530387878418, + "logits/rejected": -10.564247131347656, + "logps/chosen": -0.5459556579589844, + "logps/rejected": -0.4875364899635315, + "loss": 5.1509, + "rewards/accuracies": 0.46875, + "rewards/chosen": -5.459556579589844, + "rewards/margins": -0.584191620349884, + "rewards/rejected": -4.875364780426025, + "step": 4 + }, + { + "epoch": 0.01130901894260673, + "grad_norm": 74.9791226991399, + "learning_rate": 8.888888888888888e-08, + "logits/chosen": -10.528997421264648, + "logits/rejected": -9.97251033782959, + "logps/chosen": -0.6336177587509155, + "logps/rejected": -0.6410748958587646, + "loss": 5.2521, + "rewards/accuracies": 0.5, + "rewards/chosen": -6.336178302764893, + "rewards/margins": 0.07457125931978226, + "rewards/rejected": -6.410749435424805, + "step": 5 + }, + { + "epoch": 0.013570822731128074, + "grad_norm": 85.29375010182012, + "learning_rate": 1.0666666666666666e-07, + "logits/chosen": -10.442211151123047, + "logits/rejected": -10.494767189025879, + "logps/chosen": -0.5163740515708923, + "logps/rejected": -0.5633006691932678, + "loss": 5.3068, + "rewards/accuracies": 0.53125, + "rewards/chosen": -5.163741111755371, + "rewards/margins": 0.46926558017730713, + "rewards/rejected": -5.6330060958862305, + "step": 6 + }, + { + "epoch": 0.01583262651964942, + "grad_norm": 72.34857602748069, + "learning_rate": 1.2444444444444443e-07, + "logits/chosen": -10.570417404174805, + "logits/rejected": -10.182153701782227, + "logps/chosen": -0.49319958686828613, + "logps/rejected": -0.5546016693115234, + "loss": 4.7171, + "rewards/accuracies": 0.53125, + "rewards/chosen": -4.931995868682861, + 
"rewards/margins": 0.6140204668045044, + "rewards/rejected": -5.546016693115234, + "step": 7 + }, + { + "epoch": 0.018094430308170765, + "grad_norm": 96.60351151716078, + "learning_rate": 1.4222222222222222e-07, + "logits/chosen": -11.686549186706543, + "logits/rejected": -11.599323272705078, + "logps/chosen": -0.654168963432312, + "logps/rejected": -0.7179521322250366, + "loss": 5.2239, + "rewards/accuracies": 0.53125, + "rewards/chosen": -6.541690349578857, + "rewards/margins": 0.6378321051597595, + "rewards/rejected": -7.179522514343262, + "step": 8 + }, + { + "epoch": 0.020356234096692113, + "grad_norm": 69.08528348151097, + "learning_rate": 1.6e-07, + "logits/chosen": -10.81006908416748, + "logits/rejected": -10.815924644470215, + "logps/chosen": -0.6012924313545227, + "logps/rejected": -0.6476706266403198, + "loss": 5.1255, + "rewards/accuracies": 0.53125, + "rewards/chosen": -6.0129241943359375, + "rewards/margins": 0.46378093957901, + "rewards/rejected": -6.476705074310303, + "step": 9 + }, + { + "epoch": 0.02261803788521346, + "grad_norm": 98.9910788377654, + "learning_rate": 1.7777777777777776e-07, + "logits/chosen": -11.031579971313477, + "logits/rejected": -10.416993141174316, + "logps/chosen": -0.534875750541687, + "logps/rejected": -0.5605251789093018, + "loss": 5.0563, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.348757743835449, + "rewards/margins": 0.25649386644363403, + "rewards/rejected": -5.605251789093018, + "step": 10 + }, + { + "epoch": 0.024879841673734804, + "grad_norm": 92.47375199683604, + "learning_rate": 1.9555555555555555e-07, + "logits/chosen": -11.211597442626953, + "logits/rejected": -10.974644660949707, + "logps/chosen": -0.5139535665512085, + "logps/rejected": -0.5967007279396057, + "loss": 4.9212, + "rewards/accuracies": 0.4375, + "rewards/chosen": -5.139535903930664, + "rewards/margins": 0.8274715542793274, + "rewards/rejected": -5.967007637023926, + "step": 11 + }, + { + "epoch": 0.02714164546225615, + "grad_norm": 109.2539243326917, + "learning_rate": 2.133333333333333e-07, + "logits/chosen": -10.3906831741333, + "logits/rejected": -10.407954216003418, + "logps/chosen": -0.5846738815307617, + "logps/rejected": -0.5586296319961548, + "loss": 5.2254, + "rewards/accuracies": 0.5625, + "rewards/chosen": -5.846738338470459, + "rewards/margins": -0.26044273376464844, + "rewards/rejected": -5.586296081542969, + "step": 12 + }, + { + "epoch": 0.029403449250777494, + "grad_norm": 93.26949602496981, + "learning_rate": 2.3111111111111107e-07, + "logits/chosen": -11.434279441833496, + "logits/rejected": -11.00756549835205, + "logps/chosen": -0.57530277967453, + "logps/rejected": -0.5521742105484009, + "loss": 5.3681, + "rewards/accuracies": 0.65625, + "rewards/chosen": -5.753026962280273, + "rewards/margins": -0.23128610849380493, + "rewards/rejected": -5.52174186706543, + "step": 13 + }, + { + "epoch": 0.03166525303929884, + "grad_norm": 54.76880243693634, + "learning_rate": 2.4888888888888886e-07, + "logits/chosen": -11.06928825378418, + "logits/rejected": -10.667929649353027, + "logps/chosen": -0.49921348690986633, + "logps/rejected": -0.5616152882575989, + "loss": 4.7488, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.992135047912598, + "rewards/margins": 0.6240180134773254, + "rewards/rejected": -5.616153240203857, + "step": 14 + }, + { + "epoch": 0.033927056827820185, + "grad_norm": 71.5182395693498, + "learning_rate": 2.666666666666666e-07, + "logits/chosen": -11.895730972290039, + "logits/rejected": -11.64004135131836, + "logps/chosen": 
-0.49031415581703186, + "logps/rejected": -0.5405735373497009, + "loss": 4.9109, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.903141498565674, + "rewards/margins": 0.5025936961174011, + "rewards/rejected": -5.405735015869141, + "step": 15 + }, + { + "epoch": 0.03618886061634153, + "grad_norm": 79.7034591294439, + "learning_rate": 2.8444444444444443e-07, + "logits/chosen": -10.60659122467041, + "logits/rejected": -10.282760620117188, + "logps/chosen": -0.6062531471252441, + "logps/rejected": -0.5618928670883179, + "loss": 5.1619, + "rewards/accuracies": 0.5, + "rewards/chosen": -6.062531471252441, + "rewards/margins": -0.44360262155532837, + "rewards/rejected": -5.618928909301758, + "step": 16 + }, + { + "epoch": 0.038450664404862875, + "grad_norm": 87.84593204498888, + "learning_rate": 3.022222222222222e-07, + "logits/chosen": -12.490971565246582, + "logits/rejected": -12.19153881072998, + "logps/chosen": -0.41767269372940063, + "logps/rejected": -0.40474578738212585, + "loss": 5.3782, + "rewards/accuracies": 0.4375, + "rewards/chosen": -4.176727294921875, + "rewards/margins": -0.12926939129829407, + "rewards/rejected": -4.047458171844482, + "step": 17 + }, + { + "epoch": 0.04071246819338423, + "grad_norm": 110.95316335628588, + "learning_rate": 3.2e-07, + "logits/chosen": -11.399320602416992, + "logits/rejected": -11.420541763305664, + "logps/chosen": -0.6453328728675842, + "logps/rejected": -0.6450071334838867, + "loss": 5.0468, + "rewards/accuracies": 0.3125, + "rewards/chosen": -6.453329086303711, + "rewards/margins": -0.0032582059502601624, + "rewards/rejected": -6.450070381164551, + "step": 18 + }, + { + "epoch": 0.04297427198190557, + "grad_norm": 86.46973948808649, + "learning_rate": 3.3777777777777777e-07, + "logits/chosen": -12.149145126342773, + "logits/rejected": -12.085639953613281, + "logps/chosen": -0.49516329169273376, + "logps/rejected": -0.41885480284690857, + "loss": 5.052, + "rewards/accuracies": 0.34375, + "rewards/chosen": -4.951632499694824, + "rewards/margins": -0.7630849480628967, + "rewards/rejected": -4.188547611236572, + "step": 19 + }, + { + "epoch": 0.04523607577042692, + "grad_norm": 71.36928249846149, + "learning_rate": 3.5555555555555553e-07, + "logits/chosen": -10.735100746154785, + "logits/rejected": -10.853598594665527, + "logps/chosen": -0.5569137930870056, + "logps/rejected": -0.5804443955421448, + "loss": 4.6835, + "rewards/accuracies": 0.53125, + "rewards/chosen": -5.5691375732421875, + "rewards/margins": 0.2353065013885498, + "rewards/rejected": -5.804443836212158, + "step": 20 + }, + { + "epoch": 0.04749787955894826, + "grad_norm": 61.042781292962395, + "learning_rate": 3.7333333333333334e-07, + "logits/chosen": -11.995122909545898, + "logits/rejected": -11.363445281982422, + "logps/chosen": -0.4467337131500244, + "logps/rejected": -0.4861924648284912, + "loss": 5.0424, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.467337131500244, + "rewards/margins": 0.3945879340171814, + "rewards/rejected": -4.86192512512207, + "step": 21 + }, + { + "epoch": 0.04975968334746961, + "grad_norm": 89.07868369848414, + "learning_rate": 3.911111111111111e-07, + "logits/chosen": -11.574544906616211, + "logits/rejected": -11.458039283752441, + "logps/chosen": -0.5010548233985901, + "logps/rejected": -0.5385463833808899, + "loss": 5.0061, + "rewards/accuracies": 0.40625, + "rewards/chosen": -5.010547637939453, + "rewards/margins": 0.37491610646247864, + "rewards/rejected": -5.385463714599609, + "step": 22 + }, + { + "epoch": 0.05202148713599095, + 
"grad_norm": 169.3436229453717, + "learning_rate": 4.0888888888888886e-07, + "logits/chosen": -10.919013977050781, + "logits/rejected": -10.827438354492188, + "logps/chosen": -0.5667375326156616, + "logps/rejected": -0.5423346161842346, + "loss": 4.978, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.667375564575195, + "rewards/margins": -0.2440294474363327, + "rewards/rejected": -5.423345565795898, + "step": 23 + }, + { + "epoch": 0.0542832909245123, + "grad_norm": 58.12255640669961, + "learning_rate": 4.266666666666666e-07, + "logits/chosen": -12.290349960327148, + "logits/rejected": -12.1292142868042, + "logps/chosen": -0.3345947861671448, + "logps/rejected": -0.37946847081184387, + "loss": 4.7173, + "rewards/accuracies": 0.53125, + "rewards/chosen": -3.3459479808807373, + "rewards/margins": 0.44873636960983276, + "rewards/rejected": -3.794684410095215, + "step": 24 + }, + { + "epoch": 0.05654509471303364, + "grad_norm": 93.15623268640158, + "learning_rate": 4.4444444444444444e-07, + "logits/chosen": -11.1387939453125, + "logits/rejected": -10.918662071228027, + "logps/chosen": -0.48852428793907166, + "logps/rejected": -0.5354989767074585, + "loss": 4.8657, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.8852434158325195, + "rewards/margins": 0.46974682807922363, + "rewards/rejected": -5.354990005493164, + "step": 25 + }, + { + "epoch": 0.05880689850155499, + "grad_norm": 64.22760084086403, + "learning_rate": 4.6222222222222214e-07, + "logits/chosen": -11.181513786315918, + "logits/rejected": -10.725030899047852, + "logps/chosen": -0.43731987476348877, + "logps/rejected": -0.48887380957603455, + "loss": 4.7432, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.373198986053467, + "rewards/margins": 0.5155391097068787, + "rewards/rejected": -4.88873815536499, + "step": 26 + }, + { + "epoch": 0.061068702290076333, + "grad_norm": 86.04795850026115, + "learning_rate": 4.8e-07, + "logits/chosen": -10.508721351623535, + "logits/rejected": -10.471704483032227, + "logps/chosen": -0.4588507413864136, + "logps/rejected": -0.5172092914581299, + "loss": 4.8766, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.588507175445557, + "rewards/margins": 0.5835859775543213, + "rewards/rejected": -5.172093391418457, + "step": 27 + }, + { + "epoch": 0.06333050607859768, + "grad_norm": 74.38153566770265, + "learning_rate": 4.977777777777777e-07, + "logits/chosen": -10.403984069824219, + "logits/rejected": -10.611076354980469, + "logps/chosen": -0.47997862100601196, + "logps/rejected": -0.4639643728733063, + "loss": 4.9471, + "rewards/accuracies": 0.40625, + "rewards/chosen": -4.79978609085083, + "rewards/margins": -0.16014233231544495, + "rewards/rejected": -4.639643669128418, + "step": 28 + }, + { + "epoch": 0.06559230986711903, + "grad_norm": 71.67345566558542, + "learning_rate": 5.155555555555556e-07, + "logits/chosen": -11.623980522155762, + "logits/rejected": -11.187301635742188, + "logps/chosen": -0.47259610891342163, + "logps/rejected": -0.4480085074901581, + "loss": 4.9723, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.725960731506348, + "rewards/margins": -0.24587592482566833, + "rewards/rejected": -4.480085372924805, + "step": 29 + }, + { + "epoch": 0.06785411365564037, + "grad_norm": 67.06827734638706, + "learning_rate": 5.333333333333332e-07, + "logits/chosen": -11.207446098327637, + "logits/rejected": -10.807114601135254, + "logps/chosen": -0.37271052598953247, + "logps/rejected": -0.4592744708061218, + "loss": 4.7976, + "rewards/accuracies": 0.71875, + 
"rewards/chosen": -3.7271053791046143, + "rewards/margins": 0.8656396865844727, + "rewards/rejected": -4.592744827270508, + "step": 30 + }, + { + "epoch": 0.07011591744416172, + "grad_norm": 72.20873132579756, + "learning_rate": 5.511111111111111e-07, + "logits/chosen": -10.57562255859375, + "logits/rejected": -10.428007125854492, + "logps/chosen": -0.43888184428215027, + "logps/rejected": -0.464484840631485, + "loss": 4.8315, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.388818264007568, + "rewards/margins": 0.25603026151657104, + "rewards/rejected": -4.644848823547363, + "step": 31 + }, + { + "epoch": 0.07237772123268306, + "grad_norm": 65.66194705402224, + "learning_rate": 5.688888888888889e-07, + "logits/chosen": -10.86208724975586, + "logits/rejected": -10.499285697937012, + "logps/chosen": -0.4337802529335022, + "logps/rejected": -0.4744107127189636, + "loss": 5.0868, + "rewards/accuracies": 0.40625, + "rewards/chosen": -4.337802886962891, + "rewards/margins": 0.4063045084476471, + "rewards/rejected": -4.744107246398926, + "step": 32 + }, + { + "epoch": 0.07463952502120441, + "grad_norm": 69.10527972763198, + "learning_rate": 5.866666666666666e-07, + "logits/chosen": -10.774530410766602, + "logits/rejected": -10.699942588806152, + "logps/chosen": -0.4071800112724304, + "logps/rejected": -0.4233216643333435, + "loss": 4.9605, + "rewards/accuracies": 0.53125, + "rewards/chosen": -4.0717997550964355, + "rewards/margins": 0.1614171266555786, + "rewards/rejected": -4.233217239379883, + "step": 33 + }, + { + "epoch": 0.07690132880972575, + "grad_norm": 64.69922555278133, + "learning_rate": 6.044444444444444e-07, + "logits/chosen": -10.982305526733398, + "logits/rejected": -10.901609420776367, + "logps/chosen": -0.37619659304618835, + "logps/rejected": -0.4010980725288391, + "loss": 4.7433, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.7619662284851074, + "rewards/margins": 0.2490149885416031, + "rewards/rejected": -4.01098108291626, + "step": 34 + }, + { + "epoch": 0.0791631325982471, + "grad_norm": 64.60258006742473, + "learning_rate": 6.222222222222223e-07, + "logits/chosen": -10.168670654296875, + "logits/rejected": -10.236058235168457, + "logps/chosen": -0.46076497435569763, + "logps/rejected": -0.4832019805908203, + "loss": 4.6019, + "rewards/accuracies": 0.46875, + "rewards/chosen": -4.607649803161621, + "rewards/margins": 0.22437021136283875, + "rewards/rejected": -4.832019805908203, + "step": 35 + }, + { + "epoch": 0.08142493638676845, + "grad_norm": 66.16525403071286, + "learning_rate": 6.4e-07, + "logits/chosen": -10.984823226928711, + "logits/rejected": -10.869633674621582, + "logps/chosen": -0.4266025125980377, + "logps/rejected": -0.44316184520721436, + "loss": 4.6435, + "rewards/accuracies": 0.53125, + "rewards/chosen": -4.266025066375732, + "rewards/margins": 0.16559378802776337, + "rewards/rejected": -4.431619167327881, + "step": 36 + }, + { + "epoch": 0.08368674017528979, + "grad_norm": 62.62054800184612, + "learning_rate": 6.577777777777777e-07, + "logits/chosen": -11.466747283935547, + "logits/rejected": -10.947083473205566, + "logps/chosen": -0.4133344292640686, + "logps/rejected": -0.46087807416915894, + "loss": 4.6937, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.1333441734313965, + "rewards/margins": 0.47543561458587646, + "rewards/rejected": -4.608780384063721, + "step": 37 + }, + { + "epoch": 0.08594854396381114, + "grad_norm": 68.71159791998826, + "learning_rate": 6.755555555555555e-07, + "logits/chosen": -10.58063793182373, + 
"logits/rejected": -10.66105842590332, + "logps/chosen": -0.4248150587081909, + "logps/rejected": -0.4461151957511902, + "loss": 4.6967, + "rewards/accuracies": 0.34375, + "rewards/chosen": -4.248150825500488, + "rewards/margins": 0.2130012959241867, + "rewards/rejected": -4.461152076721191, + "step": 38 + }, + { + "epoch": 0.08821034775233248, + "grad_norm": 70.5941913597691, + "learning_rate": 6.933333333333333e-07, + "logits/chosen": -11.376758575439453, + "logits/rejected": -11.398736953735352, + "logps/chosen": -0.47147077322006226, + "logps/rejected": -0.4626140296459198, + "loss": 4.6912, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.714707374572754, + "rewards/margins": -0.08856695890426636, + "rewards/rejected": -4.626140594482422, + "step": 39 + }, + { + "epoch": 0.09047215154085383, + "grad_norm": 68.46057335163108, + "learning_rate": 7.111111111111111e-07, + "logits/chosen": -11.606950759887695, + "logits/rejected": -11.105400085449219, + "logps/chosen": -0.39962151646614075, + "logps/rejected": -0.4578825831413269, + "loss": 4.5643, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.9962148666381836, + "rewards/margins": 0.5826107263565063, + "rewards/rejected": -4.578825950622559, + "step": 40 + }, + { + "epoch": 0.09273395532937517, + "grad_norm": 48.972841115050805, + "learning_rate": 7.288888888888888e-07, + "logits/chosen": -11.43770980834961, + "logits/rejected": -11.56243896484375, + "logps/chosen": -0.41414573788642883, + "logps/rejected": -0.4131737947463989, + "loss": 4.6472, + "rewards/accuracies": 0.4375, + "rewards/chosen": -4.141457557678223, + "rewards/margins": -0.009719468653202057, + "rewards/rejected": -4.13173770904541, + "step": 41 + }, + { + "epoch": 0.09499575911789652, + "grad_norm": 143.02756863226023, + "learning_rate": 7.466666666666667e-07, + "logits/chosen": -11.282739639282227, + "logits/rejected": -10.897704124450684, + "logps/chosen": -0.4278091490268707, + "logps/rejected": -0.4735082983970642, + "loss": 4.8335, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.278091907501221, + "rewards/margins": 0.4569913148880005, + "rewards/rejected": -4.735082626342773, + "step": 42 + }, + { + "epoch": 0.09725756290641786, + "grad_norm": 74.71400905339502, + "learning_rate": 7.644444444444444e-07, + "logits/chosen": -10.040319442749023, + "logits/rejected": -9.910164833068848, + "logps/chosen": -0.5288741588592529, + "logps/rejected": -0.5330761671066284, + "loss": 4.9162, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.2887420654296875, + "rewards/margins": 0.04202008247375488, + "rewards/rejected": -5.330761909484863, + "step": 43 + }, + { + "epoch": 0.09951936669493922, + "grad_norm": 81.89498458784372, + "learning_rate": 7.822222222222222e-07, + "logits/chosen": -11.844489097595215, + "logits/rejected": -11.597496032714844, + "logps/chosen": -0.3373556435108185, + "logps/rejected": -0.42685818672180176, + "loss": 4.4339, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.373556613922119, + "rewards/margins": 0.8950251340866089, + "rewards/rejected": -4.268581390380859, + "step": 44 + }, + { + "epoch": 0.10178117048346055, + "grad_norm": 70.48086533057375, + "learning_rate": 8e-07, + "logits/chosen": -10.962928771972656, + "logits/rejected": -10.9669771194458, + "logps/chosen": -0.40894240140914917, + "logps/rejected": -0.47473400831222534, + "loss": 4.6343, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.089423656463623, + "rewards/margins": 0.657916784286499, + "rewards/rejected": -4.747340679168701, + "step": 45 + }, + 
{ + "epoch": 0.1040429742719819, + "grad_norm": 68.45670029697006, + "learning_rate": 7.999874759018868e-07, + "logits/chosen": -10.595868110656738, + "logits/rejected": -10.306282043457031, + "logps/chosen": -0.463877409696579, + "logps/rejected": -0.5967152118682861, + "loss": 4.6017, + "rewards/accuracies": 0.53125, + "rewards/chosen": -4.6387739181518555, + "rewards/margins": 1.3283770084381104, + "rewards/rejected": -5.967151165008545, + "step": 46 + }, + { + "epoch": 0.10630477806050326, + "grad_norm": 48.95212367492393, + "learning_rate": 7.999499043918123e-07, + "logits/chosen": -12.154573440551758, + "logits/rejected": -12.19536304473877, + "logps/chosen": -0.45791739225387573, + "logps/rejected": -0.5772292017936707, + "loss": 4.7494, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.579174041748047, + "rewards/margins": 1.1931182146072388, + "rewards/rejected": -5.772292137145996, + "step": 47 + }, + { + "epoch": 0.1085665818490246, + "grad_norm": 78.70326465142523, + "learning_rate": 7.998872878225228e-07, + "logits/chosen": -11.652605056762695, + "logits/rejected": -11.418684005737305, + "logps/chosen": -0.4871661365032196, + "logps/rejected": -0.5542778372764587, + "loss": 4.8017, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.871661186218262, + "rewards/margins": 0.6711173057556152, + "rewards/rejected": -5.542778491973877, + "step": 48 + }, + { + "epoch": 0.11082838563754595, + "grad_norm": 52.90074896212443, + "learning_rate": 7.997996301150987e-07, + "logits/chosen": -12.08781623840332, + "logits/rejected": -11.528596878051758, + "logps/chosen": -0.4144824743270874, + "logps/rejected": -0.5010120868682861, + "loss": 4.6588, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.144824981689453, + "rewards/margins": 0.8652949929237366, + "rewards/rejected": -5.010120391845703, + "step": 49 + }, + { + "epoch": 0.11309018942606729, + "grad_norm": 94.26111558590127, + "learning_rate": 7.996869367587088e-07, + "logits/chosen": -11.582418441772461, + "logits/rejected": -10.963386535644531, + "logps/chosen": -0.4467932879924774, + "logps/rejected": -0.4817226529121399, + "loss": 4.7057, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.46793270111084, + "rewards/margins": 0.34929385781288147, + "rewards/rejected": -4.817226409912109, + "step": 50 + }, + { + "epoch": 0.11535199321458864, + "grad_norm": 49.386260226236786, + "learning_rate": 7.99549214810266e-07, + "logits/chosen": -10.763232231140137, + "logits/rejected": -10.5509672164917, + "logps/chosen": -0.5276934504508972, + "logps/rejected": -0.5668250322341919, + "loss": 4.505, + "rewards/accuracies": 0.59375, + "rewards/chosen": -5.276934623718262, + "rewards/margins": 0.39131537079811096, + "rewards/rejected": -5.668249607086182, + "step": 51 + }, + { + "epoch": 0.11761379700310998, + "grad_norm": 75.77039924958046, + "learning_rate": 7.993864728939867e-07, + "logits/chosen": -10.996638298034668, + "logits/rejected": -11.125421524047852, + "logps/chosen": -0.4168677031993866, + "logps/rejected": -0.4338124394416809, + "loss": 4.8877, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.16867733001709, + "rewards/margins": 0.1694469451904297, + "rewards/rejected": -4.3381242752075195, + "step": 52 + }, + { + "epoch": 0.11987560079163133, + "grad_norm": 128.1604406398008, + "learning_rate": 7.991987212008491e-07, + "logits/chosen": -10.845922470092773, + "logits/rejected": -10.981675148010254, + "logps/chosen": -0.5582807660102844, + "logps/rejected": -0.5974184274673462, + "loss": 4.4079, + 
"rewards/accuracies": 0.46875, + "rewards/chosen": -5.582807540893555, + "rewards/margins": 0.3913762867450714, + "rewards/rejected": -5.974184036254883, + "step": 53 + }, + { + "epoch": 0.12213740458015267, + "grad_norm": 58.15165845926177, + "learning_rate": 7.989859714879565e-07, + "logits/chosen": -10.547262191772461, + "logits/rejected": -10.480108261108398, + "logps/chosen": -0.5517194271087646, + "logps/rejected": -0.6646666526794434, + "loss": 4.9804, + "rewards/accuracies": 0.5625, + "rewards/chosen": -5.5171942710876465, + "rewards/margins": 1.1294726133346558, + "rewards/rejected": -6.646667003631592, + "step": 54 + }, + { + "epoch": 0.12439920836867402, + "grad_norm": 48.309919673308315, + "learning_rate": 7.987482370778005e-07, + "logits/chosen": -11.610102653503418, + "logits/rejected": -11.80359935760498, + "logps/chosen": -0.5112394690513611, + "logps/rejected": -0.5051109790802002, + "loss": 4.7434, + "rewards/accuracies": 0.46875, + "rewards/chosen": -5.112394332885742, + "rewards/margins": -0.061284855008125305, + "rewards/rejected": -5.051109790802002, + "step": 55 + }, + { + "epoch": 0.12666101215719536, + "grad_norm": 92.15090562707765, + "learning_rate": 7.984855328574262e-07, + "logits/chosen": -11.098040580749512, + "logits/rejected": -10.789083480834961, + "logps/chosen": -0.489580363035202, + "logps/rejected": -0.5100796818733215, + "loss": 4.5609, + "rewards/accuracies": 0.4375, + "rewards/chosen": -4.895802974700928, + "rewards/margins": 0.20499347150325775, + "rewards/rejected": -5.100796699523926, + "step": 56 + }, + { + "epoch": 0.1289228159457167, + "grad_norm": 94.24070870623638, + "learning_rate": 7.981978752775009e-07, + "logits/chosen": -9.92190933227539, + "logits/rejected": -9.928149223327637, + "logps/chosen": -0.6262676119804382, + "logps/rejected": -0.6750127077102661, + "loss": 4.5092, + "rewards/accuracies": 0.53125, + "rewards/chosen": -6.262676239013672, + "rewards/margins": 0.48745113611221313, + "rewards/rejected": -6.750126838684082, + "step": 57 + }, + { + "epoch": 0.13118461973423806, + "grad_norm": 73.68507158036604, + "learning_rate": 7.978852823512833e-07, + "logits/chosen": -10.95576000213623, + "logits/rejected": -10.358962059020996, + "logps/chosen": -0.4652557671070099, + "logps/rejected": -0.4836958050727844, + "loss": 4.8084, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.652557849884033, + "rewards/margins": 0.1843997836112976, + "rewards/rejected": -4.8369574546813965, + "step": 58 + }, + { + "epoch": 0.1334464235227594, + "grad_norm": 66.30964256222111, + "learning_rate": 7.975477736534957e-07, + "logits/chosen": -12.005413055419922, + "logits/rejected": -11.653824806213379, + "logps/chosen": -0.46185585856437683, + "logps/rejected": -0.5776143670082092, + "loss": 4.5199, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.618558883666992, + "rewards/margins": 1.1575853824615479, + "rewards/rejected": -5.776144027709961, + "step": 59 + }, + { + "epoch": 0.13570822731128074, + "grad_norm": 101.38755828571541, + "learning_rate": 7.971853703190986e-07, + "logits/chosen": -11.413613319396973, + "logits/rejected": -10.73826789855957, + "logps/chosen": -0.5611809492111206, + "logps/rejected": -0.6577370762825012, + "loss": 4.6643, + "rewards/accuracies": 0.5, + "rewards/chosen": -5.611808776855469, + "rewards/margins": 0.9655615091323853, + "rewards/rejected": -6.577370643615723, + "step": 60 + }, + { + "epoch": 0.1379700310998021, + "grad_norm": 68.86595519869128, + "learning_rate": 7.967980950419664e-07, + 
"logits/chosen": -11.096137046813965, + "logits/rejected": -10.685264587402344, + "logps/chosen": -0.4981518089771271, + "logps/rejected": -0.665467381477356, + "loss": 4.4761, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.981517791748047, + "rewards/margins": 1.673156499862671, + "rewards/rejected": -6.654675006866455, + "step": 61 + }, + { + "epoch": 0.14023183488832344, + "grad_norm": 65.33178558436228, + "learning_rate": 7.963859720734669e-07, + "logits/chosen": -12.070573806762695, + "logits/rejected": -11.637935638427734, + "logps/chosen": -0.38139575719833374, + "logps/rejected": -0.45192593336105347, + "loss": 4.5535, + "rewards/accuracies": 0.65625, + "rewards/chosen": -3.813957691192627, + "rewards/margins": 0.7053009867668152, + "rewards/rejected": -4.519258975982666, + "step": 62 + }, + { + "epoch": 0.14249363867684478, + "grad_norm": 83.62029217364055, + "learning_rate": 7.959490272209427e-07, + "logits/chosen": -10.89778995513916, + "logits/rejected": -10.380571365356445, + "logps/chosen": -0.4739726185798645, + "logps/rejected": -0.6313707232475281, + "loss": 4.5111, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.7397260665893555, + "rewards/margins": 1.5739809274673462, + "rewards/rejected": -6.31370735168457, + "step": 63 + }, + { + "epoch": 0.14475544246536612, + "grad_norm": 54.013979661163816, + "learning_rate": 7.954872878460946e-07, + "logits/chosen": -11.172213554382324, + "logits/rejected": -10.982388496398926, + "logps/chosen": -0.4742993414402008, + "logps/rejected": -0.6619201898574829, + "loss": 4.3085, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.742993354797363, + "rewards/margins": 1.8762080669403076, + "rewards/rejected": -6.619201183319092, + "step": 64 + }, + { + "epoch": 0.14701724625388748, + "grad_norm": 81.96626905055028, + "learning_rate": 7.950007828632691e-07, + "logits/chosen": -10.859444618225098, + "logits/rejected": -10.706047058105469, + "logps/chosen": -0.5983306169509888, + "logps/rejected": -0.6700727939605713, + "loss": 4.3127, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.98330545425415, + "rewards/margins": 0.717422366142273, + "rewards/rejected": -6.700727462768555, + "step": 65 + }, + { + "epoch": 0.14927905004240882, + "grad_norm": 58.0945518868609, + "learning_rate": 7.944895427376465e-07, + "logits/chosen": -10.645411491394043, + "logits/rejected": -10.482881546020508, + "logps/chosen": -0.5315589904785156, + "logps/rejected": -0.7207262516021729, + "loss": 4.238, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.315589904785156, + "rewards/margins": 1.8916726112365723, + "rewards/rejected": -7.2072625160217285, + "step": 66 + }, + { + "epoch": 0.15154085383093016, + "grad_norm": 40.98461722424557, + "learning_rate": 7.939535994833345e-07, + "logits/chosen": -12.032543182373047, + "logits/rejected": -11.655169486999512, + "logps/chosen": -0.40713435411453247, + "logps/rejected": -0.5286428928375244, + "loss": 4.1796, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.071343421936035, + "rewards/margins": 1.215085744857788, + "rewards/rejected": -5.286429405212402, + "step": 67 + }, + { + "epoch": 0.1538026576194515, + "grad_norm": 58.64617780662404, + "learning_rate": 7.933929866613628e-07, + "logits/chosen": -11.718114852905273, + "logits/rejected": -11.243300437927246, + "logps/chosen": -0.5240508317947388, + "logps/rejected": -0.5563682317733765, + "loss": 4.6826, + "rewards/accuracies": 0.65625, + "rewards/chosen": -5.240508556365967, + "rewards/margins": 0.3231736421585083, + 
"rewards/rejected": -5.5636820793151855, + "step": 68 + }, + { + "epoch": 0.15606446140797287, + "grad_norm": 70.31268006854283, + "learning_rate": 7.928077393775808e-07, + "logits/chosen": -11.418298721313477, + "logits/rejected": -11.400525093078613, + "logps/chosen": -0.5047922730445862, + "logps/rejected": -0.6909648776054382, + "loss": 3.9852, + "rewards/accuracies": 0.65625, + "rewards/chosen": -5.0479230880737305, + "rewards/margins": 1.8617255687713623, + "rewards/rejected": -6.909648418426514, + "step": 69 + }, + { + "epoch": 0.1583262651964942, + "grad_norm": 80.21299453365818, + "learning_rate": 7.921978942804609e-07, + "logits/chosen": -10.426657676696777, + "logits/rejected": -10.646503448486328, + "logps/chosen": -0.5763324499130249, + "logps/rejected": -0.6343460083007812, + "loss": 4.159, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.7633256912231445, + "rewards/margins": 0.5801345705986023, + "rewards/rejected": -6.3434600830078125, + "step": 70 + }, + { + "epoch": 0.16058806898501554, + "grad_norm": 80.79055167712902, + "learning_rate": 7.915634895588021e-07, + "logits/chosen": -11.959595680236816, + "logits/rejected": -12.10846996307373, + "logps/chosen": -0.5684102177619934, + "logps/rejected": -0.5796740055084229, + "loss": 4.8753, + "rewards/accuracies": 0.53125, + "rewards/chosen": -5.684101581573486, + "rewards/margins": 0.11263775080442429, + "rewards/rejected": -5.796739101409912, + "step": 71 + }, + { + "epoch": 0.1628498727735369, + "grad_norm": 75.73711259124929, + "learning_rate": 7.909045649393394e-07, + "logits/chosen": -12.076671600341797, + "logits/rejected": -11.380012512207031, + "logps/chosen": -0.5402446389198303, + "logps/rejected": -0.5312026739120483, + "loss": 4.8356, + "rewards/accuracies": 0.53125, + "rewards/chosen": -5.402446269989014, + "rewards/margins": -0.09041957557201385, + "rewards/rejected": -5.3120269775390625, + "step": 72 + }, + { + "epoch": 0.16511167656205825, + "grad_norm": 75.75288370918786, + "learning_rate": 7.902211616842556e-07, + "logits/chosen": -10.804548263549805, + "logits/rejected": -10.961880683898926, + "logps/chosen": -0.5909055471420288, + "logps/rejected": -0.673595666885376, + "loss": 4.3771, + "rewards/accuracies": 0.53125, + "rewards/chosen": -5.909054756164551, + "rewards/margins": 0.8269017934799194, + "rewards/rejected": -6.735957145690918, + "step": 73 + }, + { + "epoch": 0.16737348035057958, + "grad_norm": 62.67088862221453, + "learning_rate": 7.89513322588598e-07, + "logits/chosen": -12.931127548217773, + "logits/rejected": -12.251081466674805, + "logps/chosen": -0.4514605700969696, + "logps/rejected": -0.5236379504203796, + "loss": 4.3108, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.51460599899292, + "rewards/margins": 0.7217735648155212, + "rewards/rejected": -5.236379623413086, + "step": 74 + }, + { + "epoch": 0.16963528413910092, + "grad_norm": 80.95433872904526, + "learning_rate": 7.887810919775976e-07, + "logits/chosen": -11.493197441101074, + "logits/rejected": -11.485479354858398, + "logps/chosen": -0.6065416932106018, + "logps/rejected": -0.6815317273139954, + "loss": 4.3937, + "rewards/accuracies": 0.53125, + "rewards/chosen": -6.065417289733887, + "rewards/margins": 0.7499004006385803, + "rewards/rejected": -6.815317630767822, + "step": 75 + }, + { + "epoch": 0.1718970879276223, + "grad_norm": 46.993337340785516, + "learning_rate": 7.880245157038949e-07, + "logits/chosen": -11.63713264465332, + "logits/rejected": -11.74251651763916, + "logps/chosen": -0.5179169178009033, + 
"logps/rejected": -0.5965338945388794, + "loss": 4.3272, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.179169178009033, + "rewards/margins": 0.7861694693565369, + "rewards/rejected": -5.965338706970215, + "step": 76 + }, + { + "epoch": 0.17415889171614363, + "grad_norm": 87.9689908780646, + "learning_rate": 7.872436411446671e-07, + "logits/chosen": -12.063104629516602, + "logits/rejected": -11.92426872253418, + "logps/chosen": -0.5937929749488831, + "logps/rejected": -0.7317577004432678, + "loss": 4.6281, + "rewards/accuracies": 0.59375, + "rewards/chosen": -5.937929153442383, + "rewards/margins": 1.3796474933624268, + "rewards/rejected": -7.3175764083862305, + "step": 77 + }, + { + "epoch": 0.17642069550466496, + "grad_norm": 58.552629911827445, + "learning_rate": 7.86438517198662e-07, + "logits/chosen": -11.951095581054688, + "logits/rejected": -11.894678115844727, + "logps/chosen": -0.6496031880378723, + "logps/rejected": -0.7183038592338562, + "loss": 4.4127, + "rewards/accuracies": 0.46875, + "rewards/chosen": -6.496031761169434, + "rewards/margins": 0.6870064735412598, + "rewards/rejected": -7.183038711547852, + "step": 78 + }, + { + "epoch": 0.1786824992931863, + "grad_norm": 50.220014654534005, + "learning_rate": 7.856091942831366e-07, + "logits/chosen": -12.49548625946045, + "logits/rejected": -12.11488151550293, + "logps/chosen": -0.5599091649055481, + "logps/rejected": -0.6432383060455322, + "loss": 4.5361, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.599091529846191, + "rewards/margins": 0.83329176902771, + "rewards/rejected": -6.432382583618164, + "step": 79 + }, + { + "epoch": 0.18094430308170767, + "grad_norm": 66.70412693533851, + "learning_rate": 7.847557243306982e-07, + "logits/chosen": -11.657049179077148, + "logits/rejected": -11.260804176330566, + "logps/chosen": -0.5219194293022156, + "logps/rejected": -0.6938945055007935, + "loss": 4.3537, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.219193458557129, + "rewards/margins": 1.719750165939331, + "rewards/rejected": -6.9389448165893555, + "step": 80 + }, + { + "epoch": 0.183206106870229, + "grad_norm": 63.80924880121896, + "learning_rate": 7.838781607860541e-07, + "logits/chosen": -12.70258903503418, + "logits/rejected": -12.420878410339355, + "logps/chosen": -0.625399112701416, + "logps/rejected": -0.7799273133277893, + "loss": 4.2639, + "rewards/accuracies": 0.6875, + "rewards/chosen": -6.25399112701416, + "rewards/margins": 1.5452824831008911, + "rewards/rejected": -7.7992730140686035, + "step": 81 + }, + { + "epoch": 0.18546791065875035, + "grad_norm": 57.78934306013499, + "learning_rate": 7.82976558602664e-07, + "logits/chosen": -11.984509468078613, + "logits/rejected": -12.11713695526123, + "logps/chosen": -0.5547804832458496, + "logps/rejected": -0.6996307373046875, + "loss": 4.4859, + "rewards/accuracies": 0.65625, + "rewards/chosen": -5.547804832458496, + "rewards/margins": 1.4485028982162476, + "rewards/rejected": -6.996307373046875, + "step": 82 + }, + { + "epoch": 0.1877297144472717, + "grad_norm": 74.92592513817463, + "learning_rate": 7.820509742392988e-07, + "logits/chosen": -12.603782653808594, + "logits/rejected": -12.160541534423828, + "logps/chosen": -0.6248946189880371, + "logps/rejected": -0.6692970991134644, + "loss": 4.2867, + "rewards/accuracies": 0.65625, + "rewards/chosen": -6.248946189880371, + "rewards/margins": 0.444024920463562, + "rewards/rejected": -6.692971229553223, + "step": 83 + }, + { + "epoch": 0.18999151823579305, + "grad_norm": 88.28010047139081, + 
"learning_rate": 7.811014656565054e-07, + "logits/chosen": -12.70538330078125, + "logits/rejected": -12.26504898071289, + "logps/chosen": -0.5449544191360474, + "logps/rejected": -0.7343254685401917, + "loss": 4.002, + "rewards/accuracies": 0.59375, + "rewards/chosen": -5.4495439529418945, + "rewards/margins": 1.8937102556228638, + "rewards/rejected": -7.343254566192627, + "step": 84 + }, + { + "epoch": 0.1922533220243144, + "grad_norm": 91.0049552486995, + "learning_rate": 7.801280923129773e-07, + "logits/chosen": -11.466194152832031, + "logits/rejected": -11.01245403289795, + "logps/chosen": -0.6134600043296814, + "logps/rejected": -0.7226859927177429, + "loss": 4.7861, + "rewards/accuracies": 0.59375, + "rewards/chosen": -6.134600639343262, + "rewards/margins": 1.0922595262527466, + "rewards/rejected": -7.226860046386719, + "step": 85 + }, + { + "epoch": 0.19451512581283573, + "grad_norm": 80.46205455624374, + "learning_rate": 7.791309151618305e-07, + "logits/chosen": -12.178560256958008, + "logits/rejected": -12.0822114944458, + "logps/chosen": -0.589695930480957, + "logps/rejected": -0.6354808211326599, + "loss": 4.5683, + "rewards/accuracies": 0.46875, + "rewards/chosen": -5.8969597816467285, + "rewards/margins": 0.45784902572631836, + "rewards/rejected": -6.354808330535889, + "step": 86 + }, + { + "epoch": 0.1967769296013571, + "grad_norm": 65.70377095012364, + "learning_rate": 7.781099966467874e-07, + "logits/chosen": -14.043821334838867, + "logits/rejected": -13.850515365600586, + "logps/chosen": -0.5184203386306763, + "logps/rejected": -0.5960665345191956, + "loss": 4.4942, + "rewards/accuracies": 0.65625, + "rewards/chosen": -5.184203147888184, + "rewards/margins": 0.7764618396759033, + "rewards/rejected": -5.960664749145508, + "step": 87 + }, + { + "epoch": 0.19903873338987843, + "grad_norm": 85.44105817952351, + "learning_rate": 7.770654006982664e-07, + "logits/chosen": -11.956082344055176, + "logits/rejected": -11.715299606323242, + "logps/chosen": -0.7433941960334778, + "logps/rejected": -0.8704244494438171, + "loss": 4.6953, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.4339423179626465, + "rewards/margins": 1.2703025341033936, + "rewards/rejected": -8.704244613647461, + "step": 88 + }, + { + "epoch": 0.20130053717839977, + "grad_norm": 66.70633497431983, + "learning_rate": 7.759971927293781e-07, + "logits/chosen": -12.565323829650879, + "logits/rejected": -12.145037651062012, + "logps/chosen": -0.5749909281730652, + "logps/rejected": -0.7184647917747498, + "loss": 4.1798, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.749909400939941, + "rewards/margins": 1.4347392320632935, + "rewards/rejected": -7.184648513793945, + "step": 89 + }, + { + "epoch": 0.2035623409669211, + "grad_norm": 61.091515513789844, + "learning_rate": 7.749054396318297e-07, + "logits/chosen": -11.981965065002441, + "logits/rejected": -11.925725936889648, + "logps/chosen": -0.6095532178878784, + "logps/rejected": -0.7097649574279785, + "loss": 4.5833, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.095531463623047, + "rewards/margins": 1.00211763381958, + "rewards/rejected": -7.097649097442627, + "step": 90 + }, + { + "epoch": 0.20582414475544247, + "grad_norm": 98.35174892551164, + "learning_rate": 7.737902097717356e-07, + "logits/chosen": -12.858875274658203, + "logits/rejected": -12.816228866577148, + "logps/chosen": -0.5555391907691956, + "logps/rejected": -0.6473320722579956, + "loss": 4.3657, + "rewards/accuracies": 0.46875, + "rewards/chosen": -5.555391788482666, + 
"rewards/margins": 0.9179282784461975, + "rewards/rejected": -6.473320960998535, + "step": 91 + }, + { + "epoch": 0.2080859485439638, + "grad_norm": 71.84927934229238, + "learning_rate": 7.726515729853367e-07, + "logits/chosen": -11.215812683105469, + "logits/rejected": -10.828777313232422, + "logps/chosen": -0.6313311457633972, + "logps/rejected": -0.8224250674247742, + "loss": 4.3914, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.313311576843262, + "rewards/margins": 1.9109392166137695, + "rewards/rejected": -8.224250793457031, + "step": 92 + }, + { + "epoch": 0.21034775233248515, + "grad_norm": 63.14236456247472, + "learning_rate": 7.714896005746272e-07, + "logits/chosen": -12.176814079284668, + "logits/rejected": -11.898388862609863, + "logps/chosen": -0.5294336080551147, + "logps/rejected": -0.6646890044212341, + "loss": 4.0327, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.294336318969727, + "rewards/margins": 1.3525540828704834, + "rewards/rejected": -6.646890163421631, + "step": 93 + }, + { + "epoch": 0.21260955612100652, + "grad_norm": 121.71357306492177, + "learning_rate": 7.703043653028896e-07, + "logits/chosen": -12.20483684539795, + "logits/rejected": -11.81241226196289, + "logps/chosen": -0.6999309659004211, + "logps/rejected": -0.8097646236419678, + "loss": 4.668, + "rewards/accuracies": 0.6875, + "rewards/chosen": -6.999309539794922, + "rewards/margins": 1.0983363389968872, + "rewards/rejected": -8.09764575958252, + "step": 94 + }, + { + "epoch": 0.21487135990952785, + "grad_norm": 104.68871191294573, + "learning_rate": 7.690959413901379e-07, + "logits/chosen": -13.26455307006836, + "logits/rejected": -13.093438148498535, + "logps/chosen": -0.6004514694213867, + "logps/rejected": -0.7420926690101624, + "loss": 4.5244, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.004515171051025, + "rewards/margins": 1.4164113998413086, + "rewards/rejected": -7.420926570892334, + "step": 95 + }, + { + "epoch": 0.2171331636980492, + "grad_norm": 96.3754544871051, + "learning_rate": 7.678644045084704e-07, + "logits/chosen": -13.176921844482422, + "logits/rejected": -12.706074714660645, + "logps/chosen": -0.5092126727104187, + "logps/rejected": -0.673244833946228, + "loss": 4.126, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.09212589263916, + "rewards/margins": 1.6403214931488037, + "rewards/rejected": -6.732447624206543, + "step": 96 + }, + { + "epoch": 0.21939496748657053, + "grad_norm": 64.90730762680234, + "learning_rate": 7.666098317773308e-07, + "logits/chosen": -12.79755687713623, + "logits/rejected": -12.852503776550293, + "logps/chosen": -0.730464518070221, + "logps/rejected": -0.8265626430511475, + "loss": 4.1382, + "rewards/accuracies": 0.5625, + "rewards/chosen": -7.30464506149292, + "rewards/margins": 0.9609812498092651, + "rewards/rejected": -8.265625953674316, + "step": 97 + }, + { + "epoch": 0.2216567712750919, + "grad_norm": 61.06704486908139, + "learning_rate": 7.653323017586789e-07, + "logits/chosen": -13.87999153137207, + "logits/rejected": -13.832286834716797, + "logps/chosen": -0.629042387008667, + "logps/rejected": -0.607448399066925, + "loss": 4.3219, + "rewards/accuracies": 0.5, + "rewards/chosen": -6.290423393249512, + "rewards/margins": -0.21593987941741943, + "rewards/rejected": -6.074484348297119, + "step": 98 + }, + { + "epoch": 0.22391857506361323, + "grad_norm": 68.27361430232956, + "learning_rate": 7.640318944520711e-07, + "logits/chosen": -12.233078956604004, + "logits/rejected": -11.775206565856934, + "logps/chosen": 
-0.7409114241600037, + "logps/rejected": -0.9343410134315491, + "loss": 4.2391, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.409113883972168, + "rewards/margins": 1.9342964887619019, + "rewards/rejected": -9.343409538269043, + "step": 99 + }, + { + "epoch": 0.22618037885213457, + "grad_norm": 101.95762268614794, + "learning_rate": 7.627086912896511e-07, + "logits/chosen": -13.06617546081543, + "logits/rejected": -12.822061538696289, + "logps/chosen": -0.6488937139511108, + "logps/rejected": -0.6966894268989563, + "loss": 4.3114, + "rewards/accuracies": 0.59375, + "rewards/chosen": -6.4889373779296875, + "rewards/margins": 0.47795701026916504, + "rewards/rejected": -6.966893672943115, + "step": 100 + }, + { + "epoch": 0.2284421826406559, + "grad_norm": 62.41706596840518, + "learning_rate": 7.613627751310499e-07, + "logits/chosen": -13.649486541748047, + "logits/rejected": -13.323113441467285, + "logps/chosen": -0.5429686307907104, + "logps/rejected": -0.7514999508857727, + "loss": 4.1066, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.429686546325684, + "rewards/margins": 2.085312604904175, + "rewards/rejected": -7.5149993896484375, + "step": 101 + }, + { + "epoch": 0.23070398642917728, + "grad_norm": 98.8807647714299, + "learning_rate": 7.599942302581977e-07, + "logits/chosen": -13.609407424926758, + "logits/rejected": -13.286027908325195, + "logps/chosen": -0.6267982721328735, + "logps/rejected": -0.8196748495101929, + "loss": 4.085, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.267982006072998, + "rewards/margins": 1.9287660121917725, + "rewards/rejected": -8.196748733520508, + "step": 102 + }, + { + "epoch": 0.23296579021769862, + "grad_norm": 69.62248080154978, + "learning_rate": 7.586031423700457e-07, + "logits/chosen": -13.66258430480957, + "logits/rejected": -13.500720024108887, + "logps/chosen": -0.67485111951828, + "logps/rejected": -0.7922409772872925, + "loss": 4.2884, + "rewards/accuracies": 0.6875, + "rewards/chosen": -6.74851131439209, + "rewards/margins": 1.1738990545272827, + "rewards/rejected": -7.92241096496582, + "step": 103 + }, + { + "epoch": 0.23522759400621995, + "grad_norm": 128.9243427342601, + "learning_rate": 7.571895985772e-07, + "logits/chosen": -13.242142677307129, + "logits/rejected": -13.256977081298828, + "logps/chosen": -0.6713986396789551, + "logps/rejected": -0.823998749256134, + "loss": 4.4093, + "rewards/accuracies": 0.6875, + "rewards/chosen": -6.713986873626709, + "rewards/margins": 1.526000738143921, + "rewards/rejected": -8.239988327026367, + "step": 104 + }, + { + "epoch": 0.23748939779474132, + "grad_norm": 90.13452963738787, + "learning_rate": 7.557536873964661e-07, + "logits/chosen": -13.565170288085938, + "logits/rejected": -13.13564682006836, + "logps/chosen": -0.6910791993141174, + "logps/rejected": -0.9325417280197144, + "loss": 4.4509, + "rewards/accuracies": 0.78125, + "rewards/chosen": -6.910791873931885, + "rewards/margins": 2.4146251678466797, + "rewards/rejected": -9.325417518615723, + "step": 105 + }, + { + "epoch": 0.23975120158326266, + "grad_norm": 87.0306411773028, + "learning_rate": 7.542954987453069e-07, + "logits/chosen": -14.550992012023926, + "logits/rejected": -14.176923751831055, + "logps/chosen": -0.6862035393714905, + "logps/rejected": -0.8450896143913269, + "loss": 3.9713, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.862035751342773, + "rewards/margins": 1.5888599157333374, + "rewards/rejected": -8.450895309448242, + "step": 106 + }, + { + "epoch": 0.242013005371784, + "grad_norm": 
79.22884465246462, + "learning_rate": 7.528151239362108e-07, + "logits/chosen": -14.102907180786133, + "logits/rejected": -13.666712760925293, + "logps/chosen": -0.6612896919250488, + "logps/rejected": -0.831270694732666, + "loss": 4.1818, + "rewards/accuracies": 0.6875, + "rewards/chosen": -6.6128973960876465, + "rewards/margins": 1.6998090744018555, + "rewards/rejected": -8.312705993652344, + "step": 107 + }, + { + "epoch": 0.24427480916030533, + "grad_norm": 127.69113358221105, + "learning_rate": 7.513126556709748e-07, + "logits/chosen": -11.86813735961914, + "logits/rejected": -11.855137825012207, + "logps/chosen": -0.6619610786437988, + "logps/rejected": -0.9614608287811279, + "loss": 3.5632, + "rewards/accuracies": 0.71875, + "rewards/chosen": -6.6196112632751465, + "rewards/margins": 2.9949963092803955, + "rewards/rejected": -9.614606857299805, + "step": 108 + }, + { + "epoch": 0.2465366129488267, + "grad_norm": 68.99980133964193, + "learning_rate": 7.497881880348984e-07, + "logits/chosen": -14.216558456420898, + "logits/rejected": -13.699520111083984, + "logps/chosen": -0.6432782411575317, + "logps/rejected": -0.8865514397621155, + "loss": 3.6564, + "rewards/accuracies": 0.71875, + "rewards/chosen": -6.432782173156738, + "rewards/margins": 2.432731866836548, + "rewards/rejected": -8.865514755249023, + "step": 109 + }, + { + "epoch": 0.24879841673734804, + "grad_norm": 94.37373909884498, + "learning_rate": 7.482418164908931e-07, + "logits/chosen": -13.685918807983398, + "logits/rejected": -13.722275733947754, + "logps/chosen": -0.7590062618255615, + "logps/rejected": -0.8727726936340332, + "loss": 4.4461, + "rewards/accuracies": 0.5625, + "rewards/chosen": -7.590063095092773, + "rewards/margins": 1.1376643180847168, + "rewards/rejected": -8.727726936340332, + "step": 110 + }, + { + "epoch": 0.2510602205258694, + "grad_norm": 104.73538958533439, + "learning_rate": 7.466736378735035e-07, + "logits/chosen": -13.90713882446289, + "logits/rejected": -13.833703994750977, + "logps/chosen": -0.9833253622055054, + "logps/rejected": -1.1174235343933105, + "loss": 4.0684, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.833253860473633, + "rewards/margins": 1.340980887413025, + "rewards/rejected": -11.174234390258789, + "step": 111 + }, + { + "epoch": 0.2533220243143907, + "grad_norm": 85.78487702365295, + "learning_rate": 7.450837503828439e-07, + "logits/chosen": -14.123536109924316, + "logits/rejected": -14.122791290283203, + "logps/chosen": -0.7747003436088562, + "logps/rejected": -0.9367392063140869, + "loss": 3.7847, + "rewards/accuracies": 0.65625, + "rewards/chosen": -7.747003555297852, + "rewards/margins": 1.6203885078430176, + "rewards/rejected": -9.367391586303711, + "step": 112 + }, + { + "epoch": 0.2555838281029121, + "grad_norm": 79.39379626286185, + "learning_rate": 7.43472253578449e-07, + "logits/chosen": -15.111526489257812, + "logits/rejected": -15.17776870727539, + "logps/chosen": -0.6799838542938232, + "logps/rejected": -0.7487653493881226, + "loss": 4.1428, + "rewards/accuracies": 0.53125, + "rewards/chosen": -6.799839019775391, + "rewards/margins": 0.6878141760826111, + "rewards/rejected": -7.487652778625488, + "step": 113 + }, + { + "epoch": 0.2578456318914334, + "grad_norm": 95.27260892673486, + "learning_rate": 7.418392483730389e-07, + "logits/chosen": -15.093989372253418, + "logits/rejected": -14.798469543457031, + "logps/chosen": -0.611186683177948, + "logps/rejected": -0.733193039894104, + "loss": 3.9567, + "rewards/accuracies": 0.75, + "rewards/chosen": 
-6.111865997314453, + "rewards/margins": 1.2200640439987183, + "rewards/rejected": -7.331930637359619, + "step": 114 + }, + { + "epoch": 0.26010743567995476, + "grad_norm": 81.84816803138266, + "learning_rate": 7.401848370262012e-07, + "logits/chosen": -16.052608489990234, + "logits/rejected": -15.86906623840332, + "logps/chosen": -0.7116187810897827, + "logps/rejected": -0.8240950107574463, + "loss": 4.2147, + "rewards/accuracies": 0.71875, + "rewards/chosen": -7.116188049316406, + "rewards/margins": 1.1247621774673462, + "rewards/rejected": -8.240950584411621, + "step": 115 + }, + { + "epoch": 0.2623692394684761, + "grad_norm": 86.40835196804031, + "learning_rate": 7.385091231379856e-07, + "logits/chosen": -15.110920906066895, + "logits/rejected": -15.024141311645508, + "logps/chosen": -0.7939636707305908, + "logps/rejected": -0.9955480098724365, + "loss": 4.0034, + "rewards/accuracies": 0.65625, + "rewards/chosen": -7.93963623046875, + "rewards/margins": 2.0158443450927734, + "rewards/rejected": -9.955480575561523, + "step": 116 + }, + { + "epoch": 0.26463104325699743, + "grad_norm": 96.7390682646137, + "learning_rate": 7.368122116424182e-07, + "logits/chosen": -13.677536964416504, + "logits/rejected": -13.632445335388184, + "logps/chosen": -0.8173962235450745, + "logps/rejected": -0.8863806128501892, + "loss": 4.2779, + "rewards/accuracies": 0.53125, + "rewards/chosen": -8.173962593078613, + "rewards/margins": 0.6898432970046997, + "rewards/rejected": -8.863805770874023, + "step": 117 + }, + { + "epoch": 0.2668928470455188, + "grad_norm": 114.76228974717415, + "learning_rate": 7.350942088009289e-07, + "logits/chosen": -16.132448196411133, + "logits/rejected": -15.948546409606934, + "logps/chosen": -0.8236314058303833, + "logps/rejected": -0.9717513918876648, + "loss": 3.7875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.236313819885254, + "rewards/margins": 1.4811999797821045, + "rewards/rejected": -9.717514038085938, + "step": 118 + }, + { + "epoch": 0.26915465083404017, + "grad_norm": 124.74702715605902, + "learning_rate": 7.333552221956986e-07, + "logits/chosen": -14.226578712463379, + "logits/rejected": -13.749677658081055, + "logps/chosen": -0.9559565782546997, + "logps/rejected": -1.1939551830291748, + "loss": 3.7927, + "rewards/accuracies": 0.71875, + "rewards/chosen": -9.559566497802734, + "rewards/margins": 2.37998628616333, + "rewards/rejected": -11.939552307128906, + "step": 119 + }, + { + "epoch": 0.2714164546225615, + "grad_norm": 139.60375866242782, + "learning_rate": 7.315953607229217e-07, + "logits/chosen": -15.55072021484375, + "logits/rejected": -15.846210479736328, + "logps/chosen": -0.9729312658309937, + "logps/rejected": -1.1994317770004272, + "loss": 4.0626, + "rewards/accuracies": 0.65625, + "rewards/chosen": -9.729312896728516, + "rewards/margins": 2.265005588531494, + "rewards/rejected": -11.994318962097168, + "step": 120 + }, + { + "epoch": 0.27367825841108284, + "grad_norm": 90.79159608076576, + "learning_rate": 7.298147345859869e-07, + "logits/chosen": -15.140702247619629, + "logits/rejected": -14.700098037719727, + "logps/chosen": -0.8421116471290588, + "logps/rejected": -1.0816903114318848, + "loss": 4.0231, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.421116828918457, + "rewards/margins": 2.3957865238189697, + "rewards/rejected": -10.816903114318848, + "step": 121 + }, + { + "epoch": 0.2759400621996042, + "grad_norm": 100.537933010007, + "learning_rate": 7.280134552885762e-07, + "logits/chosen": -16.38404083251953, + 
"logits/rejected": -15.996622085571289, + "logps/chosen": -0.7793571949005127, + "logps/rejected": -0.9447546005249023, + "loss": 4.1454, + "rewards/accuracies": 0.6875, + "rewards/chosen": -7.793571949005127, + "rewards/margins": 1.6539742946624756, + "rewards/rejected": -9.44754695892334, + "step": 122 + }, + { + "epoch": 0.2782018659881255, + "grad_norm": 92.3597151880949, + "learning_rate": 7.261916356276831e-07, + "logits/chosen": -16.811389923095703, + "logits/rejected": -16.297218322753906, + "logps/chosen": -1.1431193351745605, + "logps/rejected": -1.4224827289581299, + "loss": 3.5578, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.431194305419922, + "rewards/margins": 2.7936320304870605, + "rewards/rejected": -14.22482681274414, + "step": 123 + }, + { + "epoch": 0.2804636697766469, + "grad_norm": 85.41024914421368, + "learning_rate": 7.243493896865486e-07, + "logits/chosen": -16.567768096923828, + "logits/rejected": -16.550174713134766, + "logps/chosen": -0.7305294275283813, + "logps/rejected": -0.8769953846931458, + "loss": 3.8388, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.305294990539551, + "rewards/margins": 1.4646586179733276, + "rewards/rejected": -8.769953727722168, + "step": 124 + }, + { + "epoch": 0.2827254735651682, + "grad_norm": 116.2256817022879, + "learning_rate": 7.224868328275169e-07, + "logits/chosen": -15.456303596496582, + "logits/rejected": -15.146892547607422, + "logps/chosen": -0.8332209587097168, + "logps/rejected": -1.0618098974227905, + "loss": 3.8379, + "rewards/accuracies": 0.78125, + "rewards/chosen": -8.332210540771484, + "rewards/margins": 2.285888433456421, + "rewards/rejected": -10.618098258972168, + "step": 125 + }, + { + "epoch": 0.28498727735368956, + "grad_norm": 171.39099454139836, + "learning_rate": 7.206040816848126e-07, + "logits/chosen": -13.179584503173828, + "logits/rejected": -13.4354887008667, + "logps/chosen": -0.7763444781303406, + "logps/rejected": -1.0508021116256714, + "loss": 4.0712, + "rewards/accuracies": 0.65625, + "rewards/chosen": -7.763444423675537, + "rewards/margins": 2.7445759773254395, + "rewards/rejected": -10.508020401000977, + "step": 126 + }, + { + "epoch": 0.2872490811422109, + "grad_norm": 113.59593931675269, + "learning_rate": 7.187012541572356e-07, + "logits/chosen": -16.92714500427246, + "logits/rejected": -16.797697067260742, + "logps/chosen": -0.8984054923057556, + "logps/rejected": -1.2400306463241577, + "loss": 3.9837, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.984055519104004, + "rewards/margins": 3.416250467300415, + "rewards/rejected": -12.400304794311523, + "step": 127 + }, + { + "epoch": 0.28951088493073224, + "grad_norm": 106.75878513488918, + "learning_rate": 7.167784694007791e-07, + "logits/chosen": -17.011579513549805, + "logits/rejected": -16.66845703125, + "logps/chosen": -0.8532888889312744, + "logps/rejected": -1.0494173765182495, + "loss": 3.6795, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.532888412475586, + "rewards/margins": 1.9612853527069092, + "rewards/rejected": -10.494174003601074, + "step": 128 + }, + { + "epoch": 0.2917726887192536, + "grad_norm": 100.82906895906213, + "learning_rate": 7.148358478211682e-07, + "logits/chosen": -17.22789764404297, + "logits/rejected": -16.70311737060547, + "logps/chosen": -1.0011430978775024, + "logps/rejected": -1.2051304578781128, + "loss": 3.8174, + "rewards/accuracies": 0.65625, + "rewards/chosen": -10.011430740356445, + "rewards/margins": 2.039872646331787, + "rewards/rejected": -12.05130386352539, + "step": 
129 + }, + { + "epoch": 0.29403449250777497, + "grad_norm": 70.96241822690538, + "learning_rate": 7.128735110663187e-07, + "logits/chosen": -16.649568557739258, + "logits/rejected": -16.656688690185547, + "logps/chosen": -0.9126195907592773, + "logps/rejected": -1.2946593761444092, + "loss": 3.4187, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.12619686126709, + "rewards/margins": 3.8203978538513184, + "rewards/rejected": -12.946593284606934, + "step": 130 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 101.80550264504929, + "learning_rate": 7.108915820187211e-07, + "logits/chosen": -14.773153305053711, + "logits/rejected": -14.596695899963379, + "logps/chosen": -0.9897314310073853, + "logps/rejected": -1.3485132455825806, + "loss": 3.2847, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.89731502532959, + "rewards/margins": 3.587818145751953, + "rewards/rejected": -13.48513126373291, + "step": 131 + }, + { + "epoch": 0.29855810008481765, + "grad_norm": 118.07277179415749, + "learning_rate": 7.088901847877447e-07, + "logits/chosen": -15.380992889404297, + "logits/rejected": -15.349335670471191, + "logps/chosen": -0.9675842523574829, + "logps/rejected": -1.2969377040863037, + "loss": 4.5014, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.67584228515625, + "rewards/margins": 3.293534755706787, + "rewards/rejected": -12.969377517700195, + "step": 132 + }, + { + "epoch": 0.300819903873339, + "grad_norm": 126.2514417468051, + "learning_rate": 7.068694447018658e-07, + "logits/chosen": -16.715206146240234, + "logits/rejected": -16.719194412231445, + "logps/chosen": -0.8660197854042053, + "logps/rejected": -0.9733752608299255, + "loss": 3.7002, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.660198211669922, + "rewards/margins": 1.073554515838623, + "rewards/rejected": -9.733752250671387, + "step": 133 + }, + { + "epoch": 0.3030817076618603, + "grad_norm": 115.59595983452179, + "learning_rate": 7.048294883008199e-07, + "logits/chosen": -17.525606155395508, + "logits/rejected": -17.24049186706543, + "logps/chosen": -0.9643100500106812, + "logps/rejected": -1.2070279121398926, + "loss": 3.5993, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.64310073852539, + "rewards/margins": 2.4271788597106934, + "rewards/rejected": -12.070280075073242, + "step": 134 + }, + { + "epoch": 0.3053435114503817, + "grad_norm": 135.03715755778867, + "learning_rate": 7.027704433276776e-07, + "logits/chosen": -18.083145141601562, + "logits/rejected": -17.37511444091797, + "logps/chosen": -0.9572893381118774, + "logps/rejected": -1.3515632152557373, + "loss": 3.6303, + "rewards/accuracies": 0.78125, + "rewards/chosen": -9.572893142700195, + "rewards/margins": 3.9427390098571777, + "rewards/rejected": -13.515631675720215, + "step": 135 + }, + { + "epoch": 0.307605315238903, + "grad_norm": 123.3145878922545, + "learning_rate": 7.006924387208452e-07, + "logits/chosen": -16.337995529174805, + "logits/rejected": -16.18612289428711, + "logps/chosen": -0.7492311596870422, + "logps/rejected": -0.9364847540855408, + "loss": 3.8638, + "rewards/accuracies": 0.6875, + "rewards/chosen": -7.492311000823975, + "rewards/margins": 1.8725361824035645, + "rewards/rejected": -9.364849090576172, + "step": 136 + }, + { + "epoch": 0.30986711902742436, + "grad_norm": 110.47458672576012, + "learning_rate": 6.985956046059904e-07, + "logits/chosen": -15.281987190246582, + "logits/rejected": -15.222082138061523, + "logps/chosen": -0.8718824982643127, + "logps/rejected": -1.2887756824493408, + "loss": 3.8853, + 
"rewards/accuracies": 0.75, + "rewards/chosen": -8.718825340270996, + "rewards/margins": 4.168931007385254, + "rewards/rejected": -12.887757301330566, + "step": 137 + }, + { + "epoch": 0.31212892281594573, + "grad_norm": 93.15535964190208, + "learning_rate": 6.964800722878945e-07, + "logits/chosen": -16.852909088134766, + "logits/rejected": -16.60708999633789, + "logps/chosen": -0.8743470907211304, + "logps/rejected": -1.0895050764083862, + "loss": 3.3061, + "rewards/accuracies": 0.59375, + "rewards/chosen": -8.743470191955566, + "rewards/margins": 2.151580810546875, + "rewards/rejected": -10.895051002502441, + "step": 138 + }, + { + "epoch": 0.31439072660446704, + "grad_norm": 116.25516493400698, + "learning_rate": 6.943459742422287e-07, + "logits/chosen": -16.162519454956055, + "logits/rejected": -15.761104583740234, + "logps/chosen": -1.1220320463180542, + "logps/rejected": -1.5008246898651123, + "loss": 3.8586, + "rewards/accuracies": 0.71875, + "rewards/chosen": -11.220320701599121, + "rewards/margins": 3.7879250049591064, + "rewards/rejected": -15.008245468139648, + "step": 139 + }, + { + "epoch": 0.3166525303929884, + "grad_norm": 115.45171227467374, + "learning_rate": 6.921934441072597e-07, + "logits/chosen": -17.508764266967773, + "logits/rejected": -17.509645462036133, + "logps/chosen": -1.0907777547836304, + "logps/rejected": -1.355396032333374, + "loss": 3.8912, + "rewards/accuracies": 0.65625, + "rewards/chosen": -10.90777587890625, + "rewards/margins": 2.646184206008911, + "rewards/rejected": -13.553960800170898, + "step": 140 + }, + { + "epoch": 0.3189143341815098, + "grad_norm": 145.15074217143749, + "learning_rate": 6.900226166754807e-07, + "logits/chosen": -16.634740829467773, + "logits/rejected": -16.976970672607422, + "logps/chosen": -1.3348404169082642, + "logps/rejected": -1.4817439317703247, + "loss": 4.6661, + "rewards/accuracies": 0.78125, + "rewards/chosen": -13.348404884338379, + "rewards/margins": 1.469035029411316, + "rewards/rejected": -14.817439079284668, + "step": 141 + }, + { + "epoch": 0.3211761379700311, + "grad_norm": 108.45634524328419, + "learning_rate": 6.8783362788517e-07, + "logits/chosen": -16.96560287475586, + "logits/rejected": -16.87692642211914, + "logps/chosen": -1.285522699356079, + "logps/rejected": -1.6592097282409668, + "loss": 3.4579, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.855226516723633, + "rewards/margins": 3.7368712425231934, + "rewards/rejected": -16.592098236083984, + "step": 142 + }, + { + "epoch": 0.32343794175855245, + "grad_norm": 108.50675097342489, + "learning_rate": 6.856266148118796e-07, + "logits/chosen": -16.803754806518555, + "logits/rejected": -17.20581817626953, + "logps/chosen": -1.0886934995651245, + "logps/rejected": -1.5176775455474854, + "loss": 3.3977, + "rewards/accuracies": 0.8125, + "rewards/chosen": -10.886935234069824, + "rewards/margins": 4.2898406982421875, + "rewards/rejected": -15.176775932312012, + "step": 143 + }, + { + "epoch": 0.3256997455470738, + "grad_norm": 135.30929548671452, + "learning_rate": 6.834017156598512e-07, + "logits/chosen": -17.159934997558594, + "logits/rejected": -16.915081024169922, + "logps/chosen": -1.0447005033493042, + "logps/rejected": -1.5800331830978394, + "loss": 3.5652, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.447005271911621, + "rewards/margins": 5.353327751159668, + "rewards/rejected": -15.800333023071289, + "step": 144 + }, + { + "epoch": 0.3279615493355951, + "grad_norm": 92.79875046989964, + "learning_rate": 6.811590697533607e-07, + 
"logits/chosen": -18.941160202026367, + "logits/rejected": -18.880695343017578, + "logps/chosen": -1.2164652347564697, + "logps/rejected": -1.3915354013442993, + "loss": 3.8421, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.164650917053223, + "rewards/margins": 1.7507033348083496, + "rewards/rejected": -13.915353775024414, + "step": 145 + }, + { + "epoch": 0.3302233531241165, + "grad_norm": 152.87216949016505, + "learning_rate": 6.788988175279951e-07, + "logits/chosen": -17.32018280029297, + "logits/rejected": -17.33584976196289, + "logps/chosen": -1.1823511123657227, + "logps/rejected": -1.55608069896698, + "loss": 3.9364, + "rewards/accuracies": 0.71875, + "rewards/chosen": -11.823511123657227, + "rewards/margins": 3.7372941970825195, + "rewards/rejected": -15.560805320739746, + "step": 146 + }, + { + "epoch": 0.3324851569126378, + "grad_norm": 112.74886884591804, + "learning_rate": 6.766211005218577e-07, + "logits/chosen": -17.034011840820312, + "logits/rejected": -16.896516799926758, + "logps/chosen": -1.0992332696914673, + "logps/rejected": -1.6042219400405884, + "loss": 3.2047, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.992332458496094, + "rewards/margins": 5.049887180328369, + "rewards/rejected": -16.042219161987305, + "step": 147 + }, + { + "epoch": 0.33474696070115917, + "grad_norm": 102.37494798795927, + "learning_rate": 6.743260613667047e-07, + "logits/chosen": -20.16105079650879, + "logits/rejected": -20.097129821777344, + "logps/chosen": -1.3072175979614258, + "logps/rejected": -1.7791482210159302, + "loss": 3.6159, + "rewards/accuracies": 0.84375, + "rewards/chosen": -13.072174072265625, + "rewards/margins": 4.719306945800781, + "rewards/rejected": -17.79148292541504, + "step": 148 + }, + { + "epoch": 0.33700876448968053, + "grad_norm": 92.8889637457248, + "learning_rate": 6.720138437790139e-07, + "logits/chosen": -18.851449966430664, + "logits/rejected": -19.006174087524414, + "logps/chosen": -1.1900596618652344, + "logps/rejected": -1.6059695482254028, + "loss": 3.0262, + "rewards/accuracies": 0.71875, + "rewards/chosen": -11.900595664978027, + "rewards/margins": 4.1590986251831055, + "rewards/rejected": -16.059694290161133, + "step": 149 + }, + { + "epoch": 0.33927056827820185, + "grad_norm": 128.41082842918968, + "learning_rate": 6.696845925509848e-07, + "logits/chosen": -17.904184341430664, + "logits/rejected": -17.44746208190918, + "logps/chosen": -1.3113856315612793, + "logps/rejected": -1.5605354309082031, + "loss": 3.9659, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.113856315612793, + "rewards/margins": 2.491497755050659, + "rewards/rejected": -15.605354309082031, + "step": 150 + }, + { + "epoch": 0.3415323720667232, + "grad_norm": 120.26296110157324, + "learning_rate": 6.673384535414718e-07, + "logits/chosen": -18.292760848999023, + "logits/rejected": -18.07993507385254, + "logps/chosen": -1.2153687477111816, + "logps/rejected": -1.442406415939331, + "loss": 4.4772, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.153687477111816, + "rewards/margins": 2.270376205444336, + "rewards/rejected": -14.424064636230469, + "step": 151 + }, + { + "epoch": 0.3437941758552446, + "grad_norm": 108.61106396576423, + "learning_rate": 6.649755736668511e-07, + "logits/chosen": -16.841121673583984, + "logits/rejected": -16.501068115234375, + "logps/chosen": -1.1176737546920776, + "logps/rejected": -1.6219063997268677, + "loss": 2.9905, + "rewards/accuracies": 0.90625, + "rewards/chosen": -11.176735877990723, + "rewards/margins": 5.042326927185059, + 
"rewards/rejected": -16.21906280517578, + "step": 152 + }, + { + "epoch": 0.3460559796437659, + "grad_norm": 128.2647564085794, + "learning_rate": 6.625961008918192e-07, + "logits/chosen": -18.715444564819336, + "logits/rejected": -18.511932373046875, + "logps/chosen": -1.2932363748550415, + "logps/rejected": -1.488201379776001, + "loss": 3.3092, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.932364463806152, + "rewards/margins": 1.9496493339538574, + "rewards/rejected": -14.882014274597168, + "step": 153 + }, + { + "epoch": 0.34831778343228725, + "grad_norm": 119.57670893186517, + "learning_rate": 6.602001842201289e-07, + "logits/chosen": -16.945566177368164, + "logits/rejected": -17.048381805419922, + "logps/chosen": -1.1972765922546387, + "logps/rejected": -1.4838308095932007, + "loss": 3.6387, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.972765922546387, + "rewards/margins": 2.8655428886413574, + "rewards/rejected": -14.838308334350586, + "step": 154 + }, + { + "epoch": 0.3505795872208086, + "grad_norm": 133.44404947093932, + "learning_rate": 6.577879736852571e-07, + "logits/chosen": -17.318836212158203, + "logits/rejected": -17.2701473236084, + "logps/chosen": -1.3334428071975708, + "logps/rejected": -1.554374098777771, + "loss": 3.9227, + "rewards/accuracies": 0.71875, + "rewards/chosen": -13.334427833557129, + "rewards/margins": 2.2093122005462646, + "rewards/rejected": -15.543739318847656, + "step": 155 + }, + { + "epoch": 0.35284139100932993, + "grad_norm": 108.58327489587623, + "learning_rate": 6.553596203410112e-07, + "logits/chosen": -16.674957275390625, + "logits/rejected": -16.426877975463867, + "logps/chosen": -1.0741811990737915, + "logps/rejected": -1.548369288444519, + "loss": 3.0135, + "rewards/accuracies": 0.84375, + "rewards/chosen": -10.741811752319336, + "rewards/margins": 4.741880893707275, + "rewards/rejected": -15.483692169189453, + "step": 156 + }, + { + "epoch": 0.3551031947978513, + "grad_norm": 114.21004986072782, + "learning_rate": 6.529152762520688e-07, + "logits/chosen": -18.453733444213867, + "logits/rejected": -18.391489028930664, + "logps/chosen": -1.3322038650512695, + "logps/rejected": -1.5323253870010376, + "loss": 3.8828, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.322039604187012, + "rewards/margins": 2.0012147426605225, + "rewards/rejected": -15.323253631591797, + "step": 157 + }, + { + "epoch": 0.3573649985863726, + "grad_norm": 140.31074175994047, + "learning_rate": 6.504550944844558e-07, + "logits/chosen": -16.778514862060547, + "logits/rejected": -16.645111083984375, + "logps/chosen": -1.3454078435897827, + "logps/rejected": -1.8546462059020996, + "loss": 3.6056, + "rewards/accuracies": 0.8125, + "rewards/chosen": -13.454076766967773, + "rewards/margins": 5.092383861541748, + "rewards/rejected": -18.546463012695312, + "step": 158 + }, + { + "epoch": 0.359626802374894, + "grad_norm": 137.59691231656896, + "learning_rate": 6.479792290959613e-07, + "logits/chosen": -16.80532455444336, + "logits/rejected": -16.812410354614258, + "logps/chosen": -1.4085781574249268, + "logps/rejected": -1.807809591293335, + "loss": 3.3238, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.085782051086426, + "rewards/margins": 3.9923133850097656, + "rewards/rejected": -18.078096389770508, + "step": 159 + }, + { + "epoch": 0.36188860616341534, + "grad_norm": 122.31193582134684, + "learning_rate": 6.454878351264906e-07, + "logits/chosen": -17.32520866394043, + "logits/rejected": -17.327320098876953, + "logps/chosen": -1.2008891105651855, 
+ "logps/rejected": -1.3473906517028809, + "loss": 3.8266, + "rewards/accuracies": 0.65625, + "rewards/chosen": -12.008890151977539, + "rewards/margins": 1.4650166034698486, + "rewards/rejected": -13.473907470703125, + "step": 160 + }, + { + "epoch": 0.36415040995193665, + "grad_norm": 131.3884611405496, + "learning_rate": 6.429810685883565e-07, + "logits/chosen": -16.57655143737793, + "logits/rejected": -16.592082977294922, + "logps/chosen": -1.295839786529541, + "logps/rejected": -1.7100400924682617, + "loss": 3.3158, + "rewards/accuracies": 0.71875, + "rewards/chosen": -12.958398818969727, + "rewards/margins": 4.142002582550049, + "rewards/rejected": -17.100400924682617, + "step": 161 + }, + { + "epoch": 0.366412213740458, + "grad_norm": 131.23411742145908, + "learning_rate": 6.404590864565088e-07, + "logits/chosen": -17.949878692626953, + "logits/rejected": -18.036605834960938, + "logps/chosen": -1.171650767326355, + "logps/rejected": -1.3193507194519043, + "loss": 3.9769, + "rewards/accuracies": 0.53125, + "rewards/chosen": -11.716507911682129, + "rewards/margins": 1.4770005941390991, + "rewards/rejected": -13.19350814819336, + "step": 162 + }, + { + "epoch": 0.3686740175289794, + "grad_norm": 118.65175820446706, + "learning_rate": 6.379220466587063e-07, + "logits/chosen": -19.885251998901367, + "logits/rejected": -19.330791473388672, + "logps/chosen": -1.2915012836456299, + "logps/rejected": -1.452092170715332, + "loss": 3.4104, + "rewards/accuracies": 0.71875, + "rewards/chosen": -12.91501235961914, + "rewards/margins": 1.6059094667434692, + "rewards/rejected": -14.520920753479004, + "step": 163 + }, + { + "epoch": 0.3709358213175007, + "grad_norm": 115.53093238478114, + "learning_rate": 6.353701080656254e-07, + "logits/chosen": -18.273351669311523, + "logits/rejected": -18.36724090576172, + "logps/chosen": -1.3921793699264526, + "logps/rejected": -1.6417995691299438, + "loss": 3.3543, + "rewards/accuracies": 0.6875, + "rewards/chosen": -13.921795845031738, + "rewards/margins": 2.496201276779175, + "rewards/rejected": -16.41799545288086, + "step": 164 + }, + { + "epoch": 0.37319762510602206, + "grad_norm": 124.5033686839827, + "learning_rate": 6.32803430480913e-07, + "logits/chosen": -18.828781127929688, + "logits/rejected": -18.2823429107666, + "logps/chosen": -1.3336889743804932, + "logps/rejected": -1.6634752750396729, + "loss": 3.7914, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.336889266967773, + "rewards/margins": 3.297863721847534, + "rewards/rejected": -16.63475227355957, + "step": 165 + }, + { + "epoch": 0.3754594288945434, + "grad_norm": 147.97213663568766, + "learning_rate": 6.302221746311782e-07, + "logits/chosen": -16.221012115478516, + "logits/rejected": -15.638816833496094, + "logps/chosen": -1.24800443649292, + "logps/rejected": -1.5752636194229126, + "loss": 3.9877, + "rewards/accuracies": 0.65625, + "rewards/chosen": -12.480045318603516, + "rewards/margins": 3.272590398788452, + "rewards/rejected": -15.752635955810547, + "step": 166 + }, + { + "epoch": 0.37772123268306473, + "grad_norm": 117.72793765671894, + "learning_rate": 6.276265021559288e-07, + "logits/chosen": -17.692129135131836, + "logits/rejected": -17.678510665893555, + "logps/chosen": -1.4085066318511963, + "logps/rejected": -1.5413269996643066, + "loss": 3.7453, + "rewards/accuracies": 0.59375, + "rewards/chosen": -14.085065841674805, + "rewards/margins": 1.3282032012939453, + "rewards/rejected": -15.41326904296875, + "step": 167 + }, + { + "epoch": 0.3799830364715861, + "grad_norm": 
123.6576549431064, + "learning_rate": 6.250165755974487e-07, + "logits/chosen": -18.768051147460938, + "logits/rejected": -18.692035675048828, + "logps/chosen": -1.2646162509918213, + "logps/rejected": -1.4007298946380615, + "loss": 3.4565, + "rewards/accuracies": 0.59375, + "rewards/chosen": -12.646160125732422, + "rewards/margins": 1.361138939857483, + "rewards/rejected": -14.00730037689209, + "step": 168 + }, + { + "epoch": 0.3822448402601074, + "grad_norm": 111.39581567132997, + "learning_rate": 6.223925583906192e-07, + "logits/chosen": -18.353418350219727, + "logits/rejected": -17.735267639160156, + "logps/chosen": -1.327247142791748, + "logps/rejected": -1.6749954223632812, + "loss": 3.351, + "rewards/accuracies": 0.65625, + "rewards/chosen": -13.272473335266113, + "rewards/margins": 3.477482318878174, + "rewards/rejected": -16.749956130981445, + "step": 169 + }, + { + "epoch": 0.3845066440486288, + "grad_norm": 115.58098332357874, + "learning_rate": 6.19754614852685e-07, + "logits/chosen": -17.524866104125977, + "logits/rejected": -17.590625762939453, + "logps/chosen": -1.1806279420852661, + "logps/rejected": -1.5952324867248535, + "loss": 3.6054, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.806280136108398, + "rewards/margins": 4.146044731140137, + "rewards/rejected": -15.952325820922852, + "step": 170 + }, + { + "epoch": 0.38676844783715014, + "grad_norm": 107.33538738128671, + "learning_rate": 6.171029101729644e-07, + "logits/chosen": -17.01272964477539, + "logits/rejected": -17.01720428466797, + "logps/chosen": -1.2651264667510986, + "logps/rejected": -1.5476595163345337, + "loss": 3.5012, + "rewards/accuracies": 0.65625, + "rewards/chosen": -12.651265144348145, + "rewards/margins": 2.8253297805786133, + "rewards/rejected": -15.476594924926758, + "step": 171 + }, + { + "epoch": 0.38903025162567145, + "grad_norm": 125.94524393171766, + "learning_rate": 6.144376104025055e-07, + "logits/chosen": -16.98217010498047, + "logits/rejected": -16.957050323486328, + "logps/chosen": -1.1778655052185059, + "logps/rejected": -1.4646776914596558, + "loss": 3.2286, + "rewards/accuracies": 0.71875, + "rewards/chosen": -11.778654098510742, + "rewards/margins": 2.8681228160858154, + "rewards/rejected": -14.64677619934082, + "step": 172 + }, + { + "epoch": 0.3912920554141928, + "grad_norm": 115.30228389834534, + "learning_rate": 6.117588824436873e-07, + "logits/chosen": -17.979293823242188, + "logits/rejected": -17.900651931762695, + "logps/chosen": -1.158996820449829, + "logps/rejected": -1.3833591938018799, + "loss": 3.6577, + "rewards/accuracies": 0.59375, + "rewards/chosen": -11.589967727661133, + "rewards/margins": 2.243624448776245, + "rewards/rejected": -13.83359146118164, + "step": 173 + }, + { + "epoch": 0.3935538592027142, + "grad_norm": 136.92412625465224, + "learning_rate": 6.090668940397688e-07, + "logits/chosen": -17.41019058227539, + "logits/rejected": -17.082996368408203, + "logps/chosen": -1.200378179550171, + "logps/rejected": -1.5303970575332642, + "loss": 3.3817, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.003782272338867, + "rewards/margins": 3.3001890182495117, + "rewards/rejected": -15.303971290588379, + "step": 174 + }, + { + "epoch": 0.3958156629912355, + "grad_norm": 97.25464380215708, + "learning_rate": 6.063618137643844e-07, + "logits/chosen": -17.9305419921875, + "logits/rejected": -17.675626754760742, + "logps/chosen": -1.0930852890014648, + "logps/rejected": -1.3468796014785767, + "loss": 3.0673, + "rewards/accuracies": 0.65625, + "rewards/chosen": 
-10.930851936340332, + "rewards/margins": 2.5379440784454346, + "rewards/rejected": -13.468796730041504, + "step": 175 + }, + { + "epoch": 0.39807746677975686, + "grad_norm": 109.33547647771188, + "learning_rate": 6.03643811010988e-07, + "logits/chosen": -18.435937881469727, + "logits/rejected": -18.244789123535156, + "logps/chosen": -1.421694278717041, + "logps/rejected": -1.6195881366729736, + "loss": 3.4374, + "rewards/accuracies": 0.625, + "rewards/chosen": -14.216943740844727, + "rewards/margins": 1.9789376258850098, + "rewards/rejected": -16.195880889892578, + "step": 176 + }, + { + "epoch": 0.4003392705682782, + "grad_norm": 124.54512017176646, + "learning_rate": 6.009130559822453e-07, + "logits/chosen": -18.292753219604492, + "logits/rejected": -18.013795852661133, + "logps/chosen": -1.2830564975738525, + "logps/rejected": -1.6480597257614136, + "loss": 3.5887, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.830564498901367, + "rewards/margins": 3.6500320434570312, + "rewards/rejected": -16.4805965423584, + "step": 177 + }, + { + "epoch": 0.40260107435679954, + "grad_norm": 117.11523823927375, + "learning_rate": 5.981697196793758e-07, + "logits/chosen": -17.92546844482422, + "logits/rejected": -17.709861755371094, + "logps/chosen": -1.328133225440979, + "logps/rejected": -1.6237107515335083, + "loss": 3.5286, + "rewards/accuracies": 0.71875, + "rewards/chosen": -13.281332015991211, + "rewards/margins": 2.9557762145996094, + "rewards/rejected": -16.23710823059082, + "step": 178 + }, + { + "epoch": 0.4048628781453209, + "grad_norm": 135.21248374147646, + "learning_rate": 5.954139738914446e-07, + "logits/chosen": -16.00653648376465, + "logits/rejected": -16.205612182617188, + "logps/chosen": -1.5146088600158691, + "logps/rejected": -1.722532868385315, + "loss": 3.3813, + "rewards/accuracies": 0.78125, + "rewards/chosen": -15.146087646484375, + "rewards/margins": 2.079242467880249, + "rewards/rejected": -17.225330352783203, + "step": 179 + }, + { + "epoch": 0.4071246819338422, + "grad_norm": 118.75409568384903, + "learning_rate": 5.92645991184605e-07, + "logits/chosen": -18.836807250976562, + "logits/rejected": -18.096431732177734, + "logps/chosen": -1.328464388847351, + "logps/rejected": -1.7455822229385376, + "loss": 3.0959, + "rewards/accuracies": 0.71875, + "rewards/chosen": -13.284643173217773, + "rewards/margins": 4.171177387237549, + "rewards/rejected": -17.455821990966797, + "step": 180 + }, + { + "epoch": 0.4093864857223636, + "grad_norm": 149.61294643110665, + "learning_rate": 5.898659448912917e-07, + "logits/chosen": -19.704021453857422, + "logits/rejected": -19.59141731262207, + "logps/chosen": -1.322415828704834, + "logps/rejected": -1.6818199157714844, + "loss": 3.99, + "rewards/accuracies": 0.65625, + "rewards/chosen": -13.224160194396973, + "rewards/margins": 3.5940399169921875, + "rewards/rejected": -16.818199157714844, + "step": 181 + }, + { + "epoch": 0.41164828951088495, + "grad_norm": 122.65957791889335, + "learning_rate": 5.870740090993676e-07, + "logits/chosen": -18.265836715698242, + "logits/rejected": -18.42105484008789, + "logps/chosen": -1.5347073078155518, + "logps/rejected": -1.7209949493408203, + "loss": 3.171, + "rewards/accuracies": 0.625, + "rewards/chosen": -15.347073554992676, + "rewards/margins": 1.8628755807876587, + "rewards/rejected": -17.209949493408203, + "step": 182 + }, + { + "epoch": 0.41391009329940626, + "grad_norm": 135.01368423233092, + "learning_rate": 5.842703586412214e-07, + "logits/chosen": -18.84733009338379, + 
"logits/rejected": -18.886695861816406, + "logps/chosen": -1.4349242448806763, + "logps/rejected": -1.703200340270996, + "loss": 3.7337, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.3492431640625, + "rewards/margins": 2.6827609539031982, + "rewards/rejected": -17.03200340270996, + "step": 183 + }, + { + "epoch": 0.4161718970879276, + "grad_norm": 106.57687497552105, + "learning_rate": 5.814551690828203e-07, + "logits/chosen": -18.91258430480957, + "logits/rejected": -18.3468017578125, + "logps/chosen": -1.2373554706573486, + "logps/rejected": -1.6082764863967896, + "loss": 3.0833, + "rewards/accuracies": 0.78125, + "rewards/chosen": -12.373554229736328, + "rewards/margins": 3.709210157394409, + "rewards/rejected": -16.082765579223633, + "step": 184 + }, + { + "epoch": 0.418433700876449, + "grad_norm": 139.70919536594133, + "learning_rate": 5.786286167127155e-07, + "logits/chosen": -18.484155654907227, + "logits/rejected": -18.344846725463867, + "logps/chosen": -1.4353551864624023, + "logps/rejected": -1.933018684387207, + "loss": 3.4633, + "rewards/accuracies": 0.8125, + "rewards/chosen": -14.353551864624023, + "rewards/margins": 4.976635932922363, + "rewards/rejected": -19.33018684387207, + "step": 185 + }, + { + "epoch": 0.4206955046649703, + "grad_norm": 128.37055319468598, + "learning_rate": 5.757908785310031e-07, + "logits/chosen": -17.390743255615234, + "logits/rejected": -17.32730484008789, + "logps/chosen": -1.4048118591308594, + "logps/rejected": -1.8442890644073486, + "loss": 3.7353, + "rewards/accuracies": 0.78125, + "rewards/chosen": -14.048118591308594, + "rewards/margins": 4.394770622253418, + "rewards/rejected": -18.442888259887695, + "step": 186 + }, + { + "epoch": 0.42295730845349166, + "grad_norm": 125.14297515296605, + "learning_rate": 5.729421322382399e-07, + "logits/chosen": -16.7410888671875, + "logits/rejected": -16.759994506835938, + "logps/chosen": -1.1190869808197021, + "logps/rejected": -1.3602699041366577, + "loss": 3.4939, + "rewards/accuracies": 0.71875, + "rewards/chosen": -11.19087028503418, + "rewards/margins": 2.4118287563323975, + "rewards/rejected": -13.602697372436523, + "step": 187 + }, + { + "epoch": 0.42521911224201303, + "grad_norm": 121.636291085123, + "learning_rate": 5.700825562243163e-07, + "logits/chosen": -17.965713500976562, + "logits/rejected": -17.82717514038086, + "logps/chosen": -1.3181742429733276, + "logps/rejected": -1.5977309942245483, + "loss": 3.2506, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.181741714477539, + "rewards/margins": 2.7955687046051025, + "rewards/rejected": -15.977310180664062, + "step": 188 + }, + { + "epoch": 0.42748091603053434, + "grad_norm": 115.58317098156645, + "learning_rate": 5.672123295572854e-07, + "logits/chosen": -16.11078453063965, + "logits/rejected": -15.989376068115234, + "logps/chosen": -1.318684458732605, + "logps/rejected": -1.7406071424484253, + "loss": 3.2275, + "rewards/accuracies": 0.71875, + "rewards/chosen": -13.186845779418945, + "rewards/margins": 4.219226837158203, + "rewards/rejected": -17.40607261657715, + "step": 189 + }, + { + "epoch": 0.4297427198190557, + "grad_norm": 110.76706668825072, + "learning_rate": 5.643316319721487e-07, + "logits/chosen": -21.12446403503418, + "logits/rejected": -21.084503173828125, + "logps/chosen": -1.7526358366012573, + "logps/rejected": -1.8315396308898926, + "loss": 3.7724, + "rewards/accuracies": 0.59375, + "rewards/chosen": -17.52635955810547, + "rewards/margins": 0.789039134979248, + "rewards/rejected": -18.315397262573242, + 
"step": 190 + }, + { + "epoch": 0.432004523607577, + "grad_norm": 128.02917137545202, + "learning_rate": 5.614406438596026e-07, + "logits/chosen": -18.617406845092773, + "logits/rejected": -18.170780181884766, + "logps/chosen": -1.6118402481079102, + "logps/rejected": -1.8639785051345825, + "loss": 3.8619, + "rewards/accuracies": 0.65625, + "rewards/chosen": -16.1184024810791, + "rewards/margins": 2.521383285522461, + "rewards/rejected": -18.639785766601562, + "step": 191 + }, + { + "epoch": 0.4342663273960984, + "grad_norm": 128.62006828713473, + "learning_rate": 5.585395462547406e-07, + "logits/chosen": -17.90593719482422, + "logits/rejected": -18.07216453552246, + "logps/chosen": -1.6061056852340698, + "logps/rejected": -1.8645837306976318, + "loss": 3.6755, + "rewards/accuracies": 0.65625, + "rewards/chosen": -16.06105613708496, + "rewards/margins": 2.5847792625427246, + "rewards/rejected": -18.645837783813477, + "step": 192 + }, + { + "epoch": 0.43652813118461975, + "grad_norm": 115.59247682163482, + "learning_rate": 5.55628520825718e-07, + "logits/chosen": -17.22564125061035, + "logits/rejected": -17.25448989868164, + "logps/chosen": -1.4208124876022339, + "logps/rejected": -1.9312503337860107, + "loss": 3.3126, + "rewards/accuracies": 0.8125, + "rewards/chosen": -14.208124160766602, + "rewards/margins": 5.104379177093506, + "rewards/rejected": -19.312503814697266, + "step": 193 + }, + { + "epoch": 0.43878993497314106, + "grad_norm": 125.18412016900433, + "learning_rate": 5.527077498623752e-07, + "logits/chosen": -16.629623413085938, + "logits/rejected": -16.644580841064453, + "logps/chosen": -1.4364906549453735, + "logps/rejected": -1.7086538076400757, + "loss": 3.5968, + "rewards/accuracies": 0.6875, + "rewards/chosen": -14.364906311035156, + "rewards/margins": 2.7216320037841797, + "rewards/rejected": -17.086536407470703, + "step": 194 + }, + { + "epoch": 0.4410517387616624, + "grad_norm": 95.71934222963769, + "learning_rate": 5.497774162648228e-07, + "logits/chosen": -17.9370059967041, + "logits/rejected": -17.442899703979492, + "logps/chosen": -1.521501064300537, + "logps/rejected": -2.050462484359741, + "loss": 2.8302, + "rewards/accuracies": 0.8125, + "rewards/chosen": -15.215010643005371, + "rewards/margins": 5.289614677429199, + "rewards/rejected": -20.50462532043457, + "step": 195 + }, + { + "epoch": 0.4433135425501838, + "grad_norm": 118.37601502769598, + "learning_rate": 5.468377035319882e-07, + "logits/chosen": -18.176965713500977, + "logits/rejected": -17.80450439453125, + "logps/chosen": -1.5505520105361938, + "logps/rejected": -2.088866949081421, + "loss": 3.251, + "rewards/accuracies": 0.6875, + "rewards/chosen": -15.505517959594727, + "rewards/margins": 5.383152008056641, + "rewards/rejected": -20.888671875, + "step": 196 + }, + { + "epoch": 0.4455753463387051, + "grad_norm": 114.61725176586181, + "learning_rate": 5.438887957501248e-07, + "logits/chosen": -18.173625946044922, + "logits/rejected": -18.10104751586914, + "logps/chosen": -1.6463440656661987, + "logps/rejected": -1.9076793193817139, + "loss": 3.6095, + "rewards/accuracies": 0.71875, + "rewards/chosen": -16.46343994140625, + "rewards/margins": 2.6133527755737305, + "rewards/rejected": -19.076791763305664, + "step": 197 + }, + { + "epoch": 0.44783715012722647, + "grad_norm": 131.97444146862202, + "learning_rate": 5.409308775812844e-07, + "logits/chosen": -17.92854118347168, + "logits/rejected": -18.174327850341797, + "logps/chosen": -1.7027937173843384, + "logps/rejected": -1.9399760961532593, + "loss": 
3.6652, + "rewards/accuracies": 0.65625, + "rewards/chosen": -17.027938842773438, + "rewards/margins": 2.3718223571777344, + "rewards/rejected": -19.39975929260254, + "step": 198 + }, + { + "epoch": 0.45009895391574783, + "grad_norm": 112.00245200831505, + "learning_rate": 5.379641342517541e-07, + "logits/chosen": -17.90815544128418, + "logits/rejected": -17.855358123779297, + "logps/chosen": -1.369492530822754, + "logps/rejected": -1.7647912502288818, + "loss": 3.6049, + "rewards/accuracies": 0.65625, + "rewards/chosen": -13.694926261901855, + "rewards/margins": 3.9529881477355957, + "rewards/rejected": -17.64791488647461, + "step": 199 + }, + { + "epoch": 0.45236075770426915, + "grad_norm": 110.39676860636777, + "learning_rate": 5.349887515404564e-07, + "logits/chosen": -19.14180564880371, + "logits/rejected": -18.728225708007812, + "logps/chosen": -1.567470669746399, + "logps/rejected": -1.8445343971252441, + "loss": 3.0688, + "rewards/accuracies": 0.75, + "rewards/chosen": -15.674705505371094, + "rewards/margins": 2.7706375122070312, + "rewards/rejected": -18.445341110229492, + "step": 200 + }, + { + "epoch": 0.4546225614927905, + "grad_norm": 112.66708655546361, + "learning_rate": 5.320049157673163e-07, + "logits/chosen": -19.58551597595215, + "logits/rejected": -19.09307289123535, + "logps/chosen": -1.4883310794830322, + "logps/rejected": -1.705352783203125, + "loss": 3.2351, + "rewards/accuracies": 0.625, + "rewards/chosen": -14.88331127166748, + "rewards/margins": 2.1702170372009277, + "rewards/rejected": -17.05352783203125, + "step": 201 + }, + { + "epoch": 0.4568843652813118, + "grad_norm": 140.2319323230872, + "learning_rate": 5.290128137815938e-07, + "logits/chosen": -18.30789566040039, + "logits/rejected": -18.139724731445312, + "logps/chosen": -1.4757699966430664, + "logps/rejected": -1.8749364614486694, + "loss": 3.259, + "rewards/accuracies": 0.78125, + "rewards/chosen": -14.75770092010498, + "rewards/margins": 3.991664409637451, + "rewards/rejected": -18.749366760253906, + "step": 202 + }, + { + "epoch": 0.4591461690698332, + "grad_norm": 124.01739804659297, + "learning_rate": 5.260126329501828e-07, + "logits/chosen": -18.476545333862305, + "logits/rejected": -18.295202255249023, + "logps/chosen": -1.3175721168518066, + "logps/rejected": -1.8804268836975098, + "loss": 2.8801, + "rewards/accuracies": 0.84375, + "rewards/chosen": -13.175721168518066, + "rewards/margins": 5.628549098968506, + "rewards/rejected": -18.804269790649414, + "step": 203 + }, + { + "epoch": 0.46140797285835455, + "grad_norm": 110.8800697134872, + "learning_rate": 5.230045611458789e-07, + "logits/chosen": -19.596027374267578, + "logits/rejected": -19.41700553894043, + "logps/chosen": -1.2929422855377197, + "logps/rejected": -1.7962149381637573, + "loss": 3.0127, + "rewards/accuracies": 0.8125, + "rewards/chosen": -12.929424285888672, + "rewards/margins": 5.032725811004639, + "rewards/rejected": -17.962148666381836, + "step": 204 + }, + { + "epoch": 0.46366977664687586, + "grad_norm": 130.9926070201208, + "learning_rate": 5.199887867356143e-07, + "logits/chosen": -18.125207901000977, + "logits/rejected": -18.14871597290039, + "logps/chosen": -1.4900728464126587, + "logps/rejected": -1.9675356149673462, + "loss": 3.1097, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.900728225708008, + "rewards/margins": 4.774627685546875, + "rewards/rejected": -19.675355911254883, + "step": 205 + }, + { + "epoch": 0.46593158043539723, + "grad_norm": 120.13166100619164, + "learning_rate": 5.16965498568662e-07, + 
"logits/chosen": -18.744281768798828, + "logits/rejected": -18.2139835357666, + "logps/chosen": -1.6062824726104736, + "logps/rejected": -2.2554736137390137, + "loss": 3.0776, + "rewards/accuracies": 0.84375, + "rewards/chosen": -16.062824249267578, + "rewards/margins": 6.491910934448242, + "rewards/rejected": -22.554733276367188, + "step": 206 + }, + { + "epoch": 0.4681933842239186, + "grad_norm": 120.48417226561662, + "learning_rate": 5.139348859648098e-07, + "logits/chosen": -18.755203247070312, + "logits/rejected": -18.599624633789062, + "logps/chosen": -1.2531086206436157, + "logps/rejected": -1.658897042274475, + "loss": 3.4743, + "rewards/accuracies": 0.71875, + "rewards/chosen": -12.531085968017578, + "rewards/margins": 4.05788516998291, + "rewards/rejected": -16.588970184326172, + "step": 207 + }, + { + "epoch": 0.4704551880124399, + "grad_norm": 108.6230474054679, + "learning_rate": 5.10897138702506e-07, + "logits/chosen": -19.518808364868164, + "logits/rejected": -19.44478416442871, + "logps/chosen": -1.500132441520691, + "logps/rejected": -1.9223947525024414, + "loss": 2.9948, + "rewards/accuracies": 0.75, + "rewards/chosen": -15.001323699951172, + "rewards/margins": 4.2226243019104, + "rewards/rejected": -19.223947525024414, + "step": 208 + }, + { + "epoch": 0.4727169918009613, + "grad_norm": 137.90239347147582, + "learning_rate": 5.078524470069743e-07, + "logits/chosen": -20.06089973449707, + "logits/rejected": -19.945919036865234, + "logps/chosen": -1.5609779357910156, + "logps/rejected": -2.011448860168457, + "loss": 3.3272, + "rewards/accuracies": 0.78125, + "rewards/chosen": -15.609781265258789, + "rewards/margins": 4.504709720611572, + "rewards/rejected": -20.114490509033203, + "step": 209 + }, + { + "epoch": 0.47497879558948264, + "grad_norm": 122.82261924752474, + "learning_rate": 5.048010015383021e-07, + "logits/chosen": -20.49317169189453, + "logits/rejected": -20.084096908569336, + "logps/chosen": -1.8195672035217285, + "logps/rejected": -2.4391162395477295, + "loss": 3.0204, + "rewards/accuracies": 0.875, + "rewards/chosen": -18.1956729888916, + "rewards/margins": 6.195489406585693, + "rewards/rejected": -24.39116096496582, + "step": 210 + }, + { + "epoch": 0.47724059937800395, + "grad_norm": 133.40914227876348, + "learning_rate": 5.01742993379502e-07, + "logits/chosen": -20.487716674804688, + "logits/rejected": -20.366500854492188, + "logps/chosen": -1.6833590269088745, + "logps/rejected": -1.998462200164795, + "loss": 3.3079, + "rewards/accuracies": 0.625, + "rewards/chosen": -16.833589553833008, + "rewards/margins": 3.151031017303467, + "rewards/rejected": -19.984619140625, + "step": 211 + }, + { + "epoch": 0.4795024031665253, + "grad_norm": 121.21962647812443, + "learning_rate": 4.986786140245446e-07, + "logits/chosen": -18.02095603942871, + "logits/rejected": -18.15108299255371, + "logps/chosen": -1.5042206048965454, + "logps/rejected": -1.7882376909255981, + "loss": 3.3782, + "rewards/accuracies": 0.8125, + "rewards/chosen": -15.042205810546875, + "rewards/margins": 2.8401715755462646, + "rewards/rejected": -17.88237762451172, + "step": 212 + }, + { + "epoch": 0.4817642069550466, + "grad_norm": 138.8873467278285, + "learning_rate": 4.956080553663687e-07, + "logits/chosen": -19.02423095703125, + "logits/rejected": -18.977191925048828, + "logps/chosen": -1.8565832376480103, + "logps/rejected": -2.2188644409179688, + "loss": 3.2943, + "rewards/accuracies": 0.75, + "rewards/chosen": -18.565834045410156, + "rewards/margins": 3.622810125350952, + 
"rewards/rejected": -22.188644409179688, + "step": 213 + }, + { + "epoch": 0.484026010743568, + "grad_norm": 120.38379558424705, + "learning_rate": 4.925315096848636e-07, + "logits/chosen": -17.092893600463867, + "logits/rejected": -17.469482421875, + "logps/chosen": -1.5928668975830078, + "logps/rejected": -2.1185383796691895, + "loss": 3.2942, + "rewards/accuracies": 0.6875, + "rewards/chosen": -15.928670883178711, + "rewards/margins": 5.256712913513184, + "rewards/rejected": -21.185382843017578, + "step": 214 + }, + { + "epoch": 0.48628781453208936, + "grad_norm": 141.47624101645354, + "learning_rate": 4.894491696348293e-07, + "logits/chosen": -18.64133644104004, + "logits/rejected": -18.508480072021484, + "logps/chosen": -1.7584447860717773, + "logps/rejected": -1.9836417436599731, + "loss": 3.8231, + "rewards/accuracies": 0.65625, + "rewards/chosen": -17.58444595336914, + "rewards/margins": 2.2519688606262207, + "rewards/rejected": -19.836416244506836, + "step": 215 + }, + { + "epoch": 0.48854961832061067, + "grad_norm": 100.6834936019963, + "learning_rate": 4.863612282339116e-07, + "logits/chosen": -18.912193298339844, + "logits/rejected": -18.51605224609375, + "logps/chosen": -1.4050565958023071, + "logps/rejected": -1.7702962160110474, + "loss": 3.2087, + "rewards/accuracies": 0.71875, + "rewards/chosen": -14.050565719604492, + "rewards/margins": 3.6523966789245605, + "rewards/rejected": -17.70296287536621, + "step": 216 + }, + { + "epoch": 0.49081142210913203, + "grad_norm": 126.90139310026458, + "learning_rate": 4.832678788505161e-07, + "logits/chosen": -20.156646728515625, + "logits/rejected": -20.06357192993164, + "logps/chosen": -1.8053876161575317, + "logps/rejected": -2.161924362182617, + "loss": 3.2957, + "rewards/accuracies": 0.625, + "rewards/chosen": -18.053876876831055, + "rewards/margins": 3.565363883972168, + "rewards/rejected": -21.619239807128906, + "step": 217 + }, + { + "epoch": 0.4930732258976534, + "grad_norm": 112.75413161542235, + "learning_rate": 4.801693151916985e-07, + "logits/chosen": -18.10401153564453, + "logits/rejected": -18.311925888061523, + "logps/chosen": -1.7536580562591553, + "logps/rejected": -2.2070531845092773, + "loss": 3.1204, + "rewards/accuracies": 0.78125, + "rewards/chosen": -17.53658103942871, + "rewards/margins": 4.533949375152588, + "rewards/rejected": -22.070531845092773, + "step": 218 + }, + { + "epoch": 0.4953350296861747, + "grad_norm": 120.36363917960784, + "learning_rate": 4.770657312910354e-07, + "logits/chosen": -19.36819839477539, + "logits/rejected": -19.35448455810547, + "logps/chosen": -1.7574162483215332, + "logps/rejected": -2.383183002471924, + "loss": 3.6944, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.57416343688965, + "rewards/margins": 6.257665634155273, + "rewards/rejected": -23.831829071044922, + "step": 219 + }, + { + "epoch": 0.4975968334746961, + "grad_norm": 123.44879094752798, + "learning_rate": 4.739573214964729e-07, + "logits/chosen": -18.070960998535156, + "logits/rejected": -17.84283447265625, + "logps/chosen": -1.280721664428711, + "logps/rejected": -1.6026414632797241, + "loss": 2.9387, + "rewards/accuracies": 0.71875, + "rewards/chosen": -12.80721664428711, + "rewards/margins": 3.2191972732543945, + "rewards/rejected": -16.02641487121582, + "step": 220 + }, + { + "epoch": 0.49985863726321744, + "grad_norm": 170.10224468225707, + "learning_rate": 4.7084428045815733e-07, + "logits/chosen": -19.83563995361328, + "logits/rejected": -19.78580665588379, + "logps/chosen": -1.7471987009048462, + 
"logps/rejected": -2.135683059692383, + "loss": 3.8038, + "rewards/accuracies": 0.65625, + "rewards/chosen": -17.471986770629883, + "rewards/margins": 3.884843349456787, + "rewards/rejected": -21.356828689575195, + "step": 221 + }, + { + "epoch": 0.5021204410517388, + "grad_norm": 133.7068988108329, + "learning_rate": 4.677268031162457e-07, + "logits/chosen": -18.73598861694336, + "logits/rejected": -18.53685188293457, + "logps/chosen": -1.8922369480133057, + "logps/rejected": -2.408550977706909, + "loss": 3.5777, + "rewards/accuracies": 0.71875, + "rewards/chosen": -18.9223690032959, + "rewards/margins": 5.163141250610352, + "rewards/rejected": -24.085508346557617, + "step": 222 + }, + { + "epoch": 0.5043822448402601, + "grad_norm": 124.33610987958505, + "learning_rate": 4.646050846886985e-07, + "logits/chosen": -17.20172882080078, + "logits/rejected": -17.508880615234375, + "logps/chosen": -1.3590439558029175, + "logps/rejected": -1.6888562440872192, + "loss": 3.4386, + "rewards/accuracies": 0.59375, + "rewards/chosen": -13.59044075012207, + "rewards/margins": 3.2981221675872803, + "rewards/rejected": -16.88856315612793, + "step": 223 + }, + { + "epoch": 0.5066440486287814, + "grad_norm": 142.45176890673545, + "learning_rate": 4.6147932065905494e-07, + "logits/chosen": -18.200490951538086, + "logits/rejected": -17.826982498168945, + "logps/chosen": -1.4690287113189697, + "logps/rejected": -2.0389232635498047, + "loss": 3.4706, + "rewards/accuracies": 0.6875, + "rewards/chosen": -14.690287590026855, + "rewards/margins": 5.698945999145508, + "rewards/rejected": -20.38923454284668, + "step": 224 + }, + { + "epoch": 0.5089058524173028, + "grad_norm": 122.4324863969919, + "learning_rate": 4.5834970676419214e-07, + "logits/chosen": -18.388614654541016, + "logits/rejected": -18.511974334716797, + "logps/chosen": -1.8286144733428955, + "logps/rejected": -2.2092440128326416, + "loss": 3.3465, + "rewards/accuracies": 0.71875, + "rewards/chosen": -18.28614616394043, + "rewards/margins": 3.8062963485717773, + "rewards/rejected": -22.09244155883789, + "step": 225 + }, + { + "epoch": 0.5111676562058242, + "grad_norm": 129.2266776919472, + "learning_rate": 4.552164389820673e-07, + "logits/chosen": -19.732501983642578, + "logits/rejected": -19.419715881347656, + "logps/chosen": -1.65671706199646, + "logps/rejected": -1.9091390371322632, + "loss": 3.195, + "rewards/accuracies": 0.65625, + "rewards/chosen": -16.567171096801758, + "rewards/margins": 2.5242207050323486, + "rewards/rejected": -19.09139060974121, + "step": 226 + }, + { + "epoch": 0.5134294599943455, + "grad_norm": 133.10160051050664, + "learning_rate": 4.5207971351944605e-07, + "logits/chosen": -18.20813751220703, + "logits/rejected": -17.936553955078125, + "logps/chosen": -1.5020235776901245, + "logps/rejected": -2.2087697982788086, + "loss": 3.6883, + "rewards/accuracies": 0.65625, + "rewards/chosen": -15.02023696899414, + "rewards/margins": 7.067460536956787, + "rewards/rejected": -22.087697982788086, + "step": 227 + }, + { + "epoch": 0.5156912637828668, + "grad_norm": 115.75597435267949, + "learning_rate": 4.489397267996157e-07, + "logits/chosen": -18.27444839477539, + "logits/rejected": -18.018796920776367, + "logps/chosen": -1.620775580406189, + "logps/rejected": -2.2751684188842773, + "loss": 3.0586, + "rewards/accuracies": 0.84375, + "rewards/chosen": -16.2077579498291, + "rewards/margins": 6.543926239013672, + "rewards/rejected": -22.75168228149414, + "step": 228 + }, + { + "epoch": 0.5179530675713881, + "grad_norm": 
119.59564297014721, + "learning_rate": 4.45796675450085e-07, + "logits/chosen": -19.229650497436523, + "logits/rejected": -19.270090103149414, + "logps/chosen": -1.7479207515716553, + "logps/rejected": -1.9961845874786377, + "loss": 3.46, + "rewards/accuracies": 0.65625, + "rewards/chosen": -17.47920799255371, + "rewards/margins": 2.482638120651245, + "rewards/rejected": -19.96184730529785, + "step": 229 + }, + { + "epoch": 0.5202148713599095, + "grad_norm": 123.01980120189968, + "learning_rate": 4.4265075629027126e-07, + "logits/chosen": -20.439476013183594, + "logits/rejected": -20.24441146850586, + "logps/chosen": -1.8146342039108276, + "logps/rejected": -2.263535976409912, + "loss": 3.4224, + "rewards/accuracies": 0.65625, + "rewards/chosen": -18.146343231201172, + "rewards/margins": 4.489017486572266, + "rewards/rejected": -22.635360717773438, + "step": 230 + }, + { + "epoch": 0.5224766751484309, + "grad_norm": 107.58798664449722, + "learning_rate": 4.3950216631917563e-07, + "logits/chosen": -19.177587509155273, + "logits/rejected": -18.99146270751953, + "logps/chosen": -1.755948543548584, + "logps/rejected": -2.040555477142334, + "loss": 2.9987, + "rewards/accuracies": 0.625, + "rewards/chosen": -17.559486389160156, + "rewards/margins": 2.846068859100342, + "rewards/rejected": -20.405555725097656, + "step": 231 + }, + { + "epoch": 0.5247384789369522, + "grad_norm": 109.78374887710692, + "learning_rate": 4.3635110270304676e-07, + "logits/chosen": -18.429826736450195, + "logits/rejected": -18.861085891723633, + "logps/chosen": -1.559441328048706, + "logps/rejected": -1.994086742401123, + "loss": 2.6686, + "rewards/accuracies": 0.78125, + "rewards/chosen": -15.59441089630127, + "rewards/margins": 4.3464555740356445, + "rewards/rejected": -19.940868377685547, + "step": 232 + }, + { + "epoch": 0.5270002827254736, + "grad_norm": 116.99222466276613, + "learning_rate": 4.331977627630339e-07, + "logits/chosen": -17.902793884277344, + "logits/rejected": -18.096345901489258, + "logps/chosen": -1.382148265838623, + "logps/rejected": -1.986084222793579, + "loss": 2.6777, + "rewards/accuracies": 0.875, + "rewards/chosen": -13.82148265838623, + "rewards/margins": 6.039360046386719, + "rewards/rejected": -19.860841751098633, + "step": 233 + }, + { + "epoch": 0.5292620865139949, + "grad_norm": 111.82478148043788, + "learning_rate": 4.300423439628313e-07, + "logits/chosen": -18.652141571044922, + "logits/rejected": -18.740739822387695, + "logps/chosen": -1.7629899978637695, + "logps/rejected": -2.2177019119262695, + "loss": 2.6484, + "rewards/accuracies": 0.78125, + "rewards/chosen": -17.629901885986328, + "rewards/margins": 4.547117233276367, + "rewards/rejected": -22.177017211914062, + "step": 234 + }, + { + "epoch": 0.5315238903025162, + "grad_norm": 124.7698836200117, + "learning_rate": 4.268850438963118e-07, + "logits/chosen": -19.93507957458496, + "logits/rejected": -19.908302307128906, + "logps/chosen": -1.6999324560165405, + "logps/rejected": -2.0379884243011475, + "loss": 3.3517, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.999324798583984, + "rewards/margins": 3.380560874938965, + "rewards/rejected": -20.379884719848633, + "step": 235 + }, + { + "epoch": 0.5337856940910376, + "grad_norm": 130.0202969035073, + "learning_rate": 4.2372606027515463e-07, + "logits/chosen": -16.75927734375, + "logits/rejected": -16.6347713470459, + "logps/chosen": -1.7100436687469482, + "logps/rejected": -2.0675430297851562, + "loss": 3.3574, + "rewards/accuracies": 0.75, + "rewards/chosen": 
-17.100439071655273, + "rewards/margins": 3.574993133544922, + "rewards/rejected": -20.675430297851562, + "step": 236 + }, + { + "epoch": 0.536047497879559, + "grad_norm": 142.3393610446344, + "learning_rate": 4.2056559091646387e-07, + "logits/chosen": -19.50155258178711, + "logits/rejected": -19.233562469482422, + "logps/chosen": -1.7303612232208252, + "logps/rejected": -1.9693727493286133, + "loss": 3.8961, + "rewards/accuracies": 0.71875, + "rewards/chosen": -17.303613662719727, + "rewards/margins": 2.390113353729248, + "rewards/rejected": -19.693727493286133, + "step": 237 + }, + { + "epoch": 0.5383093016680803, + "grad_norm": 132.7580435084033, + "learning_rate": 4.1740383373038116e-07, + "logits/chosen": -19.230268478393555, + "logits/rejected": -19.026260375976562, + "logps/chosen": -1.7516860961914062, + "logps/rejected": -2.302055835723877, + "loss": 2.958, + "rewards/accuracies": 0.84375, + "rewards/chosen": -17.516862869262695, + "rewards/margins": 5.503696441650391, + "rewards/rejected": -23.020557403564453, + "step": 238 + }, + { + "epoch": 0.5405711054566016, + "grad_norm": 146.2605729576581, + "learning_rate": 4.1424098670769255e-07, + "logits/chosen": -16.876083374023438, + "logits/rejected": -16.858041763305664, + "logps/chosen": -1.4845470190048218, + "logps/rejected": -1.819509506225586, + "loss": 3.4692, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.845470428466797, + "rewards/margins": 3.349626302719116, + "rewards/rejected": -18.195096969604492, + "step": 239 + }, + { + "epoch": 0.542832909245123, + "grad_norm": 103.3850975548694, + "learning_rate": 4.1107724790743007e-07, + "logits/chosen": -19.218101501464844, + "logits/rejected": -19.07253646850586, + "logps/chosen": -1.6440057754516602, + "logps/rejected": -2.002814531326294, + "loss": 2.7882, + "rewards/accuracies": 0.875, + "rewards/chosen": -16.4400577545166, + "rewards/margins": 3.5880849361419678, + "rewards/rejected": -20.028141021728516, + "step": 240 + }, + { + "epoch": 0.5450947130336443, + "grad_norm": 108.59641705360205, + "learning_rate": 4.0791281544446947e-07, + "logits/chosen": -18.29979705810547, + "logits/rejected": -18.103771209716797, + "logps/chosen": -1.5138269662857056, + "logps/rejected": -1.9497716426849365, + "loss": 2.4937, + "rewards/accuracies": 0.84375, + "rewards/chosen": -15.138269424438477, + "rewards/margins": 4.3594465255737305, + "rewards/rejected": -19.49771499633789, + "step": 241 + }, + { + "epoch": 0.5473565168221657, + "grad_norm": 118.47748367490442, + "learning_rate": 4.0474788747712416e-07, + "logits/chosen": -16.5893611907959, + "logits/rejected": -16.470932006835938, + "logps/chosen": -1.4450711011886597, + "logps/rejected": -1.7044168710708618, + "loss": 3.6487, + "rewards/accuracies": 0.65625, + "rewards/chosen": -14.450712203979492, + "rewards/margins": 2.5934557914733887, + "rewards/rejected": -17.04416847229004, + "step": 242 + }, + { + "epoch": 0.549618320610687, + "grad_norm": 127.18536302103081, + "learning_rate": 4.0158266219473573e-07, + "logits/chosen": -19.53125762939453, + "logits/rejected": -19.866533279418945, + "logps/chosen": -1.3576165437698364, + "logps/rejected": -1.6340572834014893, + "loss": 2.8172, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.576166152954102, + "rewards/margins": 2.7644076347351074, + "rewards/rejected": -16.340572357177734, + "step": 243 + }, + { + "epoch": 0.5518801243992084, + "grad_norm": 144.24875011893795, + "learning_rate": 3.984173378052643e-07, + "logits/chosen": -17.787691116333008, + "logits/rejected": 
-17.381996154785156, + "logps/chosen": -1.4772697687149048, + "logps/rejected": -1.9439443349838257, + "loss": 2.7811, + "rewards/accuracies": 0.78125, + "rewards/chosen": -14.772696495056152, + "rewards/margins": 4.666746139526367, + "rewards/rejected": -19.439443588256836, + "step": 244 + }, + { + "epoch": 0.5541419281877297, + "grad_norm": 130.02624554472013, + "learning_rate": 3.9525211252287585e-07, + "logits/chosen": -17.32337760925293, + "logits/rejected": -17.11507797241211, + "logps/chosen": -1.8103289604187012, + "logps/rejected": -2.3584718704223633, + "loss": 2.8682, + "rewards/accuracies": 0.75, + "rewards/chosen": -18.103288650512695, + "rewards/margins": 5.4814324378967285, + "rewards/rejected": -23.584720611572266, + "step": 245 + }, + { + "epoch": 0.556403731976251, + "grad_norm": 112.27330200229477, + "learning_rate": 3.920871845555305e-07, + "logits/chosen": -19.526695251464844, + "logits/rejected": -19.684539794921875, + "logps/chosen": -1.6799914836883545, + "logps/rejected": -2.0144991874694824, + "loss": 3.2575, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.799915313720703, + "rewards/margins": 3.3450779914855957, + "rewards/rejected": -20.14499282836914, + "step": 246 + }, + { + "epoch": 0.5586655357647724, + "grad_norm": 123.49123400492937, + "learning_rate": 3.8892275209256984e-07, + "logits/chosen": -17.89940643310547, + "logits/rejected": -18.42051124572754, + "logps/chosen": -1.7134385108947754, + "logps/rejected": -2.140021324157715, + "loss": 3.0396, + "rewards/accuracies": 0.78125, + "rewards/chosen": -17.134387969970703, + "rewards/margins": 4.26582670211792, + "rewards/rejected": -21.40021324157715, + "step": 247 + }, + { + "epoch": 0.5609273395532938, + "grad_norm": 135.4244307593356, + "learning_rate": 3.8575901329230747e-07, + "logits/chosen": -19.93168067932129, + "logits/rejected": -19.66005516052246, + "logps/chosen": -2.0851705074310303, + "logps/rejected": -2.714930534362793, + "loss": 3.4958, + "rewards/accuracies": 0.71875, + "rewards/chosen": -20.85170555114746, + "rewards/margins": 6.297600746154785, + "rewards/rejected": -27.14930534362793, + "step": 248 + }, + { + "epoch": 0.5631891433418151, + "grad_norm": 126.1113881766967, + "learning_rate": 3.8259616626961886e-07, + "logits/chosen": -19.275150299072266, + "logits/rejected": -19.156465530395508, + "logps/chosen": -1.4787800312042236, + "logps/rejected": -1.7590197324752808, + "loss": 3.2882, + "rewards/accuracies": 0.78125, + "rewards/chosen": -14.787800788879395, + "rewards/margins": 2.8023955821990967, + "rewards/rejected": -17.59019660949707, + "step": 249 + }, + { + "epoch": 0.5654509471303364, + "grad_norm": 143.11865197858194, + "learning_rate": 3.794344090835362e-07, + "logits/chosen": -19.26443099975586, + "logits/rejected": -18.946765899658203, + "logps/chosen": -1.713568925857544, + "logps/rejected": -2.3817105293273926, + "loss": 3.9405, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.13568878173828, + "rewards/margins": 6.681418418884277, + "rewards/rejected": -23.817108154296875, + "step": 250 + }, + { + "epoch": 0.5677127509188578, + "grad_norm": 135.07580917666817, + "learning_rate": 3.7627393972484534e-07, + "logits/chosen": -19.917835235595703, + "logits/rejected": -19.873958587646484, + "logps/chosen": -1.6692298650741577, + "logps/rejected": -1.9403576850891113, + "loss": 3.9729, + "rewards/accuracies": 0.71875, + "rewards/chosen": -16.692298889160156, + "rewards/margins": 2.711277961730957, + "rewards/rejected": -19.40357780456543, + "step": 251 + }, + { + 
"epoch": 0.5699745547073791, + "grad_norm": 121.70724549497747, + "learning_rate": 3.7311495610368823e-07, + "logits/chosen": -19.70743179321289, + "logits/rejected": -19.559131622314453, + "logps/chosen": -1.725498914718628, + "logps/rejected": -1.9476670026779175, + "loss": 3.2505, + "rewards/accuracies": 0.6875, + "rewards/chosen": -17.254987716674805, + "rewards/margins": 2.221681833267212, + "rewards/rejected": -19.476669311523438, + "step": 252 + }, + { + "epoch": 0.5722363584959005, + "grad_norm": 92.60358297039429, + "learning_rate": 3.699576560371689e-07, + "logits/chosen": -19.507673263549805, + "logits/rejected": -19.240455627441406, + "logps/chosen": -1.6785533428192139, + "logps/rejected": -2.16404128074646, + "loss": 2.668, + "rewards/accuracies": 0.8125, + "rewards/chosen": -16.785533905029297, + "rewards/margins": 4.854878902435303, + "rewards/rejected": -21.640413284301758, + "step": 253 + }, + { + "epoch": 0.5744981622844219, + "grad_norm": 117.8494457797105, + "learning_rate": 3.66802237236966e-07, + "logits/chosen": -17.930335998535156, + "logits/rejected": -17.775684356689453, + "logps/chosen": -1.4922497272491455, + "logps/rejected": -1.9657816886901855, + "loss": 2.9119, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.92249584197998, + "rewards/margins": 4.735319137573242, + "rewards/rejected": -19.65781593322754, + "step": 254 + }, + { + "epoch": 0.5767599660729432, + "grad_norm": 128.0769904494541, + "learning_rate": 3.636488972969532e-07, + "logits/chosen": -18.133464813232422, + "logits/rejected": -18.1317195892334, + "logps/chosen": -1.7718310356140137, + "logps/rejected": -2.1774401664733887, + "loss": 3.32, + "rewards/accuracies": 0.65625, + "rewards/chosen": -17.71830940246582, + "rewards/margins": 4.056089878082275, + "rewards/rejected": -21.77440071105957, + "step": 255 + }, + { + "epoch": 0.5790217698614645, + "grad_norm": 109.14078747974439, + "learning_rate": 3.604978336808244e-07, + "logits/chosen": -18.034282684326172, + "logits/rejected": -17.809715270996094, + "logps/chosen": -1.6046905517578125, + "logps/rejected": -2.003675937652588, + "loss": 3.0045, + "rewards/accuracies": 0.84375, + "rewards/chosen": -16.046907424926758, + "rewards/margins": 3.9898502826690674, + "rewards/rejected": -20.03675651550293, + "step": 256 + }, + { + "epoch": 0.5812835736499858, + "grad_norm": 143.3661499166921, + "learning_rate": 3.5734924370972876e-07, + "logits/chosen": -18.07189178466797, + "logits/rejected": -17.913497924804688, + "logps/chosen": -1.4232511520385742, + "logps/rejected": -1.7245614528656006, + "loss": 3.0709, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.232512474060059, + "rewards/margins": 3.0131046772003174, + "rewards/rejected": -17.245615005493164, + "step": 257 + }, + { + "epoch": 0.5835453774385072, + "grad_norm": 115.02007432023626, + "learning_rate": 3.5420332454991504e-07, + "logits/chosen": -18.96527862548828, + "logits/rejected": -18.798969268798828, + "logps/chosen": -1.7819788455963135, + "logps/rejected": -2.11525559425354, + "loss": 3.3907, + "rewards/accuracies": 0.78125, + "rewards/chosen": -17.819787979125977, + "rewards/margins": 3.3327670097351074, + "rewards/rejected": -21.152557373046875, + "step": 258 + }, + { + "epoch": 0.5858071812270286, + "grad_norm": 116.46242436554796, + "learning_rate": 3.510602732003843e-07, + "logits/chosen": -18.923112869262695, + "logits/rejected": -19.192468643188477, + "logps/chosen": -1.7312378883361816, + "logps/rejected": -2.397825002670288, + "loss": 2.9306, + 
"rewards/accuracies": 0.71875, + "rewards/chosen": -17.312379837036133, + "rewards/margins": 6.665870189666748, + "rewards/rejected": -23.978248596191406, + "step": 259 + }, + { + "epoch": 0.5880689850155499, + "grad_norm": 130.25971956929465, + "learning_rate": 3.4792028648055396e-07, + "logits/chosen": -18.882343292236328, + "logits/rejected": -18.996551513671875, + "logps/chosen": -1.6002156734466553, + "logps/rejected": -2.0233724117279053, + "loss": 2.9946, + "rewards/accuracies": 0.84375, + "rewards/chosen": -16.00215721130371, + "rewards/margins": 4.231566905975342, + "rewards/rejected": -20.23372459411621, + "step": 260 + }, + { + "epoch": 0.5903307888040712, + "grad_norm": 116.58608387387143, + "learning_rate": 3.447835610179327e-07, + "logits/chosen": -18.31661033630371, + "logits/rejected": -18.64508819580078, + "logps/chosen": -1.852226972579956, + "logps/rejected": -2.575822591781616, + "loss": 2.6653, + "rewards/accuracies": 0.84375, + "rewards/chosen": -18.52227020263672, + "rewards/margins": 7.235957145690918, + "rewards/rejected": -25.75822639465332, + "step": 261 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 131.22286636501386, + "learning_rate": 3.416502932358079e-07, + "logits/chosen": -19.86322021484375, + "logits/rejected": -19.80363655090332, + "logps/chosen": -1.677018642425537, + "logps/rejected": -1.9988960027694702, + "loss": 3.2105, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.770187377929688, + "rewards/margins": 3.2187743186950684, + "rewards/rejected": -19.98896026611328, + "step": 262 + }, + { + "epoch": 0.5948543963811139, + "grad_norm": 113.95397777389032, + "learning_rate": 3.385206793409451e-07, + "logits/chosen": -16.9749813079834, + "logits/rejected": -16.70389175415039, + "logps/chosen": -1.5683423280715942, + "logps/rejected": -1.973022699356079, + "loss": 2.895, + "rewards/accuracies": 0.8125, + "rewards/chosen": -15.68342399597168, + "rewards/margins": 4.046802520751953, + "rewards/rejected": -19.730226516723633, + "step": 263 + }, + { + "epoch": 0.5971162001696353, + "grad_norm": 136.6059459741964, + "learning_rate": 3.3539491531130163e-07, + "logits/chosen": -17.935535430908203, + "logits/rejected": -17.720043182373047, + "logps/chosen": -1.4503566026687622, + "logps/rejected": -1.7065664529800415, + "loss": 3.1485, + "rewards/accuracies": 0.71875, + "rewards/chosen": -14.50356674194336, + "rewards/margins": 2.5620980262756348, + "rewards/rejected": -17.065662384033203, + "step": 264 + }, + { + "epoch": 0.5993780039581567, + "grad_norm": 137.85024895532698, + "learning_rate": 3.3227319688375426e-07, + "logits/chosen": -19.27477264404297, + "logits/rejected": -19.29145050048828, + "logps/chosen": -1.9958823919296265, + "logps/rejected": -2.369366407394409, + "loss": 3.4478, + "rewards/accuracies": 0.71875, + "rewards/chosen": -19.958826065063477, + "rewards/margins": 3.7348380088806152, + "rewards/rejected": -23.693662643432617, + "step": 265 + }, + { + "epoch": 0.601639807746678, + "grad_norm": 123.86696879933385, + "learning_rate": 3.291557195418427e-07, + "logits/chosen": -18.97182273864746, + "logits/rejected": -18.623531341552734, + "logps/chosen": -1.6371110677719116, + "logps/rejected": -2.1235477924346924, + "loss": 3.124, + "rewards/accuracies": 0.6875, + "rewards/chosen": -16.371112823486328, + "rewards/margins": 4.864367485046387, + "rewards/rejected": -21.235477447509766, + "step": 266 + }, + { + "epoch": 0.6039016115351993, + "grad_norm": 160.30929493983072, + "learning_rate": 3.260426785035272e-07, + 
"logits/chosen": -18.151859283447266, + "logits/rejected": -18.189985275268555, + "logps/chosen": -1.5115103721618652, + "logps/rejected": -1.9283788204193115, + "loss": 3.4577, + "rewards/accuracies": 0.71875, + "rewards/chosen": -15.115103721618652, + "rewards/margins": 4.168684959411621, + "rewards/rejected": -19.283788681030273, + "step": 267 + }, + { + "epoch": 0.6061634153237206, + "grad_norm": 118.85038786779553, + "learning_rate": 3.229342687089646e-07, + "logits/chosen": -17.767433166503906, + "logits/rejected": -17.30542755126953, + "logps/chosen": -1.7307448387145996, + "logps/rejected": -2.2132887840270996, + "loss": 3.2675, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.30744743347168, + "rewards/margins": 4.825439453125, + "rewards/rejected": -22.132884979248047, + "step": 268 + }, + { + "epoch": 0.608425219112242, + "grad_norm": 135.72304791395987, + "learning_rate": 3.1983068480830143e-07, + "logits/chosen": -17.994487762451172, + "logits/rejected": -17.969486236572266, + "logps/chosen": -1.7408254146575928, + "logps/rejected": -2.30222225189209, + "loss": 3.0847, + "rewards/accuracies": 0.84375, + "rewards/chosen": -17.408254623413086, + "rewards/margins": 5.613969326019287, + "rewards/rejected": -23.0222225189209, + "step": 269 + }, + { + "epoch": 0.6106870229007634, + "grad_norm": 136.8409835051581, + "learning_rate": 3.1673212114948387e-07, + "logits/chosen": -18.464635848999023, + "logits/rejected": -18.186416625976562, + "logps/chosen": -1.8000985383987427, + "logps/rejected": -2.355130672454834, + "loss": 2.2817, + "rewards/accuracies": 0.875, + "rewards/chosen": -18.000986099243164, + "rewards/margins": 5.550319671630859, + "rewards/rejected": -23.551307678222656, + "step": 270 + }, + { + "epoch": 0.6129488266892847, + "grad_norm": 128.71421710643776, + "learning_rate": 3.1363877176608845e-07, + "logits/chosen": -18.273387908935547, + "logits/rejected": -18.52509117126465, + "logps/chosen": -1.7283263206481934, + "logps/rejected": -2.170729637145996, + "loss": 2.919, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.28326416015625, + "rewards/margins": 4.424034118652344, + "rewards/rejected": -21.707298278808594, + "step": 271 + }, + { + "epoch": 0.615210630477806, + "grad_norm": 130.4882843458114, + "learning_rate": 3.1055083036517076e-07, + "logits/chosen": -18.288068771362305, + "logits/rejected": -17.75768280029297, + "logps/chosen": -1.6948351860046387, + "logps/rejected": -2.2144925594329834, + "loss": 3.3302, + "rewards/accuracies": 0.71875, + "rewards/chosen": -16.948348999023438, + "rewards/margins": 5.196574687957764, + "rewards/rejected": -22.14492416381836, + "step": 272 + }, + { + "epoch": 0.6174724342663274, + "grad_norm": 119.37105820236062, + "learning_rate": 3.074684903151364e-07, + "logits/chosen": -17.694923400878906, + "logits/rejected": -17.45270538330078, + "logps/chosen": -1.4144465923309326, + "logps/rejected": -1.7332209348678589, + "loss": 3.1823, + "rewards/accuracies": 0.71875, + "rewards/chosen": -14.144466400146484, + "rewards/margins": 3.1877427101135254, + "rewards/rejected": -17.33220863342285, + "step": 273 + }, + { + "epoch": 0.6197342380548487, + "grad_norm": 126.9873723041469, + "learning_rate": 3.0439194463363136e-07, + "logits/chosen": -19.154897689819336, + "logits/rejected": -19.074947357177734, + "logps/chosen": -1.6273291110992432, + "logps/rejected": -2.1156094074249268, + "loss": 3.1957, + "rewards/accuracies": 0.65625, + "rewards/chosen": -16.27328872680664, + "rewards/margins": 4.882803916931152, + 
"rewards/rejected": -21.156095504760742, + "step": 274 + }, + { + "epoch": 0.6219960418433701, + "grad_norm": 98.82421414344171, + "learning_rate": 3.0132138597545537e-07, + "logits/chosen": -18.89469337463379, + "logits/rejected": -18.92743492126465, + "logps/chosen": -1.8614561557769775, + "logps/rejected": -2.298145294189453, + "loss": 2.4871, + "rewards/accuracies": 0.625, + "rewards/chosen": -18.614561080932617, + "rewards/margins": 4.366891860961914, + "rewards/rejected": -22.9814510345459, + "step": 275 + }, + { + "epoch": 0.6242578456318915, + "grad_norm": 116.98836789151454, + "learning_rate": 2.982570066204981e-07, + "logits/chosen": -17.621952056884766, + "logits/rejected": -17.41912841796875, + "logps/chosen": -1.7095118761062622, + "logps/rejected": -2.267876625061035, + "loss": 2.8961, + "rewards/accuracies": 0.71875, + "rewards/chosen": -17.09511947631836, + "rewards/margins": 5.583648204803467, + "rewards/rejected": -22.678768157958984, + "step": 276 + }, + { + "epoch": 0.6265196494204128, + "grad_norm": 139.7828630658467, + "learning_rate": 2.951989984616979e-07, + "logits/chosen": -18.495176315307617, + "logits/rejected": -18.713180541992188, + "logps/chosen": -1.794584035873413, + "logps/rejected": -2.701646327972412, + "loss": 3.3014, + "rewards/accuracies": 0.8125, + "rewards/chosen": -17.94584083557129, + "rewards/margins": 9.070621490478516, + "rewards/rejected": -27.016462326049805, + "step": 277 + }, + { + "epoch": 0.6287814532089341, + "grad_norm": 104.32477594909946, + "learning_rate": 2.9214755299302584e-07, + "logits/chosen": -18.10324478149414, + "logits/rejected": -18.533466339111328, + "logps/chosen": -1.4521610736846924, + "logps/rejected": -2.136770486831665, + "loss": 2.7298, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.521611213684082, + "rewards/margins": 6.84609317779541, + "rewards/rejected": -21.367706298828125, + "step": 278 + }, + { + "epoch": 0.6310432569974554, + "grad_norm": 129.12312580227652, + "learning_rate": 2.89102861297494e-07, + "logits/chosen": -16.307287216186523, + "logits/rejected": -16.62302589416504, + "logps/chosen": -1.5291308164596558, + "logps/rejected": -1.9218378067016602, + "loss": 3.2843, + "rewards/accuracies": 0.6875, + "rewards/chosen": -15.29130744934082, + "rewards/margins": 3.9270708560943604, + "rewards/rejected": -19.218379974365234, + "step": 279 + }, + { + "epoch": 0.6333050607859768, + "grad_norm": 119.86450692791884, + "learning_rate": 2.860651140351902e-07, + "logits/chosen": -17.81388282775879, + "logits/rejected": -17.59682273864746, + "logps/chosen": -1.4970345497131348, + "logps/rejected": -2.2067017555236816, + "loss": 3.0318, + "rewards/accuracies": 0.8125, + "rewards/chosen": -14.970344543457031, + "rewards/margins": 7.096673011779785, + "rewards/rejected": -22.0670166015625, + "step": 280 + }, + { + "epoch": 0.6355668645744982, + "grad_norm": 138.33272392046018, + "learning_rate": 2.830345014313381e-07, + "logits/chosen": -18.549711227416992, + "logits/rejected": -18.178396224975586, + "logps/chosen": -1.5726195573806763, + "logps/rejected": -2.220799684524536, + "loss": 3.0664, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.726195335388184, + "rewards/margins": 6.481801986694336, + "rewards/rejected": -22.20799446105957, + "step": 281 + }, + { + "epoch": 0.6378286683630195, + "grad_norm": 121.49306068303021, + "learning_rate": 2.800112132643856e-07, + "logits/chosen": -18.666532516479492, + "logits/rejected": -18.698705673217773, + "logps/chosen": -1.9378407001495361, + 
"logps/rejected": -2.5402820110321045, + "loss": 3.0881, + "rewards/accuracies": 0.8125, + "rewards/chosen": -19.378408432006836, + "rewards/margins": 6.024411678314209, + "rewards/rejected": -25.40281867980957, + "step": 282 + }, + { + "epoch": 0.6400904721515408, + "grad_norm": 114.50153480696909, + "learning_rate": 2.7699543885412105e-07, + "logits/chosen": -18.842344284057617, + "logits/rejected": -19.002525329589844, + "logps/chosen": -1.7454712390899658, + "logps/rejected": -2.214015483856201, + "loss": 2.7145, + "rewards/accuracies": 0.8125, + "rewards/chosen": -17.454710006713867, + "rewards/margins": 4.685445308685303, + "rewards/rejected": -22.14015769958496, + "step": 283 + }, + { + "epoch": 0.6423522759400622, + "grad_norm": 126.9731178541442, + "learning_rate": 2.7398736704981725e-07, + "logits/chosen": -17.94224739074707, + "logits/rejected": -18.106706619262695, + "logps/chosen": -1.8006949424743652, + "logps/rejected": -2.4084813594818115, + "loss": 2.7514, + "rewards/accuracies": 0.78125, + "rewards/chosen": -18.00695037841797, + "rewards/margins": 6.077863693237305, + "rewards/rejected": -24.084814071655273, + "step": 284 + }, + { + "epoch": 0.6446140797285835, + "grad_norm": 121.79509986844188, + "learning_rate": 2.709871862184063e-07, + "logits/chosen": -16.98878288269043, + "logits/rejected": -17.01874542236328, + "logps/chosen": -1.8407750129699707, + "logps/rejected": -2.2451648712158203, + "loss": 3.3275, + "rewards/accuracies": 0.6875, + "rewards/chosen": -18.407751083374023, + "rewards/margins": 4.0438995361328125, + "rewards/rejected": -22.451650619506836, + "step": 285 + }, + { + "epoch": 0.6468758835171049, + "grad_norm": 108.87797931776414, + "learning_rate": 2.679950842326837e-07, + "logits/chosen": -18.95654296875, + "logits/rejected": -18.801700592041016, + "logps/chosen": -1.6954306364059448, + "logps/rejected": -2.5442655086517334, + "loss": 2.5918, + "rewards/accuracies": 0.78125, + "rewards/chosen": -16.95430564880371, + "rewards/margins": 8.488348960876465, + "rewards/rejected": -25.442655563354492, + "step": 286 + }, + { + "epoch": 0.6491376873056263, + "grad_norm": 111.41714747114163, + "learning_rate": 2.6501124845954363e-07, + "logits/chosen": -16.922765731811523, + "logits/rejected": -16.570079803466797, + "logps/chosen": -1.5942519903182983, + "logps/rejected": -2.0841176509857178, + "loss": 2.7103, + "rewards/accuracies": 0.75, + "rewards/chosen": -15.942520141601562, + "rewards/margins": 4.898656368255615, + "rewards/rejected": -20.841175079345703, + "step": 287 + }, + { + "epoch": 0.6513994910941476, + "grad_norm": 111.54831143350387, + "learning_rate": 2.62035865748246e-07, + "logits/chosen": -19.410310745239258, + "logits/rejected": -19.599136352539062, + "logps/chosen": -1.7219384908676147, + "logps/rejected": -2.0403764247894287, + "loss": 3.257, + "rewards/accuracies": 0.8125, + "rewards/chosen": -17.219385147094727, + "rewards/margins": 3.184377670288086, + "rewards/rejected": -20.403764724731445, + "step": 288 + }, + { + "epoch": 0.6536612948826689, + "grad_norm": 129.1650835585718, + "learning_rate": 2.5906912241871554e-07, + "logits/chosen": -19.173494338989258, + "logits/rejected": -19.192523956298828, + "logps/chosen": -1.6308451890945435, + "logps/rejected": -2.0142531394958496, + "loss": 3.5351, + "rewards/accuracies": 0.8125, + "rewards/chosen": -16.308452606201172, + "rewards/margins": 3.8340790271759033, + "rewards/rejected": -20.142528533935547, + "step": 289 + }, + { + "epoch": 0.6559230986711903, + "grad_norm": 
114.97395819625001, + "learning_rate": 2.561112042498753e-07, + "logits/chosen": -17.663278579711914, + "logits/rejected": -17.458215713500977, + "logps/chosen": -1.433032751083374, + "logps/rejected": -1.9120677709579468, + "loss": 3.1725, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.330328941345215, + "rewards/margins": 4.79034948348999, + "rewards/rejected": -19.12067985534668, + "step": 290 + }, + { + "epoch": 0.6581849024597116, + "grad_norm": 118.80435385837328, + "learning_rate": 2.5316229646801195e-07, + "logits/chosen": -19.93079948425293, + "logits/rejected": -19.657909393310547, + "logps/chosen": -1.6756254434585571, + "logps/rejected": -2.254075288772583, + "loss": 2.7255, + "rewards/accuracies": 0.78125, + "rewards/chosen": -16.756254196166992, + "rewards/margins": 5.7845001220703125, + "rewards/rejected": -22.540752410888672, + "step": 291 + }, + { + "epoch": 0.660446706248233, + "grad_norm": 120.26763355250853, + "learning_rate": 2.5022258373517714e-07, + "logits/chosen": -18.864389419555664, + "logits/rejected": -18.524669647216797, + "logps/chosen": -1.6191332340240479, + "logps/rejected": -2.0536766052246094, + "loss": 2.8735, + "rewards/accuracies": 0.71875, + "rewards/chosen": -16.191333770751953, + "rewards/margins": 4.345433235168457, + "rewards/rejected": -20.536766052246094, + "step": 292 + }, + { + "epoch": 0.6627085100367544, + "grad_norm": 147.0811073810554, + "learning_rate": 2.4729225013762474e-07, + "logits/chosen": -18.751914978027344, + "logits/rejected": -18.83761215209961, + "logps/chosen": -1.7884494066238403, + "logps/rejected": -2.187746524810791, + "loss": 3.8995, + "rewards/accuracies": 0.65625, + "rewards/chosen": -17.884492874145508, + "rewards/margins": 3.9929721355438232, + "rewards/rejected": -21.877464294433594, + "step": 293 + }, + { + "epoch": 0.6649703138252756, + "grad_norm": 148.90534716218426, + "learning_rate": 2.4437147917428203e-07, + "logits/chosen": -18.826107025146484, + "logits/rejected": -18.503259658813477, + "logps/chosen": -1.7195425033569336, + "logps/rejected": -2.187434434890747, + "loss": 3.0299, + "rewards/accuracies": 0.8125, + "rewards/chosen": -17.195425033569336, + "rewards/margins": 4.678918838500977, + "rewards/rejected": -21.87434196472168, + "step": 294 + }, + { + "epoch": 0.667232117613797, + "grad_norm": 127.90476462202498, + "learning_rate": 2.414604537452595e-07, + "logits/chosen": -18.674943923950195, + "logits/rejected": -18.60759735107422, + "logps/chosen": -1.7201087474822998, + "logps/rejected": -2.014519214630127, + "loss": 3.128, + "rewards/accuracies": 0.625, + "rewards/chosen": -17.201087951660156, + "rewards/margins": 2.9441049098968506, + "rewards/rejected": -20.145193099975586, + "step": 295 + }, + { + "epoch": 0.6694939214023183, + "grad_norm": 108.9145846232443, + "learning_rate": 2.385593561403974e-07, + "logits/chosen": -19.400646209716797, + "logits/rejected": -19.273517608642578, + "logps/chosen": -1.726077914237976, + "logps/rejected": -2.1319644451141357, + "loss": 2.882, + "rewards/accuracies": 0.65625, + "rewards/chosen": -17.260780334472656, + "rewards/margins": 4.058864593505859, + "rewards/rejected": -21.319643020629883, + "step": 296 + }, + { + "epoch": 0.6717557251908397, + "grad_norm": 112.86761120489062, + "learning_rate": 2.3566836802785119e-07, + "logits/chosen": -18.81859016418457, + "logits/rejected": -18.859493255615234, + "logps/chosen": -1.9398648738861084, + "logps/rejected": -2.3005058765411377, + "loss": 2.8161, + "rewards/accuracies": 0.6875, + "rewards/chosen": 
-19.39864730834961, + "rewards/margins": 3.60640811920166, + "rewards/rejected": -23.005056381225586, + "step": 297 + }, + { + "epoch": 0.6740175289793611, + "grad_norm": 118.40226934077113, + "learning_rate": 2.327876704427146e-07, + "logits/chosen": -18.128990173339844, + "logits/rejected": -18.05478858947754, + "logps/chosen": -1.7885990142822266, + "logps/rejected": -2.1967928409576416, + "loss": 3.3258, + "rewards/accuracies": 0.71875, + "rewards/chosen": -17.885990142822266, + "rewards/margins": 4.08193826675415, + "rewards/rejected": -21.96792984008789, + "step": 298 + }, + { + "epoch": 0.6762793327678824, + "grad_norm": 153.7897090271479, + "learning_rate": 2.2991744377568358e-07, + "logits/chosen": -17.88959312438965, + "logits/rejected": -17.185943603515625, + "logps/chosen": -1.6429543495178223, + "logps/rejected": -2.067078113555908, + "loss": 3.74, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.429542541503906, + "rewards/margins": 4.241240501403809, + "rewards/rejected": -20.67078399658203, + "step": 299 + }, + { + "epoch": 0.6785411365564037, + "grad_norm": 133.58644267056656, + "learning_rate": 2.270578677617601e-07, + "logits/chosen": -18.508695602416992, + "logits/rejected": -18.557147979736328, + "logps/chosen": -1.617353916168213, + "logps/rejected": -2.0458405017852783, + "loss": 3.4581, + "rewards/accuracies": 0.71875, + "rewards/chosen": -16.173538208007812, + "rewards/margins": 4.2848663330078125, + "rewards/rejected": -20.458406448364258, + "step": 300 + }, + { + "epoch": 0.6808029403449251, + "grad_norm": 116.99592631499642, + "learning_rate": 2.242091214689971e-07, + "logits/chosen": -18.887380599975586, + "logits/rejected": -18.278608322143555, + "logps/chosen": -1.7732751369476318, + "logps/rejected": -2.1223254203796387, + "loss": 2.8134, + "rewards/accuracies": 0.78125, + "rewards/chosen": -17.732751846313477, + "rewards/margins": 3.4905025959014893, + "rewards/rejected": -21.223255157470703, + "step": 301 + }, + { + "epoch": 0.6830647441334464, + "grad_norm": 129.92140054225982, + "learning_rate": 2.2137138328728456e-07, + "logits/chosen": -18.269765853881836, + "logits/rejected": -17.92385482788086, + "logps/chosen": -1.8314377069473267, + "logps/rejected": -2.02689266204834, + "loss": 3.2214, + "rewards/accuracies": 0.625, + "rewards/chosen": -18.314376831054688, + "rewards/margins": 1.9545530080795288, + "rewards/rejected": -20.26892852783203, + "step": 302 + }, + { + "epoch": 0.6853265479219678, + "grad_norm": 120.44838708614284, + "learning_rate": 2.1854483091717974e-07, + "logits/chosen": -17.881437301635742, + "logits/rejected": -17.71358299255371, + "logps/chosen": -1.6855335235595703, + "logps/rejected": -2.2010655403137207, + "loss": 2.7716, + "rewards/accuracies": 0.8125, + "rewards/chosen": -16.855335235595703, + "rewards/margins": 5.155317783355713, + "rewards/rejected": -22.010652542114258, + "step": 303 + }, + { + "epoch": 0.6875883517104892, + "grad_norm": 142.94697133093283, + "learning_rate": 2.1572964135877863e-07, + "logits/chosen": -17.533218383789062, + "logits/rejected": -17.355274200439453, + "logps/chosen": -1.5027070045471191, + "logps/rejected": -2.0504214763641357, + "loss": 3.376, + "rewards/accuracies": 0.75, + "rewards/chosen": -15.027069091796875, + "rewards/margins": 5.477144241333008, + "rewards/rejected": -20.504213333129883, + "step": 304 + }, + { + "epoch": 0.6898501554990104, + "grad_norm": 114.71461725743285, + "learning_rate": 2.1292599090063245e-07, + "logits/chosen": -18.869152069091797, + "logits/rejected": 
-18.81059455871582, + "logps/chosen": -1.6714305877685547, + "logps/rejected": -2.1169075965881348, + "loss": 2.7332, + "rewards/accuracies": 0.78125, + "rewards/chosen": -16.714305877685547, + "rewards/margins": 4.454771518707275, + "rewards/rejected": -21.169076919555664, + "step": 305 + }, + { + "epoch": 0.6921119592875318, + "grad_norm": 111.93201937391214, + "learning_rate": 2.1013405510870824e-07, + "logits/chosen": -18.295650482177734, + "logits/rejected": -18.45261573791504, + "logps/chosen": -1.8770661354064941, + "logps/rejected": -2.236387252807617, + "loss": 3.3382, + "rewards/accuracies": 0.625, + "rewards/chosen": -18.770662307739258, + "rewards/margins": 3.593210220336914, + "rewards/rejected": -22.36387062072754, + "step": 306 + }, + { + "epoch": 0.6943737630760531, + "grad_norm": 129.49294218978284, + "learning_rate": 2.0735400881539494e-07, + "logits/chosen": -20.06885528564453, + "logits/rejected": -20.67595672607422, + "logps/chosen": -1.699569821357727, + "logps/rejected": -2.1293015480041504, + "loss": 3.2719, + "rewards/accuracies": 0.78125, + "rewards/chosen": -16.995698928833008, + "rewards/margins": 4.29731559753418, + "rewards/rejected": -21.293014526367188, + "step": 307 + }, + { + "epoch": 0.6966355668645745, + "grad_norm": 126.68283700786048, + "learning_rate": 2.0458602610855536e-07, + "logits/chosen": -16.85354995727539, + "logits/rejected": -17.0955753326416, + "logps/chosen": -1.633847713470459, + "logps/rejected": -2.1871325969696045, + "loss": 2.746, + "rewards/accuracies": 0.875, + "rewards/chosen": -16.338476181030273, + "rewards/margins": 5.532848834991455, + "rewards/rejected": -21.87132453918457, + "step": 308 + }, + { + "epoch": 0.6988973706530959, + "grad_norm": 121.46567102959813, + "learning_rate": 2.0183028032062422e-07, + "logits/chosen": -18.197134017944336, + "logits/rejected": -18.313335418701172, + "logps/chosen": -1.7123744487762451, + "logps/rejected": -2.3391215801239014, + "loss": 3.301, + "rewards/accuracies": 0.6875, + "rewards/chosen": -17.123743057250977, + "rewards/margins": 6.267471790313721, + "rewards/rejected": -23.391216278076172, + "step": 309 + }, + { + "epoch": 0.7011591744416172, + "grad_norm": 124.40646143719266, + "learning_rate": 1.9908694401775473e-07, + "logits/chosen": -19.83294677734375, + "logits/rejected": -20.086868286132812, + "logps/chosen": -2.0055930614471436, + "logps/rejected": -2.405197858810425, + "loss": 3.0917, + "rewards/accuracies": 0.8125, + "rewards/chosen": -20.055932998657227, + "rewards/margins": 3.996046781539917, + "rewards/rejected": -24.051979064941406, + "step": 310 + }, + { + "epoch": 0.7034209782301385, + "grad_norm": 126.95244540601655, + "learning_rate": 1.9635618898901196e-07, + "logits/chosen": -19.060583114624023, + "logits/rejected": -19.149402618408203, + "logps/chosen": -1.9011938571929932, + "logps/rejected": -2.284623146057129, + "loss": 3.0781, + "rewards/accuracies": 0.65625, + "rewards/chosen": -19.011938095092773, + "rewards/margins": 3.834294080734253, + "rewards/rejected": -22.846233367919922, + "step": 311 + }, + { + "epoch": 0.7056827820186599, + "grad_norm": 131.1304953783553, + "learning_rate": 1.9363818623561565e-07, + "logits/chosen": -18.06791114807129, + "logits/rejected": -17.94767189025879, + "logps/chosen": -1.763685941696167, + "logps/rejected": -2.0710325241088867, + "loss": 3.5313, + "rewards/accuracies": 0.6875, + "rewards/chosen": -17.636857986450195, + "rewards/margins": 3.0734646320343018, + "rewards/rejected": -20.710325241088867, + "step": 312 + }, + { 
+ "epoch": 0.7079445858071812, + "grad_norm": 116.35057290069598, + "learning_rate": 1.9093310596023108e-07, + "logits/chosen": -18.00191307067871, + "logits/rejected": -18.05375099182129, + "logps/chosen": -1.9328045845031738, + "logps/rejected": -2.434842348098755, + "loss": 2.5712, + "rewards/accuracies": 0.78125, + "rewards/chosen": -19.328044891357422, + "rewards/margins": 5.020379543304443, + "rewards/rejected": -24.348424911499023, + "step": 313 + }, + { + "epoch": 0.7102063895957026, + "grad_norm": 107.72415682056965, + "learning_rate": 1.8824111755631274e-07, + "logits/chosen": -17.74974250793457, + "logits/rejected": -17.714256286621094, + "logps/chosen": -1.6889841556549072, + "logps/rejected": -2.1979172229766846, + "loss": 3.324, + "rewards/accuracies": 0.84375, + "rewards/chosen": -16.889841079711914, + "rewards/margins": 5.089331150054932, + "rewards/rejected": -21.97917366027832, + "step": 314 + }, + { + "epoch": 0.712468193384224, + "grad_norm": 175.2751395769359, + "learning_rate": 1.8556238959749457e-07, + "logits/chosen": -20.16362762451172, + "logits/rejected": -20.45577049255371, + "logps/chosen": -1.9660546779632568, + "logps/rejected": -2.3499526977539062, + "loss": 3.5857, + "rewards/accuracies": 0.71875, + "rewards/chosen": -19.660547256469727, + "rewards/margins": 3.8389804363250732, + "rewards/rejected": -23.49952507019043, + "step": 315 + }, + { + "epoch": 0.7147299971727452, + "grad_norm": 119.01395158521336, + "learning_rate": 1.8289708982703562e-07, + "logits/chosen": -18.191469192504883, + "logits/rejected": -18.05630111694336, + "logps/chosen": -1.6164871454238892, + "logps/rejected": -2.0442566871643066, + "loss": 3.1091, + "rewards/accuracies": 0.78125, + "rewards/chosen": -16.164871215820312, + "rewards/margins": 4.277695655822754, + "rewards/rejected": -20.442567825317383, + "step": 316 + }, + { + "epoch": 0.7169918009612666, + "grad_norm": 110.39790952014322, + "learning_rate": 1.802453851473151e-07, + "logits/chosen": -18.25019073486328, + "logits/rejected": -18.184785842895508, + "logps/chosen": -1.9268181324005127, + "logps/rejected": -2.5494563579559326, + "loss": 2.5819, + "rewards/accuracies": 0.75, + "rewards/chosen": -19.26818084716797, + "rewards/margins": 6.226382732391357, + "rewards/rejected": -25.494564056396484, + "step": 317 + }, + { + "epoch": 0.719253604749788, + "grad_norm": 111.59669789809674, + "learning_rate": 1.7760744160938093e-07, + "logits/chosen": -19.184326171875, + "logits/rejected": -19.069150924682617, + "logps/chosen": -1.9187712669372559, + "logps/rejected": -2.417238473892212, + "loss": 2.5089, + "rewards/accuracies": 0.90625, + "rewards/chosen": -19.187713623046875, + "rewards/margins": 4.984671592712402, + "rewards/rejected": -24.172386169433594, + "step": 318 + }, + { + "epoch": 0.7215154085383093, + "grad_norm": 111.40164812220848, + "learning_rate": 1.7498342440255135e-07, + "logits/chosen": -17.5487060546875, + "logits/rejected": -17.807178497314453, + "logps/chosen": -1.8459084033966064, + "logps/rejected": -2.48807430267334, + "loss": 3.5431, + "rewards/accuracies": 0.84375, + "rewards/chosen": -18.459083557128906, + "rewards/margins": 6.42165994644165, + "rewards/rejected": -24.88074493408203, + "step": 319 + }, + { + "epoch": 0.7237772123268307, + "grad_norm": 124.12971999012613, + "learning_rate": 1.7237349784407115e-07, + "logits/chosen": -17.986967086791992, + "logits/rejected": -18.200471878051758, + "logps/chosen": -2.0284173488616943, + "logps/rejected": -2.5237107276916504, + "loss": 3.7015, + 
"rewards/accuracies": 0.59375, + "rewards/chosen": -20.2841739654541, + "rewards/margins": 4.952933311462402, + "rewards/rejected": -25.237106323242188, + "step": 320 + }, + { + "epoch": 0.726039016115352, + "grad_norm": 141.09967606925343, + "learning_rate": 1.6977782536882178e-07, + "logits/chosen": -16.887096405029297, + "logits/rejected": -16.802282333374023, + "logps/chosen": -1.782692790031433, + "logps/rejected": -2.303356170654297, + "loss": 3.1257, + "rewards/accuracies": 0.8125, + "rewards/chosen": -17.82692527770996, + "rewards/margins": 5.206636428833008, + "rewards/rejected": -23.0335636138916, + "step": 321 + }, + { + "epoch": 0.7283008199038733, + "grad_norm": 116.38070332785638, + "learning_rate": 1.6719656951908708e-07, + "logits/chosen": -17.198162078857422, + "logits/rejected": -16.910192489624023, + "logps/chosen": -1.3144806623458862, + "logps/rejected": -1.8561556339263916, + "loss": 2.7638, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.144807815551758, + "rewards/margins": 5.416749954223633, + "rewards/rejected": -18.561553955078125, + "step": 322 + }, + { + "epoch": 0.7305626236923947, + "grad_norm": 114.11746852210314, + "learning_rate": 1.6462989193437453e-07, + "logits/chosen": -17.512184143066406, + "logits/rejected": -17.920053482055664, + "logps/chosen": -1.9415867328643799, + "logps/rejected": -2.1908974647521973, + "loss": 3.6178, + "rewards/accuracies": 0.59375, + "rewards/chosen": -19.415868759155273, + "rewards/margins": 2.493105888366699, + "rewards/rejected": -21.908973693847656, + "step": 323 + }, + { + "epoch": 0.732824427480916, + "grad_norm": 113.99856611080652, + "learning_rate": 1.6207795334129365e-07, + "logits/chosen": -19.32732582092285, + "logits/rejected": -19.14191246032715, + "logps/chosen": -1.6971518993377686, + "logps/rejected": -2.295579433441162, + "loss": 2.8077, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.971519470214844, + "rewards/margins": 5.98427677154541, + "rewards/rejected": -22.955795288085938, + "step": 324 + }, + { + "epoch": 0.7350862312694374, + "grad_norm": 111.98084422068199, + "learning_rate": 1.5954091354349121e-07, + "logits/chosen": -17.98455238342285, + "logits/rejected": -17.884262084960938, + "logps/chosen": -1.7183001041412354, + "logps/rejected": -2.17026424407959, + "loss": 2.8751, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.183000564575195, + "rewards/margins": 4.519641876220703, + "rewards/rejected": -21.7026424407959, + "step": 325 + }, + { + "epoch": 0.7373480350579588, + "grad_norm": 232.39770149735068, + "learning_rate": 1.5701893141164364e-07, + "logits/chosen": -18.812040328979492, + "logits/rejected": -18.633085250854492, + "logps/chosen": -1.676187515258789, + "logps/rejected": -2.3979151248931885, + "loss": 3.0272, + "rewards/accuracies": 0.875, + "rewards/chosen": -16.761873245239258, + "rewards/margins": 7.217278480529785, + "rewards/rejected": -23.979150772094727, + "step": 326 + }, + { + "epoch": 0.73960983884648, + "grad_norm": 120.7387881104363, + "learning_rate": 1.545121648735093e-07, + "logits/chosen": -18.34151840209961, + "logits/rejected": -18.516704559326172, + "logps/chosen": -1.6959328651428223, + "logps/rejected": -2.0101141929626465, + "loss": 3.0935, + "rewards/accuracies": 0.625, + "rewards/chosen": -16.95932960510254, + "rewards/margins": 3.1418118476867676, + "rewards/rejected": -20.10114097595215, + "step": 327 + }, + { + "epoch": 0.7418716426350014, + "grad_norm": 116.46893556878314, + "learning_rate": 1.5202077090403863e-07, + "logits/chosen": 
-16.94781494140625, + "logits/rejected": -16.98765754699707, + "logps/chosen": -1.6322197914123535, + "logps/rejected": -2.0489912033081055, + "loss": 2.9396, + "rewards/accuracies": 0.8125, + "rewards/chosen": -16.32219696044922, + "rewards/margins": 4.1677141189575195, + "rewards/rejected": -20.489913940429688, + "step": 328 + }, + { + "epoch": 0.7441334464235227, + "grad_norm": 146.64902887166238, + "learning_rate": 1.495449055155443e-07, + "logits/chosen": -16.194108963012695, + "logits/rejected": -16.39000701904297, + "logps/chosen": -1.4700114727020264, + "logps/rejected": -1.9279454946517944, + "loss": 3.1195, + "rewards/accuracies": 0.78125, + "rewards/chosen": -14.700116157531738, + "rewards/margins": 4.579338550567627, + "rewards/rejected": -19.279455184936523, + "step": 329 + }, + { + "epoch": 0.7463952502120441, + "grad_norm": 128.77629572500186, + "learning_rate": 1.4708472374793112e-07, + "logits/chosen": -18.751955032348633, + "logits/rejected": -18.25170135498047, + "logps/chosen": -1.6594680547714233, + "logps/rejected": -2.1272830963134766, + "loss": 3.568, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.594682693481445, + "rewards/margins": 4.67814826965332, + "rewards/rejected": -21.272830963134766, + "step": 330 + }, + { + "epoch": 0.7486570540005655, + "grad_norm": 116.85830667937259, + "learning_rate": 1.4464037965898878e-07, + "logits/chosen": -19.034826278686523, + "logits/rejected": -18.4996395111084, + "logps/chosen": -1.6908671855926514, + "logps/rejected": -2.253139019012451, + "loss": 3.1892, + "rewards/accuracies": 0.84375, + "rewards/chosen": -16.908668518066406, + "rewards/margins": 5.622718811035156, + "rewards/rejected": -22.531389236450195, + "step": 331 + }, + { + "epoch": 0.7509188577890868, + "grad_norm": 128.75404728697518, + "learning_rate": 1.4221202631474282e-07, + "logits/chosen": -18.098434448242188, + "logits/rejected": -18.39754867553711, + "logps/chosen": -1.7314308881759644, + "logps/rejected": -2.248021125793457, + "loss": 3.2902, + "rewards/accuracies": 0.71875, + "rewards/chosen": -17.314308166503906, + "rewards/margins": 5.165902137756348, + "rewards/rejected": -22.480209350585938, + "step": 332 + }, + { + "epoch": 0.7531806615776081, + "grad_norm": 112.23640069941514, + "learning_rate": 1.3979981577987113e-07, + "logits/chosen": -17.052106857299805, + "logits/rejected": -17.000144958496094, + "logps/chosen": -1.8308027982711792, + "logps/rejected": -2.2044148445129395, + "loss": 2.8652, + "rewards/accuracies": 0.75, + "rewards/chosen": -18.308027267456055, + "rewards/margins": 3.73612117767334, + "rewards/rejected": -22.04414939880371, + "step": 333 + }, + { + "epoch": 0.7554424653661295, + "grad_norm": 120.8440355705056, + "learning_rate": 1.374038991081807e-07, + "logits/chosen": -17.621667861938477, + "logits/rejected": -17.636934280395508, + "logps/chosen": -1.6944687366485596, + "logps/rejected": -2.1847054958343506, + "loss": 2.8811, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.944686889648438, + "rewards/margins": 4.902366638183594, + "rewards/rejected": -21.84705352783203, + "step": 334 + }, + { + "epoch": 0.7577042691546508, + "grad_norm": 126.66430839817343, + "learning_rate": 1.3502442633314882e-07, + "logits/chosen": -16.78680992126465, + "logits/rejected": -16.919919967651367, + "logps/chosen": -1.619330883026123, + "logps/rejected": -2.069342851638794, + "loss": 2.8007, + "rewards/accuracies": 0.84375, + "rewards/chosen": -16.193309783935547, + "rewards/margins": 4.500118255615234, + "rewards/rejected": 
-20.69342803955078, + "step": 335 + }, + { + "epoch": 0.7599660729431722, + "grad_norm": 107.65531245820142, + "learning_rate": 1.3266154645852815e-07, + "logits/chosen": -18.672462463378906, + "logits/rejected": -18.989612579345703, + "logps/chosen": -1.6392340660095215, + "logps/rejected": -2.228325366973877, + "loss": 2.9037, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.39234161376953, + "rewards/margins": 5.890913009643555, + "rewards/rejected": -22.283252716064453, + "step": 336 + }, + { + "epoch": 0.7622278767316936, + "grad_norm": 104.66828130875484, + "learning_rate": 1.303154074490152e-07, + "logits/chosen": -17.055776596069336, + "logits/rejected": -16.77735137939453, + "logps/chosen": -1.4908720254898071, + "logps/rejected": -1.799816608428955, + "loss": 2.7556, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.908721923828125, + "rewards/margins": 3.089444875717163, + "rewards/rejected": -17.998165130615234, + "step": 337 + }, + { + "epoch": 0.7644896805202148, + "grad_norm": 117.88462800241341, + "learning_rate": 1.2798615622098616e-07, + "logits/chosen": -17.4500732421875, + "logits/rejected": -17.197757720947266, + "logps/chosen": -1.6963038444519043, + "logps/rejected": -2.2104175090789795, + "loss": 2.8171, + "rewards/accuracies": 0.6875, + "rewards/chosen": -16.96303939819336, + "rewards/margins": 5.141136646270752, + "rewards/rejected": -22.104175567626953, + "step": 338 + }, + { + "epoch": 0.7667514843087362, + "grad_norm": 106.90568181230161, + "learning_rate": 1.2567393863329523e-07, + "logits/chosen": -18.870460510253906, + "logits/rejected": -18.87204360961914, + "logps/chosen": -1.8440241813659668, + "logps/rejected": -2.3904788494110107, + "loss": 2.8668, + "rewards/accuracies": 0.71875, + "rewards/chosen": -18.44024085998535, + "rewards/margins": 5.464546203613281, + "rewards/rejected": -23.904788970947266, + "step": 339 + }, + { + "epoch": 0.7690132880972576, + "grad_norm": 135.41776419808772, + "learning_rate": 1.233788994781423e-07, + "logits/chosen": -17.011192321777344, + "logits/rejected": -17.013751983642578, + "logps/chosen": -1.4503196477890015, + "logps/rejected": -2.010632276535034, + "loss": 3.3367, + "rewards/accuracies": 0.78125, + "rewards/chosen": -14.50319766998291, + "rewards/margins": 5.603124618530273, + "rewards/rejected": -20.106321334838867, + "step": 340 + }, + { + "epoch": 0.7712750918857789, + "grad_norm": 136.82224989040978, + "learning_rate": 1.2110118247200468e-07, + "logits/chosen": -18.286027908325195, + "logits/rejected": -18.17170524597168, + "logps/chosen": -1.6681314706802368, + "logps/rejected": -2.0645222663879395, + "loss": 2.9115, + "rewards/accuracies": 0.8125, + "rewards/chosen": -16.68131446838379, + "rewards/margins": 3.963907241821289, + "rewards/rejected": -20.645221710205078, + "step": 341 + }, + { + "epoch": 0.7735368956743003, + "grad_norm": 116.57287182904345, + "learning_rate": 1.1884093024663933e-07, + "logits/chosen": -16.591590881347656, + "logits/rejected": -16.540773391723633, + "logps/chosen": -1.636472463607788, + "logps/rejected": -2.1073007583618164, + "loss": 2.8411, + "rewards/accuracies": 0.65625, + "rewards/chosen": -16.36472511291504, + "rewards/margins": 4.708281517028809, + "rewards/rejected": -21.07300567626953, + "step": 342 + }, + { + "epoch": 0.7757986994628217, + "grad_norm": 118.5480993837639, + "learning_rate": 1.1659828434014886e-07, + "logits/chosen": -17.95990562438965, + "logits/rejected": -17.720420837402344, + "logps/chosen": -1.635303020477295, + "logps/rejected": 
-2.0385963916778564, + "loss": 3.0565, + "rewards/accuracies": 0.6875, + "rewards/chosen": -16.353031158447266, + "rewards/margins": 4.032935619354248, + "rewards/rejected": -20.38596534729004, + "step": 343 + }, + { + "epoch": 0.7780605032513429, + "grad_norm": 134.89450131828505, + "learning_rate": 1.143733851881203e-07, + "logits/chosen": -19.509780883789062, + "logits/rejected": -19.24722671508789, + "logps/chosen": -1.545256495475769, + "logps/rejected": -2.114616632461548, + "loss": 2.6658, + "rewards/accuracies": 0.8125, + "rewards/chosen": -15.452564239501953, + "rewards/margins": 5.693601608276367, + "rewards/rejected": -21.146167755126953, + "step": 344 + }, + { + "epoch": 0.7803223070398643, + "grad_norm": 123.25488488665061, + "learning_rate": 1.1216637211483005e-07, + "logits/chosen": -18.04479217529297, + "logits/rejected": -18.232608795166016, + "logps/chosen": -1.771346092224121, + "logps/rejected": -2.349630355834961, + "loss": 3.2986, + "rewards/accuracies": 0.6875, + "rewards/chosen": -17.71346092224121, + "rewards/margins": 5.782842636108398, + "rewards/rejected": -23.49630355834961, + "step": 345 + }, + { + "epoch": 0.7825841108283856, + "grad_norm": 114.24822425854607, + "learning_rate": 1.0997738332451936e-07, + "logits/chosen": -18.91605567932129, + "logits/rejected": -18.952014923095703, + "logps/chosen": -1.9931975603103638, + "logps/rejected": -2.6114182472229004, + "loss": 2.7017, + "rewards/accuracies": 0.8125, + "rewards/chosen": -19.931976318359375, + "rewards/margins": 6.182207107543945, + "rewards/rejected": -26.11418342590332, + "step": 346 + }, + { + "epoch": 0.784845914616907, + "grad_norm": 101.71447993811626, + "learning_rate": 1.0780655589274031e-07, + "logits/chosen": -19.68770408630371, + "logits/rejected": -19.459531784057617, + "logps/chosen": -1.9268380403518677, + "logps/rejected": -2.4731695652008057, + "loss": 3.0762, + "rewards/accuracies": 0.6875, + "rewards/chosen": -19.26837921142578, + "rewards/margins": 5.463315010070801, + "rewards/rejected": -24.731693267822266, + "step": 347 + }, + { + "epoch": 0.7871077184054284, + "grad_norm": 144.33485472555742, + "learning_rate": 1.056540257577712e-07, + "logits/chosen": -19.237892150878906, + "logits/rejected": -19.197168350219727, + "logps/chosen": -2.0048787593841553, + "logps/rejected": -2.5931644439697266, + "loss": 2.7076, + "rewards/accuracies": 0.75, + "rewards/chosen": -20.048786163330078, + "rewards/margins": 5.882862091064453, + "rewards/rejected": -25.93164825439453, + "step": 348 + }, + { + "epoch": 0.7893695221939496, + "grad_norm": 112.5748530177231, + "learning_rate": 1.0351992771210554e-07, + "logits/chosen": -18.623769760131836, + "logits/rejected": -19.13688850402832, + "logps/chosen": -1.8369648456573486, + "logps/rejected": -2.3405067920684814, + "loss": 3.2742, + "rewards/accuracies": 0.71875, + "rewards/chosen": -18.369647979736328, + "rewards/margins": 5.035419464111328, + "rewards/rejected": -23.405067443847656, + "step": 349 + }, + { + "epoch": 0.791631325982471, + "grad_norm": 132.898356490048, + "learning_rate": 1.0140439539400953e-07, + "logits/chosen": -18.275182723999023, + "logits/rejected": -18.35052490234375, + "logps/chosen": -2.0253753662109375, + "logps/rejected": -2.4307503700256348, + "loss": 3.2702, + "rewards/accuracies": 0.75, + "rewards/chosen": -20.253755569458008, + "rewards/margins": 4.053745746612549, + "rewards/rejected": -24.307498931884766, + "step": 350 + }, + { + "epoch": 0.7938931297709924, + "grad_norm": 119.8096502311399, + "learning_rate": 
9.930756127915488e-08, + "logits/chosen": -20.30059051513672, + "logits/rejected": -20.41552734375, + "logps/chosen": -1.89390230178833, + "logps/rejected": -2.320417642593384, + "loss": 2.8442, + "rewards/accuracies": 0.75, + "rewards/chosen": -18.939023971557617, + "rewards/margins": 4.265152454376221, + "rewards/rejected": -23.204177856445312, + "step": 351 + }, + { + "epoch": 0.7961549335595137, + "grad_norm": 126.98574065425882, + "learning_rate": 9.722955667232242e-08, + "logits/chosen": -16.467742919921875, + "logits/rejected": -16.47926902770996, + "logps/chosen": -1.5406644344329834, + "logps/rejected": -2.026040554046631, + "loss": 3.5477, + "rewards/accuracies": 0.75, + "rewards/chosen": -15.406644821166992, + "rewards/margins": 4.853761672973633, + "rewards/rejected": -20.260406494140625, + "step": 352 + }, + { + "epoch": 0.7984167373480351, + "grad_norm": 136.73209013329856, + "learning_rate": 9.517051169918016e-08, + "logits/chosen": -17.076210021972656, + "logits/rejected": -17.026386260986328, + "logps/chosen": -1.750929832458496, + "logps/rejected": -2.2710447311401367, + "loss": 3.3139, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.509296417236328, + "rewards/margins": 5.2011494636535645, + "rewards/rejected": -22.71044921875, + "step": 353 + }, + { + "epoch": 0.8006785411365565, + "grad_norm": 130.1019327736892, + "learning_rate": 9.313055529813412e-08, + "logits/chosen": -18.05898666381836, + "logits/rejected": -18.257951736450195, + "logps/chosen": -1.6652144193649292, + "logps/rejected": -1.9794695377349854, + "loss": 3.2462, + "rewards/accuracies": 0.6875, + "rewards/chosen": -16.652145385742188, + "rewards/margins": 3.142549991607666, + "rewards/rejected": -19.794694900512695, + "step": 354 + }, + { + "epoch": 0.8029403449250777, + "grad_norm": 135.42727334061703, + "learning_rate": 9.110981521225532e-08, + "logits/chosen": -17.604793548583984, + "logits/rejected": -17.597965240478516, + "logps/chosen": -1.6499242782592773, + "logps/rejected": -2.056095600128174, + "loss": 3.3403, + "rewards/accuracies": 0.6875, + "rewards/chosen": -16.499242782592773, + "rewards/margins": 4.061714172363281, + "rewards/rejected": -20.560955047607422, + "step": 355 + }, + { + "epoch": 0.8052021487135991, + "grad_norm": 119.05199787120601, + "learning_rate": 8.910841798127884e-08, + "logits/chosen": -17.721969604492188, + "logits/rejected": -18.05399513244629, + "logps/chosen": -1.4811893701553345, + "logps/rejected": -1.954594612121582, + "loss": 3.4896, + "rewards/accuracies": 0.71875, + "rewards/chosen": -14.811893463134766, + "rewards/margins": 4.734054088592529, + "rewards/rejected": -19.545948028564453, + "step": 356 + }, + { + "epoch": 0.8074639525021204, + "grad_norm": 115.38476702241651, + "learning_rate": 8.712648893368139e-08, + "logits/chosen": -18.29002571105957, + "logits/rejected": -18.369548797607422, + "logps/chosen": -2.055358409881592, + "logps/rejected": -2.6065731048583984, + "loss": 3.0386, + "rewards/accuracies": 0.6875, + "rewards/chosen": -20.553585052490234, + "rewards/margins": 5.512145042419434, + "rewards/rejected": -26.065731048583984, + "step": 357 + }, + { + "epoch": 0.8097257562906418, + "grad_norm": 136.37770427632142, + "learning_rate": 8.516415217883186e-08, + "logits/chosen": -20.281574249267578, + "logits/rejected": -20.5699405670166, + "logps/chosen": -1.7435851097106934, + "logps/rejected": -2.0679283142089844, + "loss": 3.2075, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.435850143432617, + "rewards/margins": 
3.2434327602386475, + "rewards/rejected": -20.679283142089844, + "step": 358 + }, + { + "epoch": 0.8119875600791632, + "grad_norm": 135.87264464627583, + "learning_rate": 8.32215305992209e-08, + "logits/chosen": -18.004587173461914, + "logits/rejected": -17.975828170776367, + "logps/chosen": -1.6789181232452393, + "logps/rejected": -2.0826172828674316, + "loss": 3.5249, + "rewards/accuracies": 0.71875, + "rewards/chosen": -16.789180755615234, + "rewards/margins": 4.036990165710449, + "rewards/rejected": -20.826171875, + "step": 359 + }, + { + "epoch": 0.8142493638676844, + "grad_norm": 102.16900938424125, + "learning_rate": 8.129874584276448e-08, + "logits/chosen": -19.01247787475586, + "logits/rejected": -18.749008178710938, + "logps/chosen": -1.819298505783081, + "logps/rejected": -2.235302209854126, + "loss": 2.6286, + "rewards/accuracies": 0.75, + "rewards/chosen": -18.19298553466797, + "rewards/margins": 4.160037517547607, + "rewards/rejected": -22.353023529052734, + "step": 360 + }, + { + "epoch": 0.8165111676562058, + "grad_norm": 118.97964507525137, + "learning_rate": 7.939591831518746e-08, + "logits/chosen": -18.50431251525879, + "logits/rejected": -18.590957641601562, + "logps/chosen": -1.4700491428375244, + "logps/rejected": -1.7618913650512695, + "loss": 3.4084, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.700489044189453, + "rewards/margins": 2.9184250831604004, + "rewards/rejected": -17.618913650512695, + "step": 361 + }, + { + "epoch": 0.8187729714447272, + "grad_norm": 116.66310673813236, + "learning_rate": 7.751316717248304e-08, + "logits/chosen": -17.54795265197754, + "logits/rejected": -17.81288719177246, + "logps/chosen": -1.8244284391403198, + "logps/rejected": -2.45658016204834, + "loss": 2.5685, + "rewards/accuracies": 0.78125, + "rewards/chosen": -18.244285583496094, + "rewards/margins": 6.321516990661621, + "rewards/rejected": -24.565799713134766, + "step": 362 + }, + { + "epoch": 0.8210347752332485, + "grad_norm": 135.45249586464112, + "learning_rate": 7.565061031345142e-08, + "logits/chosen": -17.44479751586914, + "logits/rejected": -17.825634002685547, + "logps/chosen": -1.5290935039520264, + "logps/rejected": -2.0649290084838867, + "loss": 2.6049, + "rewards/accuracies": 0.71875, + "rewards/chosen": -15.290933609008789, + "rewards/margins": 5.358358860015869, + "rewards/rejected": -20.6492919921875, + "step": 363 + }, + { + "epoch": 0.8232965790217699, + "grad_norm": 156.30830120318217, + "learning_rate": 7.380836437231686e-08, + "logits/chosen": -17.245084762573242, + "logits/rejected": -17.161640167236328, + "logps/chosen": -1.7969930171966553, + "logps/rejected": -2.382586717605591, + "loss": 3.0618, + "rewards/accuracies": 0.6875, + "rewards/chosen": -17.969932556152344, + "rewards/margins": 5.8559346199035645, + "rewards/rejected": -23.82586669921875, + "step": 364 + }, + { + "epoch": 0.8255583828102913, + "grad_norm": 102.01777871530572, + "learning_rate": 7.198654471142371e-08, + "logits/chosen": -15.612630844116211, + "logits/rejected": -15.740878105163574, + "logps/chosen": -1.6975904703140259, + "logps/rejected": -2.106090545654297, + "loss": 2.2693, + "rewards/accuracies": 0.78125, + "rewards/chosen": -16.97590446472168, + "rewards/margins": 4.084999084472656, + "rewards/rejected": -21.06090545654297, + "step": 365 + }, + { + "epoch": 0.8278201865988125, + "grad_norm": 143.32145816638297, + "learning_rate": 7.01852654140132e-08, + "logits/chosen": -16.77008056640625, + "logits/rejected": -16.502208709716797, + "logps/chosen": 
-1.930238962173462, + "logps/rejected": -2.095974922180176, + "loss": 3.3324, + "rewards/accuracies": 0.65625, + "rewards/chosen": -19.302391052246094, + "rewards/margins": 1.6573582887649536, + "rewards/rejected": -20.959747314453125, + "step": 366 + }, + { + "epoch": 0.8300819903873339, + "grad_norm": 104.17056966782296, + "learning_rate": 6.840463927707833e-08, + "logits/chosen": -19.091691970825195, + "logits/rejected": -18.677135467529297, + "logps/chosen": -1.845261573791504, + "logps/rejected": -2.5466208457946777, + "loss": 2.7259, + "rewards/accuracies": 0.75, + "rewards/chosen": -18.45261573791504, + "rewards/margins": 7.0135955810546875, + "rewards/rejected": -25.46621322631836, + "step": 367 + }, + { + "epoch": 0.8323437941758552, + "grad_norm": 108.38820652434924, + "learning_rate": 6.664477780430138e-08, + "logits/chosen": -18.874038696289062, + "logits/rejected": -18.718944549560547, + "logps/chosen": -1.7501500844955444, + "logps/rejected": -2.1758456230163574, + "loss": 3.0257, + "rewards/accuracies": 0.71875, + "rewards/chosen": -17.50149917602539, + "rewards/margins": 4.256957530975342, + "rewards/rejected": -21.758455276489258, + "step": 368 + }, + { + "epoch": 0.8346055979643766, + "grad_norm": 132.12208838073096, + "learning_rate": 6.49057911990711e-08, + "logits/chosen": -20.289962768554688, + "logits/rejected": -20.444843292236328, + "logps/chosen": -1.7201656103134155, + "logps/rejected": -2.0844554901123047, + "loss": 3.4713, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.201656341552734, + "rewards/margins": 3.642897605895996, + "rewards/rejected": -20.84455108642578, + "step": 369 + }, + { + "epoch": 0.836867401752898, + "grad_norm": 109.87950428664602, + "learning_rate": 6.318778835758189e-08, + "logits/chosen": -19.79219627380371, + "logits/rejected": -19.581737518310547, + "logps/chosen": -1.884903907775879, + "logps/rejected": -2.484591007232666, + "loss": 2.5576, + "rewards/accuracies": 0.8125, + "rewards/chosen": -18.84903907775879, + "rewards/margins": 5.996870994567871, + "rewards/rejected": -24.845909118652344, + "step": 370 + }, + { + "epoch": 0.8391292055414192, + "grad_norm": 132.43862675142938, + "learning_rate": 6.149087686201433e-08, + "logits/chosen": -17.08002471923828, + "logits/rejected": -17.242450714111328, + "logps/chosen": -1.42905592918396, + "logps/rejected": -1.849015712738037, + "loss": 3.3921, + "rewards/accuracies": 0.78125, + "rewards/chosen": -14.290557861328125, + "rewards/margins": 4.199598789215088, + "rewards/rejected": -18.490156173706055, + "step": 371 + }, + { + "epoch": 0.8413910093299406, + "grad_norm": 125.15917954045561, + "learning_rate": 5.98151629737988e-08, + "logits/chosen": -18.34781265258789, + "logits/rejected": -18.730987548828125, + "logps/chosen": -1.8118157386779785, + "logps/rejected": -2.531665802001953, + "loss": 2.9388, + "rewards/accuracies": 0.84375, + "rewards/chosen": -18.1181583404541, + "rewards/margins": 7.19849967956543, + "rewards/rejected": -25.31665802001953, + "step": 372 + }, + { + "epoch": 0.843652813118462, + "grad_norm": 103.52444879535686, + "learning_rate": 5.816075162696097e-08, + "logits/chosen": -17.558019638061523, + "logits/rejected": -17.425859451293945, + "logps/chosen": -1.3931989669799805, + "logps/rejected": -2.0169644355773926, + "loss": 2.5297, + "rewards/accuracies": 0.90625, + "rewards/chosen": -13.931989669799805, + "rewards/margins": 6.237652778625488, + "rewards/rejected": -20.16964340209961, + "step": 373 + }, + { + "epoch": 0.8459146169069833, + "grad_norm": 
96.8291597215048, + "learning_rate": 5.6527746421551046e-08, + "logits/chosen": -19.054040908813477, + "logits/rejected": -18.775854110717773, + "logps/chosen": -1.6858869791030884, + "logps/rejected": -2.047640800476074, + "loss": 3.141, + "rewards/accuracies": 0.6875, + "rewards/chosen": -16.858869552612305, + "rewards/margins": 3.617537260055542, + "rewards/rejected": -20.47640609741211, + "step": 374 + }, + { + "epoch": 0.8481764206955047, + "grad_norm": 111.02813607168243, + "learning_rate": 5.4916249617156064e-08, + "logits/chosen": -18.283065795898438, + "logits/rejected": -17.85355567932129, + "logps/chosen": -1.6720712184906006, + "logps/rejected": -2.2261600494384766, + "loss": 3.0392, + "rewards/accuracies": 0.78125, + "rewards/chosen": -16.720712661743164, + "rewards/margins": 5.540886402130127, + "rewards/rejected": -22.261598587036133, + "step": 375 + }, + { + "epoch": 0.8504382244840261, + "grad_norm": 113.59608952374487, + "learning_rate": 5.332636212649646e-08, + "logits/chosen": -17.41098976135254, + "logits/rejected": -17.46200180053711, + "logps/chosen": -1.504152774810791, + "logps/rejected": -1.9162389039993286, + "loss": 3.0122, + "rewards/accuracies": 0.875, + "rewards/chosen": -15.041528701782227, + "rewards/margins": 4.120862007141113, + "rewards/rejected": -19.162389755249023, + "step": 376 + }, + { + "epoch": 0.8527000282725473, + "grad_norm": 113.17297439644031, + "learning_rate": 5.17581835091069e-08, + "logits/chosen": -18.82924461364746, + "logits/rejected": -19.1933536529541, + "logps/chosen": -1.8666414022445679, + "logps/rejected": -2.4097957611083984, + "loss": 3.0043, + "rewards/accuracies": 0.6875, + "rewards/chosen": -18.666414260864258, + "rewards/margins": 5.431545257568359, + "rewards/rejected": -24.097959518432617, + "step": 377 + }, + { + "epoch": 0.8549618320610687, + "grad_norm": 127.65744686159971, + "learning_rate": 5.02118119651016e-08, + "logits/chosen": -15.968871116638184, + "logits/rejected": -16.066356658935547, + "logps/chosen": -1.7414379119873047, + "logps/rejected": -2.270836353302002, + "loss": 2.8791, + "rewards/accuracies": 0.6875, + "rewards/chosen": -17.41438102722168, + "rewards/margins": 5.293981075286865, + "rewards/rejected": -22.708358764648438, + "step": 378 + }, + { + "epoch": 0.85722363584959, + "grad_norm": 128.0416423580519, + "learning_rate": 4.868734432902526e-08, + "logits/chosen": -15.94872760772705, + "logits/rejected": -15.913581848144531, + "logps/chosen": -1.6559503078460693, + "logps/rejected": -2.2543857097625732, + "loss": 3.2796, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.55950355529785, + "rewards/margins": 5.984354496002197, + "rewards/rejected": -22.543859481811523, + "step": 379 + }, + { + "epoch": 0.8594854396381114, + "grad_norm": 137.44029074786243, + "learning_rate": 4.7184876063789134e-08, + "logits/chosen": -16.0161075592041, + "logits/rejected": -16.125635147094727, + "logps/chosen": -1.7773343324661255, + "logps/rejected": -2.296335458755493, + "loss": 3.0373, + "rewards/accuracies": 0.71875, + "rewards/chosen": -17.773344039916992, + "rewards/margins": 5.190011024475098, + "rewards/rejected": -22.963354110717773, + "step": 380 + }, + { + "epoch": 0.8617472434266328, + "grad_norm": 97.95472528249057, + "learning_rate": 4.570450125469314e-08, + "logits/chosen": -18.46808624267578, + "logits/rejected": -18.239410400390625, + "logps/chosen": -1.8006447553634644, + "logps/rejected": -2.444537878036499, + "loss": 2.4475, + "rewards/accuracies": 0.75, + "rewards/chosen": 
-18.006446838378906, + "rewards/margins": 6.4389328956604, + "rewards/rejected": -24.44538116455078, + "step": 381 + }, + { + "epoch": 0.864009047215154, + "grad_norm": 122.46444272567702, + "learning_rate": 4.424631260353378e-08, + "logits/chosen": -16.532258987426758, + "logits/rejected": -16.949682235717773, + "logps/chosen": -1.4457993507385254, + "logps/rejected": -1.9387412071228027, + "loss": 3.2723, + "rewards/accuracies": 0.8125, + "rewards/chosen": -14.457992553710938, + "rewards/margins": 4.929420471191406, + "rewards/rejected": -19.387413024902344, + "step": 382 + }, + { + "epoch": 0.8662708510036754, + "grad_norm": 134.6321015186439, + "learning_rate": 4.281040142280008e-08, + "logits/chosen": -17.987564086914062, + "logits/rejected": -17.679292678833008, + "logps/chosen": -1.4789788722991943, + "logps/rejected": -1.9366073608398438, + "loss": 2.4358, + "rewards/accuracies": 0.8125, + "rewards/chosen": -14.789788246154785, + "rewards/margins": 4.5762858390808105, + "rewards/rejected": -19.366071701049805, + "step": 383 + }, + { + "epoch": 0.8685326547921968, + "grad_norm": 145.23607203406814, + "learning_rate": 4.1396857629954286e-08, + "logits/chosen": -19.37828254699707, + "logits/rejected": -19.550512313842773, + "logps/chosen": -2.0876235961914062, + "logps/rejected": -2.755703926086426, + "loss": 3.0501, + "rewards/accuracies": 0.6875, + "rewards/chosen": -20.876237869262695, + "rewards/margins": 6.6808037757873535, + "rewards/rejected": -27.55704116821289, + "step": 384 + }, + { + "epoch": 0.8707944585807181, + "grad_norm": 95.46804083715921, + "learning_rate": 4.000576974180232e-08, + "logits/chosen": -17.347396850585938, + "logits/rejected": -17.552875518798828, + "logps/chosen": -1.7520135641098022, + "logps/rejected": -2.1543803215026855, + "loss": 2.6766, + "rewards/accuracies": 0.6875, + "rewards/chosen": -17.5201358795166, + "rewards/margins": 4.023664474487305, + "rewards/rejected": -21.543800354003906, + "step": 385 + }, + { + "epoch": 0.8730562623692395, + "grad_norm": 104.95835540317567, + "learning_rate": 3.8637224868950066e-08, + "logits/chosen": -18.283233642578125, + "logits/rejected": -18.088119506835938, + "logps/chosen": -1.747474193572998, + "logps/rejected": -2.1311683654785156, + "loss": 2.9522, + "rewards/accuracies": 0.8125, + "rewards/chosen": -17.474742889404297, + "rewards/margins": 3.8369407653808594, + "rewards/rejected": -21.311681747436523, + "step": 386 + }, + { + "epoch": 0.8753180661577609, + "grad_norm": 109.46707890564362, + "learning_rate": 3.729130871034885e-08, + "logits/chosen": -17.69164276123047, + "logits/rejected": -17.34153938293457, + "logps/chosen": -1.60804283618927, + "logps/rejected": -2.0555553436279297, + "loss": 2.7504, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.080427169799805, + "rewards/margins": 4.475124835968018, + "rewards/rejected": -20.555551528930664, + "step": 387 + }, + { + "epoch": 0.8775798699462821, + "grad_norm": 124.79737385782774, + "learning_rate": 3.596810554792888e-08, + "logits/chosen": -19.268070220947266, + "logits/rejected": -19.46062469482422, + "logps/chosen": -2.0554943084716797, + "logps/rejected": -2.616269826889038, + "loss": 3.2245, + "rewards/accuracies": 0.8125, + "rewards/chosen": -20.554943084716797, + "rewards/margins": 5.607754230499268, + "rewards/rejected": -26.162696838378906, + "step": 388 + }, + { + "epoch": 0.8798416737348035, + "grad_norm": 122.92689968096272, + "learning_rate": 3.466769824132116e-08, + "logits/chosen": -19.075674057006836, + "logits/rejected": 
-19.091054916381836, + "logps/chosen": -1.9592108726501465, + "logps/rejected": -2.5127036571502686, + "loss": 2.9941, + "rewards/accuracies": 0.8125, + "rewards/chosen": -19.59210968017578, + "rewards/margins": 5.534926414489746, + "rewards/rejected": -25.12703514099121, + "step": 389 + }, + { + "epoch": 0.8821034775233249, + "grad_norm": 129.9790697640605, + "learning_rate": 3.339016822266925e-08, + "logits/chosen": -17.7946834564209, + "logits/rejected": -17.704071044921875, + "logps/chosen": -1.8595997095108032, + "logps/rejected": -2.2788615226745605, + "loss": 2.2966, + "rewards/accuracies": 0.84375, + "rewards/chosen": -18.595996856689453, + "rewards/margins": 4.192615985870361, + "rewards/rejected": -22.788612365722656, + "step": 390 + }, + { + "epoch": 0.8843652813118462, + "grad_norm": 165.30674333822296, + "learning_rate": 3.213559549152958e-08, + "logits/chosen": -17.560834884643555, + "logits/rejected": -17.52518081665039, + "logps/chosen": -1.4146690368652344, + "logps/rejected": -2.0134286880493164, + "loss": 3.5443, + "rewards/accuracies": 0.875, + "rewards/chosen": -14.146690368652344, + "rewards/margins": 5.987596035003662, + "rewards/rejected": -20.134286880493164, + "step": 391 + }, + { + "epoch": 0.8866270851003676, + "grad_norm": 125.85896431798938, + "learning_rate": 3.090405860986203e-08, + "logits/chosen": -19.027587890625, + "logits/rejected": -19.275127410888672, + "logps/chosen": -2.2725133895874023, + "logps/rejected": -2.906642436981201, + "loss": 2.978, + "rewards/accuracies": 0.8125, + "rewards/chosen": -22.72513198852539, + "rewards/margins": 6.3412885665893555, + "rewards/rejected": -29.066425323486328, + "step": 392 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 128.26387031170609, + "learning_rate": 2.9695634697110315e-08, + "logits/chosen": -17.81490135192871, + "logits/rejected": -17.856733322143555, + "logps/chosen": -1.7137458324432373, + "logps/rejected": -2.1935040950775146, + "loss": 3.3315, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.13745880126953, + "rewards/margins": 4.797582149505615, + "rewards/rejected": -21.935041427612305, + "step": 393 + }, + { + "epoch": 0.8911506926774102, + "grad_norm": 133.54866529087667, + "learning_rate": 2.8510399425372766e-08, + "logits/chosen": -17.018762588500977, + "logits/rejected": -17.156879425048828, + "logps/chosen": -1.6060205698013306, + "logps/rejected": -2.3053719997406006, + "loss": 2.6726, + "rewards/accuracies": 0.90625, + "rewards/chosen": -16.060205459594727, + "rewards/margins": 6.993513584136963, + "rewards/rejected": -23.05371856689453, + "step": 394 + }, + { + "epoch": 0.8934124964659316, + "grad_norm": 133.68043982311156, + "learning_rate": 2.734842701466329e-08, + "logits/chosen": -19.136150360107422, + "logits/rejected": -19.161407470703125, + "logps/chosen": -1.6094989776611328, + "logps/rejected": -1.981292486190796, + "loss": 3.0683, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.094987869262695, + "rewards/margins": 3.7179362773895264, + "rewards/rejected": -19.812923431396484, + "step": 395 + }, + { + "epoch": 0.8956743002544529, + "grad_norm": 135.95939370288835, + "learning_rate": 2.6209790228264438e-08, + "logits/chosen": -17.601152420043945, + "logits/rejected": -17.81585693359375, + "logps/chosen": -2.0195772647857666, + "logps/rejected": -2.464128017425537, + "loss": 3.0865, + "rewards/accuracies": 0.71875, + "rewards/chosen": -20.19577407836914, + "rewards/margins": 4.445508003234863, + "rewards/rejected": -24.64128303527832, + "step": 396 + }, + { + 
"epoch": 0.8979361040429743, + "grad_norm": 113.2954374562523, + "learning_rate": 2.5094560368170305e-08, + "logits/chosen": -17.47164535522461, + "logits/rejected": -17.385438919067383, + "logps/chosen": -1.6676338911056519, + "logps/rejected": -2.1376919746398926, + "loss": 2.9399, + "rewards/accuracies": 0.6875, + "rewards/chosen": -16.676340103149414, + "rewards/margins": 4.7005791664123535, + "rewards/rejected": -21.376916885375977, + "step": 397 + }, + { + "epoch": 0.9001979078314957, + "grad_norm": 109.04886917574666, + "learning_rate": 2.4002807270621893e-08, + "logits/chosen": -19.112560272216797, + "logits/rejected": -19.00596046447754, + "logps/chosen": -1.6903434991836548, + "logps/rejected": -2.2326083183288574, + "loss": 2.8406, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.90343475341797, + "rewards/margins": 5.422649383544922, + "rewards/rejected": -22.32608413696289, + "step": 398 + }, + { + "epoch": 0.9024597116200169, + "grad_norm": 118.17958359886174, + "learning_rate": 2.293459930173354e-08, + "logits/chosen": -19.049705505371094, + "logits/rejected": -19.230878829956055, + "logps/chosen": -1.889772891998291, + "logps/rejected": -2.3055403232574463, + "loss": 3.2104, + "rewards/accuracies": 0.71875, + "rewards/chosen": -18.897727966308594, + "rewards/margins": 4.157675743103027, + "rewards/rejected": -23.055404663085938, + "step": 399 + }, + { + "epoch": 0.9047215154085383, + "grad_norm": 118.64459034636026, + "learning_rate": 2.189000335321256e-08, + "logits/chosen": -16.958843231201172, + "logits/rejected": -16.984729766845703, + "logps/chosen": -1.7274019718170166, + "logps/rejected": -2.112072467803955, + "loss": 3.2286, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.274019241333008, + "rewards/margins": 3.8467049598693848, + "rewards/rejected": -21.120725631713867, + "step": 400 + }, + { + "epoch": 0.9069833191970597, + "grad_norm": 136.85158094391608, + "learning_rate": 2.086908483816954e-08, + "logits/chosen": -18.37076187133789, + "logits/rejected": -18.361103057861328, + "logps/chosen": -2.0497868061065674, + "logps/rejected": -2.2769107818603516, + "loss": 3.0689, + "rewards/accuracies": 0.75, + "rewards/chosen": -20.497867584228516, + "rewards/margins": 2.2712390422821045, + "rewards/rejected": -22.769105911254883, + "step": 401 + }, + { + "epoch": 0.909245122985581, + "grad_norm": 123.9239261032842, + "learning_rate": 1.9871907687022717e-08, + "logits/chosen": -16.64785385131836, + "logits/rejected": -16.638736724853516, + "logps/chosen": -1.5310957431793213, + "logps/rejected": -2.1431174278259277, + "loss": 3.429, + "rewards/accuracies": 0.8125, + "rewards/chosen": -15.310956954956055, + "rewards/margins": 6.120217323303223, + "rewards/rejected": -21.43117332458496, + "step": 402 + }, + { + "epoch": 0.9115069267741024, + "grad_norm": 112.34416737954284, + "learning_rate": 1.889853434349451e-08, + "logits/chosen": -18.59053611755371, + "logits/rejected": -18.618457794189453, + "logps/chosen": -1.6284946203231812, + "logps/rejected": -2.1159729957580566, + "loss": 3.0198, + "rewards/accuracies": 0.90625, + "rewards/chosen": -16.28494644165039, + "rewards/margins": 4.874783992767334, + "rewards/rejected": -21.159730911254883, + "step": 403 + }, + { + "epoch": 0.9137687305626236, + "grad_norm": 124.44884198830263, + "learning_rate": 1.7949025760701164e-08, + "logits/chosen": -18.346927642822266, + "logits/rejected": -17.982345581054688, + "logps/chosen": -1.8247474431991577, + "logps/rejected": -2.060636043548584, + "loss": 3.2447, + 
"rewards/accuracies": 0.65625, + "rewards/chosen": -18.247474670410156, + "rewards/margins": 2.358887195587158, + "rewards/rejected": -20.606359481811523, + "step": 404 + }, + { + "epoch": 0.916030534351145, + "grad_norm": 98.85780040911116, + "learning_rate": 1.7023441397336023e-08, + "logits/chosen": -16.48066520690918, + "logits/rejected": -16.472976684570312, + "logps/chosen": -1.3015496730804443, + "logps/rejected": -2.006908416748047, + "loss": 3.0345, + "rewards/accuracies": 0.78125, + "rewards/chosen": -13.015497207641602, + "rewards/margins": 7.053586006164551, + "rewards/rejected": -20.069082260131836, + "step": 405 + }, + { + "epoch": 0.9182923381396664, + "grad_norm": 155.0460005854357, + "learning_rate": 1.6121839213945854e-08, + "logits/chosen": -19.387483596801758, + "logits/rejected": -19.041393280029297, + "logps/chosen": -1.9884074926376343, + "logps/rejected": -2.7867484092712402, + "loss": 2.8585, + "rewards/accuracies": 0.8125, + "rewards/chosen": -19.884077072143555, + "rewards/margins": 7.983407974243164, + "rewards/rejected": -27.867483139038086, + "step": 406 + }, + { + "epoch": 0.9205541419281877, + "grad_norm": 115.17173619747005, + "learning_rate": 1.5244275669301777e-08, + "logits/chosen": -18.749929428100586, + "logits/rejected": -18.742963790893555, + "logps/chosen": -1.801578164100647, + "logps/rejected": -2.3081681728363037, + "loss": 2.7902, + "rewards/accuracies": 0.71875, + "rewards/chosen": -18.015783309936523, + "rewards/margins": 5.06589937210083, + "rewards/rejected": -23.081682205200195, + "step": 407 + }, + { + "epoch": 0.9228159457167091, + "grad_norm": 128.6499296049485, + "learning_rate": 1.4390805716863398e-08, + "logits/chosen": -15.303824424743652, + "logits/rejected": -15.512129783630371, + "logps/chosen": -1.629596471786499, + "logps/rejected": -2.028701066970825, + "loss": 3.3744, + "rewards/accuracies": 0.625, + "rewards/chosen": -16.29596519470215, + "rewards/margins": 3.991044521331787, + "rewards/rejected": -20.287010192871094, + "step": 408 + }, + { + "epoch": 0.9250777495052305, + "grad_norm": 119.27282282658015, + "learning_rate": 1.3561482801337908e-08, + "logits/chosen": -20.444072723388672, + "logits/rejected": -20.429006576538086, + "logps/chosen": -1.7733900547027588, + "logps/rejected": -2.1421661376953125, + "loss": 2.9589, + "rewards/accuracies": 0.6875, + "rewards/chosen": -17.73390007019043, + "rewards/margins": 3.687760353088379, + "rewards/rejected": -21.421661376953125, + "step": 409 + }, + { + "epoch": 0.9273395532937517, + "grad_norm": 128.38615627616883, + "learning_rate": 1.2756358855332904e-08, + "logits/chosen": -19.65103530883789, + "logits/rejected": -19.755090713500977, + "logps/chosen": -2.041804552078247, + "logps/rejected": -2.587003231048584, + "loss": 3.5126, + "rewards/accuracies": 0.78125, + "rewards/chosen": -20.418046951293945, + "rewards/margins": 5.45198917388916, + "rewards/rejected": -25.87003517150879, + "step": 410 + }, + { + "epoch": 0.9296013570822731, + "grad_norm": 124.92752994294887, + "learning_rate": 1.1975484296105154e-08, + "logits/chosen": -18.58397102355957, + "logits/rejected": -18.57633399963379, + "logps/chosen": -1.8586208820343018, + "logps/rejected": -2.2781119346618652, + "loss": 2.6842, + "rewards/accuracies": 0.71875, + "rewards/chosen": -18.58620834350586, + "rewards/margins": 4.194911956787109, + "rewards/rejected": -22.78112030029297, + "step": 411 + }, + { + "epoch": 0.9318631608707945, + "grad_norm": 125.7273927211634, + "learning_rate": 1.1218908022402374e-08, + 
"logits/chosen": -17.77056121826172, + "logits/rejected": -17.610393524169922, + "logps/chosen": -1.4044468402862549, + "logps/rejected": -1.957669973373413, + "loss": 2.8821, + "rewards/accuracies": 0.8125, + "rewards/chosen": -14.04446792602539, + "rewards/margins": 5.532229900360107, + "rewards/rejected": -19.576698303222656, + "step": 412 + }, + { + "epoch": 0.9341249646593158, + "grad_norm": 126.86404645931887, + "learning_rate": 1.0486677411402079e-08, + "logits/chosen": -18.397602081298828, + "logits/rejected": -18.570650100708008, + "logps/chosen": -1.7708725929260254, + "logps/rejected": -2.3376171588897705, + "loss": 3.1541, + "rewards/accuracies": 0.65625, + "rewards/chosen": -17.708724975585938, + "rewards/margins": 5.66744327545166, + "rewards/rejected": -23.376169204711914, + "step": 413 + }, + { + "epoch": 0.9363867684478372, + "grad_norm": 124.4303497435617, + "learning_rate": 9.778838315744353e-09, + "logits/chosen": -18.644079208374023, + "logits/rejected": -18.6198673248291, + "logps/chosen": -1.7886202335357666, + "logps/rejected": -2.106006383895874, + "loss": 3.3825, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.88620376586914, + "rewards/margins": 3.1738624572753906, + "rewards/rejected": -21.06006622314453, + "step": 414 + }, + { + "epoch": 0.9386485722363584, + "grad_norm": 118.54959081850222, + "learning_rate": 9.095435060660595e-09, + "logits/chosen": -18.39704132080078, + "logits/rejected": -18.45808982849121, + "logps/chosen": -1.6519393920898438, + "logps/rejected": -1.9818757772445679, + "loss": 3.0169, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.519393920898438, + "rewards/margins": 3.299362897872925, + "rewards/rejected": -19.818758010864258, + "step": 415 + }, + { + "epoch": 0.9409103760248798, + "grad_norm": 115.40228572150829, + "learning_rate": 8.436510441197864e-09, + "logits/chosen": -20.38088607788086, + "logits/rejected": -20.029354095458984, + "logps/chosen": -1.6879628896713257, + "logps/rejected": -1.8838168382644653, + "loss": 3.2592, + "rewards/accuracies": 0.6875, + "rewards/chosen": -16.879629135131836, + "rewards/margins": 1.9585394859313965, + "rewards/rejected": -18.83816909790039, + "step": 416 + }, + { + "epoch": 0.9431721798134012, + "grad_norm": 167.10931134069494, + "learning_rate": 7.802105719539076e-09, + "logits/chosen": -18.494495391845703, + "logits/rejected": -18.538818359375, + "logps/chosen": -1.9037325382232666, + "logps/rejected": -2.5261826515197754, + "loss": 3.6266, + "rewards/accuracies": 0.75, + "rewards/chosen": -19.037324905395508, + "rewards/margins": 6.224499225616455, + "rewards/rejected": -25.261825561523438, + "step": 417 + }, + { + "epoch": 0.9454339836019225, + "grad_norm": 119.62949752586249, + "learning_rate": 7.1922606224192e-09, + "logits/chosen": -18.683055877685547, + "logits/rejected": -18.973587036132812, + "logps/chosen": -1.7631547451019287, + "logps/rejected": -2.3043551445007324, + "loss": 2.9354, + "rewards/accuracies": 0.78125, + "rewards/chosen": -17.631547927856445, + "rewards/margins": 5.4120049476623535, + "rewards/rejected": -23.043554306030273, + "step": 418 + }, + { + "epoch": 0.9476957873904439, + "grad_norm": 110.69406219801077, + "learning_rate": 6.6070133386372906e-09, + "logits/chosen": -16.96223258972168, + "logits/rejected": -17.232624053955078, + "logps/chosen": -1.7214587926864624, + "logps/rejected": -2.0780019760131836, + "loss": 3.1999, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.214588165283203, + "rewards/margins": 3.5654308795928955, + 
"rewards/rejected": -20.780017852783203, + "step": 419 + }, + { + "epoch": 0.9499575911789653, + "grad_norm": 133.93900888371826, + "learning_rate": 6.046400516665384e-09, + "logits/chosen": -18.921737670898438, + "logits/rejected": -19.057086944580078, + "logps/chosen": -1.8862426280975342, + "logps/rejected": -2.456373453140259, + "loss": 3.1087, + "rewards/accuracies": 0.78125, + "rewards/chosen": -18.8624267578125, + "rewards/margins": 5.70130729675293, + "rewards/rejected": -24.56373405456543, + "step": 420 + }, + { + "epoch": 0.9522193949674865, + "grad_norm": 116.67059208076354, + "learning_rate": 5.510457262353396e-09, + "logits/chosen": -18.74356460571289, + "logits/rejected": -18.647714614868164, + "logps/chosen": -1.5839942693710327, + "logps/rejected": -2.068876028060913, + "loss": 3.0213, + "rewards/accuracies": 0.78125, + "rewards/chosen": -15.839942932128906, + "rewards/margins": 4.848816871643066, + "rewards/rejected": -20.68876075744629, + "step": 421 + }, + { + "epoch": 0.9544811987560079, + "grad_norm": 137.6503435987508, + "learning_rate": 4.9992171367309265e-09, + "logits/chosen": -17.830699920654297, + "logits/rejected": -17.30995750427246, + "logps/chosen": -1.6017370223999023, + "logps/rejected": -2.2726082801818848, + "loss": 2.7492, + "rewards/accuracies": 0.84375, + "rewards/chosen": -16.017372131347656, + "rewards/margins": 6.708712577819824, + "rewards/rejected": -22.726083755493164, + "step": 422 + }, + { + "epoch": 0.9567430025445293, + "grad_norm": 122.50350701504888, + "learning_rate": 4.5127121539052955e-09, + "logits/chosen": -18.987272262573242, + "logits/rejected": -18.7191162109375, + "logps/chosen": -1.7801018953323364, + "logps/rejected": -2.5131754875183105, + "loss": 2.6308, + "rewards/accuracies": 0.71875, + "rewards/chosen": -17.80101776123047, + "rewards/margins": 7.330737590789795, + "rewards/rejected": -25.131757736206055, + "step": 423 + }, + { + "epoch": 0.9590048063330506, + "grad_norm": 105.12784722468204, + "learning_rate": 4.050972779057327e-09, + "logits/chosen": -17.278427124023438, + "logits/rejected": -17.121200561523438, + "logps/chosen": -1.702017903327942, + "logps/rejected": -2.172736883163452, + "loss": 2.6915, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.020179748535156, + "rewards/margins": 4.707189559936523, + "rewards/rejected": -21.727367401123047, + "step": 424 + }, + { + "epoch": 0.961266610121572, + "grad_norm": 122.22803042526128, + "learning_rate": 3.6140279265330477e-09, + "logits/chosen": -18.193286895751953, + "logits/rejected": -17.90346908569336, + "logps/chosen": -1.8119601011276245, + "logps/rejected": -2.272505283355713, + "loss": 2.935, + "rewards/accuracies": 0.75, + "rewards/chosen": -18.119600296020508, + "rewards/margins": 4.605450630187988, + "rewards/rejected": -22.725051879882812, + "step": 425 + }, + { + "epoch": 0.9635284139100933, + "grad_norm": 140.15479399613614, + "learning_rate": 3.2019049580335853e-09, + "logits/chosen": -17.40700340270996, + "logits/rejected": -17.39166259765625, + "logps/chosen": -1.8650894165039062, + "logps/rejected": -2.274355888366699, + "loss": 3.5647, + "rewards/accuracies": 0.78125, + "rewards/chosen": -18.65089225769043, + "rewards/margins": 4.092666149139404, + "rewards/rejected": -22.743558883666992, + "step": 426 + }, + { + "epoch": 0.9657902176986146, + "grad_norm": 102.93156958129578, + "learning_rate": 2.814629680901337e-09, + "logits/chosen": -19.251096725463867, + "logits/rejected": -19.292316436767578, + "logps/chosen": -1.6867254972457886, + 
"logps/rejected": -2.0900285243988037, + "loss": 2.4974, + "rewards/accuracies": 0.84375, + "rewards/chosen": -16.86725425720215, + "rewards/margins": 4.0330305099487305, + "rewards/rejected": -20.900283813476562, + "step": 427 + }, + { + "epoch": 0.968052021487136, + "grad_norm": 111.34250544518655, + "learning_rate": 2.4522263465041937e-09, + "logits/chosen": -19.024517059326172, + "logits/rejected": -18.74802017211914, + "logps/chosen": -2.0575406551361084, + "logps/rejected": -2.7982211112976074, + "loss": 2.5955, + "rewards/accuracies": 0.875, + "rewards/chosen": -20.575408935546875, + "rewards/margins": 7.406803131103516, + "rewards/rejected": -27.982210159301758, + "step": 428 + }, + { + "epoch": 0.9703138252756573, + "grad_norm": 98.51128989688017, + "learning_rate": 2.114717648716713e-09, + "logits/chosen": -16.984386444091797, + "logits/rejected": -16.8139591217041, + "logps/chosen": -1.8296539783477783, + "logps/rejected": -2.573110818862915, + "loss": 3.0284, + "rewards/accuracies": 0.84375, + "rewards/chosen": -18.296539306640625, + "rewards/margins": 7.434567451477051, + "rewards/rejected": -25.731107711791992, + "step": 429 + }, + { + "epoch": 0.9725756290641787, + "grad_norm": 124.4940263604112, + "learning_rate": 1.802124722499121e-09, + "logits/chosen": -18.865802764892578, + "logits/rejected": -18.73249626159668, + "logps/chosen": -1.7756928205490112, + "logps/rejected": -2.6194663047790527, + "loss": 2.5879, + "rewards/accuracies": 0.875, + "rewards/chosen": -17.75693130493164, + "rewards/margins": 8.437736511230469, + "rewards/rejected": -26.194665908813477, + "step": 430 + }, + { + "epoch": 0.9748374328527001, + "grad_norm": 119.13158902037044, + "learning_rate": 1.5144671425737499e-09, + "logits/chosen": -17.51629638671875, + "logits/rejected": -17.642141342163086, + "logps/chosen": -1.799952745437622, + "logps/rejected": -2.451775550842285, + "loss": 3.0634, + "rewards/accuracies": 0.75, + "rewards/chosen": -17.999526977539062, + "rewards/margins": 6.518229007720947, + "rewards/rejected": -24.517757415771484, + "step": 431 + }, + { + "epoch": 0.9770992366412213, + "grad_norm": 100.6745496186136, + "learning_rate": 1.251762922199484e-09, + "logits/chosen": -18.572729110717773, + "logits/rejected": -19.301191329956055, + "logps/chosen": -1.8852096796035767, + "logps/rejected": -2.454303503036499, + "loss": 2.4105, + "rewards/accuracies": 0.8125, + "rewards/chosen": -18.852096557617188, + "rewards/margins": 5.690939903259277, + "rewards/rejected": -24.54303550720215, + "step": 432 + }, + { + "epoch": 0.9793610404297427, + "grad_norm": 119.30288980428828, + "learning_rate": 1.0140285120433744e-09, + "logits/chosen": -18.9143009185791, + "logits/rejected": -18.95807456970215, + "logps/chosen": -1.8828755617141724, + "logps/rejected": -2.437493085861206, + "loss": 3.3873, + "rewards/accuracies": 0.75, + "rewards/chosen": -18.828754425048828, + "rewards/margins": 5.546175003051758, + "rewards/rejected": -24.374929428100586, + "step": 433 + }, + { + "epoch": 0.9816228442182641, + "grad_norm": 119.93045050852022, + "learning_rate": 8.012787991508396e-10, + "logits/chosen": -18.035734176635742, + "logits/rejected": -17.416671752929688, + "logps/chosen": -1.7183349132537842, + "logps/rejected": -2.451599359512329, + "loss": 2.4103, + "rewards/accuracies": 0.78125, + "rewards/chosen": -17.183349609375, + "rewards/margins": 7.332643508911133, + "rewards/rejected": -24.5159912109375, + "step": 434 + }, + { + "epoch": 0.9838846480067854, + "grad_norm": 127.1741271306872, + 
"learning_rate": 6.135271060133007e-10, + "logits/chosen": -17.5001277923584, + "logits/rejected": -17.65492057800293, + "logps/chosen": -1.74495530128479, + "logps/rejected": -2.323106527328491, + "loss": 3.0668, + "rewards/accuracies": 0.8125, + "rewards/chosen": -17.449552536010742, + "rewards/margins": 5.781513214111328, + "rewards/rejected": -23.23106575012207, + "step": 435 + }, + { + "epoch": 0.9861464517953068, + "grad_norm": 115.3718910974279, + "learning_rate": 4.50785189733871e-10, + "logits/chosen": -17.362075805664062, + "logits/rejected": -17.160686492919922, + "logps/chosen": -1.3833808898925781, + "logps/rejected": -1.7379635572433472, + "loss": 2.7748, + "rewards/accuracies": 0.78125, + "rewards/chosen": -13.833809852600098, + "rewards/margins": 3.5458261966705322, + "rewards/rejected": -17.379636764526367, + "step": 436 + }, + { + "epoch": 0.988408255583828, + "grad_norm": 110.49410393455729, + "learning_rate": 3.1306324129118935e-10, + "logits/chosen": -17.78763198852539, + "logits/rejected": -17.5814151763916, + "logps/chosen": -1.6376947164535522, + "logps/rejected": -2.1998562812805176, + "loss": 3.0113, + "rewards/accuracies": 0.8125, + "rewards/chosen": -16.3769474029541, + "rewards/margins": 5.621615886688232, + "rewards/rejected": -21.99856185913086, + "step": 437 + }, + { + "epoch": 0.9906700593723494, + "grad_norm": 142.4763338483451, + "learning_rate": 2.003698849011748e-10, + "logits/chosen": -19.646331787109375, + "logits/rejected": -19.66240119934082, + "logps/chosen": -2.0467026233673096, + "logps/rejected": -2.477294921875, + "loss": 3.3739, + "rewards/accuracies": 0.6875, + "rewards/chosen": -20.467025756835938, + "rewards/margins": 4.305922985076904, + "rewards/rejected": -24.772947311401367, + "step": 438 + }, + { + "epoch": 0.9929318631608708, + "grad_norm": 130.7419382757851, + "learning_rate": 1.1271217747714779e-10, + "logits/chosen": -17.93435287475586, + "logits/rejected": -17.90981674194336, + "logps/chosen": -1.883331298828125, + "logps/rejected": -2.1619515419006348, + "loss": 3.3682, + "rewards/accuracies": 0.65625, + "rewards/chosen": -18.83331298828125, + "rewards/margins": 2.786202907562256, + "rewards/rejected": -21.619516372680664, + "step": 439 + }, + { + "epoch": 0.9951936669493922, + "grad_norm": 125.07489041195862, + "learning_rate": 5.0095608187739055e-11, + "logits/chosen": -19.032190322875977, + "logits/rejected": -19.182344436645508, + "logps/chosen": -1.578109622001648, + "logps/rejected": -1.948418378829956, + "loss": 2.7569, + "rewards/accuracies": 0.8125, + "rewards/chosen": -15.781095504760742, + "rewards/margins": 3.703087329864502, + "rewards/rejected": -19.48418426513672, + "step": 440 + }, + { + "epoch": 0.9974554707379135, + "grad_norm": 119.11218694159568, + "learning_rate": 1.2524098113209092e-11, + "logits/chosen": -16.846660614013672, + "logits/rejected": -17.356082916259766, + "logps/chosen": -1.736297845840454, + "logps/rejected": -2.1138105392456055, + "loss": 3.4049, + "rewards/accuracies": 0.59375, + "rewards/chosen": -17.36298179626465, + "rewards/margins": 3.7751266956329346, + "rewards/rejected": -21.138107299804688, + "step": 441 + }, + { + "epoch": 0.9997172745264349, + "grad_norm": 120.08290315715726, + "learning_rate": 0.0, + "logits/chosen": -18.770984649658203, + "logits/rejected": -18.760494232177734, + "logps/chosen": -1.659979224205017, + "logps/rejected": -2.181823492050171, + "loss": 2.8512, + "rewards/accuracies": 0.71875, + "rewards/chosen": -16.59979248046875, + "rewards/margins": 
5.218443393707275, + "rewards/rejected": -21.818235397338867, + "step": 442 + }, + { + "epoch": 0.9997172745264349, + "step": 442, + "total_flos": 227674672136192.0, + "train_loss": 0.0, + "train_runtime": 1.6273, + "train_samples_per_second": 34774.982, + "train_steps_per_second": 271.612 + } + ], + "logging_steps": 1, + "max_steps": 442, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 227674672136192.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}
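
Aside (not part of the original trainer_state.json): the file above is the complete Trainer state, and its `log_history` array holds one record per optimizer step plus a final summary record. A minimal Python sketch like the following could load it and report a few aggregate numbers, assuming the diff has been applied and the file is saved locally as `trainer_state.json`; the field names used here are exactly those appearing in the log above.

```python
# Illustrative sketch: summarize the per-step records in trainer_state.json.
# Assumes the file from the diff above exists in the current directory.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step records carry a "loss" key; the trailing summary record does not.
steps = [rec for rec in state["log_history"] if "loss" in rec]

last = steps[-1]
print(f'steps logged: {len(steps)} (global_step={state["global_step"]})')
print(f'final loss: {last["loss"]} at step {last["step"]}')
print(f'mean rewards/accuracies: '
      f'{sum(r["rewards/accuracies"] for r in steps) / len(steps):.3f}')
```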