diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4622 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 2942, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.694915254237288e-09, + "logits/chosen": -1.5211243629455566, + "logits/rejected": -0.9348576664924622, + "logps/chosen": -412.05706787109375, + "logps/rejected": -913.2714233398438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.6949152542372882e-08, + "logits/chosen": -1.4827719926834106, + "logits/rejected": -1.226508378982544, + "logps/chosen": -679.3842163085938, + "logps/rejected": -639.005126953125, + "loss": 0.8262, + "rewards/accuracies": 0.4861111044883728, + "rewards/chosen": 0.20207053422927856, + "rewards/margins": 0.28480756282806396, + "rewards/rejected": -0.0827370211482048, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 3.3898305084745764e-08, + "logits/chosen": -1.4881559610366821, + "logits/rejected": -1.2070544958114624, + "logps/chosen": -392.80548095703125, + "logps/rejected": -549.167724609375, + "loss": 0.8207, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.09954075515270233, + "rewards/margins": -0.08116824924945831, + "rewards/rejected": 0.18070900440216064, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 5.0847457627118645e-08, + "logits/chosen": -1.455928921699524, + "logits/rejected": -1.218510389328003, + "logps/chosen": -549.7676391601562, + "logps/rejected": -525.0243530273438, + "loss": 0.8307, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.05704854801297188, + "rewards/margins": 0.16263702511787415, + "rewards/rejected": -0.10558845847845078, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 6.779661016949153e-08, + "logits/chosen": -1.4766838550567627, + "logits/rejected": -1.218590259552002, + "logps/chosen": -411.13653564453125, + "logps/rejected": -574.4963989257812, + "loss": 0.7857, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10409893095493317, + "rewards/margins": -0.21830201148986816, + "rewards/rejected": 0.11420309543609619, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 8.47457627118644e-08, + "logits/chosen": -1.5140564441680908, + "logits/rejected": -1.1615564823150635, + "logps/chosen": -362.17059326171875, + "logps/rejected": -673.89013671875, + "loss": 0.8045, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.031153270974755287, + "rewards/margins": 0.03609558939933777, + "rewards/rejected": -0.00494231004267931, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 1.0169491525423729e-07, + "logits/chosen": -1.498203992843628, + "logits/rejected": -1.232889175415039, + "logps/chosen": -459.11163330078125, + "logps/rejected": -447.8902282714844, + "loss": 0.7617, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3170378804206848, + "rewards/margins": 0.3453710079193115, + "rewards/rejected": -0.028333133086562157, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 1.1864406779661017e-07, + "logits/chosen": -1.495025396347046, + "logits/rejected": -1.215308427810669, + "logps/chosen": -423.4064025878906, + "logps/rejected": -605.0032958984375, + "loss": 0.7105, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3061259090900421, + "rewards/margins": 0.37270691990852356, + "rewards/rejected": -0.06658102571964264, + "step": 70 + }, + { + "epoch": 0.03, + "learning_rate": 1.3559322033898305e-07, + "logits/chosen": -1.475776195526123, + "logits/rejected": -1.1816449165344238, + "logps/chosen": -586.6575927734375, + "logps/rejected": -481.2361755371094, + "loss": 0.706, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3719860911369324, + "rewards/margins": 0.38908010721206665, + "rewards/rejected": -0.017094042152166367, + "step": 80 + }, + { + "epoch": 0.03, + "learning_rate": 1.5254237288135593e-07, + "logits/chosen": -1.5008628368377686, + "logits/rejected": -1.2657488584518433, + "logps/chosen": -372.3196105957031, + "logps/rejected": -367.21673583984375, + "loss": 0.6233, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.44139355421066284, + "rewards/margins": 0.43834584951400757, + "rewards/rejected": 0.0030477314721792936, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 1.694915254237288e-07, + "logits/chosen": -1.5043809413909912, + "logits/rejected": -1.163338303565979, + "logps/chosen": -347.18408203125, + "logps/rejected": -516.4083862304688, + "loss": 0.618, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.5171124935150146, + "rewards/margins": 0.7477121353149414, + "rewards/rejected": -0.23059968650341034, + "step": 100 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -1.4878125190734863, + "eval_logits/rejected": -1.1894134283065796, + "eval_logps/chosen": -412.9344482421875, + "eval_logps/rejected": -560.655029296875, + "eval_loss": 0.5642263293266296, + "eval_rewards/accuracies": 0.7424242496490479, + "eval_rewards/chosen": 0.69883131980896, + "eval_rewards/margins": 0.8126964569091797, + "eval_rewards/rejected": -0.11386506259441376, + "eval_runtime": 556.7476, + "eval_samples_per_second": 17.063, + "eval_steps_per_second": 0.533, + "step": 100 + }, + { + "epoch": 0.04, + "learning_rate": 1.8644067796610168e-07, + "logits/chosen": -1.4966198205947876, + "logits/rejected": -1.1994943618774414, + "logps/chosen": -360.8127746582031, + "logps/rejected": -802.7747802734375, + "loss": 0.5552, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.7751646637916565, + "rewards/margins": 1.0216423273086548, + "rewards/rejected": -0.24647776782512665, + "step": 110 + }, + { + "epoch": 0.04, + "learning_rate": 2.0338983050847458e-07, + "logits/chosen": -1.5263328552246094, + "logits/rejected": -1.2719924449920654, + "logps/chosen": -380.39715576171875, + "logps/rejected": -544.8963012695312, + "loss": 0.4977, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 1.0511916875839233, + "rewards/margins": 0.912127673625946, + "rewards/rejected": 0.13906405866146088, + "step": 120 + }, + { + "epoch": 0.04, + "learning_rate": 2.2033898305084743e-07, + "logits/chosen": -1.4926766157150269, + "logits/rejected": -1.205890417098999, + "logps/chosen": -433.191650390625, + "logps/rejected": -580.9930419921875, + "loss": 0.4889, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 1.003864049911499, + "rewards/margins": 1.033372402191162, + "rewards/rejected": -0.029508382081985474, + "step": 130 + }, + { + "epoch": 0.05, + "learning_rate": 2.3728813559322033e-07, + "logits/chosen": -1.4977641105651855, + "logits/rejected": -1.2685011625289917, + "logps/chosen": -319.09954833984375, + "logps/rejected": -613.55859375, + "loss": 0.4889, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.8334357142448425, + "rewards/margins": 0.8497712016105652, + "rewards/rejected": -0.01633552275598049, + "step": 140 + }, + { + "epoch": 0.05, + "learning_rate": 2.542372881355932e-07, + "logits/chosen": -1.49599289894104, + "logits/rejected": -1.2160688638687134, + "logps/chosen": -361.3035583496094, + "logps/rejected": -552.3671264648438, + "loss": 0.4224, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.0968669652938843, + "rewards/margins": 1.1340056657791138, + "rewards/rejected": -0.037138573825359344, + "step": 150 + }, + { + "epoch": 0.05, + "learning_rate": 2.711864406779661e-07, + "logits/chosen": -1.4980968236923218, + "logits/rejected": -1.2006438970565796, + "logps/chosen": -340.8217468261719, + "logps/rejected": -510.5523376464844, + "loss": 0.466, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.201171875, + "rewards/margins": 1.245226502418518, + "rewards/rejected": -0.04405476525425911, + "step": 160 + }, + { + "epoch": 0.06, + "learning_rate": 2.88135593220339e-07, + "logits/chosen": -1.485167145729065, + "logits/rejected": -1.1941057443618774, + "logps/chosen": -447.4808654785156, + "logps/rejected": -482.01336669921875, + "loss": 0.3674, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.3460757732391357, + "rewards/margins": 1.4081037044525146, + "rewards/rejected": -0.06202799081802368, + "step": 170 + }, + { + "epoch": 0.06, + "learning_rate": 3.0508474576271186e-07, + "logits/chosen": -1.4714066982269287, + "logits/rejected": -1.2212668657302856, + "logps/chosen": -495.3789978027344, + "logps/rejected": -627.9542236328125, + "loss": 0.3855, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 1.6778628826141357, + "rewards/margins": 1.837794303894043, + "rewards/rejected": -0.15993157029151917, + "step": 180 + }, + { + "epoch": 0.06, + "learning_rate": 3.220338983050847e-07, + "logits/chosen": -1.4885241985321045, + "logits/rejected": -1.1983642578125, + "logps/chosen": -357.9476623535156, + "logps/rejected": -569.2054443359375, + "loss": 0.3162, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 1.981848955154419, + "rewards/margins": 2.1885552406311035, + "rewards/rejected": -0.2067060023546219, + "step": 190 + }, + { + "epoch": 0.07, + "learning_rate": 3.389830508474576e-07, + "logits/chosen": -1.4891068935394287, + "logits/rejected": -1.140967607498169, + "logps/chosen": -420.4295349121094, + "logps/rejected": -437.64874267578125, + "loss": 0.3539, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 1.6021934747695923, + "rewards/margins": 1.6449912786483765, + "rewards/rejected": -0.04279797524213791, + "step": 200 + }, + { + "epoch": 0.07, + "eval_logits/chosen": -1.4799621105194092, + "eval_logits/rejected": -1.1625027656555176, + "eval_logps/chosen": -400.7641296386719, + "eval_logps/rejected": -562.246337890625, + "eval_loss": 0.31968235969543457, + "eval_rewards/accuracies": 0.8846801519393921, + "eval_rewards/chosen": 1.915861964225769, + "eval_rewards/margins": 2.1888532638549805, + "eval_rewards/rejected": -0.27299147844314575, + "eval_runtime": 558.7533, + "eval_samples_per_second": 17.002, + "eval_steps_per_second": 0.532, + "step": 200 + }, + { + "epoch": 0.07, + "learning_rate": 3.559322033898305e-07, + "logits/chosen": -1.4898474216461182, + "logits/rejected": -1.2634966373443604, + "logps/chosen": -344.88134765625, + "logps/rejected": -730.7076416015625, + "loss": 0.3019, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 2.2538204193115234, + "rewards/margins": 2.555640697479248, + "rewards/rejected": -0.3018200993537903, + "step": 210 + }, + { + "epoch": 0.07, + "learning_rate": 3.7288135593220336e-07, + "logits/chosen": -1.4739089012145996, + "logits/rejected": -1.2359154224395752, + "logps/chosen": -474.7027282714844, + "logps/rejected": -482.19598388671875, + "loss": 0.2677, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.262795925140381, + "rewards/margins": 2.4807448387145996, + "rewards/rejected": -0.21794895827770233, + "step": 220 + }, + { + "epoch": 0.08, + "learning_rate": 3.898305084745763e-07, + "logits/chosen": -1.4778010845184326, + "logits/rejected": -1.2181063890457153, + "logps/chosen": -419.628662109375, + "logps/rejected": -598.6785278320312, + "loss": 0.267, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.3457484245300293, + "rewards/margins": 2.492673397064209, + "rewards/rejected": -0.1469249576330185, + "step": 230 + }, + { + "epoch": 0.08, + "learning_rate": 4.0677966101694916e-07, + "logits/chosen": -1.4769701957702637, + "logits/rejected": -1.1472581624984741, + "logps/chosen": -401.890625, + "logps/rejected": -701.8416748046875, + "loss": 0.3046, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 2.2063190937042236, + "rewards/margins": 2.501216173171997, + "rewards/rejected": -0.2948969304561615, + "step": 240 + }, + { + "epoch": 0.08, + "learning_rate": 4.23728813559322e-07, + "logits/chosen": -1.4752933979034424, + "logits/rejected": -1.1277306079864502, + "logps/chosen": -344.5944519042969, + "logps/rejected": -534.5394287109375, + "loss": 0.2745, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.5138680934906006, + "rewards/margins": 3.0067882537841797, + "rewards/rejected": -0.4929198622703552, + "step": 250 + }, + { + "epoch": 0.09, + "learning_rate": 4.4067796610169486e-07, + "logits/chosen": -1.4643208980560303, + "logits/rejected": -1.2705574035644531, + "logps/chosen": -415.1036071777344, + "logps/rejected": -591.1699829101562, + "loss": 0.2602, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 2.2184274196624756, + "rewards/margins": 2.6140334606170654, + "rewards/rejected": -0.395606130361557, + "step": 260 + }, + { + "epoch": 0.09, + "learning_rate": 4.576271186440678e-07, + "logits/chosen": -1.4880424737930298, + "logits/rejected": -1.1456931829452515, + "logps/chosen": -367.011962890625, + "logps/rejected": -408.5341491699219, + "loss": 0.2313, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.4607489109039307, + "rewards/margins": 2.9449193477630615, + "rewards/rejected": -0.48417049646377563, + "step": 270 + }, + { + "epoch": 0.1, + "learning_rate": 4.7457627118644066e-07, + "logits/chosen": -1.4503757953643799, + "logits/rejected": -1.0820204019546509, + "logps/chosen": -341.2666320800781, + "logps/rejected": -390.6230163574219, + "loss": 0.2483, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.5250704288482666, + "rewards/margins": 3.119588851928711, + "rewards/rejected": -0.59451824426651, + "step": 280 + }, + { + "epoch": 0.1, + "learning_rate": 4.915254237288136e-07, + "logits/chosen": -1.4802117347717285, + "logits/rejected": -1.1157623529434204, + "logps/chosen": -311.9570007324219, + "logps/rejected": -566.5151977539062, + "loss": 0.2405, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 2.3611626625061035, + "rewards/margins": 3.0045018196105957, + "rewards/rejected": -0.6433390378952026, + "step": 290 + }, + { + "epoch": 0.1, + "learning_rate": 4.990555345674349e-07, + "logits/chosen": -1.483705997467041, + "logits/rejected": -1.069526195526123, + "logps/chosen": -341.3657531738281, + "logps/rejected": -632.6183471679688, + "loss": 0.2287, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 2.7662487030029297, + "rewards/margins": 3.5426669120788574, + "rewards/rejected": -0.7764180302619934, + "step": 300 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -1.464929223060608, + "eval_logits/rejected": -1.1360561847686768, + "eval_logps/chosen": -391.8653564453125, + "eval_logps/rejected": -565.0551147460938, + "eval_loss": 0.2127748280763626, + "eval_rewards/accuracies": 0.9200336933135986, + "eval_rewards/chosen": 2.805741310119629, + "eval_rewards/margins": 3.3596181869506836, + "eval_rewards/rejected": -0.5538769960403442, + "eval_runtime": 557.7091, + "eval_samples_per_second": 17.034, + "eval_steps_per_second": 0.533, + "step": 300 + }, + { + "epoch": 0.11, + "learning_rate": 4.971666037023044e-07, + "logits/chosen": -1.478566288948059, + "logits/rejected": -1.18798828125, + "logps/chosen": -352.68511962890625, + "logps/rejected": -547.9373168945312, + "loss": 0.2133, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.948876142501831, + "rewards/margins": 3.459970474243164, + "rewards/rejected": -0.5110937356948853, + "step": 310 + }, + { + "epoch": 0.11, + "learning_rate": 4.952776728371742e-07, + "logits/chosen": -1.441450834274292, + "logits/rejected": -1.1746580600738525, + "logps/chosen": -548.0260009765625, + "logps/rejected": -451.2164611816406, + "loss": 0.2126, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.0513620376586914, + "rewards/margins": 3.686058759689331, + "rewards/rejected": -0.6346968412399292, + "step": 320 + }, + { + "epoch": 0.11, + "learning_rate": 4.933887419720438e-07, + "logits/chosen": -1.4612153768539429, + "logits/rejected": -1.1157002449035645, + "logps/chosen": -409.2095642089844, + "logps/rejected": -628.3384399414062, + "loss": 0.2312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.231168031692505, + "rewards/margins": 4.0572967529296875, + "rewards/rejected": -0.826129138469696, + "step": 330 + }, + { + "epoch": 0.12, + "learning_rate": 4.914998111069135e-07, + "logits/chosen": -1.471062183380127, + "logits/rejected": -1.1619117259979248, + "logps/chosen": -319.6500549316406, + "logps/rejected": -560.5769653320312, + "loss": 0.1976, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.2418594360351562, + "rewards/margins": 3.9750003814697266, + "rewards/rejected": -0.733141303062439, + "step": 340 + }, + { + "epoch": 0.12, + "learning_rate": 4.896108802417831e-07, + "logits/chosen": -1.4680635929107666, + "logits/rejected": -1.2122979164123535, + "logps/chosen": -383.1250305175781, + "logps/rejected": -619.9765625, + "loss": 0.2053, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.1170341968536377, + "rewards/margins": 3.632521152496338, + "rewards/rejected": -0.5154868364334106, + "step": 350 + }, + { + "epoch": 0.12, + "learning_rate": 4.877219493766528e-07, + "logits/chosen": -1.480360746383667, + "logits/rejected": -1.1829755306243896, + "logps/chosen": -315.16925048828125, + "logps/rejected": -440.88409423828125, + "loss": 0.1615, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.1895642280578613, + "rewards/margins": 4.29224967956543, + "rewards/rejected": -1.1026861667633057, + "step": 360 + }, + { + "epoch": 0.13, + "learning_rate": 4.858330185115224e-07, + "logits/chosen": -1.4765106439590454, + "logits/rejected": -1.180673360824585, + "logps/chosen": -322.9911193847656, + "logps/rejected": -827.7867431640625, + "loss": 0.1879, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.0941760540008545, + "rewards/margins": 3.847609043121338, + "rewards/rejected": -0.7534326910972595, + "step": 370 + }, + { + "epoch": 0.13, + "learning_rate": 4.839440876463921e-07, + "logits/chosen": -1.4910002946853638, + "logits/rejected": -1.1475781202316284, + "logps/chosen": -359.06103515625, + "logps/rejected": -703.1707153320312, + "loss": 0.1842, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.6445469856262207, + "rewards/margins": 4.668353080749512, + "rewards/rejected": -1.023805856704712, + "step": 380 + }, + { + "epoch": 0.13, + "learning_rate": 4.820551567812618e-07, + "logits/chosen": -1.4512460231781006, + "logits/rejected": -1.1814398765563965, + "logps/chosen": -393.1922302246094, + "logps/rejected": -443.9873962402344, + "loss": 0.1744, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.31535267829895, + "rewards/margins": 3.9874045848846436, + "rewards/rejected": -0.6720519065856934, + "step": 390 + }, + { + "epoch": 0.14, + "learning_rate": 4.801662259161314e-07, + "logits/chosen": -1.4928423166275024, + "logits/rejected": -1.0846529006958008, + "logps/chosen": -340.31768798828125, + "logps/rejected": -675.9088745117188, + "loss": 0.158, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.019392967224121, + "rewards/margins": 5.392711639404297, + "rewards/rejected": -1.3733187913894653, + "step": 400 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -1.462174892425537, + "eval_logits/rejected": -1.1299500465393066, + "eval_logps/chosen": -385.3669738769531, + "eval_logps/rejected": -569.8557739257812, + "eval_loss": 0.1673159897327423, + "eval_rewards/accuracies": 0.932659924030304, + "eval_rewards/chosen": 3.4555790424346924, + "eval_rewards/margins": 4.489521503448486, + "eval_rewards/rejected": -1.0339421033859253, + "eval_runtime": 557.8843, + "eval_samples_per_second": 17.029, + "eval_steps_per_second": 0.532, + "step": 400 + }, + { + "epoch": 0.14, + "learning_rate": 4.782772950510011e-07, + "logits/chosen": -1.4920897483825684, + "logits/rejected": -1.212968111038208, + "logps/chosen": -316.6252746582031, + "logps/rejected": -558.7969360351562, + "loss": 0.165, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.692905902862549, + "rewards/margins": 4.354551315307617, + "rewards/rejected": -0.6616458296775818, + "step": 410 + }, + { + "epoch": 0.14, + "learning_rate": 4.7638836418587073e-07, + "logits/chosen": -1.468379259109497, + "logits/rejected": -1.1990084648132324, + "logps/chosen": -325.2828369140625, + "logps/rejected": -746.1368408203125, + "loss": 0.1803, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.6396260261535645, + "rewards/margins": 4.8296284675598145, + "rewards/rejected": -1.19000244140625, + "step": 420 + }, + { + "epoch": 0.15, + "learning_rate": 4.7449943332074044e-07, + "logits/chosen": -1.4748234748840332, + "logits/rejected": -1.181004285812378, + "logps/chosen": -308.9472961425781, + "logps/rejected": -665.6039428710938, + "loss": 0.1592, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.706333875656128, + "rewards/margins": 4.874017715454102, + "rewards/rejected": -1.1676843166351318, + "step": 430 + }, + { + "epoch": 0.15, + "learning_rate": 4.7261050245561014e-07, + "logits/chosen": -1.4771819114685059, + "logits/rejected": -1.1283105611801147, + "logps/chosen": -328.56915283203125, + "logps/rejected": -495.9109802246094, + "loss": 0.1475, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.4302115440368652, + "rewards/margins": 4.693282127380371, + "rewards/rejected": -1.2630702257156372, + "step": 440 + }, + { + "epoch": 0.15, + "learning_rate": 4.7072157159047975e-07, + "logits/chosen": -1.4672292470932007, + "logits/rejected": -1.0770254135131836, + "logps/chosen": -366.31182861328125, + "logps/rejected": -418.83758544921875, + "loss": 0.1714, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.2115979194641113, + "rewards/margins": 4.826067924499512, + "rewards/rejected": -1.6144702434539795, + "step": 450 + }, + { + "epoch": 0.16, + "learning_rate": 4.6883264072534946e-07, + "logits/chosen": -1.4420884847640991, + "logits/rejected": -1.096064567565918, + "logps/chosen": -405.8441467285156, + "logps/rejected": -466.4434509277344, + "loss": 0.1394, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.97516131401062, + "rewards/margins": 5.2190327644348145, + "rewards/rejected": -1.2438714504241943, + "step": 460 + }, + { + "epoch": 0.16, + "learning_rate": 4.6694370986021906e-07, + "logits/chosen": -1.4477102756500244, + "logits/rejected": -1.1750242710113525, + "logps/chosen": -496.8306579589844, + "logps/rejected": -317.3291015625, + "loss": 0.1444, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.208278656005859, + "rewards/margins": 5.423037052154541, + "rewards/rejected": -1.2147585153579712, + "step": 470 + }, + { + "epoch": 0.16, + "learning_rate": 4.6505477899508877e-07, + "logits/chosen": -1.4701149463653564, + "logits/rejected": -1.249976396560669, + "logps/chosen": -317.600830078125, + "logps/rejected": -649.2884521484375, + "loss": 0.1347, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.9683494567871094, + "rewards/margins": 5.167794227600098, + "rewards/rejected": -1.199444055557251, + "step": 480 + }, + { + "epoch": 0.17, + "learning_rate": 4.631658481299584e-07, + "logits/chosen": -1.4915847778320312, + "logits/rejected": -1.1582549810409546, + "logps/chosen": -343.9900207519531, + "logps/rejected": -545.9863891601562, + "loss": 0.1168, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.775851726531982, + "rewards/margins": 6.124849796295166, + "rewards/rejected": -1.3489978313446045, + "step": 490 + }, + { + "epoch": 0.17, + "learning_rate": 4.612769172648281e-07, + "logits/chosen": -1.4516441822052002, + "logits/rejected": -1.1763832569122314, + "logps/chosen": -479.01776123046875, + "logps/rejected": -355.7424011230469, + "loss": 0.1599, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.103041172027588, + "rewards/margins": 5.349932670593262, + "rewards/rejected": -1.2468923330307007, + "step": 500 + }, + { + "epoch": 0.17, + "eval_logits/chosen": -1.4607428312301636, + "eval_logits/rejected": -1.1274610757827759, + "eval_logps/chosen": -382.4375915527344, + "eval_logps/rejected": -572.8546142578125, + "eval_loss": 0.13974203169345856, + "eval_rewards/accuracies": 0.9461279511451721, + "eval_rewards/chosen": 3.7485170364379883, + "eval_rewards/margins": 5.082335472106934, + "eval_rewards/rejected": -1.3338183164596558, + "eval_runtime": 557.5578, + "eval_samples_per_second": 17.039, + "eval_steps_per_second": 0.533, + "step": 500 + }, + { + "epoch": 0.17, + "learning_rate": 4.5938798639969773e-07, + "logits/chosen": -1.4614421129226685, + "logits/rejected": -1.1661673784255981, + "logps/chosen": -432.869384765625, + "logps/rejected": -702.9627685546875, + "loss": 0.1517, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.9662163257598877, + "rewards/margins": 5.194244384765625, + "rewards/rejected": -1.228027105331421, + "step": 510 + }, + { + "epoch": 0.18, + "learning_rate": 4.574990555345674e-07, + "logits/chosen": -1.4722058773040771, + "logits/rejected": -1.1686432361602783, + "logps/chosen": -335.47344970703125, + "logps/rejected": -595.4827880859375, + "loss": 0.1268, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.8043179512023926, + "rewards/margins": 5.093755722045898, + "rewards/rejected": -1.2894370555877686, + "step": 520 + }, + { + "epoch": 0.18, + "learning_rate": 4.556101246694371e-07, + "logits/chosen": -1.4674203395843506, + "logits/rejected": -1.1525086164474487, + "logps/chosen": -329.02264404296875, + "logps/rejected": -652.1644287109375, + "loss": 0.1353, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.099880218505859, + "rewards/margins": 5.421080589294434, + "rewards/rejected": -1.3212003707885742, + "step": 530 + }, + { + "epoch": 0.18, + "learning_rate": 4.5372119380430675e-07, + "logits/chosen": -1.4733096361160278, + "logits/rejected": -1.134479284286499, + "logps/chosen": -315.3797912597656, + "logps/rejected": -457.8125915527344, + "loss": 0.1457, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9675514698028564, + "rewards/margins": 5.392933368682861, + "rewards/rejected": -1.4253814220428467, + "step": 540 + }, + { + "epoch": 0.19, + "learning_rate": 4.518322629391764e-07, + "logits/chosen": -1.465785264968872, + "logits/rejected": -1.1767680644989014, + "logps/chosen": -327.9288635253906, + "logps/rejected": -518.0633544921875, + "loss": 0.1361, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.936115264892578, + "rewards/margins": 5.388223171234131, + "rewards/rejected": -1.45210862159729, + "step": 550 + }, + { + "epoch": 0.19, + "learning_rate": 4.4994333207404607e-07, + "logits/chosen": -1.4351527690887451, + "logits/rejected": -1.1543748378753662, + "logps/chosen": -454.43292236328125, + "logps/rejected": -475.8153381347656, + "loss": 0.1295, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.992440700531006, + "rewards/margins": 5.424699306488037, + "rewards/rejected": -1.4322583675384521, + "step": 560 + }, + { + "epoch": 0.19, + "learning_rate": 4.480544012089157e-07, + "logits/chosen": -1.4429913759231567, + "logits/rejected": -1.201302409172058, + "logps/chosen": -496.41790771484375, + "logps/rejected": -365.816650390625, + "loss": 0.1039, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.107169151306152, + "rewards/margins": 5.378964424133301, + "rewards/rejected": -1.2717949151992798, + "step": 570 + }, + { + "epoch": 0.2, + "learning_rate": 4.461654703437854e-07, + "logits/chosen": -1.4437055587768555, + "logits/rejected": -1.1555430889129639, + "logps/chosen": -460.52252197265625, + "logps/rejected": -543.8465576171875, + "loss": 0.1497, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.182587623596191, + "rewards/margins": 5.458142280578613, + "rewards/rejected": -1.2755542993545532, + "step": 580 + }, + { + "epoch": 0.2, + "learning_rate": 4.442765394786551e-07, + "logits/chosen": -1.4703487157821655, + "logits/rejected": -1.1514756679534912, + "logps/chosen": -335.76141357421875, + "logps/rejected": -425.265380859375, + "loss": 0.1179, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.938772678375244, + "rewards/margins": 5.324645042419434, + "rewards/rejected": -1.3858733177185059, + "step": 590 + }, + { + "epoch": 0.2, + "learning_rate": 4.423876086135247e-07, + "logits/chosen": -1.4418364763259888, + "logits/rejected": -1.1505249738693237, + "logps/chosen": -448.2904357910156, + "logps/rejected": -721.1622314453125, + "loss": 0.1389, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.9244837760925293, + "rewards/margins": 5.320973873138428, + "rewards/rejected": -1.3964899778366089, + "step": 600 + }, + { + "epoch": 0.2, + "eval_logits/chosen": -1.4519003629684448, + "eval_logits/rejected": -1.1194298267364502, + "eval_logps/chosen": -380.6632995605469, + "eval_logps/rejected": -574.6277465820312, + "eval_loss": 0.12727472186088562, + "eval_rewards/accuracies": 0.9528619647026062, + "eval_rewards/chosen": 3.9259443283081055, + "eval_rewards/margins": 5.437079906463623, + "eval_rewards/rejected": -1.5111361742019653, + "eval_runtime": 557.5407, + "eval_samples_per_second": 17.039, + "eval_steps_per_second": 0.533, + "step": 600 + }, + { + "epoch": 0.21, + "learning_rate": 4.404986777483944e-07, + "logits/chosen": -1.462982416152954, + "logits/rejected": -1.1688556671142578, + "logps/chosen": -368.68914794921875, + "logps/rejected": -497.86566162109375, + "loss": 0.1364, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.3148436546325684, + "rewards/margins": 4.890419006347656, + "rewards/rejected": -1.5755746364593506, + "step": 610 + }, + { + "epoch": 0.21, + "learning_rate": 4.3860974688326405e-07, + "logits/chosen": -1.4497849941253662, + "logits/rejected": -1.1572027206420898, + "logps/chosen": -465.70391845703125, + "logps/rejected": -631.2728881835938, + "loss": 0.1163, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.193670272827148, + "rewards/margins": 5.658702850341797, + "rewards/rejected": -1.465032935142517, + "step": 620 + }, + { + "epoch": 0.21, + "learning_rate": 4.367208160181337e-07, + "logits/chosen": -1.4319039583206177, + "logits/rejected": -1.1632667779922485, + "logps/chosen": -421.3758239746094, + "logps/rejected": -333.9304504394531, + "loss": 0.1224, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.695667266845703, + "rewards/margins": 5.386081218719482, + "rewards/rejected": -1.6904138326644897, + "step": 630 + }, + { + "epoch": 0.22, + "learning_rate": 4.348318851530034e-07, + "logits/chosen": -1.4637318849563599, + "logits/rejected": -1.095100998878479, + "logps/chosen": -396.5002136230469, + "logps/rejected": -622.4762573242188, + "loss": 0.09, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.092984199523926, + "rewards/margins": 6.124913215637207, + "rewards/rejected": -2.0319290161132812, + "step": 640 + }, + { + "epoch": 0.22, + "learning_rate": 4.32942954287873e-07, + "logits/chosen": -1.4509179592132568, + "logits/rejected": -1.1328189373016357, + "logps/chosen": -371.68572998046875, + "logps/rejected": -406.5970458984375, + "loss": 0.1163, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 4.031527996063232, + "rewards/margins": 5.5390424728393555, + "rewards/rejected": -1.5075138807296753, + "step": 650 + }, + { + "epoch": 0.22, + "learning_rate": 4.3105402342274273e-07, + "logits/chosen": -1.4884783029556274, + "logits/rejected": -1.1401994228363037, + "logps/chosen": -321.93023681640625, + "logps/rejected": -570.5530395507812, + "loss": 0.1097, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.893491744995117, + "rewards/margins": 5.886297702789307, + "rewards/rejected": -1.9928067922592163, + "step": 660 + }, + { + "epoch": 0.23, + "learning_rate": 4.2916509255761233e-07, + "logits/chosen": -1.45878005027771, + "logits/rejected": -1.0974493026733398, + "logps/chosen": -378.56256103515625, + "logps/rejected": -671.4094848632812, + "loss": 0.1419, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.538317918777466, + "rewards/margins": 5.423024654388428, + "rewards/rejected": -1.8847074508666992, + "step": 670 + }, + { + "epoch": 0.23, + "learning_rate": 4.2727616169248204e-07, + "logits/chosen": -1.4535772800445557, + "logits/rejected": -1.0777978897094727, + "logps/chosen": -347.06304931640625, + "logps/rejected": -607.7723999023438, + "loss": 0.1015, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.180381774902344, + "rewards/margins": 6.305299282073975, + "rewards/rejected": -2.12491774559021, + "step": 680 + }, + { + "epoch": 0.23, + "learning_rate": 4.253872308273517e-07, + "logits/chosen": -1.4577261209487915, + "logits/rejected": -1.1172749996185303, + "logps/chosen": -428.004150390625, + "logps/rejected": -609.6082763671875, + "loss": 0.0902, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.4771833419799805, + "rewards/margins": 6.247294902801514, + "rewards/rejected": -1.770111083984375, + "step": 690 + }, + { + "epoch": 0.24, + "learning_rate": 4.2349829996222135e-07, + "logits/chosen": -1.459695816040039, + "logits/rejected": -1.1408016681671143, + "logps/chosen": -375.980224609375, + "logps/rejected": -637.662109375, + "loss": 0.0778, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.746489524841309, + "rewards/margins": 6.767748832702637, + "rewards/rejected": -2.0212595462799072, + "step": 700 + }, + { + "epoch": 0.24, + "eval_logits/chosen": -1.4541884660720825, + "eval_logits/rejected": -1.1302434206008911, + "eval_logps/chosen": -379.2232971191406, + "eval_logps/rejected": -578.0139770507812, + "eval_loss": 0.1122458353638649, + "eval_rewards/accuracies": 0.9612794518470764, + "eval_rewards/chosen": 4.069947719573975, + "eval_rewards/margins": 5.919719219207764, + "eval_rewards/rejected": -1.84977126121521, + "eval_runtime": 559.0869, + "eval_samples_per_second": 16.992, + "eval_steps_per_second": 0.531, + "step": 700 + }, + { + "epoch": 0.24, + "learning_rate": 4.2160936909709106e-07, + "logits/chosen": -1.4686052799224854, + "logits/rejected": -1.17806077003479, + "logps/chosen": -370.8101806640625, + "logps/rejected": -445.7115783691406, + "loss": 0.1139, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.275851249694824, + "rewards/margins": 5.940474033355713, + "rewards/rejected": -1.6646230220794678, + "step": 710 + }, + { + "epoch": 0.24, + "learning_rate": 4.1972043823196066e-07, + "logits/chosen": -1.4710712432861328, + "logits/rejected": -1.2118116617202759, + "logps/chosen": -349.842041015625, + "logps/rejected": -497.0186462402344, + "loss": 0.1405, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.897695541381836, + "rewards/margins": 6.115738391876221, + "rewards/rejected": -2.2180426120758057, + "step": 720 + }, + { + "epoch": 0.25, + "learning_rate": 4.1783150736683037e-07, + "logits/chosen": -1.4504650831222534, + "logits/rejected": -1.1481643915176392, + "logps/chosen": -421.00927734375, + "logps/rejected": -541.6673583984375, + "loss": 0.1052, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.073805809020996, + "rewards/margins": 5.906230926513672, + "rewards/rejected": -1.8324254751205444, + "step": 730 + }, + { + "epoch": 0.25, + "learning_rate": 4.1594257650170003e-07, + "logits/chosen": -1.475731611251831, + "logits/rejected": -1.1985948085784912, + "logps/chosen": -405.5483703613281, + "logps/rejected": -488.00091552734375, + "loss": 0.1015, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 3.7700603008270264, + "rewards/margins": 5.521186351776123, + "rewards/rejected": -1.7511262893676758, + "step": 740 + }, + { + "epoch": 0.25, + "learning_rate": 4.140536456365697e-07, + "logits/chosen": -1.4649735689163208, + "logits/rejected": -1.115206241607666, + "logps/chosen": -314.864501953125, + "logps/rejected": -583.0494995117188, + "loss": 0.0764, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.480077266693115, + "rewards/margins": 6.317180633544922, + "rewards/rejected": -1.837104082107544, + "step": 750 + }, + { + "epoch": 0.26, + "learning_rate": 4.1216471477143934e-07, + "logits/chosen": -1.4586890935897827, + "logits/rejected": -1.1937768459320068, + "logps/chosen": -318.05615234375, + "logps/rejected": -491.633056640625, + "loss": 0.1225, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.9885802268981934, + "rewards/margins": 5.613485813140869, + "rewards/rejected": -1.6249048709869385, + "step": 760 + }, + { + "epoch": 0.26, + "learning_rate": 4.10275783906309e-07, + "logits/chosen": -1.474686861038208, + "logits/rejected": -1.2240248918533325, + "logps/chosen": -448.8330993652344, + "logps/rejected": -574.8685302734375, + "loss": 0.079, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.1468000411987305, + "rewards/margins": 6.423197269439697, + "rewards/rejected": -2.276397228240967, + "step": 770 + }, + { + "epoch": 0.27, + "learning_rate": 4.0838685304117865e-07, + "logits/chosen": -1.4716846942901611, + "logits/rejected": -1.1474727392196655, + "logps/chosen": -458.2037658691406, + "logps/rejected": -677.0916137695312, + "loss": 0.103, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.230536460876465, + "rewards/margins": 6.255263805389404, + "rewards/rejected": -2.0247273445129395, + "step": 780 + }, + { + "epoch": 0.27, + "learning_rate": 4.0649792217604836e-07, + "logits/chosen": -1.4893128871917725, + "logits/rejected": -1.1543127298355103, + "logps/chosen": -368.73876953125, + "logps/rejected": -385.9571533203125, + "loss": 0.0759, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.367506504058838, + "rewards/margins": 6.192745208740234, + "rewards/rejected": -1.825238823890686, + "step": 790 + }, + { + "epoch": 0.27, + "learning_rate": 4.04608991310918e-07, + "logits/chosen": -1.4659887552261353, + "logits/rejected": -1.2101812362670898, + "logps/chosen": -387.55120849609375, + "logps/rejected": -527.9453735351562, + "loss": 0.0993, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.042902946472168, + "rewards/margins": 5.99516487121582, + "rewards/rejected": -1.9522621631622314, + "step": 800 + }, + { + "epoch": 0.27, + "eval_logits/chosen": -1.46894109249115, + "eval_logits/rejected": -1.142426609992981, + "eval_logps/chosen": -377.5001220703125, + "eval_logps/rejected": -579.4506225585938, + "eval_loss": 0.09749113768339157, + "eval_rewards/accuracies": 0.9663299918174744, + "eval_rewards/chosen": 4.2422590255737305, + "eval_rewards/margins": 6.235683441162109, + "eval_rewards/rejected": -1.9934238195419312, + "eval_runtime": 558.57, + "eval_samples_per_second": 17.008, + "eval_steps_per_second": 0.532, + "step": 800 + }, + { + "epoch": 0.28, + "learning_rate": 4.0272006044578767e-07, + "logits/chosen": -1.4774185419082642, + "logits/rejected": -1.185450553894043, + "logps/chosen": -389.61981201171875, + "logps/rejected": -652.4323120117188, + "loss": 0.1036, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.061360836029053, + "rewards/margins": 6.323044776916504, + "rewards/rejected": -2.261683940887451, + "step": 810 + }, + { + "epoch": 0.28, + "learning_rate": 4.0083112958065733e-07, + "logits/chosen": -1.4516403675079346, + "logits/rejected": -1.1874592304229736, + "logps/chosen": -475.3050231933594, + "logps/rejected": -444.3898010253906, + "loss": 0.1065, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.8749263286590576, + "rewards/margins": 5.786238670349121, + "rewards/rejected": -1.9113123416900635, + "step": 820 + }, + { + "epoch": 0.28, + "learning_rate": 3.98942198715527e-07, + "logits/chosen": -1.4862116575241089, + "logits/rejected": -1.2050374746322632, + "logps/chosen": -303.4962463378906, + "logps/rejected": -611.756591796875, + "loss": 0.1023, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.315167427062988, + "rewards/margins": 6.290981769561768, + "rewards/rejected": -1.9758144617080688, + "step": 830 + }, + { + "epoch": 0.29, + "learning_rate": 3.970532678503967e-07, + "logits/chosen": -1.4739606380462646, + "logits/rejected": -1.2146607637405396, + "logps/chosen": -395.7440490722656, + "logps/rejected": -519.9666748046875, + "loss": 0.0939, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.144864559173584, + "rewards/margins": 7.278559684753418, + "rewards/rejected": -2.133694648742676, + "step": 840 + }, + { + "epoch": 0.29, + "learning_rate": 3.951643369852663e-07, + "logits/chosen": -1.476678490638733, + "logits/rejected": -1.206061601638794, + "logps/chosen": -404.0805358886719, + "logps/rejected": -790.9165649414062, + "loss": 0.0917, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.473787307739258, + "rewards/margins": 6.4759931564331055, + "rewards/rejected": -2.002206325531006, + "step": 850 + }, + { + "epoch": 0.29, + "learning_rate": 3.93275406120136e-07, + "logits/chosen": -1.471995234489441, + "logits/rejected": -1.2011922597885132, + "logps/chosen": -373.6271057128906, + "logps/rejected": -589.4290771484375, + "loss": 0.0896, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.3988142013549805, + "rewards/margins": 6.434880256652832, + "rewards/rejected": -2.036065101623535, + "step": 860 + }, + { + "epoch": 0.3, + "learning_rate": 3.913864752550056e-07, + "logits/chosen": -1.4729435443878174, + "logits/rejected": -1.2467955350875854, + "logps/chosen": -479.82470703125, + "logps/rejected": -649.1353759765625, + "loss": 0.1121, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 5.313334941864014, + "rewards/margins": 7.132607936859131, + "rewards/rejected": -1.8192729949951172, + "step": 870 + }, + { + "epoch": 0.3, + "learning_rate": 3.894975443898753e-07, + "logits/chosen": -1.4922538995742798, + "logits/rejected": -1.1949760913848877, + "logps/chosen": -295.2701721191406, + "logps/rejected": -510.95001220703125, + "loss": 0.0773, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.697510719299316, + "rewards/margins": 6.993855953216553, + "rewards/rejected": -2.2963459491729736, + "step": 880 + }, + { + "epoch": 0.3, + "learning_rate": 3.87608613524745e-07, + "logits/chosen": -1.4829437732696533, + "logits/rejected": -1.2089799642562866, + "logps/chosen": -367.78387451171875, + "logps/rejected": -401.001953125, + "loss": 0.1007, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.9919090270996094, + "rewards/margins": 5.9659037590026855, + "rewards/rejected": -1.973995566368103, + "step": 890 + }, + { + "epoch": 0.31, + "learning_rate": 3.857196826596146e-07, + "logits/chosen": -1.5004401206970215, + "logits/rejected": -1.0854889154434204, + "logps/chosen": -337.4493408203125, + "logps/rejected": -477.30963134765625, + "loss": 0.111, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 4.4730706214904785, + "rewards/margins": 6.716238498687744, + "rewards/rejected": -2.2431674003601074, + "step": 900 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -1.4820351600646973, + "eval_logits/rejected": -1.1542390584945679, + "eval_logps/chosen": -376.70477294921875, + "eval_logps/rejected": -582.0501098632812, + "eval_loss": 0.09071440994739532, + "eval_rewards/accuracies": 0.9696969985961914, + "eval_rewards/chosen": 4.32179594039917, + "eval_rewards/margins": 6.575175762176514, + "eval_rewards/rejected": -2.253380537033081, + "eval_runtime": 559.8508, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 0.53, + "step": 900 + }, + { + "epoch": 0.31, + "learning_rate": 3.8383075179448433e-07, + "logits/chosen": -1.48300302028656, + "logits/rejected": -1.1969270706176758, + "logps/chosen": -444.98114013671875, + "logps/rejected": -399.57403564453125, + "loss": 0.0892, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.185998439788818, + "rewards/margins": 6.078363418579102, + "rewards/rejected": -1.8923648595809937, + "step": 910 + }, + { + "epoch": 0.31, + "learning_rate": 3.8194182092935394e-07, + "logits/chosen": -1.4898041486740112, + "logits/rejected": -1.1662390232086182, + "logps/chosen": -322.0852355957031, + "logps/rejected": -505.6220703125, + "loss": 0.0793, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.525553226470947, + "rewards/margins": 6.857700347900391, + "rewards/rejected": -2.3321471214294434, + "step": 920 + }, + { + "epoch": 0.32, + "learning_rate": 3.8005289006422365e-07, + "logits/chosen": -1.4851583242416382, + "logits/rejected": -1.1979453563690186, + "logps/chosen": -358.98101806640625, + "logps/rejected": -621.0003051757812, + "loss": 0.0882, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.939785957336426, + "rewards/margins": 7.3684186935424805, + "rewards/rejected": -2.4286324977874756, + "step": 930 + }, + { + "epoch": 0.32, + "learning_rate": 3.7816395919909325e-07, + "logits/chosen": -1.49127197265625, + "logits/rejected": -1.239793062210083, + "logps/chosen": -314.9703674316406, + "logps/rejected": -568.3617553710938, + "loss": 0.0664, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.589601993560791, + "rewards/margins": 6.650811672210693, + "rewards/rejected": -2.0612106323242188, + "step": 940 + }, + { + "epoch": 0.32, + "learning_rate": 3.7627502833396296e-07, + "logits/chosen": -1.4954484701156616, + "logits/rejected": -1.247184157371521, + "logps/chosen": -381.68499755859375, + "logps/rejected": -518.2406005859375, + "loss": 0.0898, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.539010047912598, + "rewards/margins": 6.654293060302734, + "rewards/rejected": -2.115283489227295, + "step": 950 + }, + { + "epoch": 0.33, + "learning_rate": 3.7438609746883267e-07, + "logits/chosen": -1.475711464881897, + "logits/rejected": -1.2076390981674194, + "logps/chosen": -448.14556884765625, + "logps/rejected": -554.5950927734375, + "loss": 0.0742, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 5.037951946258545, + "rewards/margins": 7.1611647605896, + "rewards/rejected": -2.1232128143310547, + "step": 960 + }, + { + "epoch": 0.33, + "learning_rate": 3.7249716660370227e-07, + "logits/chosen": -1.4880720376968384, + "logits/rejected": -1.206027865409851, + "logps/chosen": -303.1774597167969, + "logps/rejected": -724.1024169921875, + "loss": 0.0858, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.292189121246338, + "rewards/margins": 6.413214683532715, + "rewards/rejected": -2.1210262775421143, + "step": 970 + }, + { + "epoch": 0.33, + "learning_rate": 3.70608235738572e-07, + "logits/chosen": -1.4684410095214844, + "logits/rejected": -1.15514075756073, + "logps/chosen": -447.8116760253906, + "logps/rejected": -666.262939453125, + "loss": 0.1078, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.545645236968994, + "rewards/margins": 7.141517639160156, + "rewards/rejected": -2.595871925354004, + "step": 980 + }, + { + "epoch": 0.34, + "learning_rate": 3.687193048734416e-07, + "logits/chosen": -1.4797093868255615, + "logits/rejected": -1.1443145275115967, + "logps/chosen": -302.0823059082031, + "logps/rejected": -653.0443115234375, + "loss": 0.0951, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.940204620361328, + "rewards/margins": 7.737614631652832, + "rewards/rejected": -2.797410011291504, + "step": 990 + }, + { + "epoch": 0.34, + "learning_rate": 3.668303740083113e-07, + "logits/chosen": -1.483666181564331, + "logits/rejected": -1.183774709701538, + "logps/chosen": -401.68359375, + "logps/rejected": -488.19451904296875, + "loss": 0.0893, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.864621639251709, + "rewards/margins": 7.051810264587402, + "rewards/rejected": -2.187187671661377, + "step": 1000 + }, + { + "epoch": 0.34, + "eval_logits/chosen": -1.469427227973938, + "eval_logits/rejected": -1.14968740940094, + "eval_logps/chosen": -376.04510498046875, + "eval_logps/rejected": -582.1046752929688, + "eval_loss": 0.0881563276052475, + "eval_rewards/accuracies": 0.9663299918174744, + "eval_rewards/chosen": 4.387765407562256, + "eval_rewards/margins": 6.646595001220703, + "eval_rewards/rejected": -2.2588300704956055, + "eval_runtime": 559.0589, + "eval_samples_per_second": 16.993, + "eval_steps_per_second": 0.531, + "step": 1000 + }, + { + "epoch": 0.34, + "learning_rate": 3.6494144314318094e-07, + "logits/chosen": -1.4677711725234985, + "logits/rejected": -1.22615647315979, + "logps/chosen": -408.8759765625, + "logps/rejected": -469.7860412597656, + "loss": 0.1257, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.661646842956543, + "rewards/margins": 6.4328932762146, + "rewards/rejected": -1.7712465524673462, + "step": 1010 + }, + { + "epoch": 0.35, + "learning_rate": 3.630525122780506e-07, + "logits/chosen": -1.497859239578247, + "logits/rejected": -1.1968727111816406, + "logps/chosen": -288.6357116699219, + "logps/rejected": -523.6803588867188, + "loss": 0.103, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.572832107543945, + "rewards/margins": 6.243821620941162, + "rewards/rejected": -1.670989990234375, + "step": 1020 + }, + { + "epoch": 0.35, + "learning_rate": 3.6116358141292026e-07, + "logits/chosen": -1.4879519939422607, + "logits/rejected": -1.2336044311523438, + "logps/chosen": -303.6993103027344, + "logps/rejected": -508.20123291015625, + "loss": 0.128, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.822319030761719, + "rewards/margins": 6.814971923828125, + "rewards/rejected": -1.9926522970199585, + "step": 1030 + }, + { + "epoch": 0.35, + "learning_rate": 3.592746505477899e-07, + "logits/chosen": -1.4755656719207764, + "logits/rejected": -1.2433079481124878, + "logps/chosen": -495.46337890625, + "logps/rejected": -697.58740234375, + "loss": 0.0889, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.0532355308532715, + "rewards/margins": 6.689316749572754, + "rewards/rejected": -2.636080503463745, + "step": 1040 + }, + { + "epoch": 0.36, + "learning_rate": 3.573857196826596e-07, + "logits/chosen": -1.512407660484314, + "logits/rejected": -1.203151822090149, + "logps/chosen": -480.6717224121094, + "logps/rejected": -505.75830078125, + "loss": 0.0971, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.397000312805176, + "rewards/margins": 6.988126277923584, + "rewards/rejected": -2.591125726699829, + "step": 1050 + }, + { + "epoch": 0.36, + "learning_rate": 3.554967888175293e-07, + "logits/chosen": -1.5000110864639282, + "logits/rejected": -1.1538686752319336, + "logps/chosen": -380.41741943359375, + "logps/rejected": -551.3828735351562, + "loss": 0.1111, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.297232151031494, + "rewards/margins": 6.739757537841797, + "rewards/rejected": -2.4425251483917236, + "step": 1060 + }, + { + "epoch": 0.36, + "learning_rate": 3.5360785795239893e-07, + "logits/chosen": -1.4952175617218018, + "logits/rejected": -1.2412437200546265, + "logps/chosen": -433.78424072265625, + "logps/rejected": -458.8251953125, + "loss": 0.0803, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.398755073547363, + "rewards/margins": 6.876921653747559, + "rewards/rejected": -2.478165864944458, + "step": 1070 + }, + { + "epoch": 0.37, + "learning_rate": 3.517189270872686e-07, + "logits/chosen": -1.503846526145935, + "logits/rejected": -1.2310945987701416, + "logps/chosen": -378.23150634765625, + "logps/rejected": -460.6910705566406, + "loss": 0.0755, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.319530487060547, + "rewards/margins": 6.819169521331787, + "rewards/rejected": -2.4996395111083984, + "step": 1080 + }, + { + "epoch": 0.37, + "learning_rate": 3.4982999622213824e-07, + "logits/chosen": -1.489429235458374, + "logits/rejected": -1.194059133529663, + "logps/chosen": -378.36224365234375, + "logps/rejected": -339.7525329589844, + "loss": 0.0693, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.176763534545898, + "rewards/margins": 6.276023864746094, + "rewards/rejected": -2.099259853363037, + "step": 1090 + }, + { + "epoch": 0.37, + "learning_rate": 3.479410653570079e-07, + "logits/chosen": -1.4815757274627686, + "logits/rejected": -1.1958659887313843, + "logps/chosen": -382.04718017578125, + "logps/rejected": -590.7028198242188, + "loss": 0.079, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.512447834014893, + "rewards/margins": 7.3736677169799805, + "rewards/rejected": -2.861220121383667, + "step": 1100 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -1.4807450771331787, + "eval_logits/rejected": -1.153213620185852, + "eval_logps/chosen": -375.21636962890625, + "eval_logps/rejected": -582.6480712890625, + "eval_loss": 0.08400283753871918, + "eval_rewards/accuracies": 0.9688552021980286, + "eval_rewards/chosen": 4.470638751983643, + "eval_rewards/margins": 6.783812046051025, + "eval_rewards/rejected": -2.313173294067383, + "eval_runtime": 560.0503, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 0.53, + "step": 1100 + }, + { + "epoch": 0.38, + "learning_rate": 3.460521344918776e-07, + "logits/chosen": -1.5088837146759033, + "logits/rejected": -1.1168277263641357, + "logps/chosen": -275.2812194824219, + "logps/rejected": -645.93701171875, + "loss": 0.102, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.146356105804443, + "rewards/margins": 6.36793327331543, + "rewards/rejected": -2.2215771675109863, + "step": 1110 + }, + { + "epoch": 0.38, + "learning_rate": 3.441632036267472e-07, + "logits/chosen": -1.50923752784729, + "logits/rejected": -1.1946974992752075, + "logps/chosen": -330.641357421875, + "logps/rejected": -577.8738403320312, + "loss": 0.0676, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.274444580078125, + "rewards/margins": 6.753907680511475, + "rewards/rejected": -2.4794628620147705, + "step": 1120 + }, + { + "epoch": 0.38, + "learning_rate": 3.422742727616169e-07, + "logits/chosen": -1.4846798181533813, + "logits/rejected": -1.1590081453323364, + "logps/chosen": -407.13201904296875, + "logps/rejected": -393.16961669921875, + "loss": 0.0598, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.086771011352539, + "rewards/margins": 6.169893741607666, + "rewards/rejected": -2.0831220149993896, + "step": 1130 + }, + { + "epoch": 0.39, + "learning_rate": 3.403853418964866e-07, + "logits/chosen": -1.4728076457977295, + "logits/rejected": -1.1583011150360107, + "logps/chosen": -379.17791748046875, + "logps/rejected": -501.658935546875, + "loss": 0.0813, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.588146209716797, + "rewards/margins": 7.1952104568481445, + "rewards/rejected": -2.6070632934570312, + "step": 1140 + }, + { + "epoch": 0.39, + "learning_rate": 3.3849641103135623e-07, + "logits/chosen": -1.4747841358184814, + "logits/rejected": -1.1721833944320679, + "logps/chosen": -417.3246154785156, + "logps/rejected": -365.9868469238281, + "loss": 0.0597, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 5.068565845489502, + "rewards/margins": 7.486462593078613, + "rewards/rejected": -2.4178969860076904, + "step": 1150 + }, + { + "epoch": 0.39, + "learning_rate": 3.3660748016622594e-07, + "logits/chosen": -1.4949634075164795, + "logits/rejected": -1.1580262184143066, + "logps/chosen": -463.01165771484375, + "logps/rejected": -490.30926513671875, + "loss": 0.0793, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.0085272789001465, + "rewards/margins": 6.497877597808838, + "rewards/rejected": -2.489349603652954, + "step": 1160 + }, + { + "epoch": 0.4, + "learning_rate": 3.3471854930109554e-07, + "logits/chosen": -1.4875307083129883, + "logits/rejected": -1.1987477540969849, + "logps/chosen": -300.71185302734375, + "logps/rejected": -572.1688232421875, + "loss": 0.0727, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.1474151611328125, + "rewards/margins": 6.675169467926025, + "rewards/rejected": -2.527754306793213, + "step": 1170 + }, + { + "epoch": 0.4, + "learning_rate": 3.3282961843596525e-07, + "logits/chosen": -1.4851741790771484, + "logits/rejected": -1.1590709686279297, + "logps/chosen": -357.76080322265625, + "logps/rejected": -723.2069091796875, + "loss": 0.0862, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.243594169616699, + "rewards/margins": 6.315056800842285, + "rewards/rejected": -2.071462392807007, + "step": 1180 + }, + { + "epoch": 0.4, + "learning_rate": 3.3094068757083485e-07, + "logits/chosen": -1.4589731693267822, + "logits/rejected": -1.124894142150879, + "logps/chosen": -395.55999755859375, + "logps/rejected": -413.896240234375, + "loss": 0.095, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.135136127471924, + "rewards/margins": 6.630227565765381, + "rewards/rejected": -2.495091676712036, + "step": 1190 + }, + { + "epoch": 0.41, + "learning_rate": 3.2905175670570456e-07, + "logits/chosen": -1.494888186454773, + "logits/rejected": -1.1850342750549316, + "logps/chosen": -464.22430419921875, + "logps/rejected": -514.73486328125, + "loss": 0.0706, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.990201234817505, + "rewards/margins": 6.323441028594971, + "rewards/rejected": -2.333240032196045, + "step": 1200 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -1.4885011911392212, + "eval_logits/rejected": -1.1666902303695679, + "eval_logps/chosen": -375.603759765625, + "eval_logps/rejected": -586.021728515625, + "eval_loss": 0.07206810265779495, + "eval_rewards/accuracies": 0.9722222089767456, + "eval_rewards/chosen": 4.431900501251221, + "eval_rewards/margins": 7.082433223724365, + "eval_rewards/rejected": -2.6505327224731445, + "eval_runtime": 560.4254, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 0.53, + "step": 1200 + }, + { + "epoch": 0.41, + "learning_rate": 3.271628258405742e-07, + "logits/chosen": -1.5175247192382812, + "logits/rejected": -1.1132011413574219, + "logps/chosen": -382.6025085449219, + "logps/rejected": -333.1087646484375, + "loss": 0.0749, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.4511542320251465, + "rewards/margins": 6.978426456451416, + "rewards/rejected": -2.5272724628448486, + "step": 1210 + }, + { + "epoch": 0.41, + "learning_rate": 3.252738949754439e-07, + "logits/chosen": -1.4874627590179443, + "logits/rejected": -1.1758732795715332, + "logps/chosen": -365.48291015625, + "logps/rejected": -641.5902099609375, + "loss": 0.0659, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.6857669353485107, + "rewards/margins": 6.348451614379883, + "rewards/rejected": -2.662684679031372, + "step": 1220 + }, + { + "epoch": 0.42, + "learning_rate": 3.233849641103136e-07, + "logits/chosen": -1.4980213642120361, + "logits/rejected": -1.2111032009124756, + "logps/chosen": -301.4989318847656, + "logps/rejected": -837.3327026367188, + "loss": 0.0621, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.749375820159912, + "rewards/margins": 7.8252434730529785, + "rewards/rejected": -3.0758676528930664, + "step": 1230 + }, + { + "epoch": 0.42, + "learning_rate": 3.214960332451832e-07, + "logits/chosen": -1.4798014163970947, + "logits/rejected": -1.184417963027954, + "logps/chosen": -396.10888671875, + "logps/rejected": -611.9696044921875, + "loss": 0.0912, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.219520568847656, + "rewards/margins": 7.1679277420043945, + "rewards/rejected": -2.9484081268310547, + "step": 1240 + }, + { + "epoch": 0.42, + "learning_rate": 3.196071023800529e-07, + "logits/chosen": -1.5121700763702393, + "logits/rejected": -1.2249577045440674, + "logps/chosen": -387.7380065917969, + "logps/rejected": -794.2557373046875, + "loss": 0.0843, + "rewards/accuracies": 0.9375, + "rewards/chosen": 4.096358776092529, + "rewards/margins": 6.268472194671631, + "rewards/rejected": -2.1721131801605225, + "step": 1250 + }, + { + "epoch": 0.43, + "learning_rate": 3.1771817151492255e-07, + "logits/chosen": -1.4872616529464722, + "logits/rejected": -1.2021340131759644, + "logps/chosen": -396.11920166015625, + "logps/rejected": -725.071533203125, + "loss": 0.0682, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.439419269561768, + "rewards/margins": 7.2906174659729, + "rewards/rejected": -2.851198673248291, + "step": 1260 + }, + { + "epoch": 0.43, + "learning_rate": 3.158292406497922e-07, + "logits/chosen": -1.4843103885650635, + "logits/rejected": -1.1521885395050049, + "logps/chosen": -430.59686279296875, + "logps/rejected": -623.4092407226562, + "loss": 0.065, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.504824638366699, + "rewards/margins": 7.555941581726074, + "rewards/rejected": -3.0511183738708496, + "step": 1270 + }, + { + "epoch": 0.44, + "learning_rate": 3.1394030978466186e-07, + "logits/chosen": -1.5158779621124268, + "logits/rejected": -1.161768913269043, + "logps/chosen": -340.71282958984375, + "logps/rejected": -436.19873046875, + "loss": 0.0557, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.091620445251465, + "rewards/margins": 7.653326511383057, + "rewards/rejected": -2.5617051124572754, + "step": 1280 + }, + { + "epoch": 0.44, + "learning_rate": 3.120513789195315e-07, + "logits/chosen": -1.5111273527145386, + "logits/rejected": -1.1369271278381348, + "logps/chosen": -313.18426513671875, + "logps/rejected": -592.9158935546875, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.0614423751831055, + "rewards/margins": 7.267691612243652, + "rewards/rejected": -3.2062485218048096, + "step": 1290 + }, + { + "epoch": 0.44, + "learning_rate": 3.1016244805440117e-07, + "logits/chosen": -1.478244423866272, + "logits/rejected": -1.2829620838165283, + "logps/chosen": -377.1938781738281, + "logps/rejected": -613.537841796875, + "loss": 0.0705, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.685913562774658, + "rewards/margins": 6.318792819976807, + "rewards/rejected": -2.6328797340393066, + "step": 1300 + }, + { + "epoch": 0.44, + "eval_logits/chosen": -1.5001074075698853, + "eval_logits/rejected": -1.181748628616333, + "eval_logps/chosen": -376.17987060546875, + "eval_logps/rejected": -588.2330322265625, + "eval_loss": 0.07252340018749237, + "eval_rewards/accuracies": 0.9739057421684265, + "eval_rewards/chosen": 4.374290466308594, + "eval_rewards/margins": 7.245957374572754, + "eval_rewards/rejected": -2.87166690826416, + "eval_runtime": 559.1781, + "eval_samples_per_second": 16.989, + "eval_steps_per_second": 0.531, + "step": 1300 + }, + { + "epoch": 0.45, + "learning_rate": 3.082735171892709e-07, + "logits/chosen": -1.5192620754241943, + "logits/rejected": -1.2069748640060425, + "logps/chosen": -312.7062072753906, + "logps/rejected": -482.1133728027344, + "loss": 0.0863, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.136082649230957, + "rewards/margins": 6.253493309020996, + "rewards/rejected": -2.117410182952881, + "step": 1310 + }, + { + "epoch": 0.45, + "learning_rate": 3.0638458632414054e-07, + "logits/chosen": -1.51072096824646, + "logits/rejected": -1.2908846139907837, + "logps/chosen": -370.267333984375, + "logps/rejected": -700.0077514648438, + "loss": 0.0773, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.160948276519775, + "rewards/margins": 7.014911651611328, + "rewards/rejected": -2.8539633750915527, + "step": 1320 + }, + { + "epoch": 0.45, + "learning_rate": 3.044956554590102e-07, + "logits/chosen": -1.4973653554916382, + "logits/rejected": -1.1667084693908691, + "logps/chosen": -367.26861572265625, + "logps/rejected": -421.83868408203125, + "loss": 0.0446, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.954542875289917, + "rewards/margins": 6.594731330871582, + "rewards/rejected": -2.6401877403259277, + "step": 1330 + }, + { + "epoch": 0.46, + "learning_rate": 3.0260672459387985e-07, + "logits/chosen": -1.5250272750854492, + "logits/rejected": -1.1743382215499878, + "logps/chosen": -298.74359130859375, + "logps/rejected": -514.9177856445312, + "loss": 0.0598, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.985522270202637, + "rewards/margins": 7.792318820953369, + "rewards/rejected": -2.806795597076416, + "step": 1340 + }, + { + "epoch": 0.46, + "learning_rate": 3.007177937287495e-07, + "logits/chosen": -1.509218454360962, + "logits/rejected": -1.1687209606170654, + "logps/chosen": -313.0664978027344, + "logps/rejected": -652.6644897460938, + "loss": 0.0348, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.642588138580322, + "rewards/margins": 7.6289262771606445, + "rewards/rejected": -2.9863381385803223, + "step": 1350 + }, + { + "epoch": 0.46, + "learning_rate": 2.988288628636192e-07, + "logits/chosen": -1.5087230205535889, + "logits/rejected": -1.2443337440490723, + "logps/chosen": -378.0022888183594, + "logps/rejected": -524.3590087890625, + "loss": 0.0976, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.988725662231445, + "rewards/margins": 7.9680657386779785, + "rewards/rejected": -2.979340076446533, + "step": 1360 + }, + { + "epoch": 0.47, + "learning_rate": 2.969399319984888e-07, + "logits/chosen": -1.4828989505767822, + "logits/rejected": -1.1986699104309082, + "logps/chosen": -329.68743896484375, + "logps/rejected": -764.1326293945312, + "loss": 0.06, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.3440890312194824, + "rewards/margins": 6.329494476318359, + "rewards/rejected": -2.985405445098877, + "step": 1370 + }, + { + "epoch": 0.47, + "learning_rate": 2.950510011333585e-07, + "logits/chosen": -1.5152653455734253, + "logits/rejected": -1.1745421886444092, + "logps/chosen": -352.6541748046875, + "logps/rejected": -632.8907470703125, + "loss": 0.0712, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.7109479904174805, + "rewards/margins": 7.576220512390137, + "rewards/rejected": -2.8652729988098145, + "step": 1380 + }, + { + "epoch": 0.47, + "learning_rate": 2.9316207026822813e-07, + "logits/chosen": -1.523559808731079, + "logits/rejected": -1.219855546951294, + "logps/chosen": -350.2402648925781, + "logps/rejected": -644.3220825195312, + "loss": 0.0555, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.640657901763916, + "rewards/margins": 7.175803184509277, + "rewards/rejected": -2.5351455211639404, + "step": 1390 + }, + { + "epoch": 0.48, + "learning_rate": 2.9127313940309784e-07, + "logits/chosen": -1.5056852102279663, + "logits/rejected": -1.171008586883545, + "logps/chosen": -290.78802490234375, + "logps/rejected": -539.4659423828125, + "loss": 0.0537, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.571245193481445, + "rewards/margins": 8.044679641723633, + "rewards/rejected": -3.4734344482421875, + "step": 1400 + }, + { + "epoch": 0.48, + "eval_logits/chosen": -1.5018603801727295, + "eval_logits/rejected": -1.1789315938949585, + "eval_logps/chosen": -376.07598876953125, + "eval_logps/rejected": -589.1927490234375, + "eval_loss": 0.06479610502719879, + "eval_rewards/accuracies": 0.9755892157554626, + "eval_rewards/chosen": 4.384680271148682, + "eval_rewards/margins": 7.3523173332214355, + "eval_rewards/rejected": -2.967637062072754, + "eval_runtime": 560.4699, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 0.53, + "step": 1400 + }, + { + "epoch": 0.48, + "learning_rate": 2.8938420853796754e-07, + "logits/chosen": -1.5160057544708252, + "logits/rejected": -1.1476496458053589, + "logps/chosen": -411.9048767089844, + "logps/rejected": -589.949951171875, + "loss": 0.0359, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.407751560211182, + "rewards/margins": 7.413491725921631, + "rewards/rejected": -3.0057406425476074, + "step": 1410 + }, + { + "epoch": 0.48, + "learning_rate": 2.8749527767283715e-07, + "logits/chosen": -1.4947352409362793, + "logits/rejected": -1.2628874778747559, + "logps/chosen": -390.4944152832031, + "logps/rejected": -644.2883911132812, + "loss": 0.0619, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.163218021392822, + "rewards/margins": 7.030184268951416, + "rewards/rejected": -2.866966485977173, + "step": 1420 + }, + { + "epoch": 0.49, + "learning_rate": 2.8560634680770686e-07, + "logits/chosen": -1.4881634712219238, + "logits/rejected": -1.2398184537887573, + "logps/chosen": -369.6822814941406, + "logps/rejected": -579.9475708007812, + "loss": 0.0704, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.736443996429443, + "rewards/margins": 7.590858459472656, + "rewards/rejected": -2.854414463043213, + "step": 1430 + }, + { + "epoch": 0.49, + "learning_rate": 2.8371741594257646e-07, + "logits/chosen": -1.4975007772445679, + "logits/rejected": -1.2246668338775635, + "logps/chosen": -476.669677734375, + "logps/rejected": -479.30108642578125, + "loss": 0.0951, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.004213809967041, + "rewards/margins": 6.466128349304199, + "rewards/rejected": -2.4619147777557373, + "step": 1440 + }, + { + "epoch": 0.49, + "learning_rate": 2.8182848507744617e-07, + "logits/chosen": -1.516287088394165, + "logits/rejected": -1.202371597290039, + "logps/chosen": -321.565673828125, + "logps/rejected": -408.3916015625, + "loss": 0.0573, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.261721611022949, + "rewards/margins": 7.2457404136657715, + "rewards/rejected": -2.9840192794799805, + "step": 1450 + }, + { + "epoch": 0.5, + "learning_rate": 2.799395542123158e-07, + "logits/chosen": -1.4922573566436768, + "logits/rejected": -1.2690476179122925, + "logps/chosen": -453.80413818359375, + "logps/rejected": -637.1600952148438, + "loss": 0.0575, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.037074089050293, + "rewards/margins": 6.629528045654297, + "rewards/rejected": -2.592454433441162, + "step": 1460 + }, + { + "epoch": 0.5, + "learning_rate": 2.780506233471855e-07, + "logits/chosen": -1.5398370027542114, + "logits/rejected": -1.190582513809204, + "logps/chosen": -303.58465576171875, + "logps/rejected": -457.27142333984375, + "loss": 0.0475, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.979724884033203, + "rewards/margins": 6.8525190353393555, + "rewards/rejected": -2.8727943897247314, + "step": 1470 + }, + { + "epoch": 0.5, + "learning_rate": 2.7616169248205513e-07, + "logits/chosen": -1.480398178100586, + "logits/rejected": -1.1202542781829834, + "logps/chosen": -291.8365478515625, + "logps/rejected": -525.9415893554688, + "loss": 0.0765, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.7548556327819824, + "rewards/margins": 6.797545433044434, + "rewards/rejected": -3.042689800262451, + "step": 1480 + }, + { + "epoch": 0.51, + "learning_rate": 2.742727616169248e-07, + "logits/chosen": -1.5058258771896362, + "logits/rejected": -1.2445354461669922, + "logps/chosen": -349.40081787109375, + "logps/rejected": -353.338623046875, + "loss": 0.0673, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.481741428375244, + "rewards/margins": 7.538400173187256, + "rewards/rejected": -3.056657552719116, + "step": 1490 + }, + { + "epoch": 0.51, + "learning_rate": 2.723838307517945e-07, + "logits/chosen": -1.5099804401397705, + "logits/rejected": -1.227176308631897, + "logps/chosen": -449.3966369628906, + "logps/rejected": -564.8681030273438, + "loss": 0.0483, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.375594615936279, + "rewards/margins": 7.313169956207275, + "rewards/rejected": -2.937574863433838, + "step": 1500 + }, + { + "epoch": 0.51, + "eval_logits/chosen": -1.5114119052886963, + "eval_logits/rejected": -1.1923363208770752, + "eval_logps/chosen": -376.16131591796875, + "eval_logps/rejected": -591.8114013671875, + "eval_loss": 0.060400474816560745, + "eval_rewards/accuracies": 0.9797979593276978, + "eval_rewards/chosen": 4.3761420249938965, + "eval_rewards/margins": 7.605640411376953, + "eval_rewards/rejected": -3.2294986248016357, + "eval_runtime": 560.6153, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 0.53, + "step": 1500 + }, + { + "epoch": 0.51, + "learning_rate": 2.7049489988666416e-07, + "logits/chosen": -1.5114130973815918, + "logits/rejected": -1.2126189470291138, + "logps/chosen": -430.208984375, + "logps/rejected": -501.6602478027344, + "loss": 0.075, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.029915809631348, + "rewards/margins": 7.003431797027588, + "rewards/rejected": -2.973515748977661, + "step": 1510 + }, + { + "epoch": 0.52, + "learning_rate": 2.686059690215338e-07, + "logits/chosen": -1.4975926876068115, + "logits/rejected": -1.231730580329895, + "logps/chosen": -384.80133056640625, + "logps/rejected": -646.3175048828125, + "loss": 0.0883, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.063393592834473, + "rewards/margins": 7.050488471984863, + "rewards/rejected": -2.9870944023132324, + "step": 1520 + }, + { + "epoch": 0.52, + "learning_rate": 2.6671703815640347e-07, + "logits/chosen": -1.5093035697937012, + "logits/rejected": -1.1693612337112427, + "logps/chosen": -357.9332580566406, + "logps/rejected": -419.8724060058594, + "loss": 0.0435, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.968228816986084, + "rewards/margins": 7.172101020812988, + "rewards/rejected": -3.2038722038269043, + "step": 1530 + }, + { + "epoch": 0.52, + "learning_rate": 2.648281072912731e-07, + "logits/chosen": -1.5199733972549438, + "logits/rejected": -1.2490711212158203, + "logps/chosen": -319.37335205078125, + "logps/rejected": -703.9043579101562, + "loss": 0.0709, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.484343528747559, + "rewards/margins": 6.947661399841309, + "rewards/rejected": -2.46331787109375, + "step": 1540 + }, + { + "epoch": 0.53, + "learning_rate": 2.629391764261428e-07, + "logits/chosen": -1.4934265613555908, + "logits/rejected": -1.2080678939819336, + "logps/chosen": -397.90325927734375, + "logps/rejected": -477.54681396484375, + "loss": 0.069, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.9346039295196533, + "rewards/margins": 7.143618583679199, + "rewards/rejected": -3.2090160846710205, + "step": 1550 + }, + { + "epoch": 0.53, + "learning_rate": 2.610502455610125e-07, + "logits/chosen": -1.4791805744171143, + "logits/rejected": -1.201578140258789, + "logps/chosen": -478.21563720703125, + "logps/rejected": -653.1609497070312, + "loss": 0.0623, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.886442184448242, + "rewards/margins": 6.926022529602051, + "rewards/rejected": -3.0395796298980713, + "step": 1560 + }, + { + "epoch": 0.53, + "learning_rate": 2.591613146958821e-07, + "logits/chosen": -1.5138906240463257, + "logits/rejected": -1.243032455444336, + "logps/chosen": -350.7288818359375, + "logps/rejected": -627.7431640625, + "loss": 0.0433, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.634993076324463, + "rewards/margins": 7.957524299621582, + "rewards/rejected": -3.3225319385528564, + "step": 1570 + }, + { + "epoch": 0.54, + "learning_rate": 2.572723838307518e-07, + "logits/chosen": -1.5238749980926514, + "logits/rejected": -1.1785722970962524, + "logps/chosen": -377.65045166015625, + "logps/rejected": -531.2706298828125, + "loss": 0.0716, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.06870698928833, + "rewards/margins": 7.24503231048584, + "rewards/rejected": -3.176325798034668, + "step": 1580 + }, + { + "epoch": 0.54, + "learning_rate": 2.5538345296562145e-07, + "logits/chosen": -1.5166254043579102, + "logits/rejected": -1.1256914138793945, + "logps/chosen": -375.782470703125, + "logps/rejected": -465.26287841796875, + "loss": 0.0531, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.288039684295654, + "rewards/margins": 7.8462958335876465, + "rewards/rejected": -3.5582566261291504, + "step": 1590 + }, + { + "epoch": 0.54, + "learning_rate": 2.534945221004911e-07, + "logits/chosen": -1.4953352212905884, + "logits/rejected": -1.2030283212661743, + "logps/chosen": -529.4041748046875, + "logps/rejected": -491.87384033203125, + "loss": 0.0572, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.736111164093018, + "rewards/margins": 7.682862281799316, + "rewards/rejected": -2.946751356124878, + "step": 1600 + }, + { + "epoch": 0.54, + "eval_logits/chosen": -1.504213571548462, + "eval_logits/rejected": -1.185482144355774, + "eval_logps/chosen": -376.66448974609375, + "eval_logps/rejected": -592.157470703125, + "eval_loss": 0.05805225297808647, + "eval_rewards/accuracies": 0.9772727489471436, + "eval_rewards/chosen": 4.325828552246094, + "eval_rewards/margins": 7.589939117431641, + "eval_rewards/rejected": -3.2641103267669678, + "eval_runtime": 560.9875, + "eval_samples_per_second": 16.934, + "eval_steps_per_second": 0.529, + "step": 1600 + }, + { + "epoch": 0.55, + "learning_rate": 2.516055912353608e-07, + "logits/chosen": -1.5023882389068604, + "logits/rejected": -1.2200844287872314, + "logps/chosen": -385.99200439453125, + "logps/rejected": -679.5035400390625, + "loss": 0.0551, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.191043376922607, + "rewards/margins": 7.510348320007324, + "rewards/rejected": -3.3193047046661377, + "step": 1610 + }, + { + "epoch": 0.55, + "learning_rate": 2.497166603702304e-07, + "logits/chosen": -1.5069820880889893, + "logits/rejected": -1.2421448230743408, + "logps/chosen": -367.49481201171875, + "logps/rejected": -656.8528442382812, + "loss": 0.0836, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.26437520980835, + "rewards/margins": 7.1600213050842285, + "rewards/rejected": -2.895646572113037, + "step": 1620 + }, + { + "epoch": 0.55, + "learning_rate": 2.4782772950510013e-07, + "logits/chosen": -1.516225814819336, + "logits/rejected": -1.1817419528961182, + "logps/chosen": -397.768798828125, + "logps/rejected": -495.23443603515625, + "loss": 0.0624, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.2399187088012695, + "rewards/margins": 7.541648864746094, + "rewards/rejected": -3.3017311096191406, + "step": 1630 + }, + { + "epoch": 0.56, + "learning_rate": 2.459387986399698e-07, + "logits/chosen": -1.517730474472046, + "logits/rejected": -1.2045361995697021, + "logps/chosen": -356.7253112792969, + "logps/rejected": -508.641357421875, + "loss": 0.0592, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.046034812927246, + "rewards/margins": 7.333725929260254, + "rewards/rejected": -3.2876906394958496, + "step": 1640 + }, + { + "epoch": 0.56, + "learning_rate": 2.4404986777483944e-07, + "logits/chosen": -1.497016191482544, + "logits/rejected": -1.227853536605835, + "logps/chosen": -446.447265625, + "logps/rejected": -458.6922912597656, + "loss": 0.0625, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.6903839111328125, + "rewards/margins": 6.824693202972412, + "rewards/rejected": -3.1343090534210205, + "step": 1650 + }, + { + "epoch": 0.56, + "learning_rate": 2.421609369097091e-07, + "logits/chosen": -1.514736533164978, + "logits/rejected": -1.1953274011611938, + "logps/chosen": -395.9438781738281, + "logps/rejected": -519.7674560546875, + "loss": 0.0504, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.116658687591553, + "rewards/margins": 7.169915199279785, + "rewards/rejected": -3.0532562732696533, + "step": 1660 + }, + { + "epoch": 0.57, + "learning_rate": 2.4027200604457875e-07, + "logits/chosen": -1.5058282613754272, + "logits/rejected": -1.2581437826156616, + "logps/chosen": -366.08233642578125, + "logps/rejected": -546.3473510742188, + "loss": 0.0501, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.698902130126953, + "rewards/margins": 7.672143459320068, + "rewards/rejected": -2.973240852355957, + "step": 1670 + }, + { + "epoch": 0.57, + "learning_rate": 2.383830751794484e-07, + "logits/chosen": -1.4994776248931885, + "logits/rejected": -1.184206485748291, + "logps/chosen": -481.41680908203125, + "logps/rejected": -767.9908447265625, + "loss": 0.0468, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.650491714477539, + "rewards/margins": 8.127163887023926, + "rewards/rejected": -3.4766716957092285, + "step": 1680 + }, + { + "epoch": 0.57, + "learning_rate": 2.364941443143181e-07, + "logits/chosen": -1.531582236289978, + "logits/rejected": -1.2031329870224, + "logps/chosen": -321.4193420410156, + "logps/rejected": -499.52520751953125, + "loss": 0.0414, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.110111236572266, + "rewards/margins": 7.465426445007324, + "rewards/rejected": -3.355315685272217, + "step": 1690 + }, + { + "epoch": 0.58, + "learning_rate": 2.3460521344918775e-07, + "logits/chosen": -1.5217511653900146, + "logits/rejected": -1.2462084293365479, + "logps/chosen": -344.8479309082031, + "logps/rejected": -528.7171020507812, + "loss": 0.066, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.58670711517334, + "rewards/margins": 7.7972092628479, + "rewards/rejected": -3.2105019092559814, + "step": 1700 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -1.5110249519348145, + "eval_logits/rejected": -1.1886183023452759, + "eval_logps/chosen": -376.65234375, + "eval_logps/rejected": -593.3289184570312, + "eval_loss": 0.05385367199778557, + "eval_rewards/accuracies": 0.9814814925193787, + "eval_rewards/chosen": 4.327041149139404, + "eval_rewards/margins": 7.708298683166504, + "eval_rewards/rejected": -3.3812568187713623, + "eval_runtime": 560.6648, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 0.53, + "step": 1700 + }, + { + "epoch": 0.58, + "learning_rate": 2.327162825840574e-07, + "logits/chosen": -1.5017929077148438, + "logits/rejected": -1.2263991832733154, + "logps/chosen": -440.38330078125, + "logps/rejected": -768.6172485351562, + "loss": 0.0801, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.483176231384277, + "rewards/margins": 7.676694393157959, + "rewards/rejected": -3.1935179233551025, + "step": 1710 + }, + { + "epoch": 0.58, + "learning_rate": 2.3082735171892708e-07, + "logits/chosen": -1.4979521036148071, + "logits/rejected": -1.2336069345474243, + "logps/chosen": -368.3187255859375, + "logps/rejected": -962.7932739257812, + "loss": 0.0557, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.179459095001221, + "rewards/margins": 7.952836036682129, + "rewards/rejected": -3.7733776569366455, + "step": 1720 + }, + { + "epoch": 0.59, + "learning_rate": 2.2893842085379674e-07, + "logits/chosen": -1.527874231338501, + "logits/rejected": -1.1270580291748047, + "logps/chosen": -333.58258056640625, + "logps/rejected": -464.1814880371094, + "loss": 0.0509, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.677160263061523, + "rewards/margins": 8.32009220123291, + "rewards/rejected": -3.642932176589966, + "step": 1730 + }, + { + "epoch": 0.59, + "learning_rate": 2.2704948998866642e-07, + "logits/chosen": -1.4980236291885376, + "logits/rejected": -1.2070553302764893, + "logps/chosen": -518.45703125, + "logps/rejected": -483.01422119140625, + "loss": 0.0512, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.008620262145996, + "rewards/margins": 7.4014739990234375, + "rewards/rejected": -3.392852783203125, + "step": 1740 + }, + { + "epoch": 0.59, + "learning_rate": 2.2516055912353608e-07, + "logits/chosen": -1.5251991748809814, + "logits/rejected": -1.212727665901184, + "logps/chosen": -445.04931640625, + "logps/rejected": -477.67364501953125, + "loss": 0.0725, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.699405193328857, + "rewards/margins": 8.411505699157715, + "rewards/rejected": -3.7121009826660156, + "step": 1750 + }, + { + "epoch": 0.6, + "learning_rate": 2.2327162825840573e-07, + "logits/chosen": -1.5143253803253174, + "logits/rejected": -1.2615511417388916, + "logps/chosen": -432.19561767578125, + "logps/rejected": -536.9600219726562, + "loss": 0.058, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.530027866363525, + "rewards/margins": 7.708025932312012, + "rewards/rejected": -3.1779980659484863, + "step": 1760 + }, + { + "epoch": 0.6, + "learning_rate": 2.213826973932754e-07, + "logits/chosen": -1.5171505212783813, + "logits/rejected": -1.230185866355896, + "logps/chosen": -315.69781494140625, + "logps/rejected": -726.6375122070312, + "loss": 0.0496, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.840248107910156, + "rewards/margins": 8.333008766174316, + "rewards/rejected": -3.4927608966827393, + "step": 1770 + }, + { + "epoch": 0.61, + "learning_rate": 2.1949376652814505e-07, + "logits/chosen": -1.4918253421783447, + "logits/rejected": -1.2145707607269287, + "logps/chosen": -400.26776123046875, + "logps/rejected": -509.4452209472656, + "loss": 0.071, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.016195774078369, + "rewards/margins": 7.594870567321777, + "rewards/rejected": -3.5786757469177246, + "step": 1780 + }, + { + "epoch": 0.61, + "learning_rate": 2.176048356630147e-07, + "logits/chosen": -1.5160037279129028, + "logits/rejected": -1.2147983312606812, + "logps/chosen": -521.50537109375, + "logps/rejected": -494.4219665527344, + "loss": 0.0611, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.44297981262207, + "rewards/margins": 7.988565921783447, + "rewards/rejected": -3.5455868244171143, + "step": 1790 + }, + { + "epoch": 0.61, + "learning_rate": 2.157159047978844e-07, + "logits/chosen": -1.5425684452056885, + "logits/rejected": -1.1855405569076538, + "logps/chosen": -327.6552429199219, + "logps/rejected": -658.273193359375, + "loss": 0.0561, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.459707736968994, + "rewards/margins": 7.496172904968262, + "rewards/rejected": -3.0364651679992676, + "step": 1800 + }, + { + "epoch": 0.61, + "eval_logits/chosen": -1.5143883228302002, + "eval_logits/rejected": -1.194756269454956, + "eval_logps/chosen": -376.0636291503906, + "eval_logps/rejected": -593.4963989257812, + "eval_loss": 0.05014927685260773, + "eval_rewards/accuracies": 0.9797979593276978, + "eval_rewards/chosen": 4.385910511016846, + "eval_rewards/margins": 7.783912658691406, + "eval_rewards/rejected": -3.3980023860931396, + "eval_runtime": 560.6319, + "eval_samples_per_second": 16.945, + "eval_steps_per_second": 0.53, + "step": 1800 + }, + { + "epoch": 0.62, + "learning_rate": 2.1382697393275407e-07, + "logits/chosen": -1.526610016822815, + "logits/rejected": -1.1567124128341675, + "logps/chosen": -454.001220703125, + "logps/rejected": -625.8849487304688, + "loss": 0.0381, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.282632827758789, + "rewards/margins": 8.028142929077148, + "rewards/rejected": -3.7455101013183594, + "step": 1810 + }, + { + "epoch": 0.62, + "learning_rate": 2.1193804306762372e-07, + "logits/chosen": -1.514725923538208, + "logits/rejected": -1.2673990726470947, + "logps/chosen": -387.0127258300781, + "logps/rejected": -848.1672973632812, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.279064655303955, + "rewards/margins": 7.790387153625488, + "rewards/rejected": -3.5113232135772705, + "step": 1820 + }, + { + "epoch": 0.62, + "learning_rate": 2.1004911220249338e-07, + "logits/chosen": -1.5263025760650635, + "logits/rejected": -1.2004420757293701, + "logps/chosen": -390.23028564453125, + "logps/rejected": -547.3260498046875, + "loss": 0.0551, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.023037433624268, + "rewards/margins": 6.7541937828063965, + "rewards/rejected": -2.731156349182129, + "step": 1830 + }, + { + "epoch": 0.63, + "learning_rate": 2.0816018133736303e-07, + "logits/chosen": -1.5079574584960938, + "logits/rejected": -1.1305427551269531, + "logps/chosen": -463.9093322753906, + "logps/rejected": -681.10205078125, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.483296871185303, + "rewards/margins": 8.173705101013184, + "rewards/rejected": -3.690408229827881, + "step": 1840 + }, + { + "epoch": 0.63, + "learning_rate": 2.0627125047223271e-07, + "logits/chosen": -1.5206629037857056, + "logits/rejected": -1.2181518077850342, + "logps/chosen": -397.5008850097656, + "logps/rejected": -562.5797729492188, + "loss": 0.0519, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.292169094085693, + "rewards/margins": 7.382157325744629, + "rewards/rejected": -3.089987277984619, + "step": 1850 + }, + { + "epoch": 0.63, + "learning_rate": 2.0438231960710237e-07, + "logits/chosen": -1.5032273530960083, + "logits/rejected": -1.276940107345581, + "logps/chosen": -366.43060302734375, + "logps/rejected": -677.36962890625, + "loss": 0.062, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.3607306480407715, + "rewards/margins": 7.476487636566162, + "rewards/rejected": -3.115757465362549, + "step": 1860 + }, + { + "epoch": 0.64, + "learning_rate": 2.0249338874197203e-07, + "logits/chosen": -1.5287476778030396, + "logits/rejected": -1.2484480142593384, + "logps/chosen": -378.16302490234375, + "logps/rejected": -514.4780883789062, + "loss": 0.0657, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.000302314758301, + "rewards/margins": 7.317461967468262, + "rewards/rejected": -3.317160129547119, + "step": 1870 + }, + { + "epoch": 0.64, + "learning_rate": 2.0060445787684168e-07, + "logits/chosen": -1.5523929595947266, + "logits/rejected": -1.2034788131713867, + "logps/chosen": -319.57110595703125, + "logps/rejected": -540.62158203125, + "loss": 0.0468, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.091488361358643, + "rewards/margins": 7.53751277923584, + "rewards/rejected": -3.4460244178771973, + "step": 1880 + }, + { + "epoch": 0.64, + "learning_rate": 1.9871552701171136e-07, + "logits/chosen": -1.5344616174697876, + "logits/rejected": -1.2502692937850952, + "logps/chosen": -417.1963806152344, + "logps/rejected": -557.7677001953125, + "loss": 0.0625, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.207846641540527, + "rewards/margins": 7.392093658447266, + "rewards/rejected": -3.184246778488159, + "step": 1890 + }, + { + "epoch": 0.65, + "learning_rate": 1.9682659614658105e-07, + "logits/chosen": -1.5210683345794678, + "logits/rejected": -1.2344892024993896, + "logps/chosen": -468.1206970214844, + "logps/rejected": -585.029541015625, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.628952980041504, + "rewards/margins": 7.9791693687438965, + "rewards/rejected": -3.35021710395813, + "step": 1900 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -1.5147186517715454, + "eval_logits/rejected": -1.203603982925415, + "eval_logps/chosen": -375.7136535644531, + "eval_logps/rejected": -593.9944458007812, + "eval_loss": 0.050368715077638626, + "eval_rewards/accuracies": 0.9814814925193787, + "eval_rewards/chosen": 4.420912742614746, + "eval_rewards/margins": 7.868711471557617, + "eval_rewards/rejected": -3.447798728942871, + "eval_runtime": 559.9302, + "eval_samples_per_second": 16.966, + "eval_steps_per_second": 0.53, + "step": 1900 + }, + { + "epoch": 0.65, + "learning_rate": 1.949376652814507e-07, + "logits/chosen": -1.5237205028533936, + "logits/rejected": -1.2196900844573975, + "logps/chosen": -353.64813232421875, + "logps/rejected": -567.98486328125, + "loss": 0.0444, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.203667640686035, + "rewards/margins": 7.699929237365723, + "rewards/rejected": -3.4962615966796875, + "step": 1910 + }, + { + "epoch": 0.65, + "learning_rate": 1.9304873441632036e-07, + "logits/chosen": -1.5383799076080322, + "logits/rejected": -1.1897004842758179, + "logps/chosen": -304.3857116699219, + "logps/rejected": -420.22833251953125, + "loss": 0.0371, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.345728874206543, + "rewards/margins": 7.751856327056885, + "rewards/rejected": -3.4061267375946045, + "step": 1920 + }, + { + "epoch": 0.66, + "learning_rate": 1.9115980355119001e-07, + "logits/chosen": -1.527374029159546, + "logits/rejected": -1.2602983713150024, + "logps/chosen": -329.572998046875, + "logps/rejected": -735.498046875, + "loss": 0.0391, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.703977108001709, + "rewards/margins": 7.709604740142822, + "rewards/rejected": -3.005627393722534, + "step": 1930 + }, + { + "epoch": 0.66, + "learning_rate": 1.8927087268605967e-07, + "logits/chosen": -1.5346765518188477, + "logits/rejected": -1.2059959173202515, + "logps/chosen": -328.3225402832031, + "logps/rejected": -767.6409912109375, + "loss": 0.0463, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.100872993469238, + "rewards/margins": 8.146936416625977, + "rewards/rejected": -4.04606294631958, + "step": 1940 + }, + { + "epoch": 0.66, + "learning_rate": 1.8738194182092935e-07, + "logits/chosen": -1.494866132736206, + "logits/rejected": -1.2239644527435303, + "logps/chosen": -555.472900390625, + "logps/rejected": -520.2547607421875, + "loss": 0.0514, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.442248344421387, + "rewards/margins": 7.3626532554626465, + "rewards/rejected": -2.920405626296997, + "step": 1950 + }, + { + "epoch": 0.67, + "learning_rate": 1.85493010955799e-07, + "logits/chosen": -1.519852876663208, + "logits/rejected": -1.212501883506775, + "logps/chosen": -389.075927734375, + "logps/rejected": -503.87640380859375, + "loss": 0.0575, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.409701347351074, + "rewards/margins": 8.056486129760742, + "rewards/rejected": -3.6467843055725098, + "step": 1960 + }, + { + "epoch": 0.67, + "learning_rate": 1.8360408009066866e-07, + "logits/chosen": -1.5152437686920166, + "logits/rejected": -1.2427327632904053, + "logps/chosen": -401.62127685546875, + "logps/rejected": -749.9725341796875, + "loss": 0.054, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.197791576385498, + "rewards/margins": 7.349157810211182, + "rewards/rejected": -3.1513662338256836, + "step": 1970 + }, + { + "epoch": 0.67, + "learning_rate": 1.8171514922553835e-07, + "logits/chosen": -1.531203269958496, + "logits/rejected": -1.2765506505966187, + "logps/chosen": -406.9158630371094, + "logps/rejected": -546.1353759765625, + "loss": 0.0515, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.106224536895752, + "rewards/margins": 7.5454840660095215, + "rewards/rejected": -3.439260482788086, + "step": 1980 + }, + { + "epoch": 0.68, + "learning_rate": 1.79826218360408e-07, + "logits/chosen": -1.4749855995178223, + "logits/rejected": -1.2416235208511353, + "logps/chosen": -434.5870056152344, + "logps/rejected": -412.8590393066406, + "loss": 0.0475, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.6538729667663574, + "rewards/margins": 7.308139801025391, + "rewards/rejected": -3.654266357421875, + "step": 1990 + }, + { + "epoch": 0.68, + "learning_rate": 1.7793728749527768e-07, + "logits/chosen": -1.5064570903778076, + "logits/rejected": -1.2394144535064697, + "logps/chosen": -378.8180236816406, + "logps/rejected": -484.3456115722656, + "loss": 0.0493, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.8156559467315674, + "rewards/margins": 7.063841342926025, + "rewards/rejected": -3.2481846809387207, + "step": 2000 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -1.5070686340332031, + "eval_logits/rejected": -1.1925033330917358, + "eval_logps/chosen": -376.0872802734375, + "eval_logps/rejected": -595.3203125, + "eval_loss": 0.04720592126250267, + "eval_rewards/accuracies": 0.9831649661064148, + "eval_rewards/chosen": 4.383547306060791, + "eval_rewards/margins": 7.9639458656311035, + "eval_rewards/rejected": -3.5803987979888916, + "eval_runtime": 558.9461, + "eval_samples_per_second": 16.996, + "eval_steps_per_second": 0.531, + "step": 2000 + }, + { + "epoch": 0.68, + "learning_rate": 1.7604835663014734e-07, + "logits/chosen": -1.5241343975067139, + "logits/rejected": -1.2008111476898193, + "logps/chosen": -319.7298278808594, + "logps/rejected": -589.1061401367188, + "loss": 0.0396, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.302298545837402, + "rewards/margins": 7.667372703552246, + "rewards/rejected": -3.3650736808776855, + "step": 2010 + }, + { + "epoch": 0.69, + "learning_rate": 1.74159425765017e-07, + "logits/chosen": -1.5107629299163818, + "logits/rejected": -1.1774795055389404, + "logps/chosen": -325.34588623046875, + "logps/rejected": -675.2240600585938, + "loss": 0.0605, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.494423866271973, + "rewards/margins": 8.346731185913086, + "rewards/rejected": -3.852306365966797, + "step": 2020 + }, + { + "epoch": 0.69, + "learning_rate": 1.7227049489988665e-07, + "logits/chosen": -1.4806641340255737, + "logits/rejected": -1.22501540184021, + "logps/chosen": -429.6233825683594, + "logps/rejected": -475.925537109375, + "loss": 0.0373, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.175747871398926, + "rewards/margins": 7.295570373535156, + "rewards/rejected": -3.1198229789733887, + "step": 2030 + }, + { + "epoch": 0.69, + "learning_rate": 1.703815640347563e-07, + "logits/chosen": -1.5204100608825684, + "logits/rejected": -1.2161033153533936, + "logps/chosen": -313.9083557128906, + "logps/rejected": -539.1317749023438, + "loss": 0.0393, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.7474541664123535, + "rewards/margins": 8.653862953186035, + "rewards/rejected": -3.9064087867736816, + "step": 2040 + }, + { + "epoch": 0.7, + "learning_rate": 1.6849263316962596e-07, + "logits/chosen": -1.5086395740509033, + "logits/rejected": -1.1813517808914185, + "logps/chosen": -395.3387756347656, + "logps/rejected": -496.54052734375, + "loss": 0.0482, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.5052483081817627, + "rewards/margins": 7.271603584289551, + "rewards/rejected": -3.766355514526367, + "step": 2050 + }, + { + "epoch": 0.7, + "learning_rate": 1.6660370230449564e-07, + "logits/chosen": -1.5112879276275635, + "logits/rejected": -1.1831653118133545, + "logps/chosen": -406.0916442871094, + "logps/rejected": -510.435302734375, + "loss": 0.0525, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.234797477722168, + "rewards/margins": 7.962366580963135, + "rewards/rejected": -3.727570056915283, + "step": 2060 + }, + { + "epoch": 0.7, + "learning_rate": 1.6471477143936533e-07, + "logits/chosen": -1.5286850929260254, + "logits/rejected": -1.2524337768554688, + "logps/chosen": -375.4138488769531, + "logps/rejected": -542.0020751953125, + "loss": 0.0402, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.921536922454834, + "rewards/margins": 8.480701446533203, + "rewards/rejected": -3.559164047241211, + "step": 2070 + }, + { + "epoch": 0.71, + "learning_rate": 1.6282584057423498e-07, + "logits/chosen": -1.5124969482421875, + "logits/rejected": -1.151609182357788, + "logps/chosen": -297.8030700683594, + "logps/rejected": -420.9918518066406, + "loss": 0.0415, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.653942584991455, + "rewards/margins": 8.311192512512207, + "rewards/rejected": -3.657250165939331, + "step": 2080 + }, + { + "epoch": 0.71, + "learning_rate": 1.6093690970910464e-07, + "logits/chosen": -1.5191973447799683, + "logits/rejected": -1.2087528705596924, + "logps/chosen": -406.9920654296875, + "logps/rejected": -566.786376953125, + "loss": 0.0491, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.626988887786865, + "rewards/margins": 8.500511169433594, + "rewards/rejected": -3.873521327972412, + "step": 2090 + }, + { + "epoch": 0.71, + "learning_rate": 1.590479788439743e-07, + "logits/chosen": -1.5033385753631592, + "logits/rejected": -1.2532203197479248, + "logps/chosen": -429.1227111816406, + "logps/rejected": -824.8059692382812, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9824090003967285, + "rewards/margins": 7.593686103820801, + "rewards/rejected": -3.611276149749756, + "step": 2100 + }, + { + "epoch": 0.71, + "eval_logits/chosen": -1.5165996551513672, + "eval_logits/rejected": -1.2020140886306763, + "eval_logps/chosen": -376.9510498046875, + "eval_logps/rejected": -597.5147094726562, + "eval_loss": 0.044869087636470795, + "eval_rewards/accuracies": 0.9840067625045776, + "eval_rewards/chosen": 4.297166347503662, + "eval_rewards/margins": 8.097002983093262, + "eval_rewards/rejected": -3.7998366355895996, + "eval_runtime": 560.374, + "eval_samples_per_second": 16.953, + "eval_steps_per_second": 0.53, + "step": 2100 + }, + { + "epoch": 0.72, + "learning_rate": 1.5715904797884398e-07, + "logits/chosen": -1.5076260566711426, + "logits/rejected": -1.236230492591858, + "logps/chosen": -348.450439453125, + "logps/rejected": -499.9619140625, + "loss": 0.0848, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.0965800285339355, + "rewards/margins": 7.605328559875488, + "rewards/rejected": -3.5087478160858154, + "step": 2110 + }, + { + "epoch": 0.72, + "learning_rate": 1.5527011711371363e-07, + "logits/chosen": -1.529317855834961, + "logits/rejected": -1.2510004043579102, + "logps/chosen": -339.26165771484375, + "logps/rejected": -693.9097290039062, + "loss": 0.0382, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.7485551834106445, + "rewards/margins": 8.082775115966797, + "rewards/rejected": -3.3342204093933105, + "step": 2120 + }, + { + "epoch": 0.72, + "learning_rate": 1.533811862485833e-07, + "logits/chosen": -1.5148240327835083, + "logits/rejected": -1.199103593826294, + "logps/chosen": -385.8512878417969, + "logps/rejected": -639.3033447265625, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.181856155395508, + "rewards/margins": 7.878443717956543, + "rewards/rejected": -3.6965866088867188, + "step": 2130 + }, + { + "epoch": 0.73, + "learning_rate": 1.5149225538345294e-07, + "logits/chosen": -1.5198280811309814, + "logits/rejected": -1.1935245990753174, + "logps/chosen": -348.9425964355469, + "logps/rejected": -459.17041015625, + "loss": 0.0394, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.525935173034668, + "rewards/margins": 7.647967338562012, + "rewards/rejected": -3.122032403945923, + "step": 2140 + }, + { + "epoch": 0.73, + "learning_rate": 1.496033245183226e-07, + "logits/chosen": -1.5460079908370972, + "logits/rejected": -1.1968626976013184, + "logps/chosen": -328.58251953125, + "logps/rejected": -633.262451171875, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.540610313415527, + "rewards/margins": 8.390274047851562, + "rewards/rejected": -3.849663496017456, + "step": 2150 + }, + { + "epoch": 0.73, + "learning_rate": 1.477143936531923e-07, + "logits/chosen": -1.5178780555725098, + "logits/rejected": -1.214658498764038, + "logps/chosen": -476.41876220703125, + "logps/rejected": -339.6917419433594, + "loss": 0.0619, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.9794323444366455, + "rewards/margins": 7.556540489196777, + "rewards/rejected": -3.5771079063415527, + "step": 2160 + }, + { + "epoch": 0.74, + "learning_rate": 1.4582546278806196e-07, + "logits/chosen": -1.520281195640564, + "logits/rejected": -1.2420076131820679, + "logps/chosen": -379.0602111816406, + "logps/rejected": -473.5738830566406, + "loss": 0.0404, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.220221042633057, + "rewards/margins": 7.9052581787109375, + "rewards/rejected": -3.685037612915039, + "step": 2170 + }, + { + "epoch": 0.74, + "learning_rate": 1.4393653192293162e-07, + "logits/chosen": -1.5194097757339478, + "logits/rejected": -1.2116343975067139, + "logps/chosen": -409.2084655761719, + "logps/rejected": -558.417236328125, + "loss": 0.0416, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.273309230804443, + "rewards/margins": 7.955672264099121, + "rewards/rejected": -3.6823630332946777, + "step": 2180 + }, + { + "epoch": 0.74, + "learning_rate": 1.4204760105780127e-07, + "logits/chosen": -1.5045769214630127, + "logits/rejected": -1.2302758693695068, + "logps/chosen": -392.59149169921875, + "logps/rejected": -547.0234985351562, + "loss": 0.046, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.600020408630371, + "rewards/margins": 8.4217529296875, + "rewards/rejected": -3.8217320442199707, + "step": 2190 + }, + { + "epoch": 0.75, + "learning_rate": 1.4015867019267093e-07, + "logits/chosen": -1.4690983295440674, + "logits/rejected": -1.182051658630371, + "logps/chosen": -650.6025390625, + "logps/rejected": -511.93035888671875, + "loss": 0.0475, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.5337417125701904, + "rewards/margins": 7.234931945800781, + "rewards/rejected": -3.7011895179748535, + "step": 2200 + }, + { + "epoch": 0.75, + "eval_logits/chosen": -1.517708420753479, + "eval_logits/rejected": -1.1991840600967407, + "eval_logps/chosen": -376.849365234375, + "eval_logps/rejected": -596.0023803710938, + "eval_loss": 0.04422454535961151, + "eval_rewards/accuracies": 0.9840067625045776, + "eval_rewards/chosen": 4.307338714599609, + "eval_rewards/margins": 7.955935478210449, + "eval_rewards/rejected": -3.648597002029419, + "eval_runtime": 561.4605, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 0.529, + "step": 2200 + }, + { + "epoch": 0.75, + "learning_rate": 1.382697393275406e-07, + "logits/chosen": -1.4882314205169678, + "logits/rejected": -1.2919832468032837, + "logps/chosen": -555.08984375, + "logps/rejected": -622.5364379882812, + "loss": 0.0606, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.7566609382629395, + "rewards/margins": 7.16671895980835, + "rewards/rejected": -3.4100584983825684, + "step": 2210 + }, + { + "epoch": 0.75, + "learning_rate": 1.3638080846241027e-07, + "logits/chosen": -1.5286386013031006, + "logits/rejected": -1.2888312339782715, + "logps/chosen": -284.7803039550781, + "logps/rejected": -523.2872314453125, + "loss": 0.0565, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.4519944190979, + "rewards/margins": 7.86702823638916, + "rewards/rejected": -3.4150338172912598, + "step": 2220 + }, + { + "epoch": 0.76, + "learning_rate": 1.3449187759727992e-07, + "logits/chosen": -1.527421236038208, + "logits/rejected": -1.2638188600540161, + "logps/chosen": -299.5697937011719, + "logps/rejected": -468.88641357421875, + "loss": 0.0418, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.214964389801025, + "rewards/margins": 7.762864589691162, + "rewards/rejected": -3.547900676727295, + "step": 2230 + }, + { + "epoch": 0.76, + "learning_rate": 1.3260294673214958e-07, + "logits/chosen": -1.5077273845672607, + "logits/rejected": -1.2305810451507568, + "logps/chosen": -315.0970153808594, + "logps/rejected": -543.9883422851562, + "loss": 0.0428, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.3515095710754395, + "rewards/margins": 8.111806869506836, + "rewards/rejected": -3.760296583175659, + "step": 2240 + }, + { + "epoch": 0.76, + "learning_rate": 1.3071401586701926e-07, + "logits/chosen": -1.5324013233184814, + "logits/rejected": -1.176598310470581, + "logps/chosen": -350.73590087890625, + "logps/rejected": -407.3160705566406, + "loss": 0.0444, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.385611534118652, + "rewards/margins": 8.232316970825195, + "rewards/rejected": -3.8467063903808594, + "step": 2250 + }, + { + "epoch": 0.77, + "learning_rate": 1.2882508500188894e-07, + "logits/chosen": -1.53853440284729, + "logits/rejected": -1.1605119705200195, + "logps/chosen": -322.6458740234375, + "logps/rejected": -577.635009765625, + "loss": 0.0612, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.1477580070495605, + "rewards/margins": 8.158384323120117, + "rewards/rejected": -4.010627746582031, + "step": 2260 + }, + { + "epoch": 0.77, + "learning_rate": 1.269361541367586e-07, + "logits/chosen": -1.4969431161880493, + "logits/rejected": -1.2394483089447021, + "logps/chosen": -429.3595275878906, + "logps/rejected": -534.6132202148438, + "loss": 0.0514, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.626293659210205, + "rewards/margins": 8.510354995727539, + "rewards/rejected": -3.8840622901916504, + "step": 2270 + }, + { + "epoch": 0.77, + "learning_rate": 1.2504722327162826e-07, + "logits/chosen": -1.5438110828399658, + "logits/rejected": -1.3265063762664795, + "logps/chosen": -336.172607421875, + "logps/rejected": -555.7020263671875, + "loss": 0.0643, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.796633243560791, + "rewards/margins": 8.188325881958008, + "rewards/rejected": -3.391692638397217, + "step": 2280 + }, + { + "epoch": 0.78, + "learning_rate": 1.231582924064979e-07, + "logits/chosen": -1.534425973892212, + "logits/rejected": -1.2709665298461914, + "logps/chosen": -331.15771484375, + "logps/rejected": -599.520751953125, + "loss": 0.0743, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.660555362701416, + "rewards/margins": 7.926393985748291, + "rewards/rejected": -3.265838146209717, + "step": 2290 + }, + { + "epoch": 0.78, + "learning_rate": 1.2126936154136757e-07, + "logits/chosen": -1.5267812013626099, + "logits/rejected": -1.2841061353683472, + "logps/chosen": -363.42718505859375, + "logps/rejected": -614.720947265625, + "loss": 0.0407, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.113651752471924, + "rewards/margins": 7.806565284729004, + "rewards/rejected": -3.69291353225708, + "step": 2300 + }, + { + "epoch": 0.78, + "eval_logits/chosen": -1.5241639614105225, + "eval_logits/rejected": -1.207804799079895, + "eval_logps/chosen": -376.9122009277344, + "eval_logps/rejected": -597.497802734375, + "eval_loss": 0.04077613726258278, + "eval_rewards/accuracies": 0.9882155060768127, + "eval_rewards/chosen": 4.301055908203125, + "eval_rewards/margins": 8.099197387695312, + "eval_rewards/rejected": -3.798142194747925, + "eval_runtime": 561.0154, + "eval_samples_per_second": 16.934, + "eval_steps_per_second": 0.529, + "step": 2300 + }, + { + "epoch": 0.79, + "learning_rate": 1.1938043067623725e-07, + "logits/chosen": -1.5032380819320679, + "logits/rejected": -1.2603862285614014, + "logps/chosen": -468.38519287109375, + "logps/rejected": -603.046142578125, + "loss": 0.0583, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.9953246116638184, + "rewards/margins": 7.673506259918213, + "rewards/rejected": -3.6781811714172363, + "step": 2310 + }, + { + "epoch": 0.79, + "learning_rate": 1.1749149981110692e-07, + "logits/chosen": -1.5227621793746948, + "logits/rejected": -1.294641137123108, + "logps/chosen": -368.20526123046875, + "logps/rejected": -515.7191162109375, + "loss": 0.0287, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.12636661529541, + "rewards/margins": 7.043761253356934, + "rewards/rejected": -2.9173948764801025, + "step": 2320 + }, + { + "epoch": 0.79, + "learning_rate": 1.1560256894597657e-07, + "logits/chosen": -1.4953995943069458, + "logits/rejected": -1.1584880352020264, + "logps/chosen": -586.2529296875, + "logps/rejected": -516.81591796875, + "loss": 0.0496, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.4468929767608643, + "rewards/margins": 7.300196647644043, + "rewards/rejected": -3.853304386138916, + "step": 2330 + }, + { + "epoch": 0.8, + "learning_rate": 1.1371363808084623e-07, + "logits/chosen": -1.511156439781189, + "logits/rejected": -1.1865122318267822, + "logps/chosen": -372.8666076660156, + "logps/rejected": -906.1705932617188, + "loss": 0.032, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.395539283752441, + "rewards/margins": 8.54565715789795, + "rewards/rejected": -4.150118827819824, + "step": 2340 + }, + { + "epoch": 0.8, + "learning_rate": 1.118247072157159e-07, + "logits/chosen": -1.5099351406097412, + "logits/rejected": -1.203018307685852, + "logps/chosen": -348.9670715332031, + "logps/rejected": -429.0040588378906, + "loss": 0.0443, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.664444923400879, + "rewards/margins": 8.25013542175293, + "rewards/rejected": -3.5856919288635254, + "step": 2350 + }, + { + "epoch": 0.8, + "learning_rate": 1.0993577635058557e-07, + "logits/chosen": -1.5239416360855103, + "logits/rejected": -1.1614112854003906, + "logps/chosen": -306.3650817871094, + "logps/rejected": -499.98553466796875, + "loss": 0.0506, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.466089725494385, + "rewards/margins": 8.765016555786133, + "rewards/rejected": -4.298927307128906, + "step": 2360 + }, + { + "epoch": 0.81, + "learning_rate": 1.0804684548545522e-07, + "logits/chosen": -1.4973338842391968, + "logits/rejected": -1.2353532314300537, + "logps/chosen": -363.0428771972656, + "logps/rejected": -688.9337158203125, + "loss": 0.0569, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.117824554443359, + "rewards/margins": 7.465353488922119, + "rewards/rejected": -3.3475289344787598, + "step": 2370 + }, + { + "epoch": 0.81, + "learning_rate": 1.0615791462032489e-07, + "logits/chosen": -1.4992105960845947, + "logits/rejected": -1.1850025653839111, + "logps/chosen": -390.51898193359375, + "logps/rejected": -471.59246826171875, + "loss": 0.0359, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.290133953094482, + "rewards/margins": 8.444581985473633, + "rewards/rejected": -4.154448509216309, + "step": 2380 + }, + { + "epoch": 0.81, + "learning_rate": 1.0426898375519455e-07, + "logits/chosen": -1.5164746046066284, + "logits/rejected": -1.2176775932312012, + "logps/chosen": -386.88580322265625, + "logps/rejected": -695.19873046875, + "loss": 0.0521, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.442442893981934, + "rewards/margins": 8.452461242675781, + "rewards/rejected": -4.010018348693848, + "step": 2390 + }, + { + "epoch": 0.82, + "learning_rate": 1.0238005289006423e-07, + "logits/chosen": -1.5462114810943604, + "logits/rejected": -1.2368415594100952, + "logps/chosen": -341.13232421875, + "logps/rejected": -573.03857421875, + "loss": 0.0386, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.52898645401001, + "rewards/margins": 7.922593593597412, + "rewards/rejected": -3.3936073780059814, + "step": 2400 + }, + { + "epoch": 0.82, + "eval_logits/chosen": -1.5132849216461182, + "eval_logits/rejected": -1.2028939723968506, + "eval_logps/chosen": -376.4996337890625, + "eval_logps/rejected": -596.8302001953125, + "eval_loss": 0.03966302424669266, + "eval_rewards/accuracies": 0.9882155060768127, + "eval_rewards/chosen": 4.342313289642334, + "eval_rewards/margins": 8.073698997497559, + "eval_rewards/rejected": -3.731386184692383, + "eval_runtime": 559.9916, + "eval_samples_per_second": 16.965, + "eval_steps_per_second": 0.53, + "step": 2400 + }, + { + "epoch": 0.82, + "learning_rate": 1.0049112202493389e-07, + "logits/chosen": -1.5179309844970703, + "logits/rejected": -1.190763235092163, + "logps/chosen": -344.33892822265625, + "logps/rejected": -483.5990295410156, + "loss": 0.0401, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.271202564239502, + "rewards/margins": 7.67493200302124, + "rewards/rejected": -3.4037303924560547, + "step": 2410 + }, + { + "epoch": 0.82, + "learning_rate": 9.860219115980354e-08, + "logits/chosen": -1.501859426498413, + "logits/rejected": -1.2754067182540894, + "logps/chosen": -424.81298828125, + "logps/rejected": -500.36175537109375, + "loss": 0.0512, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.149248123168945, + "rewards/margins": 7.505955696105957, + "rewards/rejected": -3.3567073345184326, + "step": 2420 + }, + { + "epoch": 0.83, + "learning_rate": 9.671326029467321e-08, + "logits/chosen": -1.5037426948547363, + "logits/rejected": -1.284251093864441, + "logps/chosen": -380.0538024902344, + "logps/rejected": -624.1697387695312, + "loss": 0.054, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.587218284606934, + "rewards/margins": 8.128026008605957, + "rewards/rejected": -3.5408072471618652, + "step": 2430 + }, + { + "epoch": 0.83, + "learning_rate": 9.482432942954287e-08, + "logits/chosen": -1.5006518363952637, + "logits/rejected": -1.248807668685913, + "logps/chosen": -396.10888671875, + "logps/rejected": -717.84033203125, + "loss": 0.0336, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.7980237007141113, + "rewards/margins": 7.6149582862854, + "rewards/rejected": -3.816934108734131, + "step": 2440 + }, + { + "epoch": 0.83, + "learning_rate": 9.293539856441255e-08, + "logits/chosen": -1.5265161991119385, + "logits/rejected": -1.211663007736206, + "logps/chosen": -327.98809814453125, + "logps/rejected": -630.8543701171875, + "loss": 0.0388, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.494044303894043, + "rewards/margins": 7.565443515777588, + "rewards/rejected": -3.0713984966278076, + "step": 2450 + }, + { + "epoch": 0.84, + "learning_rate": 9.10464676992822e-08, + "logits/chosen": -1.5001169443130493, + "logits/rejected": -1.2648394107818604, + "logps/chosen": -304.59185791015625, + "logps/rejected": -596.6958618164062, + "loss": 0.049, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.436002254486084, + "rewards/margins": 7.953620910644531, + "rewards/rejected": -3.517618179321289, + "step": 2460 + }, + { + "epoch": 0.84, + "learning_rate": 8.915753683415186e-08, + "logits/chosen": -1.5231083631515503, + "logits/rejected": -1.221145749092102, + "logps/chosen": -391.0690612792969, + "logps/rejected": -552.14208984375, + "loss": 0.0472, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.607051849365234, + "rewards/margins": 8.291250228881836, + "rewards/rejected": -3.6841976642608643, + "step": 2470 + }, + { + "epoch": 0.84, + "learning_rate": 8.726860596902153e-08, + "logits/chosen": -1.4747313261032104, + "logits/rejected": -1.266494631767273, + "logps/chosen": -392.5005798339844, + "logps/rejected": -545.3479614257812, + "loss": 0.0409, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.316119194030762, + "rewards/margins": 7.8105034828186035, + "rewards/rejected": -3.494384288787842, + "step": 2480 + }, + { + "epoch": 0.85, + "learning_rate": 8.53796751038912e-08, + "logits/chosen": -1.5253236293792725, + "logits/rejected": -1.2925410270690918, + "logps/chosen": -378.5972900390625, + "logps/rejected": -446.5828552246094, + "loss": 0.04, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.2583818435668945, + "rewards/margins": 7.742008209228516, + "rewards/rejected": -3.4836268424987793, + "step": 2490 + }, + { + "epoch": 0.85, + "learning_rate": 8.349074423876085e-08, + "logits/chosen": -1.4970636367797852, + "logits/rejected": -1.173380970954895, + "logps/chosen": -453.74249267578125, + "logps/rejected": -443.0990295410156, + "loss": 0.0504, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.253788948059082, + "rewards/margins": 8.546039581298828, + "rewards/rejected": -4.292250156402588, + "step": 2500 + }, + { + "epoch": 0.85, + "eval_logits/chosen": -1.5187809467315674, + "eval_logits/rejected": -1.2023788690567017, + "eval_logps/chosen": -376.1911926269531, + "eval_logps/rejected": -597.20654296875, + "eval_loss": 0.03895895555615425, + "eval_rewards/accuracies": 0.9856902360916138, + "eval_rewards/chosen": 4.3731584548950195, + "eval_rewards/margins": 8.142176628112793, + "eval_rewards/rejected": -3.7690184116363525, + "eval_runtime": 560.5484, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 0.53, + "step": 2500 + }, + { + "epoch": 0.85, + "learning_rate": 8.160181337363052e-08, + "logits/chosen": -1.5079090595245361, + "logits/rejected": -1.218942403793335, + "logps/chosen": -400.8089904785156, + "logps/rejected": -563.3532104492188, + "loss": 0.0373, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.257572650909424, + "rewards/margins": 7.628092288970947, + "rewards/rejected": -3.3705201148986816, + "step": 2510 + }, + { + "epoch": 0.86, + "learning_rate": 7.971288250850018e-08, + "logits/chosen": -1.5301258563995361, + "logits/rejected": -1.2807587385177612, + "logps/chosen": -315.27337646484375, + "logps/rejected": -816.6290893554688, + "loss": 0.0417, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.818511009216309, + "rewards/margins": 8.743834495544434, + "rewards/rejected": -3.925323486328125, + "step": 2520 + }, + { + "epoch": 0.86, + "learning_rate": 7.782395164336985e-08, + "logits/chosen": -1.5073165893554688, + "logits/rejected": -1.2790597677230835, + "logps/chosen": -434.49267578125, + "logps/rejected": -593.9933471679688, + "loss": 0.0468, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.8378801345825195, + "rewards/margins": 7.976640224456787, + "rewards/rejected": -3.1387598514556885, + "step": 2530 + }, + { + "epoch": 0.86, + "learning_rate": 7.593502077823952e-08, + "logits/chosen": -1.503222942352295, + "logits/rejected": -1.2196729183197021, + "logps/chosen": -313.89654541015625, + "logps/rejected": -462.62078857421875, + "loss": 0.0303, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.41795015335083, + "rewards/margins": 7.8886284828186035, + "rewards/rejected": -3.4706790447235107, + "step": 2540 + }, + { + "epoch": 0.87, + "learning_rate": 7.404608991310917e-08, + "logits/chosen": -1.5052754878997803, + "logits/rejected": -1.2386561632156372, + "logps/chosen": -372.96209716796875, + "logps/rejected": -585.10791015625, + "loss": 0.0583, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.49515438079834, + "rewards/margins": 8.383849143981934, + "rewards/rejected": -3.888695478439331, + "step": 2550 + }, + { + "epoch": 0.87, + "learning_rate": 7.215715904797884e-08, + "logits/chosen": -1.525687336921692, + "logits/rejected": -1.1762199401855469, + "logps/chosen": -317.4388122558594, + "logps/rejected": -654.0875854492188, + "loss": 0.0313, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.393351078033447, + "rewards/margins": 8.096270561218262, + "rewards/rejected": -3.7029201984405518, + "step": 2560 + }, + { + "epoch": 0.87, + "learning_rate": 7.02682281828485e-08, + "logits/chosen": -1.520235300064087, + "logits/rejected": -1.1894917488098145, + "logps/chosen": -396.32781982421875, + "logps/rejected": -622.02783203125, + "loss": 0.061, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.850032329559326, + "rewards/margins": 7.790719032287598, + "rewards/rejected": -3.940687656402588, + "step": 2570 + }, + { + "epoch": 0.88, + "learning_rate": 6.837929731771818e-08, + "logits/chosen": -1.5212862491607666, + "logits/rejected": -1.2499208450317383, + "logps/chosen": -376.70684814453125, + "logps/rejected": -389.7237854003906, + "loss": 0.0437, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.127648830413818, + "rewards/margins": 7.433091640472412, + "rewards/rejected": -3.305443286895752, + "step": 2580 + }, + { + "epoch": 0.88, + "learning_rate": 6.649036645258783e-08, + "logits/chosen": -1.517173171043396, + "logits/rejected": -1.1914104223251343, + "logps/chosen": -444.6543884277344, + "logps/rejected": -365.63848876953125, + "loss": 0.047, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.557802677154541, + "rewards/margins": 8.270492553710938, + "rewards/rejected": -3.7126896381378174, + "step": 2590 + }, + { + "epoch": 0.88, + "learning_rate": 6.460143558745749e-08, + "logits/chosen": -1.5273593664169312, + "logits/rejected": -1.1451303958892822, + "logps/chosen": -325.8979797363281, + "logps/rejected": -582.1531982421875, + "loss": 0.0402, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.247693061828613, + "rewards/margins": 8.258806228637695, + "rewards/rejected": -4.011113166809082, + "step": 2600 + }, + { + "epoch": 0.88, + "eval_logits/chosen": -1.5157567262649536, + "eval_logits/rejected": -1.1977304220199585, + "eval_logps/chosen": -376.56494140625, + "eval_logps/rejected": -597.8150024414062, + "eval_loss": 0.037716832011938095, + "eval_rewards/accuracies": 0.9865319728851318, + "eval_rewards/chosen": 4.33577823638916, + "eval_rewards/margins": 8.16563892364502, + "eval_rewards/rejected": -3.8298606872558594, + "eval_runtime": 560.9406, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 0.529, + "step": 2600 + }, + { + "epoch": 0.89, + "learning_rate": 6.271250472232716e-08, + "logits/chosen": -1.5143485069274902, + "logits/rejected": -1.3070073127746582, + "logps/chosen": -389.19464111328125, + "logps/rejected": -658.4906616210938, + "loss": 0.0493, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.09715461730957, + "rewards/margins": 7.326831817626953, + "rewards/rejected": -3.229677200317383, + "step": 2610 + }, + { + "epoch": 0.89, + "learning_rate": 6.082357385719683e-08, + "logits/chosen": -1.5194426774978638, + "logits/rejected": -1.338354229927063, + "logps/chosen": -472.4857482910156, + "logps/rejected": -520.1239013671875, + "loss": 0.0476, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.095303535461426, + "rewards/margins": 7.620616912841797, + "rewards/rejected": -3.525313138961792, + "step": 2620 + }, + { + "epoch": 0.89, + "learning_rate": 5.893464299206649e-08, + "logits/chosen": -1.5334171056747437, + "logits/rejected": -1.2070086002349854, + "logps/chosen": -345.03106689453125, + "logps/rejected": -601.37060546875, + "loss": 0.0327, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.165444374084473, + "rewards/margins": 7.924208641052246, + "rewards/rejected": -3.7587637901306152, + "step": 2630 + }, + { + "epoch": 0.9, + "learning_rate": 5.704571212693615e-08, + "logits/chosen": -1.5042918920516968, + "logits/rejected": -1.2301785945892334, + "logps/chosen": -357.8497314453125, + "logps/rejected": -494.36944580078125, + "loss": 0.0457, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.3990867137908936, + "rewards/margins": 6.8430280685424805, + "rewards/rejected": -3.443941593170166, + "step": 2640 + }, + { + "epoch": 0.9, + "learning_rate": 5.5156781261805816e-08, + "logits/chosen": -1.5347890853881836, + "logits/rejected": -1.2450568675994873, + "logps/chosen": -290.27862548828125, + "logps/rejected": -452.07562255859375, + "loss": 0.0342, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.202007293701172, + "rewards/margins": 7.855565547943115, + "rewards/rejected": -3.6535582542419434, + "step": 2650 + }, + { + "epoch": 0.9, + "learning_rate": 5.326785039667548e-08, + "logits/chosen": -1.5297716856002808, + "logits/rejected": -1.1971065998077393, + "logps/chosen": -395.7029724121094, + "logps/rejected": -416.86810302734375, + "loss": 0.0418, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.438989639282227, + "rewards/margins": 8.2936372756958, + "rewards/rejected": -3.854647159576416, + "step": 2660 + }, + { + "epoch": 0.91, + "learning_rate": 5.137891953154514e-08, + "logits/chosen": -1.5386561155319214, + "logits/rejected": -1.268654704093933, + "logps/chosen": -334.1921691894531, + "logps/rejected": -703.80908203125, + "loss": 0.0391, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 5.104978084564209, + "rewards/margins": 8.777002334594727, + "rewards/rejected": -3.6720242500305176, + "step": 2670 + }, + { + "epoch": 0.91, + "learning_rate": 4.948998866641481e-08, + "logits/chosen": -1.5136685371398926, + "logits/rejected": -1.2055470943450928, + "logps/chosen": -393.09869384765625, + "logps/rejected": -756.4305419921875, + "loss": 0.0371, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.0205078125, + "rewards/margins": 8.307108879089355, + "rewards/rejected": -4.2866010665893555, + "step": 2680 + }, + { + "epoch": 0.91, + "learning_rate": 4.760105780128447e-08, + "logits/chosen": -1.5168330669403076, + "logits/rejected": -1.1547820568084717, + "logps/chosen": -425.5591735839844, + "logps/rejected": -774.383544921875, + "loss": 0.0347, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.134261131286621, + "rewards/margins": 8.559054374694824, + "rewards/rejected": -4.424793720245361, + "step": 2690 + }, + { + "epoch": 0.92, + "learning_rate": 4.5712126936154134e-08, + "logits/chosen": -1.5314137935638428, + "logits/rejected": -1.2539647817611694, + "logps/chosen": -260.22467041015625, + "logps/rejected": -489.43182373046875, + "loss": 0.038, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.627935409545898, + "rewards/margins": 8.63275146484375, + "rewards/rejected": -4.00481653213501, + "step": 2700 + }, + { + "epoch": 0.92, + "eval_logits/chosen": -1.5139025449752808, + "eval_logits/rejected": -1.2032972574234009, + "eval_logps/chosen": -376.6385803222656, + "eval_logps/rejected": -597.8989868164062, + "eval_loss": 0.039693351835012436, + "eval_rewards/accuracies": 0.9890572428703308, + "eval_rewards/chosen": 4.328419208526611, + "eval_rewards/margins": 8.166685104370117, + "eval_rewards/rejected": -3.838265895843506, + "eval_runtime": 558.6431, + "eval_samples_per_second": 17.005, + "eval_steps_per_second": 0.532, + "step": 2700 + }, + { + "epoch": 0.92, + "learning_rate": 4.3823196071023796e-08, + "logits/chosen": -1.5291705131530762, + "logits/rejected": -1.2298305034637451, + "logps/chosen": -323.2745056152344, + "logps/rejected": -548.8238525390625, + "loss": 0.0424, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.2751970291137695, + "rewards/margins": 8.485635757446289, + "rewards/rejected": -4.2104387283325195, + "step": 2710 + }, + { + "epoch": 0.92, + "learning_rate": 4.1934265205893465e-08, + "logits/chosen": -1.503124475479126, + "logits/rejected": -1.2478026151657104, + "logps/chosen": -480.8046875, + "logps/rejected": -691.4312133789062, + "loss": 0.0365, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.418412208557129, + "rewards/margins": 8.399371147155762, + "rewards/rejected": -3.980959415435791, + "step": 2720 + }, + { + "epoch": 0.93, + "learning_rate": 4.004533434076313e-08, + "logits/chosen": -1.5199840068817139, + "logits/rejected": -1.2516810894012451, + "logps/chosen": -318.7385559082031, + "logps/rejected": -670.9912719726562, + "loss": 0.0378, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.218963623046875, + "rewards/margins": 7.438061714172363, + "rewards/rejected": -3.21909761428833, + "step": 2730 + }, + { + "epoch": 0.93, + "learning_rate": 3.815640347563279e-08, + "logits/chosen": -1.4905387163162231, + "logits/rejected": -1.271468162536621, + "logps/chosen": -392.78240966796875, + "logps/rejected": -901.37548828125, + "loss": 0.0291, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.8263752460479736, + "rewards/margins": 7.711263179779053, + "rewards/rejected": -3.8848884105682373, + "step": 2740 + }, + { + "epoch": 0.93, + "learning_rate": 3.626747261050245e-08, + "logits/chosen": -1.5208760499954224, + "logits/rejected": -1.2187827825546265, + "logps/chosen": -320.1753845214844, + "logps/rejected": -740.0049438476562, + "loss": 0.0378, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.149036884307861, + "rewards/margins": 8.07148551940918, + "rewards/rejected": -3.9224491119384766, + "step": 2750 + }, + { + "epoch": 0.94, + "learning_rate": 3.4378541745372115e-08, + "logits/chosen": -1.5195062160491943, + "logits/rejected": -1.2230064868927002, + "logps/chosen": -427.1581115722656, + "logps/rejected": -645.3033447265625, + "loss": 0.0417, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.26482629776001, + "rewards/margins": 8.393260955810547, + "rewards/rejected": -4.128435134887695, + "step": 2760 + }, + { + "epoch": 0.94, + "learning_rate": 3.2489610880241784e-08, + "logits/chosen": -1.525866150856018, + "logits/rejected": -1.2397754192352295, + "logps/chosen": -306.03778076171875, + "logps/rejected": -587.6482543945312, + "loss": 0.0332, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.998318672180176, + "rewards/margins": 8.605379104614258, + "rewards/rejected": -3.6070590019226074, + "step": 2770 + }, + { + "epoch": 0.94, + "learning_rate": 3.0600680015111446e-08, + "logits/chosen": -1.516852617263794, + "logits/rejected": -1.1658298969268799, + "logps/chosen": -312.1228942871094, + "logps/rejected": -346.4730224609375, + "loss": 0.0445, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.162473678588867, + "rewards/margins": 8.016084671020508, + "rewards/rejected": -3.853611469268799, + "step": 2780 + }, + { + "epoch": 0.95, + "learning_rate": 2.871174914998111e-08, + "logits/chosen": -1.524436354637146, + "logits/rejected": -1.1867830753326416, + "logps/chosen": -326.0414123535156, + "logps/rejected": -485.95166015625, + "loss": 0.0375, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.58970308303833, + "rewards/margins": 8.010432243347168, + "rewards/rejected": -3.420729875564575, + "step": 2790 + }, + { + "epoch": 0.95, + "learning_rate": 2.682281828485077e-08, + "logits/chosen": -1.520810842514038, + "logits/rejected": -1.1647284030914307, + "logps/chosen": -333.65008544921875, + "logps/rejected": -827.0930786132812, + "loss": 0.0527, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.306121349334717, + "rewards/margins": 8.471829414367676, + "rewards/rejected": -4.165709018707275, + "step": 2800 + }, + { + "epoch": 0.95, + "eval_logits/chosen": -1.5196325778961182, + "eval_logits/rejected": -1.2036585807800293, + "eval_logps/chosen": -376.93743896484375, + "eval_logps/rejected": -598.0059204101562, + "eval_loss": 0.0383492186665535, + "eval_rewards/accuracies": 0.9856902360916138, + "eval_rewards/chosen": 4.298529148101807, + "eval_rewards/margins": 8.147479057312012, + "eval_rewards/rejected": -3.848950147628784, + "eval_runtime": 558.5691, + "eval_samples_per_second": 17.008, + "eval_steps_per_second": 0.532, + "step": 2800 + }, + { + "epoch": 0.96, + "learning_rate": 2.4933887419720436e-08, + "logits/chosen": -1.5408833026885986, + "logits/rejected": -1.2266209125518799, + "logps/chosen": -303.4648132324219, + "logps/rejected": -399.6147155761719, + "loss": 0.0527, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.5173773765563965, + "rewards/margins": 8.121113777160645, + "rewards/rejected": -3.6037354469299316, + "step": 2810 + }, + { + "epoch": 0.96, + "learning_rate": 2.30449565545901e-08, + "logits/chosen": -1.5411306619644165, + "logits/rejected": -1.2380434274673462, + "logps/chosen": -333.23699951171875, + "logps/rejected": -425.56365966796875, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.0661211013793945, + "rewards/margins": 7.654998779296875, + "rewards/rejected": -3.588876724243164, + "step": 2820 + }, + { + "epoch": 0.96, + "learning_rate": 2.1156025689459764e-08, + "logits/chosen": -1.5202014446258545, + "logits/rejected": -1.249887466430664, + "logps/chosen": -415.40435791015625, + "logps/rejected": -705.8953857421875, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.5467915534973145, + "rewards/margins": 8.021774291992188, + "rewards/rejected": -3.474982500076294, + "step": 2830 + }, + { + "epoch": 0.97, + "learning_rate": 1.926709482432943e-08, + "logits/chosen": -1.5168020725250244, + "logits/rejected": -1.1783835887908936, + "logps/chosen": -368.14337158203125, + "logps/rejected": -583.8258056640625, + "loss": 0.0346, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.783400058746338, + "rewards/margins": 8.449257850646973, + "rewards/rejected": -3.6658573150634766, + "step": 2840 + }, + { + "epoch": 0.97, + "learning_rate": 1.7378163959199092e-08, + "logits/chosen": -1.5222840309143066, + "logits/rejected": -1.2157676219940186, + "logps/chosen": -377.74932861328125, + "logps/rejected": -491.2120056152344, + "loss": 0.0368, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.284539222717285, + "rewards/margins": 8.006728172302246, + "rewards/rejected": -3.7221896648406982, + "step": 2850 + }, + { + "epoch": 0.97, + "learning_rate": 1.5489233094068758e-08, + "logits/chosen": -1.5340877771377563, + "logits/rejected": -1.2179819345474243, + "logps/chosen": -341.79974365234375, + "logps/rejected": -472.01611328125, + "loss": 0.0316, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.628954887390137, + "rewards/margins": 8.175573348999023, + "rewards/rejected": -3.546616315841675, + "step": 2860 + }, + { + "epoch": 0.98, + "learning_rate": 1.3600302228938419e-08, + "logits/chosen": -1.5086033344268799, + "logits/rejected": -1.2281320095062256, + "logps/chosen": -449.17156982421875, + "logps/rejected": -668.3966064453125, + "loss": 0.0593, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.049460411071777, + "rewards/margins": 7.482198238372803, + "rewards/rejected": -3.4327378273010254, + "step": 2870 + }, + { + "epoch": 0.98, + "learning_rate": 1.1711371363808084e-08, + "logits/chosen": -1.533616304397583, + "logits/rejected": -1.2686628103256226, + "logps/chosen": -389.5843200683594, + "logps/rejected": -875.8938598632812, + "loss": 0.0386, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.17772102355957, + "rewards/margins": 8.47387981414795, + "rewards/rejected": -4.296158790588379, + "step": 2880 + }, + { + "epoch": 0.98, + "learning_rate": 9.822440498677748e-09, + "logits/chosen": -1.5015050172805786, + "logits/rejected": -1.2158093452453613, + "logps/chosen": -532.0431518554688, + "logps/rejected": -657.7987670898438, + "loss": 0.037, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.858596086502075, + "rewards/margins": 7.628444671630859, + "rewards/rejected": -3.769848346710205, + "step": 2890 + }, + { + "epoch": 0.99, + "learning_rate": 7.933509633547412e-09, + "logits/chosen": -1.5313704013824463, + "logits/rejected": -1.2360942363739014, + "logps/chosen": -339.72064208984375, + "logps/rejected": -532.837158203125, + "loss": 0.0365, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.4045090675354, + "rewards/margins": 8.255632400512695, + "rewards/rejected": -3.851123332977295, + "step": 2900 + }, + { + "epoch": 0.99, + "eval_logits/chosen": -1.5156338214874268, + "eval_logits/rejected": -1.1996530294418335, + "eval_logps/chosen": -376.83685302734375, + "eval_logps/rejected": -597.8652954101562, + "eval_loss": 0.03792084753513336, + "eval_rewards/accuracies": 0.9873737096786499, + "eval_rewards/chosen": 4.308588981628418, + "eval_rewards/margins": 8.143476486206055, + "eval_rewards/rejected": -3.8348886966705322, + "eval_runtime": 561.0021, + "eval_samples_per_second": 16.934, + "eval_steps_per_second": 0.529, + "step": 2900 + }, + { + "epoch": 0.99, + "learning_rate": 6.044578768417076e-09, + "logits/chosen": -1.5284096002578735, + "logits/rejected": -1.2779542207717896, + "logps/chosen": -324.67864990234375, + "logps/rejected": -710.6837768554688, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9714195728302, + "rewards/margins": 7.755260467529297, + "rewards/rejected": -3.7838408946990967, + "step": 2910 + }, + { + "epoch": 0.99, + "learning_rate": 4.15564790328674e-09, + "logits/chosen": -1.5134741067886353, + "logits/rejected": -1.2182211875915527, + "logps/chosen": -336.8125915527344, + "logps/rejected": -749.4577026367188, + "loss": 0.028, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.103480815887451, + "rewards/margins": 7.714101314544678, + "rewards/rejected": -3.6106209754943848, + "step": 2920 + }, + { + "epoch": 1.0, + "learning_rate": 2.2667170381564033e-09, + "logits/chosen": -1.5044059753417969, + "logits/rejected": -1.248711347579956, + "logps/chosen": -317.59130859375, + "logps/rejected": -411.42706298828125, + "loss": 0.038, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9643845558166504, + "rewards/margins": 7.606657981872559, + "rewards/rejected": -3.64227294921875, + "step": 2930 + }, + { + "epoch": 1.0, + "learning_rate": 3.7778617302606723e-10, + "logits/chosen": -1.4944725036621094, + "logits/rejected": -1.208418369293213, + "logps/chosen": -398.1802673339844, + "logps/rejected": -528.9981689453125, + "loss": 0.0369, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.150896072387695, + "rewards/margins": 7.481464385986328, + "rewards/rejected": -3.3305678367614746, + "step": 2940 + }, + { + "epoch": 1.0, + "step": 2942, + "total_flos": 0.0, + "train_loss": 0.11494619559330763, + "train_runtime": 36321.6775, + "train_samples_per_second": 5.184, + "train_steps_per_second": 0.081 + } + ], + "logging_steps": 10, + "max_steps": 2942, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}