{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 400, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00026171159382360636, "grad_norm": 7.1875, "learning_rate": 7.832898172323759e-10, "logits/chosen": -2.3929858207702637, "logits/rejected": -2.2321977615356445, "logps/chosen": -0.7943928837776184, "logps/rejected": -0.861505389213562, "loss": 1.7141, "rewards/accuracies": 0.625, "rewards/chosen": -1.5887857675552368, "rewards/margins": 0.13422495126724243, "rewards/rejected": -1.723010778427124, "step": 1 }, { "epoch": 0.0013085579691180318, "grad_norm": 6.6875, "learning_rate": 3.91644908616188e-09, "logits/chosen": -2.3382856845855713, "logits/rejected": -2.261397123336792, "logps/chosen": -1.013643503189087, "logps/rejected": -1.0403972864151, "loss": 1.837, "rewards/accuracies": 0.484375, "rewards/chosen": -2.027287006378174, "rewards/margins": 0.05350770056247711, "rewards/rejected": -2.0807945728302, "step": 5 }, { "epoch": 0.0026171159382360636, "grad_norm": 4.96875, "learning_rate": 7.83289817232376e-09, "logits/chosen": -2.426370620727539, "logits/rejected": -2.4551613330841064, "logps/chosen": -0.9928344488143921, "logps/rejected": -0.979974627494812, "loss": 1.8843, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.9856688976287842, "rewards/margins": -0.025719786062836647, "rewards/rejected": -1.959949254989624, "step": 10 }, { "epoch": 0.003925673907354096, "grad_norm": 5.09375, "learning_rate": 1.174934725848564e-08, "logits/chosen": -2.406400442123413, "logits/rejected": -2.3791346549987793, "logps/chosen": -0.8756189346313477, "logps/rejected": -1.0032527446746826, "loss": 1.661, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.7512378692626953, "rewards/margins": 0.2552677094936371, "rewards/rejected": -2.0065054893493652, "step": 15 }, { "epoch": 0.005234231876472127, "grad_norm": 5.09375, "learning_rate": 1.566579634464752e-08, "logits/chosen": -2.480056047439575, "logits/rejected": -2.3385822772979736, "logps/chosen": -0.8800347447395325, "logps/rejected": -1.0929635763168335, "loss": 1.6164, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.760069489479065, "rewards/margins": 0.4258577823638916, "rewards/rejected": -2.185927152633667, "step": 20 }, { "epoch": 0.00654278984559016, "grad_norm": 3.625, "learning_rate": 1.95822454308094e-08, "logits/chosen": -2.452441930770874, "logits/rejected": -2.3058266639709473, "logps/chosen": -0.8851073980331421, "logps/rejected": -1.0340964794158936, "loss": 1.5879, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7702147960662842, "rewards/margins": 0.2979782819747925, "rewards/rejected": -2.068192958831787, "step": 25 }, { "epoch": 0.007851347814708191, "grad_norm": 4.46875, "learning_rate": 2.349869451697128e-08, "logits/chosen": -2.4211630821228027, "logits/rejected": -2.388763666152954, "logps/chosen": -0.9654448628425598, "logps/rejected": -0.9848822355270386, "loss": 1.8206, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.9308897256851196, "rewards/margins": 0.03887487202882767, "rewards/rejected": -1.9697644710540771, "step": 30 }, { "epoch": 0.009159905783826224, "grad_norm": 7.09375, "learning_rate": 2.741514360313316e-08, "logits/chosen": -2.3911948204040527, "logits/rejected": -2.254054546356201, "logps/chosen": -0.9038335680961609, "logps/rejected": -1.0748339891433716, "loss": 1.6299, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.8076671361923218, "rewards/margins": 0.3420008718967438, "rewards/rejected": -2.149667978286743, "step": 35 }, { "epoch": 0.010468463752944255, "grad_norm": 5.84375, "learning_rate": 3.133159268929504e-08, "logits/chosen": -2.404452085494995, "logits/rejected": -2.2743349075317383, "logps/chosen": -0.8856223225593567, "logps/rejected": -0.9654895663261414, "loss": 1.7171, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.7712446451187134, "rewards/margins": 0.15973450243473053, "rewards/rejected": -1.9309791326522827, "step": 40 }, { "epoch": 0.011777021722062287, "grad_norm": 8.4375, "learning_rate": 3.524804177545692e-08, "logits/chosen": -2.4613146781921387, "logits/rejected": -2.3491759300231934, "logps/chosen": -0.9516292810440063, "logps/rejected": -1.028428554534912, "loss": 1.7317, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9032585620880127, "rewards/margins": 0.15359854698181152, "rewards/rejected": -2.056857109069824, "step": 45 }, { "epoch": 0.01308557969118032, "grad_norm": 6.375, "learning_rate": 3.91644908616188e-08, "logits/chosen": -2.549776792526245, "logits/rejected": -2.431725025177002, "logps/chosen": -0.922882080078125, "logps/rejected": -0.9356275796890259, "loss": 1.8108, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.84576416015625, "rewards/margins": 0.025490831583738327, "rewards/rejected": -1.8712551593780518, "step": 50 }, { "epoch": 0.014394137660298352, "grad_norm": 8.625, "learning_rate": 4.308093994778068e-08, "logits/chosen": -2.3783116340637207, "logits/rejected": -2.2331128120422363, "logps/chosen": -0.8826783299446106, "logps/rejected": -1.0029122829437256, "loss": 1.6932, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.7653566598892212, "rewards/margins": 0.24046793580055237, "rewards/rejected": -2.005824565887451, "step": 55 }, { "epoch": 0.015702695629416383, "grad_norm": 4.875, "learning_rate": 4.699738903394256e-08, "logits/chosen": -2.591465711593628, "logits/rejected": -2.424081325531006, "logps/chosen": -0.8889732360839844, "logps/rejected": -1.0766708850860596, "loss": 1.6738, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.7779464721679688, "rewards/margins": 0.3753954768180847, "rewards/rejected": -2.153341770172119, "step": 60 }, { "epoch": 0.017011253598534413, "grad_norm": 5.4375, "learning_rate": 5.091383812010443e-08, "logits/chosen": -2.454481601715088, "logits/rejected": -2.3158888816833496, "logps/chosen": -0.9410486221313477, "logps/rejected": -1.0309474468231201, "loss": 1.6933, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8820972442626953, "rewards/margins": 0.1797974556684494, "rewards/rejected": -2.0618948936462402, "step": 65 }, { "epoch": 0.018319811567652448, "grad_norm": 6.53125, "learning_rate": 5.483028720626632e-08, "logits/chosen": -2.4346108436584473, "logits/rejected": -2.331714391708374, "logps/chosen": -0.8850765228271484, "logps/rejected": -0.9854335784912109, "loss": 1.7242, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.7701530456542969, "rewards/margins": 0.2007141411304474, "rewards/rejected": -1.9708671569824219, "step": 70 }, { "epoch": 0.01962836953677048, "grad_norm": 6.15625, "learning_rate": 5.87467362924282e-08, "logits/chosen": -2.4403324127197266, "logits/rejected": -2.328378200531006, "logps/chosen": -0.9527246356010437, "logps/rejected": -1.083192229270935, "loss": 1.6618, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9054492712020874, "rewards/margins": 0.26093512773513794, "rewards/rejected": -2.16638445854187, "step": 75 }, { "epoch": 0.02093692750588851, "grad_norm": 5.75, "learning_rate": 6.266318537859008e-08, "logits/chosen": -2.5196890830993652, "logits/rejected": -2.380474090576172, "logps/chosen": -0.8992692828178406, "logps/rejected": -0.9594500660896301, "loss": 1.7602, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.7985385656356812, "rewards/margins": 0.12036142498254776, "rewards/rejected": -1.9189001321792603, "step": 80 }, { "epoch": 0.022245485475006543, "grad_norm": 7.125, "learning_rate": 6.657963446475196e-08, "logits/chosen": -2.4916937351226807, "logits/rejected": -2.334261417388916, "logps/chosen": -0.8791500926017761, "logps/rejected": -1.0251330137252808, "loss": 1.6255, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.7583001852035522, "rewards/margins": 0.29196593165397644, "rewards/rejected": -2.0502660274505615, "step": 85 }, { "epoch": 0.023554043444124574, "grad_norm": 6.96875, "learning_rate": 7.049608355091384e-08, "logits/chosen": -2.5398292541503906, "logits/rejected": -2.3612000942230225, "logps/chosen": -0.9267665147781372, "logps/rejected": -0.9929956197738647, "loss": 1.7355, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.8535330295562744, "rewards/margins": 0.1324581801891327, "rewards/rejected": -1.9859912395477295, "step": 90 }, { "epoch": 0.02486260141324261, "grad_norm": 7.375, "learning_rate": 7.441253263707572e-08, "logits/chosen": -2.540365695953369, "logits/rejected": -2.4132840633392334, "logps/chosen": -0.9225910305976868, "logps/rejected": -0.9454398155212402, "loss": 1.8098, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.8451820611953735, "rewards/margins": 0.04569756239652634, "rewards/rejected": -1.8908796310424805, "step": 95 }, { "epoch": 0.02617115938236064, "grad_norm": 11.0, "learning_rate": 7.83289817232376e-08, "logits/chosen": -2.460408926010132, "logits/rejected": -2.4058172702789307, "logps/chosen": -0.8751091957092285, "logps/rejected": -0.9861213564872742, "loss": 1.6748, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.750218391418457, "rewards/margins": 0.22202444076538086, "rewards/rejected": -1.9722427129745483, "step": 100 }, { "epoch": 0.02747971735147867, "grad_norm": 6.03125, "learning_rate": 8.224543080939946e-08, "logits/chosen": -2.4411768913269043, "logits/rejected": -2.3341009616851807, "logps/chosen": -0.9243078231811523, "logps/rejected": -1.0461833477020264, "loss": 1.6508, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8486156463623047, "rewards/margins": 0.24375078082084656, "rewards/rejected": -2.0923666954040527, "step": 105 }, { "epoch": 0.028788275320596704, "grad_norm": 10.0625, "learning_rate": 8.616187989556136e-08, "logits/chosen": -2.4604249000549316, "logits/rejected": -2.3214480876922607, "logps/chosen": -0.9041354060173035, "logps/rejected": -1.0756652355194092, "loss": 1.5991, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.808270812034607, "rewards/margins": 0.34305933117866516, "rewards/rejected": -2.1513304710388184, "step": 110 }, { "epoch": 0.030096833289714735, "grad_norm": 7.125, "learning_rate": 9.007832898172324e-08, "logits/chosen": -2.6047489643096924, "logits/rejected": -2.423083782196045, "logps/chosen": -0.934342086315155, "logps/rejected": -1.0481892824172974, "loss": 1.6739, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.86868417263031, "rewards/margins": 0.2276945412158966, "rewards/rejected": -2.0963785648345947, "step": 115 }, { "epoch": 0.031405391258832765, "grad_norm": 5.28125, "learning_rate": 9.399477806788512e-08, "logits/chosen": -2.4768102169036865, "logits/rejected": -2.3186275959014893, "logps/chosen": -0.914435863494873, "logps/rejected": -1.023848533630371, "loss": 1.7225, "rewards/accuracies": 0.5, "rewards/chosen": -1.828871726989746, "rewards/margins": 0.21882522106170654, "rewards/rejected": -2.047697067260742, "step": 120 }, { "epoch": 0.032713949227950796, "grad_norm": 6.71875, "learning_rate": 9.7911227154047e-08, "logits/chosen": -2.428565502166748, "logits/rejected": -2.307558298110962, "logps/chosen": -0.8888899087905884, "logps/rejected": -0.9790868759155273, "loss": 1.701, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.7777798175811768, "rewards/margins": 0.18039414286613464, "rewards/rejected": -1.9581737518310547, "step": 125 }, { "epoch": 0.03402250719706883, "grad_norm": 4.25, "learning_rate": 1.0182767624020886e-07, "logits/chosen": -2.389721393585205, "logits/rejected": -2.253096103668213, "logps/chosen": -0.9319776296615601, "logps/rejected": -1.0042223930358887, "loss": 1.7407, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8639552593231201, "rewards/margins": 0.14448928833007812, "rewards/rejected": -2.0084447860717773, "step": 130 }, { "epoch": 0.035331065166186865, "grad_norm": 4.59375, "learning_rate": 1.0574412532637074e-07, "logits/chosen": -2.4350485801696777, "logits/rejected": -2.3896994590759277, "logps/chosen": -0.8629164695739746, "logps/rejected": -0.9937006235122681, "loss": 1.6718, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.7258329391479492, "rewards/margins": 0.2615683078765869, "rewards/rejected": -1.9874012470245361, "step": 135 }, { "epoch": 0.036639623135304895, "grad_norm": 5.90625, "learning_rate": 1.0966057441253264e-07, "logits/chosen": -2.358617067337036, "logits/rejected": -2.3305978775024414, "logps/chosen": -0.961345374584198, "logps/rejected": -1.0267747640609741, "loss": 1.7454, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.922690749168396, "rewards/margins": 0.13085877895355225, "rewards/rejected": -2.0535495281219482, "step": 140 }, { "epoch": 0.037948181104422926, "grad_norm": 5.5, "learning_rate": 1.1357702349869451e-07, "logits/chosen": -2.4439597129821777, "logits/rejected": -2.3329501152038574, "logps/chosen": -0.94514399766922, "logps/rejected": -0.9944084286689758, "loss": 1.764, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.89028799533844, "rewards/margins": 0.09852910786867142, "rewards/rejected": -1.9888168573379517, "step": 145 }, { "epoch": 0.03925673907354096, "grad_norm": 5.8125, "learning_rate": 1.174934725848564e-07, "logits/chosen": -2.4960317611694336, "logits/rejected": -2.3280179500579834, "logps/chosen": -0.9063509106636047, "logps/rejected": -1.0402507781982422, "loss": 1.6466, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.8127018213272095, "rewards/margins": 0.26779991388320923, "rewards/rejected": -2.0805015563964844, "step": 150 }, { "epoch": 0.04056529704265899, "grad_norm": 3.28125, "learning_rate": 1.214099216710183e-07, "logits/chosen": -2.4747376441955566, "logits/rejected": -2.399122953414917, "logps/chosen": -0.945054829120636, "logps/rejected": -1.0740740299224854, "loss": 1.6823, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.890109658241272, "rewards/margins": 0.25803810358047485, "rewards/rejected": -2.1481480598449707, "step": 155 }, { "epoch": 0.04187385501177702, "grad_norm": 11.0, "learning_rate": 1.2532637075718015e-07, "logits/chosen": -2.416409730911255, "logits/rejected": -2.2528250217437744, "logps/chosen": -0.9375729560852051, "logps/rejected": -1.1229662895202637, "loss": 1.6195, "rewards/accuracies": 0.625, "rewards/chosen": -1.8751459121704102, "rewards/margins": 0.37078672647476196, "rewards/rejected": -2.2459325790405273, "step": 160 }, { "epoch": 0.043182412980895056, "grad_norm": 5.15625, "learning_rate": 1.2924281984334202e-07, "logits/chosen": -2.5457167625427246, "logits/rejected": -2.3906517028808594, "logps/chosen": -0.8909904360771179, "logps/rejected": -1.129612684249878, "loss": 1.5843, "rewards/accuracies": 0.625, "rewards/chosen": -1.7819808721542358, "rewards/margins": 0.4772440791130066, "rewards/rejected": -2.259225368499756, "step": 165 }, { "epoch": 0.04449097095001309, "grad_norm": 9.75, "learning_rate": 1.3315926892950391e-07, "logits/chosen": -2.4581961631774902, "logits/rejected": -2.2854654788970947, "logps/chosen": -0.9324159622192383, "logps/rejected": -1.017854928970337, "loss": 1.7151, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8648319244384766, "rewards/margins": 0.17087793350219727, "rewards/rejected": -2.035709857940674, "step": 170 }, { "epoch": 0.04579952891913112, "grad_norm": 10.9375, "learning_rate": 1.3707571801566578e-07, "logits/chosen": -2.5348904132843018, "logits/rejected": -2.443265676498413, "logps/chosen": -0.8872332572937012, "logps/rejected": -1.0019452571868896, "loss": 1.6642, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.7744665145874023, "rewards/margins": 0.22942402958869934, "rewards/rejected": -2.0038905143737793, "step": 175 }, { "epoch": 0.04710808688824915, "grad_norm": 5.3125, "learning_rate": 1.4099216710182767e-07, "logits/chosen": -2.43281888961792, "logits/rejected": -2.364088535308838, "logps/chosen": -0.9738823175430298, "logps/rejected": -1.0775669813156128, "loss": 1.6729, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9477646350860596, "rewards/margins": 0.20736956596374512, "rewards/rejected": -2.1551339626312256, "step": 180 }, { "epoch": 0.04841664485736718, "grad_norm": 5.15625, "learning_rate": 1.4490861618798957e-07, "logits/chosen": -2.4177231788635254, "logits/rejected": -2.3026790618896484, "logps/chosen": -0.8476292490959167, "logps/rejected": -1.078465223312378, "loss": 1.5611, "rewards/accuracies": 0.625, "rewards/chosen": -1.6952584981918335, "rewards/margins": 0.4616720676422119, "rewards/rejected": -2.156930446624756, "step": 185 }, { "epoch": 0.04972520282648522, "grad_norm": 4.3125, "learning_rate": 1.4882506527415143e-07, "logits/chosen": -2.416144847869873, "logits/rejected": -2.3191165924072266, "logps/chosen": -0.9843645095825195, "logps/rejected": -1.0324324369430542, "loss": 1.7831, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.968729019165039, "rewards/margins": 0.09613589942455292, "rewards/rejected": -2.0648648738861084, "step": 190 }, { "epoch": 0.05103376079560325, "grad_norm": 3.65625, "learning_rate": 1.527415143603133e-07, "logits/chosen": -2.398357391357422, "logits/rejected": -2.2828030586242676, "logps/chosen": -0.9086023569107056, "logps/rejected": -1.0458821058273315, "loss": 1.6255, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8172047138214111, "rewards/margins": 0.2745595872402191, "rewards/rejected": -2.091764211654663, "step": 195 }, { "epoch": 0.05234231876472128, "grad_norm": 7.15625, "learning_rate": 1.566579634464752e-07, "logits/chosen": -2.4192728996276855, "logits/rejected": -2.368617296218872, "logps/chosen": -0.8891614079475403, "logps/rejected": -0.9575755000114441, "loss": 1.7249, "rewards/accuracies": 0.5625, "rewards/chosen": -1.7783228158950806, "rewards/margins": 0.1368284672498703, "rewards/rejected": -1.9151510000228882, "step": 200 }, { "epoch": 0.05365087673383931, "grad_norm": 7.09375, "learning_rate": 1.6057441253263706e-07, "logits/chosen": -2.500441074371338, "logits/rejected": -2.4010748863220215, "logps/chosen": -0.9709323644638062, "logps/rejected": -0.9870332479476929, "loss": 1.8089, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.9418647289276123, "rewards/margins": 0.032201606780290604, "rewards/rejected": -1.9740664958953857, "step": 205 }, { "epoch": 0.05495943470295734, "grad_norm": 6.28125, "learning_rate": 1.6449086161879893e-07, "logits/chosen": -2.509099006652832, "logits/rejected": -2.299696445465088, "logps/chosen": -0.9479492902755737, "logps/rejected": -1.0242407321929932, "loss": 1.7234, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.8958985805511475, "rewards/margins": 0.1525828093290329, "rewards/rejected": -2.0484814643859863, "step": 210 }, { "epoch": 0.05626799267207537, "grad_norm": 4.25, "learning_rate": 1.6840731070496085e-07, "logits/chosen": -2.544700860977173, "logits/rejected": -2.3568367958068848, "logps/chosen": -0.9340695142745972, "logps/rejected": -1.1109009981155396, "loss": 1.6238, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8681390285491943, "rewards/margins": 0.3536628484725952, "rewards/rejected": -2.221801996231079, "step": 215 }, { "epoch": 0.05757655064119341, "grad_norm": 8.75, "learning_rate": 1.723237597911227e-07, "logits/chosen": -2.5071818828582764, "logits/rejected": -2.534008502960205, "logps/chosen": -0.889661967754364, "logps/rejected": -0.9272378087043762, "loss": 1.7833, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.779323935508728, "rewards/margins": 0.0751517191529274, "rewards/rejected": -1.8544756174087524, "step": 220 }, { "epoch": 0.05888510861031144, "grad_norm": 5.5625, "learning_rate": 1.762402088772846e-07, "logits/chosen": -2.5091605186462402, "logits/rejected": -2.309223175048828, "logps/chosen": -0.9561560750007629, "logps/rejected": -1.1034660339355469, "loss": 1.6533, "rewards/accuracies": 0.625, "rewards/chosen": -1.9123121500015259, "rewards/margins": 0.29461997747421265, "rewards/rejected": -2.2069320678710938, "step": 225 }, { "epoch": 0.06019366657942947, "grad_norm": 18.625, "learning_rate": 1.8015665796344647e-07, "logits/chosen": -2.4012577533721924, "logits/rejected": -2.2910590171813965, "logps/chosen": -0.9449939727783203, "logps/rejected": -1.1328877210617065, "loss": 1.6108, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.8899879455566406, "rewards/margins": 0.37578752636909485, "rewards/rejected": -2.265775442123413, "step": 230 }, { "epoch": 0.0615022245485475, "grad_norm": 6.46875, "learning_rate": 1.8407310704960834e-07, "logits/chosen": -2.484492778778076, "logits/rejected": -2.3762400150299072, "logps/chosen": -0.9043990969657898, "logps/rejected": -1.0593132972717285, "loss": 1.6599, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8087981939315796, "rewards/margins": 0.3098280429840088, "rewards/rejected": -2.118626594543457, "step": 235 }, { "epoch": 0.06281078251766553, "grad_norm": 11.9375, "learning_rate": 1.8798955613577023e-07, "logits/chosen": -2.367265224456787, "logits/rejected": -2.3047831058502197, "logps/chosen": -0.9208362698554993, "logps/rejected": -1.1005771160125732, "loss": 1.5746, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8416725397109985, "rewards/margins": 0.3594818413257599, "rewards/rejected": -2.2011542320251465, "step": 240 }, { "epoch": 0.06411934048678357, "grad_norm": 7.1875, "learning_rate": 1.919060052219321e-07, "logits/chosen": -2.4860782623291016, "logits/rejected": -2.272517681121826, "logps/chosen": -0.9189214706420898, "logps/rejected": -1.1259539127349854, "loss": 1.5902, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8378429412841797, "rewards/margins": 0.4140649437904358, "rewards/rejected": -2.2519078254699707, "step": 245 }, { "epoch": 0.06542789845590159, "grad_norm": 8.5625, "learning_rate": 1.95822454308094e-07, "logits/chosen": -2.522761821746826, "logits/rejected": -2.3399651050567627, "logps/chosen": -0.927911639213562, "logps/rejected": -1.1523277759552002, "loss": 1.5643, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.855823278427124, "rewards/margins": 0.4488322138786316, "rewards/rejected": -2.3046555519104004, "step": 250 }, { "epoch": 0.06673645642501963, "grad_norm": 6.6875, "learning_rate": 1.9973890339425586e-07, "logits/chosen": -2.3732612133026123, "logits/rejected": -2.236459255218506, "logps/chosen": -0.9771307110786438, "logps/rejected": -1.0230567455291748, "loss": 1.8154, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9542614221572876, "rewards/margins": 0.09185231477022171, "rewards/rejected": -2.0461134910583496, "step": 255 }, { "epoch": 0.06804501439413765, "grad_norm": 4.15625, "learning_rate": 2.0365535248041772e-07, "logits/chosen": -2.482785940170288, "logits/rejected": -2.3916196823120117, "logps/chosen": -0.8988520503044128, "logps/rejected": -0.9144598245620728, "loss": 1.8105, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.7977041006088257, "rewards/margins": 0.03121563419699669, "rewards/rejected": -1.8289196491241455, "step": 260 }, { "epoch": 0.06935357236325569, "grad_norm": 11.25, "learning_rate": 2.0757180156657962e-07, "logits/chosen": -2.4947338104248047, "logits/rejected": -2.3473544120788574, "logps/chosen": -0.9266901016235352, "logps/rejected": -1.0785419940948486, "loss": 1.6096, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.8533802032470703, "rewards/margins": 0.3037036061286926, "rewards/rejected": -2.1570839881896973, "step": 265 }, { "epoch": 0.07066213033237373, "grad_norm": 6.9375, "learning_rate": 2.1148825065274148e-07, "logits/chosen": -2.4460299015045166, "logits/rejected": -2.389753818511963, "logps/chosen": -0.9668501615524292, "logps/rejected": -1.043068528175354, "loss": 1.7203, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9337003231048584, "rewards/margins": 0.15243682265281677, "rewards/rejected": -2.086137056350708, "step": 270 }, { "epoch": 0.07197068830149175, "grad_norm": 10.25, "learning_rate": 2.154046997389034e-07, "logits/chosen": -2.499992847442627, "logits/rejected": -2.281294345855713, "logps/chosen": -0.9527046084403992, "logps/rejected": -1.1139146089553833, "loss": 1.633, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9054092168807983, "rewards/margins": 0.32241979241371155, "rewards/rejected": -2.2278292179107666, "step": 275 }, { "epoch": 0.07327924627060979, "grad_norm": 10.5, "learning_rate": 2.1932114882506527e-07, "logits/chosen": -2.3968310356140137, "logits/rejected": -2.377711534500122, "logps/chosen": -0.9350486993789673, "logps/rejected": -1.1452116966247559, "loss": 1.5565, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8700973987579346, "rewards/margins": 0.42032623291015625, "rewards/rejected": -2.2904233932495117, "step": 280 }, { "epoch": 0.07458780423972781, "grad_norm": 5.46875, "learning_rate": 2.2323759791122716e-07, "logits/chosen": -2.4311115741729736, "logits/rejected": -2.487086772918701, "logps/chosen": -0.9067524075508118, "logps/rejected": -0.9682222604751587, "loss": 1.7401, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8135048151016235, "rewards/margins": 0.12293944507837296, "rewards/rejected": -1.9364445209503174, "step": 285 }, { "epoch": 0.07589636220884585, "grad_norm": 8.9375, "learning_rate": 2.2715404699738903e-07, "logits/chosen": -2.3616719245910645, "logits/rejected": -2.238387107849121, "logps/chosen": -1.0354571342468262, "logps/rejected": -1.0928757190704346, "loss": 1.7682, "rewards/accuracies": 0.5, "rewards/chosen": -2.0709142684936523, "rewards/margins": 0.11483726650476456, "rewards/rejected": -2.185751438140869, "step": 290 }, { "epoch": 0.07720492017796389, "grad_norm": 4.875, "learning_rate": 2.310704960835509e-07, "logits/chosen": -2.5253851413726807, "logits/rejected": -2.3990378379821777, "logps/chosen": -0.8590397834777832, "logps/rejected": -1.0271087884902954, "loss": 1.5922, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7180795669555664, "rewards/margins": 0.3361378610134125, "rewards/rejected": -2.054217576980591, "step": 295 }, { "epoch": 0.07851347814708191, "grad_norm": 8.25, "learning_rate": 2.349869451697128e-07, "logits/chosen": -2.442436456680298, "logits/rejected": -2.453054428100586, "logps/chosen": -0.9019156694412231, "logps/rejected": -1.067659616470337, "loss": 1.6271, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8038313388824463, "rewards/margins": 0.3314875662326813, "rewards/rejected": -2.135319232940674, "step": 300 }, { "epoch": 0.07982203611619995, "grad_norm": 5.96875, "learning_rate": 2.3890339425587466e-07, "logits/chosen": -2.5527732372283936, "logits/rejected": -2.339331865310669, "logps/chosen": -0.8872030973434448, "logps/rejected": -0.9970418810844421, "loss": 1.6658, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.7744061946868896, "rewards/margins": 0.2196773737668991, "rewards/rejected": -1.9940837621688843, "step": 305 }, { "epoch": 0.08113059408531798, "grad_norm": 8.625, "learning_rate": 2.428198433420366e-07, "logits/chosen": -2.3914530277252197, "logits/rejected": -2.382140636444092, "logps/chosen": -0.9143654108047485, "logps/rejected": -1.0700318813323975, "loss": 1.6237, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.828730821609497, "rewards/margins": 0.3113328814506531, "rewards/rejected": -2.140063762664795, "step": 310 }, { "epoch": 0.08243915205443601, "grad_norm": 12.625, "learning_rate": 2.4673629242819844e-07, "logits/chosen": -2.5993523597717285, "logits/rejected": -2.3815178871154785, "logps/chosen": -0.9237080812454224, "logps/rejected": -1.1313755512237549, "loss": 1.5805, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.8474161624908447, "rewards/margins": 0.41533535718917847, "rewards/rejected": -2.2627511024475098, "step": 315 }, { "epoch": 0.08374771002355404, "grad_norm": 5.6875, "learning_rate": 2.506527415143603e-07, "logits/chosen": -2.527980327606201, "logits/rejected": -2.343719005584717, "logps/chosen": -0.8650591969490051, "logps/rejected": -1.0115206241607666, "loss": 1.6503, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.7301183938980103, "rewards/margins": 0.29292288422584534, "rewards/rejected": -2.023041248321533, "step": 320 }, { "epoch": 0.08505626799267207, "grad_norm": 5.625, "learning_rate": 2.545691906005222e-07, "logits/chosen": -2.4999685287475586, "logits/rejected": -2.3292179107666016, "logps/chosen": -0.9075754284858704, "logps/rejected": -1.027204155921936, "loss": 1.675, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8151508569717407, "rewards/margins": 0.23925738036632538, "rewards/rejected": -2.054408311843872, "step": 325 }, { "epoch": 0.08636482596179011, "grad_norm": 5.78125, "learning_rate": 2.5848563968668404e-07, "logits/chosen": -2.451565980911255, "logits/rejected": -2.311919689178467, "logps/chosen": -0.9026411175727844, "logps/rejected": -1.085221290588379, "loss": 1.6007, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8052822351455688, "rewards/margins": 0.3651603162288666, "rewards/rejected": -2.170442581176758, "step": 330 }, { "epoch": 0.08767338393090814, "grad_norm": 7.03125, "learning_rate": 2.6240208877284596e-07, "logits/chosen": -2.3653557300567627, "logits/rejected": -2.398683547973633, "logps/chosen": -0.9381823539733887, "logps/rejected": -1.0668904781341553, "loss": 1.6937, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8763647079467773, "rewards/margins": 0.25741633772850037, "rewards/rejected": -2.1337809562683105, "step": 335 }, { "epoch": 0.08898194190002617, "grad_norm": 6.1875, "learning_rate": 2.6631853785900783e-07, "logits/chosen": -2.43845796585083, "logits/rejected": -2.3552565574645996, "logps/chosen": -0.887952983379364, "logps/rejected": -1.068477988243103, "loss": 1.597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.775905966758728, "rewards/margins": 0.36104997992515564, "rewards/rejected": -2.136955976486206, "step": 340 }, { "epoch": 0.0902904998691442, "grad_norm": 10.9375, "learning_rate": 2.702349869451697e-07, "logits/chosen": -2.3344759941101074, "logits/rejected": -2.3171322345733643, "logps/chosen": -0.916831374168396, "logps/rejected": -0.9886277318000793, "loss": 1.7309, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.833662748336792, "rewards/margins": 0.14359267055988312, "rewards/rejected": -1.9772554636001587, "step": 345 }, { "epoch": 0.09159905783826224, "grad_norm": 6.25, "learning_rate": 2.7415143603133156e-07, "logits/chosen": -2.342546224594116, "logits/rejected": -2.138016700744629, "logps/chosen": -0.9402766227722168, "logps/rejected": -1.1056863069534302, "loss": 1.618, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.8805532455444336, "rewards/margins": 0.3308192491531372, "rewards/rejected": -2.2113726139068604, "step": 350 }, { "epoch": 0.09290761580738027, "grad_norm": 10.125, "learning_rate": 2.7806788511749343e-07, "logits/chosen": -2.462902545928955, "logits/rejected": -2.3874893188476562, "logps/chosen": -0.9408588409423828, "logps/rejected": -1.098220944404602, "loss": 1.6081, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8817176818847656, "rewards/margins": 0.3147239089012146, "rewards/rejected": -2.196441888809204, "step": 355 }, { "epoch": 0.0942161737764983, "grad_norm": 4.28125, "learning_rate": 2.8198433420365535e-07, "logits/chosen": -2.5288405418395996, "logits/rejected": -2.3516831398010254, "logps/chosen": -0.8734146356582642, "logps/rejected": -1.1018562316894531, "loss": 1.507, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7468292713165283, "rewards/margins": 0.4568832814693451, "rewards/rejected": -2.2037124633789062, "step": 360 }, { "epoch": 0.09552473174561633, "grad_norm": 6.78125, "learning_rate": 2.859007832898172e-07, "logits/chosen": -2.4478681087493896, "logits/rejected": -2.291496753692627, "logps/chosen": -0.9448345899581909, "logps/rejected": -1.1577575206756592, "loss": 1.5438, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.8896691799163818, "rewards/margins": 0.4258459210395813, "rewards/rejected": -2.3155150413513184, "step": 365 }, { "epoch": 0.09683328971473436, "grad_norm": 9.0625, "learning_rate": 2.8981723237597913e-07, "logits/chosen": -2.3275673389434814, "logits/rejected": -2.2965145111083984, "logps/chosen": -0.8997896313667297, "logps/rejected": -1.0529388189315796, "loss": 1.6759, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.7995792627334595, "rewards/margins": 0.3062984347343445, "rewards/rejected": -2.105877637863159, "step": 370 }, { "epoch": 0.0981418476838524, "grad_norm": 5.34375, "learning_rate": 2.93733681462141e-07, "logits/chosen": -2.387625217437744, "logits/rejected": -2.342306137084961, "logps/chosen": -0.9206598401069641, "logps/rejected": -1.0559571981430054, "loss": 1.6538, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8413196802139282, "rewards/margins": 0.27059483528137207, "rewards/rejected": -2.1119143962860107, "step": 375 }, { "epoch": 0.09945040565297043, "grad_norm": 7.34375, "learning_rate": 2.9765013054830287e-07, "logits/chosen": -2.420382022857666, "logits/rejected": -2.2946856021881104, "logps/chosen": -0.8861302137374878, "logps/rejected": -1.0251914262771606, "loss": 1.6312, "rewards/accuracies": 0.625, "rewards/chosen": -1.7722604274749756, "rewards/margins": 0.27812227606773376, "rewards/rejected": -2.0503828525543213, "step": 380 }, { "epoch": 0.10075896362208846, "grad_norm": 10.8125, "learning_rate": 2.9999974949918995e-07, "logits/chosen": -2.401793956756592, "logits/rejected": -2.3295111656188965, "logps/chosen": -0.9263688921928406, "logps/rejected": -1.0271470546722412, "loss": 1.6874, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.8527377843856812, "rewards/margins": 0.20155613124370575, "rewards/rejected": -2.0542941093444824, "step": 385 }, { "epoch": 0.1020675215912065, "grad_norm": 9.375, "learning_rate": 2.9999693137468605e-07, "logits/chosen": -2.4672961235046387, "logits/rejected": -2.445577621459961, "logps/chosen": -0.9505416750907898, "logps/rejected": -1.0743643045425415, "loss": 1.6756, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9010833501815796, "rewards/margins": 0.2476452887058258, "rewards/rejected": -2.148728609085083, "step": 390 }, { "epoch": 0.10337607956032452, "grad_norm": 9.1875, "learning_rate": 2.9999098205869016e-07, "logits/chosen": -2.4404184818267822, "logits/rejected": -2.380148410797119, "logps/chosen": -0.9004305005073547, "logps/rejected": -1.0790345668792725, "loss": 1.6199, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8008610010147095, "rewards/margins": 0.35720810294151306, "rewards/rejected": -2.158069133758545, "step": 395 }, { "epoch": 0.10468463752944256, "grad_norm": 4.09375, "learning_rate": 2.999819016753946e-07, "logits/chosen": -2.449219226837158, "logits/rejected": -2.402337074279785, "logps/chosen": -0.9261224865913391, "logps/rejected": -1.0990149974822998, "loss": 1.6117, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8522449731826782, "rewards/margins": 0.3457852900028229, "rewards/rejected": -2.1980299949645996, "step": 400 }, { "epoch": 0.10468463752944256, "eval_logits/chosen": -2.2513277530670166, "eval_logits/rejected": -2.1454787254333496, "eval_logps/chosen": -0.952590823173523, "eval_logps/rejected": -1.121248722076416, "eval_loss": 1.6171038150787354, "eval_rewards/accuracies": 0.6010000109672546, "eval_rewards/chosen": -1.905181646347046, "eval_rewards/margins": 0.3373158872127533, "eval_rewards/rejected": -2.242497444152832, "eval_runtime": 424.333, "eval_samples_per_second": 4.713, "eval_steps_per_second": 1.178, "step": 400 }, { "epoch": 0.10599319549856058, "grad_norm": 8.0625, "learning_rate": 2.9996969041435263e-07, "logits/chosen": -2.3188745975494385, "logits/rejected": -2.261295795440674, "logps/chosen": -0.9594962000846863, "logps/rejected": -1.0835071802139282, "loss": 1.7107, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9189924001693726, "rewards/margins": 0.24802187085151672, "rewards/rejected": -2.1670143604278564, "step": 405 }, { "epoch": 0.10730175346767862, "grad_norm": 8.25, "learning_rate": 2.9995434853047485e-07, "logits/chosen": -2.4103803634643555, "logits/rejected": -2.442323684692383, "logps/chosen": -0.9610514640808105, "logps/rejected": -1.1024830341339111, "loss": 1.6239, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.922102928161621, "rewards/margins": 0.2828632593154907, "rewards/rejected": -2.2049660682678223, "step": 410 }, { "epoch": 0.10861031143679666, "grad_norm": 10.4375, "learning_rate": 2.999358763440235e-07, "logits/chosen": -2.459806442260742, "logits/rejected": -2.320763111114502, "logps/chosen": -0.9868751764297485, "logps/rejected": -1.0588407516479492, "loss": 1.7821, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.973750352859497, "rewards/margins": 0.14393123984336853, "rewards/rejected": -2.1176815032958984, "step": 415 }, { "epoch": 0.10991886940591468, "grad_norm": 8.125, "learning_rate": 2.9991427424060636e-07, "logits/chosen": -2.3800106048583984, "logits/rejected": -2.354858160018921, "logps/chosen": -0.9697184562683105, "logps/rejected": -1.0938130617141724, "loss": 1.6764, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.939436912536621, "rewards/margins": 0.24818918108940125, "rewards/rejected": -2.1876261234283447, "step": 420 }, { "epoch": 0.11122742737503272, "grad_norm": 9.875, "learning_rate": 2.99889542671168e-07, "logits/chosen": -2.3663439750671387, "logits/rejected": -2.3495450019836426, "logps/chosen": -0.9690505266189575, "logps/rejected": -1.0756759643554688, "loss": 1.7196, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.938101053237915, "rewards/margins": 0.21325090527534485, "rewards/rejected": -2.1513519287109375, "step": 425 }, { "epoch": 0.11253598534415074, "grad_norm": 7.59375, "learning_rate": 2.998616821519809e-07, "logits/chosen": -2.491093397140503, "logits/rejected": -2.3764915466308594, "logps/chosen": -0.9218136668205261, "logps/rejected": -1.0008352994918823, "loss": 1.7313, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8436273336410522, "rewards/margins": 0.15804331004619598, "rewards/rejected": -2.0016705989837646, "step": 430 }, { "epoch": 0.11384454331326878, "grad_norm": 9.4375, "learning_rate": 2.9983069326463446e-07, "logits/chosen": -2.464858055114746, "logits/rejected": -2.362672805786133, "logps/chosen": -0.9647724032402039, "logps/rejected": -1.1482311487197876, "loss": 1.5803, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9295448064804077, "rewards/margins": 0.3669171929359436, "rewards/rejected": -2.296462297439575, "step": 435 }, { "epoch": 0.11515310128238682, "grad_norm": 7.125, "learning_rate": 2.997965766560227e-07, "logits/chosen": -2.400057554244995, "logits/rejected": -2.3539984226226807, "logps/chosen": -0.9212436676025391, "logps/rejected": -1.0166393518447876, "loss": 1.7191, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8424873352050781, "rewards/margins": 0.19079145789146423, "rewards/rejected": -2.033278703689575, "step": 440 }, { "epoch": 0.11646165925150484, "grad_norm": 8.875, "learning_rate": 2.9975933303833125e-07, "logits/chosen": -2.422158718109131, "logits/rejected": -2.389476776123047, "logps/chosen": -0.8293859362602234, "logps/rejected": -1.0167236328125, "loss": 1.5636, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6587718725204468, "rewards/margins": 0.3746754229068756, "rewards/rejected": -2.033447265625, "step": 445 }, { "epoch": 0.11777021722062288, "grad_norm": 9.25, "learning_rate": 2.9971896318902195e-07, "logits/chosen": -2.3260674476623535, "logits/rejected": -2.216798782348633, "logps/chosen": -1.0063860416412354, "logps/rejected": -1.1665751934051514, "loss": 1.6509, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0127720832824707, "rewards/margins": 0.32037821412086487, "rewards/rejected": -2.3331503868103027, "step": 450 }, { "epoch": 0.1190787751897409, "grad_norm": 4.96875, "learning_rate": 2.996754679508169e-07, "logits/chosen": -2.472568988800049, "logits/rejected": -2.3225364685058594, "logps/chosen": -0.9303165674209595, "logps/rejected": -0.9713878631591797, "loss": 1.7743, "rewards/accuracies": 0.5, "rewards/chosen": -1.860633134841919, "rewards/margins": 0.08214248716831207, "rewards/rejected": -1.9427757263183594, "step": 455 }, { "epoch": 0.12038733315885894, "grad_norm": 8.75, "learning_rate": 2.996288482316808e-07, "logits/chosen": -2.401106357574463, "logits/rejected": -2.2523350715637207, "logps/chosen": -0.9310356378555298, "logps/rejected": -1.0990873575210571, "loss": 1.648, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8620712757110596, "rewards/margins": 0.33610355854034424, "rewards/rejected": -2.1981747150421143, "step": 460 }, { "epoch": 0.12169589112797696, "grad_norm": 4.5, "learning_rate": 2.9957910500480206e-07, "logits/chosen": -2.4034886360168457, "logits/rejected": -2.3578133583068848, "logps/chosen": -0.9459611177444458, "logps/rejected": -1.1297959089279175, "loss": 1.5968, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8919222354888916, "rewards/margins": 0.36766964197158813, "rewards/rejected": -2.259591817855835, "step": 465 }, { "epoch": 0.123004449097095, "grad_norm": 6.0625, "learning_rate": 2.995262393085723e-07, "logits/chosen": -2.401829481124878, "logits/rejected": -2.197314500808716, "logps/chosen": -0.940493106842041, "logps/rejected": -1.1122708320617676, "loss": 1.6295, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.880986213684082, "rewards/margins": 0.3435554802417755, "rewards/rejected": -2.224541664123535, "step": 470 }, { "epoch": 0.12431300706621304, "grad_norm": 8.5, "learning_rate": 2.9947025224656487e-07, "logits/chosen": -2.3963265419006348, "logits/rejected": -2.367873191833496, "logps/chosen": -1.008406639099121, "logps/rejected": -1.0969040393829346, "loss": 1.7713, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.016813278198242, "rewards/margins": 0.1769949495792389, "rewards/rejected": -2.193808078765869, "step": 475 }, { "epoch": 0.12562156503533106, "grad_norm": 6.09375, "learning_rate": 2.994111449875119e-07, "logits/chosen": -2.3887929916381836, "logits/rejected": -2.323988437652588, "logps/chosen": -0.9816536903381348, "logps/rejected": -1.1715774536132812, "loss": 1.5923, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9633073806762695, "rewards/margins": 0.37984758615493774, "rewards/rejected": -2.3431549072265625, "step": 480 }, { "epoch": 0.12693012300444909, "grad_norm": 3.984375, "learning_rate": 2.993489187652795e-07, "logits/chosen": -2.2953505516052246, "logits/rejected": -2.3416717052459717, "logps/chosen": -0.8750378489494324, "logps/rejected": -1.0500514507293701, "loss": 1.6116, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.7500756978988647, "rewards/margins": 0.35002726316452026, "rewards/rejected": -2.1001029014587402, "step": 485 }, { "epoch": 0.12823868097356714, "grad_norm": 5.8125, "learning_rate": 2.9928357487884233e-07, "logits/chosen": -2.4674229621887207, "logits/rejected": -2.3832409381866455, "logps/chosen": -0.9203821420669556, "logps/rejected": -1.0805714130401611, "loss": 1.6943, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8407642841339111, "rewards/margins": 0.3203783631324768, "rewards/rejected": -2.1611428260803223, "step": 490 }, { "epoch": 0.12954723894268516, "grad_norm": 5.28125, "learning_rate": 2.9921511469225654e-07, "logits/chosen": -2.4352641105651855, "logits/rejected": -2.2938902378082275, "logps/chosen": -0.9046840667724609, "logps/rejected": -1.0541527271270752, "loss": 1.6367, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8093681335449219, "rewards/margins": 0.2989373207092285, "rewards/rejected": -2.1083054542541504, "step": 495 }, { "epoch": 0.13085579691180318, "grad_norm": 8.4375, "learning_rate": 2.99143539634631e-07, "logits/chosen": -2.472432851791382, "logits/rejected": -2.32405161857605, "logps/chosen": -0.895167350769043, "logps/rejected": -1.0566622018814087, "loss": 1.6128, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.790334701538086, "rewards/margins": 0.3229896128177643, "rewards/rejected": -2.1133244037628174, "step": 500 }, { "epoch": 0.13216435488092124, "grad_norm": 6.65625, "learning_rate": 2.990688512000977e-07, "logits/chosen": -2.3493149280548096, "logits/rejected": -2.3101589679718018, "logps/chosen": -0.9914226531982422, "logps/rejected": -1.2395421266555786, "loss": 1.5934, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9828453063964844, "rewards/margins": 0.4962390959262848, "rewards/rejected": -2.4790842533111572, "step": 505 }, { "epoch": 0.13347291285003926, "grad_norm": 7.75, "learning_rate": 2.989910509477805e-07, "logits/chosen": -2.436180591583252, "logits/rejected": -2.262840747833252, "logps/chosen": -0.996168315410614, "logps/rejected": -1.119922399520874, "loss": 1.6781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.992336630821228, "rewards/margins": 0.24750828742980957, "rewards/rejected": -2.239844799041748, "step": 510 }, { "epoch": 0.13478147081915728, "grad_norm": 4.75, "learning_rate": 2.9891014050176247e-07, "logits/chosen": -2.3457024097442627, "logits/rejected": -2.32684326171875, "logps/chosen": -1.0045838356018066, "logps/rejected": -1.1608080863952637, "loss": 1.6557, "rewards/accuracies": 0.625, "rewards/chosen": -2.0091676712036133, "rewards/margins": 0.31244856119155884, "rewards/rejected": -2.3216161727905273, "step": 515 }, { "epoch": 0.1360900287882753, "grad_norm": 10.3125, "learning_rate": 2.9882612155105215e-07, "logits/chosen": -2.3013076782226562, "logits/rejected": -2.3133058547973633, "logps/chosen": -1.037622332572937, "logps/rejected": -1.1788644790649414, "loss": 1.676, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.075244665145874, "rewards/margins": 0.2824842631816864, "rewards/rejected": -2.357728958129883, "step": 520 }, { "epoch": 0.13739858675739336, "grad_norm": 8.5, "learning_rate": 2.987389958495484e-07, "logits/chosen": -2.463963270187378, "logits/rejected": -2.3986783027648926, "logps/chosen": -0.9441148638725281, "logps/rejected": -1.1538679599761963, "loss": 1.5847, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.8882297277450562, "rewards/margins": 0.4195060133934021, "rewards/rejected": -2.3077359199523926, "step": 525 }, { "epoch": 0.13870714472651138, "grad_norm": 7.1875, "learning_rate": 2.9864876521600336e-07, "logits/chosen": -2.449537754058838, "logits/rejected": -2.380209445953369, "logps/chosen": -0.9405506253242493, "logps/rejected": -1.1435946226119995, "loss": 1.5474, "rewards/accuracies": 0.625, "rewards/chosen": -1.8811012506484985, "rewards/margins": 0.4060877859592438, "rewards/rejected": -2.287189245223999, "step": 530 }, { "epoch": 0.1400157026956294, "grad_norm": 7.03125, "learning_rate": 2.985554315339848e-07, "logits/chosen": -2.478217601776123, "logits/rejected": -2.30013370513916, "logps/chosen": -0.9366180300712585, "logps/rejected": -1.178182601928711, "loss": 1.5548, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.873236060142517, "rewards/margins": 0.48312908411026, "rewards/rejected": -2.356365203857422, "step": 535 }, { "epoch": 0.14132426066474746, "grad_norm": 5.0, "learning_rate": 2.9845899675183687e-07, "logits/chosen": -2.4434995651245117, "logits/rejected": -2.3974361419677734, "logps/chosen": -0.8716647028923035, "logps/rejected": -1.0463964939117432, "loss": 1.6043, "rewards/accuracies": 0.625, "rewards/chosen": -1.743329405784607, "rewards/margins": 0.3494636118412018, "rewards/rejected": -2.0927929878234863, "step": 540 }, { "epoch": 0.14263281863386548, "grad_norm": 16.375, "learning_rate": 2.983594628826392e-07, "logits/chosen": -2.3764243125915527, "logits/rejected": -2.3367056846618652, "logps/chosen": -0.9412527084350586, "logps/rejected": -1.105259895324707, "loss": 1.6784, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8825054168701172, "rewards/margins": 0.3280145525932312, "rewards/rejected": -2.210519790649414, "step": 545 }, { "epoch": 0.1439413766029835, "grad_norm": 6.9375, "learning_rate": 2.9825683200416494e-07, "logits/chosen": -2.454298973083496, "logits/rejected": -2.301304340362549, "logps/chosen": -0.9309526681900024, "logps/rejected": -1.1586534976959229, "loss": 1.5695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8619053363800049, "rewards/margins": 0.45540183782577515, "rewards/rejected": -2.3173069953918457, "step": 550 }, { "epoch": 0.14524993457210156, "grad_norm": 7.34375, "learning_rate": 2.981511062588375e-07, "logits/chosen": -2.4893765449523926, "logits/rejected": -2.279204845428467, "logps/chosen": -0.9922503232955933, "logps/rejected": -1.1499555110931396, "loss": 1.6865, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9845006465911865, "rewards/margins": 0.31541046500205994, "rewards/rejected": -2.2999110221862793, "step": 555 }, { "epoch": 0.14655849254121958, "grad_norm": 7.34375, "learning_rate": 2.9804228785368573e-07, "logits/chosen": -2.3789010047912598, "logits/rejected": -2.272644519805908, "logps/chosen": -0.9705682992935181, "logps/rejected": -1.1720385551452637, "loss": 1.578, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9411365985870361, "rewards/margins": 0.4029403328895569, "rewards/rejected": -2.3440771102905273, "step": 560 }, { "epoch": 0.1478670505103376, "grad_norm": 4.25, "learning_rate": 2.9793037906029786e-07, "logits/chosen": -2.4314959049224854, "logits/rejected": -2.3498497009277344, "logps/chosen": -0.9575311541557312, "logps/rejected": -1.1182386875152588, "loss": 1.6654, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9150623083114624, "rewards/margins": 0.32141512632369995, "rewards/rejected": -2.2364773750305176, "step": 565 }, { "epoch": 0.14917560847945563, "grad_norm": 10.4375, "learning_rate": 2.9781538221477413e-07, "logits/chosen": -2.4762187004089355, "logits/rejected": -2.279405117034912, "logps/chosen": -0.9812570810317993, "logps/rejected": -1.174328327178955, "loss": 1.5668, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9625141620635986, "rewards/margins": 0.3861425518989563, "rewards/rejected": -2.34865665435791, "step": 570 }, { "epoch": 0.15048416644857368, "grad_norm": 6.9375, "learning_rate": 2.976972997176779e-07, "logits/chosen": -2.3956246376037598, "logits/rejected": -2.2909069061279297, "logps/chosen": -0.9791876077651978, "logps/rejected": -1.2195472717285156, "loss": 1.5671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9583752155303955, "rewards/margins": 0.4807194173336029, "rewards/rejected": -2.4390945434570312, "step": 575 }, { "epoch": 0.1517927244176917, "grad_norm": 8.125, "learning_rate": 2.9757613403398566e-07, "logits/chosen": -2.4154441356658936, "logits/rejected": -2.281869888305664, "logps/chosen": -0.986433207988739, "logps/rejected": -1.1436251401901245, "loss": 1.6565, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.972866415977478, "rewards/margins": 0.31438392400741577, "rewards/rejected": -2.287250280380249, "step": 580 }, { "epoch": 0.15310128238680973, "grad_norm": 6.15625, "learning_rate": 2.9745188769303555e-07, "logits/chosen": -2.379331588745117, "logits/rejected": -2.3229763507843018, "logps/chosen": -1.0146372318267822, "logps/rejected": -1.069580316543579, "loss": 1.7914, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0292744636535645, "rewards/margins": 0.10988602787256241, "rewards/rejected": -2.139160633087158, "step": 585 }, { "epoch": 0.15440984035592778, "grad_norm": 8.4375, "learning_rate": 2.973245632884746e-07, "logits/chosen": -2.4001305103302, "logits/rejected": -2.2695565223693848, "logps/chosen": -1.0788220167160034, "logps/rejected": -1.2275183200836182, "loss": 1.7213, "rewards/accuracies": 0.5625, "rewards/chosen": -2.157644033432007, "rewards/margins": 0.2973926365375519, "rewards/rejected": -2.4550366401672363, "step": 590 }, { "epoch": 0.1557183983250458, "grad_norm": 7.5, "learning_rate": 2.9719416347820435e-07, "logits/chosen": -2.3957741260528564, "logits/rejected": -2.31162691116333, "logps/chosen": -0.8724110722541809, "logps/rejected": -1.201525330543518, "loss": 1.4121, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7448221445083618, "rewards/margins": 0.6582284569740295, "rewards/rejected": -2.403050661087036, "step": 595 }, { "epoch": 0.15702695629416383, "grad_norm": 7.15625, "learning_rate": 2.970606909843257e-07, "logits/chosen": -2.3641343116760254, "logits/rejected": -2.3780410289764404, "logps/chosen": -0.9861353039741516, "logps/rejected": -1.0746090412139893, "loss": 1.7419, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.9722706079483032, "rewards/margins": 0.1769472062587738, "rewards/rejected": -2.1492180824279785, "step": 600 }, { "epoch": 0.15833551426328185, "grad_norm": 8.5625, "learning_rate": 2.96924148593082e-07, "logits/chosen": -2.5523200035095215, "logits/rejected": -2.3768351078033447, "logps/chosen": -0.9870797991752625, "logps/rejected": -1.2453333139419556, "loss": 1.5644, "rewards/accuracies": 0.625, "rewards/chosen": -1.974159598350525, "rewards/margins": 0.5165067911148071, "rewards/rejected": -2.490666627883911, "step": 605 }, { "epoch": 0.1596440722323999, "grad_norm": 13.4375, "learning_rate": 2.967845391548006e-07, "logits/chosen": -2.489711284637451, "logits/rejected": -2.3177313804626465, "logps/chosen": -0.9760478138923645, "logps/rejected": -1.1500861644744873, "loss": 1.6019, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.952095627784729, "rewards/margins": 0.34807658195495605, "rewards/rejected": -2.3001723289489746, "step": 610 }, { "epoch": 0.16095263020151793, "grad_norm": 15.75, "learning_rate": 2.966418655838337e-07, "logits/chosen": -2.2412636280059814, "logits/rejected": -2.126152515411377, "logps/chosen": -0.962241530418396, "logps/rejected": -1.151241660118103, "loss": 1.6074, "rewards/accuracies": 0.625, "rewards/chosen": -1.924483060836792, "rewards/margins": 0.37800025939941406, "rewards/rejected": -2.302483320236206, "step": 615 }, { "epoch": 0.16226118817063595, "grad_norm": 5.5, "learning_rate": 2.9649613085849746e-07, "logits/chosen": -2.398313522338867, "logits/rejected": -2.177938938140869, "logps/chosen": -0.890715479850769, "logps/rejected": -1.1732763051986694, "loss": 1.4465, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.781430959701538, "rewards/margins": 0.5651217699050903, "rewards/rejected": -2.346552610397339, "step": 620 }, { "epoch": 0.163569746139754, "grad_norm": 6.75, "learning_rate": 2.9634733802100955e-07, "logits/chosen": -2.267308473587036, "logits/rejected": -2.2396576404571533, "logps/chosen": -0.9639409184455872, "logps/rejected": -1.0920507907867432, "loss": 1.6433, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9278818368911743, "rewards/margins": 0.2562195062637329, "rewards/rejected": -2.1841015815734863, "step": 625 }, { "epoch": 0.16487830410887203, "grad_norm": 15.8125, "learning_rate": 2.96195490177426e-07, "logits/chosen": -2.475383996963501, "logits/rejected": -2.4182159900665283, "logps/chosen": -0.9371517300605774, "logps/rejected": -1.134628176689148, "loss": 1.6209, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.8743034601211548, "rewards/margins": 0.3949531316757202, "rewards/rejected": -2.269256353378296, "step": 630 }, { "epoch": 0.16618686207799005, "grad_norm": 8.4375, "learning_rate": 2.9604059049757624e-07, "logits/chosen": -2.368704319000244, "logits/rejected": -2.3213417530059814, "logps/chosen": -0.9990984201431274, "logps/rejected": -1.0755655765533447, "loss": 1.7315, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.9981968402862549, "rewards/margins": 0.1529347002506256, "rewards/rejected": -2.1511311531066895, "step": 635 }, { "epoch": 0.16749542004710807, "grad_norm": 9.4375, "learning_rate": 2.958826422149967e-07, "logits/chosen": -2.427081346511841, "logits/rejected": -2.3457372188568115, "logps/chosen": -0.8863208889961243, "logps/rejected": -1.067103624343872, "loss": 1.5663, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7726417779922485, "rewards/margins": 0.36156538128852844, "rewards/rejected": -2.134207248687744, "step": 640 }, { "epoch": 0.16880397801622612, "grad_norm": 6.59375, "learning_rate": 2.957216486268637e-07, "logits/chosen": -2.330294370651245, "logits/rejected": -2.202291965484619, "logps/chosen": -1.0088696479797363, "logps/rejected": -1.1333798170089722, "loss": 1.7278, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.0177392959594727, "rewards/margins": 0.2490203082561493, "rewards/rejected": -2.2667596340179443, "step": 645 }, { "epoch": 0.17011253598534415, "grad_norm": 7.9375, "learning_rate": 2.9555761309392436e-07, "logits/chosen": -2.4448795318603516, "logits/rejected": -2.273897647857666, "logps/chosen": -0.9754883646965027, "logps/rejected": -1.150264024734497, "loss": 1.602, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9509767293930054, "rewards/margins": 0.3495512008666992, "rewards/rejected": -2.300528049468994, "step": 650 }, { "epoch": 0.17142109395446217, "grad_norm": 8.25, "learning_rate": 2.953905390404264e-07, "logits/chosen": -2.3641345500946045, "logits/rejected": -2.2323379516601562, "logps/chosen": -1.0143330097198486, "logps/rejected": -1.2682030200958252, "loss": 1.5373, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0286660194396973, "rewards/margins": 0.5077397227287292, "rewards/rejected": -2.5364060401916504, "step": 655 }, { "epoch": 0.17272965192358022, "grad_norm": 8.4375, "learning_rate": 2.9522042995404697e-07, "logits/chosen": -2.4380645751953125, "logits/rejected": -2.2877676486968994, "logps/chosen": -0.9197438955307007, "logps/rejected": -1.100537657737732, "loss": 1.5944, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8394877910614014, "rewards/margins": 0.3615874648094177, "rewards/rejected": -2.201075315475464, "step": 660 }, { "epoch": 0.17403820989269825, "grad_norm": 9.125, "learning_rate": 2.9504728938581943e-07, "logits/chosen": -2.480069398880005, "logits/rejected": -2.4035069942474365, "logps/chosen": -0.9758340716362, "logps/rejected": -1.1643568277359009, "loss": 1.6231, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9516681432724, "rewards/margins": 0.37704557180404663, "rewards/rejected": -2.3287136554718018, "step": 665 }, { "epoch": 0.17534676786181627, "grad_norm": 5.84375, "learning_rate": 2.9487112095005966e-07, "logits/chosen": -2.451627254486084, "logits/rejected": -2.365042209625244, "logps/chosen": -1.016111135482788, "logps/rejected": -1.2239553928375244, "loss": 1.5983, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.032222270965576, "rewards/margins": 0.4156881272792816, "rewards/rejected": -2.447910785675049, "step": 670 }, { "epoch": 0.17665532583093432, "grad_norm": 7.75, "learning_rate": 2.946919283242902e-07, "logits/chosen": -2.348477840423584, "logits/rejected": -2.3489179611206055, "logps/chosen": -1.0354164838790894, "logps/rejected": -1.194151520729065, "loss": 1.662, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0708329677581787, "rewards/margins": 0.31747013330459595, "rewards/rejected": -2.38830304145813, "step": 675 }, { "epoch": 0.17796388380005235, "grad_norm": 12.8125, "learning_rate": 2.9450971524916383e-07, "logits/chosen": -2.379840135574341, "logits/rejected": -2.3483166694641113, "logps/chosen": -0.9533805847167969, "logps/rejected": -1.1625690460205078, "loss": 1.583, "rewards/accuracies": 0.625, "rewards/chosen": -1.9067611694335938, "rewards/margins": 0.41837677359580994, "rewards/rejected": -2.3251380920410156, "step": 680 }, { "epoch": 0.17927244176917037, "grad_norm": 21.625, "learning_rate": 2.9432448552838516e-07, "logits/chosen": -2.4483020305633545, "logits/rejected": -2.3117237091064453, "logps/chosen": -1.020320177078247, "logps/rejected": -1.2638919353485107, "loss": 1.6615, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.040640354156494, "rewards/margins": 0.48714321851730347, "rewards/rejected": -2.5277838706970215, "step": 685 }, { "epoch": 0.1805809997382884, "grad_norm": 14.375, "learning_rate": 2.941362430286315e-07, "logits/chosen": -2.4380064010620117, "logits/rejected": -2.331979274749756, "logps/chosen": -1.0060025453567505, "logps/rejected": -1.2595679759979248, "loss": 1.5215, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.012005090713501, "rewards/margins": 0.5071309804916382, "rewards/rejected": -2.5191359519958496, "step": 690 }, { "epoch": 0.18188955770740645, "grad_norm": 5.09375, "learning_rate": 2.9394499167947193e-07, "logits/chosen": -2.4510135650634766, "logits/rejected": -2.295680284500122, "logps/chosen": -0.9680082201957703, "logps/rejected": -1.2023645639419556, "loss": 1.5592, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9360164403915405, "rewards/margins": 0.46871232986450195, "rewards/rejected": -2.404729127883911, "step": 695 }, { "epoch": 0.18319811567652447, "grad_norm": 4.65625, "learning_rate": 2.9375073547328555e-07, "logits/chosen": -2.427438259124756, "logits/rejected": -2.313152313232422, "logps/chosen": -0.9715389013290405, "logps/rejected": -1.2043567895889282, "loss": 1.5339, "rewards/accuracies": 0.625, "rewards/chosen": -1.943077802658081, "rewards/margins": 0.46563559770584106, "rewards/rejected": -2.4087135791778564, "step": 700 }, { "epoch": 0.1845066736456425, "grad_norm": 7.65625, "learning_rate": 2.935534784651778e-07, "logits/chosen": -2.408898115158081, "logits/rejected": -2.326305389404297, "logps/chosen": -0.9936673045158386, "logps/rejected": -1.2327395677566528, "loss": 1.5409, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9873346090316772, "rewards/margins": 0.47814446687698364, "rewards/rejected": -2.4654791355133057, "step": 705 }, { "epoch": 0.18581523161476055, "grad_norm": 5.28125, "learning_rate": 2.93353224772896e-07, "logits/chosen": -2.4771580696105957, "logits/rejected": -2.3489770889282227, "logps/chosen": -0.8804000616073608, "logps/rejected": -1.0891720056533813, "loss": 1.5935, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.7608001232147217, "rewards/margins": 0.4175436496734619, "rewards/rejected": -2.1783440113067627, "step": 710 }, { "epoch": 0.18712378958387857, "grad_norm": 7.09375, "learning_rate": 2.9314997857674333e-07, "logits/chosen": -2.4162282943725586, "logits/rejected": -2.3713278770446777, "logps/chosen": -1.057502031326294, "logps/rejected": -1.24507737159729, "loss": 1.6408, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.115004062652588, "rewards/margins": 0.3751503825187683, "rewards/rejected": -2.49015474319458, "step": 715 }, { "epoch": 0.1884323475529966, "grad_norm": 5.3125, "learning_rate": 2.929437441194918e-07, "logits/chosen": -2.466341018676758, "logits/rejected": -2.368431568145752, "logps/chosen": -0.9546599388122559, "logps/rejected": -1.1010894775390625, "loss": 1.6843, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9093198776245117, "rewards/margins": 0.2928588390350342, "rewards/rejected": -2.202178955078125, "step": 720 }, { "epoch": 0.18974090552211462, "grad_norm": 4.75, "learning_rate": 2.9273452570629324e-07, "logits/chosen": -2.320737838745117, "logits/rejected": -2.258985996246338, "logps/chosen": -0.9658737182617188, "logps/rejected": -1.0923596620559692, "loss": 1.6981, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9317474365234375, "rewards/margins": 0.2529716491699219, "rewards/rejected": -2.1847193241119385, "step": 725 }, { "epoch": 0.19104946349123267, "grad_norm": 4.59375, "learning_rate": 2.9252232770458974e-07, "logits/chosen": -2.4028193950653076, "logits/rejected": -2.344017505645752, "logps/chosen": -0.9385544061660767, "logps/rejected": -1.1675488948822021, "loss": 1.5605, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8771088123321533, "rewards/margins": 0.45798879861831665, "rewards/rejected": -2.3350977897644043, "step": 730 }, { "epoch": 0.1923580214603507, "grad_norm": 11.625, "learning_rate": 2.9230715454402253e-07, "logits/chosen": -2.330263137817383, "logits/rejected": -2.157097339630127, "logps/chosen": -1.1042064428329468, "logps/rejected": -1.2024831771850586, "loss": 1.7607, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.2084128856658936, "rewards/margins": 0.1965535581111908, "rewards/rejected": -2.404966354370117, "step": 735 }, { "epoch": 0.19366657942946872, "grad_norm": 4.15625, "learning_rate": 2.9208901071633923e-07, "logits/chosen": -2.4216017723083496, "logits/rejected": -2.3744590282440186, "logps/chosen": -0.8338273763656616, "logps/rejected": -1.1236623525619507, "loss": 1.4663, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6676547527313232, "rewards/margins": 0.5796698927879333, "rewards/rejected": -2.2473247051239014, "step": 740 }, { "epoch": 0.19497513739858677, "grad_norm": 7.4375, "learning_rate": 2.9186790077530036e-07, "logits/chosen": -2.426046848297119, "logits/rejected": -2.312283515930176, "logps/chosen": -0.9459326863288879, "logps/rejected": -1.1866517066955566, "loss": 1.5275, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.8918653726577759, "rewards/margins": 0.481438010931015, "rewards/rejected": -2.3733034133911133, "step": 745 }, { "epoch": 0.1962836953677048, "grad_norm": 10.25, "learning_rate": 2.9164382933658406e-07, "logits/chosen": -2.3249096870422363, "logits/rejected": -2.2094438076019287, "logps/chosen": -1.0056768655776978, "logps/rejected": -1.1798431873321533, "loss": 1.6378, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0113537311553955, "rewards/margins": 0.3483326733112335, "rewards/rejected": -2.3596863746643066, "step": 750 }, { "epoch": 0.19759225333682282, "grad_norm": 4.9375, "learning_rate": 2.9141680107768996e-07, "logits/chosen": -2.4671313762664795, "logits/rejected": -2.3500142097473145, "logps/chosen": -1.0276011228561401, "logps/rejected": -1.203189492225647, "loss": 1.6342, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0552022457122803, "rewards/margins": 0.3511764705181122, "rewards/rejected": -2.406378984451294, "step": 755 }, { "epoch": 0.19890081130594087, "grad_norm": 5.5625, "learning_rate": 2.911868207378413e-07, "logits/chosen": -2.3088057041168213, "logits/rejected": -2.2088522911071777, "logps/chosen": -1.0932196378707886, "logps/rejected": -1.2626073360443115, "loss": 1.6654, "rewards/accuracies": 0.5625, "rewards/chosen": -2.186439275741577, "rewards/margins": 0.3387756049633026, "rewards/rejected": -2.525214672088623, "step": 760 }, { "epoch": 0.2002093692750589, "grad_norm": 8.0, "learning_rate": 2.9095389311788627e-07, "logits/chosen": -2.365936279296875, "logits/rejected": -2.3454153537750244, "logps/chosen": -1.0347694158554077, "logps/rejected": -1.1502060890197754, "loss": 1.7376, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.0695388317108154, "rewards/margins": 0.23087334632873535, "rewards/rejected": -2.300412178039551, "step": 765 }, { "epoch": 0.20151792724417691, "grad_norm": 14.1875, "learning_rate": 2.907180230801974e-07, "logits/chosen": -2.4282312393188477, "logits/rejected": -2.323211669921875, "logps/chosen": -1.035776138305664, "logps/rejected": -1.0719871520996094, "loss": 1.8077, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.071552276611328, "rewards/margins": 0.07242242246866226, "rewards/rejected": -2.1439743041992188, "step": 770 }, { "epoch": 0.20282648521329494, "grad_norm": 10.0625, "learning_rate": 2.9047921554857067e-07, "logits/chosen": -2.5066514015197754, "logits/rejected": -2.405531167984009, "logps/chosen": -1.0375727415084839, "logps/rejected": -1.1584465503692627, "loss": 1.7324, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.0751454830169678, "rewards/margins": 0.24174734950065613, "rewards/rejected": -2.3168931007385254, "step": 775 }, { "epoch": 0.204135043182413, "grad_norm": 7.0625, "learning_rate": 2.90237475508122e-07, "logits/chosen": -2.410250186920166, "logits/rejected": -2.298802614212036, "logps/chosen": -1.067455530166626, "logps/rejected": -1.2438901662826538, "loss": 1.61, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.134911060333252, "rewards/margins": 0.3528691828250885, "rewards/rejected": -2.4877803325653076, "step": 780 }, { "epoch": 0.205443601151531, "grad_norm": 5.125, "learning_rate": 2.899928080051837e-07, "logits/chosen": -2.446481227874756, "logits/rejected": -2.4267303943634033, "logps/chosen": -1.0310351848602295, "logps/rejected": -1.233411192893982, "loss": 1.6188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.062070369720459, "rewards/margins": 0.4047521948814392, "rewards/rejected": -2.466822385787964, "step": 785 }, { "epoch": 0.20675215912064904, "grad_norm": 9.9375, "learning_rate": 2.89745218147199e-07, "logits/chosen": -2.451665163040161, "logits/rejected": -2.317182779312134, "logps/chosen": -0.9822245836257935, "logps/rejected": -1.1709480285644531, "loss": 1.6093, "rewards/accuracies": 0.625, "rewards/chosen": -1.964449167251587, "rewards/margins": 0.3774469792842865, "rewards/rejected": -2.3418960571289062, "step": 790 }, { "epoch": 0.2080607170897671, "grad_norm": 11.4375, "learning_rate": 2.894947111026152e-07, "logits/chosen": -2.475515365600586, "logits/rejected": -2.394200325012207, "logps/chosen": -0.9711693525314331, "logps/rejected": -1.206620216369629, "loss": 1.5534, "rewards/accuracies": 0.625, "rewards/chosen": -1.9423387050628662, "rewards/margins": 0.47090157866477966, "rewards/rejected": -2.413240432739258, "step": 795 }, { "epoch": 0.2093692750588851, "grad_norm": 8.375, "learning_rate": 2.8924129210077626e-07, "logits/chosen": -2.4767403602600098, "logits/rejected": -2.416543483734131, "logps/chosen": -0.9062013626098633, "logps/rejected": -1.1419422626495361, "loss": 1.5829, "rewards/accuracies": 0.625, "rewards/chosen": -1.8124027252197266, "rewards/margins": 0.47148171067237854, "rewards/rejected": -2.2838845252990723, "step": 800 }, { "epoch": 0.2093692750588851, "eval_logits/chosen": -2.2393040657043457, "eval_logits/rejected": -2.134131908416748, "eval_logps/chosen": -0.9938094615936279, "eval_logps/rejected": -1.2007477283477783, "eval_loss": 1.588818073272705, "eval_rewards/accuracies": 0.6159999966621399, "eval_rewards/chosen": -1.9876189231872559, "eval_rewards/margins": 0.41387680172920227, "eval_rewards/rejected": -2.4014954566955566, "eval_runtime": 424.168, "eval_samples_per_second": 4.715, "eval_steps_per_second": 1.179, "step": 800 }, { "epoch": 0.21067783302800314, "grad_norm": 11.3125, "learning_rate": 2.889849664318132e-07, "logits/chosen": -2.4516258239746094, "logits/rejected": -2.4605178833007812, "logps/chosen": -0.9220935702323914, "logps/rejected": -1.135303020477295, "loss": 1.5889, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8441871404647827, "rewards/margins": 0.42641887068748474, "rewards/rejected": -2.27060604095459, "step": 805 }, { "epoch": 0.21198639099712116, "grad_norm": 10.0625, "learning_rate": 2.887257394465338e-07, "logits/chosen": -2.39811110496521, "logits/rejected": -2.3578684329986572, "logps/chosen": -0.9809758067131042, "logps/rejected": -1.2084648609161377, "loss": 1.5489, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9619516134262085, "rewards/margins": 0.4549782872200012, "rewards/rejected": -2.4169297218322754, "step": 810 }, { "epoch": 0.2132949489662392, "grad_norm": 9.125, "learning_rate": 2.8846361655631103e-07, "logits/chosen": -2.374671697616577, "logits/rejected": -2.256479501724243, "logps/chosen": -0.9819415807723999, "logps/rejected": -1.409139633178711, "loss": 1.3639, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9638831615447998, "rewards/margins": 0.8543959856033325, "rewards/rejected": -2.818279266357422, "step": 815 }, { "epoch": 0.21460350693535724, "grad_norm": 30.875, "learning_rate": 2.8819860323296986e-07, "logits/chosen": -2.355116844177246, "logits/rejected": -2.451261520385742, "logps/chosen": -0.9881532788276672, "logps/rejected": -1.1678146123886108, "loss": 1.6187, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9763065576553345, "rewards/margins": 0.35932278633117676, "rewards/rejected": -2.3356292247772217, "step": 820 }, { "epoch": 0.21591206490447526, "grad_norm": 6.03125, "learning_rate": 2.879307050086732e-07, "logits/chosen": -2.4813873767852783, "logits/rejected": -2.4763541221618652, "logps/chosen": -0.9986332654953003, "logps/rejected": -1.0292822122573853, "loss": 1.8177, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.9972665309906006, "rewards/margins": 0.06129790470004082, "rewards/rejected": -2.0585644245147705, "step": 825 }, { "epoch": 0.2172206228735933, "grad_norm": 8.125, "learning_rate": 2.876599274758065e-07, "logits/chosen": -2.5252673625946045, "logits/rejected": -2.3571643829345703, "logps/chosen": -1.0006084442138672, "logps/rejected": -1.1224209070205688, "loss": 1.7265, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0012168884277344, "rewards/margins": 0.24362501502037048, "rewards/rejected": -2.2448418140411377, "step": 830 }, { "epoch": 0.21852918084271133, "grad_norm": 7.03125, "learning_rate": 2.8738627628686065e-07, "logits/chosen": -2.4392199516296387, "logits/rejected": -2.2247982025146484, "logps/chosen": -0.9700218439102173, "logps/rejected": -1.2154910564422607, "loss": 1.5274, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9400436878204346, "rewards/margins": 0.49093833565711975, "rewards/rejected": -2.4309821128845215, "step": 835 }, { "epoch": 0.21983773881182936, "grad_norm": 4.75, "learning_rate": 2.871097571543144e-07, "logits/chosen": -2.5041568279266357, "logits/rejected": -2.2985968589782715, "logps/chosen": -0.9054889678955078, "logps/rejected": -1.2235815525054932, "loss": 1.4383, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8109779357910156, "rewards/margins": 0.6361854076385498, "rewards/rejected": -2.4471631050109863, "step": 840 }, { "epoch": 0.22114629678094738, "grad_norm": 12.75, "learning_rate": 2.8683037585051496e-07, "logits/chosen": -2.390765428543091, "logits/rejected": -2.22505521774292, "logps/chosen": -0.9999173879623413, "logps/rejected": -1.2959444522857666, "loss": 1.4664, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9998347759246826, "rewards/margins": 0.5920537710189819, "rewards/rejected": -2.591888904571533, "step": 845 }, { "epoch": 0.22245485475006543, "grad_norm": 2.84375, "learning_rate": 2.865481382075573e-07, "logits/chosen": -2.6051478385925293, "logits/rejected": -2.356677293777466, "logps/chosen": -0.9005275964736938, "logps/rejected": -1.346327543258667, "loss": 1.2966, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8010551929473877, "rewards/margins": 0.8915997743606567, "rewards/rejected": -2.692655086517334, "step": 850 }, { "epoch": 0.22376341271918346, "grad_norm": 4.15625, "learning_rate": 2.862630501171626e-07, "logits/chosen": -2.3502492904663086, "logits/rejected": -2.301334857940674, "logps/chosen": -0.9207111597061157, "logps/rejected": -1.1572659015655518, "loss": 1.5354, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8414223194122314, "rewards/margins": 0.473109632730484, "rewards/rejected": -2.3145318031311035, "step": 855 }, { "epoch": 0.22507197068830148, "grad_norm": 10.25, "learning_rate": 2.8597511753055533e-07, "logits/chosen": -2.4075369834899902, "logits/rejected": -2.237473487854004, "logps/chosen": -0.9390727281570435, "logps/rejected": -1.1401338577270508, "loss": 1.584, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.878145456314087, "rewards/margins": 0.4021223485469818, "rewards/rejected": -2.2802677154541016, "step": 860 }, { "epoch": 0.22638052865741953, "grad_norm": 13.1875, "learning_rate": 2.85684346458339e-07, "logits/chosen": -2.4613823890686035, "logits/rejected": -2.313011407852173, "logps/chosen": -0.9659273028373718, "logps/rejected": -1.0467588901519775, "loss": 1.7549, "rewards/accuracies": 0.5, "rewards/chosen": -1.9318546056747437, "rewards/margins": 0.1616632640361786, "rewards/rejected": -2.093517780303955, "step": 865 }, { "epoch": 0.22768908662653756, "grad_norm": 19.375, "learning_rate": 2.8539074297037035e-07, "logits/chosen": -2.412383794784546, "logits/rejected": -2.2774882316589355, "logps/chosen": -1.0244829654693604, "logps/rejected": -1.2398474216461182, "loss": 1.5447, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0489659309387207, "rewards/margins": 0.43072906136512756, "rewards/rejected": -2.4796948432922363, "step": 870 }, { "epoch": 0.22899764459565558, "grad_norm": 7.25, "learning_rate": 2.8509431319563316e-07, "logits/chosen": -2.3457627296447754, "logits/rejected": -2.365752935409546, "logps/chosen": -0.9948088526725769, "logps/rejected": -1.1564724445343018, "loss": 1.6531, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.9896177053451538, "rewards/margins": 0.3233271539211273, "rewards/rejected": -2.3129448890686035, "step": 875 }, { "epoch": 0.23030620256477363, "grad_norm": 10.75, "learning_rate": 2.8479506332210984e-07, "logits/chosen": -2.424428701400757, "logits/rejected": -2.2465081214904785, "logps/chosen": -0.987517237663269, "logps/rejected": -1.3590209484100342, "loss": 1.4245, "rewards/accuracies": 0.6875, "rewards/chosen": -1.975034475326538, "rewards/margins": 0.7430071234703064, "rewards/rejected": -2.7180418968200684, "step": 880 }, { "epoch": 0.23161476053389166, "grad_norm": 19.25, "learning_rate": 2.8449299959665266e-07, "logits/chosen": -2.380384683609009, "logits/rejected": -2.3756532669067383, "logps/chosen": -1.1722111701965332, "logps/rejected": -1.2338510751724243, "loss": 1.8308, "rewards/accuracies": 0.5, "rewards/chosen": -2.3444223403930664, "rewards/margins": 0.1232796460390091, "rewards/rejected": -2.4677021503448486, "step": 885 }, { "epoch": 0.23292331850300968, "grad_norm": 9.375, "learning_rate": 2.8418812832485306e-07, "logits/chosen": -2.4375529289245605, "logits/rejected": -2.3899483680725098, "logps/chosen": -0.952872633934021, "logps/rejected": -1.084661602973938, "loss": 1.6603, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.905745267868042, "rewards/margins": 0.26357781887054443, "rewards/rejected": -2.169323205947876, "step": 890 }, { "epoch": 0.2342318764721277, "grad_norm": 12.375, "learning_rate": 2.8388045587091005e-07, "logits/chosen": -2.420869827270508, "logits/rejected": -2.4796133041381836, "logps/chosen": -1.0273797512054443, "logps/rejected": -1.150836706161499, "loss": 1.6473, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0547595024108887, "rewards/margins": 0.24691441655158997, "rewards/rejected": -2.301673412322998, "step": 895 }, { "epoch": 0.23554043444124576, "grad_norm": 5.40625, "learning_rate": 2.8356998865749757e-07, "logits/chosen": -2.378568410873413, "logits/rejected": -2.239475965499878, "logps/chosen": -0.925240159034729, "logps/rejected": -1.1964004039764404, "loss": 1.523, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.850480318069458, "rewards/margins": 0.5423206686973572, "rewards/rejected": -2.392800807952881, "step": 900 }, { "epoch": 0.23684899241036378, "grad_norm": 13.0, "learning_rate": 2.8325673316563016e-07, "logits/chosen": -2.447786808013916, "logits/rejected": -2.326889753341675, "logps/chosen": -0.983212947845459, "logps/rejected": -1.1922987699508667, "loss": 1.6518, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.966425895690918, "rewards/margins": 0.4181714951992035, "rewards/rejected": -2.3845975399017334, "step": 905 }, { "epoch": 0.2381575503794818, "grad_norm": 12.625, "learning_rate": 2.8294069593452774e-07, "logits/chosen": -2.439202070236206, "logits/rejected": -2.3414106369018555, "logps/chosen": -0.9200547337532043, "logps/rejected": -1.1782020330429077, "loss": 1.5378, "rewards/accuracies": 0.625, "rewards/chosen": -1.8401094675064087, "rewards/margins": 0.5162947177886963, "rewards/rejected": -2.3564040660858154, "step": 910 }, { "epoch": 0.23946610834859985, "grad_norm": 14.5625, "learning_rate": 2.8262188356147927e-07, "logits/chosen": -2.390491008758545, "logits/rejected": -2.3385746479034424, "logps/chosen": -0.9335182309150696, "logps/rejected": -1.1045535802841187, "loss": 1.6357, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.8670364618301392, "rewards/margins": 0.342070996761322, "rewards/rejected": -2.2091071605682373, "step": 915 }, { "epoch": 0.24077466631771788, "grad_norm": 4.75, "learning_rate": 2.823003027017048e-07, "logits/chosen": -2.524477481842041, "logits/rejected": -2.4056529998779297, "logps/chosen": -0.9904319643974304, "logps/rejected": -1.1246320009231567, "loss": 1.6748, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9808639287948608, "rewards/margins": 0.2684001624584198, "rewards/rejected": -2.2492640018463135, "step": 920 }, { "epoch": 0.2420832242868359, "grad_norm": 7.5, "learning_rate": 2.8197596006821675e-07, "logits/chosen": -2.4536209106445312, "logits/rejected": -2.355556011199951, "logps/chosen": -0.966331958770752, "logps/rejected": -1.0445019006729126, "loss": 1.7292, "rewards/accuracies": 0.5, "rewards/chosen": -1.932663917541504, "rewards/margins": 0.15633949637413025, "rewards/rejected": -2.089003801345825, "step": 925 }, { "epoch": 0.24339178225595393, "grad_norm": 12.0, "learning_rate": 2.8164886243167953e-07, "logits/chosen": -2.4395382404327393, "logits/rejected": -2.3719687461853027, "logps/chosen": -0.9285646677017212, "logps/rejected": -1.1775720119476318, "loss": 1.5198, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8571293354034424, "rewards/margins": 0.4980148673057556, "rewards/rejected": -2.3551440238952637, "step": 930 }, { "epoch": 0.24470034022507198, "grad_norm": 27.875, "learning_rate": 2.813190166202684e-07, "logits/chosen": -2.431688070297241, "logits/rejected": -2.2427709102630615, "logps/chosen": -0.943575382232666, "logps/rejected": -1.1530876159667969, "loss": 1.5925, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.887150764465332, "rewards/margins": 0.41902461647987366, "rewards/rejected": -2.3061752319335938, "step": 935 }, { "epoch": 0.24600889819419, "grad_norm": 8.75, "learning_rate": 2.8098642951952686e-07, "logits/chosen": -2.4145593643188477, "logits/rejected": -2.2109591960906982, "logps/chosen": -1.0051122903823853, "logps/rejected": -1.3509918451309204, "loss": 1.4393, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.0102245807647705, "rewards/margins": 0.6917588114738464, "rewards/rejected": -2.701983690261841, "step": 940 }, { "epoch": 0.24731745616330802, "grad_norm": 14.25, "learning_rate": 2.80651108072223e-07, "logits/chosen": -2.3588650226593018, "logits/rejected": -2.299746036529541, "logps/chosen": -0.9482453465461731, "logps/rejected": -1.114438772201538, "loss": 1.6168, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8964906930923462, "rewards/margins": 0.33238688111305237, "rewards/rejected": -2.228877544403076, "step": 945 }, { "epoch": 0.24862601413242608, "grad_norm": 24.25, "learning_rate": 2.803130592782044e-07, "logits/chosen": -2.4514098167419434, "logits/rejected": -2.41989803314209, "logps/chosen": -1.1074198484420776, "logps/rejected": -1.2037928104400635, "loss": 1.7817, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -2.2148396968841553, "rewards/margins": 0.19274567067623138, "rewards/rejected": -2.407585620880127, "step": 950 }, { "epoch": 0.2499345721015441, "grad_norm": 7.125, "learning_rate": 2.799722901942521e-07, "logits/chosen": -2.520164728164673, "logits/rejected": -2.3973140716552734, "logps/chosen": -0.9990448951721191, "logps/rejected": -1.183989405632019, "loss": 1.6199, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9980897903442383, "rewards/margins": 0.3698894679546356, "rewards/rejected": -2.367978811264038, "step": 955 }, { "epoch": 0.2512431300706621, "grad_norm": 4.28125, "learning_rate": 2.7962880793393343e-07, "logits/chosen": -2.4670472145080566, "logits/rejected": -2.2736260890960693, "logps/chosen": -1.0069376230239868, "logps/rejected": -1.2014423608779907, "loss": 1.6586, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0138752460479736, "rewards/margins": 0.3890094459056854, "rewards/rejected": -2.4028847217559814, "step": 960 }, { "epoch": 0.2525516880397802, "grad_norm": 4.8125, "learning_rate": 2.7928261966745325e-07, "logits/chosen": -2.3300890922546387, "logits/rejected": -2.370814085006714, "logps/chosen": -1.0185413360595703, "logps/rejected": -1.3102461099624634, "loss": 1.5178, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.0370826721191406, "rewards/margins": 0.5834096670150757, "rewards/rejected": -2.6204922199249268, "step": 965 }, { "epoch": 0.25386024600889817, "grad_norm": 11.0625, "learning_rate": 2.789337326215044e-07, "logits/chosen": -2.4394021034240723, "logits/rejected": -2.340552568435669, "logps/chosen": -1.0584160089492798, "logps/rejected": -1.2664167881011963, "loss": 1.6099, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1168320178985596, "rewards/margins": 0.416001558303833, "rewards/rejected": -2.5328335762023926, "step": 970 }, { "epoch": 0.2551688039780162, "grad_norm": 14.625, "learning_rate": 2.785821540791169e-07, "logits/chosen": -2.4387996196746826, "logits/rejected": -2.4485208988189697, "logps/chosen": -1.1076973676681519, "logps/rejected": -1.1626137495040894, "loss": 1.7697, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.2153947353363037, "rewards/margins": 0.10983259975910187, "rewards/rejected": -2.3252274990081787, "step": 975 }, { "epoch": 0.2564773619471343, "grad_norm": 6.5, "learning_rate": 2.7822789137950586e-07, "logits/chosen": -2.540992498397827, "logits/rejected": -2.415658712387085, "logps/chosen": -1.0076406002044678, "logps/rejected": -1.133551836013794, "loss": 1.6932, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0152812004089355, "rewards/margins": 0.2518223822116852, "rewards/rejected": -2.267103672027588, "step": 980 }, { "epoch": 0.25778591991625227, "grad_norm": 6.875, "learning_rate": 2.778709519179181e-07, "logits/chosen": -2.3798279762268066, "logits/rejected": -2.1673123836517334, "logps/chosen": -1.0493723154067993, "logps/rejected": -1.3648525476455688, "loss": 1.4621, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0987446308135986, "rewards/margins": 0.6309604644775391, "rewards/rejected": -2.7297050952911377, "step": 985 }, { "epoch": 0.2590944778853703, "grad_norm": 18.25, "learning_rate": 2.7751134314547823e-07, "logits/chosen": -2.4644808769226074, "logits/rejected": -2.357712507247925, "logps/chosen": -1.0473966598510742, "logps/rejected": -1.1457302570343018, "loss": 1.7572, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -2.0947933197021484, "rewards/margins": 0.19666709005832672, "rewards/rejected": -2.2914605140686035, "step": 990 }, { "epoch": 0.2604030358544884, "grad_norm": 10.125, "learning_rate": 2.771490725690327e-07, "logits/chosen": -2.354029417037964, "logits/rejected": -2.3697075843811035, "logps/chosen": -0.9810983538627625, "logps/rejected": -1.168823003768921, "loss": 1.6831, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.962196707725525, "rewards/margins": 0.37544897198677063, "rewards/rejected": -2.337646007537842, "step": 995 }, { "epoch": 0.26171159382360637, "grad_norm": 7.71875, "learning_rate": 2.767841477509931e-07, "logits/chosen": -2.399759531021118, "logits/rejected": -2.1953818798065186, "logps/chosen": -1.0013476610183716, "logps/rejected": -1.2377477884292603, "loss": 1.5488, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.002695322036743, "rewards/margins": 0.47280043363571167, "rewards/rejected": -2.4754955768585205, "step": 1000 }, { "epoch": 0.2630201517927244, "grad_norm": 10.0, "learning_rate": 2.7641657630917846e-07, "logits/chosen": -2.4585630893707275, "logits/rejected": -2.3352015018463135, "logps/chosen": -0.9908092617988586, "logps/rejected": -1.2345106601715088, "loss": 1.5765, "rewards/accuracies": 0.625, "rewards/chosen": -1.9816185235977173, "rewards/margins": 0.487403005361557, "rewards/rejected": -2.4690213203430176, "step": 1005 }, { "epoch": 0.2643287097618425, "grad_norm": 7.53125, "learning_rate": 2.760463659166564e-07, "logits/chosen": -2.401664972305298, "logits/rejected": -2.3110299110412598, "logps/chosen": -1.0222747325897217, "logps/rejected": -1.2789936065673828, "loss": 1.6744, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0445494651794434, "rewards/margins": 0.5134380459785461, "rewards/rejected": -2.5579872131347656, "step": 1010 }, { "epoch": 0.26563726773096047, "grad_norm": 4.71875, "learning_rate": 2.7567352430158243e-07, "logits/chosen": -2.488914728164673, "logits/rejected": -2.3880484104156494, "logps/chosen": -1.0811675786972046, "logps/rejected": -1.2931978702545166, "loss": 1.6414, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.162335157394409, "rewards/margins": 0.42406076192855835, "rewards/rejected": -2.586395740509033, "step": 1015 }, { "epoch": 0.2669458257000785, "grad_norm": 6.5, "learning_rate": 2.752980592470391e-07, "logits/chosen": -2.318246841430664, "logits/rejected": -2.20145583152771, "logps/chosen": -0.9236286878585815, "logps/rejected": -1.263554573059082, "loss": 1.4318, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.847257375717163, "rewards/margins": 0.6798514127731323, "rewards/rejected": -2.527109146118164, "step": 1020 }, { "epoch": 0.26825438366919657, "grad_norm": 21.375, "learning_rate": 2.7491997859087335e-07, "logits/chosen": -2.447873592376709, "logits/rejected": -2.30826997756958, "logps/chosen": -1.0539740324020386, "logps/rejected": -1.288760781288147, "loss": 1.5409, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.107948064804077, "rewards/margins": 0.469573438167572, "rewards/rejected": -2.577521562576294, "step": 1025 }, { "epoch": 0.26956294163831457, "grad_norm": 7.46875, "learning_rate": 2.745392902255328e-07, "logits/chosen": -2.3910837173461914, "logits/rejected": -2.315263509750366, "logps/chosen": -1.0552270412445068, "logps/rejected": -1.3487416505813599, "loss": 1.5118, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1104540824890137, "rewards/margins": 0.5870293378829956, "rewards/rejected": -2.6974833011627197, "step": 1030 }, { "epoch": 0.2708714996074326, "grad_norm": 6.59375, "learning_rate": 2.7415600209790114e-07, "logits/chosen": -2.332489013671875, "logits/rejected": -2.309741735458374, "logps/chosen": -1.0340750217437744, "logps/rejected": -1.345901370048523, "loss": 1.5339, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.068150043487549, "rewards/margins": 0.6236530542373657, "rewards/rejected": -2.691802740097046, "step": 1035 }, { "epoch": 0.2721800575765506, "grad_norm": 17.0, "learning_rate": 2.737701222091323e-07, "logits/chosen": -2.3910536766052246, "logits/rejected": -2.252192258834839, "logps/chosen": -1.0110094547271729, "logps/rejected": -1.207939863204956, "loss": 1.6291, "rewards/accuracies": 0.625, "rewards/chosen": -2.0220189094543457, "rewards/margins": 0.3938608467578888, "rewards/rejected": -2.415879726409912, "step": 1040 }, { "epoch": 0.27348861554566867, "grad_norm": 4.46875, "learning_rate": 2.7338165861448324e-07, "logits/chosen": -2.5505199432373047, "logits/rejected": -2.402164936065674, "logps/chosen": -0.9761930704116821, "logps/rejected": -1.1321427822113037, "loss": 1.6151, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9523861408233643, "rewards/margins": 0.31189969182014465, "rewards/rejected": -2.2642855644226074, "step": 1045 }, { "epoch": 0.2747971735147867, "grad_norm": 15.0, "learning_rate": 2.729906194231457e-07, "logits/chosen": -2.3267905712127686, "logits/rejected": -2.367274045944214, "logps/chosen": -0.9572963714599609, "logps/rejected": -1.1391490697860718, "loss": 1.6352, "rewards/accuracies": 0.625, "rewards/chosen": -1.9145927429199219, "rewards/margins": 0.36370545625686646, "rewards/rejected": -2.2782981395721436, "step": 1050 }, { "epoch": 0.2761057314839047, "grad_norm": 7.96875, "learning_rate": 2.7259701279807757e-07, "logits/chosen": -2.4183857440948486, "logits/rejected": -2.440964698791504, "logps/chosen": -1.0021231174468994, "logps/rejected": -1.1307986974716187, "loss": 1.7103, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.004246234893799, "rewards/margins": 0.2573511004447937, "rewards/rejected": -2.2615973949432373, "step": 1055 }, { "epoch": 0.27741428945302277, "grad_norm": 10.0, "learning_rate": 2.7220084695583143e-07, "logits/chosen": -2.4503607749938965, "logits/rejected": -2.2314000129699707, "logps/chosen": -0.9595519304275513, "logps/rejected": -1.2621471881866455, "loss": 1.5366, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9191038608551025, "rewards/margins": 0.6051904559135437, "rewards/rejected": -2.524294376373291, "step": 1060 }, { "epoch": 0.2787228474221408, "grad_norm": 11.875, "learning_rate": 2.7180213016638404e-07, "logits/chosen": -2.3533949851989746, "logits/rejected": -2.289022445678711, "logps/chosen": -1.0468542575836182, "logps/rejected": -1.2864720821380615, "loss": 1.5589, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0937085151672363, "rewards/margins": 0.4792355000972748, "rewards/rejected": -2.572944164276123, "step": 1065 }, { "epoch": 0.2800314053912588, "grad_norm": 14.8125, "learning_rate": 2.7140087075296304e-07, "logits/chosen": -2.4480719566345215, "logits/rejected": -2.3734869956970215, "logps/chosen": -0.9196842312812805, "logps/rejected": -1.1853047609329224, "loss": 1.5218, "rewards/accuracies": 0.5625, "rewards/chosen": -1.839368462562561, "rewards/margins": 0.5312410593032837, "rewards/rejected": -2.3706095218658447, "step": 1070 }, { "epoch": 0.28133996336037687, "grad_norm": 10.8125, "learning_rate": 2.709970770918736e-07, "logits/chosen": -2.444913387298584, "logits/rejected": -2.2555317878723145, "logps/chosen": -0.9042195081710815, "logps/rejected": -1.236684799194336, "loss": 1.4722, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.808439016342163, "rewards/margins": 0.6649302840232849, "rewards/rejected": -2.473369598388672, "step": 1075 }, { "epoch": 0.2826485213294949, "grad_norm": 13.625, "learning_rate": 2.7059075761232333e-07, "logits/chosen": -2.3760976791381836, "logits/rejected": -2.3014397621154785, "logps/chosen": -1.0100362300872803, "logps/rejected": -1.1621885299682617, "loss": 1.645, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0200724601745605, "rewards/margins": 0.3043045401573181, "rewards/rejected": -2.3243770599365234, "step": 1080 }, { "epoch": 0.2839570792986129, "grad_norm": 18.25, "learning_rate": 2.701819207962464e-07, "logits/chosen": -2.3184731006622314, "logits/rejected": -2.266268730163574, "logps/chosen": -1.016550064086914, "logps/rejected": -1.1386358737945557, "loss": 1.707, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.033100128173828, "rewards/margins": 0.2441713809967041, "rewards/rejected": -2.2772717475891113, "step": 1085 }, { "epoch": 0.28526563726773096, "grad_norm": 14.4375, "learning_rate": 2.697705751781264e-07, "logits/chosen": -2.317656993865967, "logits/rejected": -2.374851942062378, "logps/chosen": -1.048663854598999, "logps/rejected": -1.1214885711669922, "loss": 1.7853, "rewards/accuracies": 0.5625, "rewards/chosen": -2.097327709197998, "rewards/margins": 0.1456489861011505, "rewards/rejected": -2.2429771423339844, "step": 1090 }, { "epoch": 0.286574195236849, "grad_norm": 34.75, "learning_rate": 2.693567293448184e-07, "logits/chosen": -2.4381680488586426, "logits/rejected": -2.405142307281494, "logps/chosen": -0.9904729127883911, "logps/rejected": -1.1277961730957031, "loss": 1.7273, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9809458255767822, "rewards/margins": 0.2746466398239136, "rewards/rejected": -2.2555923461914062, "step": 1095 }, { "epoch": 0.287882753205967, "grad_norm": 21.875, "learning_rate": 2.689403919353695e-07, "logits/chosen": -2.3633666038513184, "logits/rejected": -2.2658963203430176, "logps/chosen": -1.0258548259735107, "logps/rejected": -1.2487916946411133, "loss": 1.6042, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0517096519470215, "rewards/margins": 0.4458739757537842, "rewards/rejected": -2.4975833892822266, "step": 1100 }, { "epoch": 0.28919131117508506, "grad_norm": 6.6875, "learning_rate": 2.6852157164083845e-07, "logits/chosen": -2.2697737216949463, "logits/rejected": -2.250499725341797, "logps/chosen": -1.0883874893188477, "logps/rejected": -1.268333077430725, "loss": 1.6582, "rewards/accuracies": 0.5, "rewards/chosen": -2.1767749786376953, "rewards/margins": 0.35989099740982056, "rewards/rejected": -2.53666615486145, "step": 1105 }, { "epoch": 0.2904998691442031, "grad_norm": 13.625, "learning_rate": 2.681002772041145e-07, "logits/chosen": -2.291240692138672, "logits/rejected": -2.2009472846984863, "logps/chosen": -1.006746768951416, "logps/rejected": -1.197805404663086, "loss": 1.6315, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.013493537902832, "rewards/margins": 0.38211748003959656, "rewards/rejected": -2.395610809326172, "step": 1110 }, { "epoch": 0.2918084271133211, "grad_norm": 12.0625, "learning_rate": 2.6767651741973446e-07, "logits/chosen": -2.37373948097229, "logits/rejected": -2.3151698112487793, "logps/chosen": -1.062633752822876, "logps/rejected": -1.093269944190979, "loss": 1.7856, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -2.125267505645752, "rewards/margins": 0.061272718012332916, "rewards/rejected": -2.186539888381958, "step": 1115 }, { "epoch": 0.29311698508243916, "grad_norm": 10.4375, "learning_rate": 2.672503011336996e-07, "logits/chosen": -2.504060745239258, "logits/rejected": -2.2818965911865234, "logps/chosen": -0.9543046951293945, "logps/rejected": -1.296682596206665, "loss": 1.4271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.908609390258789, "rewards/margins": 0.6847559213638306, "rewards/rejected": -2.59336519241333, "step": 1120 }, { "epoch": 0.29442554305155716, "grad_norm": 5.125, "learning_rate": 2.6682163724329064e-07, "logits/chosen": -2.376828670501709, "logits/rejected": -2.3670010566711426, "logps/chosen": -0.9661476016044617, "logps/rejected": -1.1339595317840576, "loss": 1.6376, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9322952032089233, "rewards/margins": 0.33562371134757996, "rewards/rejected": -2.2679190635681152, "step": 1125 }, { "epoch": 0.2957341010206752, "grad_norm": 7.90625, "learning_rate": 2.6639053469688214e-07, "logits/chosen": -2.431734561920166, "logits/rejected": -2.3546271324157715, "logps/chosen": -0.9448612332344055, "logps/rejected": -1.1339503526687622, "loss": 1.6075, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.889722466468811, "rewards/margins": 0.37817806005477905, "rewards/rejected": -2.2679007053375244, "step": 1130 }, { "epoch": 0.29704265898979326, "grad_norm": 7.0, "learning_rate": 2.6595700249375574e-07, "logits/chosen": -2.438298225402832, "logits/rejected": -2.3050029277801514, "logps/chosen": -0.9640270471572876, "logps/rejected": -1.1462175846099854, "loss": 1.628, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9280540943145752, "rewards/margins": 0.36438119411468506, "rewards/rejected": -2.2924351692199707, "step": 1135 }, { "epoch": 0.29835121695891126, "grad_norm": 9.375, "learning_rate": 2.655210496839122e-07, "logits/chosen": -2.329777479171753, "logits/rejected": -2.202432155609131, "logps/chosen": -0.961220383644104, "logps/rejected": -1.2406482696533203, "loss": 1.5502, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.922440767288208, "rewards/margins": 0.5588559508323669, "rewards/rejected": -2.4812965393066406, "step": 1140 }, { "epoch": 0.2996597749280293, "grad_norm": 7.625, "learning_rate": 2.6508268536788254e-07, "logits/chosen": -2.5535147190093994, "logits/rejected": -2.2793781757354736, "logps/chosen": -0.9079625010490417, "logps/rejected": -1.1809990406036377, "loss": 1.5005, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8159250020980835, "rewards/margins": 0.5460728406906128, "rewards/rejected": -2.3619980812072754, "step": 1145 }, { "epoch": 0.30096833289714736, "grad_norm": 15.625, "learning_rate": 2.6464191869653816e-07, "logits/chosen": -2.4130797386169434, "logits/rejected": -2.321037769317627, "logps/chosen": -1.0572654008865356, "logps/rejected": -1.2880295515060425, "loss": 1.595, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1145308017730713, "rewards/margins": 0.4615286886692047, "rewards/rejected": -2.576059103012085, "step": 1150 }, { "epoch": 0.30227689086626536, "grad_norm": 7.9375, "learning_rate": 2.6419875887089947e-07, "logits/chosen": -2.3939616680145264, "logits/rejected": -2.2619128227233887, "logps/chosen": -1.0411205291748047, "logps/rejected": -1.2986230850219727, "loss": 1.5908, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0822410583496094, "rewards/margins": 0.5150050520896912, "rewards/rejected": -2.5972461700439453, "step": 1155 }, { "epoch": 0.3035854488353834, "grad_norm": 7.90625, "learning_rate": 2.6375321514194433e-07, "logits/chosen": -2.389794111251831, "logits/rejected": -2.3115391731262207, "logps/chosen": -0.9055787920951843, "logps/rejected": -1.1313393115997314, "loss": 1.5647, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8111575841903687, "rewards/margins": 0.45152121782302856, "rewards/rejected": -2.262678623199463, "step": 1160 }, { "epoch": 0.30489400680450146, "grad_norm": 5.09375, "learning_rate": 2.633052968104146e-07, "logits/chosen": -2.454319953918457, "logits/rejected": -2.3286030292510986, "logps/chosen": -1.0062270164489746, "logps/rejected": -1.2692174911499023, "loss": 1.506, "rewards/accuracies": 0.6875, "rewards/chosen": -2.012454032897949, "rewards/margins": 0.5259808897972107, "rewards/rejected": -2.5384349822998047, "step": 1165 }, { "epoch": 0.30620256477361946, "grad_norm": 8.0, "learning_rate": 2.628550132266219e-07, "logits/chosen": -2.4439146518707275, "logits/rejected": -2.2944960594177246, "logps/chosen": -1.0077712535858154, "logps/rejected": -1.2314029932022095, "loss": 1.6038, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.015542507171631, "rewards/margins": 0.44726333022117615, "rewards/rejected": -2.462805986404419, "step": 1170 }, { "epoch": 0.3075111227427375, "grad_norm": 11.1875, "learning_rate": 2.624023737902529e-07, "logits/chosen": -2.328723907470703, "logits/rejected": -2.208144187927246, "logps/chosen": -1.017555832862854, "logps/rejected": -1.2412947416305542, "loss": 1.5698, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.035111665725708, "rewards/margins": 0.4474780559539795, "rewards/rejected": -2.4825894832611084, "step": 1175 }, { "epoch": 0.30881968071185556, "grad_norm": 18.5, "learning_rate": 2.619473879501724e-07, "logits/chosen": -2.3947043418884277, "logits/rejected": -2.329451322555542, "logps/chosen": -0.9892759323120117, "logps/rejected": -1.190051794052124, "loss": 1.6029, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9785518646240234, "rewards/margins": 0.40155211091041565, "rewards/rejected": -2.380103588104248, "step": 1180 }, { "epoch": 0.31012823868097356, "grad_norm": 5.03125, "learning_rate": 2.614900652042266e-07, "logits/chosen": -2.391979694366455, "logits/rejected": -2.292463541030884, "logps/chosen": -0.9539421200752258, "logps/rejected": -1.0902842283248901, "loss": 1.6706, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9078842401504517, "rewards/margins": 0.27268412709236145, "rewards/rejected": -2.1805684566497803, "step": 1185 }, { "epoch": 0.3114367966500916, "grad_norm": 9.9375, "learning_rate": 2.6103041509904496e-07, "logits/chosen": -2.4043469429016113, "logits/rejected": -2.391824722290039, "logps/chosen": -0.9655801653862, "logps/rejected": -1.1953800916671753, "loss": 1.6425, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9311603307724, "rewards/margins": 0.45959973335266113, "rewards/rejected": -2.3907601833343506, "step": 1190 }, { "epoch": 0.3127453546192096, "grad_norm": 8.375, "learning_rate": 2.6056844722984026e-07, "logits/chosen": -2.272109270095825, "logits/rejected": -2.1874499320983887, "logps/chosen": -1.068488597869873, "logps/rejected": -1.3128079175949097, "loss": 1.6331, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.136977195739746, "rewards/margins": 0.4886387884616852, "rewards/rejected": -2.6256158351898193, "step": 1195 }, { "epoch": 0.31405391258832765, "grad_norm": 15.9375, "learning_rate": 2.601041712402089e-07, "logits/chosen": -2.4291703701019287, "logits/rejected": -2.283667802810669, "logps/chosen": -0.9904572367668152, "logps/rejected": -1.2777166366577148, "loss": 1.5829, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9809144735336304, "rewards/margins": 0.5745185613632202, "rewards/rejected": -2.5554332733154297, "step": 1200 }, { "epoch": 0.31405391258832765, "eval_logits/chosen": -2.235567092895508, "eval_logits/rejected": -2.1315157413482666, "eval_logps/chosen": -0.9915000796318054, "eval_logps/rejected": -1.231597900390625, "eval_loss": 1.5655561685562134, "eval_rewards/accuracies": 0.6234999895095825, "eval_rewards/chosen": -1.9830001592636108, "eval_rewards/margins": 0.4801955819129944, "eval_rewards/rejected": -2.46319580078125, "eval_runtime": 424.0046, "eval_samples_per_second": 4.717, "eval_steps_per_second": 1.179, "step": 1200 }, { "epoch": 0.3153624705574457, "grad_norm": 8.375, "learning_rate": 2.596375968219294e-07, "logits/chosen": -2.4649062156677246, "logits/rejected": -2.2643074989318848, "logps/chosen": -0.9553738832473755, "logps/rejected": -1.394248127937317, "loss": 1.3136, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.910747766494751, "rewards/margins": 0.8777486681938171, "rewards/rejected": -2.788496255874634, "step": 1205 }, { "epoch": 0.3166710285265637, "grad_norm": 8.25, "learning_rate": 2.5916873371475993e-07, "logits/chosen": -2.374929428100586, "logits/rejected": -2.3392183780670166, "logps/chosen": -1.1504192352294922, "logps/rejected": -1.3420312404632568, "loss": 1.6874, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.3008384704589844, "rewards/margins": 0.3832238018512726, "rewards/rejected": -2.6840624809265137, "step": 1210 }, { "epoch": 0.31797958649568175, "grad_norm": 22.625, "learning_rate": 2.5869759170623527e-07, "logits/chosen": -2.4541077613830566, "logits/rejected": -2.2307677268981934, "logps/chosen": -0.9221817255020142, "logps/rejected": -1.206541657447815, "loss": 1.4448, "rewards/accuracies": 0.75, "rewards/chosen": -1.8443634510040283, "rewards/margins": 0.5687195062637329, "rewards/rejected": -2.41308331489563, "step": 1215 }, { "epoch": 0.3192881444647998, "grad_norm": 11.0625, "learning_rate": 2.5822418063146223e-07, "logits/chosen": -2.3491275310516357, "logits/rejected": -2.2686352729797363, "logps/chosen": -0.977401077747345, "logps/rejected": -1.294829249382019, "loss": 1.4758, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.95480215549469, "rewards/margins": 0.6348565816879272, "rewards/rejected": -2.589658498764038, "step": 1220 }, { "epoch": 0.3205967024339178, "grad_norm": 12.25, "learning_rate": 2.5774851037291444e-07, "logits/chosen": -2.315556764602661, "logits/rejected": -2.3250012397766113, "logps/chosen": -1.0274797677993774, "logps/rejected": -1.2594670057296753, "loss": 1.594, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.054959535598755, "rewards/margins": 0.4639747142791748, "rewards/rejected": -2.5189340114593506, "step": 1225 }, { "epoch": 0.32190526040303585, "grad_norm": 16.125, "learning_rate": 2.5727059086022633e-07, "logits/chosen": -2.3673152923583984, "logits/rejected": -2.2988669872283936, "logps/chosen": -1.01120126247406, "logps/rejected": -1.198730707168579, "loss": 1.6069, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.02240252494812, "rewards/margins": 0.3750587999820709, "rewards/rejected": -2.397461414337158, "step": 1230 }, { "epoch": 0.3232138183721539, "grad_norm": 10.5, "learning_rate": 2.5679043206998524e-07, "logits/chosen": -2.469179630279541, "logits/rejected": -2.3566226959228516, "logps/chosen": -1.0186004638671875, "logps/rejected": -1.146507978439331, "loss": 1.7771, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.037200927734375, "rewards/margins": 0.2558150887489319, "rewards/rejected": -2.293015956878662, "step": 1235 }, { "epoch": 0.3245223763412719, "grad_norm": 13.0625, "learning_rate": 2.5630804402552375e-07, "logits/chosen": -2.37241268157959, "logits/rejected": -2.408428192138672, "logps/chosen": -0.951165497303009, "logps/rejected": -1.1685595512390137, "loss": 1.5673, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.902330994606018, "rewards/margins": 0.43478816747665405, "rewards/rejected": -2.3371191024780273, "step": 1240 }, { "epoch": 0.32583093431038995, "grad_norm": 20.25, "learning_rate": 2.5582343679671013e-07, "logits/chosen": -2.381155252456665, "logits/rejected": -2.2607316970825195, "logps/chosen": -0.9428825378417969, "logps/rejected": -1.1458461284637451, "loss": 1.5702, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.8857650756835938, "rewards/margins": 0.4059273302555084, "rewards/rejected": -2.2916922569274902, "step": 1245 }, { "epoch": 0.327139492279508, "grad_norm": 11.625, "learning_rate": 2.553366204997382e-07, "logits/chosen": -2.3997268676757812, "logits/rejected": -2.3447718620300293, "logps/chosen": -0.9511159658432007, "logps/rejected": -1.2433512210845947, "loss": 1.5067, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9022319316864014, "rewards/margins": 0.5844705104827881, "rewards/rejected": -2.4867024421691895, "step": 1250 }, { "epoch": 0.328448050248626, "grad_norm": 19.875, "learning_rate": 2.548476052969162e-07, "logits/chosen": -2.4176876544952393, "logits/rejected": -2.331204414367676, "logps/chosen": -0.9887922406196594, "logps/rejected": -1.2208980321884155, "loss": 1.5765, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9775844812393188, "rewards/margins": 0.46421152353286743, "rewards/rejected": -2.441796064376831, "step": 1255 }, { "epoch": 0.32975660821774405, "grad_norm": 14.875, "learning_rate": 2.5435640139645454e-07, "logits/chosen": -2.4489705562591553, "logits/rejected": -2.2676053047180176, "logps/chosen": -1.0713706016540527, "logps/rejected": -1.298412799835205, "loss": 1.5895, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1427412033081055, "rewards/margins": 0.45408445596694946, "rewards/rejected": -2.59682559967041, "step": 1260 }, { "epoch": 0.3310651661868621, "grad_norm": 8.5625, "learning_rate": 2.5386301905225284e-07, "logits/chosen": -2.3240914344787598, "logits/rejected": -2.2950937747955322, "logps/chosen": -0.9917286038398743, "logps/rejected": -1.2593555450439453, "loss": 1.4989, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9834572076797485, "rewards/margins": 0.5352541208267212, "rewards/rejected": -2.5187110900878906, "step": 1265 }, { "epoch": 0.3323737241559801, "grad_norm": 24.5, "learning_rate": 2.5336746856368584e-07, "logits/chosen": -2.440915584564209, "logits/rejected": -2.2676079273223877, "logps/chosen": -1.0991510152816772, "logps/rejected": -1.2505751848220825, "loss": 1.7503, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.1983020305633545, "rewards/margins": 0.30284810066223145, "rewards/rejected": -2.501150369644165, "step": 1270 }, { "epoch": 0.33368228212509815, "grad_norm": 17.125, "learning_rate": 2.5286976027538814e-07, "logits/chosen": -2.491551399230957, "logits/rejected": -2.241516590118408, "logps/chosen": -0.982774555683136, "logps/rejected": -1.2921442985534668, "loss": 1.4818, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.965549111366272, "rewards/margins": 0.6187397241592407, "rewards/rejected": -2.5842885971069336, "step": 1275 }, { "epoch": 0.33499084009421615, "grad_norm": 17.25, "learning_rate": 2.523699045770389e-07, "logits/chosen": -2.47591495513916, "logits/rejected": -2.356618881225586, "logps/chosen": -0.9736431837081909, "logps/rejected": -1.2996432781219482, "loss": 1.456, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9472863674163818, "rewards/margins": 0.651999831199646, "rewards/rejected": -2.5992865562438965, "step": 1280 }, { "epoch": 0.3362993980633342, "grad_norm": 16.25, "learning_rate": 2.518679119031442e-07, "logits/chosen": -2.3980047702789307, "logits/rejected": -2.309596061706543, "logps/chosen": -0.9635213613510132, "logps/rejected": -1.2493789196014404, "loss": 1.5741, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9270427227020264, "rewards/margins": 0.5717151165008545, "rewards/rejected": -2.498757839202881, "step": 1285 }, { "epoch": 0.33760795603245225, "grad_norm": 17.75, "learning_rate": 2.513637927328197e-07, "logits/chosen": -2.541740655899048, "logits/rejected": -2.303169012069702, "logps/chosen": -0.96503084897995, "logps/rejected": -1.239220142364502, "loss": 1.4736, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9300616979599, "rewards/margins": 0.5483787059783936, "rewards/rejected": -2.478440284729004, "step": 1290 }, { "epoch": 0.33891651400157025, "grad_norm": 9.5625, "learning_rate": 2.508575575895717e-07, "logits/chosen": -2.515761375427246, "logits/rejected": -2.412909984588623, "logps/chosen": -1.0652989149093628, "logps/rejected": -1.2261133193969727, "loss": 1.6835, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.1305978298187256, "rewards/margins": 0.32162898778915405, "rewards/rejected": -2.4522266387939453, "step": 1295 }, { "epoch": 0.3402250719706883, "grad_norm": 8.5625, "learning_rate": 2.503492170410776e-07, "logits/chosen": -2.4879727363586426, "logits/rejected": -2.3036916255950928, "logps/chosen": -0.9453676342964172, "logps/rejected": -1.261852502822876, "loss": 1.4731, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.8907352685928345, "rewards/margins": 0.6329694986343384, "rewards/rejected": -2.523705005645752, "step": 1300 }, { "epoch": 0.34153362993980635, "grad_norm": 16.0, "learning_rate": 2.498387816989651e-07, "logits/chosen": -2.516601085662842, "logits/rejected": -2.3422815799713135, "logps/chosen": -0.9201231002807617, "logps/rejected": -1.29447340965271, "loss": 1.3609, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8402462005615234, "rewards/margins": 0.7487004995346069, "rewards/rejected": -2.58894681930542, "step": 1305 }, { "epoch": 0.34284218790892435, "grad_norm": 10.625, "learning_rate": 2.493262622185909e-07, "logits/chosen": -2.4754393100738525, "logits/rejected": -2.332514524459839, "logps/chosen": -1.044008493423462, "logps/rejected": -1.309216856956482, "loss": 1.5603, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.088016986846924, "rewards/margins": 0.5304168462753296, "rewards/rejected": -2.618433713912964, "step": 1310 }, { "epoch": 0.3441507458780424, "grad_norm": 20.25, "learning_rate": 2.4881166929881807e-07, "logits/chosen": -2.417811393737793, "logits/rejected": -2.3413987159729004, "logps/chosen": -1.0614794492721558, "logps/rejected": -1.309481143951416, "loss": 1.5492, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1229588985443115, "rewards/margins": 0.4960029721260071, "rewards/rejected": -2.618962287902832, "step": 1315 }, { "epoch": 0.34545930384716045, "grad_norm": 14.9375, "learning_rate": 2.482950136817929e-07, "logits/chosen": -2.4295315742492676, "logits/rejected": -2.303346633911133, "logps/chosen": -1.0497418642044067, "logps/rejected": -1.163682460784912, "loss": 1.7156, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0994837284088135, "rewards/margins": 0.2278813123703003, "rewards/rejected": -2.327364921569824, "step": 1320 }, { "epoch": 0.34676786181627844, "grad_norm": 19.875, "learning_rate": 2.4777630615272047e-07, "logits/chosen": -2.394493818283081, "logits/rejected": -2.2753148078918457, "logps/chosen": -1.0658694505691528, "logps/rejected": -1.230351209640503, "loss": 1.6446, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1317389011383057, "rewards/margins": 0.32896357774734497, "rewards/rejected": -2.460702419281006, "step": 1325 }, { "epoch": 0.3480764197853965, "grad_norm": 13.375, "learning_rate": 2.4725555753963964e-07, "logits/chosen": -2.4680016040802, "logits/rejected": -2.402773857116699, "logps/chosen": -0.9903423190116882, "logps/rejected": -1.2059918642044067, "loss": 1.6461, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9806846380233765, "rewards/margins": 0.43129926919937134, "rewards/rejected": -2.4119837284088135, "step": 1330 }, { "epoch": 0.34938497775451455, "grad_norm": 70.0, "learning_rate": 2.46732778713197e-07, "logits/chosen": -2.4388976097106934, "logits/rejected": -2.384936809539795, "logps/chosen": -1.0735394954681396, "logps/rejected": -1.1656219959259033, "loss": 1.7723, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.1470789909362793, "rewards/margins": 0.18416447937488556, "rewards/rejected": -2.3312439918518066, "step": 1335 }, { "epoch": 0.35069353572363254, "grad_norm": 11.0625, "learning_rate": 2.4620798058641987e-07, "logits/chosen": -2.41865611076355, "logits/rejected": -2.308170795440674, "logps/chosen": -1.043940782546997, "logps/rejected": -1.2287954092025757, "loss": 1.6025, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.087881565093994, "rewards/margins": 0.369709312915802, "rewards/rejected": -2.4575908184051514, "step": 1340 }, { "epoch": 0.3520020936927506, "grad_norm": 14.0625, "learning_rate": 2.456811741144886e-07, "logits/chosen": -2.4499049186706543, "logits/rejected": -2.226224184036255, "logps/chosen": -1.0009632110595703, "logps/rejected": -1.3817651271820068, "loss": 1.4183, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0019264221191406, "rewards/margins": 0.7616036534309387, "rewards/rejected": -2.7635302543640137, "step": 1345 }, { "epoch": 0.35331065166186865, "grad_norm": 13.8125, "learning_rate": 2.4515237029450776e-07, "logits/chosen": -2.42193341255188, "logits/rejected": -2.2043275833129883, "logps/chosen": -0.8983974456787109, "logps/rejected": -1.2252178192138672, "loss": 1.4374, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7967948913574219, "rewards/margins": 0.653640627861023, "rewards/rejected": -2.4504356384277344, "step": 1350 }, { "epoch": 0.35461920963098664, "grad_norm": 13.625, "learning_rate": 2.446215801652766e-07, "logits/chosen": -2.437544584274292, "logits/rejected": -2.2992396354675293, "logps/chosen": -0.9766480326652527, "logps/rejected": -1.3967548608779907, "loss": 1.3599, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9532960653305054, "rewards/margins": 0.8402134776115417, "rewards/rejected": -2.7935097217559814, "step": 1355 }, { "epoch": 0.3559277676001047, "grad_norm": 24.5, "learning_rate": 2.440888148070588e-07, "logits/chosen": -2.340860366821289, "logits/rejected": -2.240917682647705, "logps/chosen": -1.0234384536743164, "logps/rejected": -1.2179601192474365, "loss": 1.6093, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.046876907348633, "rewards/margins": 0.3890431821346283, "rewards/rejected": -2.435920238494873, "step": 1360 }, { "epoch": 0.3572363255692227, "grad_norm": 13.1875, "learning_rate": 2.4355408534135087e-07, "logits/chosen": -2.340690851211548, "logits/rejected": -2.2879786491394043, "logps/chosen": -1.004956603050232, "logps/rejected": -1.1495189666748047, "loss": 1.7447, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.009913206100464, "rewards/margins": 0.28912442922592163, "rewards/rejected": -2.2990379333496094, "step": 1365 }, { "epoch": 0.35854488353834074, "grad_norm": 13.4375, "learning_rate": 2.4301740293065025e-07, "logits/chosen": -2.3771138191223145, "logits/rejected": -2.3991477489471436, "logps/chosen": -0.9993813633918762, "logps/rejected": -1.1512587070465088, "loss": 1.6538, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9987627267837524, "rewards/margins": 0.30375435948371887, "rewards/rejected": -2.3025174140930176, "step": 1370 }, { "epoch": 0.3598534415074588, "grad_norm": 13.25, "learning_rate": 2.4247877877822195e-07, "logits/chosen": -2.4756643772125244, "logits/rejected": -2.341838836669922, "logps/chosen": -0.9918584823608398, "logps/rejected": -1.1517237424850464, "loss": 1.6329, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9837169647216797, "rewards/margins": 0.31973060965538025, "rewards/rejected": -2.3034474849700928, "step": 1375 }, { "epoch": 0.3611619994765768, "grad_norm": 6.5, "learning_rate": 2.419382241278653e-07, "logits/chosen": -2.413790464401245, "logits/rejected": -2.2874484062194824, "logps/chosen": -1.0072007179260254, "logps/rejected": -1.2081117630004883, "loss": 1.5921, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.014401435852051, "rewards/margins": 0.40182214975357056, "rewards/rejected": -2.4162235260009766, "step": 1380 }, { "epoch": 0.36247055744569484, "grad_norm": 22.25, "learning_rate": 2.4139575026367846e-07, "logits/chosen": -2.4645771980285645, "logits/rejected": -2.3263297080993652, "logps/chosen": -0.9974699020385742, "logps/rejected": -1.2695637941360474, "loss": 1.5744, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9949398040771484, "rewards/margins": 0.5441876649856567, "rewards/rejected": -2.5391275882720947, "step": 1385 }, { "epoch": 0.3637791154148129, "grad_norm": 8.625, "learning_rate": 2.4085136850982355e-07, "logits/chosen": -2.422297716140747, "logits/rejected": -2.2749927043914795, "logps/chosen": -0.9386559724807739, "logps/rejected": -1.190751314163208, "loss": 1.5219, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8773119449615479, "rewards/margins": 0.5041903257369995, "rewards/rejected": -2.381502628326416, "step": 1390 }, { "epoch": 0.3650876733839309, "grad_norm": 6.78125, "learning_rate": 2.403050902302897e-07, "logits/chosen": -2.5324907302856445, "logits/rejected": -2.3535213470458984, "logps/chosen": -1.007462501525879, "logps/rejected": -1.0935630798339844, "loss": 1.794, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.014925003051758, "rewards/margins": 0.17220084369182587, "rewards/rejected": -2.1871261596679688, "step": 1395 }, { "epoch": 0.36639623135304894, "grad_norm": 16.375, "learning_rate": 2.397569268286563e-07, "logits/chosen": -2.3074936866760254, "logits/rejected": -2.301485776901245, "logps/chosen": -0.9663370251655579, "logps/rejected": -1.242025375366211, "loss": 1.5202, "rewards/accuracies": 0.625, "rewards/chosen": -1.9326740503311157, "rewards/margins": 0.5513765215873718, "rewards/rejected": -2.484050750732422, "step": 1400 }, { "epoch": 0.367704789322167, "grad_norm": 19.375, "learning_rate": 2.392068897478546e-07, "logits/chosen": -2.433990001678467, "logits/rejected": -2.3399014472961426, "logps/chosen": -0.9729989767074585, "logps/rejected": -1.2174397706985474, "loss": 1.5909, "rewards/accuracies": 0.625, "rewards/chosen": -1.945997953414917, "rewards/margins": 0.4888814091682434, "rewards/rejected": -2.4348795413970947, "step": 1405 }, { "epoch": 0.369013347291285, "grad_norm": 10.625, "learning_rate": 2.3865499046992893e-07, "logits/chosen": -2.431098699569702, "logits/rejected": -2.249119520187378, "logps/chosen": -0.9695953130722046, "logps/rejected": -1.1627790927886963, "loss": 1.5541, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9391906261444092, "rewards/margins": 0.3863676190376282, "rewards/rejected": -2.3255581855773926, "step": 1410 }, { "epoch": 0.37032190526040304, "grad_norm": 7.46875, "learning_rate": 2.3810124051579727e-07, "logits/chosen": -2.3677175045013428, "logits/rejected": -2.314408779144287, "logps/chosen": -1.1800720691680908, "logps/rejected": -1.2807811498641968, "loss": 1.7421, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.3601441383361816, "rewards/margins": 0.20141835510730743, "rewards/rejected": -2.5615622997283936, "step": 1415 }, { "epoch": 0.3716304632295211, "grad_norm": 27.25, "learning_rate": 2.375456514450103e-07, "logits/chosen": -2.4871699810028076, "logits/rejected": -2.3619089126586914, "logps/chosen": -0.972209095954895, "logps/rejected": -1.214404821395874, "loss": 1.5201, "rewards/accuracies": 0.625, "rewards/chosen": -1.94441819190979, "rewards/margins": 0.48439159989356995, "rewards/rejected": -2.428809642791748, "step": 1420 }, { "epoch": 0.3729390211986391, "grad_norm": 11.8125, "learning_rate": 2.3698823485551056e-07, "logits/chosen": -2.410151720046997, "logits/rejected": -2.365622043609619, "logps/chosen": -0.9991546869277954, "logps/rejected": -1.2299830913543701, "loss": 1.603, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9983093738555908, "rewards/margins": 0.46165648102760315, "rewards/rejected": -2.4599661827087402, "step": 1425 }, { "epoch": 0.37424757916775714, "grad_norm": 30.625, "learning_rate": 2.3642900238338984e-07, "logits/chosen": -2.3757917881011963, "logits/rejected": -2.2749416828155518, "logps/chosen": -1.0619580745697021, "logps/rejected": -1.2996100187301636, "loss": 1.6036, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1239161491394043, "rewards/margins": 0.4753037095069885, "rewards/rejected": -2.599220037460327, "step": 1430 }, { "epoch": 0.3755561371368752, "grad_norm": 52.75, "learning_rate": 2.3586796570264672e-07, "logits/chosen": -2.3334732055664062, "logits/rejected": -2.3186182975769043, "logps/chosen": -0.870564341545105, "logps/rejected": -1.341253638267517, "loss": 1.3463, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.74112868309021, "rewards/margins": 0.9413784742355347, "rewards/rejected": -2.682507276535034, "step": 1435 }, { "epoch": 0.3768646951059932, "grad_norm": 18.25, "learning_rate": 2.353051365249427e-07, "logits/chosen": -2.4163975715637207, "logits/rejected": -2.363940477371216, "logps/chosen": -1.0730884075164795, "logps/rejected": -1.3135440349578857, "loss": 1.6722, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.146176815032959, "rewards/margins": 0.48091164231300354, "rewards/rejected": -2.6270880699157715, "step": 1440 }, { "epoch": 0.37817325307511124, "grad_norm": 18.25, "learning_rate": 2.347405265993577e-07, "logits/chosen": -2.3810534477233887, "logits/rejected": -2.351130247116089, "logps/chosen": -0.9553653597831726, "logps/rejected": -1.1968038082122803, "loss": 1.6117, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9107307195663452, "rewards/margins": 0.4828767776489258, "rewards/rejected": -2.3936076164245605, "step": 1445 }, { "epoch": 0.37948181104422923, "grad_norm": 23.25, "learning_rate": 2.3417414771214472e-07, "logits/chosen": -2.4542784690856934, "logits/rejected": -2.307743549346924, "logps/chosen": -0.9325535893440247, "logps/rejected": -1.254533052444458, "loss": 1.475, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8651071786880493, "rewards/margins": 0.6439592242240906, "rewards/rejected": -2.509066104888916, "step": 1450 }, { "epoch": 0.3807903690133473, "grad_norm": 13.375, "learning_rate": 2.3360601168648407e-07, "logits/chosen": -2.4535746574401855, "logits/rejected": -2.359161853790283, "logps/chosen": -0.9743674993515015, "logps/rejected": -1.256029725074768, "loss": 1.5322, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.948734998703003, "rewards/margins": 0.5633248090744019, "rewards/rejected": -2.512059450149536, "step": 1455 }, { "epoch": 0.38209892698246534, "grad_norm": 8.75, "learning_rate": 2.330361303822363e-07, "logits/chosen": -2.38000750541687, "logits/rejected": -2.369983673095703, "logps/chosen": -1.0438467264175415, "logps/rejected": -1.2766417264938354, "loss": 1.6026, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.087693452835083, "rewards/margins": 0.46559014916419983, "rewards/rejected": -2.553283452987671, "step": 1460 }, { "epoch": 0.38340748495158333, "grad_norm": 16.25, "learning_rate": 2.3246451569569468e-07, "logits/chosen": -2.4709715843200684, "logits/rejected": -2.304572105407715, "logps/chosen": -0.9587949514389038, "logps/rejected": -1.2313339710235596, "loss": 1.556, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9175899028778076, "rewards/margins": 0.5450778603553772, "rewards/rejected": -2.462667942047119, "step": 1465 }, { "epoch": 0.3847160429207014, "grad_norm": 5.96875, "learning_rate": 2.3189117955933697e-07, "logits/chosen": -2.5118327140808105, "logits/rejected": -2.3302104473114014, "logps/chosen": -0.945669949054718, "logps/rejected": -1.251129150390625, "loss": 1.496, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.891339898109436, "rewards/margins": 0.6109183430671692, "rewards/rejected": -2.50225830078125, "step": 1470 }, { "epoch": 0.38602460088981944, "grad_norm": 15.6875, "learning_rate": 2.3131613394157616e-07, "logits/chosen": -2.4204957485198975, "logits/rejected": -2.2464210987091064, "logps/chosen": -0.9467811584472656, "logps/rejected": -1.1316184997558594, "loss": 1.6448, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.8935623168945312, "rewards/margins": 0.3696743845939636, "rewards/rejected": -2.2632369995117188, "step": 1475 }, { "epoch": 0.38733315885893743, "grad_norm": 16.125, "learning_rate": 2.307393908465109e-07, "logits/chosen": -2.451876401901245, "logits/rejected": -2.3513565063476562, "logps/chosen": -0.95673006772995, "logps/rejected": -1.2027654647827148, "loss": 1.5638, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9134601354599, "rewards/margins": 0.49207067489624023, "rewards/rejected": -2.4055309295654297, "step": 1480 }, { "epoch": 0.3886417168280555, "grad_norm": 10.1875, "learning_rate": 2.301609623136746e-07, "logits/chosen": -2.489187002182007, "logits/rejected": -2.4289920330047607, "logps/chosen": -0.9566315412521362, "logps/rejected": -1.2512379884719849, "loss": 1.5209, "rewards/accuracies": 0.625, "rewards/chosen": -1.9132630825042725, "rewards/margins": 0.5892128944396973, "rewards/rejected": -2.5024759769439697, "step": 1485 }, { "epoch": 0.38995027479717354, "grad_norm": 14.75, "learning_rate": 2.2958086041778425e-07, "logits/chosen": -2.4941651821136475, "logits/rejected": -2.284539222717285, "logps/chosen": -1.011383295059204, "logps/rejected": -1.2261422872543335, "loss": 1.6093, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.022766590118408, "rewards/margins": 0.42951804399490356, "rewards/rejected": -2.452284574508667, "step": 1490 }, { "epoch": 0.39125883276629153, "grad_norm": 10.5, "learning_rate": 2.289990972684884e-07, "logits/chosen": -2.480138063430786, "logits/rejected": -2.3768107891082764, "logps/chosen": -0.9785095453262329, "logps/rejected": -1.1698919534683228, "loss": 1.6362, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9570190906524658, "rewards/margins": 0.3827648162841797, "rewards/rejected": -2.3397839069366455, "step": 1495 }, { "epoch": 0.3925673907354096, "grad_norm": 12.4375, "learning_rate": 2.2841568501011434e-07, "logits/chosen": -2.2922472953796387, "logits/rejected": -2.2737226486206055, "logps/chosen": -1.0102667808532715, "logps/rejected": -1.3443195819854736, "loss": 1.54, "rewards/accuracies": 0.5625, "rewards/chosen": -2.020533561706543, "rewards/margins": 0.6681055426597595, "rewards/rejected": -2.6886391639709473, "step": 1500 }, { "epoch": 0.39387594870452763, "grad_norm": 23.375, "learning_rate": 2.2783063582141454e-07, "logits/chosen": -2.2789902687072754, "logits/rejected": -2.275425434112549, "logps/chosen": -1.0516712665557861, "logps/rejected": -1.387373924255371, "loss": 1.506, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1033425331115723, "rewards/margins": 0.6714051961898804, "rewards/rejected": -2.774747848510742, "step": 1505 }, { "epoch": 0.39518450667364563, "grad_norm": 38.0, "learning_rate": 2.2724396191531244e-07, "logits/chosen": -2.4971213340759277, "logits/rejected": -2.3444957733154297, "logps/chosen": -1.0838091373443604, "logps/rejected": -1.2990797758102417, "loss": 1.6285, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.1676182746887207, "rewards/margins": 0.43054142594337463, "rewards/rejected": -2.5981595516204834, "step": 1510 }, { "epoch": 0.3964930646427637, "grad_norm": 5.59375, "learning_rate": 2.2665567553864752e-07, "logits/chosen": -2.367192029953003, "logits/rejected": -2.3863494396209717, "logps/chosen": -1.0418930053710938, "logps/rejected": -1.1464084386825562, "loss": 1.815, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -2.0837860107421875, "rewards/margins": 0.2090306580066681, "rewards/rejected": -2.2928168773651123, "step": 1515 }, { "epoch": 0.39780162261188173, "grad_norm": 39.75, "learning_rate": 2.260657889719197e-07, "logits/chosen": -2.380061388015747, "logits/rejected": -2.216752767562866, "logps/chosen": -0.9189685583114624, "logps/rejected": -1.2759473323822021, "loss": 1.436, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8379371166229248, "rewards/margins": 0.7139574885368347, "rewards/rejected": -2.5518946647644043, "step": 1520 }, { "epoch": 0.39911018058099973, "grad_norm": 32.25, "learning_rate": 2.2547431452903293e-07, "logits/chosen": -2.4317126274108887, "logits/rejected": -2.3179543018341064, "logps/chosen": -0.9951443672180176, "logps/rejected": -1.3966236114501953, "loss": 1.367, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9902887344360352, "rewards/margins": 0.8029584884643555, "rewards/rejected": -2.7932472229003906, "step": 1525 }, { "epoch": 0.4004187385501178, "grad_norm": 8.1875, "learning_rate": 2.2488126455703795e-07, "logits/chosen": -2.4274566173553467, "logits/rejected": -2.3697831630706787, "logps/chosen": -0.9997008442878723, "logps/rejected": -1.273087501525879, "loss": 1.5195, "rewards/accuracies": 0.625, "rewards/chosen": -1.9994016885757446, "rewards/margins": 0.5467732548713684, "rewards/rejected": -2.546175003051758, "step": 1530 }, { "epoch": 0.4017272965192358, "grad_norm": 31.375, "learning_rate": 2.2428665143587498e-07, "logits/chosen": -2.383143663406372, "logits/rejected": -2.3256874084472656, "logps/chosen": -1.0762752294540405, "logps/rejected": -1.258907437324524, "loss": 1.7187, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.152550458908081, "rewards/margins": 0.365264356136322, "rewards/rejected": -2.517814874649048, "step": 1535 }, { "epoch": 0.40303585448835383, "grad_norm": 6.15625, "learning_rate": 2.2369048757811492e-07, "logits/chosen": -2.340837001800537, "logits/rejected": -2.2795417308807373, "logps/chosen": -1.0112799406051636, "logps/rejected": -1.4303480386734009, "loss": 1.4838, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.022559881210327, "rewards/margins": 0.8381361961364746, "rewards/rejected": -2.8606960773468018, "step": 1540 }, { "epoch": 0.4043444124574719, "grad_norm": 26.875, "learning_rate": 2.230927854287003e-07, "logits/chosen": -2.3873379230499268, "logits/rejected": -2.383901596069336, "logps/chosen": -1.2201106548309326, "logps/rejected": -1.4484117031097412, "loss": 1.6286, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.4402213096618652, "rewards/margins": 0.45660200715065, "rewards/rejected": -2.8968234062194824, "step": 1545 }, { "epoch": 0.4056529704265899, "grad_norm": 26.0, "learning_rate": 2.224935574646856e-07, "logits/chosen": -2.4189586639404297, "logits/rejected": -2.3259055614471436, "logps/chosen": -1.053755521774292, "logps/rejected": -1.435430645942688, "loss": 1.5001, "rewards/accuracies": 0.625, "rewards/chosen": -2.107511043548584, "rewards/margins": 0.7633501291275024, "rewards/rejected": -2.870861291885376, "step": 1550 }, { "epoch": 0.4069615283957079, "grad_norm": 20.75, "learning_rate": 2.2189281619497678e-07, "logits/chosen": -2.3774755001068115, "logits/rejected": -2.3386740684509277, "logps/chosen": -1.161452054977417, "logps/rejected": -1.211249589920044, "loss": 1.8327, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.322904109954834, "rewards/margins": 0.09959502518177032, "rewards/rejected": -2.422499179840088, "step": 1555 }, { "epoch": 0.408270086364826, "grad_norm": 12.75, "learning_rate": 2.212905741600701e-07, "logits/chosen": -2.4253733158111572, "logits/rejected": -2.380932331085205, "logps/chosen": -1.0395207405090332, "logps/rejected": -1.2749578952789307, "loss": 1.562, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0790414810180664, "rewards/margins": 0.4708743691444397, "rewards/rejected": -2.5499157905578613, "step": 1560 }, { "epoch": 0.409578644333944, "grad_norm": 12.1875, "learning_rate": 2.2068684393179022e-07, "logits/chosen": -2.3719356060028076, "logits/rejected": -2.3233540058135986, "logps/chosen": -0.9532672762870789, "logps/rejected": -1.2395387887954712, "loss": 1.5239, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9065345525741577, "rewards/margins": 0.5725430250167847, "rewards/rejected": -2.4790775775909424, "step": 1565 }, { "epoch": 0.410887202303062, "grad_norm": 26.5, "learning_rate": 2.2008163811302809e-07, "logits/chosen": -2.4872236251831055, "logits/rejected": -2.3692429065704346, "logps/chosen": -1.0144637823104858, "logps/rejected": -1.2384027242660522, "loss": 1.6094, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0289275646209717, "rewards/margins": 0.44787779450416565, "rewards/rejected": -2.4768054485321045, "step": 1570 }, { "epoch": 0.4121957602721801, "grad_norm": 17.5, "learning_rate": 2.1947496933747748e-07, "logits/chosen": -2.4363608360290527, "logits/rejected": -2.2648589611053467, "logps/chosen": -1.0407702922821045, "logps/rejected": -1.3420772552490234, "loss": 1.5576, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.081540584564209, "rewards/margins": 0.6026141047477722, "rewards/rejected": -2.684154510498047, "step": 1575 }, { "epoch": 0.4135043182412981, "grad_norm": 11.75, "learning_rate": 2.1886685026937157e-07, "logits/chosen": -2.377046585083008, "logits/rejected": -2.2974839210510254, "logps/chosen": -1.0203555822372437, "logps/rejected": -1.2779319286346436, "loss": 1.513, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0407111644744873, "rewards/margins": 0.5151528120040894, "rewards/rejected": -2.555863857269287, "step": 1580 }, { "epoch": 0.4148128762104161, "grad_norm": 24.0, "learning_rate": 2.1825729360321846e-07, "logits/chosen": -2.4333748817443848, "logits/rejected": -2.360531806945801, "logps/chosen": -1.0672863721847534, "logps/rejected": -1.2936062812805176, "loss": 1.613, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.134572744369507, "rewards/margins": 0.4526399075984955, "rewards/rejected": -2.587212562561035, "step": 1585 }, { "epoch": 0.4161214341795342, "grad_norm": 19.625, "learning_rate": 2.1764631206353606e-07, "logits/chosen": -2.3645429611206055, "logits/rejected": -2.1343088150024414, "logps/chosen": -1.0360791683197021, "logps/rejected": -1.3076984882354736, "loss": 1.597, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0721583366394043, "rewards/margins": 0.5432384014129639, "rewards/rejected": -2.6153969764709473, "step": 1590 }, { "epoch": 0.4174299921486522, "grad_norm": 6.65625, "learning_rate": 2.1703391840458656e-07, "logits/chosen": -2.489915609359741, "logits/rejected": -2.354588508605957, "logps/chosen": -0.9339109659194946, "logps/rejected": -1.2715728282928467, "loss": 1.4615, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8678219318389893, "rewards/margins": 0.6753236055374146, "rewards/rejected": -2.5431456565856934, "step": 1595 }, { "epoch": 0.4187385501177702, "grad_norm": 11.25, "learning_rate": 2.1642012541011033e-07, "logits/chosen": -2.516678810119629, "logits/rejected": -2.329498529434204, "logps/chosen": -1.051903486251831, "logps/rejected": -1.2379695177078247, "loss": 1.6544, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.103806972503662, "rewards/margins": 0.3721316158771515, "rewards/rejected": -2.4759390354156494, "step": 1600 }, { "epoch": 0.4187385501177702, "eval_logits/chosen": -2.23919677734375, "eval_logits/rejected": -2.1361849308013916, "eval_logps/chosen": -1.020378828048706, "eval_logps/rejected": -1.2794970273971558, "eval_loss": 1.560106635093689, "eval_rewards/accuracies": 0.6205000281333923, "eval_rewards/chosen": -2.040757656097412, "eval_rewards/margins": 0.5182366967201233, "eval_rewards/rejected": -2.5589940547943115, "eval_runtime": 423.405, "eval_samples_per_second": 4.724, "eval_steps_per_second": 1.181, "step": 1600 }, { "epoch": 0.4200471080868883, "grad_norm": 10.5, "learning_rate": 2.1580494589305882e-07, "logits/chosen": -2.430644989013672, "logits/rejected": -2.3093252182006836, "logps/chosen": -0.9489940404891968, "logps/rejected": -1.2481306791305542, "loss": 1.4956, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8979880809783936, "rewards/margins": 0.5982732772827148, "rewards/rejected": -2.4962613582611084, "step": 1605 }, { "epoch": 0.4213556660560063, "grad_norm": 17.125, "learning_rate": 2.1518839269532718e-07, "logits/chosen": -2.4260668754577637, "logits/rejected": -2.2923803329467773, "logps/chosen": -1.0544755458831787, "logps/rejected": -1.2251731157302856, "loss": 1.6552, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.1089510917663574, "rewards/margins": 0.34139513969421387, "rewards/rejected": -2.4503462314605713, "step": 1610 }, { "epoch": 0.4226642240251243, "grad_norm": 7.625, "learning_rate": 2.145704786874862e-07, "logits/chosen": -2.3860607147216797, "logits/rejected": -2.321589469909668, "logps/chosen": -0.993428111076355, "logps/rejected": -1.240110993385315, "loss": 1.5831, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.98685622215271, "rewards/margins": 0.4933653771877289, "rewards/rejected": -2.48022198677063, "step": 1615 }, { "epoch": 0.4239727819942423, "grad_norm": 13.3125, "learning_rate": 2.1395121676851373e-07, "logits/chosen": -2.4365038871765137, "logits/rejected": -2.2111167907714844, "logps/chosen": -1.0990612506866455, "logps/rejected": -1.4180501699447632, "loss": 1.5335, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.198122501373291, "rewards/margins": 0.6379777789115906, "rewards/rejected": -2.8361003398895264, "step": 1620 }, { "epoch": 0.42528133996336037, "grad_norm": 13.625, "learning_rate": 2.1333061986552517e-07, "logits/chosen": -2.469062328338623, "logits/rejected": -2.3584277629852295, "logps/chosen": -0.939937949180603, "logps/rejected": -1.1771152019500732, "loss": 1.5622, "rewards/accuracies": 0.625, "rewards/chosen": -1.879875898361206, "rewards/margins": 0.47435441613197327, "rewards/rejected": -2.3542304039001465, "step": 1625 }, { "epoch": 0.4265898979324784, "grad_norm": 17.125, "learning_rate": 2.127087009335039e-07, "logits/chosen": -2.3647632598876953, "logits/rejected": -2.3372089862823486, "logps/chosen": -1.121917963027954, "logps/rejected": -1.422224998474121, "loss": 1.5967, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.243835926055908, "rewards/margins": 0.6006139516830444, "rewards/rejected": -2.844449996948242, "step": 1630 }, { "epoch": 0.4278984559015964, "grad_norm": 9.4375, "learning_rate": 2.1208547295503055e-07, "logits/chosen": -2.4563426971435547, "logits/rejected": -2.3446362018585205, "logps/chosen": -1.0581527948379517, "logps/rejected": -1.2183663845062256, "loss": 1.6703, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1163055896759033, "rewards/margins": 0.320427268743515, "rewards/rejected": -2.436732769012451, "step": 1635 }, { "epoch": 0.42920701387071447, "grad_norm": 15.9375, "learning_rate": 2.1146094894001232e-07, "logits/chosen": -2.3326759338378906, "logits/rejected": -2.356873035430908, "logps/chosen": -1.019042730331421, "logps/rejected": -1.2006655931472778, "loss": 1.6874, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.038085460662842, "rewards/margins": 0.36324557662010193, "rewards/rejected": -2.4013311862945557, "step": 1640 }, { "epoch": 0.4305155718398325, "grad_norm": 9.375, "learning_rate": 2.1083514192541108e-07, "logits/chosen": -2.497969388961792, "logits/rejected": -2.4648308753967285, "logps/chosen": -0.9925057291984558, "logps/rejected": -1.1867188215255737, "loss": 1.5958, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9850114583969116, "rewards/margins": 0.38842612504959106, "rewards/rejected": -2.3734376430511475, "step": 1645 }, { "epoch": 0.4318241298089505, "grad_norm": 22.875, "learning_rate": 2.1020806497497142e-07, "logits/chosen": -2.3821117877960205, "logits/rejected": -2.3183438777923584, "logps/chosen": -0.9481805562973022, "logps/rejected": -1.2898935079574585, "loss": 1.5259, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.8963611125946045, "rewards/margins": 0.6834259033203125, "rewards/rejected": -2.579787015914917, "step": 1650 }, { "epoch": 0.43313268777806857, "grad_norm": 20.0, "learning_rate": 2.0957973117894786e-07, "logits/chosen": -2.2719874382019043, "logits/rejected": -2.3260483741760254, "logps/chosen": -0.9907784461975098, "logps/rejected": -1.1269526481628418, "loss": 1.7048, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9815568923950195, "rewards/margins": 0.27234819531440735, "rewards/rejected": -2.2539052963256836, "step": 1655 }, { "epoch": 0.4344412457471866, "grad_norm": 8.75, "learning_rate": 2.0895015365383164e-07, "logits/chosen": -2.3466298580169678, "logits/rejected": -2.3033411502838135, "logps/chosen": -0.9524238705635071, "logps/rejected": -1.2471843957901, "loss": 1.5007, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9048477411270142, "rewards/margins": 0.589521050453186, "rewards/rejected": -2.4943687915802, "step": 1660 }, { "epoch": 0.4357498037163046, "grad_norm": 9.25, "learning_rate": 2.0831934554207698e-07, "logits/chosen": -2.4324917793273926, "logits/rejected": -2.3662221431732178, "logps/chosen": -1.050973653793335, "logps/rejected": -1.163582682609558, "loss": 1.7713, "rewards/accuracies": 0.5, "rewards/chosen": -2.10194730758667, "rewards/margins": 0.22521796822547913, "rewards/rejected": -2.327165365219116, "step": 1665 }, { "epoch": 0.43705836168542267, "grad_norm": 9.875, "learning_rate": 2.0768732001182643e-07, "logits/chosen": -2.3797950744628906, "logits/rejected": -2.290417194366455, "logps/chosen": -1.0599757432937622, "logps/rejected": -1.1102312803268433, "loss": 1.7929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1199514865875244, "rewards/margins": 0.10051125288009644, "rewards/rejected": -2.2204625606536865, "step": 1670 }, { "epoch": 0.4383669196545407, "grad_norm": 13.1875, "learning_rate": 2.070540902566363e-07, "logits/chosen": -2.4174249172210693, "logits/rejected": -2.274294376373291, "logps/chosen": -0.9530884027481079, "logps/rejected": -1.2381136417388916, "loss": 1.496, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9061768054962158, "rewards/margins": 0.5700503587722778, "rewards/rejected": -2.476227283477783, "step": 1675 }, { "epoch": 0.4396754776236587, "grad_norm": 5.46875, "learning_rate": 2.064196694952013e-07, "logits/chosen": -2.372811794281006, "logits/rejected": -2.385651111602783, "logps/chosen": -0.9906123280525208, "logps/rejected": -1.2131260633468628, "loss": 1.63, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9812246561050415, "rewards/margins": 0.44502776861190796, "rewards/rejected": -2.4262521266937256, "step": 1680 }, { "epoch": 0.44098403559277677, "grad_norm": 29.5, "learning_rate": 2.0578407097107822e-07, "logits/chosen": -2.524641990661621, "logits/rejected": -2.4137794971466064, "logps/chosen": -0.974774181842804, "logps/rejected": -1.220322847366333, "loss": 1.5569, "rewards/accuracies": 0.5625, "rewards/chosen": -1.949548363685608, "rewards/margins": 0.49109750986099243, "rewards/rejected": -2.440645694732666, "step": 1685 }, { "epoch": 0.44229259356189476, "grad_norm": 7.625, "learning_rate": 2.0514730795240973e-07, "logits/chosen": -2.317484140396118, "logits/rejected": -2.223536491394043, "logps/chosen": -1.0006580352783203, "logps/rejected": -1.3375447988510132, "loss": 1.4473, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0013160705566406, "rewards/margins": 0.6737735867500305, "rewards/rejected": -2.6750895977020264, "step": 1690 }, { "epoch": 0.4436011515310128, "grad_norm": 17.875, "learning_rate": 2.0450939373164744e-07, "logits/chosen": -2.526895046234131, "logits/rejected": -2.438276529312134, "logps/chosen": -1.034822702407837, "logps/rejected": -1.0600029230117798, "loss": 1.8158, "rewards/accuracies": 0.5625, "rewards/chosen": -2.069645404815674, "rewards/margins": 0.050360359251499176, "rewards/rejected": -2.1200058460235596, "step": 1695 }, { "epoch": 0.44490970950013087, "grad_norm": 14.9375, "learning_rate": 2.0387034162527433e-07, "logits/chosen": -2.309950351715088, "logits/rejected": -2.316476821899414, "logps/chosen": -0.9866979718208313, "logps/rejected": -1.1516833305358887, "loss": 1.7011, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9733959436416626, "rewards/margins": 0.32997068762779236, "rewards/rejected": -2.3033666610717773, "step": 1700 }, { "epoch": 0.44621826746924886, "grad_norm": 20.625, "learning_rate": 2.0323016497352672e-07, "logits/chosen": -2.433002233505249, "logits/rejected": -2.3623645305633545, "logps/chosen": -1.0685750246047974, "logps/rejected": -1.2361853122711182, "loss": 1.6608, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.1371500492095947, "rewards/margins": 0.33522042632102966, "rewards/rejected": -2.4723706245422363, "step": 1705 }, { "epoch": 0.4475268254383669, "grad_norm": 14.6875, "learning_rate": 2.0258887714011594e-07, "logits/chosen": -2.297880172729492, "logits/rejected": -2.2886757850646973, "logps/chosen": -1.0399835109710693, "logps/rejected": -1.4994447231292725, "loss": 1.3582, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.0799670219421387, "rewards/margins": 0.9189218282699585, "rewards/rejected": -2.998889446258545, "step": 1710 }, { "epoch": 0.44883538340748497, "grad_norm": 8.4375, "learning_rate": 2.019464915119492e-07, "logits/chosen": -2.468632221221924, "logits/rejected": -2.3343093395233154, "logps/chosen": -1.0056952238082886, "logps/rejected": -1.2218900918960571, "loss": 1.5858, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.011390447616577, "rewards/margins": 0.4323893189430237, "rewards/rejected": -2.4437801837921143, "step": 1715 }, { "epoch": 0.45014394137660296, "grad_norm": 24.0, "learning_rate": 2.0130302149885032e-07, "logits/chosen": -2.5109100341796875, "logits/rejected": -2.404381275177002, "logps/chosen": -0.919482409954071, "logps/rejected": -1.187798261642456, "loss": 1.5001, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.838964819908142, "rewards/margins": 0.5366318225860596, "rewards/rejected": -2.375596523284912, "step": 1720 }, { "epoch": 0.451452499345721, "grad_norm": 25.0, "learning_rate": 2.0065848053327957e-07, "logits/chosen": -2.378965139389038, "logits/rejected": -2.3417181968688965, "logps/chosen": -1.1118217706680298, "logps/rejected": -1.3456599712371826, "loss": 1.6217, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.2236435413360596, "rewards/margins": 0.4676761031150818, "rewards/rejected": -2.6913199424743652, "step": 1725 }, { "epoch": 0.45276105731483907, "grad_norm": 16.375, "learning_rate": 2.000128820700534e-07, "logits/chosen": -2.485917806625366, "logits/rejected": -2.459390163421631, "logps/chosen": -1.1392418146133423, "logps/rejected": -1.2941620349884033, "loss": 1.6969, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.2784836292266846, "rewards/margins": 0.30984050035476685, "rewards/rejected": -2.5883240699768066, "step": 1730 }, { "epoch": 0.45406961528395706, "grad_norm": 15.75, "learning_rate": 1.9936623958606372e-07, "logits/chosen": -2.4256553649902344, "logits/rejected": -2.1980483531951904, "logps/chosen": -1.05477774143219, "logps/rejected": -1.2226080894470215, "loss": 1.6641, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.10955548286438, "rewards/margins": 0.3356606364250183, "rewards/rejected": -2.445216178894043, "step": 1735 }, { "epoch": 0.4553781732530751, "grad_norm": 7.03125, "learning_rate": 1.987185665799963e-07, "logits/chosen": -2.3571059703826904, "logits/rejected": -2.2139298915863037, "logps/chosen": -1.0624784231185913, "logps/rejected": -1.2872092723846436, "loss": 1.5959, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1249568462371826, "rewards/margins": 0.44946184754371643, "rewards/rejected": -2.574418544769287, "step": 1740 }, { "epoch": 0.45668673122219317, "grad_norm": 18.125, "learning_rate": 1.9806987657204902e-07, "logits/chosen": -2.4822094440460205, "logits/rejected": -2.412656784057617, "logps/chosen": -0.9763482809066772, "logps/rejected": -1.2210361957550049, "loss": 1.5461, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9526965618133545, "rewards/margins": 0.48937588930130005, "rewards/rejected": -2.4420723915100098, "step": 1745 }, { "epoch": 0.45799528919131116, "grad_norm": 18.375, "learning_rate": 1.9742018310364992e-07, "logits/chosen": -2.4158544540405273, "logits/rejected": -2.3645384311676025, "logps/chosen": -0.9654977917671204, "logps/rejected": -1.2088598012924194, "loss": 1.5278, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9309955835342407, "rewards/margins": 0.4867241382598877, "rewards/rejected": -2.417719602584839, "step": 1750 }, { "epoch": 0.4593038471604292, "grad_norm": 15.625, "learning_rate": 1.9676949973717403e-07, "logits/chosen": -2.3639960289001465, "logits/rejected": -2.2483508586883545, "logps/chosen": -0.9691553115844727, "logps/rejected": -1.3462722301483154, "loss": 1.5464, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9383106231689453, "rewards/margins": 0.7542338371276855, "rewards/rejected": -2.692544460296631, "step": 1755 }, { "epoch": 0.46061240512954726, "grad_norm": 18.5, "learning_rate": 1.9611784005566085e-07, "logits/chosen": -2.503281831741333, "logits/rejected": -2.2975518703460693, "logps/chosen": -0.9205846786499023, "logps/rejected": -1.3108736276626587, "loss": 1.387, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8411693572998047, "rewards/margins": 0.7805778980255127, "rewards/rejected": -2.6217472553253174, "step": 1760 }, { "epoch": 0.46192096309866526, "grad_norm": 15.9375, "learning_rate": 1.9546521766253025e-07, "logits/chosen": -2.354820728302002, "logits/rejected": -2.2928662300109863, "logps/chosen": -1.067790150642395, "logps/rejected": -1.1816309690475464, "loss": 1.7342, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.13558030128479, "rewards/margins": 0.22768183052539825, "rewards/rejected": -2.3632619380950928, "step": 1765 }, { "epoch": 0.4632295210677833, "grad_norm": 22.375, "learning_rate": 1.9481164618129885e-07, "logits/chosen": -2.4465503692626953, "logits/rejected": -2.2854578495025635, "logps/chosen": -1.0381178855895996, "logps/rejected": -1.2207326889038086, "loss": 1.6874, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.076235771179199, "rewards/margins": 0.3652295470237732, "rewards/rejected": -2.441465377807617, "step": 1770 }, { "epoch": 0.4645380790369013, "grad_norm": 10.9375, "learning_rate": 1.9415713925529556e-07, "logits/chosen": -2.435256004333496, "logits/rejected": -2.2597994804382324, "logps/chosen": -1.0681062936782837, "logps/rejected": -1.2307686805725098, "loss": 1.6748, "rewards/accuracies": 0.625, "rewards/chosen": -2.1362125873565674, "rewards/margins": 0.3253251016139984, "rewards/rejected": -2.4615373611450195, "step": 1775 }, { "epoch": 0.46584663700601936, "grad_norm": 14.875, "learning_rate": 1.935017105473766e-07, "logits/chosen": -2.44612455368042, "logits/rejected": -2.3136560916900635, "logps/chosen": -1.0957090854644775, "logps/rejected": -1.2843434810638428, "loss": 1.7127, "rewards/accuracies": 0.5625, "rewards/chosen": -2.191418170928955, "rewards/margins": 0.3772687017917633, "rewards/rejected": -2.5686869621276855, "step": 1780 }, { "epoch": 0.4671551949751374, "grad_norm": 9.3125, "learning_rate": 1.928453737396405e-07, "logits/chosen": -2.403357744216919, "logits/rejected": -2.257378578186035, "logps/chosen": -1.0548166036605835, "logps/rejected": -1.3808900117874146, "loss": 1.5488, "rewards/accuracies": 0.625, "rewards/chosen": -2.109633207321167, "rewards/margins": 0.6521469950675964, "rewards/rejected": -2.761780023574829, "step": 1785 }, { "epoch": 0.4684637529442554, "grad_norm": 8.125, "learning_rate": 1.9218814253314248e-07, "logits/chosen": -2.3236823081970215, "logits/rejected": -2.261929988861084, "logps/chosen": -1.01413893699646, "logps/rejected": -1.367406964302063, "loss": 1.4269, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.02827787399292, "rewards/margins": 0.7065359354019165, "rewards/rejected": -2.734813928604126, "step": 1790 }, { "epoch": 0.46977231091337346, "grad_norm": 18.875, "learning_rate": 1.9153003064760817e-07, "logits/chosen": -2.4380874633789062, "logits/rejected": -2.291417360305786, "logps/chosen": -0.9332562685012817, "logps/rejected": -1.280174970626831, "loss": 1.5439, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8665125370025635, "rewards/margins": 0.6938372254371643, "rewards/rejected": -2.560349941253662, "step": 1795 }, { "epoch": 0.4710808688824915, "grad_norm": 8.75, "learning_rate": 1.908710518211476e-07, "logits/chosen": -2.4104881286621094, "logits/rejected": -2.4083914756774902, "logps/chosen": -0.9653786420822144, "logps/rejected": -1.150954246520996, "loss": 1.6125, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9307572841644287, "rewards/margins": 0.37115129828453064, "rewards/rejected": -2.301908493041992, "step": 1800 }, { "epoch": 0.4723894268516095, "grad_norm": 12.875, "learning_rate": 1.902112198099682e-07, "logits/chosen": -2.4105076789855957, "logits/rejected": -2.282792806625366, "logps/chosen": -0.9747291803359985, "logps/rejected": -1.3606829643249512, "loss": 1.4708, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.949458360671997, "rewards/margins": 0.7719077467918396, "rewards/rejected": -2.7213659286499023, "step": 1805 }, { "epoch": 0.47369798482072756, "grad_norm": 10.9375, "learning_rate": 1.8955054838808755e-07, "logits/chosen": -2.3875927925109863, "logits/rejected": -2.3702666759490967, "logps/chosen": -1.0358930826187134, "logps/rejected": -1.2059333324432373, "loss": 1.6309, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0717861652374268, "rewards/margins": 0.34008073806762695, "rewards/rejected": -2.4118666648864746, "step": 1810 }, { "epoch": 0.4750065427898456, "grad_norm": 16.625, "learning_rate": 1.8888905134704599e-07, "logits/chosen": -2.360687732696533, "logits/rejected": -2.3041653633117676, "logps/chosen": -1.0096989870071411, "logps/rejected": -1.2296128273010254, "loss": 1.5579, "rewards/accuracies": 0.625, "rewards/chosen": -2.0193979740142822, "rewards/margins": 0.4398278295993805, "rewards/rejected": -2.459225654602051, "step": 1815 }, { "epoch": 0.4763151007589636, "grad_norm": 6.5625, "learning_rate": 1.882267424956188e-07, "logits/chosen": -2.4011521339416504, "logits/rejected": -2.2532505989074707, "logps/chosen": -1.052173137664795, "logps/rejected": -1.239912748336792, "loss": 1.6371, "rewards/accuracies": 0.625, "rewards/chosen": -2.10434627532959, "rewards/margins": 0.37547945976257324, "rewards/rejected": -2.479825496673584, "step": 1820 }, { "epoch": 0.47762365872808166, "grad_norm": 13.25, "learning_rate": 1.8756363565952778e-07, "logits/chosen": -2.4511189460754395, "logits/rejected": -2.3730571269989014, "logps/chosen": -1.110384225845337, "logps/rejected": -1.2998292446136475, "loss": 1.6052, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.220768451690674, "rewards/margins": 0.3788902163505554, "rewards/rejected": -2.599658489227295, "step": 1825 }, { "epoch": 0.4789322166971997, "grad_norm": 13.625, "learning_rate": 1.868997446811526e-07, "logits/chosen": -2.475877046585083, "logits/rejected": -2.3332247734069824, "logps/chosen": -0.9946733713150024, "logps/rejected": -1.329590082168579, "loss": 1.465, "rewards/accuracies": 0.625, "rewards/chosen": -1.9893467426300049, "rewards/margins": 0.6698335409164429, "rewards/rejected": -2.659180164337158, "step": 1830 }, { "epoch": 0.4802407746663177, "grad_norm": 36.0, "learning_rate": 1.8623508341924198e-07, "logits/chosen": -2.3308472633361816, "logits/rejected": -2.249074697494507, "logps/chosen": -0.9235193133354187, "logps/rejected": -1.179910659790039, "loss": 1.5437, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8470386266708374, "rewards/margins": 0.5127825140953064, "rewards/rejected": -2.359821319580078, "step": 1835 }, { "epoch": 0.48154933263543576, "grad_norm": 42.75, "learning_rate": 1.8556966574862444e-07, "logits/chosen": -2.343470811843872, "logits/rejected": -2.209136486053467, "logps/chosen": -0.9127250909805298, "logps/rejected": -1.246764898300171, "loss": 1.4266, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8254501819610596, "rewards/margins": 0.6680800318717957, "rewards/rejected": -2.493529796600342, "step": 1840 }, { "epoch": 0.4828578906045538, "grad_norm": 9.625, "learning_rate": 1.8490350555991835e-07, "logits/chosen": -2.443899154663086, "logits/rejected": -2.2177982330322266, "logps/chosen": -0.9722299575805664, "logps/rejected": -1.1766026020050049, "loss": 1.5794, "rewards/accuracies": 0.625, "rewards/chosen": -1.9444599151611328, "rewards/margins": 0.4087451100349426, "rewards/rejected": -2.3532052040100098, "step": 1845 }, { "epoch": 0.4841664485736718, "grad_norm": 10.0, "learning_rate": 1.8423661675924242e-07, "logits/chosen": -2.4339308738708496, "logits/rejected": -2.3532307147979736, "logps/chosen": -1.0873157978057861, "logps/rejected": -1.3236525058746338, "loss": 1.5892, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1746315956115723, "rewards/margins": 0.47267311811447144, "rewards/rejected": -2.6473050117492676, "step": 1850 }, { "epoch": 0.48547500654278986, "grad_norm": 43.0, "learning_rate": 1.8356901326792495e-07, "logits/chosen": -2.4787116050720215, "logits/rejected": -2.293734312057495, "logps/chosen": -1.0998144149780273, "logps/rejected": -1.2502272129058838, "loss": 1.709, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1996288299560547, "rewards/margins": 0.30082595348358154, "rewards/rejected": -2.5004544258117676, "step": 1855 }, { "epoch": 0.48678356451190785, "grad_norm": 10.875, "learning_rate": 1.8290070902221358e-07, "logits/chosen": -2.3675265312194824, "logits/rejected": -2.335507869720459, "logps/chosen": -0.9375478625297546, "logps/rejected": -1.1675219535827637, "loss": 1.5432, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8750957250595093, "rewards/margins": 0.4599483013153076, "rewards/rejected": -2.3350439071655273, "step": 1860 }, { "epoch": 0.4880921224810259, "grad_norm": 21.625, "learning_rate": 1.8223171797298412e-07, "logits/chosen": -2.4071269035339355, "logits/rejected": -2.2741475105285645, "logps/chosen": -0.9936102032661438, "logps/rejected": -1.2281229496002197, "loss": 1.5934, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9872204065322876, "rewards/margins": 0.4690253734588623, "rewards/rejected": -2.4562458992004395, "step": 1865 }, { "epoch": 0.48940068045014395, "grad_norm": 30.75, "learning_rate": 1.8156205408544948e-07, "logits/chosen": -2.3966362476348877, "logits/rejected": -2.319197654724121, "logps/chosen": -1.049408197402954, "logps/rejected": -1.2393567562103271, "loss": 1.6611, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.098816394805908, "rewards/margins": 0.3798971176147461, "rewards/rejected": -2.4787135124206543, "step": 1870 }, { "epoch": 0.49070923841926195, "grad_norm": 18.875, "learning_rate": 1.8089173133886808e-07, "logits/chosen": -2.4367737770080566, "logits/rejected": -2.397002935409546, "logps/chosen": -0.9955056309700012, "logps/rejected": -1.2399742603302002, "loss": 1.5765, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9910112619400024, "rewards/margins": 0.48893752694129944, "rewards/rejected": -2.4799485206604004, "step": 1875 }, { "epoch": 0.49201779638838, "grad_norm": 22.0, "learning_rate": 1.8022076372625213e-07, "logits/chosen": -2.4092354774475098, "logits/rejected": -2.231597900390625, "logps/chosen": -0.9818722009658813, "logps/rejected": -1.3030831813812256, "loss": 1.4964, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9637444019317627, "rewards/margins": 0.6424221396446228, "rewards/rejected": -2.606166362762451, "step": 1880 }, { "epoch": 0.49332635435749805, "grad_norm": 7.375, "learning_rate": 1.795491652540753e-07, "logits/chosen": -2.4213340282440186, "logits/rejected": -2.3352389335632324, "logps/chosen": -0.9991891980171204, "logps/rejected": -1.2178561687469482, "loss": 1.5775, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9983783960342407, "rewards/margins": 0.4373340606689453, "rewards/rejected": -2.4357123374938965, "step": 1885 }, { "epoch": 0.49463491232661605, "grad_norm": 13.1875, "learning_rate": 1.7887694994198052e-07, "logits/chosen": -2.3088021278381348, "logits/rejected": -2.2568607330322266, "logps/chosen": -1.0938889980316162, "logps/rejected": -1.3111975193023682, "loss": 1.5978, "rewards/accuracies": 0.625, "rewards/chosen": -2.1877779960632324, "rewards/margins": 0.43461722135543823, "rewards/rejected": -2.6223950386047363, "step": 1890 }, { "epoch": 0.4959434702957341, "grad_norm": 14.8125, "learning_rate": 1.7820413182248736e-07, "logits/chosen": -2.3785736560821533, "logits/rejected": -2.225630044937134, "logps/chosen": -0.9674897193908691, "logps/rejected": -1.3713101148605347, "loss": 1.4464, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9349794387817383, "rewards/margins": 0.8076407313346863, "rewards/rejected": -2.7426202297210693, "step": 1895 }, { "epoch": 0.49725202826485215, "grad_norm": 22.5, "learning_rate": 1.7753072494069897e-07, "logits/chosen": -2.3734800815582275, "logits/rejected": -2.175264358520508, "logps/chosen": -1.2184627056121826, "logps/rejected": -1.4337811470031738, "loss": 1.7161, "rewards/accuracies": 0.5625, "rewards/chosen": -2.4369254112243652, "rewards/margins": 0.4306368827819824, "rewards/rejected": -2.8675622940063477, "step": 1900 }, { "epoch": 0.49856058623397015, "grad_norm": 43.75, "learning_rate": 1.7685674335400882e-07, "logits/chosen": -2.4453847408294678, "logits/rejected": -2.3668155670166016, "logps/chosen": -1.1483983993530273, "logps/rejected": -1.2579691410064697, "loss": 1.7986, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.2967967987060547, "rewards/margins": 0.2191413938999176, "rewards/rejected": -2.5159382820129395, "step": 1905 }, { "epoch": 0.4998691442030882, "grad_norm": 21.25, "learning_rate": 1.7618220113180756e-07, "logits/chosen": -2.346877336502075, "logits/rejected": -2.3551878929138184, "logps/chosen": -1.0282467603683472, "logps/rejected": -1.305271029472351, "loss": 1.583, "rewards/accuracies": 0.625, "rewards/chosen": -2.0564935207366943, "rewards/margins": 0.5540486574172974, "rewards/rejected": -2.610542058944702, "step": 1910 }, { "epoch": 0.5011777021722063, "grad_norm": 16.5, "learning_rate": 1.7550711235518897e-07, "logits/chosen": -2.459246873855591, "logits/rejected": -2.3898706436157227, "logps/chosen": -0.9707461595535278, "logps/rejected": -1.192016839981079, "loss": 1.5939, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9414923191070557, "rewards/margins": 0.4425415098667145, "rewards/rejected": -2.384033679962158, "step": 1915 }, { "epoch": 0.5024862601413242, "grad_norm": 9.625, "learning_rate": 1.748314911166563e-07, "logits/chosen": -2.3969197273254395, "logits/rejected": -2.304248809814453, "logps/chosen": -1.0582568645477295, "logps/rejected": -1.4008970260620117, "loss": 1.5322, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.116513729095459, "rewards/margins": 0.6852802634239197, "rewards/rejected": -2.8017940521240234, "step": 1920 }, { "epoch": 0.5037948181104422, "grad_norm": 27.25, "learning_rate": 1.7415535151982792e-07, "logits/chosen": -2.4160635471343994, "logits/rejected": -2.3063693046569824, "logps/chosen": -0.9386729001998901, "logps/rejected": -1.1574931144714355, "loss": 1.5813, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.8773458003997803, "rewards/margins": 0.4376404285430908, "rewards/rejected": -2.314986228942871, "step": 1925 }, { "epoch": 0.5051033760795604, "grad_norm": 18.25, "learning_rate": 1.7347870767914282e-07, "logits/chosen": -2.4901957511901855, "logits/rejected": -2.391278028488159, "logps/chosen": -0.9290682673454285, "logps/rejected": -1.247491478919983, "loss": 1.4748, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.858136534690857, "rewards/margins": 0.6368464231491089, "rewards/rejected": -2.494982957839966, "step": 1930 }, { "epoch": 0.5064119340486783, "grad_norm": 9.0, "learning_rate": 1.7280157371956627e-07, "logits/chosen": -2.36393404006958, "logits/rejected": -2.1695334911346436, "logps/chosen": -1.1287975311279297, "logps/rejected": -1.5857765674591064, "loss": 1.4663, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.2575950622558594, "rewards/margins": 0.913957953453064, "rewards/rejected": -3.171553134918213, "step": 1935 }, { "epoch": 0.5077204920177963, "grad_norm": 10.875, "learning_rate": 1.7212396377629475e-07, "logits/chosen": -2.464613676071167, "logits/rejected": -2.3147294521331787, "logps/chosen": -0.9329797029495239, "logps/rejected": -1.2606232166290283, "loss": 1.5215, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8659594058990479, "rewards/margins": 0.6552866697311401, "rewards/rejected": -2.5212464332580566, "step": 1940 }, { "epoch": 0.5090290499869145, "grad_norm": 13.75, "learning_rate": 1.714458919944609e-07, "logits/chosen": -2.3663437366485596, "logits/rejected": -2.3324458599090576, "logps/chosen": -0.9812591671943665, "logps/rejected": -1.15121328830719, "loss": 1.674, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.962518334388733, "rewards/margins": 0.3399081826210022, "rewards/rejected": -2.30242657661438, "step": 1945 }, { "epoch": 0.5103376079560324, "grad_norm": 11.625, "learning_rate": 1.7076737252883823e-07, "logits/chosen": -2.470449686050415, "logits/rejected": -2.3263020515441895, "logps/chosen": -1.0773837566375732, "logps/rejected": -1.341399908065796, "loss": 1.5254, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1547675132751465, "rewards/margins": 0.528032124042511, "rewards/rejected": -2.682799816131592, "step": 1950 }, { "epoch": 0.5116461659251504, "grad_norm": 12.4375, "learning_rate": 1.7008841954354564e-07, "logits/chosen": -2.343871593475342, "logits/rejected": -2.4319007396698, "logps/chosen": -0.975444495677948, "logps/rejected": -1.1809990406036377, "loss": 1.5955, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.950888991355896, "rewards/margins": 0.41110944747924805, "rewards/rejected": -2.3619980812072754, "step": 1955 }, { "epoch": 0.5129547238942685, "grad_norm": 9.75, "learning_rate": 1.694090472117519e-07, "logits/chosen": -2.4474902153015137, "logits/rejected": -2.2655673027038574, "logps/chosen": -1.0202564001083374, "logps/rejected": -1.2957381010055542, "loss": 1.5334, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.040512800216675, "rewards/margins": 0.5509636998176575, "rewards/rejected": -2.5914762020111084, "step": 1960 }, { "epoch": 0.5142632818633865, "grad_norm": 31.75, "learning_rate": 1.6872926971537951e-07, "logits/chosen": -2.4518609046936035, "logits/rejected": -2.330719470977783, "logps/chosen": -0.9688348770141602, "logps/rejected": -1.2229888439178467, "loss": 1.5408, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9376697540283203, "rewards/margins": 0.5083078145980835, "rewards/rejected": -2.4459776878356934, "step": 1965 }, { "epoch": 0.5155718398325045, "grad_norm": 32.25, "learning_rate": 1.680491012448089e-07, "logits/chosen": -2.4120161533355713, "logits/rejected": -2.366097927093506, "logps/chosen": -1.0484545230865479, "logps/rejected": -1.2525670528411865, "loss": 1.6475, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0969090461730957, "rewards/margins": 0.4082249701023102, "rewards/rejected": -2.505134105682373, "step": 1970 }, { "epoch": 0.5168803978016226, "grad_norm": 20.5, "learning_rate": 1.67368555998582e-07, "logits/chosen": -2.4698212146759033, "logits/rejected": -2.3489651679992676, "logps/chosen": -0.949353039264679, "logps/rejected": -1.2221812009811401, "loss": 1.4896, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.898706078529358, "rewards/margins": 0.5456562638282776, "rewards/rejected": -2.4443624019622803, "step": 1975 }, { "epoch": 0.5181889557707406, "grad_norm": 17.875, "learning_rate": 1.666876481831061e-07, "logits/chosen": -2.4362967014312744, "logits/rejected": -2.339439868927002, "logps/chosen": -0.9847452044487, "logps/rejected": -1.291522741317749, "loss": 1.5303, "rewards/accuracies": 0.625, "rewards/chosen": -1.9694904088974, "rewards/margins": 0.6135552525520325, "rewards/rejected": -2.583045482635498, "step": 1980 }, { "epoch": 0.5194975137398586, "grad_norm": 17.25, "learning_rate": 1.6600639201235694e-07, "logits/chosen": -2.4222829341888428, "logits/rejected": -2.2653541564941406, "logps/chosen": -0.9715536832809448, "logps/rejected": -1.179700493812561, "loss": 1.6093, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9431073665618896, "rewards/margins": 0.41629353165626526, "rewards/rejected": -2.359400987625122, "step": 1985 }, { "epoch": 0.5208060717089767, "grad_norm": 30.375, "learning_rate": 1.6532480170758234e-07, "logits/chosen": -2.3355214595794678, "logits/rejected": -2.193406105041504, "logps/chosen": -1.0776045322418213, "logps/rejected": -1.334547519683838, "loss": 1.703, "rewards/accuracies": 0.5625, "rewards/chosen": -2.1552090644836426, "rewards/margins": 0.5138862729072571, "rewards/rejected": -2.669095039367676, "step": 1990 }, { "epoch": 0.5221146296780947, "grad_norm": 18.75, "learning_rate": 1.646428914970051e-07, "logits/chosen": -2.4154224395751953, "logits/rejected": -2.2866575717926025, "logps/chosen": -1.005456566810608, "logps/rejected": -1.2554142475128174, "loss": 1.565, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.010913133621216, "rewards/margins": 0.49991583824157715, "rewards/rejected": -2.5108284950256348, "step": 1995 }, { "epoch": 0.5234231876472127, "grad_norm": 26.25, "learning_rate": 1.6396067561552614e-07, "logits/chosen": -2.4819350242614746, "logits/rejected": -2.3661563396453857, "logps/chosen": -0.9648916125297546, "logps/rejected": -1.3888318538665771, "loss": 1.4432, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9297832250595093, "rewards/margins": 0.847880482673645, "rewards/rejected": -2.7776637077331543, "step": 2000 }, { "epoch": 0.5234231876472127, "eval_logits/chosen": -2.239804983139038, "eval_logits/rejected": -2.1369974613189697, "eval_logps/chosen": -1.0143417119979858, "eval_logps/rejected": -1.2770490646362305, "eval_loss": 1.5559983253479004, "eval_rewards/accuracies": 0.6215000152587891, "eval_rewards/chosen": -2.0286834239959717, "eval_rewards/margins": 0.5254148840904236, "eval_rewards/rejected": -2.554098129272461, "eval_runtime": 423.7638, "eval_samples_per_second": 4.72, "eval_steps_per_second": 1.18, "step": 2000 }, { "epoch": 0.5247317456163308, "grad_norm": 25.125, "learning_rate": 1.6327816830442726e-07, "logits/chosen": -2.507526397705078, "logits/rejected": -2.35188627243042, "logps/chosen": -0.9882357716560364, "logps/rejected": -1.2618719339370728, "loss": 1.5751, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9764715433120728, "rewards/margins": 0.5472723245620728, "rewards/rejected": -2.5237438678741455, "step": 2005 }, { "epoch": 0.5260403035854488, "grad_norm": 23.0, "learning_rate": 1.625953838110737e-07, "logits/chosen": -2.3285303115844727, "logits/rejected": -2.254697322845459, "logps/chosen": -1.1001182794570923, "logps/rejected": -1.2735798358917236, "loss": 1.6586, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.2002365589141846, "rewards/margins": 0.34692299365997314, "rewards/rejected": -2.5471596717834473, "step": 2010 }, { "epoch": 0.5273488615545668, "grad_norm": 8.75, "learning_rate": 1.6191233638861705e-07, "logits/chosen": -2.4660801887512207, "logits/rejected": -2.423872709274292, "logps/chosen": -1.0307989120483398, "logps/rejected": -1.2134575843811035, "loss": 1.6369, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0615978240966797, "rewards/margins": 0.3653172552585602, "rewards/rejected": -2.426915168762207, "step": 2015 }, { "epoch": 0.528657419523685, "grad_norm": 12.5, "learning_rate": 1.6122904029569762e-07, "logits/chosen": -2.5295941829681396, "logits/rejected": -2.4000179767608643, "logps/chosen": -1.0093097686767578, "logps/rejected": -1.147658348083496, "loss": 1.695, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0186195373535156, "rewards/margins": 0.27669695019721985, "rewards/rejected": -2.295316696166992, "step": 2020 }, { "epoch": 0.5299659774928029, "grad_norm": 31.75, "learning_rate": 1.6054550979614655e-07, "logits/chosen": -2.430546522140503, "logits/rejected": -2.32780122756958, "logps/chosen": -1.0063021183013916, "logps/rejected": -1.3822518587112427, "loss": 1.4438, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.012604236602783, "rewards/margins": 0.7518996596336365, "rewards/rejected": -2.7645037174224854, "step": 2025 }, { "epoch": 0.5312745354619209, "grad_norm": 17.375, "learning_rate": 1.5986175915868835e-07, "logits/chosen": -2.4233877658843994, "logits/rejected": -2.306253671646118, "logps/chosen": -1.0322774648666382, "logps/rejected": -1.2316523790359497, "loss": 1.5922, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.0645549297332764, "rewards/margins": 0.39874956011772156, "rewards/rejected": -2.4633047580718994, "step": 2030 }, { "epoch": 0.532583093431039, "grad_norm": 18.625, "learning_rate": 1.5917780265664286e-07, "logits/chosen": -2.349076509475708, "logits/rejected": -2.265036106109619, "logps/chosen": -1.0076414346694946, "logps/rejected": -1.2280337810516357, "loss": 1.6507, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.0152828693389893, "rewards/margins": 0.4407844543457031, "rewards/rejected": -2.4560675621032715, "step": 2035 }, { "epoch": 0.533891651400157, "grad_norm": 9.125, "learning_rate": 1.5849365456762736e-07, "logits/chosen": -2.4706027507781982, "logits/rejected": -2.2760281562805176, "logps/chosen": -1.076223611831665, "logps/rejected": -1.3633537292480469, "loss": 1.5503, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.15244722366333, "rewards/margins": 0.5742602348327637, "rewards/rejected": -2.7267074584960938, "step": 2040 }, { "epoch": 0.535200209369275, "grad_norm": 24.125, "learning_rate": 1.578093291732585e-07, "logits/chosen": -2.479267120361328, "logits/rejected": -2.299891233444214, "logps/chosen": -0.9730592966079712, "logps/rejected": -1.144584059715271, "loss": 1.6204, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9461185932159424, "rewards/margins": 0.34304967522621155, "rewards/rejected": -2.289168119430542, "step": 2045 }, { "epoch": 0.5365087673383931, "grad_norm": 8.5, "learning_rate": 1.5712484075885425e-07, "logits/chosen": -2.4434080123901367, "logits/rejected": -2.283897876739502, "logps/chosen": -0.9065520167350769, "logps/rejected": -1.3221280574798584, "loss": 1.4204, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.8131040334701538, "rewards/margins": 0.8311521410942078, "rewards/rejected": -2.644256114959717, "step": 2050 }, { "epoch": 0.5378173253075111, "grad_norm": 18.125, "learning_rate": 1.564402036131355e-07, "logits/chosen": -2.330359697341919, "logits/rejected": -2.3039302825927734, "logps/chosen": -1.0314123630523682, "logps/rejected": -1.2567552328109741, "loss": 1.578, "rewards/accuracies": 0.625, "rewards/chosen": -2.0628247261047363, "rewards/margins": 0.4506858289241791, "rewards/rejected": -2.5135104656219482, "step": 2055 }, { "epoch": 0.5391258832766291, "grad_norm": 15.3125, "learning_rate": 1.5575543202792814e-07, "logits/chosen": -2.4322237968444824, "logits/rejected": -2.3605399131774902, "logps/chosen": -0.9282668232917786, "logps/rejected": -1.1992113590240479, "loss": 1.5212, "rewards/accuracies": 0.625, "rewards/chosen": -1.8565336465835571, "rewards/margins": 0.5418893694877625, "rewards/rejected": -2.3984227180480957, "step": 2060 }, { "epoch": 0.5404344412457471, "grad_norm": 13.0625, "learning_rate": 1.5507054029786424e-07, "logits/chosen": -2.4302916526794434, "logits/rejected": -2.3038675785064697, "logps/chosen": -1.0414844751358032, "logps/rejected": -1.3107296228408813, "loss": 1.5208, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0829689502716064, "rewards/margins": 0.5384901762008667, "rewards/rejected": -2.6214592456817627, "step": 2065 }, { "epoch": 0.5417429992148652, "grad_norm": 24.75, "learning_rate": 1.54385542720084e-07, "logits/chosen": -2.434128761291504, "logits/rejected": -2.386500358581543, "logps/chosen": -0.9478904604911804, "logps/rejected": -1.2053724527359009, "loss": 1.6075, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8957809209823608, "rewards/margins": 0.5149640440940857, "rewards/rejected": -2.4107449054718018, "step": 2070 }, { "epoch": 0.5430515571839832, "grad_norm": 7.25, "learning_rate": 1.5370045359393723e-07, "logits/chosen": -2.3839330673217773, "logits/rejected": -2.3697433471679688, "logps/chosen": -0.9661067128181458, "logps/rejected": -1.2642197608947754, "loss": 1.5249, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9322134256362915, "rewards/margins": 0.5962256789207458, "rewards/rejected": -2.528439521789551, "step": 2075 }, { "epoch": 0.5443601151531012, "grad_norm": 3.921875, "learning_rate": 1.530152872206848e-07, "logits/chosen": -2.5556118488311768, "logits/rejected": -2.519317150115967, "logps/chosen": -0.9654792547225952, "logps/rejected": -1.094806432723999, "loss": 1.7607, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.9309585094451904, "rewards/margins": 0.25865429639816284, "rewards/rejected": -2.189612865447998, "step": 2080 }, { "epoch": 0.5456686731222193, "grad_norm": 8.3125, "learning_rate": 1.5233005790320003e-07, "logits/chosen": -2.3626646995544434, "logits/rejected": -2.214515209197998, "logps/chosen": -0.9589771032333374, "logps/rejected": -1.2014695405960083, "loss": 1.6109, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9179542064666748, "rewards/margins": 0.48498496413230896, "rewards/rejected": -2.4029390811920166, "step": 2085 }, { "epoch": 0.5469772310913373, "grad_norm": 70.5, "learning_rate": 1.5164477994567038e-07, "logits/chosen": -2.491574764251709, "logits/rejected": -2.3851351737976074, "logps/chosen": -0.9691876173019409, "logps/rejected": -1.2715169191360474, "loss": 1.5025, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9383752346038818, "rewards/margins": 0.6046585440635681, "rewards/rejected": -2.5430338382720947, "step": 2090 }, { "epoch": 0.5482857890604553, "grad_norm": 8.625, "learning_rate": 1.5095946765329846e-07, "logits/chosen": -2.402458667755127, "logits/rejected": -2.2877895832061768, "logps/chosen": -1.0700018405914307, "logps/rejected": -1.276000738143921, "loss": 1.6029, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1400036811828613, "rewards/margins": 0.41199809312820435, "rewards/rejected": -2.552001476287842, "step": 2095 }, { "epoch": 0.5495943470295734, "grad_norm": 5.375, "learning_rate": 1.5027413533200383e-07, "logits/chosen": -2.361891508102417, "logits/rejected": -2.401979684829712, "logps/chosen": -1.1186949014663696, "logps/rejected": -1.2406280040740967, "loss": 1.697, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.2373898029327393, "rewards/margins": 0.24386653304100037, "rewards/rejected": -2.4812560081481934, "step": 2100 }, { "epoch": 0.5509029049986914, "grad_norm": 20.125, "learning_rate": 1.49588797288124e-07, "logits/chosen": -2.332409381866455, "logits/rejected": -2.265857219696045, "logps/chosen": -1.0127445459365845, "logps/rejected": -1.2471717596054077, "loss": 1.6224, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.025489091873169, "rewards/margins": 0.4688544273376465, "rewards/rejected": -2.4943435192108154, "step": 2105 }, { "epoch": 0.5522114629678094, "grad_norm": 26.0, "learning_rate": 1.4890346782811597e-07, "logits/chosen": -2.3044750690460205, "logits/rejected": -2.203080654144287, "logps/chosen": -0.9690456390380859, "logps/rejected": -1.2682809829711914, "loss": 1.5275, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9380912780761719, "rewards/margins": 0.5984705686569214, "rewards/rejected": -2.536561965942383, "step": 2110 }, { "epoch": 0.5535200209369275, "grad_norm": 22.125, "learning_rate": 1.4821816125825763e-07, "logits/chosen": -2.466495990753174, "logits/rejected": -2.355477809906006, "logps/chosen": -1.0085746049880981, "logps/rejected": -1.231109380722046, "loss": 1.5813, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0171492099761963, "rewards/margins": 0.4450693130493164, "rewards/rejected": -2.462218761444092, "step": 2115 }, { "epoch": 0.5548285789060455, "grad_norm": 9.375, "learning_rate": 1.475328918843489e-07, "logits/chosen": -2.4214587211608887, "logits/rejected": -2.333212375640869, "logps/chosen": -0.9313311576843262, "logps/rejected": -1.3653498888015747, "loss": 1.3689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8626623153686523, "rewards/margins": 0.8680371046066284, "rewards/rejected": -2.7306997776031494, "step": 2120 }, { "epoch": 0.5561371368751635, "grad_norm": 19.0, "learning_rate": 1.4684767401141337e-07, "logits/chosen": -2.4324254989624023, "logits/rejected": -2.3711626529693604, "logps/chosen": -1.1481876373291016, "logps/rejected": -1.4396809339523315, "loss": 1.6359, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.296375274658203, "rewards/margins": 0.5829869508743286, "rewards/rejected": -2.879361867904663, "step": 2125 }, { "epoch": 0.5574456948442816, "grad_norm": 18.125, "learning_rate": 1.461625219433995e-07, "logits/chosen": -2.3910369873046875, "logits/rejected": -2.3180079460144043, "logps/chosen": -0.9503241777420044, "logps/rejected": -1.2157315015792847, "loss": 1.5251, "rewards/accuracies": 0.625, "rewards/chosen": -1.9006483554840088, "rewards/margins": 0.5308147072792053, "rewards/rejected": -2.4314630031585693, "step": 2130 }, { "epoch": 0.5587542528133996, "grad_norm": 18.625, "learning_rate": 1.4547744998288204e-07, "logits/chosen": -2.3879895210266113, "logits/rejected": -2.299638509750366, "logps/chosen": -1.0076770782470703, "logps/rejected": -1.1975195407867432, "loss": 1.6359, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0153541564941406, "rewards/margins": 0.3796852231025696, "rewards/rejected": -2.3950390815734863, "step": 2135 }, { "epoch": 0.5600628107825176, "grad_norm": 21.5, "learning_rate": 1.447924724307635e-07, "logits/chosen": -2.4248528480529785, "logits/rejected": -2.3365838527679443, "logps/chosen": -1.066901445388794, "logps/rejected": -1.2903188467025757, "loss": 1.684, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.133802890777588, "rewards/margins": 0.44683510065078735, "rewards/rejected": -2.5806376934051514, "step": 2140 }, { "epoch": 0.5613713687516357, "grad_norm": 12.75, "learning_rate": 1.4410760358597564e-07, "logits/chosen": -2.2818288803100586, "logits/rejected": -2.234139919281006, "logps/chosen": -0.9492326974868774, "logps/rejected": -1.1566355228424072, "loss": 1.6445, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8984653949737549, "rewards/margins": 0.41480565071105957, "rewards/rejected": -2.3132710456848145, "step": 2145 }, { "epoch": 0.5626799267207537, "grad_norm": 9.125, "learning_rate": 1.4342285774518093e-07, "logits/chosen": -2.3956122398376465, "logits/rejected": -2.378326892852783, "logps/chosen": -1.084282636642456, "logps/rejected": -1.2314732074737549, "loss": 1.7145, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.168565273284912, "rewards/margins": 0.29438114166259766, "rewards/rejected": -2.4629464149475098, "step": 2150 }, { "epoch": 0.5639884846898717, "grad_norm": 13.375, "learning_rate": 1.4273824920247411e-07, "logits/chosen": -2.4850142002105713, "logits/rejected": -2.283632278442383, "logps/chosen": -0.9197729229927063, "logps/rejected": -1.2919418811798096, "loss": 1.39, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8395458459854126, "rewards/margins": 0.7443382740020752, "rewards/rejected": -2.583883762359619, "step": 2155 }, { "epoch": 0.5652970426589898, "grad_norm": 22.125, "learning_rate": 1.4205379224908385e-07, "logits/chosen": -2.5194287300109863, "logits/rejected": -2.447544813156128, "logps/chosen": -1.020878553390503, "logps/rejected": -1.2773926258087158, "loss": 1.6192, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.041757106781006, "rewards/margins": 0.5130282640457153, "rewards/rejected": -2.5547852516174316, "step": 2160 }, { "epoch": 0.5666056006281078, "grad_norm": 7.96875, "learning_rate": 1.4136950117307434e-07, "logits/chosen": -2.358635663986206, "logits/rejected": -2.2798428535461426, "logps/chosen": -1.1565183401107788, "logps/rejected": -1.4266161918640137, "loss": 1.5431, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.3130366802215576, "rewards/margins": 0.5401955842971802, "rewards/rejected": -2.8532323837280273, "step": 2165 }, { "epoch": 0.5679141585972258, "grad_norm": 9.375, "learning_rate": 1.406853902590472e-07, "logits/chosen": -2.3591835498809814, "logits/rejected": -2.213712215423584, "logps/chosen": -0.9630627632141113, "logps/rejected": -1.2146613597869873, "loss": 1.5621, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9261255264282227, "rewards/margins": 0.5031973123550415, "rewards/rejected": -2.4293227195739746, "step": 2170 }, { "epoch": 0.5692227165663439, "grad_norm": 22.375, "learning_rate": 1.4000147378784303e-07, "logits/chosen": -2.418562412261963, "logits/rejected": -2.3193328380584717, "logps/chosen": -1.0533123016357422, "logps/rejected": -1.3847312927246094, "loss": 1.5011, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1066246032714844, "rewards/margins": 0.6628381013870239, "rewards/rejected": -2.7694625854492188, "step": 2175 }, { "epoch": 0.5705312745354619, "grad_norm": 50.25, "learning_rate": 1.3931776603624343e-07, "logits/chosen": -2.4607605934143066, "logits/rejected": -2.353538751602173, "logps/chosen": -0.9523341059684753, "logps/rejected": -1.2862637042999268, "loss": 1.5114, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9046682119369507, "rewards/margins": 0.6678589582443237, "rewards/rejected": -2.5725274085998535, "step": 2180 }, { "epoch": 0.5718398325045799, "grad_norm": 16.25, "learning_rate": 1.3863428127667305e-07, "logits/chosen": -2.4386112689971924, "logits/rejected": -2.3355937004089355, "logps/chosen": -0.9184026718139648, "logps/rejected": -1.3023720979690552, "loss": 1.4466, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8368053436279297, "rewards/margins": 0.7679387331008911, "rewards/rejected": -2.6047441959381104, "step": 2185 }, { "epoch": 0.573148390473698, "grad_norm": 32.75, "learning_rate": 1.379510337769015e-07, "logits/chosen": -2.462615966796875, "logits/rejected": -2.2719521522521973, "logps/chosen": -0.9152493476867676, "logps/rejected": -1.3336483240127563, "loss": 1.3508, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8304986953735352, "rewards/margins": 0.8367980718612671, "rewards/rejected": -2.6672966480255127, "step": 2190 }, { "epoch": 0.574456948442816, "grad_norm": 33.75, "learning_rate": 1.3726803779974565e-07, "logits/chosen": -2.4071669578552246, "logits/rejected": -2.308692455291748, "logps/chosen": -0.9648585319519043, "logps/rejected": -1.1965194940567017, "loss": 1.5581, "rewards/accuracies": 0.625, "rewards/chosen": -1.9297170639038086, "rewards/margins": 0.4633216857910156, "rewards/rejected": -2.3930389881134033, "step": 2195 }, { "epoch": 0.575765506411934, "grad_norm": 16.75, "learning_rate": 1.3658530760277178e-07, "logits/chosen": -2.376499652862549, "logits/rejected": -2.3030123710632324, "logps/chosen": -0.9101018905639648, "logps/rejected": -1.2885754108428955, "loss": 1.4456, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8202037811279297, "rewards/margins": 0.756946861743927, "rewards/rejected": -2.577150821685791, "step": 2200 }, { "epoch": 0.5770740643810521, "grad_norm": 8.75, "learning_rate": 1.3590285743799794e-07, "logits/chosen": -2.455573320388794, "logits/rejected": -2.3377621173858643, "logps/chosen": -1.0217607021331787, "logps/rejected": -1.4241609573364258, "loss": 1.3797, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0435214042663574, "rewards/margins": 0.8048006892204285, "rewards/rejected": -2.8483219146728516, "step": 2205 }, { "epoch": 0.5783826223501701, "grad_norm": 10.75, "learning_rate": 1.352207015515967e-07, "logits/chosen": -2.370897054672241, "logits/rejected": -2.3605599403381348, "logps/chosen": -1.0407721996307373, "logps/rejected": -1.2561496496200562, "loss": 1.6084, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0815443992614746, "rewards/margins": 0.4307548999786377, "rewards/rejected": -2.5122992992401123, "step": 2210 }, { "epoch": 0.5796911803192881, "grad_norm": 15.1875, "learning_rate": 1.3453885418359734e-07, "logits/chosen": -2.3532214164733887, "logits/rejected": -2.2795262336730957, "logps/chosen": -0.9503658413887024, "logps/rejected": -1.1492555141448975, "loss": 1.6187, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9007316827774048, "rewards/margins": 0.3977794349193573, "rewards/rejected": -2.298511028289795, "step": 2215 }, { "epoch": 0.5809997382884062, "grad_norm": 14.125, "learning_rate": 1.3385732956758895e-07, "logits/chosen": -2.444422483444214, "logits/rejected": -2.2096850872039795, "logps/chosen": -1.052429437637329, "logps/rejected": -1.243295669555664, "loss": 1.7169, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.104858875274658, "rewards/margins": 0.38173240423202515, "rewards/rejected": -2.486591339111328, "step": 2220 }, { "epoch": 0.5823082962575242, "grad_norm": 12.375, "learning_rate": 1.3317614193042306e-07, "logits/chosen": -2.3654439449310303, "logits/rejected": -2.256114959716797, "logps/chosen": -0.9818763732910156, "logps/rejected": -1.2483128309249878, "loss": 1.5756, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9637527465820312, "rewards/margins": 0.5328729748725891, "rewards/rejected": -2.4966256618499756, "step": 2225 }, { "epoch": 0.5836168542266422, "grad_norm": 11.5625, "learning_rate": 1.324953054919168e-07, "logits/chosen": -2.410255193710327, "logits/rejected": -2.3750529289245605, "logps/chosen": -0.9597309231758118, "logps/rejected": -1.124402642250061, "loss": 1.6631, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9194618463516235, "rewards/margins": 0.32934319972991943, "rewards/rejected": -2.248805284500122, "step": 2230 }, { "epoch": 0.5849254121957602, "grad_norm": 16.5, "learning_rate": 1.3181483446455604e-07, "logits/chosen": -2.4090638160705566, "logits/rejected": -2.3251593112945557, "logps/chosen": -1.020389199256897, "logps/rejected": -1.2284648418426514, "loss": 1.584, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.040778398513794, "rewards/margins": 0.41615137457847595, "rewards/rejected": -2.4569296836853027, "step": 2235 }, { "epoch": 0.5862339701648783, "grad_norm": 10.8125, "learning_rate": 1.311347430531986e-07, "logits/chosen": -2.4416494369506836, "logits/rejected": -2.4061760902404785, "logps/chosen": -0.9800472259521484, "logps/rejected": -1.3442299365997314, "loss": 1.587, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9600944519042969, "rewards/margins": 0.7283655405044556, "rewards/rejected": -2.688459873199463, "step": 2240 }, { "epoch": 0.5875425281339963, "grad_norm": 7.375, "learning_rate": 1.3045504545477788e-07, "logits/chosen": -2.4397130012512207, "logits/rejected": -2.338029623031616, "logps/chosen": -1.0340012311935425, "logps/rejected": -1.2671782970428467, "loss": 1.5712, "rewards/accuracies": 0.6875, "rewards/chosen": -2.068002462387085, "rewards/margins": 0.46635428071022034, "rewards/rejected": -2.5343565940856934, "step": 2245 }, { "epoch": 0.5888510861031143, "grad_norm": 16.625, "learning_rate": 1.297757558580064e-07, "logits/chosen": -2.3184800148010254, "logits/rejected": -2.301931142807007, "logps/chosen": -1.094741702079773, "logps/rejected": -1.2735108137130737, "loss": 1.6867, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.189483404159546, "rewards/margins": 0.35753828287124634, "rewards/rejected": -2.5470216274261475, "step": 2250 }, { "epoch": 0.5901596440722324, "grad_norm": 11.0, "learning_rate": 1.2909688844307965e-07, "logits/chosen": -2.4513399600982666, "logits/rejected": -2.251115560531616, "logps/chosen": -0.9842875599861145, "logps/rejected": -1.3181707859039307, "loss": 1.5251, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.968575119972229, "rewards/margins": 0.667766273021698, "rewards/rejected": -2.6363415718078613, "step": 2255 }, { "epoch": 0.5914682020413504, "grad_norm": 27.5, "learning_rate": 1.2841845738138002e-07, "logits/chosen": -2.5210824012756348, "logits/rejected": -2.4076409339904785, "logps/chosen": -0.9450047612190247, "logps/rejected": -1.131059169769287, "loss": 1.6669, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8900095224380493, "rewards/margins": 0.37210893630981445, "rewards/rejected": -2.262118339538574, "step": 2260 }, { "epoch": 0.5927767600104684, "grad_norm": 12.0625, "learning_rate": 1.2774047683518098e-07, "logits/chosen": -2.5099644660949707, "logits/rejected": -2.3856239318847656, "logps/chosen": -1.0717213153839111, "logps/rejected": -1.2978155612945557, "loss": 1.6418, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1434426307678223, "rewards/margins": 0.4521881639957428, "rewards/rejected": -2.5956311225891113, "step": 2265 }, { "epoch": 0.5940853179795865, "grad_norm": 17.25, "learning_rate": 1.2706296095735157e-07, "logits/chosen": -2.334601879119873, "logits/rejected": -2.2201592922210693, "logps/chosen": -1.0710841417312622, "logps/rejected": -1.292859673500061, "loss": 1.6173, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1421682834625244, "rewards/margins": 0.44355130195617676, "rewards/rejected": -2.585719347000122, "step": 2270 }, { "epoch": 0.5953938759487045, "grad_norm": 37.25, "learning_rate": 1.2638592389106078e-07, "logits/chosen": -2.427112102508545, "logits/rejected": -2.293665647506714, "logps/chosen": -1.097879409790039, "logps/rejected": -1.375378966331482, "loss": 1.6306, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.195758819580078, "rewards/margins": 0.5549994111061096, "rewards/rejected": -2.750757932662964, "step": 2275 }, { "epoch": 0.5967024339178225, "grad_norm": 9.5625, "learning_rate": 1.257093797694824e-07, "logits/chosen": -2.444255828857422, "logits/rejected": -2.3432908058166504, "logps/chosen": -1.0374839305877686, "logps/rejected": -1.252784013748169, "loss": 1.5624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.074967861175537, "rewards/margins": 0.43060046434402466, "rewards/rejected": -2.505568027496338, "step": 2280 }, { "epoch": 0.5980109918869406, "grad_norm": 25.75, "learning_rate": 1.250333427155e-07, "logits/chosen": -2.249427556991577, "logits/rejected": -2.1228158473968506, "logps/chosen": -1.0070369243621826, "logps/rejected": -1.3975975513458252, "loss": 1.4214, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0140738487243652, "rewards/margins": 0.7811210751533508, "rewards/rejected": -2.7951951026916504, "step": 2285 }, { "epoch": 0.5993195498560586, "grad_norm": 27.375, "learning_rate": 1.2435782684141213e-07, "logits/chosen": -2.4257540702819824, "logits/rejected": -2.3055472373962402, "logps/chosen": -1.0568474531173706, "logps/rejected": -1.2457268238067627, "loss": 1.726, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.113694906234741, "rewards/margins": 0.37775856256484985, "rewards/rejected": -2.4914536476135254, "step": 2290 }, { "epoch": 0.6006281078251766, "grad_norm": 9.9375, "learning_rate": 1.2368284624863766e-07, "logits/chosen": -2.3483424186706543, "logits/rejected": -2.2424049377441406, "logps/chosen": -1.031960368156433, "logps/rejected": -1.367994785308838, "loss": 1.4756, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.063920736312866, "rewards/margins": 0.6720688343048096, "rewards/rejected": -2.735989570617676, "step": 2295 }, { "epoch": 0.6019366657942947, "grad_norm": 29.75, "learning_rate": 1.2300841502742138e-07, "logits/chosen": -2.3382885456085205, "logits/rejected": -2.2607500553131104, "logps/chosen": -0.9774505496025085, "logps/rejected": -1.2374660968780518, "loss": 1.5834, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.954901099205017, "rewards/margins": 0.520031213760376, "rewards/rejected": -2.4749321937561035, "step": 2300 }, { "epoch": 0.6032452237634127, "grad_norm": 16.125, "learning_rate": 1.2233454725654004e-07, "logits/chosen": -2.4661812782287598, "logits/rejected": -2.369544506072998, "logps/chosen": -1.0134741067886353, "logps/rejected": -1.2037636041641235, "loss": 1.6199, "rewards/accuracies": 0.625, "rewards/chosen": -2.0269482135772705, "rewards/margins": 0.38057881593704224, "rewards/rejected": -2.407527208328247, "step": 2305 }, { "epoch": 0.6045537817325307, "grad_norm": 8.75, "learning_rate": 1.2166125700300824e-07, "logits/chosen": -2.458974599838257, "logits/rejected": -2.3594462871551514, "logps/chosen": -0.9509730339050293, "logps/rejected": -1.1717463731765747, "loss": 1.6195, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9019460678100586, "rewards/margins": 0.44154685735702515, "rewards/rejected": -2.3434927463531494, "step": 2310 }, { "epoch": 0.6058623397016488, "grad_norm": 5.625, "learning_rate": 1.20988558321785e-07, "logits/chosen": -2.444587230682373, "logits/rejected": -2.3330531120300293, "logps/chosen": -0.9590595960617065, "logps/rejected": -1.1642197370529175, "loss": 1.5665, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.918119192123413, "rewards/margins": 0.4103202223777771, "rewards/rejected": -2.328439474105835, "step": 2315 }, { "epoch": 0.6071708976707668, "grad_norm": 10.25, "learning_rate": 1.203164652554801e-07, "logits/chosen": -2.3558459281921387, "logits/rejected": -2.223944902420044, "logps/chosen": -1.064517855644226, "logps/rejected": -1.2413067817687988, "loss": 1.701, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.129035711288452, "rewards/margins": 0.3535779118537903, "rewards/rejected": -2.4826135635375977, "step": 2320 }, { "epoch": 0.6084794556398848, "grad_norm": 9.1875, "learning_rate": 1.1964499183406112e-07, "logits/chosen": -2.46921968460083, "logits/rejected": -2.4811508655548096, "logps/chosen": -0.973553478717804, "logps/rejected": -1.2893043756484985, "loss": 1.4647, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.947106957435608, "rewards/margins": 0.6315014958381653, "rewards/rejected": -2.578608751296997, "step": 2325 }, { "epoch": 0.6097880136090029, "grad_norm": 13.9375, "learning_rate": 1.1897415207456074e-07, "logits/chosen": -2.467343807220459, "logits/rejected": -2.3478524684906006, "logps/chosen": -0.9138134121894836, "logps/rejected": -1.2126872539520264, "loss": 1.4269, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8276268243789673, "rewards/margins": 0.5977475643157959, "rewards/rejected": -2.4253745079040527, "step": 2330 }, { "epoch": 0.6110965715781209, "grad_norm": 18.625, "learning_rate": 1.1830395998078368e-07, "logits/chosen": -2.3486573696136475, "logits/rejected": -2.3269922733306885, "logps/chosen": -1.0603501796722412, "logps/rejected": -1.1257364749908447, "loss": 1.8071, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1207003593444824, "rewards/margins": 0.13077223300933838, "rewards/rejected": -2.2514729499816895, "step": 2335 }, { "epoch": 0.6124051295472389, "grad_norm": 20.875, "learning_rate": 1.1763442954301467e-07, "logits/chosen": -2.3642868995666504, "logits/rejected": -2.23626708984375, "logps/chosen": -1.131478190422058, "logps/rejected": -1.424948811531067, "loss": 1.6422, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.262956380844116, "rewards/margins": 0.5869408845901489, "rewards/rejected": -2.849897623062134, "step": 2340 }, { "epoch": 0.613713687516357, "grad_norm": 27.5, "learning_rate": 1.1696557473772638e-07, "logits/chosen": -2.4084057807922363, "logits/rejected": -2.3532609939575195, "logps/chosen": -1.1158283948898315, "logps/rejected": -1.235832691192627, "loss": 1.7129, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.231656789779663, "rewards/margins": 0.24000878632068634, "rewards/rejected": -2.471665382385254, "step": 2345 }, { "epoch": 0.615022245485475, "grad_norm": 15.625, "learning_rate": 1.1629740952728763e-07, "logits/chosen": -2.3828654289245605, "logits/rejected": -2.314852714538574, "logps/chosen": -1.0044595003128052, "logps/rejected": -1.2730025053024292, "loss": 1.5933, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0089190006256104, "rewards/margins": 0.5370863080024719, "rewards/rejected": -2.5460050106048584, "step": 2350 }, { "epoch": 0.616330803454593, "grad_norm": 28.25, "learning_rate": 1.156299478596719e-07, "logits/chosen": -2.3985235691070557, "logits/rejected": -2.302316188812256, "logps/chosen": -0.991632342338562, "logps/rejected": -1.1910475492477417, "loss": 1.5824, "rewards/accuracies": 0.625, "rewards/chosen": -1.983264684677124, "rewards/margins": 0.39883023500442505, "rewards/rejected": -2.3820950984954834, "step": 2355 }, { "epoch": 0.6176393614237111, "grad_norm": 16.875, "learning_rate": 1.149632036681662e-07, "logits/chosen": -2.3888607025146484, "logits/rejected": -2.272970199584961, "logps/chosen": -1.0303930044174194, "logps/rejected": -1.3528447151184082, "loss": 1.5505, "rewards/accuracies": 0.625, "rewards/chosen": -2.060786008834839, "rewards/margins": 0.6449034810066223, "rewards/rejected": -2.7056894302368164, "step": 2360 }, { "epoch": 0.6189479193928291, "grad_norm": 15.25, "learning_rate": 1.1429719087108016e-07, "logits/chosen": -2.393451452255249, "logits/rejected": -2.323495626449585, "logps/chosen": -0.9895676374435425, "logps/rejected": -1.3186137676239014, "loss": 1.5498, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.979135274887085, "rewards/margins": 0.6580924987792969, "rewards/rejected": -2.6372275352478027, "step": 2365 }, { "epoch": 0.6202564773619471, "grad_norm": 7.25, "learning_rate": 1.1363192337145561e-07, "logits/chosen": -2.440089464187622, "logits/rejected": -2.294459104537964, "logps/chosen": -1.059817910194397, "logps/rejected": -1.2491494417190552, "loss": 1.619, "rewards/accuracies": 0.625, "rewards/chosen": -2.119635820388794, "rewards/margins": 0.3786631226539612, "rewards/rejected": -2.4982988834381104, "step": 2370 }, { "epoch": 0.6215650353310652, "grad_norm": 15.375, "learning_rate": 1.1296741505677619e-07, "logits/chosen": -2.4676430225372314, "logits/rejected": -2.4331467151641846, "logps/chosen": -1.0720535516738892, "logps/rejected": -1.2827204465866089, "loss": 1.582, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1441071033477783, "rewards/margins": 0.4213338792324066, "rewards/rejected": -2.5654408931732178, "step": 2375 }, { "epoch": 0.6228735933001832, "grad_norm": 23.0, "learning_rate": 1.1230367979867758e-07, "logits/chosen": -2.327519655227661, "logits/rejected": -2.2377219200134277, "logps/chosen": -1.116839051246643, "logps/rejected": -1.3652244806289673, "loss": 1.5682, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.233678102493286, "rewards/margins": 0.4967709481716156, "rewards/rejected": -2.7304489612579346, "step": 2380 }, { "epoch": 0.6241821512693012, "grad_norm": 13.75, "learning_rate": 1.1164073145265784e-07, "logits/chosen": -2.4089648723602295, "logits/rejected": -2.2925963401794434, "logps/chosen": -1.0654513835906982, "logps/rejected": -1.33376944065094, "loss": 1.5445, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1309027671813965, "rewards/margins": 0.5366358757019043, "rewards/rejected": -2.66753888130188, "step": 2385 }, { "epoch": 0.6254907092384192, "grad_norm": 14.75, "learning_rate": 1.1097858385778816e-07, "logits/chosen": -2.3695247173309326, "logits/rejected": -2.2461090087890625, "logps/chosen": -1.0226638317108154, "logps/rejected": -1.3561975955963135, "loss": 1.5412, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.045327663421631, "rewards/margins": 0.66706782579422, "rewards/rejected": -2.712395191192627, "step": 2390 }, { "epoch": 0.6267992672075373, "grad_norm": 15.125, "learning_rate": 1.1031725083642418e-07, "logits/chosen": -2.4139561653137207, "logits/rejected": -2.2923741340637207, "logps/chosen": -1.0330827236175537, "logps/rejected": -1.212317705154419, "loss": 1.6757, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0661654472351074, "rewards/margins": 0.35846978425979614, "rewards/rejected": -2.424635410308838, "step": 2395 }, { "epoch": 0.6281078251766553, "grad_norm": 18.25, "learning_rate": 1.0965674619391713e-07, "logits/chosen": -2.3495278358459473, "logits/rejected": -2.225348711013794, "logps/chosen": -1.0518320798873901, "logps/rejected": -1.2750811576843262, "loss": 1.5835, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1036641597747803, "rewards/margins": 0.44649791717529297, "rewards/rejected": -2.5501623153686523, "step": 2400 }, { "epoch": 0.6281078251766553, "eval_logits/chosen": -2.238737106323242, "eval_logits/rejected": -2.1359598636627197, "eval_logps/chosen": -1.0393366813659668, "eval_logps/rejected": -1.3077890872955322, "eval_loss": 1.558164358139038, "eval_rewards/accuracies": 0.6215000152587891, "eval_rewards/chosen": -2.0786733627319336, "eval_rewards/margins": 0.5369049310684204, "eval_rewards/rejected": -2.6155781745910645, "eval_runtime": 424.098, "eval_samples_per_second": 4.716, "eval_steps_per_second": 1.179, "step": 2400 }, { "epoch": 0.6294163831457733, "grad_norm": 11.0, "learning_rate": 1.0899708371832583e-07, "logits/chosen": -2.4730851650238037, "logits/rejected": -2.2875025272369385, "logps/chosen": -0.8949782252311707, "logps/rejected": -1.3533945083618164, "loss": 1.3232, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7899564504623413, "rewards/margins": 0.916832447052002, "rewards/rejected": -2.706789016723633, "step": 2405 }, { "epoch": 0.6307249411148914, "grad_norm": 35.75, "learning_rate": 1.0833827718012894e-07, "logits/chosen": -2.448568344116211, "logits/rejected": -2.3400423526763916, "logps/chosen": -1.0112457275390625, "logps/rejected": -1.207571268081665, "loss": 1.6083, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.022491455078125, "rewards/margins": 0.3926510214805603, "rewards/rejected": -2.41514253616333, "step": 2410 }, { "epoch": 0.6320334990840094, "grad_norm": 28.75, "learning_rate": 1.076803403319373e-07, "logits/chosen": -2.2750802040100098, "logits/rejected": -2.347308397293091, "logps/chosen": -1.0063066482543945, "logps/rejected": -1.0792009830474854, "loss": 1.7544, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.012613296508789, "rewards/margins": 0.14578866958618164, "rewards/rejected": -2.1584019660949707, "step": 2415 }, { "epoch": 0.6333420570531274, "grad_norm": 7.375, "learning_rate": 1.0702328690820691e-07, "logits/chosen": -2.4139416217803955, "logits/rejected": -2.409759521484375, "logps/chosen": -1.0117336511611938, "logps/rejected": -1.2717506885528564, "loss": 1.5935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0234673023223877, "rewards/margins": 0.5200342535972595, "rewards/rejected": -2.543501377105713, "step": 2420 }, { "epoch": 0.6346506150222455, "grad_norm": 7.03125, "learning_rate": 1.0636713062495234e-07, "logits/chosen": -2.370800495147705, "logits/rejected": -2.2154176235198975, "logps/chosen": -0.8887961506843567, "logps/rejected": -1.2706650495529175, "loss": 1.3894, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7775923013687134, "rewards/margins": 0.7637379765510559, "rewards/rejected": -2.541330099105835, "step": 2425 }, { "epoch": 0.6359591729913635, "grad_norm": 7.15625, "learning_rate": 1.0571188517946024e-07, "logits/chosen": -2.495107650756836, "logits/rejected": -2.377840280532837, "logps/chosen": -1.0215175151824951, "logps/rejected": -1.235435128211975, "loss": 1.6314, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0430350303649902, "rewards/margins": 0.4278346002101898, "rewards/rejected": -2.47087025642395, "step": 2430 }, { "epoch": 0.6372677309604815, "grad_norm": 17.0, "learning_rate": 1.0505756425000358e-07, "logits/chosen": -2.2844350337982178, "logits/rejected": -2.198392868041992, "logps/chosen": -1.0967059135437012, "logps/rejected": -1.3849542140960693, "loss": 1.524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1934118270874023, "rewards/margins": 0.576496958732605, "rewards/rejected": -2.7699084281921387, "step": 2435 }, { "epoch": 0.6385762889295996, "grad_norm": 17.375, "learning_rate": 1.0440418149555597e-07, "logits/chosen": -2.365701675415039, "logits/rejected": -2.350088119506836, "logps/chosen": -1.0826886892318726, "logps/rejected": -1.241385817527771, "loss": 1.685, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.165377378463745, "rewards/margins": 0.31739428639411926, "rewards/rejected": -2.482771635055542, "step": 2440 }, { "epoch": 0.6398848468987176, "grad_norm": 10.875, "learning_rate": 1.0375175055550646e-07, "logits/chosen": -2.4226748943328857, "logits/rejected": -2.299659013748169, "logps/chosen": -1.0714317560195923, "logps/rejected": -1.2443063259124756, "loss": 1.6545, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1428635120391846, "rewards/margins": 0.3457491099834442, "rewards/rejected": -2.488612651824951, "step": 2445 }, { "epoch": 0.6411934048678356, "grad_norm": 10.75, "learning_rate": 1.0310028504937524e-07, "logits/chosen": -2.4782421588897705, "logits/rejected": -2.352353096008301, "logps/chosen": -0.99134361743927, "logps/rejected": -1.188676118850708, "loss": 1.6585, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.98268723487854, "rewards/margins": 0.3946649432182312, "rewards/rejected": -2.377352237701416, "step": 2450 }, { "epoch": 0.6425019628369537, "grad_norm": 40.25, "learning_rate": 1.0244979857652877e-07, "logits/chosen": -2.413424015045166, "logits/rejected": -2.1864733695983887, "logps/chosen": -1.046458125114441, "logps/rejected": -1.337463140487671, "loss": 1.525, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.092916250228882, "rewards/margins": 0.5820099115371704, "rewards/rejected": -2.674926280975342, "step": 2455 }, { "epoch": 0.6438105208060717, "grad_norm": 27.125, "learning_rate": 1.0180030471589622e-07, "logits/chosen": -2.4341421127319336, "logits/rejected": -2.285735607147217, "logps/chosen": -1.0777932405471802, "logps/rejected": -1.3975156545639038, "loss": 1.5193, "rewards/accuracies": 0.625, "rewards/chosen": -2.1555864810943604, "rewards/margins": 0.6394448280334473, "rewards/rejected": -2.7950313091278076, "step": 2460 }, { "epoch": 0.6451190787751897, "grad_norm": 7.65625, "learning_rate": 1.01151817025686e-07, "logits/chosen": -2.405449390411377, "logits/rejected": -2.309771776199341, "logps/chosen": -1.020456075668335, "logps/rejected": -1.3103121519088745, "loss": 1.5739, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.04091215133667, "rewards/margins": 0.5797122120857239, "rewards/rejected": -2.620624303817749, "step": 2465 }, { "epoch": 0.6464276367443078, "grad_norm": 8.9375, "learning_rate": 1.005043490431026e-07, "logits/chosen": -2.4870095252990723, "logits/rejected": -2.306039333343506, "logps/chosen": -0.9499581456184387, "logps/rejected": -1.31827712059021, "loss": 1.3831, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8999162912368774, "rewards/margins": 0.7366382479667664, "rewards/rejected": -2.63655424118042, "step": 2470 }, { "epoch": 0.6477361947134258, "grad_norm": 12.4375, "learning_rate": 9.985791428406413e-08, "logits/chosen": -2.3410439491271973, "logits/rejected": -2.2568299770355225, "logps/chosen": -0.934490978717804, "logps/rejected": -1.1321436166763306, "loss": 1.565, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.868981957435608, "rewards/margins": 0.3953050971031189, "rewards/rejected": -2.264287233352661, "step": 2475 }, { "epoch": 0.6490447526825438, "grad_norm": 27.0, "learning_rate": 9.92125262429201e-08, "logits/chosen": -2.38790225982666, "logits/rejected": -2.3221728801727295, "logps/chosen": -1.0000097751617432, "logps/rejected": -1.2264660596847534, "loss": 1.6351, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0000195503234863, "rewards/margins": 0.45291250944137573, "rewards/rejected": -2.452932119369507, "step": 2480 }, { "epoch": 0.6503533106516619, "grad_norm": 13.8125, "learning_rate": 9.85681983921697e-08, "logits/chosen": -2.3161489963531494, "logits/rejected": -2.308422088623047, "logps/chosen": -1.04147469997406, "logps/rejected": -1.2405277490615845, "loss": 1.6565, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.08294939994812, "rewards/margins": 0.3981059193611145, "rewards/rejected": -2.481055498123169, "step": 2485 }, { "epoch": 0.6516618686207799, "grad_norm": 10.75, "learning_rate": 9.792494418218074e-08, "logits/chosen": -2.41933536529541, "logits/rejected": -2.153663158416748, "logps/chosen": -0.9374859929084778, "logps/rejected": -1.3958766460418701, "loss": 1.3742, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8749719858169556, "rewards/margins": 0.9167814254760742, "rewards/rejected": -2.7917532920837402, "step": 2490 }, { "epoch": 0.6529704265898979, "grad_norm": 26.75, "learning_rate": 9.72827770409086e-08, "logits/chosen": -2.4259328842163086, "logits/rejected": -2.296947956085205, "logps/chosen": -0.9496080279350281, "logps/rejected": -1.3367187976837158, "loss": 1.4808, "rewards/accuracies": 0.625, "rewards/chosen": -1.8992160558700562, "rewards/margins": 0.7742214798927307, "rewards/rejected": -2.6734375953674316, "step": 2495 }, { "epoch": 0.654278984559016, "grad_norm": 10.625, "learning_rate": 9.664171037361614e-08, "logits/chosen": -2.389009475708008, "logits/rejected": -2.300400972366333, "logps/chosen": -1.0098192691802979, "logps/rejected": -1.3717113733291626, "loss": 1.4113, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0196385383605957, "rewards/margins": 0.7237839698791504, "rewards/rejected": -2.743422746658325, "step": 2500 }, { "epoch": 0.655587542528134, "grad_norm": 11.625, "learning_rate": 9.60017575625937e-08, "logits/chosen": -2.4407551288604736, "logits/rejected": -2.289405345916748, "logps/chosen": -0.9913199543952942, "logps/rejected": -1.3130135536193848, "loss": 1.5243, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9826399087905884, "rewards/margins": 0.6433871984481812, "rewards/rejected": -2.6260271072387695, "step": 2505 }, { "epoch": 0.656896100497252, "grad_norm": 25.25, "learning_rate": 9.536293196687996e-08, "logits/chosen": -2.4609408378601074, "logits/rejected": -2.3467025756835938, "logps/chosen": -1.0883733034133911, "logps/rejected": -1.2040952444076538, "loss": 1.7505, "rewards/accuracies": 0.5625, "rewards/chosen": -2.1767466068267822, "rewards/margins": 0.23144373297691345, "rewards/rejected": -2.4081904888153076, "step": 2510 }, { "epoch": 0.6582046584663701, "grad_norm": 10.25, "learning_rate": 9.472524692198282e-08, "logits/chosen": -2.444101333618164, "logits/rejected": -2.2595982551574707, "logps/chosen": -1.0007539987564087, "logps/rejected": -1.375180721282959, "loss": 1.4645, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0015079975128174, "rewards/margins": 0.7488536834716797, "rewards/rejected": -2.750361442565918, "step": 2515 }, { "epoch": 0.6595132164354881, "grad_norm": 16.875, "learning_rate": 9.408871573960119e-08, "logits/chosen": -2.3967909812927246, "logits/rejected": -2.4123709201812744, "logps/chosen": -1.0066630840301514, "logps/rejected": -1.2854671478271484, "loss": 1.664, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0133261680603027, "rewards/margins": 0.557607889175415, "rewards/rejected": -2.570934295654297, "step": 2520 }, { "epoch": 0.6608217744046061, "grad_norm": 27.375, "learning_rate": 9.345335170734702e-08, "logits/chosen": -2.519296646118164, "logits/rejected": -2.3289730548858643, "logps/chosen": -1.0237505435943604, "logps/rejected": -1.276888132095337, "loss": 1.4874, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.0475010871887207, "rewards/margins": 0.5062751770019531, "rewards/rejected": -2.553776264190674, "step": 2525 }, { "epoch": 0.6621303323737242, "grad_norm": 14.875, "learning_rate": 9.281916808846807e-08, "logits/chosen": -2.3988661766052246, "logits/rejected": -2.3589653968811035, "logps/chosen": -1.1221929788589478, "logps/rejected": -1.2874782085418701, "loss": 1.666, "rewards/accuracies": 0.5625, "rewards/chosen": -2.2443859577178955, "rewards/margins": 0.3305709958076477, "rewards/rejected": -2.5749564170837402, "step": 2530 }, { "epoch": 0.6634388903428422, "grad_norm": 21.875, "learning_rate": 9.21861781215708e-08, "logits/chosen": -2.4502480030059814, "logits/rejected": -2.3683876991271973, "logps/chosen": -1.0562758445739746, "logps/rejected": -1.1572154760360718, "loss": 1.7615, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.112551689147949, "rewards/margins": 0.2018791139125824, "rewards/rejected": -2.3144309520721436, "step": 2535 }, { "epoch": 0.6647474483119602, "grad_norm": 24.5, "learning_rate": 9.15543950203442e-08, "logits/chosen": -2.3372139930725098, "logits/rejected": -2.2904155254364014, "logps/chosen": -0.9301155209541321, "logps/rejected": -1.1528995037078857, "loss": 1.5945, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8602310419082642, "rewards/margins": 0.4455679953098297, "rewards/rejected": -2.3057990074157715, "step": 2540 }, { "epoch": 0.6660560062810783, "grad_norm": 21.875, "learning_rate": 9.092383197328387e-08, "logits/chosen": -2.387012481689453, "logits/rejected": -2.2569687366485596, "logps/chosen": -1.0622999668121338, "logps/rejected": -1.534104585647583, "loss": 1.475, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1245999336242676, "rewards/margins": 0.9436086416244507, "rewards/rejected": -3.068209171295166, "step": 2545 }, { "epoch": 0.6673645642501963, "grad_norm": 8.125, "learning_rate": 9.029450214341672e-08, "logits/chosen": -2.4298737049102783, "logits/rejected": -2.2125282287597656, "logps/chosen": -0.9660283327102661, "logps/rejected": -1.3082364797592163, "loss": 1.4678, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9320566654205322, "rewards/margins": 0.6844165921211243, "rewards/rejected": -2.6164729595184326, "step": 2550 }, { "epoch": 0.6686731222193143, "grad_norm": 10.75, "learning_rate": 8.96664186680263e-08, "logits/chosen": -2.37648868560791, "logits/rejected": -2.328756093978882, "logps/chosen": -0.9809591174125671, "logps/rejected": -1.236372709274292, "loss": 1.5358, "rewards/accuracies": 0.625, "rewards/chosen": -1.9619182348251343, "rewards/margins": 0.5108271837234497, "rewards/rejected": -2.472745418548584, "step": 2555 }, { "epoch": 0.6699816801884323, "grad_norm": 13.875, "learning_rate": 8.903959465837833e-08, "logits/chosen": -2.5016849040985107, "logits/rejected": -2.365461587905884, "logps/chosen": -1.025858759880066, "logps/rejected": -1.2428934574127197, "loss": 1.6005, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.051717519760132, "rewards/margins": 0.4340694844722748, "rewards/rejected": -2.4857869148254395, "step": 2560 }, { "epoch": 0.6712902381575504, "grad_norm": 12.625, "learning_rate": 8.841404319944718e-08, "logits/chosen": -2.3855907917022705, "logits/rejected": -2.2605817317962646, "logps/chosen": -1.0556578636169434, "logps/rejected": -1.18600332736969, "loss": 1.6905, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1113157272338867, "rewards/margins": 0.26069125533103943, "rewards/rejected": -2.37200665473938, "step": 2565 }, { "epoch": 0.6725987961266684, "grad_norm": 10.25, "learning_rate": 8.77897773496428e-08, "logits/chosen": -2.4312407970428467, "logits/rejected": -2.2516613006591797, "logps/chosen": -1.0611122846603394, "logps/rejected": -1.3791961669921875, "loss": 1.533, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1222245693206787, "rewards/margins": 0.6361676454544067, "rewards/rejected": -2.758392333984375, "step": 2570 }, { "epoch": 0.6739073540957864, "grad_norm": 6.3125, "learning_rate": 8.716681014053788e-08, "logits/chosen": -2.4676003456115723, "logits/rejected": -2.4122684001922607, "logps/chosen": -1.0107800960540771, "logps/rejected": -1.2653642892837524, "loss": 1.5907, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0215601921081543, "rewards/margins": 0.5091684460639954, "rewards/rejected": -2.530728578567505, "step": 2575 }, { "epoch": 0.6752159120649045, "grad_norm": 9.875, "learning_rate": 8.654515457659594e-08, "logits/chosen": -2.5059523582458496, "logits/rejected": -2.31658935546875, "logps/chosen": -1.0749976634979248, "logps/rejected": -1.2823216915130615, "loss": 1.6074, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1499953269958496, "rewards/margins": 0.4146478772163391, "rewards/rejected": -2.564643383026123, "step": 2580 }, { "epoch": 0.6765244700340225, "grad_norm": 27.375, "learning_rate": 8.59248236348998e-08, "logits/chosen": -2.356903076171875, "logits/rejected": -2.305155038833618, "logps/chosen": -0.9397870898246765, "logps/rejected": -1.2169208526611328, "loss": 1.5927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.879574179649353, "rewards/margins": 0.5542675256729126, "rewards/rejected": -2.4338417053222656, "step": 2585 }, { "epoch": 0.6778330280031405, "grad_norm": 19.125, "learning_rate": 8.530583026488094e-08, "logits/chosen": -2.3364341259002686, "logits/rejected": -2.186464548110962, "logps/chosen": -1.0228286981582642, "logps/rejected": -1.3034131526947021, "loss": 1.5562, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0456573963165283, "rewards/margins": 0.5611690282821655, "rewards/rejected": -2.6068263053894043, "step": 2590 }, { "epoch": 0.6791415859722586, "grad_norm": 17.75, "learning_rate": 8.468818738804876e-08, "logits/chosen": -2.3964295387268066, "logits/rejected": -2.385544776916504, "logps/chosen": -0.9999616742134094, "logps/rejected": -1.0892765522003174, "loss": 1.7652, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9999233484268188, "rewards/margins": 0.1786295622587204, "rewards/rejected": -2.1785531044006348, "step": 2595 }, { "epoch": 0.6804501439413766, "grad_norm": 9.5, "learning_rate": 8.407190789772126e-08, "logits/chosen": -2.3780012130737305, "logits/rejected": -2.274895191192627, "logps/chosen": -1.045021653175354, "logps/rejected": -1.2250845432281494, "loss": 1.6485, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.090043306350708, "rewards/margins": 0.3601256012916565, "rewards/rejected": -2.450169086456299, "step": 2600 }, { "epoch": 0.6817587019104946, "grad_norm": 8.25, "learning_rate": 8.345700465875542e-08, "logits/chosen": -2.405092716217041, "logits/rejected": -2.4313042163848877, "logps/chosen": -0.958307147026062, "logps/rejected": -1.1450821161270142, "loss": 1.6272, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.916614294052124, "rewards/margins": 0.3735499382019043, "rewards/rejected": -2.2901642322540283, "step": 2605 }, { "epoch": 0.6830672598796127, "grad_norm": 16.75, "learning_rate": 8.284349050727936e-08, "logits/chosen": -2.4323151111602783, "logits/rejected": -2.3850605487823486, "logps/chosen": -0.9111353158950806, "logps/rejected": -1.1868826150894165, "loss": 1.4883, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8222706317901611, "rewards/margins": 0.5514944791793823, "rewards/rejected": -2.373765230178833, "step": 2610 }, { "epoch": 0.6843758178487307, "grad_norm": 6.40625, "learning_rate": 8.223137825042356e-08, "logits/chosen": -2.4281005859375, "logits/rejected": -2.3337531089782715, "logps/chosen": -0.9719001650810242, "logps/rejected": -1.256135106086731, "loss": 1.5145, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9438003301620483, "rewards/margins": 0.5684697031974792, "rewards/rejected": -2.512270212173462, "step": 2615 }, { "epoch": 0.6856843758178487, "grad_norm": 9.4375, "learning_rate": 8.162068066605399e-08, "logits/chosen": -2.4331576824188232, "logits/rejected": -2.384004592895508, "logps/chosen": -0.9206689596176147, "logps/rejected": -1.1520830392837524, "loss": 1.5981, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8413379192352295, "rewards/margins": 0.4628280997276306, "rewards/rejected": -2.304166078567505, "step": 2620 }, { "epoch": 0.6869929337869668, "grad_norm": 15.5625, "learning_rate": 8.10114105025054e-08, "logits/chosen": -2.3886940479278564, "logits/rejected": -2.4150185585021973, "logps/chosen": -1.0086795091629028, "logps/rejected": -1.315510869026184, "loss": 1.6115, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0173590183258057, "rewards/margins": 0.6136623620986938, "rewards/rejected": -2.631021738052368, "step": 2625 }, { "epoch": 0.6883014917560848, "grad_norm": 9.0, "learning_rate": 8.040358047831488e-08, "logits/chosen": -2.354318857192993, "logits/rejected": -2.247954845428467, "logps/chosen": -1.0229661464691162, "logps/rejected": -1.3260799646377563, "loss": 1.5073, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0459322929382324, "rewards/margins": 0.606227695941925, "rewards/rejected": -2.6521599292755127, "step": 2630 }, { "epoch": 0.6896100497252028, "grad_norm": 7.21875, "learning_rate": 7.979720328195684e-08, "logits/chosen": -2.335116147994995, "logits/rejected": -2.330467700958252, "logps/chosen": -1.0437685251235962, "logps/rejected": -1.2559889554977417, "loss": 1.6602, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0875370502471924, "rewards/margins": 0.4244409203529358, "rewards/rejected": -2.5119779109954834, "step": 2635 }, { "epoch": 0.6909186076943209, "grad_norm": 10.625, "learning_rate": 7.919229157157751e-08, "logits/chosen": -2.413278579711914, "logits/rejected": -2.386929750442505, "logps/chosen": -0.970443844795227, "logps/rejected": -1.1448700428009033, "loss": 1.6717, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.940887689590454, "rewards/margins": 0.3488522171974182, "rewards/rejected": -2.2897400856018066, "step": 2640 }, { "epoch": 0.6922271656634389, "grad_norm": 15.625, "learning_rate": 7.858885797473132e-08, "logits/chosen": -2.4015049934387207, "logits/rejected": -2.2180652618408203, "logps/chosen": -1.0737303495407104, "logps/rejected": -1.3226743936538696, "loss": 1.6274, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.147460699081421, "rewards/margins": 0.4978879392147064, "rewards/rejected": -2.6453487873077393, "step": 2645 }, { "epoch": 0.6935357236325569, "grad_norm": 13.375, "learning_rate": 7.798691508811692e-08, "logits/chosen": -2.3971807956695557, "logits/rejected": -2.376448154449463, "logps/chosen": -0.9634662866592407, "logps/rejected": -1.1426451206207275, "loss": 1.6526, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9269325733184814, "rewards/margins": 0.3583577573299408, "rewards/rejected": -2.285290241241455, "step": 2650 }, { "epoch": 0.694844281601675, "grad_norm": 9.25, "learning_rate": 7.73864754773144e-08, "logits/chosen": -2.3886475563049316, "logits/rejected": -2.236575126647949, "logps/chosen": -1.083742380142212, "logps/rejected": -1.383603811264038, "loss": 1.6008, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.167484760284424, "rewards/margins": 0.599722683429718, "rewards/rejected": -2.767207622528076, "step": 2655 }, { "epoch": 0.696152839570793, "grad_norm": 15.375, "learning_rate": 7.678755167652271e-08, "logits/chosen": -2.446132183074951, "logits/rejected": -2.335289716720581, "logps/chosen": -1.0704548358917236, "logps/rejected": -1.1588380336761475, "loss": 1.8416, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.1409096717834473, "rewards/margins": 0.17676623165607452, "rewards/rejected": -2.317676067352295, "step": 2660 }, { "epoch": 0.697461397539911, "grad_norm": 22.125, "learning_rate": 7.619015618829851e-08, "logits/chosen": -2.4251155853271484, "logits/rejected": -2.3589186668395996, "logps/chosen": -1.0226176977157593, "logps/rejected": -1.249564290046692, "loss": 1.6323, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0452353954315186, "rewards/margins": 0.45389336347579956, "rewards/rejected": -2.499128580093384, "step": 2665 }, { "epoch": 0.6987699555090291, "grad_norm": 11.9375, "learning_rate": 7.559430148329457e-08, "logits/chosen": -2.3543694019317627, "logits/rejected": -2.3545305728912354, "logps/chosen": -1.0942565202713013, "logps/rejected": -1.277268886566162, "loss": 1.7034, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1885130405426025, "rewards/margins": 0.36602479219436646, "rewards/rejected": -2.554537773132324, "step": 2670 }, { "epoch": 0.7000785134781471, "grad_norm": 7.84375, "learning_rate": 7.500000000000004e-08, "logits/chosen": -2.4506852626800537, "logits/rejected": -2.377368688583374, "logps/chosen": -0.9695189595222473, "logps/rejected": -1.2628130912780762, "loss": 1.4822, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9390379190444946, "rewards/margins": 0.5865882635116577, "rewards/rejected": -2.5256261825561523, "step": 2675 }, { "epoch": 0.7013870714472651, "grad_norm": 18.375, "learning_rate": 7.440726414448021e-08, "logits/chosen": -2.34486722946167, "logits/rejected": -2.234945774078369, "logps/chosen": -1.0248939990997314, "logps/rejected": -1.3084797859191895, "loss": 1.4828, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.049787998199463, "rewards/margins": 0.567171573638916, "rewards/rejected": -2.616959571838379, "step": 2680 }, { "epoch": 0.7026956294163832, "grad_norm": 21.875, "learning_rate": 7.38161062901181e-08, "logits/chosen": -2.2793948650360107, "logits/rejected": -2.2135212421417236, "logps/chosen": -1.018551230430603, "logps/rejected": -1.3048248291015625, "loss": 1.5296, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.037102460861206, "rewards/margins": 0.5725473761558533, "rewards/rejected": -2.609649658203125, "step": 2685 }, { "epoch": 0.7040041873855012, "grad_norm": 10.875, "learning_rate": 7.322653877735583e-08, "logits/chosen": -2.384803056716919, "logits/rejected": -2.2426388263702393, "logps/chosen": -1.0511977672576904, "logps/rejected": -1.2277921438217163, "loss": 1.6943, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.102395534515381, "rewards/margins": 0.35318875312805176, "rewards/rejected": -2.4555842876434326, "step": 2690 }, { "epoch": 0.7053127453546192, "grad_norm": 28.5, "learning_rate": 7.263857391343693e-08, "logits/chosen": -2.309842824935913, "logits/rejected": -2.3314995765686035, "logps/chosen": -1.0462729930877686, "logps/rejected": -1.2381813526153564, "loss": 1.7201, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.092545986175537, "rewards/margins": 0.3838166892528534, "rewards/rejected": -2.476362705230713, "step": 2695 }, { "epoch": 0.7066213033237373, "grad_norm": 16.375, "learning_rate": 7.205222397214979e-08, "logits/chosen": -2.357294797897339, "logits/rejected": -2.337543249130249, "logps/chosen": -0.9523067474365234, "logps/rejected": -1.2712560892105103, "loss": 1.4749, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9046134948730469, "rewards/margins": 0.6378988027572632, "rewards/rejected": -2.5425121784210205, "step": 2700 }, { "epoch": 0.7079298612928553, "grad_norm": 7.40625, "learning_rate": 7.146750119357104e-08, "logits/chosen": -2.5635945796966553, "logits/rejected": -2.311405897140503, "logps/chosen": -0.8986091613769531, "logps/rejected": -1.390716791152954, "loss": 1.3454, "rewards/accuracies": 0.75, "rewards/chosen": -1.7972183227539062, "rewards/margins": 0.9842153787612915, "rewards/rejected": -2.781433582305908, "step": 2705 }, { "epoch": 0.7092384192619733, "grad_norm": 22.625, "learning_rate": 7.088441778381041e-08, "logits/chosen": -2.504006862640381, "logits/rejected": -2.3820934295654297, "logps/chosen": -1.0159060955047607, "logps/rejected": -1.2750980854034424, "loss": 1.6563, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0318121910095215, "rewards/margins": 0.5183839201927185, "rewards/rejected": -2.5501961708068848, "step": 2710 }, { "epoch": 0.7105469772310914, "grad_norm": 70.0, "learning_rate": 7.030298591475554e-08, "logits/chosen": -2.3255512714385986, "logits/rejected": -2.373734712600708, "logps/chosen": -0.9677811861038208, "logps/rejected": -1.1621229648590088, "loss": 1.7172, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9355623722076416, "rewards/margins": 0.38868388533592224, "rewards/rejected": -2.3242459297180176, "step": 2715 }, { "epoch": 0.7118555352002094, "grad_norm": 19.375, "learning_rate": 6.972321772381831e-08, "logits/chosen": -2.3779146671295166, "logits/rejected": -2.2710764408111572, "logps/chosen": -1.012012243270874, "logps/rejected": -1.4297925233840942, "loss": 1.3961, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.024024486541748, "rewards/margins": 0.8355606198310852, "rewards/rejected": -2.8595850467681885, "step": 2720 }, { "epoch": 0.7131640931693274, "grad_norm": 11.0, "learning_rate": 6.914512531368105e-08, "logits/chosen": -2.379075527191162, "logits/rejected": -2.3118736743927, "logps/chosen": -1.077222466468811, "logps/rejected": -1.3156849145889282, "loss": 1.5792, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.154444932937622, "rewards/margins": 0.4769246578216553, "rewards/rejected": -2.6313698291778564, "step": 2725 }, { "epoch": 0.7144726511384454, "grad_norm": 12.5625, "learning_rate": 6.856872075204429e-08, "logits/chosen": -2.4213502407073975, "logits/rejected": -2.3110151290893555, "logps/chosen": -1.0430843830108643, "logps/rejected": -1.1771119832992554, "loss": 1.6779, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0861687660217285, "rewards/margins": 0.26805490255355835, "rewards/rejected": -2.3542239665985107, "step": 2730 }, { "epoch": 0.7157812091075635, "grad_norm": 8.3125, "learning_rate": 6.799401607137461e-08, "logits/chosen": -2.460589647293091, "logits/rejected": -2.3522865772247314, "logps/chosen": -1.002406358718872, "logps/rejected": -1.266150712966919, "loss": 1.4909, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.004812717437744, "rewards/margins": 0.5274888873100281, "rewards/rejected": -2.532301425933838, "step": 2735 }, { "epoch": 0.7170897670766815, "grad_norm": 11.25, "learning_rate": 6.742102326865338e-08, "logits/chosen": -2.4368221759796143, "logits/rejected": -2.195312261581421, "logps/chosen": -0.9342187643051147, "logps/rejected": -1.3770285844802856, "loss": 1.3966, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8684375286102295, "rewards/margins": 0.8856194615364075, "rewards/rejected": -2.7540571689605713, "step": 2740 }, { "epoch": 0.7183983250457995, "grad_norm": 7.40625, "learning_rate": 6.684975430512665e-08, "logits/chosen": -2.4475514888763428, "logits/rejected": -2.332526683807373, "logps/chosen": -0.9869586825370789, "logps/rejected": -1.2103749513626099, "loss": 1.612, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9739173650741577, "rewards/margins": 0.4468325674533844, "rewards/rejected": -2.4207499027252197, "step": 2745 }, { "epoch": 0.7197068830149176, "grad_norm": 19.625, "learning_rate": 6.628022110605508e-08, "logits/chosen": -2.4842114448547363, "logits/rejected": -2.3175222873687744, "logps/chosen": -1.0531316995620728, "logps/rejected": -1.2697423696517944, "loss": 1.5886, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1062633991241455, "rewards/margins": 0.43322157859802246, "rewards/rejected": -2.539484739303589, "step": 2750 }, { "epoch": 0.7210154409840356, "grad_norm": 28.875, "learning_rate": 6.571243556046536e-08, "logits/chosen": -2.4664247035980225, "logits/rejected": -2.343776226043701, "logps/chosen": -0.985080897808075, "logps/rejected": -1.2134296894073486, "loss": 1.6075, "rewards/accuracies": 0.625, "rewards/chosen": -1.97016179561615, "rewards/margins": 0.45669737458229065, "rewards/rejected": -2.4268593788146973, "step": 2755 }, { "epoch": 0.7223239989531536, "grad_norm": 22.0, "learning_rate": 6.514640952090169e-08, "logits/chosen": -2.4205281734466553, "logits/rejected": -2.37487530708313, "logps/chosen": -0.9428628087043762, "logps/rejected": -1.1345798969268799, "loss": 1.6506, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8857256174087524, "rewards/margins": 0.38343411684036255, "rewards/rejected": -2.2691597938537598, "step": 2760 }, { "epoch": 0.7236325569222717, "grad_norm": 11.0625, "learning_rate": 6.458215480317859e-08, "logits/chosen": -2.4705214500427246, "logits/rejected": -2.253159523010254, "logps/chosen": -1.0775936841964722, "logps/rejected": -1.3897502422332764, "loss": 1.518, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.1551873683929443, "rewards/margins": 0.6243131756782532, "rewards/rejected": -2.7795004844665527, "step": 2765 }, { "epoch": 0.7249411148913897, "grad_norm": 11.25, "learning_rate": 6.40196831861342e-08, "logits/chosen": -2.3577442169189453, "logits/rejected": -2.2048380374908447, "logps/chosen": -1.130702018737793, "logps/rejected": -1.441443681716919, "loss": 1.4881, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.261404037475586, "rewards/margins": 0.6214836239814758, "rewards/rejected": -2.882887363433838, "step": 2770 }, { "epoch": 0.7262496728605077, "grad_norm": 14.625, "learning_rate": 6.345900641138439e-08, "logits/chosen": -2.242882251739502, "logits/rejected": -2.218822717666626, "logps/chosen": -1.1267540454864502, "logps/rejected": -1.364145040512085, "loss": 1.6429, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.2535080909729004, "rewards/margins": 0.4747818112373352, "rewards/rejected": -2.72829008102417, "step": 2775 }, { "epoch": 0.7275582308296258, "grad_norm": 16.125, "learning_rate": 6.290013618307747e-08, "logits/chosen": -2.4654622077941895, "logits/rejected": -2.3239941596984863, "logps/chosen": -1.0309946537017822, "logps/rejected": -1.2228128910064697, "loss": 1.6503, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0619893074035645, "rewards/margins": 0.38363632559776306, "rewards/rejected": -2.4456257820129395, "step": 2780 }, { "epoch": 0.7288667887987438, "grad_norm": 10.25, "learning_rate": 6.23430841676502e-08, "logits/chosen": -2.4951510429382324, "logits/rejected": -2.2966792583465576, "logps/chosen": -1.0006461143493652, "logps/rejected": -1.3544574975967407, "loss": 1.4838, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0012922286987305, "rewards/margins": 0.707622766494751, "rewards/rejected": -2.7089149951934814, "step": 2785 }, { "epoch": 0.7301753467678618, "grad_norm": 7.84375, "learning_rate": 6.178786199358398e-08, "logits/chosen": -2.380481004714966, "logits/rejected": -2.266873598098755, "logps/chosen": -1.0225985050201416, "logps/rejected": -1.2401126623153687, "loss": 1.5944, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.045197010040283, "rewards/margins": 0.4350283741950989, "rewards/rejected": -2.4802253246307373, "step": 2790 }, { "epoch": 0.7314839047369799, "grad_norm": 15.375, "learning_rate": 6.123448125116226e-08, "logits/chosen": -2.390517234802246, "logits/rejected": -2.345496654510498, "logps/chosen": -1.0911433696746826, "logps/rejected": -1.2971880435943604, "loss": 1.6716, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.1822867393493652, "rewards/margins": 0.41208919882774353, "rewards/rejected": -2.5943760871887207, "step": 2795 }, { "epoch": 0.7327924627060979, "grad_norm": 11.375, "learning_rate": 6.068295349222846e-08, "logits/chosen": -2.42075777053833, "logits/rejected": -2.2012033462524414, "logps/chosen": -0.9808559417724609, "logps/rejected": -1.2902246713638306, "loss": 1.5021, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9617118835449219, "rewards/margins": 0.6187372803688049, "rewards/rejected": -2.580449342727661, "step": 2800 }, { "epoch": 0.7327924627060979, "eval_logits/chosen": -2.239506721496582, "eval_logits/rejected": -2.1368095874786377, "eval_logps/chosen": -1.0048226118087769, "eval_logps/rejected": -1.2706924676895142, "eval_loss": 1.5539851188659668, "eval_rewards/accuracies": 0.6234999895095825, "eval_rewards/chosen": -2.0096452236175537, "eval_rewards/margins": 0.5317397117614746, "eval_rewards/rejected": -2.5413849353790283, "eval_runtime": 423.7371, "eval_samples_per_second": 4.72, "eval_steps_per_second": 1.18, "step": 2800 }, { "epoch": 0.7341010206752159, "grad_norm": 8.875, "learning_rate": 6.0133290229945e-08, "logits/chosen": -2.4759538173675537, "logits/rejected": -2.3870277404785156, "logps/chosen": -1.0037047863006592, "logps/rejected": -1.2498763799667358, "loss": 1.6906, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0074095726013184, "rewards/margins": 0.4923429489135742, "rewards/rejected": -2.4997527599334717, "step": 2805 }, { "epoch": 0.735409578644334, "grad_norm": 35.25, "learning_rate": 5.9585502938552854e-08, "logits/chosen": -2.433368682861328, "logits/rejected": -2.259305953979492, "logps/chosen": -1.0132333040237427, "logps/rejected": -1.3466191291809082, "loss": 1.5044, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0264666080474854, "rewards/margins": 0.6667717695236206, "rewards/rejected": -2.6932382583618164, "step": 2810 }, { "epoch": 0.736718136613452, "grad_norm": 7.90625, "learning_rate": 5.903960305313188e-08, "logits/chosen": -2.343219041824341, "logits/rejected": -2.266496419906616, "logps/chosen": -0.9952095150947571, "logps/rejected": -1.3788267374038696, "loss": 1.4484, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9904190301895142, "rewards/margins": 0.7672345042228699, "rewards/rejected": -2.7576534748077393, "step": 2815 }, { "epoch": 0.73802669458257, "grad_norm": 9.1875, "learning_rate": 5.849560196936252e-08, "logits/chosen": -2.420839309692383, "logits/rejected": -2.3406076431274414, "logps/chosen": -0.9792252779006958, "logps/rejected": -1.3440744876861572, "loss": 1.5412, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9584505558013916, "rewards/margins": 0.7296984791755676, "rewards/rejected": -2.6881489753723145, "step": 2820 }, { "epoch": 0.7393352525516881, "grad_norm": 20.25, "learning_rate": 5.7953511043287404e-08, "logits/chosen": -2.344701051712036, "logits/rejected": -2.344970941543579, "logps/chosen": -1.0544888973236084, "logps/rejected": -1.1414525508880615, "loss": 1.7302, "rewards/accuracies": 0.5, "rewards/chosen": -2.108977794647217, "rewards/margins": 0.17392729222774506, "rewards/rejected": -2.282905101776123, "step": 2825 }, { "epoch": 0.7406438105208061, "grad_norm": 9.0, "learning_rate": 5.741334159107476e-08, "logits/chosen": -2.4002814292907715, "logits/rejected": -2.283538818359375, "logps/chosen": -1.01686429977417, "logps/rejected": -1.2851479053497314, "loss": 1.5312, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.03372859954834, "rewards/margins": 0.5365673303604126, "rewards/rejected": -2.570295810699463, "step": 2830 }, { "epoch": 0.7419523684899241, "grad_norm": 10.125, "learning_rate": 5.68751048887818e-08, "logits/chosen": -2.394150495529175, "logits/rejected": -2.252131938934326, "logps/chosen": -1.075210452079773, "logps/rejected": -1.376225233078003, "loss": 1.5233, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.150420904159546, "rewards/margins": 0.6020296812057495, "rewards/rejected": -2.752450466156006, "step": 2835 }, { "epoch": 0.7432609264590422, "grad_norm": 10.125, "learning_rate": 5.63388121721197e-08, "logits/chosen": -2.3892226219177246, "logits/rejected": -2.367643356323242, "logps/chosen": -1.0946518182754517, "logps/rejected": -1.352295994758606, "loss": 1.56, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1893036365509033, "rewards/margins": 0.5152882933616638, "rewards/rejected": -2.704591989517212, "step": 2840 }, { "epoch": 0.7445694844281602, "grad_norm": 19.375, "learning_rate": 5.580447463621867e-08, "logits/chosen": -2.2845184803009033, "logits/rejected": -2.2484076023101807, "logps/chosen": -0.9914742708206177, "logps/rejected": -1.2952320575714111, "loss": 1.4715, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9829485416412354, "rewards/margins": 0.6075161099433899, "rewards/rejected": -2.5904641151428223, "step": 2845 }, { "epoch": 0.7458780423972782, "grad_norm": 50.25, "learning_rate": 5.527210343539455e-08, "logits/chosen": -2.3876287937164307, "logits/rejected": -2.297377824783325, "logps/chosen": -0.8975221514701843, "logps/rejected": -1.1839669942855835, "loss": 1.4868, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7950443029403687, "rewards/margins": 0.5728897452354431, "rewards/rejected": -2.367933988571167, "step": 2850 }, { "epoch": 0.7471866003663963, "grad_norm": 18.125, "learning_rate": 5.474170968291596e-08, "logits/chosen": -2.4001245498657227, "logits/rejected": -2.3006510734558105, "logps/chosen": -0.9456774592399597, "logps/rejected": -1.1857784986495972, "loss": 1.5596, "rewards/accuracies": 0.625, "rewards/chosen": -1.8913549184799194, "rewards/margins": 0.4802019000053406, "rewards/rejected": -2.3715569972991943, "step": 2855 }, { "epoch": 0.7484951583355143, "grad_norm": 32.25, "learning_rate": 5.421330445077197e-08, "logits/chosen": -2.468461751937866, "logits/rejected": -2.346519947052002, "logps/chosen": -1.0198897123336792, "logps/rejected": -1.219052791595459, "loss": 1.6288, "rewards/accuracies": 0.625, "rewards/chosen": -2.0397794246673584, "rewards/margins": 0.3983260989189148, "rewards/rejected": -2.438105583190918, "step": 2860 }, { "epoch": 0.7498037163046323, "grad_norm": 26.875, "learning_rate": 5.368689876944146e-08, "logits/chosen": -2.369800329208374, "logits/rejected": -2.269291639328003, "logps/chosen": -1.0505177974700928, "logps/rejected": -1.2002203464508057, "loss": 1.6824, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.1010355949401855, "rewards/margins": 0.2994045615196228, "rewards/rejected": -2.4004406929016113, "step": 2865 }, { "epoch": 0.7511122742737504, "grad_norm": 9.4375, "learning_rate": 5.3162503627662394e-08, "logits/chosen": -2.448195457458496, "logits/rejected": -2.2896294593811035, "logps/chosen": -0.971805214881897, "logps/rejected": -1.2598263025283813, "loss": 1.5944, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.943610429763794, "rewards/margins": 0.5760419964790344, "rewards/rejected": -2.5196526050567627, "step": 2870 }, { "epoch": 0.7524208322428684, "grad_norm": 14.875, "learning_rate": 5.264012997220288e-08, "logits/chosen": -2.4888925552368164, "logits/rejected": -2.354598045349121, "logps/chosen": -1.0098767280578613, "logps/rejected": -1.3402674198150635, "loss": 1.5572, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0197534561157227, "rewards/margins": 0.6607813835144043, "rewards/rejected": -2.680534839630127, "step": 2875 }, { "epoch": 0.7537293902119864, "grad_norm": 7.78125, "learning_rate": 5.211978870763217e-08, "logits/chosen": -2.431576728820801, "logits/rejected": -2.2664337158203125, "logps/chosen": -1.1248712539672852, "logps/rejected": -1.4301419258117676, "loss": 1.5278, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2497425079345703, "rewards/margins": 0.6105412840843201, "rewards/rejected": -2.860283851623535, "step": 2880 }, { "epoch": 0.7550379481811044, "grad_norm": 26.0, "learning_rate": 5.160149069609349e-08, "logits/chosen": -2.2565231323242188, "logits/rejected": -2.1964163780212402, "logps/chosen": -1.070330262184143, "logps/rejected": -1.3319157361984253, "loss": 1.5594, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.140660524368286, "rewards/margins": 0.5231710076332092, "rewards/rejected": -2.6638314723968506, "step": 2885 }, { "epoch": 0.7563465061502225, "grad_norm": 7.9375, "learning_rate": 5.1085246757077006e-08, "logits/chosen": -2.406395435333252, "logits/rejected": -2.3144330978393555, "logps/chosen": -0.9257711172103882, "logps/rejected": -1.3492621183395386, "loss": 1.3663, "rewards/accuracies": 0.75, "rewards/chosen": -1.8515422344207764, "rewards/margins": 0.8469821810722351, "rewards/rejected": -2.698524236679077, "step": 2890 }, { "epoch": 0.7576550641193405, "grad_norm": 16.625, "learning_rate": 5.0571067667194077e-08, "logits/chosen": -2.4465479850769043, "logits/rejected": -2.3659675121307373, "logps/chosen": -0.9980164766311646, "logps/rejected": -1.1626570224761963, "loss": 1.6926, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.996032953262329, "rewards/margins": 0.329280823469162, "rewards/rejected": -2.3253140449523926, "step": 2895 }, { "epoch": 0.7589636220884585, "grad_norm": 12.5, "learning_rate": 5.005896415995213e-08, "logits/chosen": -2.4618821144104004, "logits/rejected": -2.2898895740509033, "logps/chosen": -1.0477806329727173, "logps/rejected": -1.245457410812378, "loss": 1.5916, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0955612659454346, "rewards/margins": 0.39535361528396606, "rewards/rejected": -2.490914821624756, "step": 2900 }, { "epoch": 0.7602721800575766, "grad_norm": 29.875, "learning_rate": 4.954894692553094e-08, "logits/chosen": -2.369992256164551, "logits/rejected": -2.262063503265381, "logps/chosen": -1.0548526048660278, "logps/rejected": -1.2033531665802002, "loss": 1.6886, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.1097052097320557, "rewards/margins": 0.29700133204460144, "rewards/rejected": -2.4067063331604004, "step": 2905 }, { "epoch": 0.7615807380266946, "grad_norm": 34.5, "learning_rate": 4.904102661055904e-08, "logits/chosen": -2.434046506881714, "logits/rejected": -2.2709012031555176, "logps/chosen": -1.0288649797439575, "logps/rejected": -1.4876153469085693, "loss": 1.4046, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.057729959487915, "rewards/margins": 0.9175006747245789, "rewards/rejected": -2.9752306938171387, "step": 2910 }, { "epoch": 0.7628892959958126, "grad_norm": 8.3125, "learning_rate": 4.853521381789189e-08, "logits/chosen": -2.4705705642700195, "logits/rejected": -2.325921058654785, "logps/chosen": -0.9357919692993164, "logps/rejected": -1.3293598890304565, "loss": 1.3996, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8715839385986328, "rewards/margins": 0.7871360182762146, "rewards/rejected": -2.658719778060913, "step": 2915 }, { "epoch": 0.7641978539649307, "grad_norm": 12.4375, "learning_rate": 4.803151910639015e-08, "logits/chosen": -2.377349615097046, "logits/rejected": -2.225003480911255, "logps/chosen": -0.9961360692977905, "logps/rejected": -1.3070439100265503, "loss": 1.4703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.992272138595581, "rewards/margins": 0.6218158006668091, "rewards/rejected": -2.6140878200531006, "step": 2920 }, { "epoch": 0.7655064119340487, "grad_norm": 9.9375, "learning_rate": 4.752995299069962e-08, "logits/chosen": -2.3604164123535156, "logits/rejected": -2.2199742794036865, "logps/chosen": -0.9728133082389832, "logps/rejected": -1.1908044815063477, "loss": 1.6074, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9456266164779663, "rewards/margins": 0.4359821677207947, "rewards/rejected": -2.3816089630126953, "step": 2925 }, { "epoch": 0.7668149699031667, "grad_norm": 10.8125, "learning_rate": 4.703052594103156e-08, "logits/chosen": -2.381723403930664, "logits/rejected": -2.271658420562744, "logps/chosen": -0.9718124270439148, "logps/rejected": -1.2840853929519653, "loss": 1.5015, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9436248540878296, "rewards/margins": 0.6245457530021667, "rewards/rejected": -2.5681707859039307, "step": 2930 }, { "epoch": 0.7681235278722848, "grad_norm": 15.0, "learning_rate": 4.6533248382944e-08, "logits/chosen": -2.429992437362671, "logits/rejected": -2.2576122283935547, "logps/chosen": -0.9073755145072937, "logps/rejected": -1.2488524913787842, "loss": 1.4615, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8147510290145874, "rewards/margins": 0.6829543113708496, "rewards/rejected": -2.4977049827575684, "step": 2935 }, { "epoch": 0.7694320858414028, "grad_norm": 28.875, "learning_rate": 4.60381306971245e-08, "logits/chosen": -2.5037996768951416, "logits/rejected": -2.273862838745117, "logps/chosen": -0.9683465957641602, "logps/rejected": -1.240882158279419, "loss": 1.4898, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9366931915283203, "rewards/margins": 0.5450714826583862, "rewards/rejected": -2.481764316558838, "step": 2940 }, { "epoch": 0.7707406438105208, "grad_norm": 9.1875, "learning_rate": 4.554518321917293e-08, "logits/chosen": -2.367374897003174, "logits/rejected": -2.276505947113037, "logps/chosen": -1.021672010421753, "logps/rejected": -1.3555033206939697, "loss": 1.5269, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.043344020843506, "rewards/margins": 0.6676627397537231, "rewards/rejected": -2.7110066413879395, "step": 2945 }, { "epoch": 0.7720492017796389, "grad_norm": 13.25, "learning_rate": 4.5054416239386236e-08, "logits/chosen": -2.4519424438476562, "logits/rejected": -2.393014430999756, "logps/chosen": -0.9677358865737915, "logps/rejected": -1.2733625173568726, "loss": 1.523, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.935471773147583, "rewards/margins": 0.6112536191940308, "rewards/rejected": -2.546725034713745, "step": 2950 }, { "epoch": 0.7733577597487569, "grad_norm": 18.625, "learning_rate": 4.4565840002543195e-08, "logits/chosen": -2.4616267681121826, "logits/rejected": -2.3774914741516113, "logps/chosen": -1.0956653356552124, "logps/rejected": -1.2979776859283447, "loss": 1.6748, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.191330671310425, "rewards/margins": 0.4046247601509094, "rewards/rejected": -2.5959553718566895, "step": 2955 }, { "epoch": 0.7746663177178749, "grad_norm": 13.6875, "learning_rate": 4.407946470769093e-08, "logits/chosen": -2.4022555351257324, "logits/rejected": -2.3983538150787354, "logps/chosen": -0.9870920181274414, "logps/rejected": -1.4206526279449463, "loss": 1.3953, "rewards/accuracies": 0.75, "rewards/chosen": -1.9741840362548828, "rewards/margins": 0.8671212196350098, "rewards/rejected": -2.8413052558898926, "step": 2960 }, { "epoch": 0.775974875686993, "grad_norm": 23.125, "learning_rate": 4.359530050793158e-08, "logits/chosen": -2.50380277633667, "logits/rejected": -2.329406261444092, "logps/chosen": -0.9522857666015625, "logps/rejected": -1.1493818759918213, "loss": 1.6953, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.904571533203125, "rewards/margins": 0.39419206976890564, "rewards/rejected": -2.2987637519836426, "step": 2965 }, { "epoch": 0.777283433656111, "grad_norm": 11.75, "learning_rate": 4.311335751021082e-08, "logits/chosen": -2.4081146717071533, "logits/rejected": -2.2348570823669434, "logps/chosen": -0.9355310201644897, "logps/rejected": -1.3979088068008423, "loss": 1.4565, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8710620403289795, "rewards/margins": 0.9247554540634155, "rewards/rejected": -2.7958176136016846, "step": 2970 }, { "epoch": 0.778591991625229, "grad_norm": 12.375, "learning_rate": 4.2633645775106594e-08, "logits/chosen": -2.396559238433838, "logits/rejected": -2.224064350128174, "logps/chosen": -0.9377058148384094, "logps/rejected": -1.274951457977295, "loss": 1.5142, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8754116296768188, "rewards/margins": 0.6744912266731262, "rewards/rejected": -2.54990291595459, "step": 2975 }, { "epoch": 0.7799005495943471, "grad_norm": 11.625, "learning_rate": 4.215617531661901e-08, "logits/chosen": -2.464500665664673, "logits/rejected": -2.2840540409088135, "logps/chosen": -1.043826937675476, "logps/rejected": -1.3147501945495605, "loss": 1.5604, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.087653875350952, "rewards/margins": 0.5418466925621033, "rewards/rejected": -2.629500389099121, "step": 2980 }, { "epoch": 0.7812091075634651, "grad_norm": 6.5, "learning_rate": 4.168095610196166e-08, "logits/chosen": -2.533639430999756, "logits/rejected": -2.4199843406677246, "logps/chosen": -0.9270984530448914, "logps/rejected": -1.1742452383041382, "loss": 1.5344, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8541969060897827, "rewards/margins": 0.4942935109138489, "rewards/rejected": -2.3484904766082764, "step": 2985 }, { "epoch": 0.7825176655325831, "grad_norm": 7.25, "learning_rate": 4.120799805135313e-08, "logits/chosen": -2.5143609046936035, "logits/rejected": -2.468674421310425, "logps/chosen": -1.0180352926254272, "logps/rejected": -1.1511329412460327, "loss": 1.7402, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0360705852508545, "rewards/margins": 0.26619529724121094, "rewards/rejected": -2.3022658824920654, "step": 2990 }, { "epoch": 0.7838262235017012, "grad_norm": 26.25, "learning_rate": 4.073731103781031e-08, "logits/chosen": -2.460416555404663, "logits/rejected": -2.2100436687469482, "logps/chosen": -1.0915604829788208, "logps/rejected": -1.3500556945800781, "loss": 1.5588, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1831209659576416, "rewards/margins": 0.5169904232025146, "rewards/rejected": -2.7001113891601562, "step": 2995 }, { "epoch": 0.7851347814708192, "grad_norm": 28.25, "learning_rate": 4.0268904886941974e-08, "logits/chosen": -2.453082799911499, "logits/rejected": -2.389941692352295, "logps/chosen": -1.0352948904037476, "logps/rejected": -1.1685142517089844, "loss": 1.6889, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.070589780807495, "rewards/margins": 0.2664383053779602, "rewards/rejected": -2.3370285034179688, "step": 3000 }, { "epoch": 0.7864433394399372, "grad_norm": 37.5, "learning_rate": 3.980278937674384e-08, "logits/chosen": -2.4849653244018555, "logits/rejected": -2.38706636428833, "logps/chosen": -0.9351577758789062, "logps/rejected": -1.3579368591308594, "loss": 1.441, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8703155517578125, "rewards/margins": 0.8455581665039062, "rewards/rejected": -2.7158737182617188, "step": 3005 }, { "epoch": 0.7877518974090553, "grad_norm": 15.5, "learning_rate": 3.9338974237394474e-08, "logits/chosen": -2.4985506534576416, "logits/rejected": -2.315535545349121, "logps/chosen": -1.0119073390960693, "logps/rejected": -1.1582492589950562, "loss": 1.6512, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.0238146781921387, "rewards/margins": 0.2926838994026184, "rewards/rejected": -2.3164985179901123, "step": 3010 }, { "epoch": 0.7890604553781733, "grad_norm": 17.0, "learning_rate": 3.8877469151052086e-08, "logits/chosen": -2.4100759029388428, "logits/rejected": -2.2711284160614014, "logps/chosen": -0.9768432378768921, "logps/rejected": -1.251725673675537, "loss": 1.5355, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9536864757537842, "rewards/margins": 0.5497647523880005, "rewards/rejected": -2.503451347351074, "step": 3015 }, { "epoch": 0.7903690133472913, "grad_norm": 27.625, "learning_rate": 3.841828375165234e-08, "logits/chosen": -2.440445899963379, "logits/rejected": -2.4146180152893066, "logps/chosen": -0.9715566635131836, "logps/rejected": -1.1971080303192139, "loss": 1.6094, "rewards/accuracies": 0.625, "rewards/chosen": -1.9431133270263672, "rewards/margins": 0.4511028826236725, "rewards/rejected": -2.3942160606384277, "step": 3020 }, { "epoch": 0.7916775713164094, "grad_norm": 10.1875, "learning_rate": 3.796142762470753e-08, "logits/chosen": -2.380035877227783, "logits/rejected": -2.360431671142578, "logps/chosen": -1.0238114595413208, "logps/rejected": -1.1766431331634521, "loss": 1.7175, "rewards/accuracies": 0.5, "rewards/chosen": -2.0476229190826416, "rewards/margins": 0.305663526058197, "rewards/rejected": -2.3532862663269043, "step": 3025 }, { "epoch": 0.7929861292855274, "grad_norm": 7.0, "learning_rate": 3.750691030710613e-08, "logits/chosen": -2.4228291511535645, "logits/rejected": -2.3697350025177, "logps/chosen": -1.0448884963989258, "logps/rejected": -1.2764734029769897, "loss": 1.6438, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0897769927978516, "rewards/margins": 0.46316996216773987, "rewards/rejected": -2.5529468059539795, "step": 3030 }, { "epoch": 0.7942946872546454, "grad_norm": 9.125, "learning_rate": 3.7054741286914044e-08, "logits/chosen": -2.457716464996338, "logits/rejected": -2.3829922676086426, "logps/chosen": -1.0081822872161865, "logps/rejected": -1.1987855434417725, "loss": 1.6389, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.016364574432373, "rewards/margins": 0.3812064528465271, "rewards/rejected": -2.397571086883545, "step": 3035 }, { "epoch": 0.7956032452237635, "grad_norm": 10.875, "learning_rate": 3.660493000317624e-08, "logits/chosen": -2.4820752143859863, "logits/rejected": -2.4047961235046387, "logps/chosen": -0.9398635625839233, "logps/rejected": -1.2753970623016357, "loss": 1.5328, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8797271251678467, "rewards/margins": 0.67106693983078, "rewards/rejected": -2.5507941246032715, "step": 3040 }, { "epoch": 0.7969118031928815, "grad_norm": 12.3125, "learning_rate": 3.615748584571985e-08, "logits/chosen": -2.23185658454895, "logits/rejected": -2.2139110565185547, "logps/chosen": -1.0182114839553833, "logps/rejected": -1.1205089092254639, "loss": 1.7144, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0364229679107666, "rewards/margins": 0.20459461212158203, "rewards/rejected": -2.2410178184509277, "step": 3045 }, { "epoch": 0.7982203611619995, "grad_norm": 14.125, "learning_rate": 3.571241815495837e-08, "logits/chosen": -2.3096487522125244, "logits/rejected": -2.278414487838745, "logps/chosen": -1.152794599533081, "logps/rejected": -1.3077088594436646, "loss": 1.7722, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.305589199066162, "rewards/margins": 0.3098284602165222, "rewards/rejected": -2.615417718887329, "step": 3050 }, { "epoch": 0.7995289191311175, "grad_norm": 6.4375, "learning_rate": 3.526973622169616e-08, "logits/chosen": -2.5251197814941406, "logits/rejected": -2.4078097343444824, "logps/chosen": -0.9497879147529602, "logps/rejected": -1.1980053186416626, "loss": 1.5637, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8995758295059204, "rewards/margins": 0.49643492698669434, "rewards/rejected": -2.396010637283325, "step": 3055 }, { "epoch": 0.8008374771002356, "grad_norm": 17.0, "learning_rate": 3.482944928693507e-08, "logits/chosen": -2.4318175315856934, "logits/rejected": -2.2547106742858887, "logps/chosen": -0.9657788276672363, "logps/rejected": -1.2323737144470215, "loss": 1.4845, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9315576553344727, "rewards/margins": 0.5331897139549255, "rewards/rejected": -2.464747428894043, "step": 3060 }, { "epoch": 0.8021460350693536, "grad_norm": 4.625, "learning_rate": 3.4391566541681085e-08, "logits/chosen": -2.3926703929901123, "logits/rejected": -2.348784923553467, "logps/chosen": -0.993707001209259, "logps/rejected": -1.2574628591537476, "loss": 1.5595, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.987414002418518, "rewards/margins": 0.5275118947029114, "rewards/rejected": -2.514925718307495, "step": 3065 }, { "epoch": 0.8034545930384716, "grad_norm": 14.9375, "learning_rate": 3.3956097126752754e-08, "logits/chosen": -2.3634772300720215, "logits/rejected": -2.2741293907165527, "logps/chosen": -0.947992205619812, "logps/rejected": -1.2986249923706055, "loss": 1.4365, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.895984411239624, "rewards/margins": 0.7012653350830078, "rewards/rejected": -2.597249984741211, "step": 3070 }, { "epoch": 0.8047631510075897, "grad_norm": 8.75, "learning_rate": 3.3523050132590155e-08, "logits/chosen": -2.4607555866241455, "logits/rejected": -2.365262508392334, "logps/chosen": -0.9718250036239624, "logps/rejected": -1.2156105041503906, "loss": 1.5143, "rewards/accuracies": 0.625, "rewards/chosen": -1.9436500072479248, "rewards/margins": 0.4875709116458893, "rewards/rejected": -2.4312210083007812, "step": 3075 }, { "epoch": 0.8060717089767077, "grad_norm": 3.84375, "learning_rate": 3.30924345990654e-08, "logits/chosen": -2.414721965789795, "logits/rejected": -2.3310763835906982, "logps/chosen": -1.0256900787353516, "logps/rejected": -1.2254316806793213, "loss": 1.6853, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.051380157470703, "rewards/margins": 0.3994831442832947, "rewards/rejected": -2.4508633613586426, "step": 3080 }, { "epoch": 0.8073802669458257, "grad_norm": 8.125, "learning_rate": 3.26642595152936e-08, "logits/chosen": -2.399613618850708, "logits/rejected": -2.3682827949523926, "logps/chosen": -1.031362771987915, "logps/rejected": -1.2638393640518188, "loss": 1.5837, "rewards/accuracies": 0.625, "rewards/chosen": -2.06272554397583, "rewards/margins": 0.4649530053138733, "rewards/rejected": -2.5276787281036377, "step": 3085 }, { "epoch": 0.8086888249149438, "grad_norm": 30.25, "learning_rate": 3.223853381944551e-08, "logits/chosen": -2.361109733581543, "logits/rejected": -2.2909817695617676, "logps/chosen": -0.9989153146743774, "logps/rejected": -1.1343430280685425, "loss": 1.6686, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9978306293487549, "rewards/margins": 0.270855188369751, "rewards/rejected": -2.268686056137085, "step": 3090 }, { "epoch": 0.8099973828840618, "grad_norm": 16.875, "learning_rate": 3.1815266398560834e-08, "logits/chosen": -2.3800570964813232, "logits/rejected": -2.3379855155944824, "logps/chosen": -0.9914461374282837, "logps/rejected": -1.265268087387085, "loss": 1.5595, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9828922748565674, "rewards/margins": 0.5476440191268921, "rewards/rejected": -2.53053617477417, "step": 3095 }, { "epoch": 0.8113059408531798, "grad_norm": 42.0, "learning_rate": 3.1394466088362536e-08, "logits/chosen": -2.3924407958984375, "logits/rejected": -2.3550355434417725, "logps/chosen": -1.0313466787338257, "logps/rejected": -1.1435056924819946, "loss": 1.7257, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0626933574676514, "rewards/margins": 0.2243177890777588, "rewards/rejected": -2.2870113849639893, "step": 3100 }, { "epoch": 0.8126144988222979, "grad_norm": 16.125, "learning_rate": 3.097614167307276e-08, "logits/chosen": -2.391306161880493, "logits/rejected": -2.2163243293762207, "logps/chosen": -1.1593459844589233, "logps/rejected": -1.4063975811004639, "loss": 1.6107, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.3186919689178467, "rewards/margins": 0.4941031336784363, "rewards/rejected": -2.8127951622009277, "step": 3105 }, { "epoch": 0.8139230567914159, "grad_norm": 20.625, "learning_rate": 3.056030188522908e-08, "logits/chosen": -2.4196321964263916, "logits/rejected": -2.3014461994171143, "logps/chosen": -0.9566611051559448, "logps/rejected": -1.4016366004943848, "loss": 1.4639, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9133222103118896, "rewards/margins": 0.8899508714675903, "rewards/rejected": -2.8032732009887695, "step": 3110 }, { "epoch": 0.8152316147605339, "grad_norm": 16.0, "learning_rate": 3.014695540550251e-08, "logits/chosen": -2.438748359680176, "logits/rejected": -2.4371185302734375, "logps/chosen": -1.0175426006317139, "logps/rejected": -1.1882712841033936, "loss": 1.6524, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0350852012634277, "rewards/margins": 0.3414570987224579, "rewards/rejected": -2.376542568206787, "step": 3115 }, { "epoch": 0.816540172729652, "grad_norm": 17.25, "learning_rate": 2.973611086251603e-08, "logits/chosen": -2.4398794174194336, "logits/rejected": -2.398367166519165, "logps/chosen": -0.9818497896194458, "logps/rejected": -1.1953831911087036, "loss": 1.5379, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9636995792388916, "rewards/margins": 0.4270668923854828, "rewards/rejected": -2.3907663822174072, "step": 3120 }, { "epoch": 0.81784873069877, "grad_norm": 9.125, "learning_rate": 2.9327776832664725e-08, "logits/chosen": -2.4159655570983887, "logits/rejected": -2.337883710861206, "logps/chosen": -0.9473508596420288, "logps/rejected": -1.1087888479232788, "loss": 1.6631, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.8947017192840576, "rewards/margins": 0.3228759169578552, "rewards/rejected": -2.2175776958465576, "step": 3125 }, { "epoch": 0.819157288667888, "grad_norm": 12.5, "learning_rate": 2.892196183993658e-08, "logits/chosen": -2.3964157104492188, "logits/rejected": -2.245603561401367, "logps/chosen": -0.9592461585998535, "logps/rejected": -1.2759965658187866, "loss": 1.4688, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.918492317199707, "rewards/margins": 0.6335008144378662, "rewards/rejected": -2.5519931316375732, "step": 3130 }, { "epoch": 0.8204658466370061, "grad_norm": 17.25, "learning_rate": 2.851867435573461e-08, "logits/chosen": -2.436161518096924, "logits/rejected": -2.298663854598999, "logps/chosen": -1.0512534379959106, "logps/rejected": -1.3134206533432007, "loss": 1.5874, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1025068759918213, "rewards/margins": 0.5243349075317383, "rewards/rejected": -2.6268413066864014, "step": 3135 }, { "epoch": 0.821774404606124, "grad_norm": 15.5, "learning_rate": 2.8117922798699883e-08, "logits/chosen": -2.449985980987549, "logits/rejected": -2.3135969638824463, "logps/chosen": -1.0239399671554565, "logps/rejected": -1.333899736404419, "loss": 1.5273, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.047879934310913, "rewards/margins": 0.6199191808700562, "rewards/rejected": -2.667799472808838, "step": 3140 }, { "epoch": 0.823082962575242, "grad_norm": 5.3125, "learning_rate": 2.7719715534536074e-08, "logits/chosen": -2.4440865516662598, "logits/rejected": -2.317347764968872, "logps/chosen": -0.9288721084594727, "logps/rejected": -1.2281444072723389, "loss": 1.4968, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8577442169189453, "rewards/margins": 0.598544716835022, "rewards/rejected": -2.4562888145446777, "step": 3145 }, { "epoch": 0.8243915205443602, "grad_norm": 8.5, "learning_rate": 2.7324060875834502e-08, "logits/chosen": -2.351055860519409, "logits/rejected": -2.3224778175354004, "logps/chosen": -1.0313200950622559, "logps/rejected": -1.3013689517974854, "loss": 1.5234, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0626401901245117, "rewards/margins": 0.5400978922843933, "rewards/rejected": -2.6027379035949707, "step": 3150 }, { "epoch": 0.8257000785134782, "grad_norm": 10.1875, "learning_rate": 2.693096708190079e-08, "logits/chosen": -2.435624599456787, "logits/rejected": -2.3732106685638428, "logps/chosen": -0.9744235277175903, "logps/rejected": -1.0926306247711182, "loss": 1.6975, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.9488470554351807, "rewards/margins": 0.23641440272331238, "rewards/rejected": -2.1852612495422363, "step": 3155 }, { "epoch": 0.8270086364825961, "grad_norm": 47.0, "learning_rate": 2.654044235858247e-08, "logits/chosen": -2.3651368618011475, "logits/rejected": -2.315763235092163, "logps/chosen": -0.9055688977241516, "logps/rejected": -1.1408638954162598, "loss": 1.5665, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8111377954483032, "rewards/margins": 0.47058993577957153, "rewards/rejected": -2.2817277908325195, "step": 3160 }, { "epoch": 0.8283171944517143, "grad_norm": 14.125, "learning_rate": 2.6152494858097514e-08, "logits/chosen": -2.367861747741699, "logits/rejected": -2.2873597145080566, "logps/chosen": -1.073318600654602, "logps/rejected": -1.3021563291549683, "loss": 1.6119, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.146637201309204, "rewards/margins": 0.45767563581466675, "rewards/rejected": -2.6043126583099365, "step": 3165 }, { "epoch": 0.8296257524208323, "grad_norm": 21.875, "learning_rate": 2.5767132678864466e-08, "logits/chosen": -2.4992244243621826, "logits/rejected": -2.381605625152588, "logps/chosen": -0.9065955877304077, "logps/rejected": -1.0601500272750854, "loss": 1.6404, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8131911754608154, "rewards/margins": 0.3071088194847107, "rewards/rejected": -2.120300054550171, "step": 3170 }, { "epoch": 0.8309343103899502, "grad_norm": 43.0, "learning_rate": 2.5384363865332992e-08, "logits/chosen": -2.4742653369903564, "logits/rejected": -2.344566822052002, "logps/chosen": -0.9113607406616211, "logps/rejected": -1.2585718631744385, "loss": 1.4236, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8227214813232422, "rewards/margins": 0.6944223642349243, "rewards/rejected": -2.517143726348877, "step": 3175 }, { "epoch": 0.8322428683590684, "grad_norm": 20.25, "learning_rate": 2.500419640781628e-08, "logits/chosen": -2.4108669757843018, "logits/rejected": -2.2749760150909424, "logps/chosen": -1.0304162502288818, "logps/rejected": -1.349993348121643, "loss": 1.5144, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0608325004577637, "rewards/margins": 0.6391541957855225, "rewards/rejected": -2.699986696243286, "step": 3180 }, { "epoch": 0.8335514263281864, "grad_norm": 11.5625, "learning_rate": 2.4626638242324016e-08, "logits/chosen": -2.26642107963562, "logits/rejected": -2.2630293369293213, "logps/chosen": -1.0319706201553345, "logps/rejected": -1.1296160221099854, "loss": 1.74, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.063941240310669, "rewards/margins": 0.19529111683368683, "rewards/rejected": -2.2592320442199707, "step": 3185 }, { "epoch": 0.8348599842973043, "grad_norm": 7.875, "learning_rate": 2.4251697250396952e-08, "logits/chosen": -2.4506642818450928, "logits/rejected": -2.371619462966919, "logps/chosen": -1.1012970209121704, "logps/rejected": -1.2737290859222412, "loss": 1.6894, "rewards/accuracies": 0.5625, "rewards/chosen": -2.202594041824341, "rewards/margins": 0.3448641896247864, "rewards/rejected": -2.5474581718444824, "step": 3190 }, { "epoch": 0.8361685422664225, "grad_norm": 13.6875, "learning_rate": 2.387938125894208e-08, "logits/chosen": -2.400301694869995, "logits/rejected": -2.358680486679077, "logps/chosen": -1.0398378372192383, "logps/rejected": -1.3217432498931885, "loss": 1.6123, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0796756744384766, "rewards/margins": 0.5638106465339661, "rewards/rejected": -2.643486499786377, "step": 3195 }, { "epoch": 0.8374771002355405, "grad_norm": 15.6875, "learning_rate": 2.3509698040069565e-08, "logits/chosen": -2.4488673210144043, "logits/rejected": -2.415466785430908, "logps/chosen": -0.9651442766189575, "logps/rejected": -1.1178889274597168, "loss": 1.6684, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.930288553237915, "rewards/margins": 0.30548930168151855, "rewards/rejected": -2.2357778549194336, "step": 3200 }, { "epoch": 0.8374771002355405, "eval_logits/chosen": -2.2404778003692627, "eval_logits/rejected": -2.137896776199341, "eval_logps/chosen": -1.0095354318618774, "eval_logps/rejected": -1.276259183883667, "eval_loss": 1.5541574954986572, "eval_rewards/accuracies": 0.6215000152587891, "eval_rewards/chosen": -2.019070863723755, "eval_rewards/margins": 0.5334474444389343, "eval_rewards/rejected": -2.552518367767334, "eval_runtime": 424.5741, "eval_samples_per_second": 4.711, "eval_steps_per_second": 1.178, "step": 3200 }, { "epoch": 0.8387856582046584, "grad_norm": 20.875, "learning_rate": 2.3142655310930214e-08, "logits/chosen": -2.3685390949249268, "logits/rejected": -2.326451539993286, "logps/chosen": -0.9285084009170532, "logps/rejected": -1.202251672744751, "loss": 1.52, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8570168018341064, "rewards/margins": 0.5474866628646851, "rewards/rejected": -2.404503345489502, "step": 3205 }, { "epoch": 0.8400942161737766, "grad_norm": 11.8125, "learning_rate": 2.277826073355462e-08, "logits/chosen": -2.392665386199951, "logits/rejected": -2.2857306003570557, "logps/chosen": -1.0543001890182495, "logps/rejected": -1.2741917371749878, "loss": 1.6742, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.108600378036499, "rewards/margins": 0.4397830367088318, "rewards/rejected": -2.5483834743499756, "step": 3210 }, { "epoch": 0.8414027741428945, "grad_norm": 25.375, "learning_rate": 2.2416521914693083e-08, "logits/chosen": -2.4511687755584717, "logits/rejected": -2.3786120414733887, "logps/chosen": -1.077629566192627, "logps/rejected": -1.2740514278411865, "loss": 1.636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.155259132385254, "rewards/margins": 0.3928437829017639, "rewards/rejected": -2.548102855682373, "step": 3215 }, { "epoch": 0.8427113321120125, "grad_norm": 19.0, "learning_rate": 2.2057446405656787e-08, "logits/chosen": -2.421433687210083, "logits/rejected": -2.3072190284729004, "logps/chosen": -1.0151922702789307, "logps/rejected": -1.412712812423706, "loss": 1.49, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0303845405578613, "rewards/margins": 0.7950409650802612, "rewards/rejected": -2.825425624847412, "step": 3220 }, { "epoch": 0.8440198900811305, "grad_norm": 17.5, "learning_rate": 2.1701041702160323e-08, "logits/chosen": -2.4857101440429688, "logits/rejected": -2.4346108436584473, "logps/chosen": -1.0297340154647827, "logps/rejected": -1.2436965703964233, "loss": 1.607, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.0594680309295654, "rewards/margins": 0.42792487144470215, "rewards/rejected": -2.4873931407928467, "step": 3225 }, { "epoch": 0.8453284480502486, "grad_norm": 24.375, "learning_rate": 2.1347315244164983e-08, "logits/chosen": -2.446000337600708, "logits/rejected": -2.278386354446411, "logps/chosen": -1.1050500869750977, "logps/rejected": -1.2150299549102783, "loss": 1.7479, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.2101001739501953, "rewards/margins": 0.21995961666107178, "rewards/rejected": -2.4300599098205566, "step": 3230 }, { "epoch": 0.8466370060193666, "grad_norm": 29.25, "learning_rate": 2.0996274415723737e-08, "logits/chosen": -2.421968698501587, "logits/rejected": -2.3339333534240723, "logps/chosen": -1.025153398513794, "logps/rejected": -1.3070855140686035, "loss": 1.5685, "rewards/accuracies": 0.625, "rewards/chosen": -2.050306797027588, "rewards/margins": 0.5638642311096191, "rewards/rejected": -2.614171028137207, "step": 3235 }, { "epoch": 0.8479455639884846, "grad_norm": 9.5, "learning_rate": 2.0647926544826815e-08, "logits/chosen": -2.3939080238342285, "logits/rejected": -2.2694098949432373, "logps/chosen": -0.8943246603012085, "logps/rejected": -1.2874724864959717, "loss": 1.4295, "rewards/accuracies": 0.625, "rewards/chosen": -1.788649320602417, "rewards/margins": 0.7862960696220398, "rewards/rejected": -2.5749449729919434, "step": 3240 }, { "epoch": 0.8492541219576027, "grad_norm": 14.75, "learning_rate": 2.0302278903248937e-08, "logits/chosen": -2.399911642074585, "logits/rejected": -2.302757740020752, "logps/chosen": -1.0776337385177612, "logps/rejected": -1.333750605583191, "loss": 1.5764, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1552674770355225, "rewards/margins": 0.5122334361076355, "rewards/rejected": -2.667501211166382, "step": 3245 }, { "epoch": 0.8505626799267207, "grad_norm": 8.5625, "learning_rate": 1.9959338706397406e-08, "logits/chosen": -2.4527502059936523, "logits/rejected": -2.3132331371307373, "logps/chosen": -0.9426477551460266, "logps/rejected": -1.207658290863037, "loss": 1.5378, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8852955102920532, "rewards/margins": 0.530021071434021, "rewards/rejected": -2.415316581726074, "step": 3250 }, { "epoch": 0.8518712378958387, "grad_norm": 48.5, "learning_rate": 1.9619113113161552e-08, "logits/chosen": -2.448359966278076, "logits/rejected": -2.2861156463623047, "logps/chosen": -1.0301740169525146, "logps/rejected": -1.2991020679473877, "loss": 1.5196, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0603480339050293, "rewards/margins": 0.5378562211990356, "rewards/rejected": -2.5982041358947754, "step": 3255 }, { "epoch": 0.8531797958649568, "grad_norm": 12.25, "learning_rate": 1.9281609225763167e-08, "logits/chosen": -2.5451226234436035, "logits/rejected": -2.3040645122528076, "logps/chosen": -0.8870974779129028, "logps/rejected": -1.3016536235809326, "loss": 1.3545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7741949558258057, "rewards/margins": 0.82911217212677, "rewards/rejected": -2.6033072471618652, "step": 3260 }, { "epoch": 0.8544883538340748, "grad_norm": 13.9375, "learning_rate": 1.89468340896084e-08, "logits/chosen": -2.30216646194458, "logits/rejected": -2.319826602935791, "logps/chosen": -1.0067510604858398, "logps/rejected": -1.255724310874939, "loss": 1.6052, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0135021209716797, "rewards/margins": 0.4979466497898102, "rewards/rejected": -2.511448621749878, "step": 3265 }, { "epoch": 0.8557969118031928, "grad_norm": 8.5625, "learning_rate": 1.8614794693140635e-08, "logits/chosen": -2.4642372131347656, "logits/rejected": -2.354973316192627, "logps/chosen": -1.0428730249404907, "logps/rejected": -1.3670299053192139, "loss": 1.5068, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0857460498809814, "rewards/margins": 0.6483141183853149, "rewards/rejected": -2.7340598106384277, "step": 3270 }, { "epoch": 0.857105469772311, "grad_norm": 13.0625, "learning_rate": 1.8285497967694495e-08, "logits/chosen": -2.356292247772217, "logits/rejected": -2.4088733196258545, "logps/chosen": -1.1163990497589111, "logps/rejected": -1.143258810043335, "loss": 1.8962, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.2327980995178223, "rewards/margins": 0.053719114512205124, "rewards/rejected": -2.28651762008667, "step": 3275 }, { "epoch": 0.8584140277414289, "grad_norm": 16.625, "learning_rate": 1.795895078735139e-08, "logits/chosen": -2.4612107276916504, "logits/rejected": -2.379434585571289, "logps/chosen": -1.0002456903457642, "logps/rejected": -1.2225115299224854, "loss": 1.5836, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0004913806915283, "rewards/margins": 0.4445319175720215, "rewards/rejected": -2.4450230598449707, "step": 3280 }, { "epoch": 0.8597225857105469, "grad_norm": 29.875, "learning_rate": 1.7635159968795684e-08, "logits/chosen": -2.3862338066101074, "logits/rejected": -2.2259583473205566, "logps/chosen": -1.028977394104004, "logps/rejected": -1.3275554180145264, "loss": 1.508, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.057954788208008, "rewards/margins": 0.5971564054489136, "rewards/rejected": -2.6551108360290527, "step": 3285 }, { "epoch": 0.861031143679665, "grad_norm": 14.3125, "learning_rate": 1.7314132271172815e-08, "logits/chosen": -2.3274483680725098, "logits/rejected": -2.2989087104797363, "logps/chosen": -1.05400812625885, "logps/rejected": -1.3457926511764526, "loss": 1.5315, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1080162525177, "rewards/margins": 0.5835691690444946, "rewards/rejected": -2.6915853023529053, "step": 3290 }, { "epoch": 0.862339701648783, "grad_norm": 13.1875, "learning_rate": 1.6995874395947756e-08, "logits/chosen": -2.439561367034912, "logits/rejected": -2.4360227584838867, "logps/chosen": -1.122040033340454, "logps/rejected": -1.1900089979171753, "loss": 1.7766, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -2.244080066680908, "rewards/margins": 0.1359379142522812, "rewards/rejected": -2.3800179958343506, "step": 3295 }, { "epoch": 0.863648259617901, "grad_norm": 19.125, "learning_rate": 1.668039298676549e-08, "logits/chosen": -2.414977550506592, "logits/rejected": -2.3018033504486084, "logps/chosen": -0.9150127172470093, "logps/rejected": -1.225091814994812, "loss": 1.5732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8300254344940186, "rewards/margins": 0.6201579570770264, "rewards/rejected": -2.450183629989624, "step": 3300 }, { "epoch": 0.8649568175870191, "grad_norm": 42.0, "learning_rate": 1.6367694629312045e-08, "logits/chosen": -2.4268276691436768, "logits/rejected": -2.3997445106506348, "logps/chosen": -1.0066455602645874, "logps/rejected": -1.2108497619628906, "loss": 1.5853, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.013291120529175, "rewards/margins": 0.4084084630012512, "rewards/rejected": -2.4216995239257812, "step": 3305 }, { "epoch": 0.8662653755561371, "grad_norm": 36.5, "learning_rate": 1.6057785851177276e-08, "logits/chosen": -2.4874818325042725, "logits/rejected": -2.3407206535339355, "logps/chosen": -0.9342752695083618, "logps/rejected": -1.2497273683547974, "loss": 1.4768, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8685505390167236, "rewards/margins": 0.630903959274292, "rewards/rejected": -2.4994547367095947, "step": 3310 }, { "epoch": 0.8675739335252551, "grad_norm": 17.375, "learning_rate": 1.5750673121718312e-08, "logits/chosen": -2.498610258102417, "logits/rejected": -2.4314980506896973, "logps/chosen": -1.0238749980926514, "logps/rejected": -1.2633006572723389, "loss": 1.6383, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0477499961853027, "rewards/margins": 0.478850781917572, "rewards/rejected": -2.5266013145446777, "step": 3315 }, { "epoch": 0.8688824914943732, "grad_norm": 12.125, "learning_rate": 1.5446362851924812e-08, "logits/chosen": -2.393065929412842, "logits/rejected": -2.298051357269287, "logps/chosen": -1.0044113397598267, "logps/rejected": -1.2074459791183472, "loss": 1.6047, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0088226795196533, "rewards/margins": 0.4060695767402649, "rewards/rejected": -2.4148919582366943, "step": 3320 }, { "epoch": 0.8701910494634912, "grad_norm": 17.25, "learning_rate": 1.514486139428484e-08, "logits/chosen": -2.499373435974121, "logits/rejected": -2.3543670177459717, "logps/chosen": -1.0492924451828003, "logps/rejected": -1.2613431215286255, "loss": 1.6445, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0985848903656006, "rewards/margins": 0.4241012930870056, "rewards/rejected": -2.522686243057251, "step": 3325 }, { "epoch": 0.8714996074326092, "grad_norm": 14.1875, "learning_rate": 1.4846175042652542e-08, "logits/chosen": -2.348522663116455, "logits/rejected": -2.2787318229675293, "logps/chosen": -1.0590589046478271, "logps/rejected": -1.3663066625595093, "loss": 1.5347, "rewards/accuracies": 0.625, "rewards/chosen": -2.1181178092956543, "rewards/margins": 0.6144956350326538, "rewards/rejected": -2.7326133251190186, "step": 3330 }, { "epoch": 0.8728081654017273, "grad_norm": 13.875, "learning_rate": 1.4550310032116559e-08, "logits/chosen": -2.4727697372436523, "logits/rejected": -2.3022396564483643, "logps/chosen": -1.0268465280532837, "logps/rejected": -1.3071858882904053, "loss": 1.5659, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0536930561065674, "rewards/margins": 0.5606786608695984, "rewards/rejected": -2.6143717765808105, "step": 3335 }, { "epoch": 0.8741167233708453, "grad_norm": 27.625, "learning_rate": 1.4257272538869896e-08, "logits/chosen": -2.416084051132202, "logits/rejected": -2.418163537979126, "logps/chosen": -0.9862356185913086, "logps/rejected": -1.2267391681671143, "loss": 1.5481, "rewards/accuracies": 0.625, "rewards/chosen": -1.9724712371826172, "rewards/margins": 0.48100727796554565, "rewards/rejected": -2.4534783363342285, "step": 3340 }, { "epoch": 0.8754252813399633, "grad_norm": 14.0625, "learning_rate": 1.3967068680081113e-08, "logits/chosen": -2.44897723197937, "logits/rejected": -2.2498748302459717, "logps/chosen": -1.0293354988098145, "logps/rejected": -1.4489762783050537, "loss": 1.4669, "rewards/accuracies": 0.6875, "rewards/chosen": -2.058670997619629, "rewards/margins": 0.8392817378044128, "rewards/rejected": -2.8979525566101074, "step": 3345 }, { "epoch": 0.8767338393090814, "grad_norm": 34.5, "learning_rate": 1.3679704513766427e-08, "logits/chosen": -2.4799509048461914, "logits/rejected": -2.3530070781707764, "logps/chosen": -1.0005347728729248, "logps/rejected": -1.2235629558563232, "loss": 1.6053, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0010695457458496, "rewards/margins": 0.44605618715286255, "rewards/rejected": -2.4471259117126465, "step": 3350 }, { "epoch": 0.8780423972781994, "grad_norm": 4.625, "learning_rate": 1.3395186038663514e-08, "logits/chosen": -2.437450408935547, "logits/rejected": -2.219031572341919, "logps/chosen": -0.9704686999320984, "logps/rejected": -1.4492781162261963, "loss": 1.2614, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9409373998641968, "rewards/margins": 0.9576187133789062, "rewards/rejected": -2.8985562324523926, "step": 3355 }, { "epoch": 0.8793509552473174, "grad_norm": 8.9375, "learning_rate": 1.3113519194106032e-08, "logits/chosen": -2.256155252456665, "logits/rejected": -2.135873556137085, "logps/chosen": -1.0256778001785278, "logps/rejected": -1.3932005167007446, "loss": 1.4513, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0513556003570557, "rewards/margins": 0.7350452542304993, "rewards/rejected": -2.7864010334014893, "step": 3360 }, { "epoch": 0.8806595132164355, "grad_norm": 8.3125, "learning_rate": 1.2834709859899773e-08, "logits/chosen": -2.4943270683288574, "logits/rejected": -2.2311959266662598, "logps/chosen": -0.9842844009399414, "logps/rejected": -1.217644453048706, "loss": 1.6164, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9685688018798828, "rewards/margins": 0.46672019362449646, "rewards/rejected": -2.435288906097412, "step": 3365 }, { "epoch": 0.8819680711855535, "grad_norm": 58.5, "learning_rate": 1.2558763856199944e-08, "logits/chosen": -2.5242977142333984, "logits/rejected": -2.515683650970459, "logps/chosen": -0.9634108543395996, "logps/rejected": -1.104688048362732, "loss": 1.6466, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9268217086791992, "rewards/margins": 0.28255438804626465, "rewards/rejected": -2.209376096725464, "step": 3370 }, { "epoch": 0.8832766291546715, "grad_norm": 10.5, "learning_rate": 1.2285686943389533e-08, "logits/chosen": -2.3281872272491455, "logits/rejected": -2.283799886703491, "logps/chosen": -0.9953163266181946, "logps/rejected": -1.38651704788208, "loss": 1.4953, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9906326532363892, "rewards/margins": 0.7824015021324158, "rewards/rejected": -2.77303409576416, "step": 3375 }, { "epoch": 0.8845851871237895, "grad_norm": 12.6875, "learning_rate": 1.2015484821959238e-08, "logits/chosen": -2.3856160640716553, "logits/rejected": -2.267967462539673, "logps/chosen": -0.9792869687080383, "logps/rejected": -1.386744499206543, "loss": 1.3323, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9585739374160767, "rewards/margins": 0.8149151802062988, "rewards/rejected": -2.773488998413086, "step": 3380 }, { "epoch": 0.8858937450929076, "grad_norm": 6.03125, "learning_rate": 1.1748163132388322e-08, "logits/chosen": -2.446974992752075, "logits/rejected": -2.2024872303009033, "logps/chosen": -0.9644155502319336, "logps/rejected": -1.3742280006408691, "loss": 1.4181, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9288311004638672, "rewards/margins": 0.819625198841095, "rewards/rejected": -2.7484560012817383, "step": 3385 }, { "epoch": 0.8872023030620256, "grad_norm": 26.0, "learning_rate": 1.1483727455026954e-08, "logits/chosen": -2.4318699836730957, "logits/rejected": -2.319911479949951, "logps/chosen": -0.9740043878555298, "logps/rejected": -1.1483094692230225, "loss": 1.6176, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9480087757110596, "rewards/margins": 0.3486101031303406, "rewards/rejected": -2.296618938446045, "step": 3390 }, { "epoch": 0.8885108610311436, "grad_norm": 17.25, "learning_rate": 1.1222183309979655e-08, "logits/chosen": -2.3823249340057373, "logits/rejected": -2.202089786529541, "logps/chosen": -1.0919240713119507, "logps/rejected": -1.363614797592163, "loss": 1.5763, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1838481426239014, "rewards/margins": 0.5433812141418457, "rewards/rejected": -2.727229595184326, "step": 3395 }, { "epoch": 0.8898194190002617, "grad_norm": 13.8125, "learning_rate": 1.0963536156990166e-08, "logits/chosen": -2.4247846603393555, "logits/rejected": -2.440491199493408, "logps/chosen": -0.9965038299560547, "logps/rejected": -1.2360988855361938, "loss": 1.5451, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9930076599121094, "rewards/margins": 0.4791898727416992, "rewards/rejected": -2.4721977710723877, "step": 3400 }, { "epoch": 0.8911279769693797, "grad_norm": 9.4375, "learning_rate": 1.070779139532732e-08, "logits/chosen": -2.3498101234436035, "logits/rejected": -2.308609962463379, "logps/chosen": -0.9538379907608032, "logps/rejected": -1.2538671493530273, "loss": 1.4613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9076759815216064, "rewards/margins": 0.6000581383705139, "rewards/rejected": -2.5077342987060547, "step": 3405 }, { "epoch": 0.8924365349384977, "grad_norm": 19.625, "learning_rate": 1.0454954363672564e-08, "logits/chosen": -2.365246295928955, "logits/rejected": -2.3017075061798096, "logps/chosen": -1.018118977546692, "logps/rejected": -1.1584354639053345, "loss": 1.6949, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.036237955093384, "rewards/margins": 0.2806331515312195, "rewards/rejected": -2.316870927810669, "step": 3410 }, { "epoch": 0.8937450929076158, "grad_norm": 8.125, "learning_rate": 1.020503034000823e-08, "logits/chosen": -2.4209189414978027, "logits/rejected": -2.2629926204681396, "logps/chosen": -0.9881771206855774, "logps/rejected": -1.360541582107544, "loss": 1.4466, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9763542413711548, "rewards/margins": 0.7447290420532227, "rewards/rejected": -2.721083164215088, "step": 3415 }, { "epoch": 0.8950536508767338, "grad_norm": 13.75, "learning_rate": 9.95802454150758e-09, "logits/chosen": -2.333183765411377, "logits/rejected": -2.2646124362945557, "logps/chosen": -1.0076110363006592, "logps/rejected": -1.3419052362442017, "loss": 1.4461, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0152220726013184, "rewards/margins": 0.6685881614685059, "rewards/rejected": -2.6838104724884033, "step": 3420 }, { "epoch": 0.8963622088458518, "grad_norm": 8.75, "learning_rate": 9.713942124425755e-09, "logits/chosen": -2.41074800491333, "logits/rejected": -2.3371498584747314, "logps/chosen": -1.0529429912567139, "logps/rejected": -1.3211615085601807, "loss": 1.6434, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1058859825134277, "rewards/margins": 0.5364372134208679, "rewards/rejected": -2.6423230171203613, "step": 3425 }, { "epoch": 0.8976707668149699, "grad_norm": 31.75, "learning_rate": 9.472788183992308e-09, "logits/chosen": -2.424175262451172, "logits/rejected": -2.2461509704589844, "logps/chosen": -0.8403332829475403, "logps/rejected": -1.2157419919967651, "loss": 1.4487, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6806665658950806, "rewards/margins": 0.7508176565170288, "rewards/rejected": -2.4314839839935303, "step": 3430 }, { "epoch": 0.8989793247840879, "grad_norm": 20.125, "learning_rate": 9.234567754304612e-09, "logits/chosen": -2.5148801803588867, "logits/rejected": -2.343024492263794, "logps/chosen": -0.9926145672798157, "logps/rejected": -1.1564438343048096, "loss": 1.7336, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9852291345596313, "rewards/margins": 0.32765844464302063, "rewards/rejected": -2.312887668609619, "step": 3435 }, { "epoch": 0.9002878827532059, "grad_norm": 24.5, "learning_rate": 8.99928580822299e-09, "logits/chosen": -2.4763379096984863, "logits/rejected": -2.417632818222046, "logps/chosen": -0.9256867170333862, "logps/rejected": -1.1939494609832764, "loss": 1.4925, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8513734340667725, "rewards/margins": 0.5365256071090698, "rewards/rejected": -2.3878989219665527, "step": 3440 }, { "epoch": 0.901596440722324, "grad_norm": 23.75, "learning_rate": 8.766947257266716e-09, "logits/chosen": -2.39778208732605, "logits/rejected": -2.309729814529419, "logps/chosen": -1.1804935932159424, "logps/rejected": -1.4070818424224854, "loss": 1.6839, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.3609871864318848, "rewards/margins": 0.45317649841308594, "rewards/rejected": -2.8141636848449707, "step": 3445 }, { "epoch": 0.902904998691442, "grad_norm": 14.625, "learning_rate": 8.537556951511681e-09, "logits/chosen": -2.5047378540039062, "logits/rejected": -2.3181793689727783, "logps/chosen": -0.9701582789421082, "logps/rejected": -1.3134857416152954, "loss": 1.4255, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9403165578842163, "rewards/margins": 0.6866546869277954, "rewards/rejected": -2.626971483230591, "step": 3450 }, { "epoch": 0.90421355666056, "grad_norm": 23.75, "learning_rate": 8.311119679489025e-09, "logits/chosen": -2.4171652793884277, "logits/rejected": -2.285288095474243, "logps/chosen": -0.9819577932357788, "logps/rejected": -1.3668018579483032, "loss": 1.4821, "rewards/accuracies": 0.625, "rewards/chosen": -1.9639155864715576, "rewards/margins": 0.7696882486343384, "rewards/rejected": -2.7336037158966064, "step": 3455 }, { "epoch": 0.9055221146296781, "grad_norm": 8.0625, "learning_rate": 8.08764016808513e-09, "logits/chosen": -2.408510446548462, "logits/rejected": -2.323523759841919, "logps/chosen": -1.0509381294250488, "logps/rejected": -1.4151966571807861, "loss": 1.4424, "rewards/accuracies": 0.625, "rewards/chosen": -2.1018762588500977, "rewards/margins": 0.7285168766975403, "rewards/rejected": -2.8303933143615723, "step": 3460 }, { "epoch": 0.9068306725987961, "grad_norm": 19.25, "learning_rate": 7.8671230824431e-09, "logits/chosen": -2.2767434120178223, "logits/rejected": -2.2831568717956543, "logps/chosen": -1.0090820789337158, "logps/rejected": -1.2616827487945557, "loss": 1.5505, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0181641578674316, "rewards/margins": 0.5052012801170349, "rewards/rejected": -2.5233654975891113, "step": 3465 }, { "epoch": 0.9081392305679141, "grad_norm": 13.5, "learning_rate": 7.649573025865225e-09, "logits/chosen": -2.5304019451141357, "logits/rejected": -2.3643252849578857, "logps/chosen": -0.876559853553772, "logps/rejected": -1.2027791738510132, "loss": 1.4909, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.753119707107544, "rewards/margins": 0.652438759803772, "rewards/rejected": -2.4055583477020264, "step": 3470 }, { "epoch": 0.9094477885370322, "grad_norm": 21.0, "learning_rate": 7.434994539717021e-09, "logits/chosen": -2.456418991088867, "logits/rejected": -2.3765196800231934, "logps/chosen": -0.9730094075202942, "logps/rejected": -1.1940938234329224, "loss": 1.6235, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9460188150405884, "rewards/margins": 0.4421687722206116, "rewards/rejected": -2.3881876468658447, "step": 3475 }, { "epoch": 0.9107563465061502, "grad_norm": 8.75, "learning_rate": 7.223392103332276e-09, "logits/chosen": -2.410161256790161, "logits/rejected": -2.3350677490234375, "logps/chosen": -0.95721834897995, "logps/rejected": -1.250063180923462, "loss": 1.5501, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9144366979599, "rewards/margins": 0.5856900215148926, "rewards/rejected": -2.500126361846924, "step": 3480 }, { "epoch": 0.9120649044752682, "grad_norm": 10.9375, "learning_rate": 7.014770133919706e-09, "logits/chosen": -2.4439361095428467, "logits/rejected": -2.309663772583008, "logps/chosen": -0.9355000257492065, "logps/rejected": -1.1734426021575928, "loss": 1.5591, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.871000051498413, "rewards/margins": 0.4758850038051605, "rewards/rejected": -2.3468852043151855, "step": 3485 }, { "epoch": 0.9133734624443863, "grad_norm": 57.0, "learning_rate": 6.809132986470678e-09, "logits/chosen": -2.406585216522217, "logits/rejected": -2.372904062271118, "logps/chosen": -1.0035032033920288, "logps/rejected": -1.3342068195343018, "loss": 1.4516, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0070064067840576, "rewards/margins": 0.6614068746566772, "rewards/rejected": -2.6684136390686035, "step": 3490 }, { "epoch": 0.9146820204135043, "grad_norm": 10.25, "learning_rate": 6.606484953668184e-09, "logits/chosen": -2.4528400897979736, "logits/rejected": -2.3940601348876953, "logps/chosen": -0.9719008207321167, "logps/rejected": -1.1836154460906982, "loss": 1.6526, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.9438016414642334, "rewards/margins": 0.42342957854270935, "rewards/rejected": -2.3672308921813965, "step": 3495 }, { "epoch": 0.9159905783826223, "grad_norm": 16.75, "learning_rate": 6.406830265797481e-09, "logits/chosen": -2.4871819019317627, "logits/rejected": -2.3389832973480225, "logps/chosen": -0.9406238794326782, "logps/rejected": -1.0730986595153809, "loss": 1.6759, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8812477588653564, "rewards/margins": 0.2649495601654053, "rewards/rejected": -2.1461973190307617, "step": 3500 }, { "epoch": 0.9172991363517404, "grad_norm": 15.0, "learning_rate": 6.210173090657473e-09, "logits/chosen": -2.3804402351379395, "logits/rejected": -2.3191189765930176, "logps/chosen": -1.0030243396759033, "logps/rejected": -1.2765815258026123, "loss": 1.5408, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0060486793518066, "rewards/margins": 0.5471146106719971, "rewards/rejected": -2.5531630516052246, "step": 3505 }, { "epoch": 0.9186076943208584, "grad_norm": 11.8125, "learning_rate": 6.0165175334740525e-09, "logits/chosen": -2.372527599334717, "logits/rejected": -2.2382028102874756, "logps/chosen": -1.0221322774887085, "logps/rejected": -1.3434547185897827, "loss": 1.5214, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.044264554977417, "rewards/margins": 0.6426447629928589, "rewards/rejected": -2.6869094371795654, "step": 3510 }, { "epoch": 0.9199162522899764, "grad_norm": 29.5, "learning_rate": 5.8258676368140854e-09, "logits/chosen": -2.433445453643799, "logits/rejected": -2.4365949630737305, "logps/chosen": -0.9297979474067688, "logps/rejected": -1.125828742980957, "loss": 1.6118, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.8595958948135376, "rewards/margins": 0.392061710357666, "rewards/rejected": -2.251657485961914, "step": 3515 }, { "epoch": 0.9212248102590945, "grad_norm": 12.0, "learning_rate": 5.638227380501259e-09, "logits/chosen": -2.44991397857666, "logits/rejected": -2.348816394805908, "logps/chosen": -1.0543513298034668, "logps/rejected": -1.3009096384048462, "loss": 1.5878, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.1087026596069336, "rewards/margins": 0.493116557598114, "rewards/rejected": -2.6018192768096924, "step": 3520 }, { "epoch": 0.9225333682282125, "grad_norm": 7.0625, "learning_rate": 5.453600681532816e-09, "logits/chosen": -2.412670612335205, "logits/rejected": -2.3256282806396484, "logps/chosen": -1.012093186378479, "logps/rejected": -1.2213852405548096, "loss": 1.5879, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.024186372756958, "rewards/margins": 0.4185841977596283, "rewards/rejected": -2.442770481109619, "step": 3525 }, { "epoch": 0.9238419261973305, "grad_norm": 13.4375, "learning_rate": 5.271991393997988e-09, "logits/chosen": -2.485053539276123, "logits/rejected": -2.3201475143432617, "logps/chosen": -0.9424951672554016, "logps/rejected": -1.4520604610443115, "loss": 1.3273, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8849903345108032, "rewards/margins": 1.0191303491592407, "rewards/rejected": -2.904120922088623, "step": 3530 }, { "epoch": 0.9251504841664486, "grad_norm": 25.125, "learning_rate": 5.093403308997307e-09, "logits/chosen": -2.425349235534668, "logits/rejected": -2.358520030975342, "logps/chosen": -1.1776845455169678, "logps/rejected": -1.2830383777618408, "loss": 1.7914, "rewards/accuracies": 0.5, "rewards/chosen": -2.3553690910339355, "rewards/margins": 0.21070747077465057, "rewards/rejected": -2.5660767555236816, "step": 3535 }, { "epoch": 0.9264590421355666, "grad_norm": 13.0, "learning_rate": 4.9178401545637235e-09, "logits/chosen": -2.3537168502807617, "logits/rejected": -2.2558584213256836, "logps/chosen": -0.9968541264533997, "logps/rejected": -1.339596152305603, "loss": 1.4663, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9937082529067993, "rewards/margins": 0.6854841113090515, "rewards/rejected": -2.679192304611206, "step": 3540 }, { "epoch": 0.9277676001046846, "grad_norm": 11.8125, "learning_rate": 4.745305595584514e-09, "logits/chosen": -2.365734577178955, "logits/rejected": -2.3138182163238525, "logps/chosen": -0.9019550085067749, "logps/rejected": -1.2526875734329224, "loss": 1.4718, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8039100170135498, "rewards/margins": 0.7014651894569397, "rewards/rejected": -2.5053751468658447, "step": 3545 }, { "epoch": 0.9290761580738026, "grad_norm": 29.625, "learning_rate": 4.575803233725045e-09, "logits/chosen": -2.4139840602874756, "logits/rejected": -2.222573757171631, "logps/chosen": -1.0473335981369019, "logps/rejected": -1.3331353664398193, "loss": 1.5195, "rewards/accuracies": 0.625, "rewards/chosen": -2.0946671962738037, "rewards/margins": 0.571603536605835, "rewards/rejected": -2.6662707328796387, "step": 3550 }, { "epoch": 0.9303847160429207, "grad_norm": 16.5, "learning_rate": 4.409336607353331e-09, "logits/chosen": -2.377103090286255, "logits/rejected": -2.339891195297241, "logps/chosen": -1.0012644529342651, "logps/rejected": -1.1964675188064575, "loss": 1.6623, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0025289058685303, "rewards/margins": 0.39040589332580566, "rewards/rejected": -2.392935037612915, "step": 3555 }, { "epoch": 0.9316932740120387, "grad_norm": 37.5, "learning_rate": 4.245909191466412e-09, "logits/chosen": -2.491673707962036, "logits/rejected": -2.4676220417022705, "logps/chosen": -0.9232865571975708, "logps/rejected": -1.1422159671783447, "loss": 1.5891, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8465731143951416, "rewards/margins": 0.4378587603569031, "rewards/rejected": -2.2844319343566895, "step": 3560 }, { "epoch": 0.9330018319811567, "grad_norm": 6.3125, "learning_rate": 4.085524397617579e-09, "logits/chosen": -2.3168911933898926, "logits/rejected": -2.3578522205352783, "logps/chosen": -1.1015708446502686, "logps/rejected": -1.1920583248138428, "loss": 1.834, "rewards/accuracies": 0.5, "rewards/chosen": -2.203141689300537, "rewards/margins": 0.18097470700740814, "rewards/rejected": -2.3841166496276855, "step": 3565 }, { "epoch": 0.9343103899502748, "grad_norm": 19.375, "learning_rate": 3.928185573845394e-09, "logits/chosen": -2.2751595973968506, "logits/rejected": -2.2079505920410156, "logps/chosen": -0.9761800765991211, "logps/rejected": -1.3677023649215698, "loss": 1.4285, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9523601531982422, "rewards/margins": 0.7830442190170288, "rewards/rejected": -2.7354047298431396, "step": 3570 }, { "epoch": 0.9356189479193928, "grad_norm": 14.125, "learning_rate": 3.7738960046036e-09, "logits/chosen": -2.4739573001861572, "logits/rejected": -2.327948570251465, "logps/chosen": -0.9217453002929688, "logps/rejected": -1.0801275968551636, "loss": 1.6652, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.8434906005859375, "rewards/margins": 0.3167646825313568, "rewards/rejected": -2.160255193710327, "step": 3575 }, { "epoch": 0.9369275058885108, "grad_norm": 12.25, "learning_rate": 3.6226589106926553e-09, "logits/chosen": -2.4707000255584717, "logits/rejected": -2.3323001861572266, "logps/chosen": -0.899354100227356, "logps/rejected": -1.2973957061767578, "loss": 1.4173, "rewards/accuracies": 0.6875, "rewards/chosen": -1.798708200454712, "rewards/margins": 0.7960832118988037, "rewards/rejected": -2.5947914123535156, "step": 3580 }, { "epoch": 0.9382360638576289, "grad_norm": 14.75, "learning_rate": 3.474477449192509e-09, "logits/chosen": -2.3936009407043457, "logits/rejected": -2.3535573482513428, "logps/chosen": -0.9846906661987305, "logps/rejected": -1.1885309219360352, "loss": 1.6389, "rewards/accuracies": 0.625, "rewards/chosen": -1.969381332397461, "rewards/margins": 0.40768080949783325, "rewards/rejected": -2.3770618438720703, "step": 3585 }, { "epoch": 0.9395446218267469, "grad_norm": 17.5, "learning_rate": 3.329354713396648e-09, "logits/chosen": -2.5096306800842285, "logits/rejected": -2.3358612060546875, "logps/chosen": -1.0073914527893066, "logps/rejected": -1.1934107542037964, "loss": 1.64, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0147829055786133, "rewards/margins": 0.37203869223594666, "rewards/rejected": -2.3868215084075928, "step": 3590 }, { "epoch": 0.9408531797958649, "grad_norm": 21.875, "learning_rate": 3.187293732747537e-09, "logits/chosen": -2.457213878631592, "logits/rejected": -2.2936506271362305, "logps/chosen": -1.103711724281311, "logps/rejected": -1.3423250913619995, "loss": 1.6552, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.207423448562622, "rewards/margins": 0.47722673416137695, "rewards/rejected": -2.684650182723999, "step": 3595 }, { "epoch": 0.942161737764983, "grad_norm": 17.875, "learning_rate": 3.0482974727734146e-09, "logits/chosen": -2.3827576637268066, "logits/rejected": -2.235463857650757, "logps/chosen": -0.9707067608833313, "logps/rejected": -1.2195385694503784, "loss": 1.5034, "rewards/accuracies": 0.625, "rewards/chosen": -1.9414135217666626, "rewards/margins": 0.4976634383201599, "rewards/rejected": -2.439077138900757, "step": 3600 }, { "epoch": 0.942161737764983, "eval_logits/chosen": -2.2371788024902344, "eval_logits/rejected": -2.1341538429260254, "eval_logps/chosen": -1.0109565258026123, "eval_logps/rejected": -1.27748441696167, "eval_loss": 1.5545604228973389, "eval_rewards/accuracies": 0.6209999918937683, "eval_rewards/chosen": -2.0219130516052246, "eval_rewards/margins": 0.5330557227134705, "eval_rewards/rejected": -2.55496883392334, "eval_runtime": 424.5697, "eval_samples_per_second": 4.711, "eval_steps_per_second": 1.178, "step": 3600 }, { "epoch": 0.943470295734101, "grad_norm": 6.09375, "learning_rate": 2.9123688350263463e-09, "logits/chosen": -2.4388720989227295, "logits/rejected": -2.3022005558013916, "logps/chosen": -1.0046164989471436, "logps/rejected": -1.21682608127594, "loss": 1.6058, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.009232997894287, "rewards/margins": 0.42441898584365845, "rewards/rejected": -2.43365216255188, "step": 3605 }, { "epoch": 0.944778853703219, "grad_norm": 10.5, "learning_rate": 2.7795106570216887e-09, "logits/chosen": -2.510956048965454, "logits/rejected": -2.285193681716919, "logps/chosen": -1.0223907232284546, "logps/rejected": -1.193016767501831, "loss": 1.6557, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.044781446456909, "rewards/margins": 0.3412519693374634, "rewards/rejected": -2.386033535003662, "step": 3610 }, { "epoch": 0.9460874116723371, "grad_norm": 7.625, "learning_rate": 2.649725712178802e-09, "logits/chosen": -2.32120680809021, "logits/rejected": -2.2553458213806152, "logps/chosen": -1.0607266426086426, "logps/rejected": -1.1830602884292603, "loss": 1.6865, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.121453285217285, "rewards/margins": 0.2446671724319458, "rewards/rejected": -2.3661205768585205, "step": 3615 }, { "epoch": 0.9473959696414551, "grad_norm": 10.5, "learning_rate": 2.523016709763215e-09, "logits/chosen": -2.3948216438293457, "logits/rejected": -2.3552143573760986, "logps/chosen": -0.983300507068634, "logps/rejected": -1.2326714992523193, "loss": 1.5595, "rewards/accuracies": 0.5625, "rewards/chosen": -1.966601014137268, "rewards/margins": 0.4987418055534363, "rewards/rejected": -2.4653429985046387, "step": 3620 }, { "epoch": 0.9487045276105731, "grad_norm": 12.5, "learning_rate": 2.3993862948300525e-09, "logits/chosen": -2.4418578147888184, "logits/rejected": -2.408268451690674, "logps/chosen": -1.0105574131011963, "logps/rejected": -1.1310327053070068, "loss": 1.7627, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0211148262023926, "rewards/margins": 0.24095046520233154, "rewards/rejected": -2.2620654106140137, "step": 3625 }, { "epoch": 0.9500130855796912, "grad_norm": 17.375, "learning_rate": 2.278837048168797e-09, "logits/chosen": -2.3561794757843018, "logits/rejected": -2.377061367034912, "logps/chosen": -1.154947280883789, "logps/rejected": -1.2621784210205078, "loss": 1.7461, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.309894561767578, "rewards/margins": 0.21446199715137482, "rewards/rejected": -2.5243568420410156, "step": 3630 }, { "epoch": 0.9513216435488092, "grad_norm": 15.5, "learning_rate": 2.1613714862494305e-09, "logits/chosen": -2.452681303024292, "logits/rejected": -2.38481068611145, "logps/chosen": -1.0022783279418945, "logps/rejected": -1.210624098777771, "loss": 1.5959, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.004556655883789, "rewards/margins": 0.41669169068336487, "rewards/rejected": -2.421248197555542, "step": 3635 }, { "epoch": 0.9526302015179272, "grad_norm": 14.6875, "learning_rate": 2.0469920611698942e-09, "logits/chosen": -2.363806962966919, "logits/rejected": -2.3070688247680664, "logps/chosen": -1.1168370246887207, "logps/rejected": -1.3793450593948364, "loss": 1.6186, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.2336740493774414, "rewards/margins": 0.5250161290168762, "rewards/rejected": -2.758690118789673, "step": 3640 }, { "epoch": 0.9539387594870453, "grad_norm": 14.0625, "learning_rate": 1.9357011606049135e-09, "logits/chosen": -2.4529051780700684, "logits/rejected": -2.2184505462646484, "logps/chosen": -0.9820153117179871, "logps/rejected": -1.4211007356643677, "loss": 1.378, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9640306234359741, "rewards/margins": 0.8781706094741821, "rewards/rejected": -2.8422014713287354, "step": 3645 }, { "epoch": 0.9552473174561633, "grad_norm": 6.71875, "learning_rate": 1.827501107756202e-09, "logits/chosen": -2.2756030559539795, "logits/rejected": -2.217421293258667, "logps/chosen": -0.9130918383598328, "logps/rejected": -1.3043773174285889, "loss": 1.4002, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8261836767196655, "rewards/margins": 0.7825709581375122, "rewards/rejected": -2.6087546348571777, "step": 3650 }, { "epoch": 0.9565558754252813, "grad_norm": 10.3125, "learning_rate": 1.722394161303803e-09, "logits/chosen": -2.470982551574707, "logits/rejected": -2.3457751274108887, "logps/chosen": -1.0017516613006592, "logps/rejected": -1.2404826879501343, "loss": 1.62, "rewards/accuracies": 0.625, "rewards/chosen": -2.0035033226013184, "rewards/margins": 0.47746211290359497, "rewards/rejected": -2.4809653759002686, "step": 3655 }, { "epoch": 0.9578644333943994, "grad_norm": 11.375, "learning_rate": 1.6203825153591588e-09, "logits/chosen": -2.3340036869049072, "logits/rejected": -2.146888256072998, "logps/chosen": -1.052917718887329, "logps/rejected": -1.271505355834961, "loss": 1.624, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.105835437774658, "rewards/margins": 0.43717536330223083, "rewards/rejected": -2.543010711669922, "step": 3660 }, { "epoch": 0.9591729913635174, "grad_norm": 25.625, "learning_rate": 1.5214682994191318e-09, "logits/chosen": -2.3542654514312744, "logits/rejected": -2.2539801597595215, "logps/chosen": -1.0494226217269897, "logps/rejected": -1.2231982946395874, "loss": 1.6419, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0988452434539795, "rewards/margins": 0.3475513756275177, "rewards/rejected": -2.446396589279175, "step": 3665 }, { "epoch": 0.9604815493326354, "grad_norm": 5.1875, "learning_rate": 1.4256535783216395e-09, "logits/chosen": -2.5291426181793213, "logits/rejected": -2.3978288173675537, "logps/chosen": -1.0803297758102417, "logps/rejected": -1.2409896850585938, "loss": 1.6212, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1606595516204834, "rewards/margins": 0.3213199973106384, "rewards/rejected": -2.4819793701171875, "step": 3670 }, { "epoch": 0.9617901073017535, "grad_norm": 15.5, "learning_rate": 1.3329403522025384e-09, "logits/chosen": -2.4303736686706543, "logits/rejected": -2.3431928157806396, "logps/chosen": -0.96930330991745, "logps/rejected": -1.2241352796554565, "loss": 1.6268, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9386066198349, "rewards/margins": 0.5096637606620789, "rewards/rejected": -2.448270559310913, "step": 3675 }, { "epoch": 0.9630986652708715, "grad_norm": 13.875, "learning_rate": 1.2433305564538588e-09, "logits/chosen": -2.4278552532196045, "logits/rejected": -2.1977553367614746, "logps/chosen": -1.0852134227752686, "logps/rejected": -1.372556209564209, "loss": 1.5512, "rewards/accuracies": 0.5625, "rewards/chosen": -2.170426845550537, "rewards/margins": 0.5746855735778809, "rewards/rejected": -2.745112419128418, "step": 3680 }, { "epoch": 0.9644072232399895, "grad_norm": 10.875, "learning_rate": 1.1568260616833693e-09, "logits/chosen": -2.301744222640991, "logits/rejected": -2.258436918258667, "logps/chosen": -1.031067967414856, "logps/rejected": -1.374632716178894, "loss": 1.5619, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.062135934829712, "rewards/margins": 0.6871297359466553, "rewards/rejected": -2.749265432357788, "step": 3685 }, { "epoch": 0.9657157812091076, "grad_norm": 36.0, "learning_rate": 1.0734286736756248e-09, "logits/chosen": -2.351868152618408, "logits/rejected": -2.225125789642334, "logps/chosen": -0.9740773439407349, "logps/rejected": -1.2568466663360596, "loss": 1.5923, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9481546878814697, "rewards/margins": 0.5655385851860046, "rewards/rejected": -2.513693332672119, "step": 3690 }, { "epoch": 0.9670243391782256, "grad_norm": 12.5625, "learning_rate": 9.931401333541978e-10, "logits/chosen": -2.441795825958252, "logits/rejected": -2.244086265563965, "logps/chosen": -1.0273977518081665, "logps/rejected": -1.244836688041687, "loss": 1.7275, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.054795503616333, "rewards/margins": 0.4348781704902649, "rewards/rejected": -2.489673376083374, "step": 3695 }, { "epoch": 0.9683328971473436, "grad_norm": 16.5, "learning_rate": 9.159621167453225e-10, "logits/chosen": -2.403102397918701, "logits/rejected": -2.2304115295410156, "logps/chosen": -1.0517702102661133, "logps/rejected": -1.50136399269104, "loss": 1.4756, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1035404205322266, "rewards/margins": 0.899187445640564, "rewards/rejected": -3.00272798538208, "step": 3700 }, { "epoch": 0.9696414551164617, "grad_norm": 22.375, "learning_rate": 8.418962349429404e-10, "logits/chosen": -2.394867420196533, "logits/rejected": -2.1992578506469727, "logps/chosen": -1.042255163192749, "logps/rejected": -1.270740270614624, "loss": 1.6159, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.084510326385498, "rewards/margins": 0.4569699168205261, "rewards/rejected": -2.541480541229248, "step": 3705 }, { "epoch": 0.9709500130855797, "grad_norm": 23.875, "learning_rate": 7.709440340750773e-10, "logits/chosen": -2.4447951316833496, "logits/rejected": -2.357909917831421, "logps/chosen": -1.025728702545166, "logps/rejected": -1.4622869491577148, "loss": 1.3704, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.051457405090332, "rewards/margins": 0.873116672039032, "rewards/rejected": -2.9245738983154297, "step": 3710 }, { "epoch": 0.9722585710546977, "grad_norm": 6.53125, "learning_rate": 7.031069952715851e-10, "logits/chosen": -2.475543260574341, "logits/rejected": -2.3271825313568115, "logps/chosen": -0.982906699180603, "logps/rejected": -1.2799656391143799, "loss": 1.4891, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.965813398361206, "rewards/margins": 0.5941178202629089, "rewards/rejected": -2.5599312782287598, "step": 3715 }, { "epoch": 0.9735671290238157, "grad_norm": 7.1875, "learning_rate": 6.383865346331175e-10, "logits/chosen": -2.4627485275268555, "logits/rejected": -2.3551087379455566, "logps/chosen": -0.982761561870575, "logps/rejected": -1.3204834461212158, "loss": 1.4764, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.96552312374115, "rewards/margins": 0.6754439473152161, "rewards/rejected": -2.6409668922424316, "step": 3720 }, { "epoch": 0.9748756869929338, "grad_norm": 20.25, "learning_rate": 5.767840032016858e-10, "logits/chosen": -2.2345879077911377, "logits/rejected": -2.131214141845703, "logps/chosen": -0.8781890869140625, "logps/rejected": -1.1670963764190674, "loss": 1.5571, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.756378173828125, "rewards/margins": 0.5778144598007202, "rewards/rejected": -2.3341927528381348, "step": 3725 }, { "epoch": 0.9761842449620518, "grad_norm": 20.25, "learning_rate": 5.183006869324491e-10, "logits/chosen": -2.4951510429382324, "logits/rejected": -2.326505661010742, "logps/chosen": -0.9524418115615845, "logps/rejected": -1.1507132053375244, "loss": 1.6028, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.904883623123169, "rewards/margins": 0.3965425491333008, "rewards/rejected": -2.301426410675049, "step": 3730 }, { "epoch": 0.9774928029311698, "grad_norm": 13.5, "learning_rate": 4.6293780666676883e-10, "logits/chosen": -2.3604602813720703, "logits/rejected": -2.3586974143981934, "logps/chosen": -0.9290230870246887, "logps/rejected": -1.1986194849014282, "loss": 1.51, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.8580461740493774, "rewards/margins": 0.539192795753479, "rewards/rejected": -2.3972389698028564, "step": 3735 }, { "epoch": 0.9788013609002879, "grad_norm": 32.25, "learning_rate": 4.1069651810682894e-10, "logits/chosen": -2.427234172821045, "logits/rejected": -2.403646230697632, "logps/chosen": -0.9928901791572571, "logps/rejected": -1.1989109516143799, "loss": 1.6478, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9857803583145142, "rewards/margins": 0.41204148530960083, "rewards/rejected": -2.3978219032287598, "step": 3740 }, { "epoch": 0.9801099188694059, "grad_norm": 10.9375, "learning_rate": 3.615779117914386e-10, "logits/chosen": -2.446317672729492, "logits/rejected": -2.265881061553955, "logps/chosen": -1.0042821168899536, "logps/rejected": -1.3443725109100342, "loss": 1.5628, "rewards/accuracies": 0.625, "rewards/chosen": -2.0085642337799072, "rewards/margins": 0.6801807880401611, "rewards/rejected": -2.6887450218200684, "step": 3745 }, { "epoch": 0.9814184768385239, "grad_norm": 11.5, "learning_rate": 3.155830130733672e-10, "logits/chosen": -2.2511234283447266, "logits/rejected": -2.2209434509277344, "logps/chosen": -0.9472674131393433, "logps/rejected": -1.168357491493225, "loss": 1.6458, "rewards/accuracies": 0.5, "rewards/chosen": -1.8945348262786865, "rewards/margins": 0.44217991828918457, "rewards/rejected": -2.33671498298645, "step": 3750 }, { "epoch": 0.982727034807642, "grad_norm": 12.6875, "learning_rate": 2.727127820977782e-10, "logits/chosen": -2.4479992389678955, "logits/rejected": -2.3137974739074707, "logps/chosen": -1.0278701782226562, "logps/rejected": -1.2718907594680786, "loss": 1.5929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0557403564453125, "rewards/margins": 0.4880410134792328, "rewards/rejected": -2.5437815189361572, "step": 3755 }, { "epoch": 0.98403559277676, "grad_norm": 14.75, "learning_rate": 2.3296811378237804e-10, "logits/chosen": -2.440977096557617, "logits/rejected": -2.2399818897247314, "logps/chosen": -0.9181006550788879, "logps/rejected": -1.2932581901550293, "loss": 1.4363, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8362013101577759, "rewards/margins": 0.7503151297569275, "rewards/rejected": -2.5865163803100586, "step": 3760 }, { "epoch": 0.985344150745878, "grad_norm": 15.125, "learning_rate": 1.9634983779853176e-10, "logits/chosen": -2.453641176223755, "logits/rejected": -2.4028515815734863, "logps/chosen": -1.0720411539077759, "logps/rejected": -1.2203114032745361, "loss": 1.6706, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.1440823078155518, "rewards/margins": 0.29654088616371155, "rewards/rejected": -2.4406228065490723, "step": 3765 }, { "epoch": 0.9866527087149961, "grad_norm": 16.0, "learning_rate": 1.6285871855415966e-10, "logits/chosen": -2.35579252243042, "logits/rejected": -2.3463213443756104, "logps/chosen": -1.070924997329712, "logps/rejected": -1.357664704322815, "loss": 1.5531, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.141849994659424, "rewards/margins": 0.5734796524047852, "rewards/rejected": -2.71532940864563, "step": 3770 }, { "epoch": 0.9879612666841141, "grad_norm": 30.75, "learning_rate": 1.3249545517758366e-10, "logits/chosen": -2.355701446533203, "logits/rejected": -2.305257558822632, "logps/chosen": -0.9575854539871216, "logps/rejected": -1.369092583656311, "loss": 1.3936, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9151709079742432, "rewards/margins": 0.8230142593383789, "rewards/rejected": -2.738185167312622, "step": 3775 }, { "epoch": 0.9892698246532321, "grad_norm": 13.875, "learning_rate": 1.0526068150305545e-10, "logits/chosen": -2.358494997024536, "logits/rejected": -2.2890284061431885, "logps/chosen": -1.0211832523345947, "logps/rejected": -1.2550833225250244, "loss": 1.6445, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0423665046691895, "rewards/margins": 0.4678001403808594, "rewards/rejected": -2.510166645050049, "step": 3780 }, { "epoch": 0.9905783826223502, "grad_norm": 17.375, "learning_rate": 8.115496605748396e-11, "logits/chosen": -2.408569812774658, "logits/rejected": -2.294351577758789, "logps/chosen": -0.9710305333137512, "logps/rejected": -1.328296422958374, "loss": 1.4356, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9420610666275024, "rewards/margins": 0.7145318388938904, "rewards/rejected": -2.656592845916748, "step": 3785 }, { "epoch": 0.9918869405914682, "grad_norm": 42.25, "learning_rate": 6.0178812048578e-11, "logits/chosen": -2.527743339538574, "logits/rejected": -2.3799543380737305, "logps/chosen": -0.9607137441635132, "logps/rejected": -1.3316587209701538, "loss": 1.4671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9214274883270264, "rewards/margins": 0.7418899536132812, "rewards/rejected": -2.6633174419403076, "step": 3790 }, { "epoch": 0.9931954985605862, "grad_norm": 7.6875, "learning_rate": 4.233265735432146e-11, "logits/chosen": -2.4929494857788086, "logits/rejected": -2.4012415409088135, "logps/chosen": -1.0290257930755615, "logps/rejected": -1.2989561557769775, "loss": 1.5552, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.058051586151123, "rewards/margins": 0.539860725402832, "rewards/rejected": -2.597912311553955, "step": 3795 }, { "epoch": 0.9945040565297043, "grad_norm": 42.0, "learning_rate": 2.76168745138472e-11, "logits/chosen": -2.3607070446014404, "logits/rejected": -2.2905001640319824, "logps/chosen": -0.9110482335090637, "logps/rejected": -1.171035647392273, "loss": 1.4822, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8220964670181274, "rewards/margins": 0.5199750661849976, "rewards/rejected": -2.342071294784546, "step": 3800 }, { "epoch": 0.9958126144988223, "grad_norm": 15.375, "learning_rate": 1.6031770719693306e-11, "logits/chosen": -2.4658079147338867, "logits/rejected": -2.3413503170013428, "logps/chosen": -1.0280430316925049, "logps/rejected": -1.1959391832351685, "loss": 1.7055, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.0560860633850098, "rewards/margins": 0.3357921838760376, "rewards/rejected": -2.391878366470337, "step": 3805 }, { "epoch": 0.9971211724679403, "grad_norm": 18.75, "learning_rate": 7.577587811291585e-12, "logits/chosen": -2.4558184146881104, "logits/rejected": -2.401414394378662, "logps/chosen": -1.0063097476959229, "logps/rejected": -1.211281180381775, "loss": 1.6163, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0126194953918457, "rewards/margins": 0.4099429249763489, "rewards/rejected": -2.42256236076355, "step": 3810 }, { "epoch": 0.9984297304370584, "grad_norm": 19.0, "learning_rate": 2.254502270054859e-12, "logits/chosen": -2.4101758003234863, "logits/rejected": -2.2630929946899414, "logps/chosen": -1.0286520719528198, "logps/rejected": -1.2913074493408203, "loss": 1.5974, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0573041439056396, "rewards/margins": 0.5253106951713562, "rewards/rejected": -2.5826148986816406, "step": 3815 }, { "epoch": 0.9997382884061764, "grad_norm": 9.5625, "learning_rate": 6.262521557998113e-14, "logits/chosen": -2.40633225440979, "logits/rejected": -2.2127370834350586, "logps/chosen": -1.0145456790924072, "logps/rejected": -1.2355073690414429, "loss": 1.5903, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0290913581848145, "rewards/margins": 0.44192320108413696, "rewards/rejected": -2.4710147380828857, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 4.514, "train_samples_per_second": 13543.57, "train_steps_per_second": 846.487 } ], "logging_steps": 5, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }