{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2458100558659218, "eval_steps": 50, "global_step": 1450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008594757198109154, "grad_norm": 0.05934199318289757, "learning_rate": 4.999451708687114e-06, "logits/chosen": 14.762972831726074, "logits/rejected": 15.199728012084961, "logps/chosen": -0.3259914815425873, "logps/rejected": -0.34297481179237366, "loss": 0.9377, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.4889872074127197, "rewards/margins": 0.02547495998442173, "rewards/rejected": -0.5144621729850769, "step": 10 }, { "epoch": 0.017189514396218308, "grad_norm": 0.06342790275812149, "learning_rate": 4.997807075247147e-06, "logits/chosen": 14.351249694824219, "logits/rejected": 15.068448066711426, "logps/chosen": -0.2809392511844635, "logps/rejected": -0.3711296617984772, "loss": 0.9352, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.42140883207321167, "rewards/margins": 0.1352856159210205, "rewards/rejected": -0.5566944479942322, "step": 20 }, { "epoch": 0.02578427159432746, "grad_norm": 0.053961098194122314, "learning_rate": 4.9950668210706795e-06, "logits/chosen": 14.636960983276367, "logits/rejected": 15.265243530273438, "logps/chosen": -0.2820780873298645, "logps/rejected": -0.34024301171302795, "loss": 0.9351, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.42311716079711914, "rewards/margins": 0.08724743127822876, "rewards/rejected": -0.5103646516799927, "step": 30 }, { "epoch": 0.034379028792436615, "grad_norm": 0.13506193459033966, "learning_rate": 4.9912321481237616e-06, "logits/chosen": 14.4556884765625, "logits/rejected": 15.048967361450195, "logps/chosen": -0.2897028625011444, "logps/rejected": -0.34129124879837036, "loss": 0.922, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.43455424904823303, "rewards/margins": 0.07738252729177475, "rewards/rejected": -0.5119368433952332, "step": 40 }, { "epoch": 0.042973785990545764, "grad_norm": 0.05230574309825897, "learning_rate": 4.986304738420684e-06, "logits/chosen": 14.628789901733398, "logits/rejected": 15.307828903198242, "logps/chosen": -0.28786614537239075, "logps/rejected": -0.3513876795768738, "loss": 0.9201, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.4317992329597473, "rewards/margins": 0.09528233855962753, "rewards/rejected": -0.5270815491676331, "step": 50 }, { "epoch": 0.042973785990545764, "eval_logits/chosen": 14.234943389892578, "eval_logits/rejected": 15.258601188659668, "eval_logps/chosen": -0.2844341993331909, "eval_logps/rejected": -0.3695394694805145, "eval_loss": 0.9226060509681702, "eval_rewards/accuracies": 0.5157894492149353, "eval_rewards/chosen": -0.42665132880210876, "eval_rewards/margins": 0.1276579648256302, "eval_rewards/rejected": -0.5543092489242554, "eval_runtime": 25.9356, "eval_samples_per_second": 29.033, "eval_steps_per_second": 3.663, "step": 50 }, { "epoch": 0.05156854318865492, "grad_norm": 0.09328428655862808, "learning_rate": 4.980286753286196e-06, "logits/chosen": 14.35963249206543, "logits/rejected": 15.055354118347168, "logps/chosen": -0.27534741163253784, "logps/rejected": -0.33098170161247253, "loss": 0.9356, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.4130210876464844, "rewards/margins": 0.08345144242048264, "rewards/rejected": -0.4964725375175476, "step": 60 }, { "epoch": 0.060163300386764075, "grad_norm": 0.06518550217151642, "learning_rate": 4.973180832407471e-06, "logits/chosen": 14.599525451660156, "logits/rejected": 14.825297355651855, "logps/chosen": -0.2708163857460022, "logps/rejected": -0.3305850923061371, "loss": 0.9257, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.4062245786190033, "rewards/margins": 0.08965305984020233, "rewards/rejected": -0.4958776533603668, "step": 70 }, { "epoch": 0.06875805758487323, "grad_norm": 0.07543154805898666, "learning_rate": 4.964990092676263e-06, "logits/chosen": 14.947430610656738, "logits/rejected": 15.093690872192383, "logps/chosen": -0.2602943778038025, "logps/rejected": -0.31820863485336304, "loss": 0.9168, "rewards/accuracies": 0.5, "rewards/chosen": -0.39044153690338135, "rewards/margins": 0.08687138557434082, "rewards/rejected": -0.47731298208236694, "step": 80 }, { "epoch": 0.07735281478298238, "grad_norm": 0.06628195196390152, "learning_rate": 4.9557181268217225e-06, "logits/chosen": 14.43529987335205, "logits/rejected": 14.750699043273926, "logps/chosen": -0.2884291708469391, "logps/rejected": -0.34193652868270874, "loss": 0.9273, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.43264374136924744, "rewards/margins": 0.08026103675365448, "rewards/rejected": -0.5129047632217407, "step": 90 }, { "epoch": 0.08594757198109153, "grad_norm": 0.08684897422790527, "learning_rate": 4.9453690018345144e-06, "logits/chosen": 13.573002815246582, "logits/rejected": 14.441877365112305, "logps/chosen": -0.2569890320301056, "logps/rejected": -0.37049269676208496, "loss": 0.9009, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3854835629463196, "rewards/margins": 0.17025551199913025, "rewards/rejected": -0.5557390451431274, "step": 100 }, { "epoch": 0.08594757198109153, "eval_logits/chosen": 14.026633262634277, "eval_logits/rejected": 15.08835220336914, "eval_logps/chosen": -0.2761566936969757, "eval_logps/rejected": -0.3717801570892334, "eval_loss": 0.9138591885566711, "eval_rewards/accuracies": 0.5368421077728271, "eval_rewards/chosen": -0.41423505544662476, "eval_rewards/margins": 0.1434352546930313, "eval_rewards/rejected": -0.5576702952384949, "eval_runtime": 25.3996, "eval_samples_per_second": 29.646, "eval_steps_per_second": 3.74, "step": 100 }, { "epoch": 0.09454232917920069, "grad_norm": 0.08046824485063553, "learning_rate": 4.933947257182901e-06, "logits/chosen": 14.500630378723145, "logits/rejected": 14.831761360168457, "logps/chosen": -0.30049553513526917, "logps/rejected": -0.3315966725349426, "loss": 0.916, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.45074325799942017, "rewards/margins": 0.04665176197886467, "rewards/rejected": -0.49739497900009155, "step": 110 }, { "epoch": 0.10313708637730984, "grad_norm": 0.12244562804698944, "learning_rate": 4.921457902821578e-06, "logits/chosen": 14.26713752746582, "logits/rejected": 14.495455741882324, "logps/chosen": -0.2670941650867462, "logps/rejected": -0.32481229305267334, "loss": 0.9167, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.4006412625312805, "rewards/margins": 0.08657723665237427, "rewards/rejected": -0.4872184693813324, "step": 120 }, { "epoch": 0.11173184357541899, "grad_norm": 0.1828213334083557, "learning_rate": 4.907906416994146e-06, "logits/chosen": 14.009546279907227, "logits/rejected": 14.297094345092773, "logps/chosen": -0.27995598316192627, "logps/rejected": -0.3530685007572174, "loss": 0.9087, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.419933944940567, "rewards/margins": 0.10966875404119492, "rewards/rejected": -0.5296027660369873, "step": 130 }, { "epoch": 0.12032660077352815, "grad_norm": 0.10407563298940659, "learning_rate": 4.893298743830168e-06, "logits/chosen": 13.689155578613281, "logits/rejected": 14.1933012008667, "logps/chosen": -0.25955715775489807, "logps/rejected": -0.3815004229545593, "loss": 0.9053, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3893357217311859, "rewards/margins": 0.18291489779949188, "rewards/rejected": -0.5722506046295166, "step": 140 }, { "epoch": 0.1289213579716373, "grad_norm": 0.10028588026762009, "learning_rate": 4.8776412907378845e-06, "logits/chosen": 12.851397514343262, "logits/rejected": 13.509778022766113, "logps/chosen": -0.23652991652488708, "logps/rejected": -0.3720462918281555, "loss": 0.8999, "rewards/accuracies": 0.625, "rewards/chosen": -0.3547949194908142, "rewards/margins": 0.2032744586467743, "rewards/rejected": -0.5580693483352661, "step": 150 }, { "epoch": 0.1289213579716373, "eval_logits/chosen": 12.384929656982422, "eval_logits/rejected": 13.672826766967773, "eval_logps/chosen": -0.27857670187950134, "eval_logps/rejected": -0.4014737904071808, "eval_loss": 0.8956203460693359, "eval_rewards/accuracies": 0.5684210658073425, "eval_rewards/chosen": -0.4178650677204132, "eval_rewards/margins": 0.18434564769268036, "eval_rewards/rejected": -0.6022107601165771, "eval_runtime": 25.4176, "eval_samples_per_second": 29.625, "eval_steps_per_second": 3.738, "step": 150 }, { "epoch": 0.13751611516974646, "grad_norm": 0.12453093379735947, "learning_rate": 4.860940925593703e-06, "logits/chosen": 12.110003471374512, "logits/rejected": 13.076980590820312, "logps/chosen": -0.27192068099975586, "logps/rejected": -0.3863692879676819, "loss": 0.8907, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4078810214996338, "rewards/margins": 0.1716729700565338, "rewards/rejected": -0.5795539617538452, "step": 160 }, { "epoch": 0.1461108723678556, "grad_norm": 0.17137788236141205, "learning_rate": 4.84320497372973e-06, "logits/chosen": 11.92918586730957, "logits/rejected": 12.573629379272461, "logps/chosen": -0.27472984790802, "logps/rejected": -0.41249385476112366, "loss": 0.8831, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.41209474205970764, "rewards/margins": 0.20664596557617188, "rewards/rejected": -0.6187406778335571, "step": 170 }, { "epoch": 0.15470562956596476, "grad_norm": 0.3904883861541748, "learning_rate": 4.824441214720629e-06, "logits/chosen": 11.182531356811523, "logits/rejected": 12.176573753356934, "logps/chosen": -0.2953718304634094, "logps/rejected": -0.4208717942237854, "loss": 0.8736, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.4430577754974365, "rewards/margins": 0.18824996054172516, "rewards/rejected": -0.6313077211380005, "step": 180 }, { "epoch": 0.1633003867640739, "grad_norm": 0.17574089765548706, "learning_rate": 4.804657878971252e-06, "logits/chosen": 10.119890213012695, "logits/rejected": 11.05900764465332, "logps/chosen": -0.29340866208076477, "logps/rejected": -0.4555762708187103, "loss": 0.884, "rewards/accuracies": 0.625, "rewards/chosen": -0.44011297821998596, "rewards/margins": 0.24325144290924072, "rewards/rejected": -0.6833644509315491, "step": 190 }, { "epoch": 0.17189514396218306, "grad_norm": 0.2242884337902069, "learning_rate": 4.783863644106502e-06, "logits/chosen": 9.674784660339355, "logits/rejected": 10.418611526489258, "logps/chosen": -0.3504490852355957, "logps/rejected": -0.5431731939315796, "loss": 0.8419, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5256736278533936, "rewards/margins": 0.2890861928462982, "rewards/rejected": -0.8147598505020142, "step": 200 }, { "epoch": 0.17189514396218306, "eval_logits/chosen": 7.944870471954346, "eval_logits/rejected": 8.979729652404785, "eval_logps/chosen": -0.33341673016548157, "eval_logps/rejected": -0.5431775450706482, "eval_loss": 0.8462886810302734, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": -0.5001251101493835, "eval_rewards/margins": 0.3146411180496216, "eval_rewards/rejected": -0.8147663474082947, "eval_runtime": 25.419, "eval_samples_per_second": 29.623, "eval_steps_per_second": 3.737, "step": 200 }, { "epoch": 0.18048990116029223, "grad_norm": 0.32119837403297424, "learning_rate": 4.762067631165049e-06, "logits/chosen": 7.16138219833374, "logits/rejected": 8.43680477142334, "logps/chosen": -0.36649250984191895, "logps/rejected": -0.5420924425125122, "loss": 0.8187, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5497387647628784, "rewards/margins": 0.2633998692035675, "rewards/rejected": -0.8131386041641235, "step": 210 }, { "epoch": 0.18908465835840138, "grad_norm": 0.48516562581062317, "learning_rate": 4.7392794005985324e-06, "logits/chosen": 4.770083427429199, "logits/rejected": 5.710458278656006, "logps/chosen": -0.34041497111320496, "logps/rejected": -0.6309320330619812, "loss": 0.8448, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.510622501373291, "rewards/margins": 0.4357755780220032, "rewards/rejected": -0.9463980793952942, "step": 220 }, { "epoch": 0.19767941555651053, "grad_norm": 0.29154208302497864, "learning_rate": 4.715508948078037e-06, "logits/chosen": 5.168765068054199, "logits/rejected": 5.421420574188232, "logps/chosen": -0.3792352080345154, "logps/rejected": -0.65748131275177, "loss": 0.8066, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5688528418540955, "rewards/margins": 0.41736921668052673, "rewards/rejected": -0.986221969127655, "step": 230 }, { "epoch": 0.20627417275461968, "grad_norm": 0.42973750829696655, "learning_rate": 4.690766700109659e-06, "logits/chosen": 4.204717636108398, "logits/rejected": 3.706291913986206, "logps/chosen": -0.39414530992507935, "logps/rejected": -0.7194588780403137, "loss": 0.7787, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5912179350852966, "rewards/margins": 0.4879704415798187, "rewards/rejected": -1.079188346862793, "step": 240 }, { "epoch": 0.21486892995272883, "grad_norm": 0.5244571566581726, "learning_rate": 4.665063509461098e-06, "logits/chosen": 3.335484743118286, "logits/rejected": 3.3176345825195312, "logps/chosen": -0.4493131637573242, "logps/rejected": -0.8293434381484985, "loss": 0.7776, "rewards/accuracies": 0.625, "rewards/chosen": -0.6739697456359863, "rewards/margins": 0.5700454115867615, "rewards/rejected": -1.244015097618103, "step": 250 }, { "epoch": 0.21486892995272883, "eval_logits/chosen": 2.590949058532715, "eval_logits/rejected": 2.2929749488830566, "eval_logps/chosen": -0.48714593052864075, "eval_logps/rejected": -0.9267774224281311, "eval_loss": 0.7469337582588196, "eval_rewards/accuracies": 0.6526315808296204, "eval_rewards/chosen": -0.7307189106941223, "eval_rewards/margins": 0.659447193145752, "eval_rewards/rejected": -1.390166163444519, "eval_runtime": 25.3944, "eval_samples_per_second": 29.652, "eval_steps_per_second": 3.741, "step": 250 }, { "epoch": 0.22346368715083798, "grad_norm": 0.39347293972969055, "learning_rate": 4.638410650401267e-06, "logits/chosen": 2.2975668907165527, "logits/rejected": 1.2855035066604614, "logps/chosen": -0.5228341817855835, "logps/rejected": -1.00227952003479, "loss": 0.6981, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.78425133228302, "rewards/margins": 0.7191681265830994, "rewards/rejected": -1.5034195184707642, "step": 260 }, { "epoch": 0.23205844434894715, "grad_norm": 0.69575434923172, "learning_rate": 4.610819813755038e-06, "logits/chosen": 2.8782780170440674, "logits/rejected": 1.9394336938858032, "logps/chosen": -0.4982885718345642, "logps/rejected": -1.035541296005249, "loss": 0.7174, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7474328875541687, "rewards/margins": 0.8058789372444153, "rewards/rejected": -1.5533119440078735, "step": 270 }, { "epoch": 0.2406532015470563, "grad_norm": 0.7858326435089111, "learning_rate": 4.582303101775249e-06, "logits/chosen": 2.710908889770508, "logits/rejected": 1.6444288492202759, "logps/chosen": -0.600068211555481, "logps/rejected": -1.1271780729293823, "loss": 0.6972, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9001023173332214, "rewards/margins": 0.7906648516654968, "rewards/rejected": -1.6907672882080078, "step": 280 }, { "epoch": 0.24924795874516545, "grad_norm": 0.7384620904922485, "learning_rate": 4.55287302283426e-06, "logits/chosen": 1.5841500759124756, "logits/rejected": 0.640514612197876, "logps/chosen": -0.6465060710906982, "logps/rejected": -1.4245095252990723, "loss": 0.6192, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9697591066360474, "rewards/margins": 1.1670053005218506, "rewards/rejected": -2.1367642879486084, "step": 290 }, { "epoch": 0.2578427159432746, "grad_norm": 0.8262321352958679, "learning_rate": 4.522542485937369e-06, "logits/chosen": 1.7300422191619873, "logits/rejected": 0.7782856225967407, "logps/chosen": -0.7083590626716614, "logps/rejected": -1.6742557287216187, "loss": 0.5721, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.062538504600525, "rewards/margins": 1.4488452672958374, "rewards/rejected": -2.511383533477783, "step": 300 }, { "epoch": 0.2578427159432746, "eval_logits/chosen": 1.3559931516647339, "eval_logits/rejected": 0.6592276096343994, "eval_logps/chosen": -0.7815767526626587, "eval_logps/rejected": -2.1176154613494873, "eval_loss": 0.5730626583099365, "eval_rewards/accuracies": 0.7052631378173828, "eval_rewards/chosen": -1.1723653078079224, "eval_rewards/margins": 2.0040581226348877, "eval_rewards/rejected": -3.1764233112335205, "eval_runtime": 25.539, "eval_samples_per_second": 29.484, "eval_steps_per_second": 3.72, "step": 300 }, { "epoch": 0.2664374731413838, "grad_norm": 0.8472572565078735, "learning_rate": 4.491324795060491e-06, "logits/chosen": 1.4461088180541992, "logits/rejected": 0.49669915437698364, "logps/chosen": -0.7694377899169922, "logps/rejected": -2.362783432006836, "loss": 0.5091, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1541565656661987, "rewards/margins": 2.390018939971924, "rewards/rejected": -3.544174909591675, "step": 310 }, { "epoch": 0.2750322303394929, "grad_norm": 0.41847530007362366, "learning_rate": 4.4592336433146e-06, "logits/chosen": 2.172646999359131, "logits/rejected": 1.0526962280273438, "logps/chosen": -0.7410945296287537, "logps/rejected": -1.9158353805541992, "loss": 0.5352, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1116416454315186, "rewards/margins": 1.7621114253997803, "rewards/rejected": -2.873753070831299, "step": 320 }, { "epoch": 0.28362698753760207, "grad_norm": 1.7422096729278564, "learning_rate": 4.426283106939474e-06, "logits/chosen": 2.611234188079834, "logits/rejected": 1.7068111896514893, "logps/chosen": -0.8319486379623413, "logps/rejected": -2.32024884223938, "loss": 0.5397, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2479230165481567, "rewards/margins": 2.232450008392334, "rewards/rejected": -3.480372905731201, "step": 330 }, { "epoch": 0.2922217447357112, "grad_norm": 0.8699240684509277, "learning_rate": 4.3924876391293915e-06, "logits/chosen": 1.996747612953186, "logits/rejected": 1.1473515033721924, "logps/chosen": -0.8445833921432495, "logps/rejected": -2.675687551498413, "loss": 0.4817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2668750286102295, "rewards/margins": 2.7466559410095215, "rewards/rejected": -4.01353120803833, "step": 340 }, { "epoch": 0.30081650193382037, "grad_norm": 2.089289426803589, "learning_rate": 4.357862063693486e-06, "logits/chosen": 1.7134803533554077, "logits/rejected": 1.3000510931015015, "logps/chosen": -0.8976927995681763, "logps/rejected": -2.1593873500823975, "loss": 0.5098, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3465392589569092, "rewards/margins": 1.8925418853759766, "rewards/rejected": -3.2390809059143066, "step": 350 }, { "epoch": 0.30081650193382037, "eval_logits/chosen": 1.6772903203964233, "eval_logits/rejected": 1.2370609045028687, "eval_logps/chosen": -0.9737761616706848, "eval_logps/rejected": -3.1528680324554443, "eval_loss": 0.5162621736526489, "eval_rewards/accuracies": 0.7263157963752747, "eval_rewards/chosen": -1.46066415309906, "eval_rewards/margins": 3.2686376571655273, "eval_rewards/rejected": -4.729301929473877, "eval_runtime": 25.4163, "eval_samples_per_second": 29.627, "eval_steps_per_second": 3.738, "step": 350 }, { "epoch": 0.3094112591319295, "grad_norm": 0.47079572081565857, "learning_rate": 4.322421568553529e-06, "logits/chosen": 1.9561872482299805, "logits/rejected": 0.8960329294204712, "logps/chosen": -0.9378088712692261, "logps/rejected": -2.8065876960754395, "loss": 0.5046, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4067132472991943, "rewards/margins": 2.8031680583953857, "rewards/rejected": -4.209881782531738, "step": 360 }, { "epoch": 0.31800601633003867, "grad_norm": 0.6202365159988403, "learning_rate": 4.286181699082008e-06, "logits/chosen": 2.152726411819458, "logits/rejected": 1.4309433698654175, "logps/chosen": -1.007157564163208, "logps/rejected": -3.3813462257385254, "loss": 0.4526, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5107364654541016, "rewards/margins": 3.561283588409424, "rewards/rejected": -5.072019577026367, "step": 370 }, { "epoch": 0.3266007735281478, "grad_norm": 1.080393671989441, "learning_rate": 4.249158351283414e-06, "logits/chosen": 1.7528371810913086, "logits/rejected": 1.3293968439102173, "logps/chosen": -1.0258004665374756, "logps/rejected": -2.984057903289795, "loss": 0.4879, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5387006998062134, "rewards/margins": 2.9373860359191895, "rewards/rejected": -4.476086616516113, "step": 380 }, { "epoch": 0.33519553072625696, "grad_norm": 1.4520032405853271, "learning_rate": 4.211367764821722e-06, "logits/chosen": 3.061373233795166, "logits/rejected": 2.0103466510772705, "logps/chosen": -1.0191391706466675, "logps/rejected": -2.9054081439971924, "loss": 0.4776, "rewards/accuracies": 0.625, "rewards/chosen": -1.5287089347839355, "rewards/margins": 2.8294031620025635, "rewards/rejected": -4.358112335205078, "step": 390 }, { "epoch": 0.3437902879243661, "grad_norm": 0.5479139089584351, "learning_rate": 4.172826515897146e-06, "logits/chosen": 2.8395092487335205, "logits/rejected": 2.0935282707214355, "logps/chosen": -1.0769506692886353, "logps/rejected": -3.11635160446167, "loss": 0.4686, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6154258251190186, "rewards/margins": 3.0591015815734863, "rewards/rejected": -4.674527168273926, "step": 400 }, { "epoch": 0.3437902879243661, "eval_logits/chosen": 2.5064592361450195, "eval_logits/rejected": 2.108433485031128, "eval_logps/chosen": -1.1957285404205322, "eval_logps/rejected": -3.7678382396698, "eval_loss": 0.46578800678253174, "eval_rewards/accuracies": 0.7368420958518982, "eval_rewards/chosen": -1.793592929840088, "eval_rewards/margins": 3.8581647872924805, "eval_rewards/rejected": -5.651757717132568, "eval_runtime": 25.415, "eval_samples_per_second": 29.628, "eval_steps_per_second": 3.738, "step": 400 }, { "epoch": 0.3523850451224753, "grad_norm": 0.9966821670532227, "learning_rate": 4.133551509975264e-06, "logits/chosen": 2.6411917209625244, "logits/rejected": 1.8634885549545288, "logps/chosen": -1.0934125185012817, "logps/rejected": -3.2207794189453125, "loss": 0.4335, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6401188373565674, "rewards/margins": 3.1910502910614014, "rewards/rejected": -4.831169128417969, "step": 410 }, { "epoch": 0.36097980232058446, "grad_norm": 0.6384722590446472, "learning_rate": 4.093559974371725e-06, "logits/chosen": 3.1368844509124756, "logits/rejected": 2.3800251483917236, "logps/chosen": -1.2108217477798462, "logps/rejected": -3.484806537628174, "loss": 0.4543, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.816232681274414, "rewards/margins": 3.4109771251678467, "rewards/rejected": -5.227209568023682, "step": 420 }, { "epoch": 0.3695745595186936, "grad_norm": 0.856741726398468, "learning_rate": 4.052869450695776e-06, "logits/chosen": 3.155728816986084, "logits/rejected": 2.257838726043701, "logps/chosen": -1.4214586019515991, "logps/rejected": -4.186622619628906, "loss": 0.4091, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.132187604904175, "rewards/margins": 4.1477460861206055, "rewards/rejected": -6.279933929443359, "step": 430 }, { "epoch": 0.37816931671680276, "grad_norm": 1.3310774564743042, "learning_rate": 4.011497787155938e-06, "logits/chosen": 1.9942185878753662, "logits/rejected": 1.6246827840805054, "logps/chosen": -1.8575637340545654, "logps/rejected": -4.5355329513549805, "loss": 0.3995, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.7863457202911377, "rewards/margins": 4.016953945159912, "rewards/rejected": -6.8032989501953125, "step": 440 }, { "epoch": 0.3867640739149119, "grad_norm": 2.0849101543426514, "learning_rate": 3.969463130731183e-06, "logits/chosen": 2.406555652618408, "logits/rejected": 2.0490009784698486, "logps/chosen": -2.392570972442627, "logps/rejected": -5.055584907531738, "loss": 0.3671, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.588855743408203, "rewards/margins": 3.994520902633667, "rewards/rejected": -7.583376884460449, "step": 450 }, { "epoch": 0.3867640739149119, "eval_logits/chosen": 2.2324020862579346, "eval_logits/rejected": 2.365755319595337, "eval_logps/chosen": -2.736898422241211, "eval_logps/rejected": -5.73967170715332, "eval_loss": 0.3965117633342743, "eval_rewards/accuracies": 0.8736842274665833, "eval_rewards/chosen": -4.105347633361816, "eval_rewards/margins": 4.504159927368164, "eval_rewards/rejected": -8.60950756072998, "eval_runtime": 25.428, "eval_samples_per_second": 29.613, "eval_steps_per_second": 3.736, "step": 450 }, { "epoch": 0.39535883111302106, "grad_norm": 2.223949432373047, "learning_rate": 3.92678391921108e-06, "logits/chosen": 2.651564598083496, "logits/rejected": 2.383842945098877, "logps/chosen": -2.591308355331421, "logps/rejected": -5.308972358703613, "loss": 0.3412, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.886962413787842, "rewards/margins": 4.07649564743042, "rewards/rejected": -7.963458061218262, "step": 460 }, { "epoch": 0.4039535883111302, "grad_norm": 3.110624074935913, "learning_rate": 3.88347887310836e-06, "logits/chosen": 2.5435309410095215, "logits/rejected": 2.46763277053833, "logps/chosen": -2.413583993911743, "logps/rejected": -5.543262481689453, "loss": 0.3832, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.620375871658325, "rewards/margins": 4.694517135620117, "rewards/rejected": -8.314892768859863, "step": 470 }, { "epoch": 0.41254834550923936, "grad_norm": 1.6255794763565063, "learning_rate": 3.839566987447492e-06, "logits/chosen": 3.842928409576416, "logits/rejected": 3.5797982215881348, "logps/chosen": -2.6448044776916504, "logps/rejected": -4.98160982131958, "loss": 0.3547, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9672069549560547, "rewards/margins": 3.5052082538604736, "rewards/rejected": -7.472414493560791, "step": 480 }, { "epoch": 0.4211431027073485, "grad_norm": 2.9274284839630127, "learning_rate": 3.795067523432826e-06, "logits/chosen": 3.3297150135040283, "logits/rejected": 3.0205535888671875, "logps/chosen": -2.811923027038574, "logps/rejected": -6.040881156921387, "loss": 0.3097, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.217884063720703, "rewards/margins": 4.843437194824219, "rewards/rejected": -9.061322212219238, "step": 490 }, { "epoch": 0.42973785990545765, "grad_norm": 2.9143636226654053, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 2.760014772415161, "logits/rejected": 2.535520315170288, "logps/chosen": -3.068406820297241, "logps/rejected": -5.877435684204102, "loss": 0.3031, "rewards/accuracies": 0.875, "rewards/chosen": -4.602609634399414, "rewards/margins": 4.21354341506958, "rewards/rejected": -8.816153526306152, "step": 500 }, { "epoch": 0.42973785990545765, "eval_logits/chosen": 2.0952131748199463, "eval_logits/rejected": 2.1864659786224365, "eval_logps/chosen": -3.392296075820923, "eval_logps/rejected": -6.948195457458496, "eval_loss": 0.33660775423049927, "eval_rewards/accuracies": 0.9263157844543457, "eval_rewards/chosen": -5.088444232940674, "eval_rewards/margins": 5.3338494300842285, "eval_rewards/rejected": -10.422293663024902, "eval_runtime": 25.4226, "eval_samples_per_second": 29.619, "eval_steps_per_second": 3.737, "step": 500 }, { "epoch": 0.4383326171035668, "grad_norm": 2.563810348510742, "learning_rate": 3.7043841852542884e-06, "logits/chosen": 2.950286388397217, "logits/rejected": 2.619025945663452, "logps/chosen": -3.237391710281372, "logps/rejected": -5.953216552734375, "loss": 0.318, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.856087684631348, "rewards/margins": 4.073737144470215, "rewards/rejected": -8.929824829101562, "step": 510 }, { "epoch": 0.44692737430167595, "grad_norm": 2.0339434146881104, "learning_rate": 3.658240087799655e-06, "logits/chosen": 2.987595558166504, "logits/rejected": 2.6243975162506104, "logps/chosen": -3.5633530616760254, "logps/rejected": -7.0458879470825195, "loss": 0.3053, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.345029354095459, "rewards/margins": 5.223802089691162, "rewards/rejected": -10.568831443786621, "step": 520 }, { "epoch": 0.45552213149978515, "grad_norm": 4.091029644012451, "learning_rate": 3.611587947962319e-06, "logits/chosen": 2.297576904296875, "logits/rejected": 2.0218777656555176, "logps/chosen": -3.297245502471924, "logps/rejected": -6.101919651031494, "loss": 0.3255, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.945868015289307, "rewards/margins": 4.207010746002197, "rewards/rejected": -9.152878761291504, "step": 530 }, { "epoch": 0.4641168886978943, "grad_norm": 2.7896900177001953, "learning_rate": 3.564448228912682e-06, "logits/chosen": 2.103950023651123, "logits/rejected": 1.9478647708892822, "logps/chosen": -2.9360263347625732, "logps/rejected": -6.406435489654541, "loss": 0.3361, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.40403938293457, "rewards/margins": 5.20561408996582, "rewards/rejected": -9.60965347290039, "step": 540 }, { "epoch": 0.47271164589600345, "grad_norm": 2.657970905303955, "learning_rate": 3.516841607689501e-06, "logits/chosen": 2.1658639907836914, "logits/rejected": 2.214900493621826, "logps/chosen": -3.084073066711426, "logps/rejected": -6.935500144958496, "loss": 0.2928, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.626110076904297, "rewards/margins": 5.7771406173706055, "rewards/rejected": -10.403249740600586, "step": 550 }, { "epoch": 0.47271164589600345, "eval_logits/chosen": 2.285294771194458, "eval_logits/rejected": 2.3312103748321533, "eval_logps/chosen": -3.35794997215271, "eval_logps/rejected": -7.37537145614624, "eval_loss": 0.3121817409992218, "eval_rewards/accuracies": 0.9263157844543457, "eval_rewards/chosen": -5.036925792694092, "eval_rewards/margins": 6.026132106781006, "eval_rewards/rejected": -11.063057899475098, "eval_runtime": 25.4015, "eval_samples_per_second": 29.644, "eval_steps_per_second": 3.74, "step": 550 }, { "epoch": 0.4813064030941126, "grad_norm": 2.940019369125366, "learning_rate": 3.4687889661302577e-06, "logits/chosen": 1.9122416973114014, "logits/rejected": 1.9943454265594482, "logps/chosen": -3.27177095413208, "logps/rejected": -7.023342132568359, "loss": 0.3105, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.907656669616699, "rewards/margins": 5.6273579597473145, "rewards/rejected": -10.535014152526855, "step": 560 }, { "epoch": 0.48990116029222175, "grad_norm": 1.8887412548065186, "learning_rate": 3.4203113817116955e-06, "logits/chosen": 2.274843692779541, "logits/rejected": 2.392199993133545, "logps/chosen": -3.383749008178711, "logps/rejected": -7.265415191650391, "loss": 0.3003, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.075623512268066, "rewards/margins": 5.8224992752075195, "rewards/rejected": -10.898123741149902, "step": 570 }, { "epoch": 0.4984959174903309, "grad_norm": 1.6364414691925049, "learning_rate": 3.3714301183045382e-06, "logits/chosen": 2.423910617828369, "logits/rejected": 2.244985818862915, "logps/chosen": -3.0959205627441406, "logps/rejected": -6.822405815124512, "loss": 0.2471, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.643880844116211, "rewards/margins": 5.58972692489624, "rewards/rejected": -10.233609199523926, "step": 580 }, { "epoch": 0.50709067468844, "grad_norm": 2.6540188789367676, "learning_rate": 3.3221666168464584e-06, "logits/chosen": 2.8146812915802, "logits/rejected": 2.5971922874450684, "logps/chosen": -4.139407157897949, "logps/rejected": -7.71649694442749, "loss": 0.2809, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.209111213684082, "rewards/margins": 5.365634441375732, "rewards/rejected": -11.574746131896973, "step": 590 }, { "epoch": 0.5156854318865493, "grad_norm": 4.229885578155518, "learning_rate": 3.272542485937369e-06, "logits/chosen": 2.2735249996185303, "logits/rejected": 1.8577899932861328, "logps/chosen": -3.731342315673828, "logps/rejected": -7.2900390625, "loss": 0.2956, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.5970139503479, "rewards/margins": 5.338044166564941, "rewards/rejected": -10.93505859375, "step": 600 }, { "epoch": 0.5156854318865493, "eval_logits/chosen": 2.3333992958068848, "eval_logits/rejected": 2.529745578765869, "eval_logps/chosen": -3.679597854614258, "eval_logps/rejected": -7.917842864990234, "eval_loss": 0.3030374050140381, "eval_rewards/accuracies": 0.9263157844543457, "eval_rewards/chosen": -5.519396781921387, "eval_rewards/margins": 6.357367992401123, "eval_rewards/rejected": -11.876765251159668, "eval_runtime": 25.5622, "eval_samples_per_second": 29.458, "eval_steps_per_second": 3.716, "step": 600 }, { "epoch": 0.5242801890846583, "grad_norm": 2.657008647918701, "learning_rate": 3.222579492361179e-06, "logits/chosen": 2.699007034301758, "logits/rejected": 2.731860876083374, "logps/chosen": -3.3311946392059326, "logps/rejected": -7.005735874176025, "loss": 0.2898, "rewards/accuracies": 0.9375, "rewards/chosen": -4.996791839599609, "rewards/margins": 5.511812686920166, "rewards/rejected": -10.508604049682617, "step": 610 }, { "epoch": 0.5328749462827675, "grad_norm": 3.046638250350952, "learning_rate": 3.1722995515381644e-06, "logits/chosen": 2.7617671489715576, "logits/rejected": 2.7338194847106934, "logps/chosen": -3.336381435394287, "logps/rejected": -7.058961391448975, "loss": 0.2895, "rewards/accuracies": 0.9375, "rewards/chosen": -5.004572868347168, "rewards/margins": 5.583868980407715, "rewards/rejected": -10.588441848754883, "step": 620 }, { "epoch": 0.5414697034808766, "grad_norm": 2.342069387435913, "learning_rate": 3.121724717912138e-06, "logits/chosen": 2.5818216800689697, "logits/rejected": 1.987378716468811, "logps/chosen": -3.0970518589019775, "logps/rejected": -6.240235805511475, "loss": 0.2634, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.645577430725098, "rewards/margins": 4.714776039123535, "rewards/rejected": -9.36035442352295, "step": 630 }, { "epoch": 0.5500644606789858, "grad_norm": 1.9333513975143433, "learning_rate": 3.0708771752766397e-06, "logits/chosen": 2.911674737930298, "logits/rejected": 2.7606472969055176, "logps/chosen": -3.2809441089630127, "logps/rejected": -7.210829257965088, "loss": 0.2594, "rewards/accuracies": 0.9375, "rewards/chosen": -4.921416282653809, "rewards/margins": 5.894827365875244, "rewards/rejected": -10.816244125366211, "step": 640 }, { "epoch": 0.5586592178770949, "grad_norm": 5.659445285797119, "learning_rate": 3.019779227044398e-06, "logits/chosen": 2.4733409881591797, "logits/rejected": 2.102668285369873, "logps/chosen": -3.4448726177215576, "logps/rejected": -7.304962158203125, "loss": 0.2399, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.167309284210205, "rewards/margins": 5.790134429931641, "rewards/rejected": -10.957443237304688, "step": 650 }, { "epoch": 0.5586592178770949, "eval_logits/chosen": 2.482032537460327, "eval_logits/rejected": 2.66147780418396, "eval_logps/chosen": -3.728013515472412, "eval_logps/rejected": -8.231985092163086, "eval_loss": 0.2814938426017761, "eval_rewards/accuracies": 0.9263157844543457, "eval_rewards/chosen": -5.592020511627197, "eval_rewards/margins": 6.75595760345459, "eval_rewards/rejected": -12.347977638244629, "eval_runtime": 25.4252, "eval_samples_per_second": 29.616, "eval_steps_per_second": 3.736, "step": 650 }, { "epoch": 0.5672539750752041, "grad_norm": 2.189638137817383, "learning_rate": 2.9684532864643123e-06, "logits/chosen": 2.875077962875366, "logits/rejected": 2.712646484375, "logps/chosen": -3.757338762283325, "logps/rejected": -6.6974897384643555, "loss": 0.2759, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.636007785797119, "rewards/margins": 4.410226821899414, "rewards/rejected": -10.046236038208008, "step": 660 }, { "epoch": 0.5758487322733132, "grad_norm": 3.5755774974823, "learning_rate": 2.9169218667902562e-06, "logits/chosen": 2.9562981128692627, "logits/rejected": 2.7660539150238037, "logps/chosen": -3.2358715534210205, "logps/rejected": -6.90399169921875, "loss": 0.2586, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.853806972503662, "rewards/margins": 5.502181053161621, "rewards/rejected": -10.355987548828125, "step": 670 }, { "epoch": 0.5844434894714224, "grad_norm": 2.5616958141326904, "learning_rate": 2.8652075714060296e-06, "logits/chosen": 2.5067126750946045, "logits/rejected": 2.3888354301452637, "logps/chosen": -3.462563991546631, "logps/rejected": -6.964964866638184, "loss": 0.251, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.193846225738525, "rewards/margins": 5.253602027893066, "rewards/rejected": -10.447446823120117, "step": 680 }, { "epoch": 0.5930382466695315, "grad_norm": 2.964050531387329, "learning_rate": 2.813333083910761e-06, "logits/chosen": 2.659935474395752, "logits/rejected": 2.6573758125305176, "logps/chosen": -3.9107768535614014, "logps/rejected": -7.865903377532959, "loss": 0.2294, "rewards/accuracies": 0.9375, "rewards/chosen": -5.866166114807129, "rewards/margins": 5.9326887130737305, "rewards/rejected": -11.79885482788086, "step": 690 }, { "epoch": 0.6016330038676407, "grad_norm": 4.389697551727295, "learning_rate": 2.761321158169134e-06, "logits/chosen": 2.217245578765869, "logits/rejected": 2.421597957611084, "logps/chosen": -4.029661655426025, "logps/rejected": -8.073125839233398, "loss": 0.2469, "rewards/accuracies": 0.9375, "rewards/chosen": -6.044493675231934, "rewards/margins": 6.065195083618164, "rewards/rejected": -12.109688758850098, "step": 700 }, { "epoch": 0.6016330038676407, "eval_logits/chosen": 2.0770955085754395, "eval_logits/rejected": 2.3815462589263916, "eval_logps/chosen": -3.924149751663208, "eval_logps/rejected": -8.844257354736328, "eval_loss": 0.2584603726863861, "eval_rewards/accuracies": 0.9263157844543457, "eval_rewards/chosen": -5.886224746704102, "eval_rewards/margins": 7.380159854888916, "eval_rewards/rejected": -13.26638412475586, "eval_runtime": 25.4228, "eval_samples_per_second": 29.619, "eval_steps_per_second": 3.737, "step": 700 }, { "epoch": 0.6102277610657499, "grad_norm": 3.290154457092285, "learning_rate": 2.70919460833079e-06, "logits/chosen": 2.458578586578369, "logits/rejected": 2.275515079498291, "logps/chosen": -3.2734694480895996, "logps/rejected": -7.873226165771484, "loss": 0.2732, "rewards/accuracies": 0.9375, "rewards/chosen": -4.91020393371582, "rewards/margins": 6.899635314941406, "rewards/rejected": -11.809839248657227, "step": 710 }, { "epoch": 0.618822518263859, "grad_norm": 2.2760908603668213, "learning_rate": 2.6569762988232838e-06, "logits/chosen": 2.6856372356414795, "logits/rejected": 2.722838878631592, "logps/chosen": -3.589418411254883, "logps/rejected": -7.638446807861328, "loss": 0.2583, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.384127140045166, "rewards/margins": 6.073542594909668, "rewards/rejected": -11.457670211791992, "step": 720 }, { "epoch": 0.6274172754619682, "grad_norm": 6.937672138214111, "learning_rate": 2.604689134322999e-06, "logits/chosen": 2.928969383239746, "logits/rejected": 2.5493836402893066, "logps/chosen": -3.3862743377685547, "logps/rejected": -7.568005561828613, "loss": 0.2889, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.07941198348999, "rewards/margins": 6.27259635925293, "rewards/rejected": -11.352007865905762, "step": 730 }, { "epoch": 0.6360120326600773, "grad_norm": 2.1878838539123535, "learning_rate": 2.5523560497083927e-06, "logits/chosen": 2.3824827671051025, "logits/rejected": 2.257145404815674, "logps/chosen": -3.5448341369628906, "logps/rejected": -7.594444274902344, "loss": 0.1972, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.3172502517700195, "rewards/margins": 6.074415683746338, "rewards/rejected": -11.3916654586792, "step": 740 }, { "epoch": 0.6446067898581865, "grad_norm": 4.405832767486572, "learning_rate": 2.5e-06, "logits/chosen": 3.204157590866089, "logits/rejected": 3.0262837409973145, "logps/chosen": -3.67409086227417, "logps/rejected": -8.078901290893555, "loss": 0.2282, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.511136531829834, "rewards/margins": 6.607214450836182, "rewards/rejected": -12.118351936340332, "step": 750 }, { "epoch": 0.6446067898581865, "eval_logits/chosen": 2.1246254444122314, "eval_logits/rejected": 2.4088852405548096, "eval_logps/chosen": -4.221064567565918, "eval_logps/rejected": -9.4141206741333, "eval_loss": 0.2537557780742645, "eval_rewards/accuracies": 0.9368420839309692, "eval_rewards/chosen": -6.331596374511719, "eval_rewards/margins": 7.789584159851074, "eval_rewards/rejected": -14.121179580688477, "eval_runtime": 25.436, "eval_samples_per_second": 29.604, "eval_steps_per_second": 3.735, "step": 750 }, { "epoch": 0.6532015470562956, "grad_norm": 2.8693907260894775, "learning_rate": 2.447643950291608e-06, "logits/chosen": 2.5033986568450928, "logits/rejected": 2.2746779918670654, "logps/chosen": -4.256644248962402, "logps/rejected": -8.564817428588867, "loss": 0.2337, "rewards/accuracies": 0.9375, "rewards/chosen": -6.3849663734436035, "rewards/margins": 6.462259769439697, "rewards/rejected": -12.8472261428833, "step": 760 }, { "epoch": 0.6617963042544048, "grad_norm": 4.912906646728516, "learning_rate": 2.3953108656770018e-06, "logits/chosen": 2.861431837081909, "logits/rejected": 2.974611759185791, "logps/chosen": -3.9564735889434814, "logps/rejected": -7.863286018371582, "loss": 0.2585, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.934710502624512, "rewards/margins": 5.860217571258545, "rewards/rejected": -11.794927597045898, "step": 770 }, { "epoch": 0.6703910614525139, "grad_norm": 3.215716600418091, "learning_rate": 2.3430237011767166e-06, "logits/chosen": 1.9008615016937256, "logits/rejected": 1.9049352407455444, "logps/chosen": -4.304060935974121, "logps/rejected": -8.806629180908203, "loss": 0.2279, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.456091403961182, "rewards/margins": 6.753852844238281, "rewards/rejected": -13.209943771362305, "step": 780 }, { "epoch": 0.6789858186506231, "grad_norm": 3.8724021911621094, "learning_rate": 2.290805391669212e-06, "logits/chosen": 2.2521636486053467, "logits/rejected": 2.2159788608551025, "logps/chosen": -4.012774467468262, "logps/rejected": -8.53366470336914, "loss": 0.2437, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.019161701202393, "rewards/margins": 6.78133487701416, "rewards/rejected": -12.800497055053711, "step": 790 }, { "epoch": 0.6875805758487322, "grad_norm": 3.56345796585083, "learning_rate": 2.238678841830867e-06, "logits/chosen": 2.0579304695129395, "logits/rejected": 2.304316997528076, "logps/chosen": -3.590430736541748, "logps/rejected": -8.182169914245605, "loss": 0.213, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.385646820068359, "rewards/margins": 6.887608528137207, "rewards/rejected": -12.27325439453125, "step": 800 }, { "epoch": 0.6875805758487322, "eval_logits/chosen": 2.228646755218506, "eval_logits/rejected": 2.444817543029785, "eval_logps/chosen": -3.8403449058532715, "eval_logps/rejected": -9.179658889770508, "eval_loss": 0.23895224928855896, "eval_rewards/accuracies": 0.9368420839309692, "eval_rewards/chosen": -5.76051664352417, "eval_rewards/margins": 8.00897216796875, "eval_rewards/rejected": -13.769490242004395, "eval_runtime": 25.3925, "eval_samples_per_second": 29.654, "eval_steps_per_second": 3.741, "step": 800 }, { "epoch": 0.6961753330468414, "grad_norm": 3.4880526065826416, "learning_rate": 2.186666916089239e-06, "logits/chosen": 1.7993383407592773, "logits/rejected": 1.754417061805725, "logps/chosen": -4.045234680175781, "logps/rejected": -8.927519798278809, "loss": 0.2391, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.067852020263672, "rewards/margins": 7.323427677154541, "rewards/rejected": -13.391279220581055, "step": 810 }, { "epoch": 0.7047700902449506, "grad_norm": 3.56809139251709, "learning_rate": 2.134792428593971e-06, "logits/chosen": 2.9591994285583496, "logits/rejected": 2.960444211959839, "logps/chosen": -4.150156497955322, "logps/rejected": -8.512441635131836, "loss": 0.1972, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -6.225234031677246, "rewards/margins": 6.54342794418335, "rewards/rejected": -12.768662452697754, "step": 820 }, { "epoch": 0.7133648474430597, "grad_norm": 4.127833843231201, "learning_rate": 2.0830781332097446e-06, "logits/chosen": 3.008269786834717, "logits/rejected": 2.63409686088562, "logps/chosen": -3.8291163444519043, "logps/rejected": -8.657347679138184, "loss": 0.2161, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.743674278259277, "rewards/margins": 7.24234676361084, "rewards/rejected": -12.986021041870117, "step": 830 }, { "epoch": 0.7219596046411689, "grad_norm": 4.475767612457275, "learning_rate": 2.031546713535688e-06, "logits/chosen": 2.7164976596832275, "logits/rejected": 2.5976195335388184, "logps/chosen": -4.153134346008301, "logps/rejected": -8.893486022949219, "loss": 0.1895, "rewards/accuracies": 0.9375, "rewards/chosen": -6.229701519012451, "rewards/margins": 7.110527992248535, "rewards/rejected": -13.340228080749512, "step": 840 }, { "epoch": 0.730554361839278, "grad_norm": 4.190205097198486, "learning_rate": 1.9802207729556023e-06, "logits/chosen": 2.6235451698303223, "logits/rejected": 2.5486202239990234, "logps/chosen": -3.899543046951294, "logps/rejected": -8.277327537536621, "loss": 0.2239, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.8493146896362305, "rewards/margins": 6.566677093505859, "rewards/rejected": -12.415990829467773, "step": 850 }, { "epoch": 0.730554361839278, "eval_logits/chosen": 2.173233985900879, "eval_logits/rejected": 2.433162212371826, "eval_logps/chosen": -4.13487434387207, "eval_logps/rejected": -9.577596664428711, "eval_loss": 0.23591776192188263, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -6.2023115158081055, "eval_rewards/margins": 8.164085388183594, "eval_rewards/rejected": -14.3663969039917, "eval_runtime": 25.4513, "eval_samples_per_second": 29.586, "eval_steps_per_second": 3.733, "step": 850 }, { "epoch": 0.7391491190373872, "grad_norm": 2.6548664569854736, "learning_rate": 1.9291228247233607e-06, "logits/chosen": 1.7737414836883545, "logits/rejected": 2.080662965774536, "logps/chosen": -3.9447720050811768, "logps/rejected": -9.01865005493164, "loss": 0.2268, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.9171576499938965, "rewards/margins": 7.610815525054932, "rewards/rejected": -13.527974128723145, "step": 860 }, { "epoch": 0.7477438762354963, "grad_norm": 2.5912184715270996, "learning_rate": 1.8782752820878636e-06, "logits/chosen": 2.5428760051727295, "logits/rejected": 2.3569278717041016, "logps/chosen": -3.685049057006836, "logps/rejected": -9.194517135620117, "loss": 0.2001, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.527573585510254, "rewards/margins": 8.264203071594238, "rewards/rejected": -13.791775703430176, "step": 870 }, { "epoch": 0.7563386334336055, "grad_norm": 3.789594888687134, "learning_rate": 1.827700448461836e-06, "logits/chosen": 3.139338970184326, "logits/rejected": 3.003114700317383, "logps/chosen": -4.347461700439453, "logps/rejected": -8.560078620910645, "loss": 0.2257, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.521193027496338, "rewards/margins": 6.318924903869629, "rewards/rejected": -12.840118408203125, "step": 880 }, { "epoch": 0.7649333906317146, "grad_norm": 2.3799326419830322, "learning_rate": 1.7774205076388207e-06, "logits/chosen": 3.2622504234313965, "logits/rejected": 2.922945261001587, "logps/chosen": -4.306991100311279, "logps/rejected": -8.622769355773926, "loss": 0.2123, "rewards/accuracies": 0.9375, "rewards/chosen": -6.46048641204834, "rewards/margins": 6.473666191101074, "rewards/rejected": -12.93415355682373, "step": 890 }, { "epoch": 0.7735281478298238, "grad_norm": 3.4133400917053223, "learning_rate": 1.7274575140626318e-06, "logits/chosen": 2.8558292388916016, "logits/rejected": 2.919982433319092, "logps/chosen": -3.791405200958252, "logps/rejected": -9.348276138305664, "loss": 0.2345, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.687107086181641, "rewards/margins": 8.335307121276855, "rewards/rejected": -14.022415161132812, "step": 900 }, { "epoch": 0.7735281478298238, "eval_logits/chosen": 2.2851152420043945, "eval_logits/rejected": 2.5511629581451416, "eval_logps/chosen": -4.023584842681885, "eval_logps/rejected": -9.625852584838867, "eval_loss": 0.23031750321388245, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -6.035377025604248, "eval_rewards/margins": 8.403401374816895, "eval_rewards/rejected": -14.4387788772583, "eval_runtime": 25.4061, "eval_samples_per_second": 29.639, "eval_steps_per_second": 3.739, "step": 900 }, { "epoch": 0.7821229050279329, "grad_norm": 2.178900957107544, "learning_rate": 1.677833383153542e-06, "logits/chosen": 2.3001868724823, "logits/rejected": 2.365304470062256, "logps/chosen": -3.690169095993042, "logps/rejected": -8.727324485778809, "loss": 0.1988, "rewards/accuracies": 0.9375, "rewards/chosen": -5.535253047943115, "rewards/margins": 7.555734157562256, "rewards/rejected": -13.090988159179688, "step": 910 }, { "epoch": 0.7907176622260421, "grad_norm": 4.60929536819458, "learning_rate": 1.6285698816954626e-06, "logits/chosen": 3.103785276412964, "logits/rejected": 3.0096678733825684, "logps/chosen": -4.184874534606934, "logps/rejected": -8.704519271850586, "loss": 0.2128, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.277312278747559, "rewards/margins": 6.7794671058654785, "rewards/rejected": -13.056779861450195, "step": 920 }, { "epoch": 0.7993124194241513, "grad_norm": 1.1031241416931152, "learning_rate": 1.5796886182883053e-06, "logits/chosen": 3.2616991996765137, "logits/rejected": 2.990100622177124, "logps/chosen": -4.041825771331787, "logps/rejected": -9.399754524230957, "loss": 0.2131, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.062739372253418, "rewards/margins": 8.03689193725586, "rewards/rejected": -14.099630355834961, "step": 930 }, { "epoch": 0.8079071766222604, "grad_norm": 1.8013640642166138, "learning_rate": 1.5312110338697427e-06, "logits/chosen": 2.2281856536865234, "logits/rejected": 2.1705000400543213, "logps/chosen": -3.802743434906006, "logps/rejected": -8.745875358581543, "loss": 0.2211, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.704115867614746, "rewards/margins": 7.414697170257568, "rewards/rejected": -13.118814468383789, "step": 940 }, { "epoch": 0.8165019338203696, "grad_norm": 5.369480609893799, "learning_rate": 1.4831583923105e-06, "logits/chosen": 2.0146822929382324, "logits/rejected": 2.0050222873687744, "logps/chosen": -4.013974189758301, "logps/rejected": -9.14311408996582, "loss": 0.2359, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.020960807800293, "rewards/margins": 7.693708896636963, "rewards/rejected": -13.71467113494873, "step": 950 }, { "epoch": 0.8165019338203696, "eval_logits/chosen": 2.4391298294067383, "eval_logits/rejected": 2.693408250808716, "eval_logps/chosen": -3.877185106277466, "eval_logps/rejected": -9.608528137207031, "eval_loss": 0.2290637195110321, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -5.81577730178833, "eval_rewards/margins": 8.597016334533691, "eval_rewards/rejected": -14.41279411315918, "eval_runtime": 25.4752, "eval_samples_per_second": 29.558, "eval_steps_per_second": 3.729, "step": 950 }, { "epoch": 0.8250966910184787, "grad_norm": 4.45559549331665, "learning_rate": 1.4355517710873184e-06, "logits/chosen": 3.141005516052246, "logits/rejected": 3.037994861602783, "logps/chosen": -4.300066947937012, "logps/rejected": -9.016799926757812, "loss": 0.2369, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.450100898742676, "rewards/margins": 7.075097560882568, "rewards/rejected": -13.525197982788086, "step": 960 }, { "epoch": 0.8336914482165879, "grad_norm": 4.787370681762695, "learning_rate": 1.388412052037682e-06, "logits/chosen": 2.0776166915893555, "logits/rejected": 2.1846489906311035, "logps/chosen": -3.574514865875244, "logps/rejected": -9.252939224243164, "loss": 0.2339, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.3617730140686035, "rewards/margins": 8.517634391784668, "rewards/rejected": -13.879406929016113, "step": 970 }, { "epoch": 0.842286205414697, "grad_norm": 2.436915397644043, "learning_rate": 1.3417599122003464e-06, "logits/chosen": 2.4205520153045654, "logits/rejected": 2.362619161605835, "logps/chosen": -3.942296266555786, "logps/rejected": -9.501001358032227, "loss": 0.1535, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.9134440422058105, "rewards/margins": 8.338058471679688, "rewards/rejected": -14.251502990722656, "step": 980 }, { "epoch": 0.8508809626128062, "grad_norm": 2.881063461303711, "learning_rate": 1.2956158147457116e-06, "logits/chosen": 2.490622043609619, "logits/rejected": 2.438882350921631, "logps/chosen": -4.083024501800537, "logps/rejected": -8.853937149047852, "loss": 0.2337, "rewards/accuracies": 0.9375, "rewards/chosen": -6.124536991119385, "rewards/margins": 7.156369686126709, "rewards/rejected": -13.280906677246094, "step": 990 }, { "epoch": 0.8594757198109153, "grad_norm": 2.4361555576324463, "learning_rate": 1.2500000000000007e-06, "logits/chosen": 2.5789554119110107, "logits/rejected": 2.755476951599121, "logps/chosen": -3.6121764183044434, "logps/rejected": -7.818983554840088, "loss": 0.2062, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.418264389038086, "rewards/margins": 6.310210227966309, "rewards/rejected": -11.728475570678711, "step": 1000 }, { "epoch": 0.8594757198109153, "eval_logits/chosen": 2.3194375038146973, "eval_logits/rejected": 2.6192238330841064, "eval_logps/chosen": -3.992403745651245, "eval_logps/rejected": -9.769493103027344, "eval_loss": 0.2193986475467682, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -5.98860502243042, "eval_rewards/margins": 8.665634155273438, "eval_rewards/rejected": -14.654237747192383, "eval_runtime": 25.4033, "eval_samples_per_second": 29.642, "eval_steps_per_second": 3.74, "step": 1000 }, { "epoch": 0.8680704770090245, "grad_norm": 4.216193199157715, "learning_rate": 1.204932476567175e-06, "logits/chosen": 2.592757225036621, "logits/rejected": 2.777765989303589, "logps/chosen": -4.057430267333984, "logps/rejected": -8.371369361877441, "loss": 0.2322, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.086144924163818, "rewards/margins": 6.470907688140869, "rewards/rejected": -12.557052612304688, "step": 1010 }, { "epoch": 0.8766652342071336, "grad_norm": 11.027617454528809, "learning_rate": 1.160433012552508e-06, "logits/chosen": 2.89387845993042, "logits/rejected": 2.8747739791870117, "logps/chosen": -4.125360012054443, "logps/rejected": -8.518974304199219, "loss": 0.2383, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.188040256500244, "rewards/margins": 6.590420722961426, "rewards/rejected": -12.778460502624512, "step": 1020 }, { "epoch": 0.8852599914052428, "grad_norm": 3.2465028762817383, "learning_rate": 1.11652112689164e-06, "logits/chosen": 2.5261311531066895, "logits/rejected": 2.5254740715026855, "logps/chosen": -4.115787506103516, "logps/rejected": -9.08761215209961, "loss": 0.2262, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.173680782318115, "rewards/margins": 7.457736968994141, "rewards/rejected": -13.631416320800781, "step": 1030 }, { "epoch": 0.8938547486033519, "grad_norm": 3.501631259918213, "learning_rate": 1.073216080788921e-06, "logits/chosen": 2.3466925621032715, "logits/rejected": 2.311033248901367, "logps/chosen": -3.7890372276306152, "logps/rejected": -8.364242553710938, "loss": 0.1909, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.683555603027344, "rewards/margins": 6.8628082275390625, "rewards/rejected": -12.546364784240723, "step": 1040 }, { "epoch": 0.9024495058014611, "grad_norm": 2.803072690963745, "learning_rate": 1.0305368692688175e-06, "logits/chosen": 2.7137346267700195, "logits/rejected": 2.825568675994873, "logps/chosen": -3.8556430339813232, "logps/rejected": -8.831448554992676, "loss": 0.1965, "rewards/accuracies": 0.9375, "rewards/chosen": -5.783464431762695, "rewards/margins": 7.463706970214844, "rewards/rejected": -13.247172355651855, "step": 1050 }, { "epoch": 0.9024495058014611, "eval_logits/chosen": 2.3855948448181152, "eval_logits/rejected": 2.6855878829956055, "eval_logps/chosen": -4.08680534362793, "eval_logps/rejected": -9.835321426391602, "eval_loss": 0.21824777126312256, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -6.130208492279053, "eval_rewards/margins": 8.622772216796875, "eval_rewards/rejected": -14.752982139587402, "eval_runtime": 25.407, "eval_samples_per_second": 29.637, "eval_steps_per_second": 3.739, "step": 1050 }, { "epoch": 0.9110442629995703, "grad_norm": 3.8827121257781982, "learning_rate": 9.88502212844063e-07, "logits/chosen": 2.339254379272461, "logits/rejected": 2.321803092956543, "logps/chosen": -4.195307731628418, "logps/rejected": -8.984090805053711, "loss": 0.2003, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.292961120605469, "rewards/margins": 7.183175086975098, "rewards/rejected": -13.47613525390625, "step": 1060 }, { "epoch": 0.9196390201976794, "grad_norm": 2.1895060539245605, "learning_rate": 9.471305493042243e-07, "logits/chosen": 2.5805039405822754, "logits/rejected": 2.725888729095459, "logps/chosen": -3.9197421073913574, "logps/rejected": -9.688833236694336, "loss": 0.1979, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.879612922668457, "rewards/margins": 8.653637886047363, "rewards/rejected": -14.533251762390137, "step": 1070 }, { "epoch": 0.9282337773957886, "grad_norm": 2.7553319931030273, "learning_rate": 9.064400256282757e-07, "logits/chosen": 2.130366563796997, "logits/rejected": 2.3263564109802246, "logps/chosen": -4.264749526977539, "logps/rejected": -9.922590255737305, "loss": 0.1787, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.397124290466309, "rewards/margins": 8.486761093139648, "rewards/rejected": -14.883886337280273, "step": 1080 }, { "epoch": 0.9368285345938977, "grad_norm": 3.2198596000671387, "learning_rate": 8.664484900247363e-07, "logits/chosen": 2.867159128189087, "logits/rejected": 2.5984394550323486, "logps/chosen": -3.8312149047851562, "logps/rejected": -8.639516830444336, "loss": 0.1971, "rewards/accuracies": 0.9375, "rewards/chosen": -5.746821403503418, "rewards/margins": 7.212453365325928, "rewards/rejected": -12.959274291992188, "step": 1090 }, { "epoch": 0.9454232917920069, "grad_norm": 2.1565632820129395, "learning_rate": 8.271734841028553e-07, "logits/chosen": 3.2416539192199707, "logits/rejected": 2.98675274848938, "logps/chosen": -3.9062061309814453, "logps/rejected": -9.09926700592041, "loss": 0.1766, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.859309673309326, "rewards/margins": 7.789591312408447, "rewards/rejected": -13.648900032043457, "step": 1100 }, { "epoch": 0.9454232917920069, "eval_logits/chosen": 2.452310562133789, "eval_logits/rejected": 2.728320837020874, "eval_logps/chosen": -4.018543243408203, "eval_logps/rejected": -9.840487480163574, "eval_loss": 0.21524910628795624, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -6.027814865112305, "eval_rewards/margins": 8.732914924621582, "eval_rewards/rejected": -14.760730743408203, "eval_runtime": 25.4021, "eval_samples_per_second": 29.643, "eval_steps_per_second": 3.74, "step": 1100 }, { "epoch": 0.954018048990116, "grad_norm": 4.705589294433594, "learning_rate": 7.886322351782782e-07, "logits/chosen": 3.07834529876709, "logits/rejected": 2.766554117202759, "logps/chosen": -4.105443477630615, "logps/rejected": -8.913984298706055, "loss": 0.2274, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.158164978027344, "rewards/margins": 7.212811470031738, "rewards/rejected": -13.370976448059082, "step": 1110 }, { "epoch": 0.9626128061882252, "grad_norm": 1.656952977180481, "learning_rate": 7.508416487165862e-07, "logits/chosen": 2.760190725326538, "logits/rejected": 2.4847655296325684, "logps/chosen": -3.7116692066192627, "logps/rejected": -8.578731536865234, "loss": 0.1609, "rewards/accuracies": 0.9375, "rewards/chosen": -5.567503452301025, "rewards/margins": 7.300595283508301, "rewards/rejected": -12.868098258972168, "step": 1120 }, { "epoch": 0.9712075633863343, "grad_norm": 6.795106410980225, "learning_rate": 7.138183009179922e-07, "logits/chosen": 2.71061635017395, "logits/rejected": 2.543354034423828, "logps/chosen": -4.008772373199463, "logps/rejected": -9.546417236328125, "loss": 0.1813, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.013156890869141, "rewards/margins": 8.306467056274414, "rewards/rejected": -14.319625854492188, "step": 1130 }, { "epoch": 0.9798023205844435, "grad_norm": 3.892643690109253, "learning_rate": 6.775784314464717e-07, "logits/chosen": 2.826934814453125, "logits/rejected": 2.82672381401062, "logps/chosen": -3.863964796066284, "logps/rejected": -9.300487518310547, "loss": 0.1841, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.7959465980529785, "rewards/margins": 8.154784202575684, "rewards/rejected": -13.950732231140137, "step": 1140 }, { "epoch": 0.9883970777825526, "grad_norm": 2.7565932273864746, "learning_rate": 6.421379363065142e-07, "logits/chosen": 2.8669843673706055, "logits/rejected": 2.703218936920166, "logps/chosen": -4.042016983032227, "logps/rejected": -9.321676254272461, "loss": 0.1698, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.06302547454834, "rewards/margins": 7.919488430023193, "rewards/rejected": -13.982513427734375, "step": 1150 }, { "epoch": 0.9883970777825526, "eval_logits/chosen": 2.4691615104675293, "eval_logits/rejected": 2.7426767349243164, "eval_logps/chosen": -4.0566277503967285, "eval_logps/rejected": -9.914444923400879, "eval_loss": 0.2154170125722885, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -6.084941864013672, "eval_rewards/margins": 8.786724090576172, "eval_rewards/rejected": -14.871667861938477, "eval_runtime": 25.4309, "eval_samples_per_second": 29.61, "eval_steps_per_second": 3.736, "step": 1150 }, { "epoch": 0.9969918349806618, "grad_norm": 5.313329219818115, "learning_rate": 6.075123608706093e-07, "logits/chosen": 2.488896131515503, "logits/rejected": 2.602234363555908, "logps/chosen": -4.304905891418457, "logps/rejected": -8.938867568969727, "loss": 0.187, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.457359313964844, "rewards/margins": 6.9509429931640625, "rewards/rejected": -13.408302307128906, "step": 1160 }, { "epoch": 1.0051568543188656, "grad_norm": 4.5972089767456055, "learning_rate": 5.737168930605272e-07, "logits/chosen": 2.951173782348633, "logits/rejected": 2.7277865409851074, "logps/chosen": -3.993422746658325, "logps/rejected": -9.191303253173828, "loss": 0.1904, "rewards/accuracies": 0.9342105388641357, "rewards/chosen": -5.990133285522461, "rewards/margins": 7.796821117401123, "rewards/rejected": -13.786954879760742, "step": 1170 }, { "epoch": 1.0137516115169747, "grad_norm": 8.393081665039062, "learning_rate": 5.407663566854008e-07, "logits/chosen": 2.6279826164245605, "logits/rejected": 2.718173027038574, "logps/chosen": -4.164522647857666, "logps/rejected": -9.509660720825195, "loss": 0.1984, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.246783256530762, "rewards/margins": 8.017707824707031, "rewards/rejected": -14.264490127563477, "step": 1180 }, { "epoch": 1.0223463687150838, "grad_norm": 1.666337251663208, "learning_rate": 5.086752049395094e-07, "logits/chosen": 3.449803590774536, "logits/rejected": 3.336327075958252, "logps/chosen": -4.214791297912598, "logps/rejected": -9.531986236572266, "loss": 0.1891, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.322187423706055, "rewards/margins": 7.975789546966553, "rewards/rejected": -14.297978401184082, "step": 1190 }, { "epoch": 1.0309411259131929, "grad_norm": 4.041133403778076, "learning_rate": 4.774575140626317e-07, "logits/chosen": 3.2861239910125732, "logits/rejected": 3.2246017456054688, "logps/chosen": -4.407445430755615, "logps/rejected": -9.031600952148438, "loss": 0.2037, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.611169338226318, "rewards/margins": 6.936232566833496, "rewards/rejected": -13.547401428222656, "step": 1200 }, { "epoch": 1.0309411259131929, "eval_logits/chosen": 2.4687464237213135, "eval_logits/rejected": 2.747849941253662, "eval_logps/chosen": -4.021546840667725, "eval_logps/rejected": -9.913942337036133, "eval_loss": 0.21217840909957886, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -6.032320499420166, "eval_rewards/margins": 8.838594436645508, "eval_rewards/rejected": -14.870915412902832, "eval_runtime": 25.4113, "eval_samples_per_second": 29.632, "eval_steps_per_second": 3.738, "step": 1200 }, { "epoch": 1.0395358831113022, "grad_norm": 2.6725575923919678, "learning_rate": 4.4712697716573994e-07, "logits/chosen": 2.674346446990967, "logits/rejected": 2.491318941116333, "logps/chosen": -3.7565529346466064, "logps/rejected": -7.836224555969238, "loss": 0.2035, "rewards/accuracies": 0.9375, "rewards/chosen": -5.634829521179199, "rewards/margins": 6.119507789611816, "rewards/rejected": -11.7543363571167, "step": 1210 }, { "epoch": 1.0481306403094113, "grad_norm": 3.706759214401245, "learning_rate": 4.1769689822475147e-07, "logits/chosen": 2.4985451698303223, "logits/rejected": 2.393381118774414, "logps/chosen": -3.8855807781219482, "logps/rejected": -8.915465354919434, "loss": 0.192, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.828372001647949, "rewards/margins": 7.544825553894043, "rewards/rejected": -13.373197555541992, "step": 1220 }, { "epoch": 1.0567253975075204, "grad_norm": 3.47476863861084, "learning_rate": 3.891801862449629e-07, "logits/chosen": 2.4244041442871094, "logits/rejected": 2.176212787628174, "logps/chosen": -4.165676593780518, "logps/rejected": -9.606474876403809, "loss": 0.1734, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.2485151290893555, "rewards/margins": 8.1611967086792, "rewards/rejected": -14.409710884094238, "step": 1230 }, { "epoch": 1.0653201547056295, "grad_norm": 7.537776470184326, "learning_rate": 3.615893495987335e-07, "logits/chosen": 3.5634818077087402, "logits/rejected": 3.599396228790283, "logps/chosen": -3.8801186084747314, "logps/rejected": -8.772649765014648, "loss": 0.1907, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.8201775550842285, "rewards/margins": 7.338797092437744, "rewards/rejected": -13.158975601196289, "step": 1240 }, { "epoch": 1.0739149119037388, "grad_norm": 7.371191501617432, "learning_rate": 3.3493649053890325e-07, "logits/chosen": 1.536439299583435, "logits/rejected": 1.6537296772003174, "logps/chosen": -3.4690723419189453, "logps/rejected": -9.245938301086426, "loss": 0.1963, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.203608512878418, "rewards/margins": 8.665300369262695, "rewards/rejected": -13.868908882141113, "step": 1250 }, { "epoch": 1.0739149119037388, "eval_logits/chosen": 2.463451862335205, "eval_logits/rejected": 2.7573211193084717, "eval_logps/chosen": -4.089573383331299, "eval_logps/rejected": -10.03561019897461, "eval_loss": 0.21280600130558014, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -6.1343607902526855, "eval_rewards/margins": 8.919055938720703, "eval_rewards/rejected": -15.05341625213623, "eval_runtime": 26.0304, "eval_samples_per_second": 28.928, "eval_steps_per_second": 3.65, "step": 1250 }, { "epoch": 1.0825096691018479, "grad_norm": 3.3083207607269287, "learning_rate": 3.092332998903416e-07, "logits/chosen": 2.6976983547210693, "logits/rejected": 2.558777332305908, "logps/chosen": -4.322577953338623, "logps/rejected": -9.3988037109375, "loss": 0.1782, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.483867645263672, "rewards/margins": 7.614336967468262, "rewards/rejected": -14.098203659057617, "step": 1260 }, { "epoch": 1.091104426299957, "grad_norm": 2.7674386501312256, "learning_rate": 2.844910519219632e-07, "logits/chosen": 2.5444347858428955, "logits/rejected": 2.5096020698547363, "logps/chosen": -4.426403522491455, "logps/rejected": -9.394776344299316, "loss": 0.1731, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.639605522155762, "rewards/margins": 7.4525580406188965, "rewards/rejected": -14.0921630859375, "step": 1270 }, { "epoch": 1.0996991834980663, "grad_norm": 3.5264317989349365, "learning_rate": 2.6072059940146775e-07, "logits/chosen": 2.0977225303649902, "logits/rejected": 2.335002899169922, "logps/chosen": -4.064654350280762, "logps/rejected": -9.52751350402832, "loss": 0.1904, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.096982002258301, "rewards/margins": 8.19428825378418, "rewards/rejected": -14.291269302368164, "step": 1280 }, { "epoch": 1.1082939406961754, "grad_norm": 3.810760498046875, "learning_rate": 2.3793236883495164e-07, "logits/chosen": 2.6432700157165527, "logits/rejected": 2.542976140975952, "logps/chosen": -4.272869110107422, "logps/rejected": -9.520709037780762, "loss": 0.1787, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.409303188323975, "rewards/margins": 7.871760368347168, "rewards/rejected": -14.281064987182617, "step": 1290 }, { "epoch": 1.1168886978942845, "grad_norm": 5.784368991851807, "learning_rate": 2.1613635589349756e-07, "logits/chosen": 2.5322649478912354, "logits/rejected": 2.5456764698028564, "logps/chosen": -4.310420513153076, "logps/rejected": -8.846685409545898, "loss": 0.2135, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.465630531311035, "rewards/margins": 6.8043975830078125, "rewards/rejected": -13.270029067993164, "step": 1300 }, { "epoch": 1.1168886978942845, "eval_logits/chosen": 2.43306040763855, "eval_logits/rejected": 2.7410776615142822, "eval_logps/chosen": -4.157689571380615, "eval_logps/rejected": -10.128125190734863, "eval_loss": 0.21243833005428314, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -6.236534595489502, "eval_rewards/margins": 8.95565414428711, "eval_rewards/rejected": -15.192190170288086, "eval_runtime": 25.4225, "eval_samples_per_second": 29.619, "eval_steps_per_second": 3.737, "step": 1300 }, { "epoch": 1.1254834550923936, "grad_norm": 4.539431095123291, "learning_rate": 1.95342121028749e-07, "logits/chosen": 3.897085666656494, "logits/rejected": 3.714550018310547, "logps/chosen": -4.3084564208984375, "logps/rejected": -9.26058292388916, "loss": 0.1939, "rewards/accuracies": 0.9375, "rewards/chosen": -6.462684631347656, "rewards/margins": 7.428189754486084, "rewards/rejected": -13.890874862670898, "step": 1310 }, { "epoch": 1.1340782122905029, "grad_norm": 3.7019519805908203, "learning_rate": 1.7555878527937164e-07, "logits/chosen": 2.3946852684020996, "logits/rejected": 2.312767505645752, "logps/chosen": -3.7729697227478027, "logps/rejected": -9.062173843383789, "loss": 0.1838, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.659454345703125, "rewards/margins": 7.933804512023926, "rewards/rejected": -13.59325885772705, "step": 1320 }, { "epoch": 1.142672969488612, "grad_norm": 6.9092559814453125, "learning_rate": 1.567950262702714e-07, "logits/chosen": 3.005070209503174, "logits/rejected": 3.02156400680542, "logps/chosen": -4.185842037200928, "logps/rejected": -9.492212295532227, "loss": 0.1659, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.278763771057129, "rewards/margins": 7.959554195404053, "rewards/rejected": -14.238316535949707, "step": 1330 }, { "epoch": 1.151267726686721, "grad_norm": 4.459160327911377, "learning_rate": 1.3905907440629752e-07, "logits/chosen": 2.853377103805542, "logits/rejected": 2.9094462394714355, "logps/chosen": -4.523777008056641, "logps/rejected": -9.852740287780762, "loss": 0.1863, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.785665035247803, "rewards/margins": 7.99344539642334, "rewards/rejected": -14.779111862182617, "step": 1340 }, { "epoch": 1.1598624838848304, "grad_norm": 2.7226345539093018, "learning_rate": 1.223587092621162e-07, "logits/chosen": 3.3432698249816895, "logits/rejected": 3.3845882415771484, "logps/chosen": -3.862396240234375, "logps/rejected": -8.756799697875977, "loss": 0.1924, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.793594837188721, "rewards/margins": 7.341606140136719, "rewards/rejected": -13.135202407836914, "step": 1350 }, { "epoch": 1.1598624838848304, "eval_logits/chosen": 2.406911849975586, "eval_logits/rejected": 2.7142908573150635, "eval_logps/chosen": -4.146352767944336, "eval_logps/rejected": -10.133431434631348, "eval_loss": 0.210895836353302, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -6.219529151916504, "eval_rewards/margins": 8.980619430541992, "eval_rewards/rejected": -15.20014762878418, "eval_runtime": 25.415, "eval_samples_per_second": 29.628, "eval_steps_per_second": 3.738, "step": 1350 }, { "epoch": 1.1684572410829395, "grad_norm": 2.9276647567749023, "learning_rate": 1.067012561698319e-07, "logits/chosen": 3.421785831451416, "logits/rejected": 3.449258804321289, "logps/chosen": -4.521992206573486, "logps/rejected": -9.448256492614746, "loss": 0.2042, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.78298807144165, "rewards/margins": 7.389396667480469, "rewards/rejected": -14.172386169433594, "step": 1360 }, { "epoch": 1.1770519982810486, "grad_norm": 2.2031686305999756, "learning_rate": 9.209358300585474e-08, "logits/chosen": 1.9827455282211304, "logits/rejected": 2.079033851623535, "logps/chosen": -4.297107696533203, "logps/rejected": -9.094736099243164, "loss": 0.189, "rewards/accuracies": 0.9375, "rewards/chosen": -6.445662021636963, "rewards/margins": 7.196441650390625, "rewards/rejected": -13.642102241516113, "step": 1370 }, { "epoch": 1.1856467554791577, "grad_norm": 3.5787320137023926, "learning_rate": 7.854209717842231e-08, "logits/chosen": 2.4248852729797363, "logits/rejected": 2.360355854034424, "logps/chosen": -4.048138618469238, "logps/rejected": -8.906003952026367, "loss": 0.1725, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.072208404541016, "rewards/margins": 7.286799430847168, "rewards/rejected": -13.359006881713867, "step": 1380 }, { "epoch": 1.1942415126772667, "grad_norm": 3.7278518676757812, "learning_rate": 6.605274281709929e-08, "logits/chosen": 2.819120407104492, "logits/rejected": 2.8942766189575195, "logps/chosen": -4.503249168395996, "logps/rejected": -9.22131061553955, "loss": 0.1906, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.754873752593994, "rewards/margins": 7.077092170715332, "rewards/rejected": -13.831965446472168, "step": 1390 }, { "epoch": 1.202836269875376, "grad_norm": 3.5006744861602783, "learning_rate": 5.463099816548578e-08, "logits/chosen": 2.785336971282959, "logits/rejected": 2.590388059616089, "logps/chosen": -4.427987575531006, "logps/rejected": -9.677424430847168, "loss": 0.1958, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.641981601715088, "rewards/margins": 7.8741559982299805, "rewards/rejected": -14.516138076782227, "step": 1400 }, { "epoch": 1.202836269875376, "eval_logits/chosen": 2.403634548187256, "eval_logits/rejected": 2.723357915878296, "eval_logps/chosen": -4.18450403213501, "eval_logps/rejected": -10.158361434936523, "eval_loss": 0.21151940524578094, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -6.276756763458252, "eval_rewards/margins": 8.960786819458008, "eval_rewards/rejected": -15.237542152404785, "eval_runtime": 25.4271, "eval_samples_per_second": 29.614, "eval_steps_per_second": 3.736, "step": 1400 }, { "epoch": 1.2114310270734852, "grad_norm": 4.615660190582275, "learning_rate": 4.428187317827848e-08, "logits/chosen": 1.9216268062591553, "logits/rejected": 2.0172224044799805, "logps/chosen": -4.234434604644775, "logps/rejected": -9.351160049438477, "loss": 0.1911, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.351652145385742, "rewards/margins": 7.675088405609131, "rewards/rejected": -14.026741027832031, "step": 1410 }, { "epoch": 1.2200257842715942, "grad_norm": 2.876373529434204, "learning_rate": 3.5009907323737826e-08, "logits/chosen": 2.8389785289764404, "logits/rejected": 2.5476391315460205, "logps/chosen": -4.389086723327637, "logps/rejected": -8.940258979797363, "loss": 0.2009, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.583629608154297, "rewards/margins": 6.826756954193115, "rewards/rejected": -13.41038703918457, "step": 1420 }, { "epoch": 1.2286205414697036, "grad_norm": 3.2574303150177, "learning_rate": 2.681916759252917e-08, "logits/chosen": 2.1023097038269043, "logits/rejected": 2.1705641746520996, "logps/chosen": -3.8304667472839355, "logps/rejected": -9.32093620300293, "loss": 0.2198, "rewards/accuracies": 0.9375, "rewards/chosen": -5.745700836181641, "rewards/margins": 8.235702514648438, "rewards/rejected": -13.981404304504395, "step": 1430 }, { "epoch": 1.2372152986678127, "grad_norm": 3.100552797317505, "learning_rate": 1.9713246713805588e-08, "logits/chosen": 2.747933864593506, "logits/rejected": 2.99653959274292, "logps/chosen": -4.103874206542969, "logps/rejected": -8.748233795166016, "loss": 0.1607, "rewards/accuracies": 0.9375, "rewards/chosen": -6.155811309814453, "rewards/margins": 6.966541290283203, "rewards/rejected": -13.122352600097656, "step": 1440 }, { "epoch": 1.2458100558659218, "grad_norm": 2.113027572631836, "learning_rate": 1.3695261579316776e-08, "logits/chosen": 3.0195822715759277, "logits/rejected": 2.5288455486297607, "logps/chosen": -4.176327705383301, "logps/rejected": -8.921086311340332, "loss": 0.2137, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -6.264491081237793, "rewards/margins": 7.1171393394470215, "rewards/rejected": -13.381629943847656, "step": 1450 }, { "epoch": 1.2458100558659218, "eval_logits/chosen": 2.3938305377960205, "eval_logits/rejected": 2.715986728668213, "eval_logps/chosen": -4.178752899169922, "eval_logps/rejected": -10.14517879486084, "eval_loss": 0.21230751276016235, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -6.268129348754883, "eval_rewards/margins": 8.949638366699219, "eval_rewards/rejected": -15.217768669128418, "eval_runtime": 25.4179, "eval_samples_per_second": 29.625, "eval_steps_per_second": 3.738, "step": 1450 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.334294615678255e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }