{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7735281478298238, "eval_steps": 50, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008594757198109154, "grad_norm": 0.05934199318289757, "learning_rate": 4.999451708687114e-06, "logits/chosen": 14.762972831726074, "logits/rejected": 15.199728012084961, "logps/chosen": -0.3259914815425873, "logps/rejected": -0.34297481179237366, "loss": 0.9377, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.4889872074127197, "rewards/margins": 0.02547495998442173, "rewards/rejected": -0.5144621729850769, "step": 10 }, { "epoch": 0.017189514396218308, "grad_norm": 0.06342790275812149, "learning_rate": 4.997807075247147e-06, "logits/chosen": 14.351249694824219, "logits/rejected": 15.068448066711426, "logps/chosen": -0.2809392511844635, "logps/rejected": -0.3711296617984772, "loss": 0.9352, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.42140883207321167, "rewards/margins": 0.1352856159210205, "rewards/rejected": -0.5566944479942322, "step": 20 }, { "epoch": 0.02578427159432746, "grad_norm": 0.053961098194122314, "learning_rate": 4.9950668210706795e-06, "logits/chosen": 14.636960983276367, "logits/rejected": 15.265243530273438, "logps/chosen": -0.2820780873298645, "logps/rejected": -0.34024301171302795, "loss": 0.9351, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.42311716079711914, "rewards/margins": 0.08724743127822876, "rewards/rejected": -0.5103646516799927, "step": 30 }, { "epoch": 0.034379028792436615, "grad_norm": 0.13506193459033966, "learning_rate": 4.9912321481237616e-06, "logits/chosen": 14.4556884765625, "logits/rejected": 15.048967361450195, "logps/chosen": -0.2897028625011444, "logps/rejected": -0.34129124879837036, "loss": 0.922, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.43455424904823303, "rewards/margins": 0.07738252729177475, "rewards/rejected": -0.5119368433952332, "step": 40 }, { "epoch": 0.042973785990545764, "grad_norm": 0.05230574309825897, "learning_rate": 4.986304738420684e-06, "logits/chosen": 14.628789901733398, "logits/rejected": 15.307828903198242, "logps/chosen": -0.28786614537239075, "logps/rejected": -0.3513876795768738, "loss": 0.9201, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.4317992329597473, "rewards/margins": 0.09528233855962753, "rewards/rejected": -0.5270815491676331, "step": 50 }, { "epoch": 0.042973785990545764, "eval_logits/chosen": 14.234943389892578, "eval_logits/rejected": 15.258601188659668, "eval_logps/chosen": -0.2844341993331909, "eval_logps/rejected": -0.3695394694805145, "eval_loss": 0.9226060509681702, "eval_rewards/accuracies": 0.5157894492149353, "eval_rewards/chosen": -0.42665132880210876, "eval_rewards/margins": 0.1276579648256302, "eval_rewards/rejected": -0.5543092489242554, "eval_runtime": 25.9356, "eval_samples_per_second": 29.033, "eval_steps_per_second": 3.663, "step": 50 }, { "epoch": 0.05156854318865492, "grad_norm": 0.09328428655862808, "learning_rate": 4.980286753286196e-06, "logits/chosen": 14.35963249206543, "logits/rejected": 15.055354118347168, "logps/chosen": -0.27534741163253784, "logps/rejected": -0.33098170161247253, "loss": 0.9356, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.4130210876464844, "rewards/margins": 0.08345144242048264, "rewards/rejected": -0.4964725375175476, "step": 60 }, { "epoch": 0.060163300386764075, "grad_norm": 0.06518550217151642, "learning_rate": 4.973180832407471e-06, "logits/chosen": 14.599525451660156, "logits/rejected": 14.825297355651855, "logps/chosen": -0.2708163857460022, "logps/rejected": -0.3305850923061371, "loss": 0.9257, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.4062245786190033, "rewards/margins": 0.08965305984020233, "rewards/rejected": -0.4958776533603668, "step": 70 }, { "epoch": 0.06875805758487323, "grad_norm": 0.07543154805898666, "learning_rate": 4.964990092676263e-06, "logits/chosen": 14.947430610656738, "logits/rejected": 15.093690872192383, "logps/chosen": -0.2602943778038025, "logps/rejected": -0.31820863485336304, "loss": 0.9168, "rewards/accuracies": 0.5, "rewards/chosen": -0.39044153690338135, "rewards/margins": 0.08687138557434082, "rewards/rejected": -0.47731298208236694, "step": 80 }, { "epoch": 0.07735281478298238, "grad_norm": 0.06628195196390152, "learning_rate": 4.9557181268217225e-06, "logits/chosen": 14.43529987335205, "logits/rejected": 14.750699043273926, "logps/chosen": -0.2884291708469391, "logps/rejected": -0.34193652868270874, "loss": 0.9273, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.43264374136924744, "rewards/margins": 0.08026103675365448, "rewards/rejected": -0.5129047632217407, "step": 90 }, { "epoch": 0.08594757198109153, "grad_norm": 0.08684897422790527, "learning_rate": 4.9453690018345144e-06, "logits/chosen": 13.573002815246582, "logits/rejected": 14.441877365112305, "logps/chosen": -0.2569890320301056, "logps/rejected": -0.37049269676208496, "loss": 0.9009, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3854835629463196, "rewards/margins": 0.17025551199913025, "rewards/rejected": -0.5557390451431274, "step": 100 }, { "epoch": 0.08594757198109153, "eval_logits/chosen": 14.026633262634277, "eval_logits/rejected": 15.08835220336914, "eval_logps/chosen": -0.2761566936969757, "eval_logps/rejected": -0.3717801570892334, "eval_loss": 0.9138591885566711, "eval_rewards/accuracies": 0.5368421077728271, "eval_rewards/chosen": -0.41423505544662476, "eval_rewards/margins": 0.1434352546930313, "eval_rewards/rejected": -0.5576702952384949, "eval_runtime": 25.3996, "eval_samples_per_second": 29.646, "eval_steps_per_second": 3.74, "step": 100 }, { "epoch": 0.09454232917920069, "grad_norm": 0.08046824485063553, "learning_rate": 4.933947257182901e-06, "logits/chosen": 14.500630378723145, "logits/rejected": 14.831761360168457, "logps/chosen": -0.30049553513526917, "logps/rejected": -0.3315966725349426, "loss": 0.916, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.45074325799942017, "rewards/margins": 0.04665176197886467, "rewards/rejected": -0.49739497900009155, "step": 110 }, { "epoch": 0.10313708637730984, "grad_norm": 0.12244562804698944, "learning_rate": 4.921457902821578e-06, "logits/chosen": 14.26713752746582, "logits/rejected": 14.495455741882324, "logps/chosen": -0.2670941650867462, "logps/rejected": -0.32481229305267334, "loss": 0.9167, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.4006412625312805, "rewards/margins": 0.08657723665237427, "rewards/rejected": -0.4872184693813324, "step": 120 }, { "epoch": 0.11173184357541899, "grad_norm": 0.1828213334083557, "learning_rate": 4.907906416994146e-06, "logits/chosen": 14.009546279907227, "logits/rejected": 14.297094345092773, "logps/chosen": -0.27995598316192627, "logps/rejected": -0.3530685007572174, "loss": 0.9087, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.419933944940567, "rewards/margins": 0.10966875404119492, "rewards/rejected": -0.5296027660369873, "step": 130 }, { "epoch": 0.12032660077352815, "grad_norm": 0.10407563298940659, "learning_rate": 4.893298743830168e-06, "logits/chosen": 13.689155578613281, "logits/rejected": 14.1933012008667, "logps/chosen": -0.25955715775489807, "logps/rejected": -0.3815004229545593, "loss": 0.9053, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3893357217311859, "rewards/margins": 0.18291489779949188, "rewards/rejected": -0.5722506046295166, "step": 140 }, { "epoch": 0.1289213579716373, "grad_norm": 0.10028588026762009, "learning_rate": 4.8776412907378845e-06, "logits/chosen": 12.851397514343262, "logits/rejected": 13.509778022766113, "logps/chosen": -0.23652991652488708, "logps/rejected": -0.3720462918281555, "loss": 0.8999, "rewards/accuracies": 0.625, "rewards/chosen": -0.3547949194908142, "rewards/margins": 0.2032744586467743, "rewards/rejected": -0.5580693483352661, "step": 150 }, { "epoch": 0.1289213579716373, "eval_logits/chosen": 12.384929656982422, "eval_logits/rejected": 13.672826766967773, "eval_logps/chosen": -0.27857670187950134, "eval_logps/rejected": -0.4014737904071808, "eval_loss": 0.8956203460693359, "eval_rewards/accuracies": 0.5684210658073425, "eval_rewards/chosen": -0.4178650677204132, "eval_rewards/margins": 0.18434564769268036, "eval_rewards/rejected": -0.6022107601165771, "eval_runtime": 25.4176, "eval_samples_per_second": 29.625, "eval_steps_per_second": 3.738, "step": 150 }, { "epoch": 0.13751611516974646, "grad_norm": 0.12453093379735947, "learning_rate": 4.860940925593703e-06, "logits/chosen": 12.110003471374512, "logits/rejected": 13.076980590820312, "logps/chosen": -0.27192068099975586, "logps/rejected": -0.3863692879676819, "loss": 0.8907, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4078810214996338, "rewards/margins": 0.1716729700565338, "rewards/rejected": -0.5795539617538452, "step": 160 }, { "epoch": 0.1461108723678556, "grad_norm": 0.17137788236141205, "learning_rate": 4.84320497372973e-06, "logits/chosen": 11.92918586730957, "logits/rejected": 12.573629379272461, "logps/chosen": -0.27472984790802, "logps/rejected": -0.41249385476112366, "loss": 0.8831, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.41209474205970764, "rewards/margins": 0.20664596557617188, "rewards/rejected": -0.6187406778335571, "step": 170 }, { "epoch": 0.15470562956596476, "grad_norm": 0.3904883861541748, "learning_rate": 4.824441214720629e-06, "logits/chosen": 11.182531356811523, "logits/rejected": 12.176573753356934, "logps/chosen": -0.2953718304634094, "logps/rejected": -0.4208717942237854, "loss": 0.8736, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.4430577754974365, "rewards/margins": 0.18824996054172516, "rewards/rejected": -0.6313077211380005, "step": 180 }, { "epoch": 0.1633003867640739, "grad_norm": 0.17574089765548706, "learning_rate": 4.804657878971252e-06, "logits/chosen": 10.119890213012695, "logits/rejected": 11.05900764465332, "logps/chosen": -0.29340866208076477, "logps/rejected": -0.4555762708187103, "loss": 0.884, "rewards/accuracies": 0.625, "rewards/chosen": -0.44011297821998596, "rewards/margins": 0.24325144290924072, "rewards/rejected": -0.6833644509315491, "step": 190 }, { "epoch": 0.17189514396218306, "grad_norm": 0.2242884337902069, "learning_rate": 4.783863644106502e-06, "logits/chosen": 9.674784660339355, "logits/rejected": 10.418611526489258, "logps/chosen": -0.3504490852355957, "logps/rejected": -0.5431731939315796, "loss": 0.8419, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5256736278533936, "rewards/margins": 0.2890861928462982, "rewards/rejected": -0.8147598505020142, "step": 200 }, { "epoch": 0.17189514396218306, "eval_logits/chosen": 7.944870471954346, "eval_logits/rejected": 8.979729652404785, "eval_logps/chosen": -0.33341673016548157, "eval_logps/rejected": -0.5431775450706482, "eval_loss": 0.8462886810302734, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": -0.5001251101493835, "eval_rewards/margins": 0.3146411180496216, "eval_rewards/rejected": -0.8147663474082947, "eval_runtime": 25.419, "eval_samples_per_second": 29.623, "eval_steps_per_second": 3.737, "step": 200 }, { "epoch": 0.18048990116029223, "grad_norm": 0.32119837403297424, "learning_rate": 4.762067631165049e-06, "logits/chosen": 7.16138219833374, "logits/rejected": 8.43680477142334, "logps/chosen": -0.36649250984191895, "logps/rejected": -0.5420924425125122, "loss": 0.8187, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5497387647628784, "rewards/margins": 0.2633998692035675, "rewards/rejected": -0.8131386041641235, "step": 210 }, { "epoch": 0.18908465835840138, "grad_norm": 0.48516562581062317, "learning_rate": 4.7392794005985324e-06, "logits/chosen": 4.770083427429199, "logits/rejected": 5.710458278656006, "logps/chosen": -0.34041497111320496, "logps/rejected": -0.6309320330619812, "loss": 0.8448, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.510622501373291, "rewards/margins": 0.4357755780220032, "rewards/rejected": -0.9463980793952942, "step": 220 }, { "epoch": 0.19767941555651053, "grad_norm": 0.29154208302497864, "learning_rate": 4.715508948078037e-06, "logits/chosen": 5.168765068054199, "logits/rejected": 5.421420574188232, "logps/chosen": -0.3792352080345154, "logps/rejected": -0.65748131275177, "loss": 0.8066, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5688528418540955, "rewards/margins": 0.41736921668052673, "rewards/rejected": -0.986221969127655, "step": 230 }, { "epoch": 0.20627417275461968, "grad_norm": 0.42973750829696655, "learning_rate": 4.690766700109659e-06, "logits/chosen": 4.204717636108398, "logits/rejected": 3.706291913986206, "logps/chosen": -0.39414530992507935, "logps/rejected": -0.7194588780403137, "loss": 0.7787, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5912179350852966, "rewards/margins": 0.4879704415798187, "rewards/rejected": -1.079188346862793, "step": 240 }, { "epoch": 0.21486892995272883, "grad_norm": 0.5244571566581726, "learning_rate": 4.665063509461098e-06, "logits/chosen": 3.335484743118286, "logits/rejected": 3.3176345825195312, "logps/chosen": -0.4493131637573242, "logps/rejected": -0.8293434381484985, "loss": 0.7776, "rewards/accuracies": 0.625, "rewards/chosen": -0.6739697456359863, "rewards/margins": 0.5700454115867615, "rewards/rejected": -1.244015097618103, "step": 250 }, { "epoch": 0.21486892995272883, "eval_logits/chosen": 2.590949058532715, "eval_logits/rejected": 2.2929749488830566, "eval_logps/chosen": -0.48714593052864075, "eval_logps/rejected": -0.9267774224281311, "eval_loss": 0.7469337582588196, "eval_rewards/accuracies": 0.6526315808296204, "eval_rewards/chosen": -0.7307189106941223, "eval_rewards/margins": 0.659447193145752, "eval_rewards/rejected": -1.390166163444519, "eval_runtime": 25.3944, "eval_samples_per_second": 29.652, "eval_steps_per_second": 3.741, "step": 250 }, { "epoch": 0.22346368715083798, "grad_norm": 0.39347293972969055, "learning_rate": 4.638410650401267e-06, "logits/chosen": 2.2975668907165527, "logits/rejected": 1.2855035066604614, "logps/chosen": -0.5228341817855835, "logps/rejected": -1.00227952003479, "loss": 0.6981, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.78425133228302, "rewards/margins": 0.7191681265830994, "rewards/rejected": -1.5034195184707642, "step": 260 }, { "epoch": 0.23205844434894715, "grad_norm": 0.69575434923172, "learning_rate": 4.610819813755038e-06, "logits/chosen": 2.8782780170440674, "logits/rejected": 1.9394336938858032, "logps/chosen": -0.4982885718345642, "logps/rejected": -1.035541296005249, "loss": 0.7174, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7474328875541687, "rewards/margins": 0.8058789372444153, "rewards/rejected": -1.5533119440078735, "step": 270 }, { "epoch": 0.2406532015470563, "grad_norm": 0.7858326435089111, "learning_rate": 4.582303101775249e-06, "logits/chosen": 2.710908889770508, "logits/rejected": 1.6444288492202759, "logps/chosen": -0.600068211555481, "logps/rejected": -1.1271780729293823, "loss": 0.6972, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9001023173332214, "rewards/margins": 0.7906648516654968, "rewards/rejected": -1.6907672882080078, "step": 280 }, { "epoch": 0.24924795874516545, "grad_norm": 0.7384620904922485, "learning_rate": 4.55287302283426e-06, "logits/chosen": 1.5841500759124756, "logits/rejected": 0.640514612197876, "logps/chosen": -0.6465060710906982, "logps/rejected": -1.4245095252990723, "loss": 0.6192, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9697591066360474, "rewards/margins": 1.1670053005218506, "rewards/rejected": -2.1367642879486084, "step": 290 }, { "epoch": 0.2578427159432746, "grad_norm": 0.8262321352958679, "learning_rate": 4.522542485937369e-06, "logits/chosen": 1.7300422191619873, "logits/rejected": 0.7782856225967407, "logps/chosen": -0.7083590626716614, "logps/rejected": -1.6742557287216187, "loss": 0.5721, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.062538504600525, "rewards/margins": 1.4488452672958374, "rewards/rejected": -2.511383533477783, "step": 300 }, { "epoch": 0.2578427159432746, "eval_logits/chosen": 1.3559931516647339, "eval_logits/rejected": 0.6592276096343994, "eval_logps/chosen": -0.7815767526626587, "eval_logps/rejected": -2.1176154613494873, "eval_loss": 0.5730626583099365, "eval_rewards/accuracies": 0.7052631378173828, "eval_rewards/chosen": -1.1723653078079224, "eval_rewards/margins": 2.0040581226348877, "eval_rewards/rejected": -3.1764233112335205, "eval_runtime": 25.539, "eval_samples_per_second": 29.484, "eval_steps_per_second": 3.72, "step": 300 }, { "epoch": 0.2664374731413838, "grad_norm": 0.8472572565078735, "learning_rate": 4.491324795060491e-06, "logits/chosen": 1.4461088180541992, "logits/rejected": 0.49669915437698364, "logps/chosen": -0.7694377899169922, "logps/rejected": -2.362783432006836, "loss": 0.5091, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1541565656661987, "rewards/margins": 2.390018939971924, "rewards/rejected": -3.544174909591675, "step": 310 }, { "epoch": 0.2750322303394929, "grad_norm": 0.41847530007362366, "learning_rate": 4.4592336433146e-06, "logits/chosen": 2.172646999359131, "logits/rejected": 1.0526962280273438, "logps/chosen": -0.7410945296287537, "logps/rejected": -1.9158353805541992, "loss": 0.5352, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1116416454315186, "rewards/margins": 1.7621114253997803, "rewards/rejected": -2.873753070831299, "step": 320 }, { "epoch": 0.28362698753760207, "grad_norm": 1.7422096729278564, "learning_rate": 4.426283106939474e-06, "logits/chosen": 2.611234188079834, "logits/rejected": 1.7068111896514893, "logps/chosen": -0.8319486379623413, "logps/rejected": -2.32024884223938, "loss": 0.5397, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2479230165481567, "rewards/margins": 2.232450008392334, "rewards/rejected": -3.480372905731201, "step": 330 }, { "epoch": 0.2922217447357112, "grad_norm": 0.8699240684509277, "learning_rate": 4.3924876391293915e-06, "logits/chosen": 1.996747612953186, "logits/rejected": 1.1473515033721924, "logps/chosen": -0.8445833921432495, "logps/rejected": -2.675687551498413, "loss": 0.4817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2668750286102295, "rewards/margins": 2.7466559410095215, "rewards/rejected": -4.01353120803833, "step": 340 }, { "epoch": 0.30081650193382037, "grad_norm": 2.089289426803589, "learning_rate": 4.357862063693486e-06, "logits/chosen": 1.7134803533554077, "logits/rejected": 1.3000510931015015, "logps/chosen": -0.8976927995681763, "logps/rejected": -2.1593873500823975, "loss": 0.5098, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3465392589569092, "rewards/margins": 1.8925418853759766, "rewards/rejected": -3.2390809059143066, "step": 350 }, { "epoch": 0.30081650193382037, "eval_logits/chosen": 1.6772903203964233, "eval_logits/rejected": 1.2370609045028687, "eval_logps/chosen": -0.9737761616706848, "eval_logps/rejected": -3.1528680324554443, "eval_loss": 0.5162621736526489, "eval_rewards/accuracies": 0.7263157963752747, "eval_rewards/chosen": -1.46066415309906, "eval_rewards/margins": 3.2686376571655273, "eval_rewards/rejected": -4.729301929473877, "eval_runtime": 25.4163, "eval_samples_per_second": 29.627, "eval_steps_per_second": 3.738, "step": 350 }, { "epoch": 0.3094112591319295, "grad_norm": 0.47079572081565857, "learning_rate": 4.322421568553529e-06, "logits/chosen": 1.9561872482299805, "logits/rejected": 0.8960329294204712, "logps/chosen": -0.9378088712692261, "logps/rejected": -2.8065876960754395, "loss": 0.5046, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4067132472991943, "rewards/margins": 2.8031680583953857, "rewards/rejected": -4.209881782531738, "step": 360 }, { "epoch": 0.31800601633003867, "grad_norm": 0.6202365159988403, "learning_rate": 4.286181699082008e-06, "logits/chosen": 2.152726411819458, "logits/rejected": 1.4309433698654175, "logps/chosen": -1.007157564163208, "logps/rejected": -3.3813462257385254, "loss": 0.4526, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5107364654541016, "rewards/margins": 3.561283588409424, "rewards/rejected": -5.072019577026367, "step": 370 }, { "epoch": 0.3266007735281478, "grad_norm": 1.080393671989441, "learning_rate": 4.249158351283414e-06, "logits/chosen": 1.7528371810913086, "logits/rejected": 1.3293968439102173, "logps/chosen": -1.0258004665374756, "logps/rejected": -2.984057903289795, "loss": 0.4879, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5387006998062134, "rewards/margins": 2.9373860359191895, "rewards/rejected": -4.476086616516113, "step": 380 }, { "epoch": 0.33519553072625696, "grad_norm": 1.4520032405853271, "learning_rate": 4.211367764821722e-06, "logits/chosen": 3.061373233795166, "logits/rejected": 2.0103466510772705, "logps/chosen": -1.0191391706466675, "logps/rejected": -2.9054081439971924, "loss": 0.4776, "rewards/accuracies": 0.625, "rewards/chosen": -1.5287089347839355, "rewards/margins": 2.8294031620025635, "rewards/rejected": -4.358112335205078, "step": 390 }, { "epoch": 0.3437902879243661, "grad_norm": 0.5479139089584351, "learning_rate": 4.172826515897146e-06, "logits/chosen": 2.8395092487335205, "logits/rejected": 2.0935282707214355, "logps/chosen": -1.0769506692886353, "logps/rejected": -3.11635160446167, "loss": 0.4686, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6154258251190186, "rewards/margins": 3.0591015815734863, "rewards/rejected": -4.674527168273926, "step": 400 }, { "epoch": 0.3437902879243661, "eval_logits/chosen": 2.5064592361450195, "eval_logits/rejected": 2.108433485031128, "eval_logps/chosen": -1.1957285404205322, "eval_logps/rejected": -3.7678382396698, "eval_loss": 0.46578800678253174, "eval_rewards/accuracies": 0.7368420958518982, "eval_rewards/chosen": -1.793592929840088, "eval_rewards/margins": 3.8581647872924805, "eval_rewards/rejected": -5.651757717132568, "eval_runtime": 25.415, "eval_samples_per_second": 29.628, "eval_steps_per_second": 3.738, "step": 400 }, { "epoch": 0.3523850451224753, "grad_norm": 0.9966821670532227, "learning_rate": 4.133551509975264e-06, "logits/chosen": 2.6411917209625244, "logits/rejected": 1.8634885549545288, "logps/chosen": -1.0934125185012817, "logps/rejected": -3.2207794189453125, "loss": 0.4335, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6401188373565674, "rewards/margins": 3.1910502910614014, "rewards/rejected": -4.831169128417969, "step": 410 }, { "epoch": 0.36097980232058446, "grad_norm": 0.6384722590446472, "learning_rate": 4.093559974371725e-06, "logits/chosen": 3.1368844509124756, "logits/rejected": 2.3800251483917236, "logps/chosen": -1.2108217477798462, "logps/rejected": -3.484806537628174, "loss": 0.4543, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.816232681274414, "rewards/margins": 3.4109771251678467, "rewards/rejected": -5.227209568023682, "step": 420 }, { "epoch": 0.3695745595186936, "grad_norm": 0.856741726398468, "learning_rate": 4.052869450695776e-06, "logits/chosen": 3.155728816986084, "logits/rejected": 2.257838726043701, "logps/chosen": -1.4214586019515991, "logps/rejected": -4.186622619628906, "loss": 0.4091, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.132187604904175, "rewards/margins": 4.1477460861206055, "rewards/rejected": -6.279933929443359, "step": 430 }, { "epoch": 0.37816931671680276, "grad_norm": 1.3310774564743042, "learning_rate": 4.011497787155938e-06, "logits/chosen": 1.9942185878753662, "logits/rejected": 1.6246827840805054, "logps/chosen": -1.8575637340545654, "logps/rejected": -4.5355329513549805, "loss": 0.3995, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.7863457202911377, "rewards/margins": 4.016953945159912, "rewards/rejected": -6.8032989501953125, "step": 440 }, { "epoch": 0.3867640739149119, "grad_norm": 2.0849101543426514, "learning_rate": 3.969463130731183e-06, "logits/chosen": 2.406555652618408, "logits/rejected": 2.0490009784698486, "logps/chosen": -2.392570972442627, "logps/rejected": -5.055584907531738, "loss": 0.3671, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.588855743408203, "rewards/margins": 3.994520902633667, "rewards/rejected": -7.583376884460449, "step": 450 }, { "epoch": 0.3867640739149119, "eval_logits/chosen": 2.2324020862579346, "eval_logits/rejected": 2.365755319595337, "eval_logps/chosen": -2.736898422241211, "eval_logps/rejected": -5.73967170715332, "eval_loss": 0.3965117633342743, "eval_rewards/accuracies": 0.8736842274665833, "eval_rewards/chosen": -4.105347633361816, "eval_rewards/margins": 4.504159927368164, "eval_rewards/rejected": -8.60950756072998, "eval_runtime": 25.428, "eval_samples_per_second": 29.613, "eval_steps_per_second": 3.736, "step": 450 }, { "epoch": 0.39535883111302106, "grad_norm": 2.223949432373047, "learning_rate": 3.92678391921108e-06, "logits/chosen": 2.651564598083496, "logits/rejected": 2.383842945098877, "logps/chosen": -2.591308355331421, "logps/rejected": -5.308972358703613, "loss": 0.3412, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.886962413787842, "rewards/margins": 4.07649564743042, "rewards/rejected": -7.963458061218262, "step": 460 }, { "epoch": 0.4039535883111302, "grad_norm": 3.110624074935913, "learning_rate": 3.88347887310836e-06, "logits/chosen": 2.5435309410095215, "logits/rejected": 2.46763277053833, "logps/chosen": -2.413583993911743, "logps/rejected": -5.543262481689453, "loss": 0.3832, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.620375871658325, "rewards/margins": 4.694517135620117, "rewards/rejected": -8.314892768859863, "step": 470 }, { "epoch": 0.41254834550923936, "grad_norm": 1.6255794763565063, "learning_rate": 3.839566987447492e-06, "logits/chosen": 3.842928409576416, "logits/rejected": 3.5797982215881348, "logps/chosen": -2.6448044776916504, "logps/rejected": -4.98160982131958, "loss": 0.3547, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9672069549560547, "rewards/margins": 3.5052082538604736, "rewards/rejected": -7.472414493560791, "step": 480 }, { "epoch": 0.4211431027073485, "grad_norm": 2.9274284839630127, "learning_rate": 3.795067523432826e-06, "logits/chosen": 3.3297150135040283, "logits/rejected": 3.0205535888671875, "logps/chosen": -2.811923027038574, "logps/rejected": -6.040881156921387, "loss": 0.3097, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.217884063720703, "rewards/margins": 4.843437194824219, "rewards/rejected": -9.061322212219238, "step": 490 }, { "epoch": 0.42973785990545765, "grad_norm": 2.9143636226654053, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 2.760014772415161, "logits/rejected": 2.535520315170288, "logps/chosen": -3.068406820297241, "logps/rejected": -5.877435684204102, "loss": 0.3031, "rewards/accuracies": 0.875, "rewards/chosen": -4.602609634399414, "rewards/margins": 4.21354341506958, "rewards/rejected": -8.816153526306152, "step": 500 }, { "epoch": 0.42973785990545765, "eval_logits/chosen": 2.0952131748199463, "eval_logits/rejected": 2.1864659786224365, "eval_logps/chosen": -3.392296075820923, "eval_logps/rejected": -6.948195457458496, "eval_loss": 0.33660775423049927, "eval_rewards/accuracies": 0.9263157844543457, "eval_rewards/chosen": -5.088444232940674, "eval_rewards/margins": 5.3338494300842285, "eval_rewards/rejected": -10.422293663024902, "eval_runtime": 25.4226, "eval_samples_per_second": 29.619, "eval_steps_per_second": 3.737, "step": 500 }, { "epoch": 0.4383326171035668, "grad_norm": 2.563810348510742, "learning_rate": 3.7043841852542884e-06, "logits/chosen": 2.950286388397217, "logits/rejected": 2.619025945663452, "logps/chosen": -3.237391710281372, "logps/rejected": -5.953216552734375, "loss": 0.318, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.856087684631348, "rewards/margins": 4.073737144470215, "rewards/rejected": -8.929824829101562, "step": 510 }, { "epoch": 0.44692737430167595, "grad_norm": 2.0339434146881104, "learning_rate": 3.658240087799655e-06, "logits/chosen": 2.987595558166504, "logits/rejected": 2.6243975162506104, "logps/chosen": -3.5633530616760254, "logps/rejected": -7.0458879470825195, "loss": 0.3053, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.345029354095459, "rewards/margins": 5.223802089691162, "rewards/rejected": -10.568831443786621, "step": 520 }, { "epoch": 0.45552213149978515, "grad_norm": 4.091029644012451, "learning_rate": 3.611587947962319e-06, "logits/chosen": 2.297576904296875, "logits/rejected": 2.0218777656555176, "logps/chosen": -3.297245502471924, "logps/rejected": -6.101919651031494, "loss": 0.3255, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.945868015289307, "rewards/margins": 4.207010746002197, "rewards/rejected": -9.152878761291504, "step": 530 }, { "epoch": 0.4641168886978943, "grad_norm": 2.7896900177001953, "learning_rate": 3.564448228912682e-06, "logits/chosen": 2.103950023651123, "logits/rejected": 1.9478647708892822, "logps/chosen": -2.9360263347625732, "logps/rejected": -6.406435489654541, "loss": 0.3361, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.40403938293457, "rewards/margins": 5.20561408996582, "rewards/rejected": -9.60965347290039, "step": 540 }, { "epoch": 0.47271164589600345, "grad_norm": 2.657970905303955, "learning_rate": 3.516841607689501e-06, "logits/chosen": 2.1658639907836914, "logits/rejected": 2.214900493621826, "logps/chosen": -3.084073066711426, "logps/rejected": -6.935500144958496, "loss": 0.2928, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.626110076904297, "rewards/margins": 5.7771406173706055, "rewards/rejected": -10.403249740600586, "step": 550 }, { "epoch": 0.47271164589600345, "eval_logits/chosen": 2.285294771194458, "eval_logits/rejected": 2.3312103748321533, "eval_logps/chosen": -3.35794997215271, "eval_logps/rejected": -7.37537145614624, "eval_loss": 0.3121817409992218, "eval_rewards/accuracies": 0.9263157844543457, "eval_rewards/chosen": -5.036925792694092, "eval_rewards/margins": 6.026132106781006, "eval_rewards/rejected": -11.063057899475098, "eval_runtime": 25.4015, "eval_samples_per_second": 29.644, "eval_steps_per_second": 3.74, "step": 550 }, { "epoch": 0.4813064030941126, "grad_norm": 2.940019369125366, "learning_rate": 3.4687889661302577e-06, "logits/chosen": 1.9122416973114014, "logits/rejected": 1.9943454265594482, "logps/chosen": -3.27177095413208, "logps/rejected": -7.023342132568359, "loss": 0.3105, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.907656669616699, "rewards/margins": 5.6273579597473145, "rewards/rejected": -10.535014152526855, "step": 560 }, { "epoch": 0.48990116029222175, "grad_norm": 1.8887412548065186, "learning_rate": 3.4203113817116955e-06, "logits/chosen": 2.274843692779541, "logits/rejected": 2.392199993133545, "logps/chosen": -3.383749008178711, "logps/rejected": -7.265415191650391, "loss": 0.3003, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.075623512268066, "rewards/margins": 5.8224992752075195, "rewards/rejected": -10.898123741149902, "step": 570 }, { "epoch": 0.4984959174903309, "grad_norm": 1.6364414691925049, "learning_rate": 3.3714301183045382e-06, "logits/chosen": 2.423910617828369, "logits/rejected": 2.244985818862915, "logps/chosen": -3.0959205627441406, "logps/rejected": -6.822405815124512, "loss": 0.2471, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.643880844116211, "rewards/margins": 5.58972692489624, "rewards/rejected": -10.233609199523926, "step": 580 }, { "epoch": 0.50709067468844, "grad_norm": 2.6540188789367676, "learning_rate": 3.3221666168464584e-06, "logits/chosen": 2.8146812915802, "logits/rejected": 2.5971922874450684, "logps/chosen": -4.139407157897949, "logps/rejected": -7.71649694442749, "loss": 0.2809, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.209111213684082, "rewards/margins": 5.365634441375732, "rewards/rejected": -11.574746131896973, "step": 590 }, { "epoch": 0.5156854318865493, "grad_norm": 4.229885578155518, "learning_rate": 3.272542485937369e-06, "logits/chosen": 2.2735249996185303, "logits/rejected": 1.8577899932861328, "logps/chosen": -3.731342315673828, "logps/rejected": -7.2900390625, "loss": 0.2956, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.5970139503479, "rewards/margins": 5.338044166564941, "rewards/rejected": -10.93505859375, "step": 600 }, { "epoch": 0.5156854318865493, "eval_logits/chosen": 2.3333992958068848, "eval_logits/rejected": 2.529745578765869, "eval_logps/chosen": -3.679597854614258, "eval_logps/rejected": -7.917842864990234, "eval_loss": 0.3030374050140381, "eval_rewards/accuracies": 0.9263157844543457, "eval_rewards/chosen": -5.519396781921387, "eval_rewards/margins": 6.357367992401123, "eval_rewards/rejected": -11.876765251159668, "eval_runtime": 25.5622, "eval_samples_per_second": 29.458, "eval_steps_per_second": 3.716, "step": 600 }, { "epoch": 0.5242801890846583, "grad_norm": 2.657008647918701, "learning_rate": 3.222579492361179e-06, "logits/chosen": 2.699007034301758, "logits/rejected": 2.731860876083374, "logps/chosen": -3.3311946392059326, "logps/rejected": -7.005735874176025, "loss": 0.2898, "rewards/accuracies": 0.9375, "rewards/chosen": -4.996791839599609, "rewards/margins": 5.511812686920166, "rewards/rejected": -10.508604049682617, "step": 610 }, { "epoch": 0.5328749462827675, "grad_norm": 3.046638250350952, "learning_rate": 3.1722995515381644e-06, "logits/chosen": 2.7617671489715576, "logits/rejected": 2.7338194847106934, "logps/chosen": -3.336381435394287, "logps/rejected": -7.058961391448975, "loss": 0.2895, "rewards/accuracies": 0.9375, "rewards/chosen": -5.004572868347168, "rewards/margins": 5.583868980407715, "rewards/rejected": -10.588441848754883, "step": 620 }, { "epoch": 0.5414697034808766, "grad_norm": 2.342069387435913, "learning_rate": 3.121724717912138e-06, "logits/chosen": 2.5818216800689697, "logits/rejected": 1.987378716468811, "logps/chosen": -3.0970518589019775, "logps/rejected": -6.240235805511475, "loss": 0.2634, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.645577430725098, "rewards/margins": 4.714776039123535, "rewards/rejected": -9.36035442352295, "step": 630 }, { "epoch": 0.5500644606789858, "grad_norm": 1.9333513975143433, "learning_rate": 3.0708771752766397e-06, "logits/chosen": 2.911674737930298, "logits/rejected": 2.7606472969055176, "logps/chosen": -3.2809441089630127, "logps/rejected": -7.210829257965088, "loss": 0.2594, "rewards/accuracies": 0.9375, "rewards/chosen": -4.921416282653809, "rewards/margins": 5.894827365875244, "rewards/rejected": -10.816244125366211, "step": 640 }, { "epoch": 0.5586592178770949, "grad_norm": 5.659445285797119, "learning_rate": 3.019779227044398e-06, "logits/chosen": 2.4733409881591797, "logits/rejected": 2.102668285369873, "logps/chosen": -3.4448726177215576, "logps/rejected": -7.304962158203125, "loss": 0.2399, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.167309284210205, "rewards/margins": 5.790134429931641, "rewards/rejected": -10.957443237304688, "step": 650 }, { "epoch": 0.5586592178770949, "eval_logits/chosen": 2.482032537460327, "eval_logits/rejected": 2.66147780418396, "eval_logps/chosen": -3.728013515472412, "eval_logps/rejected": -8.231985092163086, "eval_loss": 0.2814938426017761, "eval_rewards/accuracies": 0.9263157844543457, "eval_rewards/chosen": -5.592020511627197, "eval_rewards/margins": 6.75595760345459, "eval_rewards/rejected": -12.347977638244629, "eval_runtime": 25.4252, "eval_samples_per_second": 29.616, "eval_steps_per_second": 3.736, "step": 650 }, { "epoch": 0.5672539750752041, "grad_norm": 2.189638137817383, "learning_rate": 2.9684532864643123e-06, "logits/chosen": 2.875077962875366, "logits/rejected": 2.712646484375, "logps/chosen": -3.757338762283325, "logps/rejected": -6.6974897384643555, "loss": 0.2759, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.636007785797119, "rewards/margins": 4.410226821899414, "rewards/rejected": -10.046236038208008, "step": 660 }, { "epoch": 0.5758487322733132, "grad_norm": 3.5755774974823, "learning_rate": 2.9169218667902562e-06, "logits/chosen": 2.9562981128692627, "logits/rejected": 2.7660539150238037, "logps/chosen": -3.2358715534210205, "logps/rejected": -6.90399169921875, "loss": 0.2586, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.853806972503662, "rewards/margins": 5.502181053161621, "rewards/rejected": -10.355987548828125, "step": 670 }, { "epoch": 0.5844434894714224, "grad_norm": 2.5616958141326904, "learning_rate": 2.8652075714060296e-06, "logits/chosen": 2.5067126750946045, "logits/rejected": 2.3888354301452637, "logps/chosen": -3.462563991546631, "logps/rejected": -6.964964866638184, "loss": 0.251, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.193846225738525, "rewards/margins": 5.253602027893066, "rewards/rejected": -10.447446823120117, "step": 680 }, { "epoch": 0.5930382466695315, "grad_norm": 2.964050531387329, "learning_rate": 2.813333083910761e-06, "logits/chosen": 2.659935474395752, "logits/rejected": 2.6573758125305176, "logps/chosen": -3.9107768535614014, "logps/rejected": -7.865903377532959, "loss": 0.2294, "rewards/accuracies": 0.9375, "rewards/chosen": -5.866166114807129, "rewards/margins": 5.9326887130737305, "rewards/rejected": -11.79885482788086, "step": 690 }, { "epoch": 0.6016330038676407, "grad_norm": 4.389697551727295, "learning_rate": 2.761321158169134e-06, "logits/chosen": 2.217245578765869, "logits/rejected": 2.421597957611084, "logps/chosen": -4.029661655426025, "logps/rejected": -8.073125839233398, "loss": 0.2469, "rewards/accuracies": 0.9375, "rewards/chosen": -6.044493675231934, "rewards/margins": 6.065195083618164, "rewards/rejected": -12.109688758850098, "step": 700 }, { "epoch": 0.6016330038676407, "eval_logits/chosen": 2.0770955085754395, "eval_logits/rejected": 2.3815462589263916, "eval_logps/chosen": -3.924149751663208, "eval_logps/rejected": -8.844257354736328, "eval_loss": 0.2584603726863861, "eval_rewards/accuracies": 0.9263157844543457, "eval_rewards/chosen": -5.886224746704102, "eval_rewards/margins": 7.380159854888916, "eval_rewards/rejected": -13.26638412475586, "eval_runtime": 25.4228, "eval_samples_per_second": 29.619, "eval_steps_per_second": 3.737, "step": 700 }, { "epoch": 0.6102277610657499, "grad_norm": 3.290154457092285, "learning_rate": 2.70919460833079e-06, "logits/chosen": 2.458578586578369, "logits/rejected": 2.275515079498291, "logps/chosen": -3.2734694480895996, "logps/rejected": -7.873226165771484, "loss": 0.2732, "rewards/accuracies": 0.9375, "rewards/chosen": -4.91020393371582, "rewards/margins": 6.899635314941406, "rewards/rejected": -11.809839248657227, "step": 710 }, { "epoch": 0.618822518263859, "grad_norm": 2.2760908603668213, "learning_rate": 2.6569762988232838e-06, "logits/chosen": 2.6856372356414795, "logits/rejected": 2.722838878631592, "logps/chosen": -3.589418411254883, "logps/rejected": -7.638446807861328, "loss": 0.2583, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.384127140045166, "rewards/margins": 6.073542594909668, "rewards/rejected": -11.457670211791992, "step": 720 }, { "epoch": 0.6274172754619682, "grad_norm": 6.937672138214111, "learning_rate": 2.604689134322999e-06, "logits/chosen": 2.928969383239746, "logits/rejected": 2.5493836402893066, "logps/chosen": -3.3862743377685547, "logps/rejected": -7.568005561828613, "loss": 0.2889, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.07941198348999, "rewards/margins": 6.27259635925293, "rewards/rejected": -11.352007865905762, "step": 730 }, { "epoch": 0.6360120326600773, "grad_norm": 2.1878838539123535, "learning_rate": 2.5523560497083927e-06, "logits/chosen": 2.3824827671051025, "logits/rejected": 2.257145404815674, "logps/chosen": -3.5448341369628906, "logps/rejected": -7.594444274902344, "loss": 0.1972, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.3172502517700195, "rewards/margins": 6.074415683746338, "rewards/rejected": -11.3916654586792, "step": 740 }, { "epoch": 0.6446067898581865, "grad_norm": 4.405832767486572, "learning_rate": 2.5e-06, "logits/chosen": 3.204157590866089, "logits/rejected": 3.0262837409973145, "logps/chosen": -3.67409086227417, "logps/rejected": -8.078901290893555, "loss": 0.2282, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.511136531829834, "rewards/margins": 6.607214450836182, "rewards/rejected": -12.118351936340332, "step": 750 }, { "epoch": 0.6446067898581865, "eval_logits/chosen": 2.1246254444122314, "eval_logits/rejected": 2.4088852405548096, "eval_logps/chosen": -4.221064567565918, "eval_logps/rejected": -9.4141206741333, "eval_loss": 0.2537557780742645, "eval_rewards/accuracies": 0.9368420839309692, "eval_rewards/chosen": -6.331596374511719, "eval_rewards/margins": 7.789584159851074, "eval_rewards/rejected": -14.121179580688477, "eval_runtime": 25.436, "eval_samples_per_second": 29.604, "eval_steps_per_second": 3.735, "step": 750 }, { "epoch": 0.6532015470562956, "grad_norm": 2.8693907260894775, "learning_rate": 2.447643950291608e-06, "logits/chosen": 2.5033986568450928, "logits/rejected": 2.2746779918670654, "logps/chosen": -4.256644248962402, "logps/rejected": -8.564817428588867, "loss": 0.2337, "rewards/accuracies": 0.9375, "rewards/chosen": -6.3849663734436035, "rewards/margins": 6.462259769439697, "rewards/rejected": -12.8472261428833, "step": 760 }, { "epoch": 0.6617963042544048, "grad_norm": 4.912906646728516, "learning_rate": 2.3953108656770018e-06, "logits/chosen": 2.861431837081909, "logits/rejected": 2.974611759185791, "logps/chosen": -3.9564735889434814, "logps/rejected": -7.863286018371582, "loss": 0.2585, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.934710502624512, "rewards/margins": 5.860217571258545, "rewards/rejected": -11.794927597045898, "step": 770 }, { "epoch": 0.6703910614525139, "grad_norm": 3.215716600418091, "learning_rate": 2.3430237011767166e-06, "logits/chosen": 1.9008615016937256, "logits/rejected": 1.9049352407455444, "logps/chosen": -4.304060935974121, "logps/rejected": -8.806629180908203, "loss": 0.2279, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.456091403961182, "rewards/margins": 6.753852844238281, "rewards/rejected": -13.209943771362305, "step": 780 }, { "epoch": 0.6789858186506231, "grad_norm": 3.8724021911621094, "learning_rate": 2.290805391669212e-06, "logits/chosen": 2.2521636486053467, "logits/rejected": 2.2159788608551025, "logps/chosen": -4.012774467468262, "logps/rejected": -8.53366470336914, "loss": 0.2437, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.019161701202393, "rewards/margins": 6.78133487701416, "rewards/rejected": -12.800497055053711, "step": 790 }, { "epoch": 0.6875805758487322, "grad_norm": 3.56345796585083, "learning_rate": 2.238678841830867e-06, "logits/chosen": 2.0579304695129395, "logits/rejected": 2.304316997528076, "logps/chosen": -3.590430736541748, "logps/rejected": -8.182169914245605, "loss": 0.213, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.385646820068359, "rewards/margins": 6.887608528137207, "rewards/rejected": -12.27325439453125, "step": 800 }, { "epoch": 0.6875805758487322, "eval_logits/chosen": 2.228646755218506, "eval_logits/rejected": 2.444817543029785, "eval_logps/chosen": -3.8403449058532715, "eval_logps/rejected": -9.179658889770508, "eval_loss": 0.23895224928855896, "eval_rewards/accuracies": 0.9368420839309692, "eval_rewards/chosen": -5.76051664352417, "eval_rewards/margins": 8.00897216796875, "eval_rewards/rejected": -13.769490242004395, "eval_runtime": 25.3925, "eval_samples_per_second": 29.654, "eval_steps_per_second": 3.741, "step": 800 }, { "epoch": 0.6961753330468414, "grad_norm": 3.4880526065826416, "learning_rate": 2.186666916089239e-06, "logits/chosen": 1.7993383407592773, "logits/rejected": 1.754417061805725, "logps/chosen": -4.045234680175781, "logps/rejected": -8.927519798278809, "loss": 0.2391, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.067852020263672, "rewards/margins": 7.323427677154541, "rewards/rejected": -13.391279220581055, "step": 810 }, { "epoch": 0.7047700902449506, "grad_norm": 3.56809139251709, "learning_rate": 2.134792428593971e-06, "logits/chosen": 2.9591994285583496, "logits/rejected": 2.960444211959839, "logps/chosen": -4.150156497955322, "logps/rejected": -8.512441635131836, "loss": 0.1972, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -6.225234031677246, "rewards/margins": 6.54342794418335, "rewards/rejected": -12.768662452697754, "step": 820 }, { "epoch": 0.7133648474430597, "grad_norm": 4.127833843231201, "learning_rate": 2.0830781332097446e-06, "logits/chosen": 3.008269786834717, "logits/rejected": 2.63409686088562, "logps/chosen": -3.8291163444519043, "logps/rejected": -8.657347679138184, "loss": 0.2161, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.743674278259277, "rewards/margins": 7.24234676361084, "rewards/rejected": -12.986021041870117, "step": 830 }, { "epoch": 0.7219596046411689, "grad_norm": 4.475767612457275, "learning_rate": 2.031546713535688e-06, "logits/chosen": 2.7164976596832275, "logits/rejected": 2.5976195335388184, "logps/chosen": -4.153134346008301, "logps/rejected": -8.893486022949219, "loss": 0.1895, "rewards/accuracies": 0.9375, "rewards/chosen": -6.229701519012451, "rewards/margins": 7.110527992248535, "rewards/rejected": -13.340228080749512, "step": 840 }, { "epoch": 0.730554361839278, "grad_norm": 4.190205097198486, "learning_rate": 1.9802207729556023e-06, "logits/chosen": 2.6235451698303223, "logits/rejected": 2.5486202239990234, "logps/chosen": -3.899543046951294, "logps/rejected": -8.277327537536621, "loss": 0.2239, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.8493146896362305, "rewards/margins": 6.566677093505859, "rewards/rejected": -12.415990829467773, "step": 850 }, { "epoch": 0.730554361839278, "eval_logits/chosen": 2.173233985900879, "eval_logits/rejected": 2.433162212371826, "eval_logps/chosen": -4.13487434387207, "eval_logps/rejected": -9.577596664428711, "eval_loss": 0.23591776192188263, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -6.2023115158081055, "eval_rewards/margins": 8.164085388183594, "eval_rewards/rejected": -14.3663969039917, "eval_runtime": 25.4513, "eval_samples_per_second": 29.586, "eval_steps_per_second": 3.733, "step": 850 }, { "epoch": 0.7391491190373872, "grad_norm": 2.6548664569854736, "learning_rate": 1.9291228247233607e-06, "logits/chosen": 1.7737414836883545, "logits/rejected": 2.080662965774536, "logps/chosen": -3.9447720050811768, "logps/rejected": -9.01865005493164, "loss": 0.2268, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.9171576499938965, "rewards/margins": 7.610815525054932, "rewards/rejected": -13.527974128723145, "step": 860 }, { "epoch": 0.7477438762354963, "grad_norm": 2.5912184715270996, "learning_rate": 1.8782752820878636e-06, "logits/chosen": 2.5428760051727295, "logits/rejected": 2.3569278717041016, "logps/chosen": -3.685049057006836, "logps/rejected": -9.194517135620117, "loss": 0.2001, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.527573585510254, "rewards/margins": 8.264203071594238, "rewards/rejected": -13.791775703430176, "step": 870 }, { "epoch": 0.7563386334336055, "grad_norm": 3.789594888687134, "learning_rate": 1.827700448461836e-06, "logits/chosen": 3.139338970184326, "logits/rejected": 3.003114700317383, "logps/chosen": -4.347461700439453, "logps/rejected": -8.560078620910645, "loss": 0.2257, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.521193027496338, "rewards/margins": 6.318924903869629, "rewards/rejected": -12.840118408203125, "step": 880 }, { "epoch": 0.7649333906317146, "grad_norm": 2.3799326419830322, "learning_rate": 1.7774205076388207e-06, "logits/chosen": 3.2622504234313965, "logits/rejected": 2.922945261001587, "logps/chosen": -4.306991100311279, "logps/rejected": -8.622769355773926, "loss": 0.2123, "rewards/accuracies": 0.9375, "rewards/chosen": -6.46048641204834, "rewards/margins": 6.473666191101074, "rewards/rejected": -12.93415355682373, "step": 890 }, { "epoch": 0.7735281478298238, "grad_norm": 3.4133400917053223, "learning_rate": 1.7274575140626318e-06, "logits/chosen": 2.8558292388916016, "logits/rejected": 2.919982433319092, "logps/chosen": -3.791405200958252, "logps/rejected": -9.348276138305664, "loss": 0.2345, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.687107086181641, "rewards/margins": 8.335307121276855, "rewards/rejected": -14.022415161132812, "step": 900 }, { "epoch": 0.7735281478298238, "eval_logits/chosen": 2.2851152420043945, "eval_logits/rejected": 2.5511629581451416, "eval_logps/chosen": -4.023584842681885, "eval_logps/rejected": -9.625852584838867, "eval_loss": 0.23031750321388245, "eval_rewards/accuracies": 0.9473684430122375, "eval_rewards/chosen": -6.035377025604248, "eval_rewards/margins": 8.403401374816895, "eval_rewards/rejected": -14.4387788772583, "eval_runtime": 25.4061, "eval_samples_per_second": 29.639, "eval_steps_per_second": 3.739, "step": 900 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.0700038789416878e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }