{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 100, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 301517.0427429078, "learning_rate": 1.5625e-08, "logits/chosen": -0.2715578079223633, "logits/rejected": -0.42230841517448425, "logps/chosen": -74.72806549072266, "logps/rejected": -86.24398040771484, "loss": 47111.2656, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.032, "grad_norm": 262907.88134103786, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -0.464042067527771, "logits/rejected": -0.4814835786819458, "logps/chosen": -98.76150512695312, "logps/rejected": -98.51900482177734, "loss": 46480.3472, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 3.484352646410116e-06, "rewards/margins": 4.8643836635164917e-05, "rewards/rejected": -4.5159493311075494e-05, "step": 10 }, { "epoch": 0.064, "grad_norm": 258146.1292254514, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -0.5679231882095337, "logits/rejected": -0.5402768850326538, "logps/chosen": -120.5081558227539, "logps/rejected": -118.08524322509766, "loss": 45353.8531, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.0015855863457545638, "rewards/margins": -0.00022199496743269265, "rewards/rejected": -0.001363591174595058, "step": 20 }, { "epoch": 0.096, "grad_norm": 272342.29803302046, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -0.6670567393302917, "logits/rejected": -0.6592522859573364, "logps/chosen": -117.73258209228516, "logps/rejected": -117.10823822021484, "loss": 45740.5375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0023494339548051357, "rewards/margins": 0.0008368989219889045, "rewards/rejected": -0.0031863327603787184, "step": 30 }, { "epoch": 0.128, "grad_norm": 254349.424821141, "learning_rate": 4.857142857142857e-07, "logits/chosen": -0.6472231149673462, "logits/rejected": -0.6133359670639038, "logps/chosen": -104.78807067871094, "logps/rejected": -102.49015045166016, "loss": 46397.3, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0014789658598601818, "rewards/margins": 0.0007001858321018517, "rewards/rejected": -0.0021791516337543726, "step": 40 }, { "epoch": 0.16, "grad_norm": 317871.6740038316, "learning_rate": 4.6785714285714283e-07, "logits/chosen": -0.5559561848640442, "logits/rejected": -0.4931167662143707, "logps/chosen": -105.31684875488281, "logps/rejected": -100.81905364990234, "loss": 46727.1, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0029151481576263905, "rewards/margins": -0.00013320180005393922, "rewards/rejected": -0.002781946212053299, "step": 50 }, { "epoch": 0.192, "grad_norm": 309432.48532645905, "learning_rate": 4.5e-07, "logits/chosen": -0.5389941930770874, "logits/rejected": -0.5341317653656006, "logps/chosen": -100.21482849121094, "logps/rejected": -101.88697814941406, "loss": 47186.6906, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005146821960806847, "rewards/margins": 0.0010053727310150862, "rewards/rejected": -0.0061521949246525764, "step": 60 }, { "epoch": 0.224, "grad_norm": 279882.5151706957, "learning_rate": 4.3214285714285713e-07, "logits/chosen": -0.6586358547210693, "logits/rejected": -0.642874538898468, "logps/chosen": -103.6421890258789, "logps/rejected": -106.9367904663086, "loss": 47560.4625, "rewards/accuracies": 0.5, "rewards/chosen": -0.0009041793528012931, "rewards/margins": 0.0008955754456110299, "rewards/rejected": -0.001799754798412323, "step": 70 }, { "epoch": 0.256, "grad_norm": 307667.6742927268, "learning_rate": 4.142857142857143e-07, "logits/chosen": -0.6012131571769714, "logits/rejected": -0.608718991279602, "logps/chosen": -102.5672378540039, "logps/rejected": -102.2068099975586, "loss": 47294.5156, "rewards/accuracies": 0.625, "rewards/chosen": -0.0008262965711764991, "rewards/margins": 0.001540123368613422, "rewards/rejected": -0.00236641988158226, "step": 80 }, { "epoch": 0.288, "grad_norm": 272828.7827659401, "learning_rate": 3.9642857142857137e-07, "logits/chosen": -0.5353714823722839, "logits/rejected": -0.5272339582443237, "logps/chosen": -89.8922348022461, "logps/rejected": -96.12710571289062, "loss": 46162.6719, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0036609251983463764, "rewards/margins": 0.0021247321274131536, "rewards/rejected": -0.005785657558590174, "step": 90 }, { "epoch": 0.32, "grad_norm": 268429.20188699494, "learning_rate": 3.785714285714285e-07, "logits/chosen": -0.6908645629882812, "logits/rejected": -0.6659768223762512, "logps/chosen": -103.90727233886719, "logps/rejected": -104.8873291015625, "loss": 46836.85, "rewards/accuracies": 0.625, "rewards/chosen": -0.0008818693459033966, "rewards/margins": 0.0011547221802175045, "rewards/rejected": -0.002036591526120901, "step": 100 }, { "epoch": 0.352, "grad_norm": 269671.9409354351, "learning_rate": 3.607142857142857e-07, "logits/chosen": -0.5391483902931213, "logits/rejected": -0.518116295337677, "logps/chosen": -73.18064880371094, "logps/rejected": -74.66477966308594, "loss": 46931.35, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.000710971187800169, "rewards/margins": -0.00017498522356618196, "rewards/rejected": -0.0005359860369935632, "step": 110 }, { "epoch": 0.384, "grad_norm": 283739.571196758, "learning_rate": 3.4285714285714286e-07, "logits/chosen": -0.6659616827964783, "logits/rejected": -0.7110891938209534, "logps/chosen": -105.9618148803711, "logps/rejected": -113.90108489990234, "loss": 47461.1406, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0007265805033966899, "rewards/margins": 0.0006529040401801467, "rewards/rejected": -0.001379484310746193, "step": 120 }, { "epoch": 0.416, "grad_norm": 356192.57857636997, "learning_rate": 3.25e-07, "logits/chosen": -0.5708358287811279, "logits/rejected": -0.5911769866943359, "logps/chosen": -109.13777923583984, "logps/rejected": -111.34733581542969, "loss": 48096.3938, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0033053618390113115, "rewards/margins": 0.0012148560490459204, "rewards/rejected": -0.004520217888057232, "step": 130 }, { "epoch": 0.448, "grad_norm": 335211.4928609446, "learning_rate": 3.0714285714285716e-07, "logits/chosen": -0.6561946868896484, "logits/rejected": -0.6476176977157593, "logps/chosen": -123.64212799072266, "logps/rejected": -123.40422058105469, "loss": 45940.975, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.003593811299651861, "rewards/margins": 0.00053530337754637, "rewards/rejected": -0.004129114560782909, "step": 140 }, { "epoch": 0.48, "grad_norm": 305395.21981975477, "learning_rate": 2.892857142857143e-07, "logits/chosen": -0.5857366323471069, "logits/rejected": -0.5595449209213257, "logps/chosen": -93.74351501464844, "logps/rejected": -93.7242660522461, "loss": 47410.8, "rewards/accuracies": 0.625, "rewards/chosen": -0.002368563786149025, "rewards/margins": 0.000440702453488484, "rewards/rejected": -0.0028092663269490004, "step": 150 }, { "epoch": 0.512, "grad_norm": 332600.8895494031, "learning_rate": 2.714285714285714e-07, "logits/chosen": -0.5348027944564819, "logits/rejected": -0.5600031614303589, "logps/chosen": -93.50711059570312, "logps/rejected": -97.4880599975586, "loss": 47739.1156, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0032862056978046894, "rewards/margins": 0.001124653615988791, "rewards/rejected": -0.004410859197378159, "step": 160 }, { "epoch": 0.544, "grad_norm": 331388.7328009726, "learning_rate": 2.5357142857142855e-07, "logits/chosen": -0.615580677986145, "logits/rejected": -0.6166919469833374, "logps/chosen": -113.16961669921875, "logps/rejected": -116.9697494506836, "loss": 47041.25, "rewards/accuracies": 0.625, "rewards/chosen": -0.0033803496044129133, "rewards/margins": 0.0007489208364859223, "rewards/rejected": -0.0041292705573141575, "step": 170 }, { "epoch": 0.576, "grad_norm": 332044.78524281725, "learning_rate": 2.357142857142857e-07, "logits/chosen": -0.6037168502807617, "logits/rejected": -0.6863250136375427, "logps/chosen": -119.31378173828125, "logps/rejected": -126.0621566772461, "loss": 47535.0344, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.002967274049296975, "rewards/margins": -0.0003730076423380524, "rewards/rejected": -0.002594266552478075, "step": 180 }, { "epoch": 0.608, "grad_norm": 319413.8319562671, "learning_rate": 2.1785714285714284e-07, "logits/chosen": -0.7273733019828796, "logits/rejected": -0.7188557982444763, "logps/chosen": -115.4392318725586, "logps/rejected": -112.0301284790039, "loss": 46876.4094, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.002623769221827388, "rewards/margins": 0.001058573485352099, "rewards/rejected": -0.003682342590764165, "step": 190 }, { "epoch": 0.64, "grad_norm": 314346.2695610602, "learning_rate": 2e-07, "logits/chosen": -0.6322755813598633, "logits/rejected": -0.5915661454200745, "logps/chosen": -100.2895278930664, "logps/rejected": -93.4193344116211, "loss": 47579.9, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.004441672004759312, "rewards/margins": 0.00043794940575025976, "rewards/rejected": -0.00487962132319808, "step": 200 }, { "epoch": 0.672, "grad_norm": 350476.79993683187, "learning_rate": 1.8214285714285714e-07, "logits/chosen": -0.6528446078300476, "logits/rejected": -0.6807696223258972, "logps/chosen": -88.17680358886719, "logps/rejected": -88.77709197998047, "loss": 48939.0469, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.001382711692713201, "rewards/margins": 0.0008936499943956733, "rewards/rejected": -0.002276361919939518, "step": 210 }, { "epoch": 0.704, "grad_norm": 333610.0532688813, "learning_rate": 1.6428571428571429e-07, "logits/chosen": -0.7356145977973938, "logits/rejected": -0.7182696461677551, "logps/chosen": -132.14749145507812, "logps/rejected": -126.3568344116211, "loss": 47693.8156, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0011152013903483748, "rewards/margins": 0.0019210099708288908, "rewards/rejected": -0.003036211710423231, "step": 220 }, { "epoch": 0.736, "grad_norm": 298046.1912209505, "learning_rate": 1.4642857142857143e-07, "logits/chosen": -0.5968427658081055, "logits/rejected": -0.5768970847129822, "logps/chosen": -94.10835266113281, "logps/rejected": -101.12223815917969, "loss": 46278.9875, "rewards/accuracies": 0.5, "rewards/chosen": -0.0007765673799440265, "rewards/margins": 0.0012700657825917006, "rewards/rejected": -0.002046633278951049, "step": 230 }, { "epoch": 0.768, "grad_norm": 288133.9839915777, "learning_rate": 1.2857142857142855e-07, "logits/chosen": -0.7007887363433838, "logits/rejected": -0.6801734566688538, "logps/chosen": -129.5726776123047, "logps/rejected": -131.34750366210938, "loss": 46299.8406, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0021117436699569225, "rewards/margins": 0.0007607269217260182, "rewards/rejected": -0.00287247053347528, "step": 240 }, { "epoch": 0.8, "grad_norm": 311647.07184519153, "learning_rate": 1.107142857142857e-07, "logits/chosen": -0.6636364459991455, "logits/rejected": -0.6806343793869019, "logps/chosen": -126.0498275756836, "logps/rejected": -129.86505126953125, "loss": 46564.025, "rewards/accuracies": 0.625, "rewards/chosen": -0.0026524278800934553, "rewards/margins": 0.0016201415564864874, "rewards/rejected": -0.00427256990224123, "step": 250 }, { "epoch": 0.832, "grad_norm": 332873.16670732293, "learning_rate": 9.285714285714286e-08, "logits/chosen": -0.6928391456604004, "logits/rejected": -0.7199726700782776, "logps/chosen": -91.19033813476562, "logps/rejected": -99.61897277832031, "loss": 45619.1937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0015564201166853309, "rewards/margins": 0.002241902519017458, "rewards/rejected": -0.0037983227521181107, "step": 260 }, { "epoch": 0.864, "grad_norm": 321461.36179731, "learning_rate": 7.5e-08, "logits/chosen": -0.6582412123680115, "logits/rejected": -0.6260276436805725, "logps/chosen": -100.95478820800781, "logps/rejected": -103.7972412109375, "loss": 47785.325, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0012073480756953359, "rewards/margins": 0.0029904134571552277, "rewards/rejected": -0.004197761416435242, "step": 270 }, { "epoch": 0.896, "grad_norm": 369055.42210931255, "learning_rate": 5.714285714285714e-08, "logits/chosen": -0.8000235557556152, "logits/rejected": -0.8175070881843567, "logps/chosen": -112.53459167480469, "logps/rejected": -116.30067443847656, "loss": 47248.0156, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0023680843878537416, "rewards/margins": 0.001865853788331151, "rewards/rejected": -0.004233937710523605, "step": 280 }, { "epoch": 0.928, "grad_norm": 337372.99771556864, "learning_rate": 3.9285714285714285e-08, "logits/chosen": -0.7372657656669617, "logits/rejected": -0.7545084953308105, "logps/chosen": -133.76637268066406, "logps/rejected": -133.70761108398438, "loss": 46951.3187, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.003720104694366455, "rewards/margins": 0.0001435236044926569, "rewards/rejected": -0.0038636289536952972, "step": 290 }, { "epoch": 0.96, "grad_norm": 321067.1618704567, "learning_rate": 2.142857142857143e-08, "logits/chosen": -0.5622406601905823, "logits/rejected": -0.5376971364021301, "logps/chosen": -113.44036865234375, "logps/rejected": -111.9993667602539, "loss": 46523.5844, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0034807869233191013, "rewards/margins": 0.001533324713818729, "rewards/rejected": -0.005014111753553152, "step": 300 }, { "epoch": 0.992, "grad_norm": 332780.2074089866, "learning_rate": 3.571428571428571e-09, "logits/chosen": -0.6824791431427002, "logits/rejected": -0.6886446475982666, "logps/chosen": -121.04481506347656, "logps/rejected": -121.3174819946289, "loss": 45077.1906, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.001433422090485692, "rewards/margins": 0.001681215362623334, "rewards/rejected": -0.003114637453109026, "step": 310 }, { "epoch": 0.9984, "step": 312, "total_flos": 0.0, "train_loss": 46907.07216546474, "train_runtime": 2770.7347, "train_samples_per_second": 7.217, "train_steps_per_second": 0.113 } ], "logging_steps": 10, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }