{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 396, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2, "grad_norm": 0.12694114446640015, "learning_rate": 1.8750000000000002e-05, "logits/chosen": -0.05689077079296112, "logits/rejected": -0.10778996348381042, "logps/chosen": -51.92569351196289, "logps/rejected": -58.57919692993164, "loss": 0.3444, "rewards/accuracies": 0.8846153616905212, "rewards/chosen": 0.8715194463729858, "rewards/margins": 1.492788553237915, "rewards/rejected": -0.6212692260742188, "step": 26 }, { "epoch": 0.39, "grad_norm": 0.0007516579935327172, "learning_rate": 2.9073033707865168e-05, "logits/chosen": -0.13101856410503387, "logits/rejected": -0.13021668791770935, "logps/chosen": -27.36025619506836, "logps/rejected": -104.82938385009766, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 3.3301053047180176, "rewards/margins": 8.549433708190918, "rewards/rejected": -5.219327926635742, "step": 52 }, { "epoch": 0.59, "grad_norm": 0.00040602186345495284, "learning_rate": 2.6882022471910113e-05, "logits/chosen": -0.21169501543045044, "logits/rejected": -0.18469807505607605, "logps/chosen": -21.630279541015625, "logps/rejected": -122.15676879882812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.915804386138916, "rewards/margins": 10.897878646850586, "rewards/rejected": -6.98207426071167, "step": 78 }, { "epoch": 0.79, "grad_norm": 0.00029909086879342794, "learning_rate": 2.4691011235955056e-05, "logits/chosen": -0.22328408062458038, "logits/rejected": -0.19363602995872498, "logps/chosen": -20.646940231323242, "logps/rejected": -125.94076538085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.001604080200195, "rewards/margins": 11.342691421508789, "rewards/rejected": -7.341087818145752, "step": 104 }, { "epoch": 0.98, "grad_norm": 0.0008555773529224098, "learning_rate": 2.25e-05, "logits/chosen": -0.21473294496536255, "logits/rejected": -0.1837671399116516, "logps/chosen": -20.335859298706055, "logps/rejected": -127.00772094726562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.043529033660889, "rewards/margins": 11.498052597045898, "rewards/rejected": -7.454523086547852, "step": 130 }, { "epoch": 1.18, "grad_norm": 0.00022369994258042425, "learning_rate": 2.0308988764044947e-05, "logits/chosen": -0.21327663958072662, "logits/rejected": -0.1831081211566925, "logps/chosen": -20.030139923095703, "logps/rejected": -127.70628356933594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.0749359130859375, "rewards/margins": 11.608404159545898, "rewards/rejected": -7.533467769622803, "step": 156 }, { "epoch": 1.38, "grad_norm": 0.0002135779504897073, "learning_rate": 1.8117977528089886e-05, "logits/chosen": -0.22201663255691528, "logits/rejected": -0.1898954212665558, "logps/chosen": -19.46499252319336, "logps/rejected": -130.10020446777344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.127849102020264, "rewards/margins": 11.870473861694336, "rewards/rejected": -7.742624759674072, "step": 182 }, { "epoch": 1.58, "grad_norm": 0.00018591841217130423, "learning_rate": 1.5926966292134832e-05, "logits/chosen": -0.2321176379919052, "logits/rejected": -0.20056405663490295, "logps/chosen": -19.843402862548828, "logps/rejected": -129.42715454101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.082259178161621, "rewards/margins": 11.799769401550293, "rewards/rejected": -7.71751070022583, "step": 208 }, { "epoch": 1.77, "grad_norm": 0.0001685543538769707, "learning_rate": 1.3735955056179776e-05, "logits/chosen": -0.22563436627388, "logits/rejected": -0.19371062517166138, "logps/chosen": -19.1228084564209, "logps/rejected": -131.58114624023438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.158024787902832, "rewards/margins": 12.063767433166504, "rewards/rejected": -7.905743598937988, "step": 234 }, { "epoch": 1.97, "grad_norm": 0.0001603550190338865, "learning_rate": 1.154494382022472e-05, "logits/chosen": -0.23086732625961304, "logits/rejected": -0.1993020474910736, "logps/chosen": -19.312213897705078, "logps/rejected": -131.04287719726562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.135034561157227, "rewards/margins": 12.01096248626709, "rewards/rejected": -7.8759284019470215, "step": 260 }, { "epoch": 2.17, "grad_norm": 0.00015357887605205178, "learning_rate": 9.353932584269662e-06, "logits/chosen": -0.23084178566932678, "logits/rejected": -0.1990644335746765, "logps/chosen": -18.966245651245117, "logps/rejected": -132.33311462402344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.179388523101807, "rewards/margins": 12.165773391723633, "rewards/rejected": -7.986386299133301, "step": 286 }, { "epoch": 2.36, "grad_norm": 0.0001516837510280311, "learning_rate": 7.162921348314607e-06, "logits/chosen": -0.22668816149234772, "logits/rejected": -0.19253727793693542, "logps/chosen": -18.499208450317383, "logps/rejected": -133.57839965820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.219171047210693, "rewards/margins": 12.308831214904785, "rewards/rejected": -8.08966064453125, "step": 312 }, { "epoch": 2.56, "grad_norm": 0.0005327428807504475, "learning_rate": 4.97191011235955e-06, "logits/chosen": -0.2215849608182907, "logits/rejected": -0.1875036060810089, "logps/chosen": -18.43309211730957, "logps/rejected": -133.9069366455078, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.235932350158691, "rewards/margins": 12.36194896697998, "rewards/rejected": -8.126015663146973, "step": 338 }, { "epoch": 2.76, "grad_norm": 0.00013107992708683014, "learning_rate": 2.7808988764044947e-06, "logits/chosen": -0.23455286026000977, "logits/rejected": -0.20136354863643646, "logps/chosen": -18.942277908325195, "logps/rejected": -132.9586639404297, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.165750503540039, "rewards/margins": 12.227232933044434, "rewards/rejected": -8.061481475830078, "step": 364 }, { "epoch": 2.95, "grad_norm": 0.00013512423902284354, "learning_rate": 5.898876404494382e-07, "logits/chosen": -0.23882614076137543, "logits/rejected": -0.20455202460289001, "logps/chosen": -18.73713493347168, "logps/rejected": -132.86285400390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.19658088684082, "rewards/margins": 12.254826545715332, "rewards/rejected": -8.058244705200195, "step": 390 } ], "logging_steps": 26, "max_steps": 396, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }