{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1289213579716373, "eval_steps": 50, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008594757198109154, "grad_norm": 0.05934199318289757, "learning_rate": 4.999451708687114e-06, "logits/chosen": 14.762972831726074, "logits/rejected": 15.199728012084961, "logps/chosen": -0.3259914815425873, "logps/rejected": -0.34297481179237366, "loss": 0.9377, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.4889872074127197, "rewards/margins": 0.02547495998442173, "rewards/rejected": -0.5144621729850769, "step": 10 }, { "epoch": 0.017189514396218308, "grad_norm": 0.06342790275812149, "learning_rate": 4.997807075247147e-06, "logits/chosen": 14.351249694824219, "logits/rejected": 15.068448066711426, "logps/chosen": -0.2809392511844635, "logps/rejected": -0.3711296617984772, "loss": 0.9352, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.42140883207321167, "rewards/margins": 0.1352856159210205, "rewards/rejected": -0.5566944479942322, "step": 20 }, { "epoch": 0.02578427159432746, "grad_norm": 0.053961098194122314, "learning_rate": 4.9950668210706795e-06, "logits/chosen": 14.636960983276367, "logits/rejected": 15.265243530273438, "logps/chosen": -0.2820780873298645, "logps/rejected": -0.34024301171302795, "loss": 0.9351, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.42311716079711914, "rewards/margins": 0.08724743127822876, "rewards/rejected": -0.5103646516799927, "step": 30 }, { "epoch": 0.034379028792436615, "grad_norm": 0.13506193459033966, "learning_rate": 4.9912321481237616e-06, "logits/chosen": 14.4556884765625, "logits/rejected": 15.048967361450195, "logps/chosen": -0.2897028625011444, "logps/rejected": -0.34129124879837036, "loss": 0.922, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.43455424904823303, "rewards/margins": 0.07738252729177475, "rewards/rejected": -0.5119368433952332, "step": 40 }, { "epoch": 0.042973785990545764, "grad_norm": 0.05230574309825897, "learning_rate": 4.986304738420684e-06, "logits/chosen": 14.628789901733398, "logits/rejected": 15.307828903198242, "logps/chosen": -0.28786614537239075, "logps/rejected": -0.3513876795768738, "loss": 0.9201, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.4317992329597473, "rewards/margins": 0.09528233855962753, "rewards/rejected": -0.5270815491676331, "step": 50 }, { "epoch": 0.042973785990545764, "eval_logits/chosen": 14.234943389892578, "eval_logits/rejected": 15.258601188659668, "eval_logps/chosen": -0.2844341993331909, "eval_logps/rejected": -0.3695394694805145, "eval_loss": 0.9226060509681702, "eval_rewards/accuracies": 0.5157894492149353, "eval_rewards/chosen": -0.42665132880210876, "eval_rewards/margins": 0.1276579648256302, "eval_rewards/rejected": -0.5543092489242554, "eval_runtime": 25.9356, "eval_samples_per_second": 29.033, "eval_steps_per_second": 3.663, "step": 50 }, { "epoch": 0.05156854318865492, "grad_norm": 0.09328428655862808, "learning_rate": 4.980286753286196e-06, "logits/chosen": 14.35963249206543, "logits/rejected": 15.055354118347168, "logps/chosen": -0.27534741163253784, "logps/rejected": -0.33098170161247253, "loss": 0.9356, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.4130210876464844, "rewards/margins": 0.08345144242048264, "rewards/rejected": -0.4964725375175476, "step": 60 }, { "epoch": 0.060163300386764075, "grad_norm": 0.06518550217151642, "learning_rate": 4.973180832407471e-06, "logits/chosen": 14.599525451660156, "logits/rejected": 14.825297355651855, "logps/chosen": -0.2708163857460022, "logps/rejected": -0.3305850923061371, "loss": 0.9257, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.4062245786190033, "rewards/margins": 0.08965305984020233, "rewards/rejected": -0.4958776533603668, "step": 70 }, { "epoch": 0.06875805758487323, "grad_norm": 0.07543154805898666, "learning_rate": 4.964990092676263e-06, "logits/chosen": 14.947430610656738, "logits/rejected": 15.093690872192383, "logps/chosen": -0.2602943778038025, "logps/rejected": -0.31820863485336304, "loss": 0.9168, "rewards/accuracies": 0.5, "rewards/chosen": -0.39044153690338135, "rewards/margins": 0.08687138557434082, "rewards/rejected": -0.47731298208236694, "step": 80 }, { "epoch": 0.07735281478298238, "grad_norm": 0.06628195196390152, "learning_rate": 4.9557181268217225e-06, "logits/chosen": 14.43529987335205, "logits/rejected": 14.750699043273926, "logps/chosen": -0.2884291708469391, "logps/rejected": -0.34193652868270874, "loss": 0.9273, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.43264374136924744, "rewards/margins": 0.08026103675365448, "rewards/rejected": -0.5129047632217407, "step": 90 }, { "epoch": 0.08594757198109153, "grad_norm": 0.08684897422790527, "learning_rate": 4.9453690018345144e-06, "logits/chosen": 13.573002815246582, "logits/rejected": 14.441877365112305, "logps/chosen": -0.2569890320301056, "logps/rejected": -0.37049269676208496, "loss": 0.9009, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3854835629463196, "rewards/margins": 0.17025551199913025, "rewards/rejected": -0.5557390451431274, "step": 100 }, { "epoch": 0.08594757198109153, "eval_logits/chosen": 14.026633262634277, "eval_logits/rejected": 15.08835220336914, "eval_logps/chosen": -0.2761566936969757, "eval_logps/rejected": -0.3717801570892334, "eval_loss": 0.9138591885566711, "eval_rewards/accuracies": 0.5368421077728271, "eval_rewards/chosen": -0.41423505544662476, "eval_rewards/margins": 0.1434352546930313, "eval_rewards/rejected": -0.5576702952384949, "eval_runtime": 25.3996, "eval_samples_per_second": 29.646, "eval_steps_per_second": 3.74, "step": 100 }, { "epoch": 0.09454232917920069, "grad_norm": 0.08046824485063553, "learning_rate": 4.933947257182901e-06, "logits/chosen": 14.500630378723145, "logits/rejected": 14.831761360168457, "logps/chosen": -0.30049553513526917, "logps/rejected": -0.3315966725349426, "loss": 0.916, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.45074325799942017, "rewards/margins": 0.04665176197886467, "rewards/rejected": -0.49739497900009155, "step": 110 }, { "epoch": 0.10313708637730984, "grad_norm": 0.12244562804698944, "learning_rate": 4.921457902821578e-06, "logits/chosen": 14.26713752746582, "logits/rejected": 14.495455741882324, "logps/chosen": -0.2670941650867462, "logps/rejected": -0.32481229305267334, "loss": 0.9167, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.4006412625312805, "rewards/margins": 0.08657723665237427, "rewards/rejected": -0.4872184693813324, "step": 120 }, { "epoch": 0.11173184357541899, "grad_norm": 0.1828213334083557, "learning_rate": 4.907906416994146e-06, "logits/chosen": 14.009546279907227, "logits/rejected": 14.297094345092773, "logps/chosen": -0.27995598316192627, "logps/rejected": -0.3530685007572174, "loss": 0.9087, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.419933944940567, "rewards/margins": 0.10966875404119492, "rewards/rejected": -0.5296027660369873, "step": 130 }, { "epoch": 0.12032660077352815, "grad_norm": 0.10407563298940659, "learning_rate": 4.893298743830168e-06, "logits/chosen": 13.689155578613281, "logits/rejected": 14.1933012008667, "logps/chosen": -0.25955715775489807, "logps/rejected": -0.3815004229545593, "loss": 0.9053, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3893357217311859, "rewards/margins": 0.18291489779949188, "rewards/rejected": -0.5722506046295166, "step": 140 }, { "epoch": 0.1289213579716373, "grad_norm": 0.10028588026762009, "learning_rate": 4.8776412907378845e-06, "logits/chosen": 12.851397514343262, "logits/rejected": 13.509778022766113, "logps/chosen": -0.23652991652488708, "logps/rejected": -0.3720462918281555, "loss": 0.8999, "rewards/accuracies": 0.625, "rewards/chosen": -0.3547949194908142, "rewards/margins": 0.2032744586467743, "rewards/rejected": -0.5580693483352661, "step": 150 }, { "epoch": 0.1289213579716373, "eval_logits/chosen": 12.384929656982422, "eval_logits/rejected": 13.672826766967773, "eval_logps/chosen": -0.27857670187950134, "eval_logps/rejected": -0.4014737904071808, "eval_loss": 0.8956203460693359, "eval_rewards/accuracies": 0.5684210658073425, "eval_rewards/chosen": -0.4178650677204132, "eval_rewards/margins": 0.18434564769268036, "eval_rewards/rejected": -0.6022107601165771, "eval_runtime": 25.4176, "eval_samples_per_second": 29.625, "eval_steps_per_second": 3.738, "step": 150 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.4556641787930214e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }