{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 147, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_gradient/correlation": 0.2451171875, "eval_gradient/inner_product": 86507520.0, "eval_gradient/nabla_chosen_logps": 16384.0, "eval_gradient/nabla_rejected_logps": 17152.0, "eval_logits/chosen_all": -2.860478639602661, "eval_logits/chosen_avg": 19.5573673248291, "eval_logits/chosen_sum": 7878.5537109375, "eval_logits/rejected_all": -2.867154121398926, "eval_logits/rejected_avg": 19.835920333862305, "eval_logits/rejected_sum": 7351.955078125, "eval_logps/chosen": -300.9012145996094, "eval_logps/rejected": -339.9275207519531, "eval_loss": 0.6931472420692444, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": 0.0, "eval_rewards/margins": 0.0, "eval_rewards/rejected": 0.0, "eval_runtime": 995.2085, "eval_samples_per_second": 9.428, "eval_steps_per_second": 0.295, "step": 0 }, { "epoch": 0.006802721088435374, "grad_norm": 29.4046255037492, "gradient/correlation": 0.54296875, "gradient/inner_product": 104333312.0, "gradient/nabla_chosen_logps": 12928.0, "gradient/nabla_rejected_logps": 14848.0, "learning_rate": 3.3333333333333334e-08, "logits/chosen_all": -2.8881030082702637, "logits/chosen_avg": 19.100177764892578, "logits/chosen_sum": 5325.2724609375, "logits/rejected_all": -2.8739447593688965, "logits/rejected_avg": 18.758451461791992, "logits/rejected_sum": 5390.216796875, "logps/chosen": -261.74505615234375, "logps/rejected": -265.43463134765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06802721088435375, "grad_norm": 31.079740345360065, "gradient/correlation": 0.396484375, "gradient/inner_product": 57933824.0, "gradient/nabla_chosen_logps": 11712.0, "gradient/nabla_rejected_logps": 12288.0, "learning_rate": 3.333333333333333e-07, "logits/chosen_all": -2.9010279178619385, "logits/chosen_avg": 19.462263107299805, "logits/chosen_sum": 7821.427734375, "logits/rejected_all": -2.8874688148498535, "logits/rejected_avg": 19.705490112304688, "logits/rejected_sum": 7311.00439453125, "logps/chosen": -309.2275390625, "logps/rejected": -335.8962097167969, "loss": 0.6929, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": -0.0008738188771530986, "rewards/margins": 0.0014805120881646872, "rewards/rejected": -0.0023543310817331076, "step": 10 }, { "epoch": 0.1360544217687075, "grad_norm": 28.25080363552144, "gradient/correlation": 0.52734375, "gradient/inner_product": 131596288.0, "gradient/nabla_chosen_logps": 13824.0, "gradient/nabla_rejected_logps": 15680.0, "learning_rate": 4.982319711683221e-07, "logits/chosen_all": -2.8525900840759277, "logits/chosen_avg": 19.73147964477539, "logits/chosen_sum": 8136.91552734375, "logits/rejected_all": -2.853966236114502, "logits/rejected_avg": 19.96231460571289, "logits/rejected_sum": 7306.9111328125, "logps/chosen": -291.052734375, "logps/rejected": -340.9748840332031, "loss": 0.692, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.018822144716978073, "rewards/margins": 0.005229341331869364, "rewards/rejected": -0.024051483720541, "step": 20 }, { "epoch": 0.20408163265306123, "grad_norm": 33.010449292180766, "gradient/correlation": 0.455078125, "gradient/inner_product": 137363456.0, "gradient/nabla_chosen_logps": 15040.0, "gradient/nabla_rejected_logps": 18176.0, "learning_rate": 4.842374312499405e-07, "logits/chosen_all": -2.8493168354034424, "logits/chosen_avg": 19.768428802490234, "logits/chosen_sum": 7949.44384765625, "logits/rejected_all": -2.831387758255005, "logits/rejected_avg": 19.950336456298828, "logits/rejected_sum": 7626.23828125, "logps/chosen": -323.625, "logps/rejected": -345.86505126953125, "loss": 0.6864, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07886435836553574, "rewards/margins": 0.01951368898153305, "rewards/rejected": -0.09837804734706879, "step": 30 }, { "epoch": 0.272108843537415, "grad_norm": 34.443566822120715, "gradient/correlation": 0.380859375, "gradient/inner_product": 170917888.0, "gradient/nabla_chosen_logps": 19968.0, "gradient/nabla_rejected_logps": 20992.0, "learning_rate": 4.5703731967784265e-07, "logits/chosen_all": -2.792343854904175, "logits/chosen_avg": 20.117395401000977, "logits/chosen_sum": 7771.77978515625, "logits/rejected_all": -2.793656826019287, "logits/rejected_avg": 20.50921058654785, "logits/rejected_sum": 7198.1337890625, "logps/chosen": -279.9584045410156, "logps/rejected": -327.79376220703125, "loss": 0.6776, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.17420102655887604, "rewards/margins": 0.03441625088453293, "rewards/rejected": -0.20861725509166718, "step": 40 }, { "epoch": 0.3401360544217687, "grad_norm": 45.75176403693249, "gradient/correlation": 0.59375, "gradient/inner_product": 392167424.0, "gradient/nabla_chosen_logps": 22144.0, "gradient/nabla_rejected_logps": 27648.0, "learning_rate": 4.1816509342531317e-07, "logits/chosen_all": -2.7981345653533936, "logits/chosen_avg": 20.486557006835938, "logits/chosen_sum": 8967.0947265625, "logits/rejected_all": -2.776093006134033, "logits/rejected_avg": 20.966766357421875, "logits/rejected_sum": 8001.73193359375, "logps/chosen": -382.888427734375, "logps/rejected": -448.8055725097656, "loss": 0.6689, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4583281874656677, "rewards/margins": 0.0806916207075119, "rewards/rejected": -0.5390198230743408, "step": 50 }, { "epoch": 0.40816326530612246, "grad_norm": 45.63073294397926, "gradient/correlation": 0.474609375, "gradient/inner_product": 406847488.0, "gradient/nabla_chosen_logps": 31232.0, "gradient/nabla_rejected_logps": 36608.0, "learning_rate": 3.698122466800142e-07, "logits/chosen_all": -2.7306084632873535, "logits/chosen_avg": 21.461116790771484, "logits/chosen_sum": 8760.287109375, "logits/rejected_all": -2.7338788509368896, "logits/rejected_avg": 21.832843780517578, "logits/rejected_sum": 8742.1240234375, "logps/chosen": -389.3448181152344, "logps/rejected": -426.97076416015625, "loss": 0.6575, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.6901549696922302, "rewards/margins": 0.08368454873561859, "rewards/rejected": -0.7738395929336548, "step": 60 }, { "epoch": 0.47619047619047616, "grad_norm": 45.39265557209564, "gradient/correlation": 0.51953125, "gradient/inner_product": 406847488.0, "gradient/nabla_chosen_logps": 26496.0, "gradient/nabla_rejected_logps": 30976.0, "learning_rate": 3.147047612756302e-07, "logits/chosen_all": -2.7141072750091553, "logits/chosen_avg": 21.571430206298828, "logits/chosen_sum": 8563.5771484375, "logits/rejected_all": -2.7004411220550537, "logits/rejected_avg": 21.90009117126465, "logits/rejected_sum": 7685.75634765625, "logps/chosen": -354.9464416503906, "logps/rejected": -412.0948181152344, "loss": 0.6379, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7182197570800781, "rewards/margins": 0.16819757223129272, "rewards/rejected": -0.8864172697067261, "step": 70 }, { "epoch": 0.54421768707483, "grad_norm": 43.2408764788425, "gradient/correlation": 0.53125, "gradient/inner_product": 400556032.0, "gradient/nabla_chosen_logps": 23552.0, "gradient/nabla_rejected_logps": 30336.0, "learning_rate": 2.5594942438652685e-07, "logits/chosen_all": -2.767631769180298, "logits/chosen_avg": 21.534954071044922, "logits/chosen_sum": 8986.39453125, "logits/rejected_all": -2.8037843704223633, "logits/rejected_avg": 22.02815055847168, "logits/rejected_sum": 7317.31640625, "logps/chosen": -332.03619384765625, "logps/rejected": -462.08831787109375, "loss": 0.651, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6966558694839478, "rewards/margins": 0.3038768470287323, "rewards/rejected": -1.0005327463150024, "step": 80 }, { "epoch": 0.6122448979591837, "grad_norm": 56.360376179583014, "gradient/correlation": 0.5390625, "gradient/inner_product": 444596224.0, "gradient/nabla_chosen_logps": 24064.0, "gradient/nabla_rejected_logps": 29440.0, "learning_rate": 1.968586776117558e-07, "logits/chosen_all": -2.7752909660339355, "logits/chosen_avg": 21.457965850830078, "logits/chosen_sum": 8509.26171875, "logits/rejected_all": -2.7356925010681152, "logits/rejected_avg": 21.785287857055664, "logits/rejected_sum": 7872.609375, "logps/chosen": -329.578369140625, "logps/rejected": -410.619140625, "loss": 0.6457, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6983135342597961, "rewards/margins": 0.23080599308013916, "rewards/rejected": -0.9291195869445801, "step": 90 }, { "epoch": 0.6802721088435374, "grad_norm": 44.79707815595916, "gradient/correlation": 0.4921875, "gradient/inner_product": 390070272.0, "gradient/nabla_chosen_logps": 25600.0, "gradient/nabla_rejected_logps": 28160.0, "learning_rate": 1.4076387190766014e-07, "logits/chosen_all": -2.6019034385681152, "logits/chosen_avg": 21.301361083984375, "logits/chosen_sum": 8701.6455078125, "logits/rejected_all": -2.613145112991333, "logits/rejected_avg": 21.54312515258789, "logits/rejected_sum": 7855.30712890625, "logps/chosen": -372.92529296875, "logps/rejected": -421.94189453125, "loss": 0.667, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7300583720207214, "rewards/margins": 0.1716311275959015, "rewards/rejected": -0.9016895294189453, "step": 100 }, { "epoch": 0.6802721088435374, "eval_gradient/correlation": 0.400390625, "eval_gradient/inner_product": 463470592.0, "eval_gradient/nabla_chosen_logps": 28288.0, "eval_gradient/nabla_rejected_logps": 37632.0, "eval_logits/chosen_all": -2.770193576812744, "eval_logits/chosen_avg": 21.098602294921875, "eval_logits/chosen_sum": 8554.5498046875, "eval_logits/rejected_all": -2.7774152755737305, "eval_logits/rejected_avg": 21.607807159423828, "eval_logits/rejected_sum": 8023.353515625, "eval_logps/chosen": -346.3625183105469, "eval_logps/rejected": -459.2464294433594, "eval_loss": 0.4740375578403473, "eval_rewards/accuracies": 0.8035714030265808, "eval_rewards/chosen": -0.45461341738700867, "eval_rewards/margins": 0.7385759353637695, "eval_rewards/rejected": -1.1931893825531006, "eval_runtime": 997.3521, "eval_samples_per_second": 9.408, "eval_steps_per_second": 0.295, "step": 100 }, { "epoch": 0.7482993197278912, "grad_norm": 40.30215443485796, "gradient/correlation": 0.59375, "gradient/inner_product": 469762048.0, "gradient/nabla_chosen_logps": 25984.0, "gradient/nabla_rejected_logps": 26880.0, "learning_rate": 9.082745647022797e-08, "logits/chosen_all": -2.699470043182373, "logits/chosen_avg": 20.729663848876953, "logits/chosen_sum": 8757.3212890625, "logits/rejected_all": -2.6742231845855713, "logits/rejected_avg": 21.31679344177246, "logits/rejected_sum": 8163.1875, "logps/chosen": -375.778076171875, "logps/rejected": -408.4828796386719, "loss": 0.6532, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.7500641345977783, "rewards/margins": 0.07548926770687103, "rewards/rejected": -0.825553297996521, "step": 110 }, { "epoch": 0.8163265306122449, "grad_norm": 44.17421295117634, "gradient/correlation": 0.4375, "gradient/inner_product": 408944640.0, "gradient/nabla_chosen_logps": 28928.0, "gradient/nabla_rejected_logps": 32128.0, "learning_rate": 4.986468976890992e-08, "logits/chosen_all": -2.597139358520508, "logits/chosen_avg": 20.95490074157715, "logits/chosen_sum": 9339.2890625, "logits/rejected_all": -2.569540500640869, "logits/rejected_avg": 21.029306411743164, "logits/rejected_sum": 8224.537109375, "logps/chosen": -407.2939758300781, "logps/rejected": -449.58056640625, "loss": 0.6498, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7030726671218872, "rewards/margins": 0.1318582147359848, "rewards/rejected": -0.8349308967590332, "step": 120 }, { "epoch": 0.8843537414965986, "grad_norm": 39.6374082354362, "gradient/correlation": 0.462890625, "gradient/inner_product": 299892736.0, "gradient/nabla_chosen_logps": 22400.0, "gradient/nabla_rejected_logps": 25728.0, "learning_rate": 2.0184924104583612e-08, "logits/chosen_all": -2.817037343978882, "logits/chosen_avg": 21.62957763671875, "logits/chosen_sum": 8717.798828125, "logits/rejected_all": -2.775411605834961, "logits/rejected_avg": 22.049942016601562, "logits/rejected_sum": 8576.767578125, "logps/chosen": -329.4313659667969, "logps/rejected": -376.56793212890625, "loss": 0.6547, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6067415475845337, "rewards/margins": 0.1636410653591156, "rewards/rejected": -0.7703827023506165, "step": 130 }, { "epoch": 0.9523809523809523, "grad_norm": 38.25960337464306, "gradient/correlation": 0.48828125, "gradient/inner_product": 463470592.0, "gradient/nabla_chosen_logps": 25216.0, "gradient/nabla_rejected_logps": 30464.0, "learning_rate": 3.4614115704533766e-09, "logits/chosen_all": -2.8207552433013916, "logits/chosen_avg": 21.21940803527832, "logits/chosen_sum": 9160.130859375, "logits/rejected_all": -2.834463596343994, "logits/rejected_avg": 21.833744049072266, "logits/rejected_sum": 8493.126953125, "logps/chosen": -371.8961486816406, "logps/rejected": -458.52642822265625, "loss": 0.6422, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7322984933853149, "rewards/margins": 0.22834627330303192, "rewards/rejected": -0.9606448411941528, "step": 140 }, { "epoch": 1.0, "step": 147, "total_flos": 0.0, "train_loss": 0.6616816390939311, "train_runtime": 3221.9386, "train_samples_per_second": 2.912, "train_steps_per_second": 0.046 } ], "logging_steps": 10, "max_steps": 147, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }