{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9555555555555557, "eval_steps": 500, "global_step": 66, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11851851851851852, "grad_norm": 2.2370028495788574, "learning_rate": 0.00011428571428571428, "logits/chosen": -0.2420351207256317, "logits/rejected": -0.23719240725040436, "logps/chosen": -164.4378204345703, "logps/rejected": -218.79978942871094, "loss": 0.6876, "rewards/accuracies": 0.28125, "rewards/chosen": -0.010453129187226295, "rewards/margins": 0.011684644967317581, "rewards/rejected": -0.022137774154543877, "step": 4 }, { "epoch": 0.23703703703703705, "grad_norm": 2.211751699447632, "learning_rate": 0.00019661016949152545, "logits/chosen": -0.30724823474884033, "logits/rejected": -0.1725369691848755, "logps/chosen": -177.024169921875, "logps/rejected": -196.5380096435547, "loss": 0.6462, "rewards/accuracies": 0.625, "rewards/chosen": 0.013838172890245914, "rewards/margins": 0.11086592078208923, "rewards/rejected": -0.0970277488231659, "step": 8 }, { "epoch": 0.35555555555555557, "grad_norm": 3.362987995147705, "learning_rate": 0.00018305084745762714, "logits/chosen": -0.12524788081645966, "logits/rejected": -0.22343379259109497, "logps/chosen": -163.24392700195312, "logps/rejected": -172.90646362304688, "loss": 0.5896, "rewards/accuracies": 0.8125, "rewards/chosen": 0.09611811488866806, "rewards/margins": 0.40950363874435425, "rewards/rejected": -0.3133855164051056, "step": 12 }, { "epoch": 0.4740740740740741, "grad_norm": 2.0503623485565186, "learning_rate": 0.00016949152542372882, "logits/chosen": -0.10405930876731873, "logits/rejected": -0.31087443232536316, "logps/chosen": -153.96531677246094, "logps/rejected": -212.83856201171875, "loss": 0.4884, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7000998258590698, "rewards/margins": 0.9018418788909912, "rewards/rejected": -0.20174214243888855, "step": 16 }, { "epoch": 0.5925925925925926, "grad_norm": 3.2123215198516846, "learning_rate": 0.00015593220338983051, "logits/chosen": -0.1336425244808197, "logits/rejected": -0.23538736999034882, "logps/chosen": -149.6715850830078, "logps/rejected": -197.02955627441406, "loss": 0.5888, "rewards/accuracies": 0.625, "rewards/chosen": 0.6735118627548218, "rewards/margins": 0.7550574541091919, "rewards/rejected": -0.0815456211566925, "step": 20 }, { "epoch": 0.7111111111111111, "grad_norm": 4.250487804412842, "learning_rate": 0.0001423728813559322, "logits/chosen": 0.0016336403787136078, "logits/rejected": 0.043180257081985474, "logps/chosen": -168.38230895996094, "logps/rejected": -252.7860565185547, "loss": 0.4445, "rewards/accuracies": 0.8125, "rewards/chosen": 1.229780673980713, "rewards/margins": 1.1931822299957275, "rewards/rejected": 0.036598339676856995, "step": 24 }, { "epoch": 0.8296296296296296, "grad_norm": 2.603621482849121, "learning_rate": 0.0001288135593220339, "logits/chosen": -0.028663629665970802, "logits/rejected": -0.16369199752807617, "logps/chosen": -168.54714965820312, "logps/rejected": -265.1011047363281, "loss": 0.2936, "rewards/accuracies": 0.875, "rewards/chosen": 1.0682289600372314, "rewards/margins": 2.0167641639709473, "rewards/rejected": -0.9485354423522949, "step": 28 }, { "epoch": 0.9481481481481482, "grad_norm": 1.2125921249389648, "learning_rate": 0.0001152542372881356, "logits/chosen": -0.15845011174678802, "logits/rejected": -0.18072912096977234, "logps/chosen": -180.09385681152344, "logps/rejected": -246.99932861328125, "loss": 0.2322, "rewards/accuracies": 0.96875, "rewards/chosen": 0.47764796018600464, "rewards/margins": 2.6183388233184814, "rewards/rejected": -2.140690803527832, "step": 32 }, { "epoch": 1.0666666666666667, "grad_norm": 1.5252747535705566, "learning_rate": 0.00010169491525423729, "logits/chosen": -0.2885245680809021, "logits/rejected": -0.2729518711566925, "logps/chosen": -171.0482940673828, "logps/rejected": -241.3873748779297, "loss": 0.1404, "rewards/accuracies": 0.96875, "rewards/chosen": 0.5696389079093933, "rewards/margins": 4.009322166442871, "rewards/rejected": -3.439683198928833, "step": 36 }, { "epoch": 1.1851851851851851, "grad_norm": 2.2474632263183594, "learning_rate": 8.813559322033899e-05, "logits/chosen": -0.2718096971511841, "logits/rejected": -0.15868759155273438, "logps/chosen": -174.72865295410156, "logps/rejected": -247.9293212890625, "loss": 0.1658, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2684851288795471, "rewards/margins": 3.3368659019470215, "rewards/rejected": -3.605351209640503, "step": 40 }, { "epoch": 1.3037037037037038, "grad_norm": 2.837970733642578, "learning_rate": 7.457627118644068e-05, "logits/chosen": -0.32800883054733276, "logits/rejected": -0.39847540855407715, "logps/chosen": -155.1390380859375, "logps/rejected": -274.467529296875, "loss": 0.1983, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5497639179229736, "rewards/margins": 4.935270309448242, "rewards/rejected": -4.385505676269531, "step": 44 }, { "epoch": 1.4222222222222223, "grad_norm": 0.9811547994613647, "learning_rate": 6.101694915254238e-05, "logits/chosen": -0.19821931421756744, "logits/rejected": -0.3745233416557312, "logps/chosen": -177.2127685546875, "logps/rejected": -233.73451232910156, "loss": 0.0846, "rewards/accuracies": 1.0, "rewards/chosen": 0.5935864448547363, "rewards/margins": 4.892382621765137, "rewards/rejected": -4.298796653747559, "step": 48 }, { "epoch": 1.5407407407407407, "grad_norm": 0.5493781566619873, "learning_rate": 4.745762711864407e-05, "logits/chosen": -0.19866187870502472, "logits/rejected": -0.40253084897994995, "logps/chosen": -177.7249755859375, "logps/rejected": -257.3801574707031, "loss": 0.2262, "rewards/accuracies": 0.875, "rewards/chosen": 0.2777794599533081, "rewards/margins": 4.573861122131348, "rewards/rejected": -4.296081066131592, "step": 52 }, { "epoch": 1.6592592592592592, "grad_norm": 2.3989391326904297, "learning_rate": 3.389830508474576e-05, "logits/chosen": -0.2050989866256714, "logits/rejected": -0.3374294936656952, "logps/chosen": -150.08154296875, "logps/rejected": -275.553955078125, "loss": 0.171, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8470695614814758, "rewards/margins": 5.13323974609375, "rewards/rejected": -4.28617000579834, "step": 56 }, { "epoch": 1.7777777777777777, "grad_norm": 3.8446531295776367, "learning_rate": 2.033898305084746e-05, "logits/chosen": 0.011507619172334671, "logits/rejected": -0.2918490469455719, "logps/chosen": -179.98802185058594, "logps/rejected": -255.36036682128906, "loss": 0.1897, "rewards/accuracies": 0.875, "rewards/chosen": 1.0594336986541748, "rewards/margins": 5.518366813659668, "rewards/rejected": -4.458932876586914, "step": 60 }, { "epoch": 1.8962962962962964, "grad_norm": 2.1428377628326416, "learning_rate": 6.779661016949153e-06, "logits/chosen": -0.17610307037830353, "logits/rejected": -0.2640307545661926, "logps/chosen": -149.84603881835938, "logps/rejected": -261.7028503417969, "loss": 0.1876, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4072057008743286, "rewards/margins": 5.339127063751221, "rewards/rejected": -3.9319217205047607, "step": 64 } ], "logging_steps": 4, "max_steps": 66, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }