{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 34, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": 1.922844648361206, "debug/policy_chosen_logps": -408.2371520996094, "debug/policy_rejected_logits": 1.7306561470031738, "debug/policy_rejected_logps": -435.7992248535156, "debug/reference_chosen_logps": -408.2371520996094, "debug/reference_rejected_logps": -435.7992248535156, "epoch": 0.029411764705882353, "grad_norm": 4.137213790923358, "learning_rate": 1e-06, "logits/chosen": 1.922844648361206, "logits/rejected": 1.7306561470031738, "logps/chosen": -408.2371520996094, "logps/rejected": -435.7992248535156, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": 1.8619999885559082, "debug/policy_chosen_logps": -427.6330871582031, "debug/policy_rejected_logits": 1.6860817670822144, "debug/policy_rejected_logps": -461.19268798828125, "debug/reference_chosen_logps": -427.9091491699219, "debug/reference_rejected_logps": -461.35797119140625, "epoch": 0.058823529411764705, "grad_norm": 4.258600986002571, "learning_rate": 1e-06, "logits/chosen": 1.8619999885559082, "logits/rejected": 1.6860817670822144, "logps/chosen": -427.6330871582031, "logps/rejected": -461.19268798828125, "loss": 0.4995, "rewards/accuracies": 0.625, "rewards/chosen": 0.0027605819050222635, "rewards/margins": 0.001107597490772605, "rewards/rejected": 0.0016529845306649804, "step": 2 }, { "debug/policy_chosen_logits": 1.72812020778656, "debug/policy_chosen_logps": -454.173095703125, "debug/policy_rejected_logits": 1.7377153635025024, "debug/policy_rejected_logps": -429.8233337402344, "debug/reference_chosen_logps": -453.95745849609375, "debug/reference_rejected_logps": -429.6916809082031, "epoch": 0.08823529411764706, "grad_norm": 3.8342179309910014, "learning_rate": 1e-06, "logits/chosen": 1.72812020778656, "logits/rejected": 1.7377153635025024, "logps/chosen": -454.173095703125, "logps/rejected": -429.8233337402344, "loss": 0.5013, "rewards/accuracies": 0.625, "rewards/chosen": -0.002156372182071209, "rewards/margins": -0.0008401109953410923, "rewards/rejected": -0.0013162612449377775, "step": 3 }, { "debug/policy_chosen_logits": 1.666637897491455, "debug/policy_chosen_logps": -395.48492431640625, "debug/policy_rejected_logits": 2.09572434425354, "debug/policy_rejected_logps": -416.84332275390625, "debug/reference_chosen_logps": -395.9959716796875, "debug/reference_rejected_logps": -416.729736328125, "epoch": 0.11764705882352941, "grad_norm": 4.148905007134245, "learning_rate": 1e-06, "logits/chosen": 1.666637897491455, "logits/rejected": 2.09572434425354, "logps/chosen": -395.48492431640625, "logps/rejected": -416.84332275390625, "loss": 0.499, "rewards/accuracies": 0.875, "rewards/chosen": 0.005110206548124552, "rewards/margins": 0.006246108561754227, "rewards/rejected": -0.0011359023628756404, "step": 4 }, { "debug/policy_chosen_logits": 1.725293517112732, "debug/policy_chosen_logps": -450.16796875, "debug/policy_rejected_logits": 1.8551357984542847, "debug/policy_rejected_logps": -515.6107788085938, "debug/reference_chosen_logps": -450.22454833984375, "debug/reference_rejected_logps": -515.43017578125, "epoch": 0.14705882352941177, "grad_norm": 6.612597098633739, "learning_rate": 1e-06, "logits/chosen": 1.725293517112732, "logits/rejected": 1.8551357984542847, "logps/chosen": -450.16796875, "logps/rejected": -515.6107788085938, "loss": 0.4987, "rewards/accuracies": 0.625, "rewards/chosen": 0.0005657197907567024, "rewards/margins": 0.0023711395915597677, "rewards/rejected": -0.0018054196843877435, "step": 5 }, { "debug/policy_chosen_logits": 1.5482574701309204, "debug/policy_chosen_logps": -406.3376159667969, "debug/policy_rejected_logits": 1.9273473024368286, "debug/policy_rejected_logps": -443.02490234375, "debug/reference_chosen_logps": -406.5204162597656, "debug/reference_rejected_logps": -442.7237854003906, "epoch": 0.17647058823529413, "grad_norm": 4.0254140902734425, "learning_rate": 1e-06, "logits/chosen": 1.5482574701309204, "logits/rejected": 1.9273473024368286, "logps/chosen": -406.3376159667969, "logps/rejected": -443.02490234375, "loss": 0.4979, "rewards/accuracies": 0.5, "rewards/chosen": 0.0018279263749718666, "rewards/margins": 0.004839172121137381, "rewards/rejected": -0.0030112455133348703, "step": 6 }, { "debug/policy_chosen_logits": 1.9279674291610718, "debug/policy_chosen_logps": -441.6047058105469, "debug/policy_rejected_logits": 1.8206740617752075, "debug/policy_rejected_logps": -477.34332275390625, "debug/reference_chosen_logps": -441.14202880859375, "debug/reference_rejected_logps": -477.178955078125, "epoch": 0.20588235294117646, "grad_norm": 4.1758431593611585, "learning_rate": 1e-06, "logits/chosen": 1.9279674291610718, "logits/rejected": 1.8206740617752075, "logps/chosen": -441.6047058105469, "logps/rejected": -477.34332275390625, "loss": 0.5011, "rewards/accuracies": 0.5, "rewards/chosen": -0.00462696049362421, "rewards/margins": -0.0029835125897079706, "rewards/rejected": -0.0016434479039162397, "step": 7 }, { "debug/policy_chosen_logits": 1.8140891790390015, "debug/policy_chosen_logps": -433.96484375, "debug/policy_rejected_logits": 1.9336739778518677, "debug/policy_rejected_logps": -443.0285949707031, "debug/reference_chosen_logps": -433.82769775390625, "debug/reference_rejected_logps": -443.1069641113281, "epoch": 0.23529411764705882, "grad_norm": 4.052042922921632, "learning_rate": 1e-06, "logits/chosen": 1.8140891790390015, "logits/rejected": 1.9336739778518677, "logps/chosen": -433.96484375, "logps/rejected": -443.0285949707031, "loss": 0.4981, "rewards/accuracies": 0.25, "rewards/chosen": -0.0013714599190279841, "rewards/margins": -0.002155113033950329, "rewards/rejected": 0.0007836532313376665, "step": 8 }, { "debug/policy_chosen_logits": 1.7726589441299438, "debug/policy_chosen_logps": -411.8170471191406, "debug/policy_rejected_logits": 1.9146279096603394, "debug/policy_rejected_logps": -453.87408447265625, "debug/reference_chosen_logps": -411.6216125488281, "debug/reference_rejected_logps": -453.7139892578125, "epoch": 0.2647058823529412, "grad_norm": 4.431752041021209, "learning_rate": 1e-06, "logits/chosen": 1.7726589441299438, "logits/rejected": 1.9146279096603394, "logps/chosen": -411.8170471191406, "logps/rejected": -453.87408447265625, "loss": 0.4998, "rewards/accuracies": 0.625, "rewards/chosen": -0.001954421866685152, "rewards/margins": -0.00035381317138671875, "rewards/rejected": -0.001600608928129077, "step": 9 }, { "debug/policy_chosen_logits": 1.8682191371917725, "debug/policy_chosen_logps": -384.9290771484375, "debug/policy_rejected_logits": 1.776376485824585, "debug/policy_rejected_logps": -445.5006103515625, "debug/reference_chosen_logps": -384.94744873046875, "debug/reference_rejected_logps": -444.9970703125, "epoch": 0.29411764705882354, "grad_norm": 4.119382998606939, "learning_rate": 1e-06, "logits/chosen": 1.8682191371917725, "logits/rejected": 1.776376485824585, "logps/chosen": -384.9290771484375, "logps/rejected": -445.5006103515625, "loss": 0.4983, "rewards/accuracies": 0.875, "rewards/chosen": 0.0001836773008108139, "rewards/margins": 0.005218924954533577, "rewards/rejected": -0.005035247653722763, "step": 10 }, { "debug/policy_chosen_logits": 1.7481721639633179, "debug/policy_chosen_logps": -446.7349853515625, "debug/policy_rejected_logits": 1.8622864484786987, "debug/policy_rejected_logps": -448.75958251953125, "debug/reference_chosen_logps": -446.6666259765625, "debug/reference_rejected_logps": -449.26861572265625, "epoch": 0.3235294117647059, "grad_norm": 4.314334923529288, "learning_rate": 1e-06, "logits/chosen": 1.7481721639633179, "logits/rejected": 1.8622864484786987, "logps/chosen": -446.7349853515625, "logps/rejected": -448.75958251953125, "loss": 0.5045, "rewards/accuracies": 0.375, "rewards/chosen": -0.0006834029918536544, "rewards/margins": -0.005773734766989946, "rewards/rejected": 0.0050903321243822575, "step": 11 }, { "debug/policy_chosen_logits": 1.673924446105957, "debug/policy_chosen_logps": -415.7007141113281, "debug/policy_rejected_logits": 1.6867071390151978, "debug/policy_rejected_logps": -432.5093078613281, "debug/reference_chosen_logps": -416.0489807128906, "debug/reference_rejected_logps": -432.8421630859375, "epoch": 0.35294117647058826, "grad_norm": 4.2999680871604555, "learning_rate": 1e-06, "logits/chosen": 1.673924446105957, "logits/rejected": 1.6867071390151978, "logps/chosen": -415.7007141113281, "logps/rejected": -432.5093078613281, "loss": 0.5013, "rewards/accuracies": 0.375, "rewards/chosen": 0.003482742002233863, "rewards/margins": 0.00015388487372547388, "rewards/rejected": 0.0033288574777543545, "step": 12 }, { "debug/policy_chosen_logits": 1.7633516788482666, "debug/policy_chosen_logps": -502.5340270996094, "debug/policy_rejected_logits": 1.9169749021530151, "debug/policy_rejected_logps": -468.00848388671875, "debug/reference_chosen_logps": -502.9071960449219, "debug/reference_rejected_logps": -467.6253356933594, "epoch": 0.38235294117647056, "grad_norm": 4.076909715786231, "learning_rate": 1e-06, "logits/chosen": 1.7633516788482666, "logits/rejected": 1.9169749021530151, "logps/chosen": -502.5340270996094, "logps/rejected": -468.00848388671875, "loss": 0.4964, "rewards/accuracies": 0.75, "rewards/chosen": 0.0037316130474209785, "rewards/margins": 0.007563132792711258, "rewards/rejected": -0.003831519978120923, "step": 13 }, { "debug/policy_chosen_logits": 1.8117332458496094, "debug/policy_chosen_logps": -421.8986511230469, "debug/policy_rejected_logits": 1.8946248292922974, "debug/policy_rejected_logps": -440.38092041015625, "debug/reference_chosen_logps": -421.764404296875, "debug/reference_rejected_logps": -440.68359375, "epoch": 0.4117647058823529, "grad_norm": 4.488141272538123, "learning_rate": 1e-06, "logits/chosen": 1.8117332458496094, "logits/rejected": 1.8946248292922974, "logps/chosen": -421.8986511230469, "logps/rejected": -440.38092041015625, "loss": 0.4988, "rewards/accuracies": 0.25, "rewards/chosen": -0.0013426971854642034, "rewards/margins": -0.004369430243968964, "rewards/rejected": 0.0030267334077507257, "step": 14 }, { "debug/policy_chosen_logits": 1.816784381866455, "debug/policy_chosen_logps": -473.92059326171875, "debug/policy_rejected_logits": 1.803492784500122, "debug/policy_rejected_logps": -467.276611328125, "debug/reference_chosen_logps": -473.57781982421875, "debug/reference_rejected_logps": -466.53240966796875, "epoch": 0.4411764705882353, "grad_norm": 7.5452438692613955, "learning_rate": 1e-06, "logits/chosen": 1.816784381866455, "logits/rejected": 1.803492784500122, "logps/chosen": -473.92059326171875, "logps/rejected": -467.276611328125, "loss": 0.499, "rewards/accuracies": 0.625, "rewards/chosen": -0.003427658462896943, "rewards/margins": 0.004014625214040279, "rewards/rejected": -0.007442283444106579, "step": 15 }, { "debug/policy_chosen_logits": 1.764467477798462, "debug/policy_chosen_logps": -469.1932067871094, "debug/policy_rejected_logits": 1.8750910758972168, "debug/policy_rejected_logps": -425.4044494628906, "debug/reference_chosen_logps": -468.972412109375, "debug/reference_rejected_logps": -426.2842102050781, "epoch": 0.47058823529411764, "grad_norm": 4.6018762866711755, "learning_rate": 1e-06, "logits/chosen": 1.764467477798462, "logits/rejected": 1.8750910758972168, "logps/chosen": -469.1932067871094, "logps/rejected": -425.4044494628906, "loss": 0.4944, "rewards/accuracies": 0.125, "rewards/chosen": -0.0022079083137214184, "rewards/margins": -0.011005439795553684, "rewards/rejected": 0.008797531947493553, "step": 16 }, { "debug/policy_chosen_logits": 1.6022701263427734, "debug/policy_chosen_logps": -402.24810791015625, "debug/policy_rejected_logits": 1.6826766729354858, "debug/policy_rejected_logps": -399.31292724609375, "debug/reference_chosen_logps": -402.65313720703125, "debug/reference_rejected_logps": -400.0274963378906, "epoch": 0.5, "grad_norm": 4.154364572648437, "learning_rate": 1e-06, "logits/chosen": 1.6022701263427734, "logits/rejected": 1.6826766729354858, "logps/chosen": -402.24810791015625, "logps/rejected": -399.31292724609375, "loss": 0.4986, "rewards/accuracies": 0.375, "rewards/chosen": 0.004050330724567175, "rewards/margins": -0.003095397725701332, "rewards/rejected": 0.007145728915929794, "step": 17 }, { "debug/policy_chosen_logits": 1.6320748329162598, "debug/policy_chosen_logps": -402.7867431640625, "debug/policy_rejected_logits": 1.6496365070343018, "debug/policy_rejected_logps": -405.9878845214844, "debug/reference_chosen_logps": -403.00579833984375, "debug/reference_rejected_logps": -405.14886474609375, "epoch": 0.5294117647058824, "grad_norm": 4.087671831481816, "learning_rate": 1e-06, "logits/chosen": 1.6320748329162598, "logits/rejected": 1.6496365070343018, "logps/chosen": -402.7867431640625, "logps/rejected": -405.9878845214844, "loss": 0.4992, "rewards/accuracies": 0.75, "rewards/chosen": 0.0021905899047851562, "rewards/margins": 0.010580749250948429, "rewards/rejected": -0.008390159346163273, "step": 18 }, { "debug/policy_chosen_logits": 1.796993613243103, "debug/policy_chosen_logps": -465.04254150390625, "debug/policy_rejected_logits": 1.6831399202346802, "debug/policy_rejected_logps": -412.93450927734375, "debug/reference_chosen_logps": -465.1028747558594, "debug/reference_rejected_logps": -413.2862548828125, "epoch": 0.5588235294117647, "grad_norm": 5.9430676221534515, "learning_rate": 1e-06, "logits/chosen": 1.796993613243103, "logits/rejected": 1.6831399202346802, "logps/chosen": -465.04254150390625, "logps/rejected": -412.93450927734375, "loss": 0.4954, "rewards/accuracies": 0.5, "rewards/chosen": 0.0006032180972397327, "rewards/margins": -0.002914008917286992, "rewards/rejected": 0.003517227014526725, "step": 19 }, { "debug/policy_chosen_logits": 1.8579211235046387, "debug/policy_chosen_logps": -392.926513671875, "debug/policy_rejected_logits": 1.7997496128082275, "debug/policy_rejected_logps": -451.51373291015625, "debug/reference_chosen_logps": -393.1466979980469, "debug/reference_rejected_logps": -451.5287170410156, "epoch": 0.5882352941176471, "grad_norm": 5.324465364710387, "learning_rate": 1e-06, "logits/chosen": 1.8579211235046387, "logits/rejected": 1.7997496128082275, "logps/chosen": -392.926513671875, "logps/rejected": -451.51373291015625, "loss": 0.5013, "rewards/accuracies": 0.5, "rewards/chosen": 0.002201843075454235, "rewards/margins": 0.002052116207778454, "rewards/rejected": 0.00014972680946812034, "step": 20 }, { "debug/policy_chosen_logits": 1.6464605331420898, "debug/policy_chosen_logps": -452.0064697265625, "debug/policy_rejected_logits": 1.7376089096069336, "debug/policy_rejected_logps": -407.5381164550781, "debug/reference_chosen_logps": -452.4896240234375, "debug/reference_rejected_logps": -407.4380798339844, "epoch": 0.6176470588235294, "grad_norm": 4.087685828427492, "learning_rate": 1e-06, "logits/chosen": 1.6464605331420898, "logits/rejected": 1.7376089096069336, "logps/chosen": -452.0064697265625, "logps/rejected": -407.5381164550781, "loss": 0.494, "rewards/accuracies": 0.875, "rewards/chosen": 0.004831466358155012, "rewards/margins": 0.005831870716065168, "rewards/rejected": -0.0010004041250795126, "step": 21 }, { "debug/policy_chosen_logits": 1.6895753145217896, "debug/policy_chosen_logps": -498.2047119140625, "debug/policy_rejected_logits": 1.7272963523864746, "debug/policy_rejected_logps": -456.6476135253906, "debug/reference_chosen_logps": -497.6826171875, "debug/reference_rejected_logps": -455.4757080078125, "epoch": 0.6470588235294118, "grad_norm": 4.051966833827472, "learning_rate": 1e-06, "logits/chosen": 1.6895753145217896, "logits/rejected": 1.7272963523864746, "logps/chosen": -498.2047119140625, "logps/rejected": -456.6476135253906, "loss": 0.4983, "rewards/accuracies": 0.625, "rewards/chosen": -0.005220795050263405, "rewards/margins": 0.006498374976217747, "rewards/rejected": -0.011719169095158577, "step": 22 }, { "debug/policy_chosen_logits": 1.6556023359298706, "debug/policy_chosen_logps": -484.2706298828125, "debug/policy_rejected_logits": 1.6021193265914917, "debug/policy_rejected_logps": -434.21087646484375, "debug/reference_chosen_logps": -484.40423583984375, "debug/reference_rejected_logps": -432.92205810546875, "epoch": 0.6764705882352942, "grad_norm": 3.997044307316433, "learning_rate": 1e-06, "logits/chosen": 1.6556023359298706, "logits/rejected": 1.6021193265914917, "logps/chosen": -484.2706298828125, "logps/rejected": -434.21087646484375, "loss": 0.4954, "rewards/accuracies": 0.75, "rewards/chosen": 0.0013361359015107155, "rewards/margins": 0.014224356971681118, "rewards/rejected": -0.012888221070170403, "step": 23 }, { "debug/policy_chosen_logits": 1.5961276292800903, "debug/policy_chosen_logps": -341.4281921386719, "debug/policy_rejected_logits": 1.661395788192749, "debug/policy_rejected_logps": -431.88909912109375, "debug/reference_chosen_logps": -341.4843444824219, "debug/reference_rejected_logps": -431.17218017578125, "epoch": 0.7058823529411765, "grad_norm": 3.754277051742812, "learning_rate": 1e-06, "logits/chosen": 1.5961276292800903, "logits/rejected": 1.661395788192749, "logps/chosen": -341.4281921386719, "logps/rejected": -431.88909912109375, "loss": 0.4945, "rewards/accuracies": 0.75, "rewards/chosen": 0.0005615615518763661, "rewards/margins": 0.007730789016932249, "rewards/rejected": -0.0071692271158099174, "step": 24 }, { "debug/policy_chosen_logits": 1.8066837787628174, "debug/policy_chosen_logps": -423.83929443359375, "debug/policy_rejected_logits": 1.7892667055130005, "debug/policy_rejected_logps": -402.4519958496094, "debug/reference_chosen_logps": -423.56634521484375, "debug/reference_rejected_logps": -402.6119689941406, "epoch": 0.7352941176470589, "grad_norm": 4.029266390640827, "learning_rate": 1e-06, "logits/chosen": 1.8066837787628174, "logits/rejected": 1.7892667055130005, "logps/chosen": -423.83929443359375, "logps/rejected": -402.4519958496094, "loss": 0.4968, "rewards/accuracies": 0.375, "rewards/chosen": -0.0027294543106108904, "rewards/margins": -0.004329109098762274, "rewards/rejected": 0.0015996552538126707, "step": 25 }, { "debug/policy_chosen_logits": 1.6511558294296265, "debug/policy_chosen_logps": -388.6883239746094, "debug/policy_rejected_logits": 1.493870496749878, "debug/policy_rejected_logps": -381.03460693359375, "debug/reference_chosen_logps": -389.6356201171875, "debug/reference_rejected_logps": -381.1142578125, "epoch": 0.7647058823529411, "grad_norm": 3.921722844855713, "learning_rate": 1e-06, "logits/chosen": 1.6511558294296265, "logits/rejected": 1.493870496749878, "logps/chosen": -388.6883239746094, "logps/rejected": -381.03460693359375, "loss": 0.4967, "rewards/accuracies": 0.625, "rewards/chosen": 0.009473076090216637, "rewards/margins": 0.008676452562212944, "rewards/rejected": 0.000796623295173049, "step": 26 }, { "debug/policy_chosen_logits": 1.7741831541061401, "debug/policy_chosen_logps": -442.03326416015625, "debug/policy_rejected_logits": 1.5636245012283325, "debug/policy_rejected_logps": -439.15057373046875, "debug/reference_chosen_logps": -442.7723388671875, "debug/reference_rejected_logps": -438.6939697265625, "epoch": 0.7941176470588235, "grad_norm": 4.600836077028018, "learning_rate": 1e-06, "logits/chosen": 1.7741831541061401, "logits/rejected": 1.5636245012283325, "logps/chosen": -442.03326416015625, "logps/rejected": -439.15057373046875, "loss": 0.4955, "rewards/accuracies": 0.75, "rewards/chosen": 0.007390594109892845, "rewards/margins": 0.01195678673684597, "rewards/rejected": -0.004566192161291838, "step": 27 }, { "debug/policy_chosen_logits": 1.500828742980957, "debug/policy_chosen_logps": -424.57275390625, "debug/policy_rejected_logits": 1.5941702127456665, "debug/policy_rejected_logps": -436.2298583984375, "debug/reference_chosen_logps": -425.43853759765625, "debug/reference_rejected_logps": -437.13531494140625, "epoch": 0.8235294117647058, "grad_norm": 6.011602987170812, "learning_rate": 1e-06, "logits/chosen": 1.500828742980957, "logits/rejected": 1.5941702127456665, "logps/chosen": -424.57275390625, "logps/rejected": -436.2298583984375, "loss": 0.4945, "rewards/accuracies": 0.375, "rewards/chosen": 0.008657912723720074, "rewards/margins": -0.0003968430683016777, "rewards/rejected": 0.009054755792021751, "step": 28 }, { "debug/policy_chosen_logits": 1.7050580978393555, "debug/policy_chosen_logps": -439.244873046875, "debug/policy_rejected_logits": 1.5073894262313843, "debug/policy_rejected_logps": -438.55352783203125, "debug/reference_chosen_logps": -439.664794921875, "debug/reference_rejected_logps": -438.343994140625, "epoch": 0.8529411764705882, "grad_norm": 5.064453786722452, "learning_rate": 1e-06, "logits/chosen": 1.7050580978393555, "logits/rejected": 1.5073894262313843, "logps/chosen": -439.244873046875, "logps/rejected": -438.55352783203125, "loss": 0.489, "rewards/accuracies": 0.5, "rewards/chosen": 0.00419910391792655, "rewards/margins": 0.006294555030763149, "rewards/rejected": -0.002095451345667243, "step": 29 }, { "debug/policy_chosen_logits": 1.6806142330169678, "debug/policy_chosen_logps": -460.19696044921875, "debug/policy_rejected_logits": 1.9049626588821411, "debug/policy_rejected_logps": -478.7205810546875, "debug/reference_chosen_logps": -462.6081848144531, "debug/reference_rejected_logps": -476.98736572265625, "epoch": 0.8823529411764706, "grad_norm": 4.211886639336461, "learning_rate": 1e-06, "logits/chosen": 1.6806142330169678, "logits/rejected": 1.9049626588821411, "logps/chosen": -460.19696044921875, "logps/rejected": -478.7205810546875, "loss": 0.4935, "rewards/accuracies": 0.875, "rewards/chosen": 0.02411239594221115, "rewards/margins": 0.04144474118947983, "rewards/rejected": -0.017332345247268677, "step": 30 }, { "debug/policy_chosen_logits": 1.8234918117523193, "debug/policy_chosen_logps": -453.6963806152344, "debug/policy_rejected_logits": 1.7467701435089111, "debug/policy_rejected_logps": -419.4752502441406, "debug/reference_chosen_logps": -454.2802734375, "debug/reference_rejected_logps": -419.27642822265625, "epoch": 0.9117647058823529, "grad_norm": 4.432618295795027, "learning_rate": 1e-06, "logits/chosen": 1.8234918117523193, "logits/rejected": 1.7467701435089111, "logps/chosen": -453.6963806152344, "logps/rejected": -419.4752502441406, "loss": 0.4906, "rewards/accuracies": 0.75, "rewards/chosen": 0.005838965997099876, "rewards/margins": 0.007827376946806908, "rewards/rejected": -0.0019884109497070312, "step": 31 }, { "debug/policy_chosen_logits": 1.5167354345321655, "debug/policy_chosen_logps": -412.47796630859375, "debug/policy_rejected_logits": 1.886920690536499, "debug/policy_rejected_logps": -420.974365234375, "debug/reference_chosen_logps": -413.27532958984375, "debug/reference_rejected_logps": -420.81805419921875, "epoch": 0.9411764705882353, "grad_norm": 4.118143439705179, "learning_rate": 1e-06, "logits/chosen": 1.5167354345321655, "logits/rejected": 1.886920690536499, "logps/chosen": -412.47796630859375, "logps/rejected": -420.974365234375, "loss": 0.4916, "rewards/accuracies": 0.875, "rewards/chosen": 0.007973899133503437, "rewards/margins": 0.009537162259221077, "rewards/rejected": -0.0015632628928869963, "step": 32 }, { "debug/policy_chosen_logits": 1.4519771337509155, "debug/policy_chosen_logps": -411.4702453613281, "debug/policy_rejected_logits": 1.562813401222229, "debug/policy_rejected_logps": -458.34564208984375, "debug/reference_chosen_logps": -411.874755859375, "debug/reference_rejected_logps": -457.6955871582031, "epoch": 0.9705882352941176, "grad_norm": 16.71508641927412, "learning_rate": 1e-06, "logits/chosen": 1.4519771337509155, "logits/rejected": 1.562813401222229, "logps/chosen": -411.4702453613281, "logps/rejected": -458.34564208984375, "loss": 0.4945, "rewards/accuracies": 0.625, "rewards/chosen": 0.004045028239488602, "rewards/margins": 0.010545616038143635, "rewards/rejected": -0.006500587798655033, "step": 33 }, { "debug/policy_chosen_logits": 1.642136573791504, "debug/policy_chosen_logps": -465.1538391113281, "debug/policy_rejected_logits": 1.6247493028640747, "debug/policy_rejected_logps": -454.161376953125, "debug/reference_chosen_logps": -464.42230224609375, "debug/reference_rejected_logps": -453.6876220703125, "epoch": 1.0, "grad_norm": 5.047352534944459, "learning_rate": 1e-06, "logits/chosen": 1.642136573791504, "logits/rejected": 1.6247493028640747, "logps/chosen": -465.1538391113281, "logps/rejected": -454.161376953125, "loss": 0.4703, "rewards/accuracies": 0.5, "rewards/chosen": -0.0073150633834302425, "rewards/margins": -0.002577248029410839, "rewards/rejected": -0.004737815819680691, "step": 34 }, { "epoch": 1.0, "step": 34, "total_flos": 0.0, "train_loss": 0.49640727744382973, "train_runtime": 426.3178, "train_samples_per_second": 5.06, "train_steps_per_second": 0.08 } ], "logging_steps": 1, "max_steps": 34, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }