{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.1076923076923078, "eval_steps": 5, "global_step": 90, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024615384615384615, "grad_norm": 76.71143242842001, "learning_rate": 5.88235294117647e-08, "logits/chosen": -0.8526538610458374, "logits/rejected": -0.8570448756217957, "logps/chosen": -23.282678604125977, "logps/rejected": -30.289661407470703, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.04923076923076923, "grad_norm": 83.48860140376064, "learning_rate": 1.176470588235294e-07, "logits/chosen": -0.8658735752105713, "logits/rejected": -0.8668463826179504, "logps/chosen": -30.029348373413086, "logps/rejected": -32.66902160644531, "loss": 0.6971, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0059752073138952255, "rewards/margins": -0.010342838242650032, "rewards/rejected": 0.016318045556545258, "step": 4 }, { "epoch": 0.06153846153846154, "eval_logits/chosen": -0.8142690658569336, "eval_logits/rejected": -0.8192334771156311, "eval_logps/chosen": -29.028133392333984, "eval_logps/rejected": -38.206329345703125, "eval_loss": 0.6946294903755188, "eval_rewards/accuracies": 0.47602739930152893, "eval_rewards/chosen": -0.0063199191354215145, "eval_rewards/margins": 0.0024889421183615923, "eval_rewards/rejected": -0.00880886148661375, "eval_runtime": 507.7462, "eval_samples_per_second": 3.415, "eval_steps_per_second": 0.144, "step": 5 }, { "epoch": 0.07384615384615385, "grad_norm": 69.27587032017023, "learning_rate": 1.764705882352941e-07, "logits/chosen": -0.8687959909439087, "logits/rejected": -0.8719169497489929, "logps/chosen": -32.97465133666992, "logps/rejected": -38.870269775390625, "loss": 0.6921, "rewards/accuracies": 0.453125, "rewards/chosen": 0.006213514134287834, "rewards/margins": 0.019537178799510002, "rewards/rejected": -0.013323664665222168, "step": 6 }, { "epoch": 0.09846153846153846, "grad_norm": 62.33035519033225, "learning_rate": 2.352941176470588e-07, "logits/chosen": -0.8589476346969604, "logits/rejected": -0.8617635369300842, "logps/chosen": -32.01108932495117, "logps/rejected": -34.94812774658203, "loss": 0.6928, "rewards/accuracies": 0.515625, "rewards/chosen": -0.01385729480534792, "rewards/margins": -8.035916835069656e-05, "rewards/rejected": -0.013776935636997223, "step": 8 }, { "epoch": 0.12307692307692308, "grad_norm": 57.3164290353471, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.8420453071594238, "logits/rejected": -0.8434039354324341, "logps/chosen": -30.150243759155273, "logps/rejected": -33.6817626953125, "loss": 0.6827, "rewards/accuracies": 0.484375, "rewards/chosen": -0.021684179082512856, "rewards/margins": -0.011505719274282455, "rewards/rejected": -0.0101784598082304, "step": 10 }, { "epoch": 0.12307692307692308, "eval_logits/chosen": -0.8148170709609985, "eval_logits/rejected": -0.8197983503341675, "eval_logps/chosen": -29.04338264465332, "eval_logps/rejected": -38.422237396240234, "eval_loss": 0.6693353056907654, "eval_rewards/accuracies": 0.5650684833526611, "eval_rewards/chosen": -0.013945668935775757, "eval_rewards/margins": 0.10281600803136826, "eval_rewards/rejected": -0.11676166206598282, "eval_runtime": 507.283, "eval_samples_per_second": 3.418, "eval_steps_per_second": 0.144, "step": 10 }, { "epoch": 0.1476923076923077, "grad_norm": 54.695358572076394, "learning_rate": 3.529411764705882e-07, "logits/chosen": -0.8680934906005859, "logits/rejected": -0.8719905614852905, "logps/chosen": -28.021072387695312, "logps/rejected": -35.96692657470703, "loss": 0.6666, "rewards/accuracies": 0.5625, "rewards/chosen": -0.036122702062129974, "rewards/margins": 0.10737781226634979, "rewards/rejected": -0.14350052177906036, "step": 12 }, { "epoch": 0.1723076923076923, "grad_norm": 52.51352144095569, "learning_rate": 4.117647058823529e-07, "logits/chosen": -0.8596158027648926, "logits/rejected": -0.8575820922851562, "logps/chosen": -25.149404525756836, "logps/rejected": -24.465974807739258, "loss": 0.6357, "rewards/accuracies": 0.59375, "rewards/chosen": -0.029037628322839737, "rewards/margins": 0.0849144458770752, "rewards/rejected": -0.11395206302404404, "step": 14 }, { "epoch": 0.18461538461538463, "eval_logits/chosen": -0.8164225816726685, "eval_logits/rejected": -0.8213981986045837, "eval_logps/chosen": -29.094459533691406, "eval_logps/rejected": -39.09992599487305, "eval_loss": 0.6130890846252441, "eval_rewards/accuracies": 0.6575342416763306, "eval_rewards/chosen": -0.039482928812503815, "eval_rewards/margins": 0.4161252975463867, "eval_rewards/rejected": -0.4556082487106323, "eval_runtime": 506.4372, "eval_samples_per_second": 3.424, "eval_steps_per_second": 0.144, "step": 15 }, { "epoch": 0.19692307692307692, "grad_norm": 47.087567227271336, "learning_rate": 4.705882352941176e-07, "logits/chosen": -0.8515424728393555, "logits/rejected": -0.858619213104248, "logps/chosen": -21.42935562133789, "logps/rejected": -35.25102233886719, "loss": 0.6054, "rewards/accuracies": 0.625, "rewards/chosen": -0.047799061983823776, "rewards/margins": 0.34381866455078125, "rewards/rejected": -0.39161768555641174, "step": 16 }, { "epoch": 0.22153846153846155, "grad_norm": 45.639809270904706, "learning_rate": 4.99941324504621e-07, "logits/chosen": -0.8649481534957886, "logits/rejected": -0.8680992126464844, "logps/chosen": -26.61380386352539, "logps/rejected": -37.828311920166016, "loss": 0.5893, "rewards/accuracies": 0.75, "rewards/chosen": -0.06860450655221939, "rewards/margins": 0.5221824645996094, "rewards/rejected": -0.5907869338989258, "step": 18 }, { "epoch": 0.24615384615384617, "grad_norm": 42.712271689759746, "learning_rate": 4.99472085783721e-07, "logits/chosen": -0.8538618087768555, "logits/rejected": -0.8574154376983643, "logps/chosen": -26.50242805480957, "logps/rejected": -36.465850830078125, "loss": 0.5349, "rewards/accuracies": 0.796875, "rewards/chosen": -0.20561164617538452, "rewards/margins": 0.5988239645957947, "rewards/rejected": -0.8044356107711792, "step": 20 }, { "epoch": 0.24615384615384617, "eval_logits/chosen": -0.8193703889846802, "eval_logits/rejected": -0.8243635892868042, "eval_logps/chosen": -29.048322677612305, "eval_logps/rejected": -40.16524124145508, "eval_loss": 0.5301748514175415, "eval_rewards/accuracies": 0.7123287916183472, "eval_rewards/chosen": -0.016416184604167938, "eval_rewards/margins": 0.9718519449234009, "eval_rewards/rejected": -0.9882679581642151, "eval_runtime": 501.8283, "eval_samples_per_second": 3.455, "eval_steps_per_second": 0.145, "step": 20 }, { "epoch": 0.27076923076923076, "grad_norm": 34.55903445927571, "learning_rate": 4.985344892885899e-07, "logits/chosen": -0.8641754984855652, "logits/rejected": -0.8657775521278381, "logps/chosen": -29.697185516357422, "logps/rejected": -37.21875, "loss": 0.5547, "rewards/accuracies": 0.71875, "rewards/chosen": -0.11109799891710281, "rewards/margins": 0.7374492287635803, "rewards/rejected": -0.8485472202301025, "step": 22 }, { "epoch": 0.2953846153846154, "grad_norm": 34.94086237660157, "learning_rate": 4.971302952586796e-07, "logits/chosen": -0.8593113422393799, "logits/rejected": -0.8664268255233765, "logps/chosen": -28.04953384399414, "logps/rejected": -47.995513916015625, "loss": 0.4738, "rewards/accuracies": 0.84375, "rewards/chosen": -0.05936732143163681, "rewards/margins": 1.61855947971344, "rewards/rejected": -1.6779268980026245, "step": 24 }, { "epoch": 0.3076923076923077, "eval_logits/chosen": -0.8194184899330139, "eval_logits/rejected": -0.8244072794914246, "eval_logps/chosen": -28.87408447265625, "eval_logps/rejected": -41.07496643066406, "eval_loss": 0.47070130705833435, "eval_rewards/accuracies": 0.715753436088562, "eval_rewards/chosen": 0.07070425897836685, "eval_rewards/margins": 1.513830304145813, "eval_rewards/rejected": -1.4431262016296387, "eval_runtime": 505.6197, "eval_samples_per_second": 3.429, "eval_steps_per_second": 0.144, "step": 25 }, { "epoch": 0.32, "grad_norm": 36.047711342996145, "learning_rate": 4.952621399215597e-07, "logits/chosen": -0.8540530204772949, "logits/rejected": -0.8563276529312134, "logps/chosen": -25.321256637573242, "logps/rejected": -26.884302139282227, "loss": 0.4982, "rewards/accuracies": 0.75, "rewards/chosen": 0.013541080057621002, "rewards/margins": 0.5806317329406738, "rewards/rejected": -0.5670906901359558, "step": 26 }, { "epoch": 0.3446153846153846, "grad_norm": 34.93615246503423, "learning_rate": 4.929335305436764e-07, "logits/chosen": -0.8274993300437927, "logits/rejected": -0.8279544115066528, "logps/chosen": -20.29788589477539, "logps/rejected": -38.79467010498047, "loss": 0.4679, "rewards/accuracies": 0.859375, "rewards/chosen": -0.06851354986429214, "rewards/margins": 1.8159857988357544, "rewards/rejected": -1.8844993114471436, "step": 28 }, { "epoch": 0.36923076923076925, "grad_norm": 34.651174705720734, "learning_rate": 4.901488388458247e-07, "logits/chosen": -0.8323720693588257, "logits/rejected": -0.8362505435943604, "logps/chosen": -23.299272537231445, "logps/rejected": -31.706920623779297, "loss": 0.4411, "rewards/accuracies": 0.703125, "rewards/chosen": -0.14743047952651978, "rewards/margins": 1.210200548171997, "rewards/rejected": -1.3576310873031616, "step": 30 }, { "epoch": 0.36923076923076925, "eval_logits/chosen": -0.8136431574821472, "eval_logits/rejected": -0.8185814023017883, "eval_logps/chosen": -28.679155349731445, "eval_logps/rejected": -42.067020416259766, "eval_loss": 0.40313246846199036, "eval_rewards/accuracies": 0.7465753555297852, "eval_rewards/chosen": 0.16816774010658264, "eval_rewards/margins": 2.1073246002197266, "eval_rewards/rejected": -1.9391568899154663, "eval_runtime": 511.2322, "eval_samples_per_second": 3.392, "eval_steps_per_second": 0.143, "step": 30 }, { "epoch": 0.39384615384615385, "grad_norm": 28.222143999842437, "learning_rate": 4.869132927957006e-07, "logits/chosen": -0.8474501371383667, "logits/rejected": -0.8529994487762451, "logps/chosen": -27.735734939575195, "logps/rejected": -40.067569732666016, "loss": 0.3871, "rewards/accuracies": 0.828125, "rewards/chosen": 0.20432481169700623, "rewards/margins": 2.164912462234497, "rewards/rejected": -1.960587739944458, "step": 32 }, { "epoch": 0.41846153846153844, "grad_norm": 23.28549229020256, "learning_rate": 4.832329667929376e-07, "logits/chosen": -0.8629348278045654, "logits/rejected": -0.8702473640441895, "logps/chosen": -20.325801849365234, "logps/rejected": -44.433528900146484, "loss": 0.3497, "rewards/accuracies": 0.921875, "rewards/chosen": 0.10242755711078644, "rewards/margins": 2.8072259426116943, "rewards/rejected": -2.7047982215881348, "step": 34 }, { "epoch": 0.4307692307692308, "eval_logits/chosen": -0.8100441098213196, "eval_logits/rejected": -0.8149632811546326, "eval_logps/chosen": -28.504199981689453, "eval_logps/rejected": -42.80283737182617, "eval_loss": 0.354584664106369, "eval_rewards/accuracies": 0.7739726305007935, "eval_rewards/chosen": 0.25564688444137573, "eval_rewards/margins": 2.5627098083496094, "eval_rewards/rejected": -2.307062864303589, "eval_runtime": 507.4648, "eval_samples_per_second": 3.417, "eval_steps_per_second": 0.144, "step": 35 }, { "epoch": 0.4430769230769231, "grad_norm": 28.694201633510016, "learning_rate": 4.791147702650565e-07, "logits/chosen": -0.8724276423454285, "logits/rejected": -0.8785867691040039, "logps/chosen": -22.125022888183594, "logps/rejected": -39.55330276489258, "loss": 0.3713, "rewards/accuracies": 0.828125, "rewards/chosen": 0.26476141810417175, "rewards/margins": 2.4366393089294434, "rewards/rejected": -2.171877861022949, "step": 36 }, { "epoch": 0.4676923076923077, "grad_norm": 25.53720719902276, "learning_rate": 4.745664346957361e-07, "logits/chosen": -0.8554552793502808, "logits/rejected": -0.8553615808486938, "logps/chosen": -33.38023376464844, "logps/rejected": -28.513282775878906, "loss": 0.3246, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2387702763080597, "rewards/margins": 1.5690217018127441, "rewards/rejected": -1.3302514553070068, "step": 38 }, { "epoch": 0.49230769230769234, "grad_norm": 21.696391543968662, "learning_rate": 4.695964991097616e-07, "logits/chosen": -0.8336246013641357, "logits/rejected": -0.8416473269462585, "logps/chosen": -22.365100860595703, "logps/rejected": -47.882694244384766, "loss": 0.346, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2181028425693512, "rewards/margins": 3.067445993423462, "rewards/rejected": -2.8493428230285645, "step": 40 }, { "epoch": 0.49230769230769234, "eval_logits/chosen": -0.8083286285400391, "eval_logits/rejected": -0.8132520914077759, "eval_logps/chosen": -28.318235397338867, "eval_logps/rejected": -43.32075881958008, "eval_loss": 0.3231227993965149, "eval_rewards/accuracies": 0.801369845867157, "eval_rewards/chosen": 0.3486267626285553, "eval_rewards/margins": 2.9146482944488525, "eval_rewards/rejected": -2.56602144241333, "eval_runtime": 507.9764, "eval_samples_per_second": 3.414, "eval_steps_per_second": 0.144, "step": 40 }, { "epoch": 0.5169230769230769, "grad_norm": 21.517342705892485, "learning_rate": 4.642142940418973e-07, "logits/chosen": -0.8719685077667236, "logits/rejected": -0.8759061694145203, "logps/chosen": -23.35956573486328, "logps/rejected": -28.740066528320312, "loss": 0.3379, "rewards/accuracies": 0.765625, "rewards/chosen": 0.2940492033958435, "rewards/margins": 1.586828589439392, "rewards/rejected": -1.2927793264389038, "step": 42 }, { "epoch": 0.5415384615384615, "grad_norm": 22.644901266797667, "learning_rate": 4.5842992401978256e-07, "logits/chosen": -0.8451048135757446, "logits/rejected": -0.847484827041626, "logps/chosen": -33.87168884277344, "logps/rejected": -46.649681091308594, "loss": 0.3148, "rewards/accuracies": 0.859375, "rewards/chosen": 0.3618454933166504, "rewards/margins": 3.3943986892700195, "rewards/rejected": -3.0325536727905273, "step": 44 }, { "epoch": 0.5538461538461539, "eval_logits/chosen": -0.8085425496101379, "eval_logits/rejected": -0.8135057687759399, "eval_logps/chosen": -28.0711727142334, "eval_logps/rejected": -43.83769226074219, "eval_loss": 0.2965641915798187, "eval_rewards/accuracies": 0.8082191944122314, "eval_rewards/chosen": 0.472160279750824, "eval_rewards/margins": 3.296651840209961, "eval_rewards/rejected": -2.824491500854492, "eval_runtime": 508.0249, "eval_samples_per_second": 3.413, "eval_steps_per_second": 0.144, "step": 45 }, { "epoch": 0.5661538461538461, "grad_norm": 20.742372238934433, "learning_rate": 4.5225424859373684e-07, "logits/chosen": -0.8579592108726501, "logits/rejected": -0.8616234064102173, "logps/chosen": -21.818204879760742, "logps/rejected": -30.142559051513672, "loss": 0.2855, "rewards/accuracies": 0.78125, "rewards/chosen": 0.27730488777160645, "rewards/margins": 1.6132526397705078, "rewards/rejected": -1.335947871208191, "step": 46 }, { "epoch": 0.5907692307692308, "grad_norm": 23.021619175104203, "learning_rate": 4.456988619490889e-07, "logits/chosen": -0.8618453741073608, "logits/rejected": -0.8655754923820496, "logps/chosen": -22.56821060180664, "logps/rejected": -36.07129669189453, "loss": 0.2874, "rewards/accuracies": 0.859375, "rewards/chosen": 0.2860993444919586, "rewards/margins": 2.503146171569824, "rewards/rejected": -2.2170469760894775, "step": 48 }, { "epoch": 0.6153846153846154, "grad_norm": 19.42806196152356, "learning_rate": 4.3877607113930516e-07, "logits/chosen": -0.8602553009986877, "logits/rejected": -0.8559304475784302, "logps/chosen": -29.165634155273438, "logps/rejected": -24.462434768676758, "loss": 0.2644, "rewards/accuracies": 0.875, "rewards/chosen": 0.5700158476829529, "rewards/margins": 2.1351168155670166, "rewards/rejected": -1.5651010274887085, "step": 50 }, { "epoch": 0.6153846153846154, "eval_logits/chosen": -0.812079131603241, "eval_logits/rejected": -0.8171290159225464, "eval_logps/chosen": -27.80472183227539, "eval_logps/rejected": -44.38835906982422, "eval_loss": 0.27530089020729065, "eval_rewards/accuracies": 0.8047945499420166, "eval_rewards/chosen": 0.6053856015205383, "eval_rewards/margins": 3.7052102088928223, "eval_rewards/rejected": -3.0998241901397705, "eval_runtime": 507.1773, "eval_samples_per_second": 3.419, "eval_steps_per_second": 0.144, "step": 50 }, { "epoch": 0.64, "grad_norm": 19.235430940038626, "learning_rate": 4.314988729807827e-07, "logits/chosen": -0.877213716506958, "logits/rejected": -0.8788630366325378, "logps/chosen": -35.67780685424805, "logps/rejected": -39.79027557373047, "loss": 0.261, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6618055701255798, "rewards/margins": 2.99965763092041, "rewards/rejected": -2.3378520011901855, "step": 52 }, { "epoch": 0.6646153846153846, "grad_norm": 18.497275752898116, "learning_rate": 4.238809296526846e-07, "logits/chosen": -0.872668445110321, "logits/rejected": -0.8716113567352295, "logps/chosen": -30.384735107421875, "logps/rejected": -35.33108139038086, "loss": 0.2597, "rewards/accuracies": 0.875, "rewards/chosen": 0.40745919942855835, "rewards/margins": 2.814540386199951, "rewards/rejected": -2.407081365585327, "step": 54 }, { "epoch": 0.676923076923077, "eval_logits/chosen": -0.817668616771698, "eval_logits/rejected": -0.8228325247764587, "eval_logps/chosen": -27.571548461914062, "eval_logps/rejected": -44.867122650146484, "eval_loss": 0.2588183581829071, "eval_rewards/accuracies": 0.8047945499420166, "eval_rewards/chosen": 0.7219717502593994, "eval_rewards/margins": 4.061178207397461, "eval_rewards/rejected": -3.3392062187194824, "eval_runtime": 512.6554, "eval_samples_per_second": 3.382, "eval_steps_per_second": 0.142, "step": 55 }, { "epoch": 0.6892307692307692, "grad_norm": 19.164588052012302, "learning_rate": 4.159365430476261e-07, "logits/chosen": -0.8461377024650574, "logits/rejected": -0.8481893539428711, "logps/chosen": -23.498355865478516, "logps/rejected": -27.572532653808594, "loss": 0.254, "rewards/accuracies": 0.828125, "rewards/chosen": 0.4778762459754944, "rewards/margins": 2.123842716217041, "rewards/rejected": -1.645966649055481, "step": 56 }, { "epoch": 0.7138461538461538, "grad_norm": 18.695688461758973, "learning_rate": 4.076806279213655e-07, "logits/chosen": -0.8788058757781982, "logits/rejected": -0.8795627951622009, "logps/chosen": -27.387178421020508, "logps/rejected": -24.94676971435547, "loss": 0.2539, "rewards/accuracies": 0.84375, "rewards/chosen": 0.39298391342163086, "rewards/margins": 2.3424487113952637, "rewards/rejected": -1.9494649171829224, "step": 58 }, { "epoch": 0.7384615384615385, "grad_norm": 13.869174391468711, "learning_rate": 3.991286838919086e-07, "logits/chosen": -0.8662111163139343, "logits/rejected": -0.8639529943466187, "logps/chosen": -29.126564025878906, "logps/rejected": -26.93923568725586, "loss": 0.2163, "rewards/accuracies": 0.828125, "rewards/chosen": 0.5371630787849426, "rewards/margins": 2.0503089427948, "rewards/rejected": -1.5131456851959229, "step": 60 }, { "epoch": 0.7384615384615385, "eval_logits/chosen": -0.8232029676437378, "eval_logits/rejected": -0.8284507989883423, "eval_logps/chosen": -27.39695167541504, "eval_logps/rejected": -45.318092346191406, "eval_loss": 0.24679508805274963, "eval_rewards/accuracies": 0.801369845867157, "eval_rewards/chosen": 0.8092703819274902, "eval_rewards/margins": 4.373960494995117, "eval_rewards/rejected": -3.5646896362304688, "eval_runtime": 504.2778, "eval_samples_per_second": 3.439, "eval_steps_per_second": 0.145, "step": 60 }, { "epoch": 0.7630769230769231, "grad_norm": 19.770687722416966, "learning_rate": 3.902967663405956e-07, "logits/chosen": -0.8548551201820374, "logits/rejected": -0.8548004031181335, "logps/chosen": -23.15505599975586, "logps/rejected": -25.987648010253906, "loss": 0.2459, "rewards/accuracies": 0.828125, "rewards/chosen": 0.9548823237419128, "rewards/margins": 2.6681408882141113, "rewards/rejected": -1.7132583856582642, "step": 62 }, { "epoch": 0.7876923076923077, "grad_norm": 17.375432493553873, "learning_rate": 3.8120145626980015e-07, "logits/chosen": -0.837457537651062, "logits/rejected": -0.8325684070587158, "logps/chosen": -28.196165084838867, "logps/rejected": -29.63174819946289, "loss": 0.226, "rewards/accuracies": 0.75, "rewards/chosen": 0.5275837779045105, "rewards/margins": 2.420928716659546, "rewards/rejected": -1.8933448791503906, "step": 64 }, { "epoch": 0.8, "eval_logits/chosen": -0.8285362124443054, "eval_logits/rejected": -0.8338667750358582, "eval_logps/chosen": -27.281187057495117, "eval_logps/rejected": -45.72236251831055, "eval_loss": 0.2386258840560913, "eval_rewards/accuracies": 0.8082191944122314, "eval_rewards/chosen": 0.8671532869338989, "eval_rewards/margins": 4.633976936340332, "eval_rewards/rejected": -3.7668235301971436, "eval_runtime": 504.9865, "eval_samples_per_second": 3.434, "eval_steps_per_second": 0.145, "step": 65 }, { "epoch": 0.8123076923076923, "grad_norm": 16.23920440465993, "learning_rate": 3.718598291738298e-07, "logits/chosen": -0.8503263592720032, "logits/rejected": -0.8503541350364685, "logps/chosen": -26.526954650878906, "logps/rejected": -33.03715133666992, "loss": 0.2399, "rewards/accuracies": 0.828125, "rewards/chosen": 0.8604952096939087, "rewards/margins": 3.0948374271392822, "rewards/rejected": -2.234342336654663, "step": 66 }, { "epoch": 0.8369230769230769, "grad_norm": 16.54810979887679, "learning_rate": 3.622894229814698e-07, "logits/chosen": -0.8387259840965271, "logits/rejected": -0.8485996127128601, "logps/chosen": -19.66301155090332, "logps/rejected": -59.002899169921875, "loss": 0.2031, "rewards/accuracies": 0.796875, "rewards/chosen": 0.6494669318199158, "rewards/margins": 6.0868682861328125, "rewards/rejected": -5.43740177154541, "step": 68 }, { "epoch": 0.8615384615384616, "grad_norm": 14.045977696001755, "learning_rate": 3.52508205130354e-07, "logits/chosen": -0.8623999953269958, "logits/rejected": -0.8682447075843811, "logps/chosen": -26.10405158996582, "logps/rejected": -45.96481704711914, "loss": 0.1936, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3354326486587524, "rewards/margins": 5.366407871246338, "rewards/rejected": -4.030975341796875, "step": 70 }, { "epoch": 0.8615384615384616, "eval_logits/chosen": -0.8313784003257751, "eval_logits/rejected": -0.83674556016922, "eval_logps/chosen": -27.231040954589844, "eval_logps/rejected": -46.05835723876953, "eval_loss": 0.23013119399547577, "eval_rewards/accuracies": 0.8116438388824463, "eval_rewards/chosen": 0.8922267556190491, "eval_rewards/margins": 4.827047824859619, "eval_rewards/rejected": -3.9348206520080566, "eval_runtime": 505.5211, "eval_samples_per_second": 3.43, "eval_steps_per_second": 0.144, "step": 70 }, { "epoch": 0.8861538461538462, "grad_norm": 20.76852884591744, "learning_rate": 3.4253453883497864e-07, "logits/chosen": -0.8721504211425781, "logits/rejected": -0.8774159550666809, "logps/chosen": -27.097929000854492, "logps/rejected": -44.38412094116211, "loss": 0.2111, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6299474239349365, "rewards/margins": 4.718615531921387, "rewards/rejected": -4.088667869567871, "step": 72 }, { "epoch": 0.9107692307692308, "grad_norm": 21.511447954249494, "learning_rate": 3.323871486116851e-07, "logits/chosen": -0.8902064561843872, "logits/rejected": -0.8880136013031006, "logps/chosen": -26.3861141204834, "logps/rejected": -28.770051956176758, "loss": 0.2491, "rewards/accuracies": 0.890625, "rewards/chosen": 0.5782642364501953, "rewards/margins": 2.829862594604492, "rewards/rejected": -2.251598358154297, "step": 74 }, { "epoch": 0.9230769230769231, "eval_logits/chosen": -0.8325244188308716, "eval_logits/rejected": -0.8379253149032593, "eval_logps/chosen": -27.20527458190918, "eval_logps/rejected": -46.30801010131836, "eval_loss": 0.22474366426467896, "eval_rewards/accuracies": 0.8150684833526611, "eval_rewards/chosen": 0.9051090478897095, "eval_rewards/margins": 4.96475887298584, "eval_rewards/rejected": -4.05964994430542, "eval_runtime": 519.4473, "eval_samples_per_second": 3.338, "eval_steps_per_second": 0.141, "step": 75 }, { "epoch": 0.9353846153846154, "grad_norm": 15.058617576147917, "learning_rate": 3.220850851253377e-07, "logits/chosen": -0.8858978748321533, "logits/rejected": -0.8874486684799194, "logps/chosen": -26.12487030029297, "logps/rejected": -37.51970672607422, "loss": 0.2158, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6246981024742126, "rewards/margins": 3.997086763381958, "rewards/rejected": -3.372389316558838, "step": 76 }, { "epoch": 0.96, "grad_norm": 18.385729369704148, "learning_rate": 3.1164768942369053e-07, "logits/chosen": -0.8896593451499939, "logits/rejected": -0.8969188332557678, "logps/chosen": -16.990447998046875, "logps/rejected": -45.557037353515625, "loss": 0.2207, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7743335366249084, "rewards/margins": 4.943416118621826, "rewards/rejected": -4.1690826416015625, "step": 78 }, { "epoch": 0.9846153846153847, "grad_norm": 17.268106016496986, "learning_rate": 3.010945566265912e-07, "logits/chosen": -0.8736187219619751, "logits/rejected": -0.876487672328949, "logps/chosen": -28.458038330078125, "logps/rejected": -34.158485412597656, "loss": 0.2315, "rewards/accuracies": 0.875, "rewards/chosen": 0.3357160687446594, "rewards/margins": 3.1702334880828857, "rewards/rejected": -2.834517478942871, "step": 80 }, { "epoch": 0.9846153846153847, "eval_logits/chosen": -0.8342627882957458, "eval_logits/rejected": -0.8397004008293152, "eval_logps/chosen": -27.177371978759766, "eval_logps/rejected": -46.54536437988281, "eval_loss": 0.21979747712612152, "eval_rewards/accuracies": 0.8287671208381653, "eval_rewards/chosen": 0.9190611243247986, "eval_rewards/margins": 5.097388744354248, "eval_rewards/rejected": -4.178327560424805, "eval_runtime": 503.3189, "eval_samples_per_second": 3.445, "eval_steps_per_second": 0.145, "step": 80 }, { "epoch": 1.0092307692307692, "grad_norm": 28.182829085656643, "learning_rate": 2.9044549913819124e-07, "logits/chosen": -0.8684386014938354, "logits/rejected": -0.8742809891700745, "logps/chosen": -25.187530517578125, "logps/rejected": -54.615386962890625, "loss": 0.2079, "rewards/accuracies": 0.890625, "rewards/chosen": 0.7747483849525452, "rewards/margins": 6.844926834106445, "rewards/rejected": -6.070178985595703, "step": 82 }, { "epoch": 1.0338461538461539, "grad_norm": 13.964979005113792, "learning_rate": 2.797205094512266e-07, "logits/chosen": -0.8816483020782471, "logits/rejected": -0.8857114315032959, "logps/chosen": -28.65145492553711, "logps/rejected": -51.351768493652344, "loss": 0.1902, "rewards/accuracies": 0.859375, "rewards/chosen": 1.2536072731018066, "rewards/margins": 6.389583587646484, "rewards/rejected": -5.1359758377075195, "step": 84 }, { "epoch": 1.0461538461538462, "eval_logits/chosen": -0.8372805714607239, "eval_logits/rejected": -0.8427817225456238, "eval_logps/chosen": -27.166318893432617, "eval_logps/rejected": -46.72504806518555, "eval_loss": 0.21650995314121246, "eval_rewards/accuracies": 0.818493127822876, "eval_rewards/chosen": 0.924587070941925, "eval_rewards/margins": 5.192756652832031, "eval_rewards/rejected": -4.268169403076172, "eval_runtime": 500.3676, "eval_samples_per_second": 3.465, "eval_steps_per_second": 0.146, "step": 85 }, { "epoch": 1.0584615384615386, "grad_norm": 7.945903874330553, "learning_rate": 2.6893972261320264e-07, "logits/chosen": -0.8936392068862915, "logits/rejected": -0.8996983170509338, "logps/chosen": -22.51564598083496, "logps/rejected": -45.285972595214844, "loss": 0.1474, "rewards/accuracies": 0.921875, "rewards/chosen": 1.0068135261535645, "rewards/margins": 4.962003231048584, "rewards/rejected": -3.9551897048950195, "step": 86 }, { "epoch": 1.083076923076923, "grad_norm": 12.2186980926015, "learning_rate": 2.5812337842494516e-07, "logits/chosen": -0.8699577450752258, "logits/rejected": -0.8725451827049255, "logps/chosen": -29.06435203552246, "logps/rejected": -37.989776611328125, "loss": 0.1718, "rewards/accuracies": 0.9375, "rewards/chosen": 0.28801125288009644, "rewards/margins": 3.8652138710021973, "rewards/rejected": -3.577202796936035, "step": 88 }, { "epoch": 1.1076923076923078, "grad_norm": 10.256960594018043, "learning_rate": 2.4729178344249006e-07, "logits/chosen": -0.9174846410751343, "logits/rejected": -0.9205011129379272, "logps/chosen": -29.678579330444336, "logps/rejected": -41.8637580871582, "loss": 0.168, "rewards/accuracies": 0.875, "rewards/chosen": 0.9668111801147461, "rewards/margins": 5.104186058044434, "rewards/rejected": -4.1373748779296875, "step": 90 }, { "epoch": 1.1076923076923078, "eval_logits/chosen": -0.840661346912384, "eval_logits/rejected": -0.8462072014808655, "eval_logps/chosen": -27.118879318237305, "eval_logps/rejected": -46.88822555541992, "eval_loss": 0.21390603482723236, "eval_rewards/accuracies": 0.818493127822876, "eval_rewards/chosen": 0.9483062624931335, "eval_rewards/margins": 5.298061847686768, "eval_rewards/rejected": -4.349754810333252, "eval_runtime": 504.3162, "eval_samples_per_second": 3.438, "eval_steps_per_second": 0.145, "step": 90 } ], "logging_steps": 2, "max_steps": 162, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }