{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984168865435357,
"eval_steps": 400,
"global_step": 473,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021108179419525065,
"grad_norm": 3.841525938161017,
"learning_rate": 1.0416666666666666e-08,
"logits/chosen": -1.5679885149002075,
"logits/rejected": -1.4838868379592896,
"logps/chosen": -273.748046875,
"logps/rejected": -278.32440185546875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.010554089709762533,
"grad_norm": 4.075044604292173,
"learning_rate": 5.208333333333333e-08,
"logits/chosen": -1.8661268949508667,
"logits/rejected": -1.663633108139038,
"logps/chosen": -259.7994384765625,
"logps/rejected": -272.9507751464844,
"loss": 0.6931,
"rewards/accuracies": 0.4453125,
"rewards/chosen": 0.0006091540562920272,
"rewards/margins": 0.0006048179930076003,
"rewards/rejected": 4.33622335549444e-06,
"step": 5
},
{
"epoch": 0.021108179419525065,
"grad_norm": 3.8938427277220327,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -1.9186642169952393,
"logits/rejected": -1.7813522815704346,
"logps/chosen": -260.3355407714844,
"logps/rejected": -277.6410217285156,
"loss": 0.6933,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.0007230077171698213,
"rewards/margins": -0.0004294753889553249,
"rewards/rejected": 0.0011524828150868416,
"step": 10
},
{
"epoch": 0.0316622691292876,
"grad_norm": 4.232192731720217,
"learning_rate": 1.5624999999999999e-07,
"logits/chosen": -1.9166736602783203,
"logits/rejected": -1.6127517223358154,
"logps/chosen": -262.7110900878906,
"logps/rejected": -288.9376525878906,
"loss": 0.6931,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0007503399974666536,
"rewards/margins": 6.939703598618507e-05,
"rewards/rejected": 0.0006809430196881294,
"step": 15
},
{
"epoch": 0.04221635883905013,
"grad_norm": 4.119849835606016,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -1.8074525594711304,
"logits/rejected": -1.6753528118133545,
"logps/chosen": -288.84808349609375,
"logps/rejected": -297.88995361328125,
"loss": 0.6932,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.0002116250980179757,
"rewards/margins": -0.000452941982075572,
"rewards/rejected": 0.00024131681129802018,
"step": 20
},
{
"epoch": 0.052770448548812667,
"grad_norm": 4.422447549074996,
"learning_rate": 2.604166666666667e-07,
"logits/chosen": -1.8519093990325928,
"logits/rejected": -1.6747506856918335,
"logps/chosen": -276.16290283203125,
"logps/rejected": -283.3067932128906,
"loss": 0.6924,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.0022073048166930676,
"rewards/margins": 0.001611467800103128,
"rewards/rejected": -0.0038187727332115173,
"step": 25
},
{
"epoch": 0.0633245382585752,
"grad_norm": 4.140769853407654,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -1.8203039169311523,
"logits/rejected": -1.6214573383331299,
"logps/chosen": -254.4104461669922,
"logps/rejected": -275.9024353027344,
"loss": 0.6916,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.0038712085224688053,
"rewards/margins": 0.004021945409476757,
"rewards/rejected": -0.00789315439760685,
"step": 30
},
{
"epoch": 0.07387862796833773,
"grad_norm": 4.0748094829519985,
"learning_rate": 3.645833333333333e-07,
"logits/chosen": -1.7195453643798828,
"logits/rejected": -1.5980784893035889,
"logps/chosen": -277.2474060058594,
"logps/rejected": -279.6336364746094,
"loss": 0.6903,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.011106612160801888,
"rewards/margins": 0.005168012343347073,
"rewards/rejected": -0.016274623572826385,
"step": 35
},
{
"epoch": 0.08443271767810026,
"grad_norm": 4.037161343642648,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -1.8530025482177734,
"logits/rejected": -1.6534423828125,
"logps/chosen": -250.5609893798828,
"logps/rejected": -266.48681640625,
"loss": 0.6878,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.013290290720760822,
"rewards/margins": 0.01362483762204647,
"rewards/rejected": -0.026915129274129868,
"step": 40
},
{
"epoch": 0.09498680738786279,
"grad_norm": 4.20201566482073,
"learning_rate": 4.6874999999999996e-07,
"logits/chosen": -1.8621238470077515,
"logits/rejected": -1.7357890605926514,
"logps/chosen": -259.96875,
"logps/rejected": -273.11651611328125,
"loss": 0.6849,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.02979857288300991,
"rewards/margins": 0.019030530005693436,
"rewards/rejected": -0.0488291010260582,
"step": 45
},
{
"epoch": 0.10554089709762533,
"grad_norm": 4.392167523026418,
"learning_rate": 4.999726797933858e-07,
"logits/chosen": -1.9742714166641235,
"logits/rejected": -1.761182188987732,
"logps/chosen": -272.1903381347656,
"logps/rejected": -285.57098388671875,
"loss": 0.6753,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.048685222864151,
"rewards/margins": 0.03682791069149971,
"rewards/rejected": -0.08551312983036041,
"step": 50
},
{
"epoch": 0.11609498680738786,
"grad_norm": 6.0936366972280105,
"learning_rate": 4.99665396039775e-07,
"logits/chosen": -1.9219143390655518,
"logits/rejected": -1.8215105533599854,
"logps/chosen": -269.31439208984375,
"logps/rejected": -276.80401611328125,
"loss": 0.659,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.10569655895233154,
"rewards/margins": 0.0726684108376503,
"rewards/rejected": -0.17836496233940125,
"step": 55
},
{
"epoch": 0.1266490765171504,
"grad_norm": 7.231191310156758,
"learning_rate": 4.99017099386437e-07,
"logits/chosen": -2.0729923248291016,
"logits/rejected": -1.9367930889129639,
"logps/chosen": -298.20849609375,
"logps/rejected": -349.7650146484375,
"loss": 0.6298,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.31456637382507324,
"rewards/margins": 0.3051101565361023,
"rewards/rejected": -0.6196764707565308,
"step": 60
},
{
"epoch": 0.13720316622691292,
"grad_norm": 67.13648614495237,
"learning_rate": 4.980286753286194e-07,
"logits/chosen": -2.2857210636138916,
"logits/rejected": -2.1148781776428223,
"logps/chosen": -369.61749267578125,
"logps/rejected": -430.94732666015625,
"loss": 0.6277,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1958519220352173,
"rewards/margins": 0.49135223031044006,
"rewards/rejected": -1.6872040033340454,
"step": 65
},
{
"epoch": 0.14775725593667546,
"grad_norm": 9.715273109578154,
"learning_rate": 4.967014739346915e-07,
"logits/chosen": -2.3191657066345215,
"logits/rejected": -2.0927023887634277,
"logps/chosen": -352.59075927734375,
"logps/rejected": -438.1763610839844,
"loss": 0.5858,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8432434797286987,
"rewards/margins": 0.6638648509979248,
"rewards/rejected": -1.5071083307266235,
"step": 70
},
{
"epoch": 0.158311345646438,
"grad_norm": 9.799570258257988,
"learning_rate": 4.950373080021136e-07,
"logits/chosen": -2.159883499145508,
"logits/rejected": -2.089489459991455,
"logps/chosen": -327.1300964355469,
"logps/rejected": -372.9543762207031,
"loss": 0.5733,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.625116229057312,
"rewards/margins": 0.409213125705719,
"rewards/rejected": -1.0343292951583862,
"step": 75
},
{
"epoch": 0.16886543535620052,
"grad_norm": 21.779152085184286,
"learning_rate": 4.930384505813737e-07,
"logits/chosen": -2.304996967315674,
"logits/rejected": -2.1810271739959717,
"logps/chosen": -355.3009033203125,
"logps/rejected": -471.39892578125,
"loss": 0.5459,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0548468828201294,
"rewards/margins": 0.9830275774002075,
"rewards/rejected": -2.037874221801758,
"step": 80
},
{
"epoch": 0.17941952506596306,
"grad_norm": 14.56820002316678,
"learning_rate": 4.907076318712738e-07,
"logits/chosen": -2.2340409755706787,
"logits/rejected": -2.080930233001709,
"logps/chosen": -413.451416015625,
"logps/rejected": -522.9191284179688,
"loss": 0.5408,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.4273126125335693,
"rewards/margins": 0.9603279829025269,
"rewards/rejected": -2.3876404762268066,
"step": 85
},
{
"epoch": 0.18997361477572558,
"grad_norm": 15.919341883386638,
"learning_rate": 4.88048035489807e-07,
"logits/chosen": -2.174340009689331,
"logits/rejected": -2.168853998184204,
"logps/chosen": -394.6278076171875,
"logps/rejected": -461.028564453125,
"loss": 0.5463,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.2512483596801758,
"rewards/margins": 0.6085057854652405,
"rewards/rejected": -1.859754204750061,
"step": 90
},
{
"epoch": 0.20052770448548812,
"grad_norm": 23.30417545081651,
"learning_rate": 4.85063294125718e-07,
"logits/chosen": -2.1903815269470215,
"logits/rejected": -2.19649076461792,
"logps/chosen": -459.72283935546875,
"logps/rejected": -530.1971435546875,
"loss": 0.5459,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.8961833715438843,
"rewards/margins": 0.6760674715042114,
"rewards/rejected": -2.5722508430480957,
"step": 95
},
{
"epoch": 0.21108179419525067,
"grad_norm": 11.60980371327302,
"learning_rate": 4.817574845766874e-07,
"logits/chosen": -2.358705997467041,
"logits/rejected": -2.307624340057373,
"logps/chosen": -447.1853942871094,
"logps/rejected": -532.86279296875,
"loss": 0.5137,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9070106744766235,
"rewards/margins": 0.7790099382400513,
"rewards/rejected": -2.686020612716675,
"step": 100
},
{
"epoch": 0.22163588390501318,
"grad_norm": 14.306450146724028,
"learning_rate": 4.781351221809166e-07,
"logits/chosen": -2.2865371704101562,
"logits/rejected": -2.176837921142578,
"logps/chosen": -432.4977111816406,
"logps/rejected": -542.9056396484375,
"loss": 0.5261,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.7739086151123047,
"rewards/margins": 0.9299384951591492,
"rewards/rejected": -2.7038469314575195,
"step": 105
},
{
"epoch": 0.23218997361477572,
"grad_norm": 10.269899188048251,
"learning_rate": 4.742011546497182e-07,
"logits/chosen": -2.2152955532073975,
"logits/rejected": -2.1580278873443604,
"logps/chosen": -439.315185546875,
"logps/rejected": -549.2676391601562,
"loss": 0.494,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.7390915155410767,
"rewards/margins": 0.9396551847457886,
"rewards/rejected": -2.6787467002868652,
"step": 110
},
{
"epoch": 0.24274406332453827,
"grad_norm": 16.644175161757378,
"learning_rate": 4.6996095530953875e-07,
"logits/chosen": -2.3286213874816895,
"logits/rejected": -2.2058520317077637,
"logps/chosen": -506.5923767089844,
"logps/rejected": -658.1654052734375,
"loss": 0.4994,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.3760502338409424,
"rewards/margins": 1.3993351459503174,
"rewards/rejected": -3.7753853797912598,
"step": 115
},
{
"epoch": 0.2532981530343008,
"grad_norm": 18.458409874645245,
"learning_rate": 4.654203157626399e-07,
"logits/chosen": -2.363788366317749,
"logits/rejected": -2.2831900119781494,
"logps/chosen": -476.95831298828125,
"logps/rejected": -650.87841796875,
"loss": 0.4745,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.333024501800537,
"rewards/margins": 1.5511460304260254,
"rewards/rejected": -3.8841705322265625,
"step": 120
},
{
"epoch": 0.2638522427440633,
"grad_norm": 24.30561683820342,
"learning_rate": 4.605854379764673e-07,
"logits/chosen": -2.2180769443511963,
"logits/rejected": -2.1058664321899414,
"logps/chosen": -458.69317626953125,
"logps/rejected": -573.3502807617188,
"loss": 0.4683,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.0290579795837402,
"rewards/margins": 1.0193411111831665,
"rewards/rejected": -3.048398971557617,
"step": 125
},
{
"epoch": 0.27440633245382584,
"grad_norm": 25.019298570271868,
"learning_rate": 4.5546292581250857e-07,
"logits/chosen": -2.2698774337768555,
"logits/rejected": -2.150057554244995,
"logps/chosen": -563.2131958007812,
"logps/rejected": -722.5281372070312,
"loss": 0.4752,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.93915057182312,
"rewards/margins": 1.441446304321289,
"rewards/rejected": -4.380597114562988,
"step": 130
},
{
"epoch": 0.2849604221635884,
"grad_norm": 10.994821669390042,
"learning_rate": 4.5005977600621275e-07,
"logits/chosen": -2.243281841278076,
"logits/rejected": -2.2170357704162598,
"logps/chosen": -536.69970703125,
"logps/rejected": -645.5635986328125,
"loss": 0.4739,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.6128830909729004,
"rewards/margins": 1.0102598667144775,
"rewards/rejected": -3.623142957687378,
"step": 135
},
{
"epoch": 0.2955145118733509,
"grad_norm": 16.90701177792478,
"learning_rate": 4.443833686102919e-07,
"logits/chosen": -2.1392781734466553,
"logits/rejected": -2.0879039764404297,
"logps/chosen": -433.86590576171875,
"logps/rejected": -533.6943359375,
"loss": 0.4645,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.7928444147109985,
"rewards/margins": 0.9198592901229858,
"rewards/rejected": -2.712703227996826,
"step": 140
},
{
"epoch": 0.30606860158311344,
"grad_norm": 23.854657702935985,
"learning_rate": 4.384414569144561e-07,
"logits/chosen": -2.3052217960357666,
"logits/rejected": -2.207017421722412,
"logps/chosen": -529.6088256835938,
"logps/rejected": -723.9100341796875,
"loss": 0.4979,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.7643752098083496,
"rewards/margins": 1.82810378074646,
"rewards/rejected": -4.592479228973389,
"step": 145
},
{
"epoch": 0.316622691292876,
"grad_norm": 19.353784387057143,
"learning_rate": 4.3224215685535287e-07,
"logits/chosen": -2.0858356952667236,
"logits/rejected": -1.950209617614746,
"logps/chosen": -505.2822265625,
"logps/rejected": -661.0929565429688,
"loss": 0.4656,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.391045093536377,
"rewards/margins": 1.4024264812469482,
"rewards/rejected": -3.793471097946167,
"step": 150
},
{
"epoch": 0.32717678100263853,
"grad_norm": 17.72909970129764,
"learning_rate": 4.2579393593117364e-07,
"logits/chosen": -2.0300238132476807,
"logits/rejected": -1.9049923419952393,
"logps/chosen": -496.39324951171875,
"logps/rejected": -680.350341796875,
"loss": 0.4412,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.427093982696533,
"rewards/margins": 1.5917612314224243,
"rewards/rejected": -4.018855094909668,
"step": 155
},
{
"epoch": 0.33773087071240104,
"grad_norm": 17.1778742252489,
"learning_rate": 4.191056016360699e-07,
"logits/chosen": -2.215439558029175,
"logits/rejected": -2.1087276935577393,
"logps/chosen": -615.9310302734375,
"logps/rejected": -818.6203002929688,
"loss": 0.4622,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.431640148162842,
"rewards/margins": 1.9439836740493774,
"rewards/rejected": -5.37562370300293,
"step": 160
},
{
"epoch": 0.3482849604221636,
"grad_norm": 17.97809867221494,
"learning_rate": 4.121862894301754e-07,
"logits/chosen": -2.0415732860565186,
"logits/rejected": -1.94220769405365,
"logps/chosen": -498.63116455078125,
"logps/rejected": -657.0416259765625,
"loss": 0.479,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.2521657943725586,
"rewards/margins": 1.3823236227035522,
"rewards/rejected": -3.6344895362854004,
"step": 165
},
{
"epoch": 0.35883905013192613,
"grad_norm": 30.072934787327185,
"learning_rate": 4.050454502616667e-07,
"logits/chosen": -2.118239164352417,
"logits/rejected": -2.090146541595459,
"logps/chosen": -526.2330322265625,
"logps/rejected": -668.2966918945312,
"loss": 0.4648,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.6902260780334473,
"rewards/margins": 1.3285554647445679,
"rewards/rejected": -4.0187811851501465,
"step": 170
},
{
"epoch": 0.36939313984168864,
"grad_norm": 29.035498895998003,
"learning_rate": 3.976928376579047e-07,
"logits/chosen": -2.3821628093719482,
"logits/rejected": -2.2632079124450684,
"logps/chosen": -557.0284423828125,
"logps/rejected": -771.6123657226562,
"loss": 0.4449,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.932424545288086,
"rewards/margins": 1.8806273937225342,
"rewards/rejected": -4.813051223754883,
"step": 175
},
{
"epoch": 0.37994722955145116,
"grad_norm": 15.777007984898162,
"learning_rate": 3.9013849440328945e-07,
"logits/chosen": -2.286719560623169,
"logits/rejected": -2.162851095199585,
"logps/chosen": -564.4080200195312,
"logps/rejected": -720.1937255859375,
"loss": 0.467,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.908557415008545,
"rewards/margins": 1.4215553998947144,
"rewards/rejected": -4.330113410949707,
"step": 180
},
{
"epoch": 0.39050131926121373,
"grad_norm": 12.25325652821894,
"learning_rate": 3.8239273882202473e-07,
"logits/chosen": -2.195413589477539,
"logits/rejected": -2.2209365367889404,
"logps/chosen": -495.92938232421875,
"logps/rejected": -645.3634643554688,
"loss": 0.469,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.377912759780884,
"rewards/margins": 1.3328666687011719,
"rewards/rejected": -3.7107791900634766,
"step": 185
},
{
"epoch": 0.40105540897097625,
"grad_norm": 13.405956669044865,
"learning_rate": 3.7446615068452804e-07,
"logits/chosen": -2.128485918045044,
"logits/rejected": -2.0320448875427246,
"logps/chosen": -500.07598876953125,
"logps/rejected": -665.8009643554688,
"loss": 0.4456,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.381704092025757,
"rewards/margins": 1.4976516962051392,
"rewards/rejected": -3.8793559074401855,
"step": 190
},
{
"epoch": 0.41160949868073876,
"grad_norm": 16.531263865887837,
"learning_rate": 3.6636955675673743e-07,
"logits/chosen": -2.1537322998046875,
"logits/rejected": -2.151557207107544,
"logps/chosen": -563.8980102539062,
"logps/rejected": -719.9155883789062,
"loss": 0.4301,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.8693251609802246,
"rewards/margins": 1.4603594541549683,
"rewards/rejected": -4.329684734344482,
"step": 195
},
{
"epoch": 0.42216358839050133,
"grad_norm": 25.62641100404869,
"learning_rate": 3.5811401601205093e-07,
"logits/chosen": -2.1722164154052734,
"logits/rejected": -2.2210490703582764,
"logps/chosen": -547.766845703125,
"logps/rejected": -697.3842163085938,
"loss": 0.4585,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.807692766189575,
"rewards/margins": 1.545601725578308,
"rewards/rejected": -4.353294372558594,
"step": 200
},
{
"epoch": 0.43271767810026385,
"grad_norm": 15.253711310557463,
"learning_rate": 3.497108045260995e-07,
"logits/chosen": -2.0688979625701904,
"logits/rejected": -2.104271173477173,
"logps/chosen": -529.1517333984375,
"logps/rejected": -676.9817504882812,
"loss": 0.4423,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.4731733798980713,
"rewards/margins": 1.4461132287979126,
"rewards/rejected": -3.9192867279052734,
"step": 205
},
{
"epoch": 0.44327176781002636,
"grad_norm": 24.083715768462596,
"learning_rate": 3.411714000749838e-07,
"logits/chosen": -2.2706260681152344,
"logits/rejected": -2.135749340057373,
"logps/chosen": -541.0875854492188,
"logps/rejected": -750.7264404296875,
"loss": 0.4354,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.909759283065796,
"rewards/margins": 1.8729289770126343,
"rewards/rejected": -4.782688140869141,
"step": 210
},
{
"epoch": 0.45382585751978893,
"grad_norm": 35.56607178592358,
"learning_rate": 3.3250746645801287e-07,
"logits/chosen": -2.263277769088745,
"logits/rejected": -2.205223560333252,
"logps/chosen": -608.2554931640625,
"logps/rejected": -830.9841918945312,
"loss": 0.4321,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.611112117767334,
"rewards/margins": 2.0616540908813477,
"rewards/rejected": -5.672766208648682,
"step": 215
},
{
"epoch": 0.46437994722955145,
"grad_norm": 15.718670222248921,
"learning_rate": 3.237308375663571e-07,
"logits/chosen": -2.230881452560425,
"logits/rejected": -2.121683359146118,
"logps/chosen": -576.1820678710938,
"logps/rejected": -769.9691772460938,
"loss": 0.3944,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.302262783050537,
"rewards/margins": 1.794217824935913,
"rewards/rejected": -5.096480369567871,
"step": 220
},
{
"epoch": 0.47493403693931396,
"grad_norm": 19.204923979579966,
"learning_rate": 3.148535012193767e-07,
"logits/chosen": -2.1568782329559326,
"logits/rejected": -2.092639684677124,
"logps/chosen": -615.4882202148438,
"logps/rejected": -833.5153198242188,
"loss": 0.3871,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -3.338430881500244,
"rewards/margins": 2.001122236251831,
"rewards/rejected": -5.339552879333496,
"step": 225
},
{
"epoch": 0.48548812664907653,
"grad_norm": 23.052920344271605,
"learning_rate": 3.0588758279070183e-07,
"logits/chosen": -2.2185590267181396,
"logits/rejected": -2.13350772857666,
"logps/chosen": -622.9224853515625,
"logps/rejected": -836.8287353515625,
"loss": 0.4125,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -3.6510891914367676,
"rewards/margins": 1.8919486999511719,
"rewards/rejected": -5.5430378913879395,
"step": 230
},
{
"epoch": 0.49604221635883905,
"grad_norm": 16.46282996942275,
"learning_rate": 2.968453286464312e-07,
"logits/chosen": -2.097414493560791,
"logits/rejected": -2.146594524383545,
"logps/chosen": -590.5551147460938,
"logps/rejected": -758.9312744140625,
"loss": 0.4164,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.0970864295959473,
"rewards/margins": 1.6357967853546143,
"rewards/rejected": -4.732882976531982,
"step": 235
},
{
"epoch": 0.5065963060686016,
"grad_norm": 26.112494003766066,
"learning_rate": 2.8773908941806877e-07,
"logits/chosen": -2.0698182582855225,
"logits/rejected": -2.076683521270752,
"logps/chosen": -617.1507568359375,
"logps/rejected": -853.2135620117188,
"loss": 0.3982,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.5680668354034424,
"rewards/margins": 2.141855001449585,
"rewards/rejected": -5.709921836853027,
"step": 240
},
{
"epoch": 0.5171503957783641,
"grad_norm": 20.932946542012903,
"learning_rate": 2.785813031330473e-07,
"logits/chosen": -2.1454832553863525,
"logits/rejected": -2.16323184967041,
"logps/chosen": -661.7200317382812,
"logps/rejected": -866.1280517578125,
"loss": 0.4092,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.027346134185791,
"rewards/margins": 1.9131158590316772,
"rewards/rejected": -5.940462112426758,
"step": 245
},
{
"epoch": 0.5277044854881267,
"grad_norm": 15.896790069729533,
"learning_rate": 2.693844782258779e-07,
"logits/chosen": -2.030596971511841,
"logits/rejected": -1.9922313690185547,
"logps/chosen": -571.3850708007812,
"logps/rejected": -776.018310546875,
"loss": 0.3852,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -3.086763620376587,
"rewards/margins": 1.9301198720932007,
"rewards/rejected": -5.016883850097656,
"step": 250
},
{
"epoch": 0.5382585751978892,
"grad_norm": 40.59897974633979,
"learning_rate": 2.601611764531342e-07,
"logits/chosen": -2.153049945831299,
"logits/rejected": -2.1268014907836914,
"logps/chosen": -659.8489990234375,
"logps/rejected": -876.6301879882812,
"loss": 0.4062,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -4.000391483306885,
"rewards/margins": 2.1393191814422607,
"rewards/rejected": -6.139710426330566,
"step": 255
},
{
"epoch": 0.5488126649076517,
"grad_norm": 16.71817267029077,
"learning_rate": 2.5092399573560323e-07,
"logits/chosen": -2.236642599105835,
"logits/rejected": -2.24824857711792,
"logps/chosen": -675.7197265625,
"logps/rejected": -906.6882934570312,
"loss": 0.4331,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.17581844329834,
"rewards/margins": 2.249803304672241,
"rewards/rejected": -6.42562198638916,
"step": 260
},
{
"epoch": 0.5593667546174143,
"grad_norm": 21.83948507996357,
"learning_rate": 2.4168555295104124e-07,
"logits/chosen": -2.185378313064575,
"logits/rejected": -2.1056790351867676,
"logps/chosen": -593.40283203125,
"logps/rejected": -796.1773071289062,
"loss": 0.4083,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -3.3149795532226562,
"rewards/margins": 1.8634592294692993,
"rewards/rejected": -5.178439140319824,
"step": 265
},
{
"epoch": 0.5699208443271768,
"grad_norm": 20.351730101984266,
"learning_rate": 2.3245846670103626e-07,
"logits/chosen": -2.268347978591919,
"logits/rejected": -2.2143301963806152,
"logps/chosen": -588.11474609375,
"logps/rejected": -783.8377075195312,
"loss": 0.3935,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -3.3238461017608643,
"rewards/margins": 1.8138678073883057,
"rewards/rejected": -5.13771390914917,
"step": 270
},
{
"epoch": 0.5804749340369393,
"grad_norm": 18.56747674948443,
"learning_rate": 2.232553400755159e-07,
"logits/chosen": -2.4159321784973145,
"logits/rejected": -2.3257503509521484,
"logps/chosen": -631.1921997070312,
"logps/rejected": -876.8099365234375,
"loss": 0.3663,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.7254486083984375,
"rewards/margins": 2.293728828430176,
"rewards/rejected": -6.0191779136657715,
"step": 275
},
{
"epoch": 0.5910290237467019,
"grad_norm": 21.05078294350066,
"learning_rate": 2.1408874343844294e-07,
"logits/chosen": -2.3609871864318848,
"logits/rejected": -2.229645013809204,
"logps/chosen": -681.2824096679688,
"logps/rejected": -997.8416748046875,
"loss": 0.3917,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.151437282562256,
"rewards/margins": 2.831943988800049,
"rewards/rejected": -6.9833807945251465,
"step": 280
},
{
"epoch": 0.6015831134564644,
"grad_norm": 17.819286464723362,
"learning_rate": 2.049711972582101e-07,
"logits/chosen": -2.2669837474823,
"logits/rejected": -2.1804003715515137,
"logps/chosen": -674.4667358398438,
"logps/rejected": -925.66650390625,
"loss": 0.3574,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -4.095311641693115,
"rewards/margins": 2.3263769149780273,
"rewards/rejected": -6.421689033508301,
"step": 285
},
{
"epoch": 0.6121372031662269,
"grad_norm": 16.393917654235082,
"learning_rate": 1.9591515500618588e-07,
"logits/chosen": -2.3980906009674072,
"logits/rejected": -2.307847261428833,
"logps/chosen": -668.3873901367188,
"logps/rejected": -880.75146484375,
"loss": 0.4484,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -4.07429313659668,
"rewards/margins": 1.9223320484161377,
"rewards/rejected": -5.9966254234313965,
"step": 290
},
{
"epoch": 0.6226912928759895,
"grad_norm": 17.607056207364927,
"learning_rate": 1.8693298614677112e-07,
"logits/chosen": -2.1555488109588623,
"logits/rejected": -2.051828145980835,
"logps/chosen": -596.3387451171875,
"logps/rejected": -825.14892578125,
"loss": 0.3679,
"rewards/accuracies": 0.875,
"rewards/chosen": -3.2922072410583496,
"rewards/margins": 2.1459250450134277,
"rewards/rejected": -5.438132286071777,
"step": 295
},
{
"epoch": 0.633245382585752,
"grad_norm": 18.598122517039727,
"learning_rate": 1.7803695924219814e-07,
"logits/chosen": -2.2622170448303223,
"logits/rejected": -2.1897120475769043,
"logps/chosen": -639.8846435546875,
"logps/rejected": -850.0399169921875,
"loss": 0.4031,
"rewards/accuracies": 0.84375,
"rewards/chosen": -3.6260199546813965,
"rewards/margins": 2.0194387435913086,
"rewards/rejected": -5.645459175109863,
"step": 300
},
{
"epoch": 0.6437994722955145,
"grad_norm": 16.59129232266985,
"learning_rate": 1.6923922519515067e-07,
"logits/chosen": -2.2015440464019775,
"logits/rejected": -2.129885196685791,
"logps/chosen": -558.0819091796875,
"logps/rejected": -752.4927368164062,
"loss": 0.4095,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.0600318908691406,
"rewards/margins": 1.7260305881500244,
"rewards/rejected": -4.786062240600586,
"step": 305
},
{
"epoch": 0.6543535620052771,
"grad_norm": 18.44006124052621,
"learning_rate": 1.605518006520924e-07,
"logits/chosen": -2.301358461380005,
"logits/rejected": -2.184253215789795,
"logps/chosen": -583.1818237304688,
"logps/rejected": -801.277099609375,
"loss": 0.3928,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.176647663116455,
"rewards/margins": 2.0725767612457275,
"rewards/rejected": -5.249224662780762,
"step": 310
},
{
"epoch": 0.6649076517150396,
"grad_norm": 43.00212859415373,
"learning_rate": 1.519865515899731e-07,
"logits/chosen": -2.302088975906372,
"logits/rejected": -2.1100873947143555,
"logps/chosen": -601.4708251953125,
"logps/rejected": -821.9664916992188,
"loss": 0.3886,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.5313808917999268,
"rewards/margins": 1.985640287399292,
"rewards/rejected": -5.517021179199219,
"step": 315
},
{
"epoch": 0.6754617414248021,
"grad_norm": 19.308206012998415,
"learning_rate": 1.4355517710873182e-07,
"logits/chosen": -2.306898593902588,
"logits/rejected": -2.2703452110290527,
"logps/chosen": -637.2567138671875,
"logps/rejected": -900.8046875,
"loss": 0.3968,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.712189197540283,
"rewards/margins": 2.495907783508301,
"rewards/rejected": -6.208096981048584,
"step": 320
},
{
"epoch": 0.6860158311345647,
"grad_norm": 24.048438044667563,
"learning_rate": 1.3526919345173318e-07,
"logits/chosen": -2.2532455921173096,
"logits/rejected": -2.1278910636901855,
"logps/chosen": -607.2129516601562,
"logps/rejected": -847.1838989257812,
"loss": 0.4058,
"rewards/accuracies": 0.84375,
"rewards/chosen": -3.399864912033081,
"rewards/margins": 2.236896514892578,
"rewards/rejected": -5.636761665344238,
"step": 325
},
{
"epoch": 0.6965699208443272,
"grad_norm": 27.608112815101293,
"learning_rate": 1.2713991827596443e-07,
"logits/chosen": -2.233346939086914,
"logits/rejected": -2.2035372257232666,
"logps/chosen": -589.2955322265625,
"logps/rejected": -793.0179443359375,
"loss": 0.3905,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.347853899002075,
"rewards/margins": 1.9631192684173584,
"rewards/rejected": -5.31097412109375,
"step": 330
},
{
"epoch": 0.7071240105540897,
"grad_norm": 24.14881546063451,
"learning_rate": 1.191784551934773e-07,
"logits/chosen": -2.3385255336761475,
"logits/rejected": -2.322145462036133,
"logps/chosen": -588.7033081054688,
"logps/rejected": -806.0431518554688,
"loss": 0.4061,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.393342971801758,
"rewards/margins": 2.0734634399414062,
"rewards/rejected": -5.466806888580322,
"step": 335
},
{
"epoch": 0.7176781002638523,
"grad_norm": 19.815903375155614,
"learning_rate": 1.1139567860518953e-07,
"logits/chosen": -2.0588958263397217,
"logits/rejected": -1.979034423828125,
"logps/chosen": -593.4244995117188,
"logps/rejected": -787.120361328125,
"loss": 0.4265,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.2387542724609375,
"rewards/margins": 1.854077696800232,
"rewards/rejected": -5.092832088470459,
"step": 340
},
{
"epoch": 0.7282321899736148,
"grad_norm": 20.071301052736302,
"learning_rate": 1.0380221884776128e-07,
"logits/chosen": -2.067850112915039,
"logits/rejected": -2.048149824142456,
"logps/chosen": -560.5596923828125,
"logps/rejected": -704.077880859375,
"loss": 0.4373,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.912015438079834,
"rewards/margins": 1.399601936340332,
"rewards/rejected": -4.311617374420166,
"step": 345
},
{
"epoch": 0.7387862796833773,
"grad_norm": 18.162804393534355,
"learning_rate": 9.640844767383405e-08,
"logits/chosen": -2.1664066314697266,
"logits/rejected": -2.08605694770813,
"logps/chosen": -543.7811279296875,
"logps/rejected": -715.3802490234375,
"loss": 0.4225,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.847567319869995,
"rewards/margins": 1.6358835697174072,
"rewards/rejected": -4.4834513664245605,
"step": 350
},
{
"epoch": 0.7493403693931399,
"grad_norm": 22.0345662189636,
"learning_rate": 8.922446408546378e-08,
"logits/chosen": -2.125089168548584,
"logits/rejected": -2.0595450401306152,
"logps/chosen": -593.4921875,
"logps/rejected": -794.3736572265625,
"loss": 0.4491,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.1958818435668945,
"rewards/margins": 1.8830915689468384,
"rewards/rejected": -5.078973293304443,
"step": 355
},
{
"epoch": 0.7598944591029023,
"grad_norm": 37.283205456222554,
"learning_rate": 8.22600805400994e-08,
"logits/chosen": -2.119860887527466,
"logits/rejected": -2.025869846343994,
"logps/chosen": -572.8893432617188,
"logps/rejected": -800.1495361328125,
"loss": 0.3879,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -3.0747852325439453,
"rewards/margins": 2.0609545707702637,
"rewards/rejected": -5.135739326477051,
"step": 360
},
{
"epoch": 0.7704485488126649,
"grad_norm": 23.893898212231402,
"learning_rate": 7.552480954794558e-08,
"logits/chosen": -2.0981643199920654,
"logits/rejected": -2.010963201522827,
"logps/chosen": -598.5560302734375,
"logps/rejected": -791.58349609375,
"loss": 0.4217,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.4471168518066406,
"rewards/margins": 1.8089519739151,
"rewards/rejected": -5.256069183349609,
"step": 365
},
{
"epoch": 0.7810026385224275,
"grad_norm": 27.916098925400245,
"learning_rate": 6.902785067901854e-08,
"logits/chosen": -2.1697256565093994,
"logits/rejected": -2.015242099761963,
"logps/chosen": -603.3410034179688,
"logps/rejected": -844.3304443359375,
"loss": 0.3863,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.3369498252868652,
"rewards/margins": 2.220611810684204,
"rewards/rejected": -5.557561874389648,
"step": 370
},
{
"epoch": 0.7915567282321899,
"grad_norm": 27.790853080729732,
"learning_rate": 6.277807799763973e-08,
"logits/chosen": -2.1927974224090576,
"logits/rejected": -2.077242136001587,
"logps/chosen": -604.877685546875,
"logps/rejected": -836.9320068359375,
"loss": 0.4036,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.460221767425537,
"rewards/margins": 2.2005088329315186,
"rewards/rejected": -5.660730361938477,
"step": 375
},
{
"epoch": 0.8021108179419525,
"grad_norm": 26.505958464528764,
"learning_rate": 5.678402794153145e-08,
"logits/chosen": -2.265552282333374,
"logits/rejected": -2.2187042236328125,
"logps/chosen": -644.1717529296875,
"logps/rejected": -856.8342895507812,
"loss": 0.4045,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.8418126106262207,
"rewards/margins": 2.0028209686279297,
"rewards/rejected": -5.844632625579834,
"step": 380
},
{
"epoch": 0.8126649076517151,
"grad_norm": 28.453921770012606,
"learning_rate": 5.105388766206969e-08,
"logits/chosen": -2.355292797088623,
"logits/rejected": -2.2420361042022705,
"logps/chosen": -691.5671997070312,
"logps/rejected": -934.6068115234375,
"loss": 0.443,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -4.222853660583496,
"rewards/margins": 2.315927028656006,
"rewards/rejected": -6.538781642913818,
"step": 385
},
{
"epoch": 0.8232189973614775,
"grad_norm": 21.35738866439425,
"learning_rate": 4.5595483841620484e-08,
"logits/chosen": -2.1776041984558105,
"logits/rejected": -2.1361899375915527,
"logps/chosen": -658.7529907226562,
"logps/rejected": -870.3590087890625,
"loss": 0.378,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.8739631175994873,
"rewards/margins": 2.0037648677825928,
"rewards/rejected": -5.877728462219238,
"step": 390
},
{
"epoch": 0.8337730870712401,
"grad_norm": 25.73671420821126,
"learning_rate": 4.0416272003232526e-08,
"logits/chosen": -2.1355865001678467,
"logits/rejected": -2.0880231857299805,
"logps/chosen": -632.5217895507812,
"logps/rejected": -861.2312622070312,
"loss": 0.44,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -3.6603481769561768,
"rewards/margins": 2.233140230178833,
"rewards/rejected": -5.893488883972168,
"step": 395
},
{
"epoch": 0.8443271767810027,
"grad_norm": 24.57511432896418,
"learning_rate": 3.552332632729041e-08,
"logits/chosen": -2.040531873703003,
"logits/rejected": -2.077538251876831,
"logps/chosen": -628.4180908203125,
"logps/rejected": -797.4384765625,
"loss": 0.4222,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.6359188556671143,
"rewards/margins": 1.6638940572738647,
"rewards/rejected": -5.299813270568848,
"step": 400
},
{
"epoch": 0.8443271767810027,
"eval_logits/chosen": -2.7593374252319336,
"eval_logits/rejected": -2.6865265369415283,
"eval_logps/chosen": -595.5198974609375,
"eval_logps/rejected": -786.4964599609375,
"eval_loss": 0.39839133620262146,
"eval_rewards/accuracies": 0.8286290168762207,
"eval_rewards/chosen": -3.3262782096862793,
"eval_rewards/margins": 1.799713134765625,
"eval_rewards/rejected": -5.125991344451904,
"eval_runtime": 315.3526,
"eval_samples_per_second": 6.266,
"eval_steps_per_second": 0.393,
"step": 400
},
{
"epoch": 0.8548812664907651,
"grad_norm": 20.4806371393051,
"learning_rate": 3.092332998903416e-08,
"logits/chosen": -2.1178812980651855,
"logits/rejected": -2.0984854698181152,
"logps/chosen": -637.4102783203125,
"logps/rejected": -846.5029296875,
"loss": 0.3953,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -3.625606060028076,
"rewards/margins": 2.061870574951172,
"rewards/rejected": -5.687476634979248,
"step": 405
},
{
"epoch": 0.8654353562005277,
"grad_norm": 19.547579178485496,
"learning_rate": 2.6622566030146455e-08,
"logits/chosen": -2.1973793506622314,
"logits/rejected": -2.171604633331299,
"logps/chosen": -557.0053100585938,
"logps/rejected": -746.3121337890625,
"loss": 0.4256,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.149854898452759,
"rewards/margins": 1.7529436349868774,
"rewards/rejected": -4.902798652648926,
"step": 410
},
{
"epoch": 0.8759894459102903,
"grad_norm": 19.129911424402337,
"learning_rate": 2.26269087768734e-08,
"logits/chosen": -2.1681036949157715,
"logits/rejected": -2.006333589553833,
"logps/chosen": -610.58837890625,
"logps/rejected": -869.3065185546875,
"loss": 0.3987,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.558450698852539,
"rewards/margins": 2.386261463165283,
"rewards/rejected": -5.9447126388549805,
"step": 415
},
{
"epoch": 0.8865435356200527,
"grad_norm": 24.716365813368494,
"learning_rate": 1.894181581640106e-08,
"logits/chosen": -2.2324867248535156,
"logits/rejected": -2.2453224658966064,
"logps/chosen": -601.86083984375,
"logps/rejected": -790.0075073242188,
"loss": 0.3941,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.448594331741333,
"rewards/margins": 1.7875158786773682,
"rewards/rejected": -5.236110210418701,
"step": 420
},
{
"epoch": 0.8970976253298153,
"grad_norm": 32.13959851586395,
"learning_rate": 1.5572320542448143e-08,
"logits/chosen": -2.2512707710266113,
"logits/rejected": -2.20418119430542,
"logps/chosen": -625.6372680664062,
"logps/rejected": -828.36083984375,
"loss": 0.4037,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -3.624408006668091,
"rewards/margins": 1.9519973993301392,
"rewards/rejected": -5.576405048370361,
"step": 425
},
{
"epoch": 0.9076517150395779,
"grad_norm": 21.087098841456804,
"learning_rate": 1.2523025280255729e-08,
"logits/chosen": -2.314072847366333,
"logits/rejected": -2.28322434425354,
"logps/chosen": -619.37060546875,
"logps/rejected": -859.1106567382812,
"loss": 0.3474,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.540250062942505,
"rewards/margins": 2.301335334777832,
"rewards/rejected": -5.841585159301758,
"step": 430
},
{
"epoch": 0.9182058047493403,
"grad_norm": 18.102509884061345,
"learning_rate": 9.798095000364214e-09,
"logits/chosen": -2.378577470779419,
"logits/rejected": -2.214040994644165,
"logps/chosen": -613.8382568359375,
"logps/rejected": -870.4904174804688,
"loss": 0.3723,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.5427989959716797,
"rewards/margins": 2.341104030609131,
"rewards/rejected": -5.8839030265808105,
"step": 435
},
{
"epoch": 0.9287598944591029,
"grad_norm": 24.09594523964464,
"learning_rate": 7.401251629764876e-09,
"logits/chosen": -2.230398416519165,
"logits/rejected": -2.044609308242798,
"logps/chosen": -635.7887573242188,
"logps/rejected": -865.6220703125,
"loss": 0.4132,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.677701473236084,
"rewards/margins": 2.0847156047821045,
"rewards/rejected": -5.762416839599609,
"step": 440
},
{
"epoch": 0.9393139841688655,
"grad_norm": 18.69976567383702,
"learning_rate": 5.335768968195098e-09,
"logits/chosen": -2.1324424743652344,
"logits/rejected": -2.0235095024108887,
"logps/chosen": -618.6690673828125,
"logps/rejected": -826.8605346679688,
"loss": 0.4125,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.5293784141540527,
"rewards/margins": 1.9530022144317627,
"rewards/rejected": -5.4823808670043945,
"step": 445
},
{
"epoch": 0.9498680738786279,
"grad_norm": 15.92889127250539,
"learning_rate": 3.604468216521883e-09,
"logits/chosen": -2.2540245056152344,
"logits/rejected": -2.232203960418701,
"logps/chosen": -600.1151123046875,
"logps/rejected": -796.59423828125,
"loss": 0.3844,
"rewards/accuracies": 0.84375,
"rewards/chosen": -3.402463912963867,
"rewards/margins": 1.903550148010254,
"rewards/rejected": -5.306014060974121,
"step": 450
},
{
"epoch": 0.9604221635883905,
"grad_norm": 18.753569800561838,
"learning_rate": 2.2097141233206884e-09,
"logits/chosen": -2.1656556129455566,
"logits/rejected": -2.1333932876586914,
"logps/chosen": -624.7294921875,
"logps/rejected": -828.1585693359375,
"loss": 0.3908,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -3.7328314781188965,
"rewards/margins": 1.9572765827178955,
"rewards/rejected": -5.690107345581055,
"step": 455
},
{
"epoch": 0.9709762532981531,
"grad_norm": 19.85121890931105,
"learning_rate": 1.1534117549133472e-09,
"logits/chosen": -2.364999294281006,
"logits/rejected": -2.1894242763519287,
"logps/chosen": -624.747802734375,
"logps/rejected": -858.8040161132812,
"loss": 0.3658,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.7442946434020996,
"rewards/margins": 2.1538589000701904,
"rewards/rejected": -5.898154258728027,
"step": 460
},
{
"epoch": 0.9815303430079155,
"grad_norm": 39.10841866963654,
"learning_rate": 4.3700389327672173e-10,
"logits/chosen": -2.2868332862854004,
"logits/rejected": -2.1618874073028564,
"logps/chosen": -634.08447265625,
"logps/rejected": -845.2247924804688,
"loss": 0.3908,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.793625593185425,
"rewards/margins": 1.9014127254486084,
"rewards/rejected": -5.695038318634033,
"step": 465
},
{
"epoch": 0.9920844327176781,
"grad_norm": 18.498519136680624,
"learning_rate": 6.146906537587982e-11,
"logits/chosen": -2.2575690746307373,
"logits/rejected": -2.1273903846740723,
"logps/chosen": -600.2813720703125,
"logps/rejected": -810.6456298828125,
"loss": 0.396,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.4551138877868652,
"rewards/margins": 1.9376386404037476,
"rewards/rejected": -5.392752647399902,
"step": 470
},
{
"epoch": 0.9984168865435357,
"step": 473,
"total_flos": 0.0,
"train_loss": 0.466365703316622,
"train_runtime": 19524.7969,
"train_samples_per_second": 3.105,
"train_steps_per_second": 0.024
}
],
"logging_steps": 5,
"max_steps": 473,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}