zephyr-7b-ipo-0k-15k-i1 / trainer_state.json
BraylonDash's picture
Model save
eec7ad9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 4e-08,
"logits/chosen": -2.683027744293213,
"logits/rejected": -2.0717973709106445,
"logps/chosen": -497.5299987792969,
"logps/rejected": -340.85333251953125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 4.0000000000000003e-07,
"logits/chosen": -2.517321825027466,
"logits/rejected": -2.1676418781280518,
"logps/chosen": -288.0818176269531,
"logps/rejected": -199.1251678466797,
"loss": 0.6932,
"rewards/accuracies": 0.3611111044883728,
"rewards/chosen": 0.00022377756249625236,
"rewards/margins": 0.00016948273696471006,
"rewards/rejected": 5.429480006569065e-05,
"step": 10
},
{
"epoch": 0.02,
"learning_rate": 8.000000000000001e-07,
"logits/chosen": -2.39406156539917,
"logits/rejected": -2.1605257987976074,
"logps/chosen": -271.68157958984375,
"logps/rejected": -219.1865234375,
"loss": 0.6934,
"rewards/accuracies": 0.3499999940395355,
"rewards/chosen": 0.0002538819098845124,
"rewards/margins": -0.0007037109462544322,
"rewards/rejected": 0.0009575928561389446,
"step": 20
},
{
"epoch": 0.02,
"learning_rate": 1.2000000000000002e-06,
"logits/chosen": -2.306056261062622,
"logits/rejected": -2.278916358947754,
"logps/chosen": -270.09515380859375,
"logps/rejected": -301.93194580078125,
"loss": 0.6926,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0033609136007726192,
"rewards/margins": 0.0015898284036666155,
"rewards/rejected": 0.0017710853135213256,
"step": 30
},
{
"epoch": 0.03,
"learning_rate": 1.6000000000000001e-06,
"logits/chosen": -2.5502350330352783,
"logits/rejected": -2.383606433868408,
"logps/chosen": -211.55270385742188,
"logps/rejected": -190.15623474121094,
"loss": 0.6919,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.005155195482075214,
"rewards/margins": 0.0021972700487822294,
"rewards/rejected": 0.002957924734801054,
"step": 40
},
{
"epoch": 0.04,
"learning_rate": 2.0000000000000003e-06,
"logits/chosen": -2.3993449211120605,
"logits/rejected": -2.355790615081787,
"logps/chosen": -196.9150390625,
"logps/rejected": -221.62014770507812,
"loss": 0.69,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.008768909610807896,
"rewards/margins": 0.005697342567145824,
"rewards/rejected": 0.0030715656466782093,
"step": 50
},
{
"epoch": 0.05,
"learning_rate": 2.4000000000000003e-06,
"logits/chosen": -2.525311231613159,
"logits/rejected": -2.3309919834136963,
"logps/chosen": -243.81521606445312,
"logps/rejected": -289.21868896484375,
"loss": 0.6895,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.012168792076408863,
"rewards/margins": 0.00664276909083128,
"rewards/rejected": 0.005526022985577583,
"step": 60
},
{
"epoch": 0.06,
"learning_rate": 2.8000000000000003e-06,
"logits/chosen": -2.2812366485595703,
"logits/rejected": -2.306039810180664,
"logps/chosen": -225.4685516357422,
"logps/rejected": -229.1845703125,
"loss": 0.683,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.016351569443941116,
"rewards/margins": 0.022075170651078224,
"rewards/rejected": -0.0057236007414758205,
"step": 70
},
{
"epoch": 0.06,
"learning_rate": 3.2000000000000003e-06,
"logits/chosen": -2.44425892829895,
"logits/rejected": -2.432558536529541,
"logps/chosen": -261.52703857421875,
"logps/rejected": -270.8040466308594,
"loss": 0.6798,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.03062610700726509,
"rewards/margins": 0.02709539607167244,
"rewards/rejected": 0.003530709771439433,
"step": 80
},
{
"epoch": 0.07,
"learning_rate": 3.6000000000000003e-06,
"logits/chosen": -2.3680663108825684,
"logits/rejected": -2.022505283355713,
"logps/chosen": -264.2672424316406,
"logps/rejected": -186.8569793701172,
"loss": 0.6624,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.023214900866150856,
"rewards/margins": 0.05782170966267586,
"rewards/rejected": -0.03460680693387985,
"step": 90
},
{
"epoch": 0.08,
"learning_rate": 4.000000000000001e-06,
"logits/chosen": -2.32338547706604,
"logits/rejected": -2.332152843475342,
"logps/chosen": -283.2625732421875,
"logps/rejected": -274.24365234375,
"loss": 0.6627,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.10683544725179672,
"rewards/margins": 0.053194332867860794,
"rewards/rejected": -0.160029798746109,
"step": 100
},
{
"epoch": 0.09,
"learning_rate": 4.4e-06,
"logits/chosen": -2.220726490020752,
"logits/rejected": -2.0992746353149414,
"logps/chosen": -226.12930297851562,
"logps/rejected": -215.71804809570312,
"loss": 0.6366,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.13904300332069397,
"rewards/margins": 0.1515841782093048,
"rewards/rejected": -0.2906271815299988,
"step": 110
},
{
"epoch": 0.1,
"learning_rate": 4.800000000000001e-06,
"logits/chosen": -2.012620449066162,
"logits/rejected": -2.0512988567352295,
"logps/chosen": -296.2652893066406,
"logps/rejected": -368.3299560546875,
"loss": 0.5823,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5935014486312866,
"rewards/margins": 0.31774038076400757,
"rewards/rejected": -0.9112418293952942,
"step": 120
},
{
"epoch": 0.1,
"learning_rate": 4.999756310023261e-06,
"logits/chosen": -2.3318862915039062,
"logits/rejected": -2.2144980430603027,
"logps/chosen": -300.13494873046875,
"logps/rejected": -285.32867431640625,
"loss": 0.5917,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.6550928950309753,
"rewards/margins": 0.21246926486492157,
"rewards/rejected": -0.8675621151924133,
"step": 130
},
{
"epoch": 0.11,
"learning_rate": 4.997807075247147e-06,
"logits/chosen": -2.1210927963256836,
"logits/rejected": -1.8453128337860107,
"logps/chosen": -247.13992309570312,
"logps/rejected": -273.41650390625,
"loss": 0.5644,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.5666564702987671,
"rewards/margins": 0.40540236234664917,
"rewards/rejected": -0.972058892250061,
"step": 140
},
{
"epoch": 0.12,
"learning_rate": 4.993910125649561e-06,
"logits/chosen": -2.1635901927948,
"logits/rejected": -2.0486695766448975,
"logps/chosen": -275.05364990234375,
"logps/rejected": -299.96148681640625,
"loss": 0.5361,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.45165014266967773,
"rewards/margins": 0.5924302935600281,
"rewards/rejected": -1.0440804958343506,
"step": 150
},
{
"epoch": 0.13,
"learning_rate": 4.988068499954578e-06,
"logits/chosen": -2.0027568340301514,
"logits/rejected": -2.079007148742676,
"logps/chosen": -488.09222412109375,
"logps/rejected": -544.9346923828125,
"loss": 0.5923,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.358773946762085,
"rewards/margins": 0.36690616607666016,
"rewards/rejected": -2.725680112838745,
"step": 160
},
{
"epoch": 0.14,
"learning_rate": 4.980286753286196e-06,
"logits/chosen": -2.197312831878662,
"logits/rejected": -1.8223193883895874,
"logps/chosen": -473.13067626953125,
"logps/rejected": -477.058837890625,
"loss": 0.5539,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.0573782920837402,
"rewards/margins": 0.36220186948776245,
"rewards/rejected": -2.4195804595947266,
"step": 170
},
{
"epoch": 0.14,
"learning_rate": 4.970570953616383e-06,
"logits/chosen": -2.0688188076019287,
"logits/rejected": -2.0571534633636475,
"logps/chosen": -367.75189208984375,
"logps/rejected": -408.49432373046875,
"loss": 0.5837,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -1.4084174633026123,
"rewards/margins": 0.27590009570121765,
"rewards/rejected": -1.6843173503875732,
"step": 180
},
{
"epoch": 0.15,
"learning_rate": 4.958928677033465e-06,
"logits/chosen": -1.9835456609725952,
"logits/rejected": -1.9227497577667236,
"logps/chosen": -397.43719482421875,
"logps/rejected": -443.9778747558594,
"loss": 0.4748,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.6137508153915405,
"rewards/margins": 0.6342134475708008,
"rewards/rejected": -2.247964382171631,
"step": 190
},
{
"epoch": 0.16,
"learning_rate": 4.9453690018345144e-06,
"logits/chosen": -1.8892637491226196,
"logits/rejected": -1.7664573192596436,
"logps/chosen": -440.25885009765625,
"logps/rejected": -551.2183837890625,
"loss": 0.4752,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.8693583011627197,
"rewards/margins": 0.8833104968070984,
"rewards/rejected": -2.752668857574463,
"step": 200
},
{
"epoch": 0.17,
"learning_rate": 4.9299025014463665e-06,
"logits/chosen": -2.1745524406433105,
"logits/rejected": -1.8148044347763062,
"logps/chosen": -423.63128662109375,
"logps/rejected": -463.9573669433594,
"loss": 0.4876,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4858825206756592,
"rewards/margins": 1.151026964187622,
"rewards/rejected": -2.6369097232818604,
"step": 210
},
{
"epoch": 0.18,
"learning_rate": 4.912541236180779e-06,
"logits/chosen": -2.0313258171081543,
"logits/rejected": -1.6316728591918945,
"logps/chosen": -486.4668884277344,
"logps/rejected": -533.1800537109375,
"loss": 0.547,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.4689016342163086,
"rewards/margins": 0.798893928527832,
"rewards/rejected": -3.2677950859069824,
"step": 220
},
{
"epoch": 0.18,
"learning_rate": 4.893298743830168e-06,
"logits/chosen": -2.042020320892334,
"logits/rejected": -2.0924315452575684,
"logps/chosen": -414.12152099609375,
"logps/rejected": -496.2425231933594,
"loss": 0.5683,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.165775775909424,
"rewards/margins": 0.4269164502620697,
"rewards/rejected": -2.5926921367645264,
"step": 230
},
{
"epoch": 0.19,
"learning_rate": 4.8721900291112415e-06,
"logits/chosen": -1.9427902698516846,
"logits/rejected": -1.7720750570297241,
"logps/chosen": -426.7705078125,
"logps/rejected": -476.8197326660156,
"loss": 0.5076,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.199129104614258,
"rewards/margins": 0.538731575012207,
"rewards/rejected": -2.7378602027893066,
"step": 240
},
{
"epoch": 0.2,
"learning_rate": 4.849231551964771e-06,
"logits/chosen": -1.916007399559021,
"logits/rejected": -1.787649154663086,
"logps/chosen": -392.2034912109375,
"logps/rejected": -512.083984375,
"loss": 0.4932,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.2146475315093994,
"rewards/margins": 1.0699217319488525,
"rewards/rejected": -3.2845687866210938,
"step": 250
},
{
"epoch": 0.21,
"learning_rate": 4.824441214720629e-06,
"logits/chosen": -1.9078289270401,
"logits/rejected": -1.676805853843689,
"logps/chosen": -607.4827270507812,
"logps/rejected": -694.8690185546875,
"loss": 0.4684,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.3061842918395996,
"rewards/margins": 0.9638306498527527,
"rewards/rejected": -4.270014762878418,
"step": 260
},
{
"epoch": 0.22,
"learning_rate": 4.7978383481380865e-06,
"logits/chosen": -1.841152548789978,
"logits/rejected": -1.639878511428833,
"logps/chosen": -522.9131469726562,
"logps/rejected": -613.7696533203125,
"loss": 0.4322,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.718308448791504,
"rewards/margins": 1.2125293016433716,
"rewards/rejected": -3.930838108062744,
"step": 270
},
{
"epoch": 0.22,
"learning_rate": 4.769443696332272e-06,
"logits/chosen": -1.622367262840271,
"logits/rejected": -1.6904557943344116,
"logps/chosen": -468.45947265625,
"logps/rejected": -664.0346069335938,
"loss": 0.4407,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -3.01562762260437,
"rewards/margins": 1.5072427988052368,
"rewards/rejected": -4.5228705406188965,
"step": 280
},
{
"epoch": 0.23,
"learning_rate": 4.7392794005985324e-06,
"logits/chosen": -2.0655009746551514,
"logits/rejected": -1.782928466796875,
"logps/chosen": -439.0694274902344,
"logps/rejected": -528.5975341796875,
"loss": 0.4046,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6179521083831787,
"rewards/margins": 1.3643434047698975,
"rewards/rejected": -2.982295513153076,
"step": 290
},
{
"epoch": 0.24,
"learning_rate": 4.707368982147318e-06,
"logits/chosen": -1.8944809436798096,
"logits/rejected": -1.8623746633529663,
"logps/chosen": -409.47662353515625,
"logps/rejected": -446.8701171875,
"loss": 0.4616,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1979713439941406,
"rewards/margins": 1.0424644947052002,
"rewards/rejected": -2.2404356002807617,
"step": 300
},
{
"epoch": 0.25,
"learning_rate": 4.673737323763048e-06,
"logits/chosen": -1.6961355209350586,
"logits/rejected": -1.738201379776001,
"logps/chosen": -395.11370849609375,
"logps/rejected": -542.3433837890625,
"loss": 0.5152,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.0377097129821777,
"rewards/margins": 0.9327165484428406,
"rewards/rejected": -2.970426082611084,
"step": 310
},
{
"epoch": 0.26,
"learning_rate": 4.638410650401267e-06,
"logits/chosen": -1.6403663158416748,
"logits/rejected": -1.3353043794631958,
"logps/chosen": -535.1236572265625,
"logps/rejected": -570.8885498046875,
"loss": 0.4839,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.7877275943756104,
"rewards/margins": 1.0707926750183105,
"rewards/rejected": -3.8585205078125,
"step": 320
},
{
"epoch": 0.26,
"learning_rate": 4.601416508739211e-06,
"logits/chosen": -1.6940498352050781,
"logits/rejected": -1.7366819381713867,
"logps/chosen": -538.8651733398438,
"logps/rejected": -588.4771118164062,
"loss": 0.46,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.6663858890533447,
"rewards/margins": 0.7428363561630249,
"rewards/rejected": -3.40922212600708,
"step": 330
},
{
"epoch": 0.27,
"learning_rate": 4.562783745695738e-06,
"logits/chosen": -1.7110121250152588,
"logits/rejected": -1.4488928318023682,
"logps/chosen": -372.3664855957031,
"logps/rejected": -503.14288330078125,
"loss": 0.4308,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.781057596206665,
"rewards/margins": 1.5787122249603271,
"rewards/rejected": -3.359769821166992,
"step": 340
},
{
"epoch": 0.28,
"learning_rate": 4.522542485937369e-06,
"logits/chosen": -1.8226385116577148,
"logits/rejected": -1.5300580263137817,
"logps/chosen": -463.80889892578125,
"logps/rejected": -537.3763427734375,
"loss": 0.4554,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.0642762184143066,
"rewards/margins": 0.9215444326400757,
"rewards/rejected": -2.985820770263672,
"step": 350
},
{
"epoch": 0.29,
"learning_rate": 4.4807241083879774e-06,
"logits/chosen": -1.781518578529358,
"logits/rejected": -1.5981372594833374,
"logps/chosen": -506.3741760253906,
"logps/rejected": -607.2337646484375,
"loss": 0.4323,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.474335193634033,
"rewards/margins": 1.3016973733901978,
"rewards/rejected": -3.7760322093963623,
"step": 360
},
{
"epoch": 0.3,
"learning_rate": 4.437361221760449e-06,
"logits/chosen": -1.6915562152862549,
"logits/rejected": -1.6212940216064453,
"logps/chosen": -330.9052429199219,
"logps/rejected": -441.5316467285156,
"loss": 0.4468,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4203054904937744,
"rewards/margins": 1.071720838546753,
"rewards/rejected": -2.4920265674591064,
"step": 370
},
{
"epoch": 0.3,
"learning_rate": 4.3924876391293915e-06,
"logits/chosen": -1.5807178020477295,
"logits/rejected": -1.541355013847351,
"logps/chosen": -340.5176086425781,
"logps/rejected": -440.81634521484375,
"loss": 0.4907,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.505903959274292,
"rewards/margins": 0.8253719210624695,
"rewards/rejected": -2.3312761783599854,
"step": 380
},
{
"epoch": 0.31,
"learning_rate": 4.346138351564711e-06,
"logits/chosen": -1.572176218032837,
"logits/rejected": -1.440059781074524,
"logps/chosen": -388.8272399902344,
"logps/rejected": -453.6480407714844,
"loss": 0.5046,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.9561408758163452,
"rewards/margins": 1.0182749032974243,
"rewards/rejected": -2.9744157791137695,
"step": 390
},
{
"epoch": 0.32,
"learning_rate": 4.2983495008466285e-06,
"logits/chosen": -1.6752668619155884,
"logits/rejected": -1.6999976634979248,
"logps/chosen": -423.57000732421875,
"logps/rejected": -530.806884765625,
"loss": 0.5645,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.094298839569092,
"rewards/margins": 0.8297155499458313,
"rewards/rejected": -2.9240143299102783,
"step": 400
},
{
"epoch": 0.33,
"learning_rate": 4.249158351283414e-06,
"logits/chosen": -1.8126693964004517,
"logits/rejected": -1.6003907918930054,
"logps/chosen": -494.4754943847656,
"logps/rejected": -602.4152221679688,
"loss": 0.5194,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.341548442840576,
"rewards/margins": 0.9126373529434204,
"rewards/rejected": -3.254185914993286,
"step": 410
},
{
"epoch": 0.34,
"learning_rate": 4.198603260653792e-06,
"logits/chosen": -1.9527698755264282,
"logits/rejected": -1.7327282428741455,
"logps/chosen": -508.19873046875,
"logps/rejected": -566.5889892578125,
"loss": 0.3991,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.3916351795196533,
"rewards/margins": 0.7339528799057007,
"rewards/rejected": -3.1255879402160645,
"step": 420
},
{
"epoch": 0.34,
"learning_rate": 4.146723650296701e-06,
"logits/chosen": -1.7782537937164307,
"logits/rejected": -1.8380506038665771,
"logps/chosen": -556.6307373046875,
"logps/rejected": -737.5545654296875,
"loss": 0.4901,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.557034492492676,
"rewards/margins": 1.55734121799469,
"rewards/rejected": -4.114375591278076,
"step": 430
},
{
"epoch": 0.35,
"learning_rate": 4.093559974371725e-06,
"logits/chosen": -1.7782948017120361,
"logits/rejected": -1.4533838033676147,
"logps/chosen": -427.7296447753906,
"logps/rejected": -507.8409729003906,
"loss": 0.3925,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.4289357662200928,
"rewards/margins": 1.1018160581588745,
"rewards/rejected": -3.530752182006836,
"step": 440
},
{
"epoch": 0.36,
"learning_rate": 4.039153688314146e-06,
"logits/chosen": -1.875288724899292,
"logits/rejected": -1.6939185857772827,
"logps/chosen": -444.589599609375,
"logps/rejected": -552.37353515625,
"loss": 0.5043,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.8052520751953125,
"rewards/margins": 1.3201160430908203,
"rewards/rejected": -3.125368118286133,
"step": 450
},
{
"epoch": 0.37,
"learning_rate": 3.983547216509254e-06,
"logits/chosen": -1.8725506067276,
"logits/rejected": -1.7871854305267334,
"logps/chosen": -455.09185791015625,
"logps/rejected": -519.0117797851562,
"loss": 0.4801,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.264061450958252,
"rewards/margins": 0.8552305102348328,
"rewards/rejected": -3.1192917823791504,
"step": 460
},
{
"epoch": 0.38,
"learning_rate": 3.92678391921108e-06,
"logits/chosen": -1.8820081949234009,
"logits/rejected": -1.7966502904891968,
"logps/chosen": -340.04119873046875,
"logps/rejected": -392.32769775390625,
"loss": 0.5122,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.7454599142074585,
"rewards/margins": 0.6181780099868774,
"rewards/rejected": -2.363638401031494,
"step": 470
},
{
"epoch": 0.38,
"learning_rate": 3.868908058731376e-06,
"logits/chosen": -1.760180115699768,
"logits/rejected": -1.6544349193572998,
"logps/chosen": -386.4324645996094,
"logps/rejected": -537.9710083007812,
"loss": 0.4543,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.187924861907959,
"rewards/margins": 1.2385300397872925,
"rewards/rejected": -3.426455020904541,
"step": 480
},
{
"epoch": 0.39,
"learning_rate": 3.8099647649251984e-06,
"logits/chosen": -1.8242204189300537,
"logits/rejected": -1.5543259382247925,
"logps/chosen": -496.9654235839844,
"logps/rejected": -564.4078979492188,
"loss": 0.4372,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.488828182220459,
"rewards/margins": 1.1287075281143188,
"rewards/rejected": -3.617535352706909,
"step": 490
},
{
"epoch": 0.4,
"learning_rate": 3.7500000000000005e-06,
"logits/chosen": -1.846986174583435,
"logits/rejected": -1.7779200077056885,
"logps/chosen": -481.5650329589844,
"logps/rejected": -603.2772216796875,
"loss": 0.4871,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.848694086074829,
"rewards/margins": 1.1198689937591553,
"rewards/rejected": -3.9685630798339844,
"step": 500
},
{
"epoch": 0.41,
"learning_rate": 3.689060522675689e-06,
"logits/chosen": -1.8344266414642334,
"logits/rejected": -1.6635892391204834,
"logps/chosen": -566.5777587890625,
"logps/rejected": -637.1046142578125,
"loss": 0.4406,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -3.230578899383545,
"rewards/margins": 0.9766052961349487,
"rewards/rejected": -4.207183837890625,
"step": 510
},
{
"epoch": 0.42,
"learning_rate": 3.627193851723577e-06,
"logits/chosen": -1.5923402309417725,
"logits/rejected": -1.473101258277893,
"logps/chosen": -652.4085083007812,
"logps/rejected": -711.7216796875,
"loss": 0.5059,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -4.209104061126709,
"rewards/margins": 0.7725512981414795,
"rewards/rejected": -4.981655120849609,
"step": 520
},
{
"epoch": 0.42,
"learning_rate": 3.564448228912682e-06,
"logits/chosen": -1.795478105545044,
"logits/rejected": -1.7586778402328491,
"logps/chosen": -623.7203369140625,
"logps/rejected": -718.7762451171875,
"loss": 0.4676,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -3.7662811279296875,
"rewards/margins": 0.7186762094497681,
"rewards/rejected": -4.484957218170166,
"step": 530
},
{
"epoch": 0.43,
"learning_rate": 3.5008725813922383e-06,
"logits/chosen": -1.7972501516342163,
"logits/rejected": -1.5103098154067993,
"logps/chosen": -662.8839721679688,
"logps/rejected": -714.4471435546875,
"loss": 0.4381,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -3.9809927940368652,
"rewards/margins": 1.153225064277649,
"rewards/rejected": -5.134218215942383,
"step": 540
},
{
"epoch": 0.44,
"learning_rate": 3.436516483539781e-06,
"logits/chosen": -1.6798747777938843,
"logits/rejected": -1.525529384613037,
"logps/chosen": -635.1442260742188,
"logps/rejected": -741.7098999023438,
"loss": 0.4519,
"rewards/accuracies": 0.625,
"rewards/chosen": -4.326822757720947,
"rewards/margins": 1.160167932510376,
"rewards/rejected": -5.486990928649902,
"step": 550
},
{
"epoch": 0.45,
"learning_rate": 3.3714301183045382e-06,
"logits/chosen": -1.8586448431015015,
"logits/rejected": -1.954077959060669,
"logps/chosen": -697.0865478515625,
"logps/rejected": -801.1902465820312,
"loss": 0.4172,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.986204147338867,
"rewards/margins": 1.209644079208374,
"rewards/rejected": -5.195847511291504,
"step": 560
},
{
"epoch": 0.46,
"learning_rate": 3.3056642380762783e-06,
"logits/chosen": -1.7111326456069946,
"logits/rejected": -1.7340152263641357,
"logps/chosen": -598.3841552734375,
"logps/rejected": -705.2341918945312,
"loss": 0.4755,
"rewards/accuracies": 0.625,
"rewards/chosen": -3.439131259918213,
"rewards/margins": 1.249565601348877,
"rewards/rejected": -4.68869686126709,
"step": 570
},
{
"epoch": 0.46,
"learning_rate": 3.2392701251101172e-06,
"logits/chosen": -1.790801763534546,
"logits/rejected": -1.677916169166565,
"logps/chosen": -565.8692016601562,
"logps/rejected": -629.5975341796875,
"loss": 0.5383,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -3.4626059532165527,
"rewards/margins": 1.0360872745513916,
"rewards/rejected": -4.498693466186523,
"step": 580
},
{
"epoch": 0.47,
"learning_rate": 3.1722995515381644e-06,
"logits/chosen": -1.763465166091919,
"logits/rejected": -1.6387125253677368,
"logps/chosen": -498.0137634277344,
"logps/rejected": -560.8174438476562,
"loss": 0.4456,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.469872236251831,
"rewards/margins": 0.9421303868293762,
"rewards/rejected": -3.4120030403137207,
"step": 590
},
{
"epoch": 0.48,
"learning_rate": 3.1048047389991693e-06,
"logits/chosen": -1.873694658279419,
"logits/rejected": -1.7188327312469482,
"logps/chosen": -492.42742919921875,
"logps/rejected": -610.1829833984375,
"loss": 0.4947,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.7357325553894043,
"rewards/margins": 1.0360163450241089,
"rewards/rejected": -3.7717490196228027,
"step": 600
},
{
"epoch": 0.49,
"learning_rate": 3.0368383179176584e-06,
"logits/chosen": -1.798766851425171,
"logits/rejected": -1.7281709909439087,
"logps/chosen": -465.31207275390625,
"logps/rejected": -569.0285034179688,
"loss": 0.4429,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.2491648197174072,
"rewards/margins": 1.305397391319275,
"rewards/rejected": -3.55456280708313,
"step": 610
},
{
"epoch": 0.5,
"learning_rate": 2.9684532864643123e-06,
"logits/chosen": -1.898830771446228,
"logits/rejected": -1.7822704315185547,
"logps/chosen": -488.25653076171875,
"logps/rejected": -588.7685546875,
"loss": 0.4264,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.347430944442749,
"rewards/margins": 1.2308118343353271,
"rewards/rejected": -3.578242540359497,
"step": 620
},
{
"epoch": 0.5,
"learning_rate": 2.8997029692295875e-06,
"logits/chosen": -2.0265390872955322,
"logits/rejected": -1.8765138387680054,
"logps/chosen": -493.7826232910156,
"logps/rejected": -595.54541015625,
"loss": 0.4438,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.2215516567230225,
"rewards/margins": 1.2061747312545776,
"rewards/rejected": -3.4277260303497314,
"step": 630
},
{
"epoch": 0.51,
"learning_rate": 2.8306409756428067e-06,
"logits/chosen": -1.8571748733520508,
"logits/rejected": -1.4381787776947021,
"logps/chosen": -562.1082153320312,
"logps/rejected": -625.988525390625,
"loss": 0.4561,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.61163330078125,
"rewards/margins": 1.2634782791137695,
"rewards/rejected": -3.8751113414764404,
"step": 640
},
{
"epoch": 0.52,
"learning_rate": 2.761321158169134e-06,
"logits/chosen": -1.9520680904388428,
"logits/rejected": -1.6933047771453857,
"logps/chosen": -682.3081665039062,
"logps/rejected": -772.539794921875,
"loss": 0.4134,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.052522659301758,
"rewards/margins": 1.4804089069366455,
"rewards/rejected": -5.532931327819824,
"step": 650
},
{
"epoch": 0.53,
"learning_rate": 2.6917975703170466e-06,
"logits/chosen": -1.5707025527954102,
"logits/rejected": -1.276710867881775,
"logps/chosen": -679.2574462890625,
"logps/rejected": -891.1485595703125,
"loss": 0.4658,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -4.277982711791992,
"rewards/margins": 1.9390618801116943,
"rewards/rejected": -6.217044830322266,
"step": 660
},
{
"epoch": 0.54,
"learning_rate": 2.6221244244890336e-06,
"logits/chosen": -1.6595405340194702,
"logits/rejected": -1.4634872674942017,
"logps/chosen": -665.2347412109375,
"logps/rejected": -793.9863891601562,
"loss": 0.381,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.211177349090576,
"rewards/margins": 1.538806676864624,
"rewards/rejected": -5.749984264373779,
"step": 670
},
{
"epoch": 0.54,
"learning_rate": 2.5523560497083927e-06,
"logits/chosen": -1.8683850765228271,
"logits/rejected": -1.7012536525726318,
"logps/chosen": -486.9891052246094,
"logps/rejected": -554.7735595703125,
"loss": 0.5042,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.8961403369903564,
"rewards/margins": 0.7647749185562134,
"rewards/rejected": -3.660914897918701,
"step": 680
},
{
"epoch": 0.55,
"learning_rate": 2.482546849255096e-06,
"logits/chosen": -1.7561218738555908,
"logits/rejected": -1.4548377990722656,
"logps/chosen": -519.9765625,
"logps/rejected": -611.5833740234375,
"loss": 0.4507,
"rewards/accuracies": 0.625,
"rewards/chosen": -3.0487327575683594,
"rewards/margins": 1.145900845527649,
"rewards/rejected": -4.194633960723877,
"step": 690
},
{
"epoch": 0.56,
"learning_rate": 2.4127512582437486e-06,
"logits/chosen": -1.7805372476577759,
"logits/rejected": -1.7106291055679321,
"logps/chosen": -587.1712646484375,
"logps/rejected": -704.341064453125,
"loss": 0.4176,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.0211739540100098,
"rewards/margins": 1.2845187187194824,
"rewards/rejected": -4.30569314956665,
"step": 700
},
{
"epoch": 0.57,
"learning_rate": 2.3430237011767166e-06,
"logits/chosen": -1.6074645519256592,
"logits/rejected": -1.453162431716919,
"logps/chosen": -528.9210205078125,
"logps/rejected": -630.1168823242188,
"loss": 0.4237,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.0611021518707275,
"rewards/margins": 1.255999207496643,
"rewards/rejected": -4.31710147857666,
"step": 710
},
{
"epoch": 0.58,
"learning_rate": 2.2734185495055503e-06,
"logits/chosen": -1.4858500957489014,
"logits/rejected": -1.2407737970352173,
"logps/chosen": -591.7060546875,
"logps/rejected": -691.8463745117188,
"loss": 0.3379,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.651978015899658,
"rewards/margins": 1.4824391603469849,
"rewards/rejected": -5.134417533874512,
"step": 720
},
{
"epoch": 0.58,
"learning_rate": 2.2039900792337477e-06,
"logits/chosen": -1.4694669246673584,
"logits/rejected": -1.2653883695602417,
"logps/chosen": -646.7306518554688,
"logps/rejected": -756.9968872070312,
"loss": 0.4527,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -3.894749164581299,
"rewards/margins": 1.4129467010498047,
"rewards/rejected": -5.3076958656311035,
"step": 730
},
{
"epoch": 0.59,
"learning_rate": 2.134792428593971e-06,
"logits/chosen": -1.6758044958114624,
"logits/rejected": -1.3347949981689453,
"logps/chosen": -642.6209716796875,
"logps/rejected": -756.3058471679688,
"loss": 0.4029,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -3.9325084686279297,
"rewards/margins": 1.526052474975586,
"rewards/rejected": -5.458560943603516,
"step": 740
},
{
"epoch": 0.6,
"learning_rate": 2.0658795558326745e-06,
"logits/chosen": -1.530992031097412,
"logits/rejected": -1.616612195968628,
"logps/chosen": -558.5872802734375,
"logps/rejected": -712.8720703125,
"loss": 0.5007,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.5670769214630127,
"rewards/margins": 1.441688895225525,
"rewards/rejected": -5.008765697479248,
"step": 750
},
{
"epoch": 0.61,
"learning_rate": 1.997305197135089e-06,
"logits/chosen": -1.5513006448745728,
"logits/rejected": -1.3652799129486084,
"logps/chosen": -569.5272216796875,
"logps/rejected": -667.7091064453125,
"loss": 0.4669,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -3.5715885162353516,
"rewards/margins": 1.0474005937576294,
"rewards/rejected": -4.618988990783691,
"step": 760
},
{
"epoch": 0.62,
"learning_rate": 1.9291228247233607e-06,
"logits/chosen": -1.9496749639511108,
"logits/rejected": -1.60714852809906,
"logps/chosen": -613.51806640625,
"logps/rejected": -627.2810668945312,
"loss": 0.4715,
"rewards/accuracies": 0.625,
"rewards/chosen": -3.2026474475860596,
"rewards/margins": 0.9100178480148315,
"rewards/rejected": -4.112665176391602,
"step": 770
},
{
"epoch": 0.62,
"learning_rate": 1.8613856051605242e-06,
"logits/chosen": -1.639764428138733,
"logits/rejected": -1.4028499126434326,
"logps/chosen": -560.9642333984375,
"logps/rejected": -718.7645263671875,
"loss": 0.4383,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.006631374359131,
"rewards/margins": 1.6886869668960571,
"rewards/rejected": -4.69531774520874,
"step": 780
},
{
"epoch": 0.63,
"learning_rate": 1.7941463578928088e-06,
"logits/chosen": -1.7558552026748657,
"logits/rejected": -1.731774091720581,
"logps/chosen": -547.339599609375,
"logps/rejected": -605.9044189453125,
"loss": 0.5097,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -3.284121036529541,
"rewards/margins": 0.618303120136261,
"rewards/rejected": -3.902423858642578,
"step": 790
},
{
"epoch": 0.64,
"learning_rate": 1.7274575140626318e-06,
"logits/chosen": -1.662502646446228,
"logits/rejected": -1.8295265436172485,
"logps/chosen": -503.92352294921875,
"logps/rejected": -656.8177490234375,
"loss": 0.3823,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.5763401985168457,
"rewards/margins": 1.3676784038543701,
"rewards/rejected": -3.944018840789795,
"step": 800
},
{
"epoch": 0.65,
"learning_rate": 1.661371075624363e-06,
"logits/chosen": -1.6462104320526123,
"logits/rejected": -1.3300979137420654,
"logps/chosen": -448.4722595214844,
"logps/rejected": -538.0807495117188,
"loss": 0.436,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.5369763374328613,
"rewards/margins": 1.249807357788086,
"rewards/rejected": -3.7867836952209473,
"step": 810
},
{
"epoch": 0.66,
"learning_rate": 1.5959385747947697e-06,
"logits/chosen": -1.6544301509857178,
"logits/rejected": -1.6204363107681274,
"logps/chosen": -540.1203002929688,
"logps/rejected": -714.814697265625,
"loss": 0.3821,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.9797520637512207,
"rewards/margins": 1.4802463054656982,
"rewards/rejected": -4.45999813079834,
"step": 820
},
{
"epoch": 0.66,
"learning_rate": 1.5312110338697427e-06,
"logits/chosen": -1.8631584644317627,
"logits/rejected": -1.7527086734771729,
"logps/chosen": -472.2159118652344,
"logps/rejected": -541.8966064453125,
"loss": 0.5163,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.802929639816284,
"rewards/margins": 0.781417727470398,
"rewards/rejected": -3.58434796333313,
"step": 830
},
{
"epoch": 0.67,
"learning_rate": 1.467238925438646e-06,
"logits/chosen": -1.8967678546905518,
"logits/rejected": -1.7050600051879883,
"logps/chosen": -537.9852905273438,
"logps/rejected": -641.7759399414062,
"loss": 0.4672,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.674623489379883,
"rewards/margins": 1.1100194454193115,
"rewards/rejected": -3.7846426963806152,
"step": 840
},
{
"epoch": 0.68,
"learning_rate": 1.4040721330273063e-06,
"logits/chosen": -1.5284979343414307,
"logits/rejected": -1.2263238430023193,
"logps/chosen": -609.40283203125,
"logps/rejected": -748.0750732421875,
"loss": 0.3961,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -3.1462016105651855,
"rewards/margins": 1.6625381708145142,
"rewards/rejected": -4.808740139007568,
"step": 850
},
{
"epoch": 0.69,
"learning_rate": 1.3417599122003464e-06,
"logits/chosen": -1.913924217224121,
"logits/rejected": -1.4708651304244995,
"logps/chosen": -531.4802856445312,
"logps/rejected": -647.6087646484375,
"loss": 0.3934,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.661881923675537,
"rewards/margins": 1.7113628387451172,
"rewards/rejected": -4.373244285583496,
"step": 860
},
{
"epoch": 0.7,
"learning_rate": 1.280350852153168e-06,
"logits/chosen": -1.6116275787353516,
"logits/rejected": -1.7562923431396484,
"logps/chosen": -450.93243408203125,
"logps/rejected": -635.6582641601562,
"loss": 0.4188,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.6061110496520996,
"rewards/margins": 1.2891266345977783,
"rewards/rejected": -3.895237684249878,
"step": 870
},
{
"epoch": 0.7,
"learning_rate": 1.2198928378235717e-06,
"logits/chosen": -1.674541711807251,
"logits/rejected": -1.5418341159820557,
"logps/chosen": -596.4996337890625,
"logps/rejected": -777.8062133789062,
"loss": 0.4317,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.463611602783203,
"rewards/margins": 1.5395991802215576,
"rewards/rejected": -5.00321102142334,
"step": 880
},
{
"epoch": 0.71,
"learning_rate": 1.160433012552508e-06,
"logits/chosen": -1.8214279413223267,
"logits/rejected": -1.766122579574585,
"logps/chosen": -468.58197021484375,
"logps/rejected": -573.4989624023438,
"loss": 0.5022,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.285228967666626,
"rewards/margins": 1.5109031200408936,
"rewards/rejected": -3.7961318492889404,
"step": 890
},
{
"epoch": 0.72,
"learning_rate": 1.1020177413231334e-06,
"logits/chosen": -1.842508316040039,
"logits/rejected": -1.6384559869766235,
"logps/chosen": -473.90576171875,
"logps/rejected": -624.7535400390625,
"loss": 0.4147,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.3901591300964355,
"rewards/margins": 1.7103700637817383,
"rewards/rejected": -4.100529193878174,
"step": 900
},
{
"epoch": 0.73,
"learning_rate": 1.0446925746067768e-06,
"logits/chosen": -1.6257625818252563,
"logits/rejected": -1.366236925125122,
"logps/chosen": -425.0322265625,
"logps/rejected": -510.67645263671875,
"loss": 0.407,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.007061004638672,
"rewards/margins": 1.399216890335083,
"rewards/rejected": -3.406277894973755,
"step": 910
},
{
"epoch": 0.74,
"learning_rate": 9.88502212844063e-07,
"logits/chosen": -1.7914766073226929,
"logits/rejected": -1.665331482887268,
"logps/chosen": -537.3778686523438,
"logps/rejected": -753.0563354492188,
"loss": 0.4276,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.615752696990967,
"rewards/margins": 2.0941243171691895,
"rewards/rejected": -4.7098774909973145,
"step": 920
},
{
"epoch": 0.74,
"learning_rate": 9.334904715888496e-07,
"logits/chosen": -1.7023980617523193,
"logits/rejected": -1.4218547344207764,
"logps/chosen": -579.594970703125,
"logps/rejected": -793.6122436523438,
"loss": 0.4859,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.9950103759765625,
"rewards/margins": 2.0560593605041504,
"rewards/rejected": -5.051069736480713,
"step": 930
},
{
"epoch": 0.75,
"learning_rate": 8.797002473421729e-07,
"logits/chosen": -1.6579145193099976,
"logits/rejected": -1.6787497997283936,
"logps/chosen": -541.71826171875,
"logps/rejected": -658.0726318359375,
"loss": 0.4487,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.8812687397003174,
"rewards/margins": 1.0891892910003662,
"rewards/rejected": -3.9704577922821045,
"step": 940
},
{
"epoch": 0.76,
"learning_rate": 8.271734841028553e-07,
"logits/chosen": -1.638536810874939,
"logits/rejected": -1.4352543354034424,
"logps/chosen": -552.4362182617188,
"logps/rejected": -678.9232788085938,
"loss": 0.4098,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.9728922843933105,
"rewards/margins": 1.4913972616195679,
"rewards/rejected": -4.464289665222168,
"step": 950
},
{
"epoch": 0.77,
"learning_rate": 7.759511406608255e-07,
"logits/chosen": -1.7685962915420532,
"logits/rejected": -1.6653659343719482,
"logps/chosen": -514.057373046875,
"logps/rejected": -638.7364501953125,
"loss": 0.4683,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.7808051109313965,
"rewards/margins": 1.2293360233306885,
"rewards/rejected": -4.010141372680664,
"step": 960
},
{
"epoch": 0.78,
"learning_rate": 7.260731586586983e-07,
"logits/chosen": -1.5464670658111572,
"logits/rejected": -1.6253650188446045,
"logps/chosen": -365.79888916015625,
"logps/rejected": -496.05535888671875,
"loss": 0.4741,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.3713974952697754,
"rewards/margins": 1.2587835788726807,
"rewards/rejected": -3.630180835723877,
"step": 970
},
{
"epoch": 0.78,
"learning_rate": 6.775784314464717e-07,
"logits/chosen": -1.8969453573226929,
"logits/rejected": -1.5643236637115479,
"logps/chosen": -483.40655517578125,
"logps/rejected": -605.5526123046875,
"loss": 0.4348,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.432926893234253,
"rewards/margins": 1.5910948514938354,
"rewards/rejected": -4.024021625518799,
"step": 980
},
{
"epoch": 0.79,
"learning_rate": 6.305047737536707e-07,
"logits/chosen": -1.5974626541137695,
"logits/rejected": -1.6372623443603516,
"logps/chosen": -524.9561767578125,
"logps/rejected": -677.9723510742188,
"loss": 0.4174,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.2284367084503174,
"rewards/margins": 1.2628570795059204,
"rewards/rejected": -4.491293907165527,
"step": 990
},
{
"epoch": 0.8,
"learning_rate": 5.848888922025553e-07,
"logits/chosen": -1.6726499795913696,
"logits/rejected": -1.8870794773101807,
"logps/chosen": -580.0320434570312,
"logps/rejected": -648.3935546875,
"loss": 0.4433,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -3.139599561691284,
"rewards/margins": 0.8954163789749146,
"rewards/rejected": -4.035016059875488,
"step": 1000
},
{
"epoch": 0.81,
"learning_rate": 5.407663566854008e-07,
"logits/chosen": -1.9880282878875732,
"logits/rejected": -1.7486753463745117,
"logps/chosen": -493.7062072753906,
"logps/rejected": -590.9326171875,
"loss": 0.5279,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.3209338188171387,
"rewards/margins": 1.3980019092559814,
"rewards/rejected": -3.7189362049102783,
"step": 1010
},
{
"epoch": 0.82,
"learning_rate": 4.981715726281666e-07,
"logits/chosen": -1.8252193927764893,
"logits/rejected": -1.7858692407608032,
"logps/chosen": -575.9556884765625,
"logps/rejected": -775.96630859375,
"loss": 0.3673,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.9226129055023193,
"rewards/margins": 1.7984931468963623,
"rewards/rejected": -4.72110652923584,
"step": 1020
},
{
"epoch": 0.82,
"learning_rate": 4.5713775416217884e-07,
"logits/chosen": -1.4751231670379639,
"logits/rejected": -1.5509759187698364,
"logps/chosen": -474.5804748535156,
"logps/rejected": -639.2667846679688,
"loss": 0.4805,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.7624380588531494,
"rewards/margins": 1.4808038473129272,
"rewards/rejected": -4.243242263793945,
"step": 1030
},
{
"epoch": 0.83,
"learning_rate": 4.1769689822475147e-07,
"logits/chosen": -1.4878826141357422,
"logits/rejected": -1.6433823108673096,
"logps/chosen": -429.17535400390625,
"logps/rejected": -613.1614379882812,
"loss": 0.5111,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.686619520187378,
"rewards/margins": 1.2492364645004272,
"rewards/rejected": -3.935856342315674,
"step": 1040
},
{
"epoch": 0.84,
"learning_rate": 3.798797596089351e-07,
"logits/chosen": -1.6832265853881836,
"logits/rejected": -1.4579049348831177,
"logps/chosen": -508.81622314453125,
"logps/rejected": -643.6360473632812,
"loss": 0.3685,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.7480552196502686,
"rewards/margins": 1.529114007949829,
"rewards/rejected": -4.277169227600098,
"step": 1050
},
{
"epoch": 0.85,
"learning_rate": 3.4371582698185636e-07,
"logits/chosen": -1.819790244102478,
"logits/rejected": -1.6671082973480225,
"logps/chosen": -602.0831298828125,
"logps/rejected": -778.685546875,
"loss": 0.3429,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -3.222461223602295,
"rewards/margins": 1.466620683670044,
"rewards/rejected": -4.68908166885376,
"step": 1060
},
{
"epoch": 0.86,
"learning_rate": 3.092332998903416e-07,
"logits/chosen": -1.7987339496612549,
"logits/rejected": -1.8623387813568115,
"logps/chosen": -553.125244140625,
"logps/rejected": -703.0887451171875,
"loss": 0.4544,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.028597116470337,
"rewards/margins": 1.3114429712295532,
"rewards/rejected": -4.34004020690918,
"step": 1070
},
{
"epoch": 0.86,
"learning_rate": 2.764590667717562e-07,
"logits/chosen": -1.5797593593597412,
"logits/rejected": -1.3366864919662476,
"logps/chosen": -491.5816345214844,
"logps/rejected": -525.8438110351562,
"loss": 0.4606,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -3.148127794265747,
"rewards/margins": 0.6825836300849915,
"rewards/rejected": -3.830711841583252,
"step": 1080
},
{
"epoch": 0.87,
"learning_rate": 2.454186839872158e-07,
"logits/chosen": -1.5686615705490112,
"logits/rejected": -1.4363592863082886,
"logps/chosen": -489.3838806152344,
"logps/rejected": -668.1046752929688,
"loss": 0.4054,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.8657939434051514,
"rewards/margins": 1.8218481540679932,
"rewards/rejected": -4.6876420974731445,
"step": 1090
},
{
"epoch": 0.88,
"learning_rate": 2.1613635589349756e-07,
"logits/chosen": -1.7610828876495361,
"logits/rejected": -1.5154088735580444,
"logps/chosen": -484.9877014160156,
"logps/rejected": -566.8745727539062,
"loss": 0.5207,
"rewards/accuracies": 0.5,
"rewards/chosen": -2.580667018890381,
"rewards/margins": 0.9732138514518738,
"rewards/rejected": -3.5538806915283203,
"step": 1100
},
{
"epoch": 0.89,
"learning_rate": 1.8863491596921745e-07,
"logits/chosen": -1.7479203939437866,
"logits/rejected": -1.4576488733291626,
"logps/chosen": -534.5175170898438,
"logps/rejected": -588.4047241210938,
"loss": 0.4262,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.9962010383605957,
"rewards/margins": 1.064734697341919,
"rewards/rejected": -4.060935020446777,
"step": 1110
},
{
"epoch": 0.9,
"learning_rate": 1.629358090099639e-07,
"logits/chosen": -1.6448123455047607,
"logits/rejected": -1.516871690750122,
"logps/chosen": -528.2589721679688,
"logps/rejected": -610.9138793945312,
"loss": 0.4469,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.179063558578491,
"rewards/margins": 1.1216888427734375,
"rewards/rejected": -4.30075216293335,
"step": 1120
},
{
"epoch": 0.9,
"learning_rate": 1.3905907440629752e-07,
"logits/chosen": -1.681919813156128,
"logits/rejected": -1.4408434629440308,
"logps/chosen": -494.0121154785156,
"logps/rejected": -599.4526977539062,
"loss": 0.4937,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.8302178382873535,
"rewards/margins": 1.3215891122817993,
"rewards/rejected": -4.1518073081970215,
"step": 1130
},
{
"epoch": 0.91,
"learning_rate": 1.1702333051763271e-07,
"logits/chosen": -1.5896549224853516,
"logits/rejected": -1.3458514213562012,
"logps/chosen": -558.2330322265625,
"logps/rejected": -696.0857543945312,
"loss": 0.4117,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.992506742477417,
"rewards/margins": 1.719747543334961,
"rewards/rejected": -4.712254047393799,
"step": 1140
},
{
"epoch": 0.92,
"learning_rate": 9.684576015420277e-08,
"logits/chosen": -1.533817172050476,
"logits/rejected": -1.3470897674560547,
"logps/chosen": -540.8046875,
"logps/rejected": -716.5506591796875,
"loss": 0.3407,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.967010021209717,
"rewards/margins": 1.7264270782470703,
"rewards/rejected": -4.693437099456787,
"step": 1150
},
{
"epoch": 0.93,
"learning_rate": 7.854209717842231e-08,
"logits/chosen": -1.5058703422546387,
"logits/rejected": -1.4305099248886108,
"logps/chosen": -563.594970703125,
"logps/rejected": -608.3973388671875,
"loss": 0.4775,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -3.313091278076172,
"rewards/margins": 0.6520703434944153,
"rewards/rejected": -3.9651618003845215,
"step": 1160
},
{
"epoch": 0.94,
"learning_rate": 6.212661423609184e-08,
"logits/chosen": -1.4233185052871704,
"logits/rejected": -1.3441836833953857,
"logps/chosen": -597.6832275390625,
"logps/rejected": -758.8271484375,
"loss": 0.5072,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -3.8257243633270264,
"rewards/margins": 1.319272756576538,
"rewards/rejected": -5.144996643066406,
"step": 1170
},
{
"epoch": 0.94,
"learning_rate": 4.761211162702117e-08,
"logits/chosen": -1.603582739830017,
"logits/rejected": -1.5437158346176147,
"logps/chosen": -524.7338256835938,
"logps/rejected": -654.0867919921875,
"loss": 0.443,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.977849006652832,
"rewards/margins": 1.4028117656707764,
"rewards/rejected": -4.3806610107421875,
"step": 1180
},
{
"epoch": 0.95,
"learning_rate": 3.5009907323737826e-08,
"logits/chosen": -1.5934550762176514,
"logits/rejected": -1.2932679653167725,
"logps/chosen": -560.14501953125,
"logps/rejected": -724.4017333984375,
"loss": 0.4103,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -3.257315158843994,
"rewards/margins": 1.7088110446929932,
"rewards/rejected": -4.966126441955566,
"step": 1190
},
{
"epoch": 0.96,
"learning_rate": 2.4329828146074096e-08,
"logits/chosen": -1.9117376804351807,
"logits/rejected": -1.5988755226135254,
"logps/chosen": -599.6260986328125,
"logps/rejected": -691.9334716796875,
"loss": 0.4567,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.3284664154052734,
"rewards/margins": 1.461777925491333,
"rewards/rejected": -4.790244102478027,
"step": 1200
},
{
"epoch": 0.97,
"learning_rate": 1.5580202098509078e-08,
"logits/chosen": -1.714948058128357,
"logits/rejected": -1.6837193965911865,
"logps/chosen": -673.5400390625,
"logps/rejected": -731.3297119140625,
"loss": 0.519,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.3261985778808594,
"rewards/margins": 1.0625519752502441,
"rewards/rejected": -4.388751029968262,
"step": 1210
},
{
"epoch": 0.98,
"learning_rate": 8.767851876239075e-09,
"logits/chosen": -1.4761461019515991,
"logits/rejected": -1.4198424816131592,
"logps/chosen": -547.0154418945312,
"logps/rejected": -722.1144409179688,
"loss": 0.4213,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.2144112586975098,
"rewards/margins": 1.5727777481079102,
"rewards/rejected": -4.78718900680542,
"step": 1220
},
{
"epoch": 0.98,
"learning_rate": 3.8980895450474455e-09,
"logits/chosen": -1.6909526586532593,
"logits/rejected": -1.5214656591415405,
"logps/chosen": -526.8455810546875,
"logps/rejected": -642.462646484375,
"loss": 0.4702,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -3.0600032806396484,
"rewards/margins": 1.274106502532959,
"rewards/rejected": -4.334109783172607,
"step": 1230
},
{
"epoch": 0.99,
"learning_rate": 9.747123991141193e-10,
"logits/chosen": -1.7589871883392334,
"logits/rejected": -1.6617376804351807,
"logps/chosen": -495.7108459472656,
"logps/rejected": -648.486083984375,
"loss": 0.3952,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.6948044300079346,
"rewards/margins": 1.5261682271957397,
"rewards/rejected": -4.220972537994385,
"step": 1240
},
{
"epoch": 1.0,
"learning_rate": 0.0,
"logits/chosen": -1.6433188915252686,
"logits/rejected": -1.4641611576080322,
"logps/chosen": -550.0238037109375,
"logps/rejected": -689.4912109375,
"loss": 0.3973,
"rewards/accuracies": 0.625,
"rewards/chosen": -3.04687237739563,
"rewards/margins": 1.4918220043182373,
"rewards/rejected": -4.538693904876709,
"step": 1250
},
{
"epoch": 1.0,
"step": 1250,
"total_flos": 0.0,
"train_loss": 0.47997802200317385,
"train_runtime": 13155.732,
"train_samples_per_second": 1.14,
"train_steps_per_second": 0.095
}
],
"logging_steps": 10,
"max_steps": 1250,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}