{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998691442030882, "eval_steps": 500, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010468463752944255, "grad_norm": 53.38299050327549, "learning_rate": 5.208333333333333e-08, "logits/chosen": -0.7851310968399048, "logits/rejected": -0.8436899185180664, "logps/chosen": -1.1747503280639648, "logps/rejected": -1.3589198589324951, "loss": 1.3222, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1747503280639648, "rewards/margins": 0.18416966497898102, "rewards/rejected": -1.3589198589324951, "step": 5 }, { "epoch": 0.02093692750588851, "grad_norm": 11.096600890374436, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -0.8205683827400208, "logits/rejected": -0.828952968120575, "logps/chosen": -1.1577447652816772, "logps/rejected": -1.2625210285186768, "loss": 1.2895, "rewards/accuracies": 0.5, "rewards/chosen": -1.1577447652816772, "rewards/margins": 0.10477621853351593, "rewards/rejected": -1.2625210285186768, "step": 10 }, { "epoch": 0.031405391258832765, "grad_norm": 30.997194358680527, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -0.7905334830284119, "logits/rejected": -0.8021936416625977, "logps/chosen": -1.10293710231781, "logps/rejected": -1.358322262763977, "loss": 1.2335, "rewards/accuracies": 0.65625, "rewards/chosen": -1.10293710231781, "rewards/margins": 0.25538530945777893, "rewards/rejected": -1.358322262763977, "step": 15 }, { "epoch": 0.04187385501177702, "grad_norm": 34.90059989061958, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -0.715395450592041, "logits/rejected": -0.7976140975952148, "logps/chosen": -1.148432970046997, "logps/rejected": -1.2527401447296143, "loss": 1.291, "rewards/accuracies": 0.5625, "rewards/chosen": -1.148432970046997, "rewards/margins": 0.1043071374297142, "rewards/rejected": -1.2527401447296143, "step": 20 }, { "epoch": 0.05234231876472128, "grad_norm": 21.54023932842184, "learning_rate": 2.604166666666667e-07, "logits/chosen": -0.8060086965560913, "logits/rejected": -0.8113024830818176, "logps/chosen": -1.1473302841186523, "logps/rejected": -1.2153716087341309, "loss": 1.271, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1473302841186523, "rewards/margins": 0.06804127246141434, "rewards/rejected": -1.2153716087341309, "step": 25 }, { "epoch": 0.06281078251766553, "grad_norm": 13.45224813667644, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -0.781649112701416, "logits/rejected": -0.8074128031730652, "logps/chosen": -1.0859925746917725, "logps/rejected": -1.2173770666122437, "loss": 1.3053, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.0859925746917725, "rewards/margins": 0.13138458132743835, "rewards/rejected": -1.2173770666122437, "step": 30 }, { "epoch": 0.07327924627060979, "grad_norm": 12.521849477381108, "learning_rate": 3.645833333333333e-07, "logits/chosen": -0.8088111877441406, "logits/rejected": -0.7897270917892456, "logps/chosen": -1.063861608505249, "logps/rejected": -1.1901941299438477, "loss": 1.2735, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.063861608505249, "rewards/margins": 0.12633253633975983, "rewards/rejected": -1.1901941299438477, "step": 35 }, { "epoch": 0.08374771002355404, "grad_norm": 10.95575535834073, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.7821220755577087, "logits/rejected": -0.8967756032943726, "logps/chosen": -0.9788470268249512, "logps/rejected": -1.2930492162704468, "loss": 1.1906, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9788470268249512, "rewards/margins": 0.31420233845710754, "rewards/rejected": -1.2930492162704468, "step": 40 }, { "epoch": 0.0942161737764983, "grad_norm": 12.779358937615863, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -0.754804253578186, "logits/rejected": -0.8189429044723511, "logps/chosen": -0.9838635325431824, "logps/rejected": -1.1558878421783447, "loss": 1.2276, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.9838635325431824, "rewards/margins": 0.1720241755247116, "rewards/rejected": -1.1558878421783447, "step": 45 }, { "epoch": 0.10468463752944256, "grad_norm": 42.09305399446999, "learning_rate": 4.999731868769026e-07, "logits/chosen": -0.8073973655700684, "logits/rejected": -0.8861321210861206, "logps/chosen": -1.0215693712234497, "logps/rejected": -1.269590139389038, "loss": 1.2544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0215693712234497, "rewards/margins": 0.2480209767818451, "rewards/rejected": -1.269590139389038, "step": 50 }, { "epoch": 0.11515310128238682, "grad_norm": 15.725192775747708, "learning_rate": 4.996716052911017e-07, "logits/chosen": -0.8139156103134155, "logits/rejected": -0.8545462489128113, "logps/chosen": -0.9811293482780457, "logps/rejected": -1.1653324365615845, "loss": 1.2404, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.9811293482780457, "rewards/margins": 0.18420298397541046, "rewards/rejected": -1.1653324365615845, "step": 55 }, { "epoch": 0.12562156503533106, "grad_norm": 6.989622500925356, "learning_rate": 4.990353313429303e-07, "logits/chosen": -0.838766872882843, "logits/rejected": -0.8909306526184082, "logps/chosen": -1.035679578781128, "logps/rejected": -1.1445151567459106, "loss": 1.191, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.035679578781128, "rewards/margins": 0.10883557796478271, "rewards/rejected": -1.1445151567459106, "step": 60 }, { "epoch": 0.1360900287882753, "grad_norm": 12.681220657714858, "learning_rate": 4.980652179769217e-07, "logits/chosen": -0.7864788174629211, "logits/rejected": -0.8672641515731812, "logps/chosen": -0.9717243909835815, "logps/rejected": -1.3309270143508911, "loss": 1.2547, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9717243909835815, "rewards/margins": 0.35920244455337524, "rewards/rejected": -1.3309270143508911, "step": 65 }, { "epoch": 0.14655849254121958, "grad_norm": 34.48694322098047, "learning_rate": 4.967625656594781e-07, "logits/chosen": -0.7827141880989075, "logits/rejected": -0.8242231607437134, "logps/chosen": -1.0319292545318604, "logps/rejected": -1.2068545818328857, "loss": 1.2029, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.0319292545318604, "rewards/margins": 0.17492541670799255, "rewards/rejected": -1.2068545818328857, "step": 70 }, { "epoch": 0.15702695629416383, "grad_norm": 18.131846977177915, "learning_rate": 4.951291206355559e-07, "logits/chosen": -0.7711262702941895, "logits/rejected": -0.8099849820137024, "logps/chosen": -1.0138498544692993, "logps/rejected": -1.2658698558807373, "loss": 1.2066, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0138498544692993, "rewards/margins": 0.2520199716091156, "rewards/rejected": -1.2658698558807373, "step": 75 }, { "epoch": 0.16749542004710807, "grad_norm": 16.30695375589055, "learning_rate": 4.93167072587771e-07, "logits/chosen": -0.7672514915466309, "logits/rejected": -0.8570995330810547, "logps/chosen": -1.038516640663147, "logps/rejected": -1.283189058303833, "loss": 1.24, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.038516640663147, "rewards/margins": 0.24467238783836365, "rewards/rejected": -1.283189058303833, "step": 80 }, { "epoch": 0.17796388380005235, "grad_norm": 12.43456863304776, "learning_rate": 4.908790517010636e-07, "logits/chosen": -0.8523880243301392, "logits/rejected": -0.8795903325080872, "logps/chosen": -1.0062463283538818, "logps/rejected": -1.2304340600967407, "loss": 1.2293, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0062463283538818, "rewards/margins": 0.22418764233589172, "rewards/rejected": -1.2304340600967407, "step": 85 }, { "epoch": 0.1884323475529966, "grad_norm": 12.92135108486493, "learning_rate": 4.882681251368548e-07, "logits/chosen": -0.7969440221786499, "logits/rejected": -0.8410407304763794, "logps/chosen": -1.1051506996154785, "logps/rejected": -1.3057395219802856, "loss": 1.1903, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1051506996154785, "rewards/margins": 0.2005888968706131, "rewards/rejected": -1.3057395219802856, "step": 90 }, { "epoch": 0.19890081130594087, "grad_norm": 44.28729515072602, "learning_rate": 4.853377929214243e-07, "logits/chosen": -0.797573447227478, "logits/rejected": -0.8198971748352051, "logps/chosen": -1.0979535579681396, "logps/rejected": -1.2997150421142578, "loss": 1.2433, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.0979535579681396, "rewards/margins": 0.2017616331577301, "rewards/rejected": -1.2997150421142578, "step": 95 }, { "epoch": 0.2093692750588851, "grad_norm": 21.97688831309354, "learning_rate": 4.820919832540181e-07, "logits/chosen": -0.8293838500976562, "logits/rejected": -0.9006645083427429, "logps/chosen": -1.1275050640106201, "logps/rejected": -1.4170247316360474, "loss": 1.224, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1275050640106201, "rewards/margins": 0.2895195782184601, "rewards/rejected": -1.4170247316360474, "step": 100 }, { "epoch": 0.21983773881182936, "grad_norm": 17.71438347196099, "learning_rate": 4.785350472409791e-07, "logits/chosen": -0.8167816400527954, "logits/rejected": -0.8581489324569702, "logps/chosen": -1.0432454347610474, "logps/rejected": -1.3831846714019775, "loss": 1.1871, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0432454347610474, "rewards/margins": 0.33993929624557495, "rewards/rejected": -1.3831846714019775, "step": 105 }, { "epoch": 0.23030620256477363, "grad_norm": 23.125824998094913, "learning_rate": 4.7467175306295647e-07, "logits/chosen": -0.8648807406425476, "logits/rejected": -0.9196739196777344, "logps/chosen": -1.0456058979034424, "logps/rejected": -1.3506977558135986, "loss": 1.1778, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0456058979034424, "rewards/margins": 0.3050919473171234, "rewards/rejected": -1.3506977558135986, "step": 110 }, { "epoch": 0.24077466631771788, "grad_norm": 14.365274758297517, "learning_rate": 4.70507279583015e-07, "logits/chosen": -0.7953716516494751, "logits/rejected": -0.88847416639328, "logps/chosen": -1.0470479726791382, "logps/rejected": -1.4342092275619507, "loss": 1.1484, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0470479726791382, "rewards/margins": 0.3871612846851349, "rewards/rejected": -1.4342092275619507, "step": 115 }, { "epoch": 0.2512431300706621, "grad_norm": 9.392197211358374, "learning_rate": 4.6604720940421207e-07, "logits/chosen": -0.808591365814209, "logits/rejected": -0.8512045741081238, "logps/chosen": -1.1438627243041992, "logps/rejected": -1.3594746589660645, "loss": 1.2183, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1438627243041992, "rewards/margins": 0.21561190485954285, "rewards/rejected": -1.3594746589660645, "step": 120 }, { "epoch": 0.26171159382360637, "grad_norm": 13.61886061579911, "learning_rate": 4.612975213859487e-07, "logits/chosen": -0.8596604466438293, "logits/rejected": -0.8270719647407532, "logps/chosen": -1.0683120489120483, "logps/rejected": -1.265951156616211, "loss": 1.1762, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0683120489120483, "rewards/margins": 0.1976391226053238, "rewards/rejected": -1.265951156616211, "step": 125 }, { "epoch": 0.2721800575765506, "grad_norm": 10.064614855356515, "learning_rate": 4.5626458262912735e-07, "logits/chosen": -0.7958996891975403, "logits/rejected": -0.8618384599685669, "logps/chosen": -1.0645456314086914, "logps/rejected": -1.2115322351455688, "loss": 1.2057, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0645456314086914, "rewards/margins": 0.1469867080450058, "rewards/rejected": -1.2115322351455688, "step": 130 }, { "epoch": 0.2826485213294949, "grad_norm": 12.524149653434907, "learning_rate": 4.5095513994085974e-07, "logits/chosen": -0.7780163884162903, "logits/rejected": -0.8346039056777954, "logps/chosen": -1.1273787021636963, "logps/rejected": -1.5501461029052734, "loss": 1.1594, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1273787021636963, "rewards/margins": 0.42276740074157715, "rewards/rejected": -1.5501461029052734, "step": 135 }, { "epoch": 0.29311698508243916, "grad_norm": 20.64490593527733, "learning_rate": 4.453763107901675e-07, "logits/chosen": -0.8106335401535034, "logits/rejected": -0.8613120317459106, "logps/chosen": -1.122890591621399, "logps/rejected": -1.440879225730896, "loss": 1.1492, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.122890591621399, "rewards/margins": 0.3179887533187866, "rewards/rejected": -1.440879225730896, "step": 140 }, { "epoch": 0.3035854488353834, "grad_norm": 21.025601642857428, "learning_rate": 4.395355737667985e-07, "logits/chosen": -0.8138578534126282, "logits/rejected": -0.9089186787605286, "logps/chosen": -1.1396957635879517, "logps/rejected": -1.4218051433563232, "loss": 1.1702, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1396957635879517, "rewards/margins": 0.2821093797683716, "rewards/rejected": -1.4218051433563232, "step": 145 }, { "epoch": 0.31405391258832765, "grad_norm": 16.06397903977772, "learning_rate": 4.3344075855595097e-07, "logits/chosen": -0.8069850206375122, "logits/rejected": -0.820785641670227, "logps/chosen": -1.108163833618164, "logps/rejected": -1.3765920400619507, "loss": 1.1828, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.108163833618164, "rewards/margins": 0.2684280276298523, "rewards/rejected": -1.3765920400619507, "step": 150 }, { "epoch": 0.3245223763412719, "grad_norm": 11.984072545843137, "learning_rate": 4.271000354423425e-07, "logits/chosen": -0.8032317161560059, "logits/rejected": -0.82854825258255, "logps/chosen": -1.1042635440826416, "logps/rejected": -1.3533399105072021, "loss": 1.1265, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1042635440826416, "rewards/margins": 0.24907639622688293, "rewards/rejected": -1.3533399105072021, "step": 155 }, { "epoch": 0.33499084009421615, "grad_norm": 25.182537944964707, "learning_rate": 4.2052190435769554e-07, "logits/chosen": -0.7944300770759583, "logits/rejected": -0.8345978856086731, "logps/chosen": -1.1058661937713623, "logps/rejected": -1.3634922504425049, "loss": 1.1469, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1058661937713623, "rewards/margins": 0.25762611627578735, "rewards/rejected": -1.3634922504425049, "step": 160 }, { "epoch": 0.34545930384716045, "grad_norm": 13.462685553639172, "learning_rate": 4.137151834863213e-07, "logits/chosen": -0.8361462354660034, "logits/rejected": -0.8756030797958374, "logps/chosen": -1.0616618394851685, "logps/rejected": -1.4116162061691284, "loss": 1.1281, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0616618394851685, "rewards/margins": 0.3499544858932495, "rewards/rejected": -1.4116162061691284, "step": 165 }, { "epoch": 0.3559277676001047, "grad_norm": 18.213136990144893, "learning_rate": 4.0668899744407567e-07, "logits/chosen": -0.8153193593025208, "logits/rejected": -0.8727362751960754, "logps/chosen": -1.1646827459335327, "logps/rejected": -1.4618403911590576, "loss": 1.1646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1646827459335327, "rewards/margins": 0.29715752601623535, "rewards/rejected": -1.4618403911590576, "step": 170 }, { "epoch": 0.36639623135304894, "grad_norm": 24.63000430982005, "learning_rate": 3.994527650465352e-07, "logits/chosen": -0.8038943409919739, "logits/rejected": -0.8953922390937805, "logps/chosen": -1.129650354385376, "logps/rejected": -1.4100673198699951, "loss": 1.1683, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.129650354385376, "rewards/margins": 0.28041717410087585, "rewards/rejected": -1.4100673198699951, "step": 175 }, { "epoch": 0.3768646951059932, "grad_norm": 16.639928125809334, "learning_rate": 3.920161866827889e-07, "logits/chosen": -0.7904566526412964, "logits/rejected": -0.8678900599479675, "logps/chosen": -1.1200312376022339, "logps/rejected": -1.4327762126922607, "loss": 1.1534, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1200312376022339, "rewards/margins": 0.31274500489234924, "rewards/rejected": -1.4327762126922607, "step": 180 }, { "epoch": 0.38733315885893743, "grad_norm": 11.589602931595458, "learning_rate": 3.8438923131177237e-07, "logits/chosen": -0.7250074148178101, "logits/rejected": -0.8084772229194641, "logps/chosen": -1.190788745880127, "logps/rejected": -1.6289300918579102, "loss": 1.1075, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.190788745880127, "rewards/margins": 0.4381411671638489, "rewards/rejected": -1.6289300918579102, "step": 185 }, { "epoch": 0.39780162261188173, "grad_norm": 15.86395394669409, "learning_rate": 3.765821230985757e-07, "logits/chosen": -0.8317627906799316, "logits/rejected": -0.8573846817016602, "logps/chosen": -1.165771722793579, "logps/rejected": -1.4369280338287354, "loss": 1.2539, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.165771722793579, "rewards/margins": 0.2711564600467682, "rewards/rejected": -1.4369280338287354, "step": 190 }, { "epoch": 0.408270086364826, "grad_norm": 23.09052292567885, "learning_rate": 3.6860532770864005e-07, "logits/chosen": -0.7901838421821594, "logits/rejected": -0.8154572248458862, "logps/chosen": -1.1995000839233398, "logps/rejected": -1.4376014471054077, "loss": 1.1802, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1995000839233398, "rewards/margins": 0.2381015568971634, "rewards/rejected": -1.4376014471054077, "step": 195 }, { "epoch": 0.4187385501177702, "grad_norm": 91.91124319542283, "learning_rate": 3.604695382782159e-07, "logits/chosen": -0.8659864664077759, "logits/rejected": -0.8673130869865417, "logps/chosen": -1.2066630125045776, "logps/rejected": -1.5082231760025024, "loss": 1.1494, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2066630125045776, "rewards/margins": 0.30156025290489197, "rewards/rejected": -1.5082231760025024, "step": 200 }, { "epoch": 0.42920701387071447, "grad_norm": 56.369612183510746, "learning_rate": 3.5218566107988867e-07, "logits/chosen": -0.7527281045913696, "logits/rejected": -0.7920881509780884, "logps/chosen": -1.0741350650787354, "logps/rejected": -1.4695587158203125, "loss": 1.1027, "rewards/accuracies": 0.625, "rewards/chosen": -1.0741350650787354, "rewards/margins": 0.3954237103462219, "rewards/rejected": -1.4695587158203125, "step": 205 }, { "epoch": 0.4396754776236587, "grad_norm": 31.962536697630313, "learning_rate": 3.4376480090239047e-07, "logits/chosen": -0.7644343376159668, "logits/rejected": -0.8094353675842285, "logps/chosen": -1.1909334659576416, "logps/rejected": -1.398443579673767, "loss": 1.1695, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1909334659576416, "rewards/margins": 0.20751002430915833, "rewards/rejected": -1.398443579673767, "step": 210 }, { "epoch": 0.45014394137660296, "grad_norm": 20.423220329599825, "learning_rate": 3.3521824616429284e-07, "logits/chosen": -0.732006311416626, "logits/rejected": -0.8126561045646667, "logps/chosen": -1.1012035608291626, "logps/rejected": -1.499692678451538, "loss": 1.1111, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1012035608291626, "rewards/margins": 0.3984890282154083, "rewards/rejected": -1.499692678451538, "step": 215 }, { "epoch": 0.46061240512954726, "grad_norm": 38.12133833205935, "learning_rate": 3.265574537815398e-07, "logits/chosen": -0.7511489391326904, "logits/rejected": -0.811165452003479, "logps/chosen": -1.1531962156295776, "logps/rejected": -1.477449893951416, "loss": 1.1752, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1531962156295776, "rewards/margins": 0.3242538571357727, "rewards/rejected": -1.477449893951416, "step": 220 }, { "epoch": 0.4710808688824915, "grad_norm": 19.789554625674715, "learning_rate": 3.1779403380910425e-07, "logits/chosen": -0.746801495552063, "logits/rejected": -0.8311646580696106, "logps/chosen": -1.1100513935089111, "logps/rejected": -1.5284305810928345, "loss": 1.1388, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1100513935089111, "rewards/margins": 0.41837891936302185, "rewards/rejected": -1.5284305810928345, "step": 225 }, { "epoch": 0.48154933263543576, "grad_norm": 13.943436296638694, "learning_rate": 3.0893973387735683e-07, "logits/chosen": -0.743213951587677, "logits/rejected": -0.8207007646560669, "logps/chosen": -1.2091771364212036, "logps/rejected": -1.7089935541152954, "loss": 1.0624, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2091771364212036, "rewards/margins": 0.499816358089447, "rewards/rejected": -1.7089935541152954, "step": 230 }, { "epoch": 0.49201779638838, "grad_norm": 43.224342157968636, "learning_rate": 3.000064234440111e-07, "logits/chosen": -0.7580028772354126, "logits/rejected": -0.8552296757698059, "logps/chosen": -1.166473150253296, "logps/rejected": -1.590008020401001, "loss": 1.0657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.166473150253296, "rewards/margins": 0.4235347807407379, "rewards/rejected": -1.590008020401001, "step": 235 }, { "epoch": 0.5024862601413242, "grad_norm": 62.8562754120486, "learning_rate": 2.910060778827554e-07, "logits/chosen": -0.7522355318069458, "logits/rejected": -0.8166631460189819, "logps/chosen": -1.2486202716827393, "logps/rejected": -1.6446088552474976, "loss": 1.1827, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2486202716827393, "rewards/margins": 0.3959885239601135, "rewards/rejected": -1.6446088552474976, "step": 240 }, { "epoch": 0.5129547238942685, "grad_norm": 37.80662132118279, "learning_rate": 2.8195076242990116e-07, "logits/chosen": -0.7939907908439636, "logits/rejected": -0.8528479337692261, "logps/chosen": -1.2995712757110596, "logps/rejected": -1.6768890619277954, "loss": 1.1087, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2995712757110596, "rewards/margins": 0.3773179054260254, "rewards/rejected": -1.6768890619277954, "step": 245 }, { "epoch": 0.5234231876472127, "grad_norm": 24.343931328933355, "learning_rate": 2.7285261601056697e-07, "logits/chosen": -0.7352076768875122, "logits/rejected": -0.8288635015487671, "logps/chosen": -1.2183833122253418, "logps/rejected": -1.7130768299102783, "loss": 1.0936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2183833122253418, "rewards/margins": 0.4946935176849365, "rewards/rejected": -1.7130768299102783, "step": 250 }, { "epoch": 0.533891651400157, "grad_norm": 33.502855557989974, "learning_rate": 2.6372383496608186e-07, "logits/chosen": -0.76850426197052, "logits/rejected": -0.8392526507377625, "logps/chosen": -1.2973507642745972, "logps/rejected": -1.7654794454574585, "loss": 1.0362, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2973507642745972, "rewards/margins": 0.46812868118286133, "rewards/rejected": -1.7654794454574585, "step": 255 }, { "epoch": 0.5443601151531012, "grad_norm": 17.089775934262153, "learning_rate": 2.5457665670441937e-07, "logits/chosen": -0.7098642587661743, "logits/rejected": -0.7836583256721497, "logps/chosen": -1.120083212852478, "logps/rejected": -1.7486484050750732, "loss": 1.1109, "rewards/accuracies": 0.6875, "rewards/chosen": -1.120083212852478, "rewards/margins": 0.6285651922225952, "rewards/rejected": -1.7486484050750732, "step": 260 }, { "epoch": 0.5548285789060455, "grad_norm": 37.74541083735088, "learning_rate": 2.454233432955807e-07, "logits/chosen": -0.7521490454673767, "logits/rejected": -0.8322948217391968, "logps/chosen": -1.2596670389175415, "logps/rejected": -1.5793644189834595, "loss": 1.059, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2596670389175415, "rewards/margins": 0.3196973502635956, "rewards/rejected": -1.5793644189834595, "step": 265 }, { "epoch": 0.5652970426589898, "grad_norm": 27.920649766985065, "learning_rate": 2.3627616503391812e-07, "logits/chosen": -0.7807239890098572, "logits/rejected": -0.790714681148529, "logps/chosen": -1.3671778440475464, "logps/rejected": -1.6410033702850342, "loss": 1.1287, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3671778440475464, "rewards/margins": 0.2738254964351654, "rewards/rejected": -1.6410033702850342, "step": 270 }, { "epoch": 0.575765506411934, "grad_norm": 27.174780061747185, "learning_rate": 2.2714738398943308e-07, "logits/chosen": -0.8025692701339722, "logits/rejected": -0.8751484751701355, "logps/chosen": -1.2785165309906006, "logps/rejected": -1.8370211124420166, "loss": 1.0963, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2785165309906006, "rewards/margins": 0.5585044026374817, "rewards/rejected": -1.8370211124420166, "step": 275 }, { "epoch": 0.5862339701648783, "grad_norm": 18.983042518953734, "learning_rate": 2.1804923757009882e-07, "logits/chosen": -0.829318642616272, "logits/rejected": -0.9187866449356079, "logps/chosen": -1.2629585266113281, "logps/rejected": -1.6275123357772827, "loss": 1.0945, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2629585266113281, "rewards/margins": 0.36455395817756653, "rewards/rejected": -1.6275123357772827, "step": 280 }, { "epoch": 0.5967024339178225, "grad_norm": 25.322894998034943, "learning_rate": 2.089939221172446e-07, "logits/chosen": -0.7645302414894104, "logits/rejected": -0.808000922203064, "logps/chosen": -1.2651149034500122, "logps/rejected": -1.7678537368774414, "loss": 1.0671, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2651149034500122, "rewards/margins": 0.5027385950088501, "rewards/rejected": -1.7678537368774414, "step": 285 }, { "epoch": 0.6071708976707668, "grad_norm": 11.546524112678627, "learning_rate": 1.9999357655598891e-07, "logits/chosen": -0.777170717716217, "logits/rejected": -0.8957067728042603, "logps/chosen": -1.3173184394836426, "logps/rejected": -1.8677217960357666, "loss": 1.0699, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3173184394836426, "rewards/margins": 0.5504032969474792, "rewards/rejected": -1.8677217960357666, "step": 290 }, { "epoch": 0.6176393614237111, "grad_norm": 28.718920525486002, "learning_rate": 1.9106026612264315e-07, "logits/chosen": -0.7612749934196472, "logits/rejected": -0.8622433543205261, "logps/chosen": -1.2848594188690186, "logps/rejected": -1.7595828771591187, "loss": 1.1155, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2848594188690186, "rewards/margins": 0.4747234284877777, "rewards/rejected": -1.7595828771591187, "step": 295 }, { "epoch": 0.6281078251766553, "grad_norm": 15.520739875606553, "learning_rate": 1.8220596619089573e-07, "logits/chosen": -0.7951768040657043, "logits/rejected": -0.8577022552490234, "logps/chosen": -1.3048664331436157, "logps/rejected": -1.6950209140777588, "loss": 1.1174, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3048664331436157, "rewards/margins": 0.39015451073646545, "rewards/rejected": -1.6950209140777588, "step": 300 }, { "epoch": 0.6385762889295996, "grad_norm": 21.736600636566607, "learning_rate": 1.7344254621846017e-07, "logits/chosen": -0.7790982127189636, "logits/rejected": -0.8877668380737305, "logps/chosen": -1.321123480796814, "logps/rejected": -1.8356192111968994, "loss": 1.0498, "rewards/accuracies": 0.65625, "rewards/chosen": -1.321123480796814, "rewards/margins": 0.5144956111907959, "rewards/rejected": -1.8356192111968994, "step": 305 }, { "epoch": 0.6490447526825438, "grad_norm": 28.560090483112017, "learning_rate": 1.647817538357072e-07, "logits/chosen": -0.8046407699584961, "logits/rejected": -0.8369441032409668, "logps/chosen": -1.2219583988189697, "logps/rejected": -1.6623207330703735, "loss": 1.0661, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2219583988189697, "rewards/margins": 0.4403623938560486, "rewards/rejected": -1.6623207330703735, "step": 310 }, { "epoch": 0.6595132164354881, "grad_norm": 54.078343769904905, "learning_rate": 1.562351990976095e-07, "logits/chosen": -0.7760301828384399, "logits/rejected": -0.8801841735839844, "logps/chosen": -1.2228752374649048, "logps/rejected": -1.813062071800232, "loss": 1.029, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2228752374649048, "rewards/margins": 0.5901869535446167, "rewards/rejected": -1.813062071800232, "step": 315 }, { "epoch": 0.6699816801884323, "grad_norm": 26.10346506806248, "learning_rate": 1.478143389201113e-07, "logits/chosen": -0.8181501626968384, "logits/rejected": -0.8063300848007202, "logps/chosen": -1.3335962295532227, "logps/rejected": -1.7551372051239014, "loss": 1.0617, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3335962295532227, "rewards/margins": 0.4215410351753235, "rewards/rejected": -1.7551372051239014, "step": 320 }, { "epoch": 0.6804501439413766, "grad_norm": 17.356908971828567, "learning_rate": 1.3953046172178413e-07, "logits/chosen": -0.7939370274543762, "logits/rejected": -0.8830009698867798, "logps/chosen": -1.3137036561965942, "logps/rejected": -1.841398000717163, "loss": 1.0447, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3137036561965942, "rewards/margins": 0.5276943445205688, "rewards/rejected": -1.841398000717163, "step": 325 }, { "epoch": 0.6909186076943209, "grad_norm": 32.5244093832663, "learning_rate": 1.3139467229135998e-07, "logits/chosen": -0.7787013053894043, "logits/rejected": -0.8554028272628784, "logps/chosen": -1.29584538936615, "logps/rejected": -1.6860315799713135, "loss": 1.0817, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.29584538936615, "rewards/margins": 0.39018622040748596, "rewards/rejected": -1.6860315799713135, "step": 330 }, { "epoch": 0.7013870714472651, "grad_norm": 20.20383595400908, "learning_rate": 1.2341787690142435e-07, "logits/chosen": -0.7897971868515015, "logits/rejected": -0.8126639127731323, "logps/chosen": -1.2952033281326294, "logps/rejected": -1.7156461477279663, "loss": 1.0795, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2952033281326294, "rewards/margins": 0.4204428791999817, "rewards/rejected": -1.7156461477279663, "step": 335 }, { "epoch": 0.7118555352002094, "grad_norm": 18.887791891660626, "learning_rate": 1.1561076868822755e-07, "logits/chosen": -0.7770498991012573, "logits/rejected": -0.8715664744377136, "logps/chosen": -1.4334077835083008, "logps/rejected": -1.8490326404571533, "loss": 1.0426, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4334077835083008, "rewards/margins": 0.4156250059604645, "rewards/rejected": -1.8490326404571533, "step": 340 }, { "epoch": 0.7223239989531536, "grad_norm": 39.26078396305699, "learning_rate": 1.0798381331721107e-07, "logits/chosen": -0.8092272877693176, "logits/rejected": -0.8746698498725891, "logps/chosen": -1.3369479179382324, "logps/rejected": -1.8316799402236938, "loss": 1.1001, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3369479179382324, "rewards/margins": 0.49473199248313904, "rewards/rejected": -1.8316799402236938, "step": 345 }, { "epoch": 0.7327924627060979, "grad_norm": 41.824009161110546, "learning_rate": 1.0054723495346482e-07, "logits/chosen": -0.8247002363204956, "logits/rejected": -0.824820339679718, "logps/chosen": -1.310402512550354, "logps/rejected": -1.7290971279144287, "loss": 1.0978, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.310402512550354, "rewards/margins": 0.41869455575942993, "rewards/rejected": -1.7290971279144287, "step": 350 }, { "epoch": 0.7432609264590422, "grad_norm": 94.69105800236349, "learning_rate": 9.331100255592436e-08, "logits/chosen": -0.8165324330329895, "logits/rejected": -0.8640809059143066, "logps/chosen": -1.2218616008758545, "logps/rejected": -1.786354422569275, "loss": 1.0473, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2218616008758545, "rewards/margins": 0.5644930005073547, "rewards/rejected": -1.786354422569275, "step": 355 }, { "epoch": 0.7537293902119864, "grad_norm": 27.03565324741037, "learning_rate": 8.628481651367875e-08, "logits/chosen": -0.755358099937439, "logits/rejected": -0.8189060091972351, "logps/chosen": -1.1941946744918823, "logps/rejected": -1.5993152856826782, "loss": 1.0357, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1941946744918823, "rewards/margins": 0.4051206707954407, "rewards/rejected": -1.5993152856826782, "step": 360 }, { "epoch": 0.7641978539649307, "grad_norm": 36.97519801042864, "learning_rate": 7.947809564230445e-08, "logits/chosen": -0.7934302091598511, "logits/rejected": -0.8130480051040649, "logps/chosen": -1.3487493991851807, "logps/rejected": -1.827784538269043, "loss": 1.1029, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3487493991851807, "rewards/margins": 0.4790351390838623, "rewards/rejected": -1.827784538269043, "step": 365 }, { "epoch": 0.7746663177178749, "grad_norm": 37.40820945162956, "learning_rate": 7.289996455765748e-08, "logits/chosen": -0.7871400117874146, "logits/rejected": -0.8201042413711548, "logps/chosen": -1.34503972530365, "logps/rejected": -1.8722436428070068, "loss": 1.0433, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.34503972530365, "rewards/margins": 0.5272040963172913, "rewards/rejected": -1.8722436428070068, "step": 370 }, { "epoch": 0.7851347814708192, "grad_norm": 23.955007079274537, "learning_rate": 6.655924144404906e-08, "logits/chosen": -0.8498660922050476, "logits/rejected": -0.8262343406677246, "logps/chosen": -1.3859401941299438, "logps/rejected": -1.7746193408966064, "loss": 1.1065, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3859401941299438, "rewards/margins": 0.3886791169643402, "rewards/rejected": -1.7746193408966064, "step": 375 }, { "epoch": 0.7956032452237635, "grad_norm": 113.86703031861444, "learning_rate": 6.046442623320145e-08, "logits/chosen": -0.8647100329399109, "logits/rejected": -0.801548957824707, "logps/chosen": -1.425047516822815, "logps/rejected": -1.8129713535308838, "loss": 1.0854, "rewards/accuracies": 0.59375, "rewards/chosen": -1.425047516822815, "rewards/margins": 0.38792353868484497, "rewards/rejected": -1.8129713535308838, "step": 380 }, { "epoch": 0.8060717089767077, "grad_norm": 20.44017337397379, "learning_rate": 5.4623689209832484e-08, "logits/chosen": -0.783450186252594, "logits/rejected": -0.8156192898750305, "logps/chosen": -1.3061225414276123, "logps/rejected": -1.8070968389511108, "loss": 1.0197, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3061225414276123, "rewards/margins": 0.5009742975234985, "rewards/rejected": -1.8070968389511108, "step": 385 }, { "epoch": 0.816540172729652, "grad_norm": 54.05046477266579, "learning_rate": 4.904486005914027e-08, "logits/chosen": -0.7770819664001465, "logits/rejected": -0.8748834729194641, "logps/chosen": -1.2875335216522217, "logps/rejected": -1.8701460361480713, "loss": 1.0765, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2875335216522217, "rewards/margins": 0.5826126337051392, "rewards/rejected": -1.8701460361480713, "step": 390 }, { "epoch": 0.8270086364825961, "grad_norm": 25.608777943539643, "learning_rate": 4.373541737087263e-08, "logits/chosen": -0.7718879580497742, "logits/rejected": -0.8530917167663574, "logps/chosen": -1.2932384014129639, "logps/rejected": -1.7759485244750977, "loss": 1.085, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2932384014129639, "rewards/margins": 0.482710063457489, "rewards/rejected": -1.7759485244750977, "step": 395 }, { "epoch": 0.8374771002355405, "grad_norm": 56.990782840264686, "learning_rate": 3.8702478614051345e-08, "logits/chosen": -0.7689244151115417, "logits/rejected": -0.8010846972465515, "logps/chosen": -1.2468632459640503, "logps/rejected": -1.7685699462890625, "loss": 1.0763, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2468632459640503, "rewards/margins": 0.5217065811157227, "rewards/rejected": -1.7685699462890625, "step": 400 }, { "epoch": 0.8479455639884846, "grad_norm": 36.13453164817573, "learning_rate": 3.3952790595787986e-08, "logits/chosen": -0.8216865658760071, "logits/rejected": -0.835070013999939, "logps/chosen": -1.4558444023132324, "logps/rejected": -1.8808685541152954, "loss": 1.091, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4558444023132324, "rewards/margins": 0.4250241219997406, "rewards/rejected": -1.8808685541152954, "step": 405 }, { "epoch": 0.8584140277414289, "grad_norm": 63.90507308437301, "learning_rate": 2.9492720416985e-08, "logits/chosen": -0.7913083434104919, "logits/rejected": -0.8327590227127075, "logps/chosen": -1.2795411348342896, "logps/rejected": -1.6625678539276123, "loss": 1.0711, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2795411348342896, "rewards/margins": 0.38302695751190186, "rewards/rejected": -1.6625678539276123, "step": 410 }, { "epoch": 0.8688824914943732, "grad_norm": 17.998086343917752, "learning_rate": 2.5328246937043525e-08, "logits/chosen": -0.8415113687515259, "logits/rejected": -0.9268835186958313, "logps/chosen": -1.3358659744262695, "logps/rejected": -1.7969989776611328, "loss": 1.0177, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3358659744262695, "rewards/margins": 0.46113309264183044, "rewards/rejected": -1.7969989776611328, "step": 415 }, { "epoch": 0.8793509552473174, "grad_norm": 34.40765091158961, "learning_rate": 2.1464952759020856e-08, "logits/chosen": -0.7967473268508911, "logits/rejected": -0.8135166168212891, "logps/chosen": -1.3378677368164062, "logps/rejected": -1.803594946861267, "loss": 1.122, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3378677368164062, "rewards/margins": 0.4657273292541504, "rewards/rejected": -1.803594946861267, "step": 420 }, { "epoch": 0.8898194190002617, "grad_norm": 50.144219173307704, "learning_rate": 1.7908016745981856e-08, "logits/chosen": -0.7934291958808899, "logits/rejected": -0.818689227104187, "logps/chosen": -1.3789987564086914, "logps/rejected": -1.802093267440796, "loss": 1.0515, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3789987564086914, "rewards/margins": 0.4230947494506836, "rewards/rejected": -1.802093267440796, "step": 425 }, { "epoch": 0.9002878827532059, "grad_norm": 15.886039080166187, "learning_rate": 1.4662207078575684e-08, "logits/chosen": -0.7966570854187012, "logits/rejected": -0.8479889035224915, "logps/chosen": -1.3674869537353516, "logps/rejected": -1.8225427865982056, "loss": 1.0165, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3674869537353516, "rewards/margins": 0.4550558030605316, "rewards/rejected": -1.8225427865982056, "step": 430 }, { "epoch": 0.9107563465061502, "grad_norm": 96.66969938409414, "learning_rate": 1.1731874863145142e-08, "logits/chosen": -0.8691393136978149, "logits/rejected": -0.8967689275741577, "logps/chosen": -1.3705408573150635, "logps/rejected": -1.929089903831482, "loss": 1.0075, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3705408573150635, "rewards/margins": 0.5585491061210632, "rewards/rejected": -1.929089903831482, "step": 435 }, { "epoch": 0.9212248102590945, "grad_norm": 21.356866015636847, "learning_rate": 9.12094829893642e-09, "logits/chosen": -0.8050259351730347, "logits/rejected": -0.8349030613899231, "logps/chosen": -1.344839334487915, "logps/rejected": -1.75347900390625, "loss": 1.1154, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.344839334487915, "rewards/margins": 0.4086398482322693, "rewards/rejected": -1.75347900390625, "step": 440 }, { "epoch": 0.9316932740120387, "grad_norm": 31.58344276550534, "learning_rate": 6.832927412229017e-09, "logits/chosen": -0.8041754961013794, "logits/rejected": -0.8169196844100952, "logps/chosen": -1.3212865591049194, "logps/rejected": -1.7670789957046509, "loss": 1.0912, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3212865591049194, "rewards/margins": 0.44579243659973145, "rewards/rejected": -1.7670789957046509, "step": 445 }, { "epoch": 0.942161737764983, "grad_norm": 25.802172859771282, "learning_rate": 4.8708793644441086e-09, "logits/chosen": -0.7889554500579834, "logits/rejected": -0.7736221551895142, "logps/chosen": -1.3318743705749512, "logps/rejected": -1.801044225692749, "loss": 1.0634, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3318743705749512, "rewards/margins": 0.469169944524765, "rewards/rejected": -1.801044225692749, "step": 450 }, { "epoch": 0.9526302015179272, "grad_norm": 49.19181923188829, "learning_rate": 3.2374343405217884e-09, "logits/chosen": -0.7785223722457886, "logits/rejected": -0.8142352104187012, "logps/chosen": -1.4146032333374023, "logps/rejected": -1.7336387634277344, "loss": 1.1009, "rewards/accuracies": 0.625, "rewards/chosen": -1.4146032333374023, "rewards/margins": 0.3190356194972992, "rewards/rejected": -1.7336387634277344, "step": 455 }, { "epoch": 0.9630986652708715, "grad_norm": 90.17621343145314, "learning_rate": 1.9347820230782295e-09, "logits/chosen": -0.8402397036552429, "logits/rejected": -0.865011990070343, "logps/chosen": -1.3008335828781128, "logps/rejected": -1.8571717739105225, "loss": 1.0135, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3008335828781128, "rewards/margins": 0.5563381314277649, "rewards/rejected": -1.8571717739105225, "step": 460 }, { "epoch": 0.9735671290238157, "grad_norm": 22.634462924437393, "learning_rate": 9.64668657069706e-10, "logits/chosen": -0.8629606366157532, "logits/rejected": -0.9080095291137695, "logps/chosen": -1.372924566268921, "logps/rejected": -1.8657976388931274, "loss": 1.0663, "rewards/accuracies": 0.6875, "rewards/chosen": -1.372924566268921, "rewards/margins": 0.49287280440330505, "rewards/rejected": -1.8657976388931274, "step": 465 }, { "epoch": 0.98403559277676, "grad_norm": 24.738713797304456, "learning_rate": 3.2839470889836627e-10, "logits/chosen": -0.7899819612503052, "logits/rejected": -0.8224009275436401, "logps/chosen": -1.3259704113006592, "logps/rejected": -1.7480943202972412, "loss": 1.0582, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3259704113006592, "rewards/margins": 0.42212408781051636, "rewards/rejected": -1.7480943202972412, "step": 470 }, { "epoch": 0.9945040565297043, "grad_norm": 59.00244437293605, "learning_rate": 2.6813123097352287e-11, "logits/chosen": -0.8297709226608276, "logits/rejected": -0.863924503326416, "logps/chosen": -1.3778445720672607, "logps/rejected": -1.7702316045761108, "loss": 1.1059, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3778445720672607, "rewards/margins": 0.3923872709274292, "rewards/rejected": -1.7702316045761108, "step": 475 }, { "epoch": 0.998691442030882, "step": 477, "total_flos": 0.0, "train_loss": 1.13235006017505, "train_runtime": 14238.5482, "train_samples_per_second": 4.294, "train_steps_per_second": 0.034 } ], "logging_steps": 5, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }