{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003205128205128205, "grad_norm": 469662.9144643782, "learning_rate": 1.5625e-08, "logits/chosen": -0.3432708978652954, "logits/rejected": -0.332830011844635, "logps/chosen": -140.40289306640625, "logps/rejected": -115.87382507324219, "loss": 120282.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03205128205128205, "grad_norm": 568441.5566994098, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -0.5443148016929626, "logits/rejected": -0.5515072345733643, "logps/chosen": -89.99518585205078, "logps/rejected": -90.88400268554688, "loss": 125155.3333, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.000289025716483593, "rewards/margins": -9.353376663057134e-05, "rewards/rejected": -0.0001954919280251488, "step": 10 }, { "epoch": 0.0641025641025641, "grad_norm": 464338.6645889823, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -0.4427386224269867, "logits/rejected": -0.4934562146663666, "logps/chosen": -90.24401092529297, "logps/rejected": -95.63074493408203, "loss": 124284.2, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.003463043598458171, "rewards/margins": 0.0009749190066941082, "rewards/rejected": -0.004437962546944618, "step": 20 }, { "epoch": 0.09615384615384616, "grad_norm": 480260.34201998485, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -0.5152963399887085, "logits/rejected": -0.5460027456283569, "logps/chosen": -94.16231536865234, "logps/rejected": -100.62825775146484, "loss": 124351.825, "rewards/accuracies": 0.625, "rewards/chosen": -0.006953537464141846, "rewards/margins": 0.0006809952319599688, "rewards/rejected": -0.007634532637894154, "step": 30 }, { "epoch": 0.1282051282051282, "grad_norm": 433885.6131804333, "learning_rate": 4.857142857142857e-07, "logits/chosen": -0.5599047541618347, "logits/rejected": -0.5487984418869019, "logps/chosen": -93.6915512084961, "logps/rejected": -95.92937469482422, "loss": 124131.1375, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.008278829045593739, "rewards/margins": 0.0008631674572825432, "rewards/rejected": -0.009141995571553707, "step": 40 }, { "epoch": 0.16025641025641027, "grad_norm": 491164.7661120398, "learning_rate": 4.6785714285714283e-07, "logits/chosen": -0.5146197080612183, "logits/rejected": -0.49345073103904724, "logps/chosen": -109.50101470947266, "logps/rejected": -104.8797378540039, "loss": 125644.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006485734134912491, "rewards/margins": 0.0027839418035000563, "rewards/rejected": -0.00926967617124319, "step": 50 }, { "epoch": 0.19230769230769232, "grad_norm": 558888.3723594319, "learning_rate": 4.5e-07, "logits/chosen": -0.6499918699264526, "logits/rejected": -0.6553579568862915, "logps/chosen": -114.01820373535156, "logps/rejected": -111.94651794433594, "loss": 124503.425, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.011266408488154411, "rewards/margins": 0.002909548580646515, "rewards/rejected": -0.014175957068800926, "step": 60 }, { "epoch": 0.22435897435897437, "grad_norm": 536020.620286275, "learning_rate": 4.3214285714285713e-07, "logits/chosen": -0.5960813760757446, "logits/rejected": -0.5772069692611694, "logps/chosen": -87.95893859863281, "logps/rejected": -90.57078552246094, "loss": 124475.325, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.007508446462452412, "rewards/margins": 0.0036646847147494555, "rewards/rejected": -0.011173130944371223, "step": 70 }, { "epoch": 0.2564102564102564, "grad_norm": 579545.0571782525, "learning_rate": 4.142857142857143e-07, "logits/chosen": -0.5821112394332886, "logits/rejected": -0.528997540473938, "logps/chosen": -87.23516082763672, "logps/rejected": -85.477783203125, "loss": 125293.725, "rewards/accuracies": 0.625, "rewards/chosen": -0.012744182720780373, "rewards/margins": 0.0017470993334427476, "rewards/rejected": -0.014491280540823936, "step": 80 }, { "epoch": 0.28846153846153844, "grad_norm": 702613.3359581099, "learning_rate": 3.9642857142857137e-07, "logits/chosen": -0.5383504629135132, "logits/rejected": -0.4826398491859436, "logps/chosen": -94.10514831542969, "logps/rejected": -94.60591888427734, "loss": 124419.85, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006864988245069981, "rewards/margins": 0.002892577787861228, "rewards/rejected": -0.009757566265761852, "step": 90 }, { "epoch": 0.32051282051282054, "grad_norm": 574962.0888861647, "learning_rate": 3.785714285714285e-07, "logits/chosen": -0.6083141565322876, "logits/rejected": -0.6077857613563538, "logps/chosen": -90.84620666503906, "logps/rejected": -94.3597640991211, "loss": 123532.0125, "rewards/accuracies": 0.625, "rewards/chosen": -0.010823920369148254, "rewards/margins": 0.001849750755354762, "rewards/rejected": -0.01267367135733366, "step": 100 }, { "epoch": 0.3525641025641026, "grad_norm": 582134.6500433815, "learning_rate": 3.607142857142857e-07, "logits/chosen": -0.5725646615028381, "logits/rejected": -0.5323026776313782, "logps/chosen": -79.6702651977539, "logps/rejected": -76.59967041015625, "loss": 124798.1, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.004521737806499004, "rewards/margins": 0.0010833492269739509, "rewards/rejected": -0.005605087615549564, "step": 110 }, { "epoch": 0.38461538461538464, "grad_norm": 661819.0222539164, "learning_rate": 3.4285714285714286e-07, "logits/chosen": -0.5654035210609436, "logits/rejected": -0.5707298517227173, "logps/chosen": -73.98536682128906, "logps/rejected": -84.55022430419922, "loss": 125812.0125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007425880525261164, "rewards/margins": 0.006358510348945856, "rewards/rejected": -0.01378439087420702, "step": 120 }, { "epoch": 0.4166666666666667, "grad_norm": 620371.8592043375, "learning_rate": 3.25e-07, "logits/chosen": -0.7043228149414062, "logits/rejected": -0.7306665182113647, "logps/chosen": -100.49541473388672, "logps/rejected": -107.61614990234375, "loss": 125617.475, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006788595579564571, "rewards/margins": 0.0021920702420175076, "rewards/rejected": -0.008980666287243366, "step": 130 }, { "epoch": 0.44871794871794873, "grad_norm": 633474.7695131563, "learning_rate": 3.0714285714285716e-07, "logits/chosen": -0.6854395270347595, "logits/rejected": -0.627780556678772, "logps/chosen": -91.88723754882812, "logps/rejected": -87.9045639038086, "loss": 124316.25, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.010259262286126614, "rewards/margins": 0.0011669063242152333, "rewards/rejected": -0.011426168493926525, "step": 140 }, { "epoch": 0.4807692307692308, "grad_norm": 696715.0430078872, "learning_rate": 2.892857142857143e-07, "logits/chosen": -0.6097627282142639, "logits/rejected": -0.645863950252533, "logps/chosen": -102.78446197509766, "logps/rejected": -106.5654525756836, "loss": 123236.9375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.012269060127437115, "rewards/margins": 0.0026497889775782824, "rewards/rejected": -0.014918850734829903, "step": 150 }, { "epoch": 0.5128205128205128, "grad_norm": 787986.1807345189, "learning_rate": 2.714285714285714e-07, "logits/chosen": -0.5826394557952881, "logits/rejected": -0.590654730796814, "logps/chosen": -90.98385620117188, "logps/rejected": -97.3979263305664, "loss": 121865.35, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.011020239442586899, "rewards/margins": 0.005359311122447252, "rewards/rejected": -0.016379551962018013, "step": 160 }, { "epoch": 0.5448717948717948, "grad_norm": 736957.014479021, "learning_rate": 2.5357142857142855e-07, "logits/chosen": -0.5794961452484131, "logits/rejected": -0.6191390156745911, "logps/chosen": -98.76277160644531, "logps/rejected": -103.96248626708984, "loss": 123882.0625, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.009380698204040527, "rewards/margins": 0.0029742049518972635, "rewards/rejected": -0.012354902923107147, "step": 170 }, { "epoch": 0.5769230769230769, "grad_norm": 733809.4054912812, "learning_rate": 2.357142857142857e-07, "logits/chosen": -0.6393710970878601, "logits/rejected": -0.6236029863357544, "logps/chosen": -94.88532257080078, "logps/rejected": -92.90126037597656, "loss": 123955.3625, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.011159100569784641, "rewards/margins": 0.0009313292102888227, "rewards/rejected": -0.012090427801012993, "step": 180 }, { "epoch": 0.6089743589743589, "grad_norm": 699287.0532059986, "learning_rate": 2.1785714285714284e-07, "logits/chosen": -0.534403920173645, "logits/rejected": -0.5387021899223328, "logps/chosen": -92.89164733886719, "logps/rejected": -97.23823547363281, "loss": 124855.6375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009661735966801643, "rewards/margins": 0.004148019477725029, "rewards/rejected": -0.013809755444526672, "step": 190 }, { "epoch": 0.6410256410256411, "grad_norm": 694927.9752367702, "learning_rate": 2e-07, "logits/chosen": -0.696746289730072, "logits/rejected": -0.7076197266578674, "logps/chosen": -107.4284896850586, "logps/rejected": -108.21855163574219, "loss": 123333.7125, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.009755025617778301, "rewards/margins": 0.004948892164975405, "rewards/rejected": -0.014703919179737568, "step": 200 }, { "epoch": 0.6730769230769231, "grad_norm": 664529.2459223642, "learning_rate": 1.8214285714285714e-07, "logits/chosen": -0.5494934320449829, "logits/rejected": -0.5753802061080933, "logps/chosen": -87.67240905761719, "logps/rejected": -95.261474609375, "loss": 123261.425, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010124283842742443, "rewards/margins": 0.006216048263013363, "rewards/rejected": -0.016340332105755806, "step": 210 }, { "epoch": 0.7051282051282052, "grad_norm": 752626.4895790943, "learning_rate": 1.6428571428571429e-07, "logits/chosen": -0.4907689094543457, "logits/rejected": -0.5003972053527832, "logps/chosen": -70.47917175292969, "logps/rejected": -73.15069580078125, "loss": 122213.175, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007994843646883965, "rewards/margins": 0.005159543361514807, "rewards/rejected": -0.01315438561141491, "step": 220 }, { "epoch": 0.7371794871794872, "grad_norm": 690877.8863774311, "learning_rate": 1.4642857142857143e-07, "logits/chosen": -0.6318084597587585, "logits/rejected": -0.6108121275901794, "logps/chosen": -103.8791732788086, "logps/rejected": -110.17147064208984, "loss": 122652.725, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.008601801469922066, "rewards/margins": 0.00492085749283433, "rewards/rejected": -0.013522659428417683, "step": 230 }, { "epoch": 0.7692307692307693, "grad_norm": 706116.311213081, "learning_rate": 1.2857142857142855e-07, "logits/chosen": -0.6082527041435242, "logits/rejected": -0.6249019503593445, "logps/chosen": -85.2287826538086, "logps/rejected": -85.55986785888672, "loss": 123191.8125, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.009229953400790691, "rewards/margins": 0.0028918907046318054, "rewards/rejected": -0.012121843174099922, "step": 240 }, { "epoch": 0.8012820512820513, "grad_norm": 762557.0917436344, "learning_rate": 1.107142857142857e-07, "logits/chosen": -0.5747382640838623, "logits/rejected": -0.6171086430549622, "logps/chosen": -94.86370086669922, "logps/rejected": -107.7577896118164, "loss": 124156.2875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01214513834565878, "rewards/margins": 0.0048862299881875515, "rewards/rejected": -0.017031369730830193, "step": 250 }, { "epoch": 0.8333333333333334, "grad_norm": 720981.6104523474, "learning_rate": 9.285714285714286e-08, "logits/chosen": -0.6732273101806641, "logits/rejected": -0.6552490592002869, "logps/chosen": -93.73551940917969, "logps/rejected": -95.43331146240234, "loss": 125028.5375, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.00893234834074974, "rewards/margins": 0.0013536241604015231, "rewards/rejected": -0.010285971686244011, "step": 260 }, { "epoch": 0.8653846153846154, "grad_norm": 632266.1767602823, "learning_rate": 7.5e-08, "logits/chosen": -0.6526715159416199, "logits/rejected": -0.6659075617790222, "logps/chosen": -104.94581604003906, "logps/rejected": -123.0511245727539, "loss": 122176.2, "rewards/accuracies": 0.625, "rewards/chosen": -0.014885579235851765, "rewards/margins": 0.005144301801919937, "rewards/rejected": -0.020029881969094276, "step": 270 }, { "epoch": 0.8974358974358975, "grad_norm": 886078.1218696759, "learning_rate": 5.714285714285714e-08, "logits/chosen": -0.7189252972602844, "logits/rejected": -0.7166494131088257, "logps/chosen": -110.89029693603516, "logps/rejected": -116.58308410644531, "loss": 123335.0875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011831143870949745, "rewards/margins": 0.0017397601623088121, "rewards/rejected": -0.013570902869105339, "step": 280 }, { "epoch": 0.9294871794871795, "grad_norm": 651330.6783592023, "learning_rate": 3.9285714285714285e-08, "logits/chosen": -0.6643999814987183, "logits/rejected": -0.6874372959136963, "logps/chosen": -98.12127685546875, "logps/rejected": -97.89227294921875, "loss": 124226.45, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.011173027567565441, "rewards/margins": 0.002893571276217699, "rewards/rejected": -0.014066601172089577, "step": 290 }, { "epoch": 0.9615384615384616, "grad_norm": 738901.7736935538, "learning_rate": 2.142857142857143e-08, "logits/chosen": -0.7286126017570496, "logits/rejected": -0.6981081962585449, "logps/chosen": -86.98307037353516, "logps/rejected": -90.56110382080078, "loss": 123651.1375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.010809152387082577, "rewards/margins": 0.00516370078548789, "rewards/rejected": -0.01597285456955433, "step": 300 }, { "epoch": 0.9935897435897436, "grad_norm": 792583.3715918568, "learning_rate": 3.571428571428571e-09, "logits/chosen": -0.6979326009750366, "logits/rejected": -0.6595016717910767, "logps/chosen": -99.2912368774414, "logps/rejected": -106.7309341430664, "loss": 125171.2, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.012163314037024975, "rewards/margins": 0.002274113241583109, "rewards/rejected": -0.014437426812946796, "step": 310 }, { "epoch": 1.0, "step": 312, "total_flos": 0.0, "train_loss": 124066.95723157052, "train_runtime": 2762.769, "train_samples_per_second": 7.224, "train_steps_per_second": 0.113 } ], "logging_steps": 10, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }