{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 95, "global_step": 380, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002631578947368421, "eval_loss": 11.938826560974121, "eval_runtime": 2.4068, "eval_samples_per_second": 66.479, "eval_steps_per_second": 33.239, "step": 1 }, { "epoch": 0.013157894736842105, "grad_norm": 0.0984639823436737, "learning_rate": 1.6666666666666667e-05, "loss": 11.939, "step": 5 }, { "epoch": 0.02631578947368421, "grad_norm": 0.10308364778757095, "learning_rate": 3.3333333333333335e-05, "loss": 11.9404, "step": 10 }, { "epoch": 0.039473684210526314, "grad_norm": 0.1174185648560524, "learning_rate": 5e-05, "loss": 11.941, "step": 15 }, { "epoch": 0.05263157894736842, "grad_norm": 0.11486347019672394, "learning_rate": 6.666666666666667e-05, "loss": 11.9374, "step": 20 }, { "epoch": 0.06578947368421052, "grad_norm": 0.13925467431545258, "learning_rate": 8.333333333333334e-05, "loss": 11.9389, "step": 25 }, { "epoch": 0.07894736842105263, "grad_norm": 0.12139558792114258, "learning_rate": 0.0001, "loss": 11.9443, "step": 30 }, { "epoch": 0.09210526315789473, "grad_norm": 0.16076084971427917, "learning_rate": 9.994965332706573e-05, "loss": 11.9338, "step": 35 }, { "epoch": 0.10526315789473684, "grad_norm": 0.1557275503873825, "learning_rate": 9.979871469976196e-05, "loss": 11.9309, "step": 40 }, { "epoch": 0.11842105263157894, "grad_norm": 0.18004350364208221, "learning_rate": 9.954748808839674e-05, "loss": 11.9341, "step": 45 }, { "epoch": 0.13157894736842105, "grad_norm": 0.20064346492290497, "learning_rate": 9.919647942993148e-05, "loss": 11.9391, "step": 50 }, { "epoch": 0.14473684210526316, "grad_norm": 0.09248711168766022, "learning_rate": 9.874639560909117e-05, "loss": 11.933, "step": 55 }, { "epoch": 0.15789473684210525, "grad_norm": 0.15245001018047333, "learning_rate": 9.819814303479267e-05, "loss": 11.9331, "step": 60 }, { "epoch": 0.17105263157894737, "grad_norm": 0.13184253871440887, "learning_rate": 9.755282581475769e-05, "loss": 11.9329, "step": 65 }, { "epoch": 0.18421052631578946, "grad_norm": 0.14973685145378113, "learning_rate": 9.681174353198687e-05, "loss": 11.9357, "step": 70 }, { "epoch": 0.19736842105263158, "grad_norm": 0.13842841982841492, "learning_rate": 9.597638862757255e-05, "loss": 11.931, "step": 75 }, { "epoch": 0.21052631578947367, "grad_norm": 0.139064222574234, "learning_rate": 9.504844339512095e-05, "loss": 11.9304, "step": 80 }, { "epoch": 0.2236842105263158, "grad_norm": 0.1904718279838562, "learning_rate": 9.40297765928369e-05, "loss": 11.9298, "step": 85 }, { "epoch": 0.23684210526315788, "grad_norm": 0.23532414436340332, "learning_rate": 9.292243968009331e-05, "loss": 11.9353, "step": 90 }, { "epoch": 0.25, "grad_norm": 0.2097780853509903, "learning_rate": 9.172866268606513e-05, "loss": 11.932, "step": 95 }, { "epoch": 0.25, "eval_loss": 11.929388046264648, "eval_runtime": 2.4003, "eval_samples_per_second": 66.659, "eval_steps_per_second": 33.329, "step": 95 }, { "epoch": 0.2631578947368421, "grad_norm": 0.1999453604221344, "learning_rate": 9.045084971874738e-05, "loss": 11.9291, "step": 100 }, { "epoch": 0.27631578947368424, "grad_norm": 0.1953020542860031, "learning_rate": 8.90915741234015e-05, "loss": 11.9304, "step": 105 }, { "epoch": 0.2894736842105263, "grad_norm": 0.15549349784851074, "learning_rate": 8.765357330018056e-05, "loss": 11.9268, "step": 110 }, { "epoch": 0.3026315789473684, "grad_norm": 
0.257530152797699, "learning_rate": 8.613974319136958e-05, "loss": 11.9239, "step": 115 }, { "epoch": 0.3157894736842105, "grad_norm": 0.2518523931503296, "learning_rate": 8.455313244934324e-05, "loss": 11.9238, "step": 120 }, { "epoch": 0.32894736842105265, "grad_norm": 0.20200787484645844, "learning_rate": 8.289693629698564e-05, "loss": 11.9177, "step": 125 }, { "epoch": 0.34210526315789475, "grad_norm": 0.2418881356716156, "learning_rate": 8.117449009293668e-05, "loss": 11.9187, "step": 130 }, { "epoch": 0.35526315789473684, "grad_norm": 0.23062272369861603, "learning_rate": 7.938926261462366e-05, "loss": 11.9139, "step": 135 }, { "epoch": 0.3684210526315789, "grad_norm": 0.2909921407699585, "learning_rate": 7.754484907260513e-05, "loss": 11.9125, "step": 140 }, { "epoch": 0.3815789473684211, "grad_norm": 0.24988366663455963, "learning_rate": 7.564496387029532e-05, "loss": 11.9137, "step": 145 }, { "epoch": 0.39473684210526316, "grad_norm": 0.26093658804893494, "learning_rate": 7.369343312364993e-05, "loss": 11.9161, "step": 150 }, { "epoch": 0.40789473684210525, "grad_norm": 0.18536804616451263, "learning_rate": 7.169418695587791e-05, "loss": 11.9134, "step": 155 }, { "epoch": 0.42105263157894735, "grad_norm": 0.1819031834602356, "learning_rate": 6.965125158269619e-05, "loss": 11.9139, "step": 160 }, { "epoch": 0.4342105263157895, "grad_norm": 0.21145343780517578, "learning_rate": 6.756874120406714e-05, "loss": 11.902, "step": 165 }, { "epoch": 0.4473684210526316, "grad_norm": 0.22574874758720398, "learning_rate": 6.545084971874738e-05, "loss": 11.9172, "step": 170 }, { "epoch": 0.4605263157894737, "grad_norm": 0.16482414305210114, "learning_rate": 6.330184227833376e-05, "loss": 11.9052, "step": 175 }, { "epoch": 0.47368421052631576, "grad_norm": 0.21980416774749756, "learning_rate": 6.112604669781572e-05, "loss": 11.91, "step": 180 }, { "epoch": 0.4868421052631579, "grad_norm": 0.2284337878227234, "learning_rate": 5.8927844739931834e-05, "loss": 11.9026, "step": 185 }, { "epoch": 0.5, "grad_norm": 0.2556101083755493, "learning_rate": 5.6711663290882776e-05, "loss": 11.9112, "step": 190 }, { "epoch": 0.5, "eval_loss": 11.905473709106445, "eval_runtime": 2.4046, "eval_samples_per_second": 66.539, "eval_steps_per_second": 33.269, "step": 190 }, { "epoch": 0.5131578947368421, "grad_norm": 0.25931644439697266, "learning_rate": 5.448196544517168e-05, "loss": 11.9077, "step": 195 }, { "epoch": 0.5263157894736842, "grad_norm": 0.31854814291000366, "learning_rate": 5.2243241517525754e-05, "loss": 11.9015, "step": 200 }, { "epoch": 0.5394736842105263, "grad_norm": 0.2101360410451889, "learning_rate": 5e-05, "loss": 11.9099, "step": 205 }, { "epoch": 0.5526315789473685, "grad_norm": 0.3032777011394501, "learning_rate": 4.775675848247427e-05, "loss": 11.9006, "step": 210 }, { "epoch": 0.5657894736842105, "grad_norm": 0.16137917339801788, "learning_rate": 4.551803455482833e-05, "loss": 11.9015, "step": 215 }, { "epoch": 0.5789473684210527, "grad_norm": 0.1659950166940689, "learning_rate": 4.328833670911724e-05, "loss": 11.8985, "step": 220 }, { "epoch": 0.5921052631578947, "grad_norm": 0.176284521818161, "learning_rate": 4.107215526006817e-05, "loss": 11.9067, "step": 225 }, { "epoch": 0.6052631578947368, "grad_norm": 0.14944250881671906, "learning_rate": 3.887395330218429e-05, "loss": 11.8992, "step": 230 }, { "epoch": 0.618421052631579, "grad_norm": 0.21413926780223846, "learning_rate": 3.6698157721666246e-05, "loss": 11.8986, "step": 235 }, { "epoch": 0.631578947368421, "grad_norm": 
0.19918343424797058, "learning_rate": 3.4549150281252636e-05, "loss": 11.9055, "step": 240 }, { "epoch": 0.6447368421052632, "grad_norm": 0.22723235189914703, "learning_rate": 3.243125879593286e-05, "loss": 11.8926, "step": 245 }, { "epoch": 0.6578947368421053, "grad_norm": 0.18223680555820465, "learning_rate": 3.0348748417303823e-05, "loss": 11.8974, "step": 250 }, { "epoch": 0.6710526315789473, "grad_norm": 0.17337638139724731, "learning_rate": 2.8305813044122097e-05, "loss": 11.9015, "step": 255 }, { "epoch": 0.6842105263157895, "grad_norm": 0.22359390556812286, "learning_rate": 2.630656687635007e-05, "loss": 11.8972, "step": 260 }, { "epoch": 0.6973684210526315, "grad_norm": 0.17346619069576263, "learning_rate": 2.43550361297047e-05, "loss": 11.9009, "step": 265 }, { "epoch": 0.7105263157894737, "grad_norm": 0.17790664732456207, "learning_rate": 2.245515092739488e-05, "loss": 11.8952, "step": 270 }, { "epoch": 0.7236842105263158, "grad_norm": 0.25889426469802856, "learning_rate": 2.061073738537635e-05, "loss": 11.8964, "step": 275 }, { "epoch": 0.7368421052631579, "grad_norm": 0.18190772831439972, "learning_rate": 1.8825509907063327e-05, "loss": 11.9048, "step": 280 }, { "epoch": 0.75, "grad_norm": 0.3169431686401367, "learning_rate": 1.7103063703014372e-05, "loss": 11.9026, "step": 285 }, { "epoch": 0.75, "eval_loss": 11.899419784545898, "eval_runtime": 2.3966, "eval_samples_per_second": 66.761, "eval_steps_per_second": 33.38, "step": 285 }, { "epoch": 0.7631578947368421, "grad_norm": 0.2090289145708084, "learning_rate": 1.544686755065677e-05, "loss": 11.8964, "step": 290 }, { "epoch": 0.7763157894736842, "grad_norm": 0.2250237613916397, "learning_rate": 1.3860256808630428e-05, "loss": 11.8984, "step": 295 }, { "epoch": 0.7894736842105263, "grad_norm": 0.21673612296581268, "learning_rate": 1.2346426699819458e-05, "loss": 11.9003, "step": 300 }, { "epoch": 0.8026315789473685, "grad_norm": 0.15479284524917603, "learning_rate": 1.090842587659851e-05, "loss": 11.8985, "step": 305 }, { "epoch": 0.8157894736842105, "grad_norm": 0.1348995566368103, "learning_rate": 9.549150281252633e-06, "loss": 11.9016, "step": 310 }, { "epoch": 0.8289473684210527, "grad_norm": 0.2521159052848816, "learning_rate": 8.271337313934869e-06, "loss": 11.8967, "step": 315 }, { "epoch": 0.8421052631578947, "grad_norm": 0.21710200607776642, "learning_rate": 7.077560319906695e-06, "loss": 11.8977, "step": 320 }, { "epoch": 0.8552631578947368, "grad_norm": 0.19024071097373962, "learning_rate": 5.9702234071631e-06, "loss": 11.8927, "step": 325 }, { "epoch": 0.868421052631579, "grad_norm": 0.18137779831886292, "learning_rate": 4.951556604879048e-06, "loss": 11.9029, "step": 330 }, { "epoch": 0.881578947368421, "grad_norm": 0.19617067277431488, "learning_rate": 4.023611372427471e-06, "loss": 11.9025, "step": 335 }, { "epoch": 0.8947368421052632, "grad_norm": 0.2027858942747116, "learning_rate": 3.18825646801314e-06, "loss": 11.8985, "step": 340 }, { "epoch": 0.9078947368421053, "grad_norm": 0.22982299327850342, "learning_rate": 2.4471741852423237e-06, "loss": 11.8997, "step": 345 }, { "epoch": 0.9210526315789473, "grad_norm": 0.2535816431045532, "learning_rate": 1.8018569652073381e-06, "loss": 11.9037, "step": 350 }, { "epoch": 0.9342105263157895, "grad_norm": 0.16845425963401794, "learning_rate": 1.2536043909088191e-06, "loss": 11.9062, "step": 355 }, { "epoch": 0.9473684210526315, "grad_norm": 0.1870003640651703, "learning_rate": 8.035205700685167e-07, "loss": 11.9026, "step": 360 }, { "epoch": 0.9605263157894737, 
"grad_norm": 0.1637255698442459, "learning_rate": 4.52511911603265e-07, "loss": 11.8917, "step": 365 }, { "epoch": 0.9736842105263158, "grad_norm": 0.195426806807518, "learning_rate": 2.012853002380466e-07, "loss": 11.8966, "step": 370 }, { "epoch": 0.9868421052631579, "grad_norm": 0.26023876667022705, "learning_rate": 5.0346672934270534e-08, "loss": 11.9003, "step": 375 }, { "epoch": 1.0, "grad_norm": 0.27251943945884705, "learning_rate": 0.0, "loss": 11.8936, "step": 380 }, { "epoch": 1.0, "eval_loss": 11.898847579956055, "eval_runtime": 2.3965, "eval_samples_per_second": 66.765, "eval_steps_per_second": 33.382, "step": 380 } ], "logging_steps": 5, "max_steps": 380, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 95118509998080.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }