{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9876543209876543, "eval_steps": 500, "global_step": 363, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0823045267489712, "grad_norm": 2.3707054376650794, "learning_rate": 5e-06, "loss": 0.8018, "step": 10 }, { "epoch": 0.1646090534979424, "grad_norm": 0.9189518749553744, "learning_rate": 5e-06, "loss": 0.6615, "step": 20 }, { "epoch": 0.24691358024691357, "grad_norm": 0.7181231203254296, "learning_rate": 5e-06, "loss": 0.6226, "step": 30 }, { "epoch": 0.3292181069958848, "grad_norm": 0.8098373994551952, "learning_rate": 5e-06, "loss": 0.5947, "step": 40 }, { "epoch": 0.411522633744856, "grad_norm": 0.6752478716308398, "learning_rate": 5e-06, "loss": 0.5731, "step": 50 }, { "epoch": 0.49382716049382713, "grad_norm": 0.6563536956259921, "learning_rate": 5e-06, "loss": 0.5748, "step": 60 }, { "epoch": 0.5761316872427984, "grad_norm": 0.7032292694552905, "learning_rate": 5e-06, "loss": 0.5696, "step": 70 }, { "epoch": 0.6584362139917695, "grad_norm": 0.7003547749235107, "learning_rate": 5e-06, "loss": 0.5579, "step": 80 }, { "epoch": 0.7407407407407407, "grad_norm": 0.6613096075940125, "learning_rate": 5e-06, "loss": 0.563, "step": 90 }, { "epoch": 0.823045267489712, "grad_norm": 0.5810433314696346, "learning_rate": 5e-06, "loss": 0.5506, "step": 100 }, { "epoch": 0.9053497942386831, "grad_norm": 0.6666365832464547, "learning_rate": 5e-06, "loss": 0.5526, "step": 110 }, { "epoch": 0.9876543209876543, "grad_norm": 0.7058806830798787, "learning_rate": 5e-06, "loss": 0.5415, "step": 120 }, { "epoch": 0.9958847736625515, "eval_loss": 0.5658594369888306, "eval_runtime": 31.1396, "eval_samples_per_second": 26.205, "eval_steps_per_second": 0.417, "step": 121 }, { "epoch": 1.0699588477366255, "grad_norm": 0.6935479068001943, "learning_rate": 5e-06, "loss": 0.5334, "step": 130 }, { "epoch": 1.1522633744855968, "grad_norm": 0.760550165297717, "learning_rate": 5e-06, "loss": 0.5067, "step": 140 }, { "epoch": 1.2345679012345678, "grad_norm": 0.643311514826356, "learning_rate": 5e-06, "loss": 0.4975, "step": 150 }, { "epoch": 1.316872427983539, "grad_norm": 0.7180631408695722, "learning_rate": 5e-06, "loss": 0.4907, "step": 160 }, { "epoch": 1.3991769547325104, "grad_norm": 0.6268855948646844, "learning_rate": 5e-06, "loss": 0.4846, "step": 170 }, { "epoch": 1.4814814814814814, "grad_norm": 0.6205279314065159, "learning_rate": 5e-06, "loss": 0.4857, "step": 180 }, { "epoch": 1.5637860082304527, "grad_norm": 0.6432594466375224, "learning_rate": 5e-06, "loss": 0.4922, "step": 190 }, { "epoch": 1.646090534979424, "grad_norm": 0.5837924660934569, "learning_rate": 5e-06, "loss": 0.485, "step": 200 }, { "epoch": 1.7283950617283952, "grad_norm": 0.5800173415658518, "learning_rate": 5e-06, "loss": 0.4899, "step": 210 }, { "epoch": 1.8106995884773662, "grad_norm": 0.5452696697421119, "learning_rate": 5e-06, "loss": 0.4794, "step": 220 }, { "epoch": 1.8930041152263375, "grad_norm": 0.6589276092411267, "learning_rate": 5e-06, "loss": 0.4925, "step": 230 }, { "epoch": 1.9753086419753085, "grad_norm": 0.7099238180212905, "learning_rate": 5e-06, "loss": 0.486, "step": 240 }, { "epoch": 2.0, "eval_loss": 0.552381157875061, "eval_runtime": 31.0974, "eval_samples_per_second": 26.24, "eval_steps_per_second": 0.418, "step": 243 }, { "epoch": 2.05761316872428, "grad_norm": 0.6173968011853433, "learning_rate": 5e-06, "loss": 0.4643, "step": 250 }, { "epoch": 2.139917695473251, "grad_norm": 0.6515589754644828, "learning_rate": 5e-06, "loss": 0.4264, "step": 260 }, { "epoch": 2.2222222222222223, "grad_norm": 0.6329003562540839, "learning_rate": 5e-06, "loss": 0.427, "step": 270 }, { "epoch": 2.3045267489711936, "grad_norm": 0.61321586370113, "learning_rate": 5e-06, "loss": 0.4284, "step": 280 }, { "epoch": 2.386831275720165, "grad_norm": 0.5986219957694312, "learning_rate": 5e-06, "loss": 0.4298, "step": 290 }, { "epoch": 2.4691358024691357, "grad_norm": 0.5797625936229965, "learning_rate": 5e-06, "loss": 0.4332, "step": 300 }, { "epoch": 2.551440329218107, "grad_norm": 0.6606165599990808, "learning_rate": 5e-06, "loss": 0.432, "step": 310 }, { "epoch": 2.633744855967078, "grad_norm": 0.6055776178246806, "learning_rate": 5e-06, "loss": 0.4297, "step": 320 }, { "epoch": 2.7160493827160495, "grad_norm": 0.5792227650214353, "learning_rate": 5e-06, "loss": 0.4301, "step": 330 }, { "epoch": 2.7983539094650207, "grad_norm": 0.5678659977310005, "learning_rate": 5e-06, "loss": 0.4256, "step": 340 }, { "epoch": 2.8806584362139915, "grad_norm": 0.5959535146183862, "learning_rate": 5e-06, "loss": 0.4252, "step": 350 }, { "epoch": 2.962962962962963, "grad_norm": 0.624214333713647, "learning_rate": 5e-06, "loss": 0.4216, "step": 360 }, { "epoch": 2.9876543209876543, "eval_loss": 0.55907142162323, "eval_runtime": 30.0336, "eval_samples_per_second": 27.17, "eval_steps_per_second": 0.433, "step": 363 }, { "epoch": 2.9876543209876543, "step": 363, "total_flos": 190042236518400.0, "train_loss": 0.506568502460301, "train_runtime": 5156.7482, "train_samples_per_second": 9.019, "train_steps_per_second": 0.07 } ], "logging_steps": 10, "max_steps": 363, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 190042236518400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }