{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.004518114816592777, "eval_steps": 5, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00022590574082963883, "grad_norm": 6.500443458557129, "learning_rate": 1e-05, "loss": 18.6481, "step": 1 }, { "epoch": 0.00022590574082963883, "eval_loss": 18.568599700927734, "eval_runtime": 290.449, "eval_samples_per_second": 25.671, "eval_steps_per_second": 12.835, "step": 1 }, { "epoch": 0.00045181148165927766, "grad_norm": 6.984323024749756, "learning_rate": 2e-05, "loss": 18.093, "step": 2 }, { "epoch": 0.0006777172224889165, "grad_norm": 7.316300392150879, "learning_rate": 3e-05, "loss": 18.673, "step": 3 }, { "epoch": 0.0009036229633185553, "grad_norm": 7.175333499908447, "learning_rate": 4e-05, "loss": 18.6659, "step": 4 }, { "epoch": 0.0011295287041481943, "grad_norm": 8.042078971862793, "learning_rate": 5e-05, "loss": 18.8358, "step": 5 }, { "epoch": 0.0011295287041481943, "eval_loss": 18.0319766998291, "eval_runtime": 266.1876, "eval_samples_per_second": 28.01, "eval_steps_per_second": 14.005, "step": 5 }, { "epoch": 0.001355434444977833, "grad_norm": 9.501138687133789, "learning_rate": 6e-05, "loss": 17.5967, "step": 6 }, { "epoch": 0.001581340185807472, "grad_norm": 11.359487533569336, "learning_rate": 7e-05, "loss": 18.0327, "step": 7 }, { "epoch": 0.0018072459266371107, "grad_norm": 12.280532836914062, "learning_rate": 8e-05, "loss": 17.2021, "step": 8 }, { "epoch": 0.0020331516674667494, "grad_norm": 16.318531036376953, "learning_rate": 9e-05, "loss": 16.39, "step": 9 }, { "epoch": 0.0022590574082963885, "grad_norm": 20.797056198120117, "learning_rate": 0.0001, "loss": 14.8167, "step": 10 }, { "epoch": 0.0022590574082963885, "eval_loss": 13.161385536193848, "eval_runtime": 183.1191, "eval_samples_per_second": 40.717, "eval_steps_per_second": 20.358, "step": 10 }, { "epoch": 0.0024849631491260273, "grad_norm": 25.225400924682617, "learning_rate": 9.755282581475769e-05, "loss": 13.105, "step": 11 }, { "epoch": 0.002710868889955666, "grad_norm": 26.978565216064453, "learning_rate": 9.045084971874738e-05, "loss": 10.561, "step": 12 }, { "epoch": 0.0029367746307853047, "grad_norm": 22.66143035888672, "learning_rate": 7.938926261462366e-05, "loss": 8.4321, "step": 13 }, { "epoch": 0.003162680371614944, "grad_norm": 20.655855178833008, "learning_rate": 6.545084971874738e-05, "loss": 6.691, "step": 14 }, { "epoch": 0.0033885861124445826, "grad_norm": 21.936189651489258, "learning_rate": 5e-05, "loss": 5.965, "step": 15 }, { "epoch": 0.0033885861124445826, "eval_loss": 4.28361701965332, "eval_runtime": 196.0362, "eval_samples_per_second": 38.034, "eval_steps_per_second": 19.017, "step": 15 }, { "epoch": 0.0036144918532742213, "grad_norm": 20.060565948486328, "learning_rate": 3.4549150281252636e-05, "loss": 4.59, "step": 16 }, { "epoch": 0.00384039759410386, "grad_norm": 17.778438568115234, "learning_rate": 2.061073738537635e-05, "loss": 3.3184, "step": 17 }, { "epoch": 0.004066303334933499, "grad_norm": 16.09156036376953, "learning_rate": 9.549150281252633e-06, "loss": 3.2847, "step": 18 }, { "epoch": 0.004292209075763138, "grad_norm": 15.134675025939941, "learning_rate": 2.4471741852423237e-06, "loss": 2.8811, "step": 19 }, { "epoch": 0.004518114816592777, "grad_norm": 14.81490707397461, "learning_rate": 0.0, "loss": 2.5459, "step": 20 }, { "epoch": 0.004518114816592777, "eval_loss": 2.7253942489624023, "eval_runtime": 193.5802, 
"eval_samples_per_second": 38.516, "eval_steps_per_second": 19.258, "step": 20 } ], "logging_steps": 1, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.479651398516736e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }