dzanbek's picture
Training in progress, step 20, checkpoint
d67c9f4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0028187872167999718,
"eval_steps": 2,
"global_step": 20,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0001409393608399986,
"grad_norm": 0.40887540578842163,
"learning_rate": 1e-05,
"loss": 10.3962,
"step": 1
},
{
"epoch": 0.0001409393608399986,
"eval_loss": NaN,
"eval_runtime": 11.4051,
"eval_samples_per_second": 261.988,
"eval_steps_per_second": 130.994,
"step": 1
},
{
"epoch": 0.0002818787216799972,
"grad_norm": 0.41520577669143677,
"learning_rate": 2e-05,
"loss": 10.3918,
"step": 2
},
{
"epoch": 0.0002818787216799972,
"eval_loss": NaN,
"eval_runtime": 11.9,
"eval_samples_per_second": 251.092,
"eval_steps_per_second": 125.546,
"step": 2
},
{
"epoch": 0.00042281808251999576,
"grad_norm": 0.43705397844314575,
"learning_rate": 3e-05,
"loss": 10.3964,
"step": 3
},
{
"epoch": 0.0005637574433599944,
"grad_norm": 0.46050891280174255,
"learning_rate": 4e-05,
"loss": 10.3956,
"step": 4
},
{
"epoch": 0.0005637574433599944,
"eval_loss": NaN,
"eval_runtime": 12.179,
"eval_samples_per_second": 245.341,
"eval_steps_per_second": 122.67,
"step": 4
},
{
"epoch": 0.0007046968041999929,
"grad_norm": 0.4756773114204407,
"learning_rate": 5e-05,
"loss": 10.4132,
"step": 5
},
{
"epoch": 0.0008456361650399915,
"grad_norm": 0.3863561153411865,
"learning_rate": 6e-05,
"loss": 10.3797,
"step": 6
},
{
"epoch": 0.0008456361650399915,
"eval_loss": NaN,
"eval_runtime": 12.0499,
"eval_samples_per_second": 247.968,
"eval_steps_per_second": 123.984,
"step": 6
},
{
"epoch": 0.0009865755258799901,
"grad_norm": 0.47305724024772644,
"learning_rate": 7e-05,
"loss": 10.398,
"step": 7
},
{
"epoch": 0.0011275148867199887,
"grad_norm": 0.4628412425518036,
"learning_rate": 8e-05,
"loss": 10.3953,
"step": 8
},
{
"epoch": 0.0011275148867199887,
"eval_loss": NaN,
"eval_runtime": 12.0344,
"eval_samples_per_second": 248.289,
"eval_steps_per_second": 124.145,
"step": 8
},
{
"epoch": 0.0012684542475599873,
"grad_norm": 0.4279620945453644,
"learning_rate": 9e-05,
"loss": 10.3821,
"step": 9
},
{
"epoch": 0.0014093936083999859,
"grad_norm": 0.40721234679222107,
"learning_rate": 0.0001,
"loss": 10.4019,
"step": 10
},
{
"epoch": 0.0014093936083999859,
"eval_loss": NaN,
"eval_runtime": 12.085,
"eval_samples_per_second": 247.249,
"eval_steps_per_second": 123.624,
"step": 10
},
{
"epoch": 0.0015503329692399845,
"grad_norm": 0.4028102457523346,
"learning_rate": 9.755282581475769e-05,
"loss": 10.3758,
"step": 11
},
{
"epoch": 0.001691272330079983,
"grad_norm": 0.45374321937561035,
"learning_rate": 9.045084971874738e-05,
"loss": 10.4035,
"step": 12
},
{
"epoch": 0.001691272330079983,
"eval_loss": NaN,
"eval_runtime": 11.9986,
"eval_samples_per_second": 249.029,
"eval_steps_per_second": 124.514,
"step": 12
},
{
"epoch": 0.0018322116909199816,
"grad_norm": 0.4572100043296814,
"learning_rate": 7.938926261462366e-05,
"loss": 10.3763,
"step": 13
},
{
"epoch": 0.0019731510517599802,
"grad_norm": 0.47349345684051514,
"learning_rate": 6.545084971874738e-05,
"loss": 10.3903,
"step": 14
},
{
"epoch": 0.0019731510517599802,
"eval_loss": NaN,
"eval_runtime": 12.3087,
"eval_samples_per_second": 242.756,
"eval_steps_per_second": 121.378,
"step": 14
},
{
"epoch": 0.002114090412599979,
"grad_norm": 0.4572914242744446,
"learning_rate": 5e-05,
"loss": 10.3862,
"step": 15
},
{
"epoch": 0.0022550297734399774,
"grad_norm": 0.4461607336997986,
"learning_rate": 3.4549150281252636e-05,
"loss": 10.3791,
"step": 16
},
{
"epoch": 0.0022550297734399774,
"eval_loss": NaN,
"eval_runtime": 12.2292,
"eval_samples_per_second": 244.333,
"eval_steps_per_second": 122.166,
"step": 16
},
{
"epoch": 0.002395969134279976,
"grad_norm": 0.5067576766014099,
"learning_rate": 2.061073738537635e-05,
"loss": 10.3954,
"step": 17
},
{
"epoch": 0.0025369084951199746,
"grad_norm": 0.40567123889923096,
"learning_rate": 9.549150281252633e-06,
"loss": 10.3808,
"step": 18
},
{
"epoch": 0.0025369084951199746,
"eval_loss": NaN,
"eval_runtime": 12.1062,
"eval_samples_per_second": 246.815,
"eval_steps_per_second": 123.408,
"step": 18
},
{
"epoch": 0.002677847855959973,
"grad_norm": 0.47241461277008057,
"learning_rate": 2.4471741852423237e-06,
"loss": 10.3838,
"step": 19
},
{
"epoch": 0.0028187872167999718,
"grad_norm": 0.5147857666015625,
"learning_rate": 0.0,
"loss": 10.3815,
"step": 20
},
{
"epoch": 0.0028187872167999718,
"eval_loss": NaN,
"eval_runtime": 11.6945,
"eval_samples_per_second": 255.504,
"eval_steps_per_second": 127.752,
"step": 20
}
],
"logging_steps": 1,
"max_steps": 20,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 2,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 523024465920.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}