brixeus's picture
Training in progress, step 100, checkpoint
d894d20 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.005759373380176237,
"eval_steps": 9,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 5.759373380176237e-05,
"eval_loss": 2.6716082096099854,
"eval_runtime": 1331.768,
"eval_samples_per_second": 21.958,
"eval_steps_per_second": 2.745,
"step": 1
},
{
"epoch": 0.0001727812014052871,
"grad_norm": 1.0326002836227417,
"learning_rate": 3e-05,
"loss": 10.8192,
"step": 3
},
{
"epoch": 0.0003455624028105742,
"grad_norm": 1.0763981342315674,
"learning_rate": 6e-05,
"loss": 10.7332,
"step": 6
},
{
"epoch": 0.0005183436042158614,
"grad_norm": 1.2686634063720703,
"learning_rate": 9e-05,
"loss": 10.432,
"step": 9
},
{
"epoch": 0.0005183436042158614,
"eval_loss": 2.629826784133911,
"eval_runtime": 1338.5029,
"eval_samples_per_second": 21.848,
"eval_steps_per_second": 2.731,
"step": 9
},
{
"epoch": 0.0006911248056211485,
"grad_norm": 1.1208940744400024,
"learning_rate": 9.987820251299122e-05,
"loss": 10.5925,
"step": 12
},
{
"epoch": 0.0008639060070264355,
"grad_norm": 1.1096599102020264,
"learning_rate": 9.924038765061042e-05,
"loss": 9.8268,
"step": 15
},
{
"epoch": 0.0010366872084317227,
"grad_norm": 1.2350603342056274,
"learning_rate": 9.806308479691595e-05,
"loss": 10.5695,
"step": 18
},
{
"epoch": 0.0010366872084317227,
"eval_loss": 2.5420780181884766,
"eval_runtime": 1337.1556,
"eval_samples_per_second": 21.87,
"eval_steps_per_second": 2.734,
"step": 18
},
{
"epoch": 0.0012094684098370098,
"grad_norm": 1.1920151710510254,
"learning_rate": 9.635919272833938e-05,
"loss": 10.2468,
"step": 21
},
{
"epoch": 0.001382249611242297,
"grad_norm": 0.8807956576347351,
"learning_rate": 9.414737964294636e-05,
"loss": 9.8125,
"step": 24
},
{
"epoch": 0.001555030812647584,
"grad_norm": 0.8759644627571106,
"learning_rate": 9.145187862775209e-05,
"loss": 9.77,
"step": 27
},
{
"epoch": 0.001555030812647584,
"eval_loss": 2.4928576946258545,
"eval_runtime": 1338.7702,
"eval_samples_per_second": 21.843,
"eval_steps_per_second": 2.731,
"step": 27
},
{
"epoch": 0.001727812014052871,
"grad_norm": 0.9653000235557556,
"learning_rate": 8.83022221559489e-05,
"loss": 10.2316,
"step": 30
},
{
"epoch": 0.0019005932154581582,
"grad_norm": 0.8884105682373047,
"learning_rate": 8.473291852294987e-05,
"loss": 10.0839,
"step": 33
},
{
"epoch": 0.0020733744168634455,
"grad_norm": 1.2079200744628906,
"learning_rate": 8.07830737662829e-05,
"loss": 10.2626,
"step": 36
},
{
"epoch": 0.0020733744168634455,
"eval_loss": 2.4689135551452637,
"eval_runtime": 1339.7306,
"eval_samples_per_second": 21.828,
"eval_steps_per_second": 2.729,
"step": 36
},
{
"epoch": 0.0022461556182687323,
"grad_norm": 0.9944831132888794,
"learning_rate": 7.649596321166024e-05,
"loss": 9.843,
"step": 39
},
{
"epoch": 0.0024189368196740196,
"grad_norm": 0.935939371585846,
"learning_rate": 7.191855733945387e-05,
"loss": 9.7281,
"step": 42
},
{
"epoch": 0.0025917180210793065,
"grad_norm": 1.00493323802948,
"learning_rate": 6.710100716628344e-05,
"loss": 10.2203,
"step": 45
},
{
"epoch": 0.0025917180210793065,
"eval_loss": 2.4538426399230957,
"eval_runtime": 1338.817,
"eval_samples_per_second": 21.842,
"eval_steps_per_second": 2.731,
"step": 45
},
{
"epoch": 0.002764499222484594,
"grad_norm": 0.9063613414764404,
"learning_rate": 6.209609477998338e-05,
"loss": 9.8641,
"step": 48
},
{
"epoch": 0.0029372804238898807,
"grad_norm": 1.0362025499343872,
"learning_rate": 5.695865504800327e-05,
"loss": 9.7865,
"step": 51
},
{
"epoch": 0.003110061625295168,
"grad_norm": 0.9377679824829102,
"learning_rate": 5.174497483512506e-05,
"loss": 10.0832,
"step": 54
},
{
"epoch": 0.003110061625295168,
"eval_loss": 2.4470882415771484,
"eval_runtime": 1338.2458,
"eval_samples_per_second": 21.852,
"eval_steps_per_second": 2.732,
"step": 54
},
{
"epoch": 0.003282842826700455,
"grad_norm": 1.549514651298523,
"learning_rate": 4.6512176312793736e-05,
"loss": 9.8436,
"step": 57
},
{
"epoch": 0.003455624028105742,
"grad_norm": 0.9654721021652222,
"learning_rate": 4.131759111665349e-05,
"loss": 10.4088,
"step": 60
},
{
"epoch": 0.003628405229511029,
"grad_norm": 0.9682468771934509,
"learning_rate": 3.6218132209150045e-05,
"loss": 9.6252,
"step": 63
},
{
"epoch": 0.003628405229511029,
"eval_loss": 2.440478563308716,
"eval_runtime": 1339.1783,
"eval_samples_per_second": 21.837,
"eval_steps_per_second": 2.73,
"step": 63
},
{
"epoch": 0.0038011864309163163,
"grad_norm": 1.087673306465149,
"learning_rate": 3.12696703292044e-05,
"loss": 9.8558,
"step": 66
},
{
"epoch": 0.003973967632321603,
"grad_norm": 0.9650527834892273,
"learning_rate": 2.6526421860705473e-05,
"loss": 9.4286,
"step": 69
},
{
"epoch": 0.004146748833726891,
"grad_norm": 1.0395511388778687,
"learning_rate": 2.2040354826462668e-05,
"loss": 9.8884,
"step": 72
},
{
"epoch": 0.004146748833726891,
"eval_loss": 2.4370479583740234,
"eval_runtime": 1337.1445,
"eval_samples_per_second": 21.87,
"eval_steps_per_second": 2.734,
"step": 72
},
{
"epoch": 0.004319530035132178,
"grad_norm": 0.8846902251243591,
"learning_rate": 1.7860619515673033e-05,
"loss": 9.9641,
"step": 75
},
{
"epoch": 0.004492311236537465,
"grad_norm": 1.0415371656417847,
"learning_rate": 1.4033009983067452e-05,
"loss": 9.3965,
"step": 78
},
{
"epoch": 0.0046650924379427515,
"grad_norm": 1.0671367645263672,
"learning_rate": 1.0599462319663905e-05,
"loss": 9.7664,
"step": 81
},
{
"epoch": 0.0046650924379427515,
"eval_loss": 2.4352002143859863,
"eval_runtime": 1335.8618,
"eval_samples_per_second": 21.891,
"eval_steps_per_second": 2.737,
"step": 81
},
{
"epoch": 0.004837873639348039,
"grad_norm": 1.150208830833435,
"learning_rate": 7.597595192178702e-06,
"loss": 9.6133,
"step": 84
},
{
"epoch": 0.005010654840753326,
"grad_norm": 0.9796428084373474,
"learning_rate": 5.060297685041659e-06,
"loss": 10.1937,
"step": 87
},
{
"epoch": 0.005183436042158613,
"grad_norm": 0.966742992401123,
"learning_rate": 3.0153689607045845e-06,
"loss": 9.7158,
"step": 90
},
{
"epoch": 0.005183436042158613,
"eval_loss": 2.434671401977539,
"eval_runtime": 1339.1776,
"eval_samples_per_second": 21.837,
"eval_steps_per_second": 2.73,
"step": 90
},
{
"epoch": 0.0053562172435639,
"grad_norm": 0.9962571859359741,
"learning_rate": 1.4852136862001764e-06,
"loss": 9.2092,
"step": 93
},
{
"epoch": 0.005528998444969188,
"grad_norm": 1.0119867324829102,
"learning_rate": 4.865965629214819e-07,
"loss": 9.5268,
"step": 96
},
{
"epoch": 0.0057017796463744745,
"grad_norm": 1.0774887800216675,
"learning_rate": 3.04586490452119e-08,
"loss": 9.4073,
"step": 99
},
{
"epoch": 0.0057017796463744745,
"eval_loss": 2.4344372749328613,
"eval_runtime": 1338.1241,
"eval_samples_per_second": 21.854,
"eval_steps_per_second": 2.732,
"step": 99
}
],
"logging_steps": 3,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 9,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.534154269222502e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}