lesso07's picture
Training in progress, step 100, checkpoint
7216993 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00621272365805169,
"eval_steps": 9,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 6.212723658051689e-05,
"grad_norm": NaN,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 1
},
{
"epoch": 6.212723658051689e-05,
"eval_loss": NaN,
"eval_runtime": 379.2078,
"eval_samples_per_second": 35.746,
"eval_steps_per_second": 4.47,
"step": 1
},
{
"epoch": 0.00012425447316103378,
"grad_norm": NaN,
"learning_rate": 2e-05,
"loss": 0.0,
"step": 2
},
{
"epoch": 0.0001863817097415507,
"grad_norm": NaN,
"learning_rate": 3e-05,
"loss": 0.0,
"step": 3
},
{
"epoch": 0.00024850894632206757,
"grad_norm": NaN,
"learning_rate": 4e-05,
"loss": 0.0,
"step": 4
},
{
"epoch": 0.0003106361829025845,
"grad_norm": NaN,
"learning_rate": 5e-05,
"loss": 0.0,
"step": 5
},
{
"epoch": 0.0003727634194831014,
"grad_norm": NaN,
"learning_rate": 6e-05,
"loss": 0.0,
"step": 6
},
{
"epoch": 0.0004348906560636183,
"grad_norm": NaN,
"learning_rate": 7e-05,
"loss": 0.0,
"step": 7
},
{
"epoch": 0.0004970178926441351,
"grad_norm": NaN,
"learning_rate": 8e-05,
"loss": 0.0,
"step": 8
},
{
"epoch": 0.0005591451292246521,
"grad_norm": NaN,
"learning_rate": 9e-05,
"loss": 0.0,
"step": 9
},
{
"epoch": 0.0005591451292246521,
"eval_loss": NaN,
"eval_runtime": 378.6602,
"eval_samples_per_second": 35.797,
"eval_steps_per_second": 4.476,
"step": 9
},
{
"epoch": 0.000621272365805169,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 10
},
{
"epoch": 0.0006833996023856859,
"grad_norm": NaN,
"learning_rate": 9.99695413509548e-05,
"loss": 0.0,
"step": 11
},
{
"epoch": 0.0007455268389662028,
"grad_norm": NaN,
"learning_rate": 9.987820251299122e-05,
"loss": 0.0,
"step": 12
},
{
"epoch": 0.0008076540755467197,
"grad_norm": NaN,
"learning_rate": 9.972609476841367e-05,
"loss": 0.0,
"step": 13
},
{
"epoch": 0.0008697813121272366,
"grad_norm": NaN,
"learning_rate": 9.951340343707852e-05,
"loss": 0.0,
"step": 14
},
{
"epoch": 0.0009319085487077535,
"grad_norm": NaN,
"learning_rate": 9.924038765061042e-05,
"loss": 0.0,
"step": 15
},
{
"epoch": 0.0009940357852882703,
"grad_norm": NaN,
"learning_rate": 9.890738003669029e-05,
"loss": 0.0,
"step": 16
},
{
"epoch": 0.0010561630218687873,
"grad_norm": NaN,
"learning_rate": 9.851478631379982e-05,
"loss": 0.0,
"step": 17
},
{
"epoch": 0.0011182902584493041,
"grad_norm": NaN,
"learning_rate": 9.806308479691595e-05,
"loss": 0.0,
"step": 18
},
{
"epoch": 0.0011182902584493041,
"eval_loss": NaN,
"eval_runtime": 379.0902,
"eval_samples_per_second": 35.757,
"eval_steps_per_second": 4.471,
"step": 18
},
{
"epoch": 0.001180417495029821,
"grad_norm": NaN,
"learning_rate": 9.755282581475769e-05,
"loss": 0.0,
"step": 19
},
{
"epoch": 0.001242544731610338,
"grad_norm": NaN,
"learning_rate": 9.698463103929542e-05,
"loss": 0.0,
"step": 20
},
{
"epoch": 0.0013046719681908548,
"grad_norm": NaN,
"learning_rate": 9.635919272833938e-05,
"loss": 0.0,
"step": 21
},
{
"epoch": 0.0013667992047713719,
"grad_norm": NaN,
"learning_rate": 9.567727288213005e-05,
"loss": 0.0,
"step": 22
},
{
"epoch": 0.0014289264413518887,
"grad_norm": NaN,
"learning_rate": 9.493970231495835e-05,
"loss": 0.0,
"step": 23
},
{
"epoch": 0.0014910536779324055,
"grad_norm": NaN,
"learning_rate": 9.414737964294636e-05,
"loss": 0.0,
"step": 24
},
{
"epoch": 0.0015531809145129226,
"grad_norm": NaN,
"learning_rate": 9.330127018922194e-05,
"loss": 0.0,
"step": 25
},
{
"epoch": 0.0016153081510934394,
"grad_norm": NaN,
"learning_rate": 9.24024048078213e-05,
"loss": 0.0,
"step": 26
},
{
"epoch": 0.0016774353876739562,
"grad_norm": NaN,
"learning_rate": 9.145187862775209e-05,
"loss": 0.0,
"step": 27
},
{
"epoch": 0.0016774353876739562,
"eval_loss": NaN,
"eval_runtime": 378.755,
"eval_samples_per_second": 35.788,
"eval_steps_per_second": 4.475,
"step": 27
},
{
"epoch": 0.0017395626242544733,
"grad_norm": NaN,
"learning_rate": 9.045084971874738e-05,
"loss": 0.0,
"step": 28
},
{
"epoch": 0.00180168986083499,
"grad_norm": NaN,
"learning_rate": 8.940053768033609e-05,
"loss": 0.0,
"step": 29
},
{
"epoch": 0.001863817097415507,
"grad_norm": NaN,
"learning_rate": 8.83022221559489e-05,
"loss": 0.0,
"step": 30
},
{
"epoch": 0.001925944333996024,
"grad_norm": NaN,
"learning_rate": 8.715724127386972e-05,
"loss": 0.0,
"step": 31
},
{
"epoch": 0.0019880715705765406,
"grad_norm": NaN,
"learning_rate": 8.596699001693255e-05,
"loss": 0.0,
"step": 32
},
{
"epoch": 0.002050198807157058,
"grad_norm": NaN,
"learning_rate": 8.473291852294987e-05,
"loss": 0.0,
"step": 33
},
{
"epoch": 0.0021123260437375746,
"grad_norm": NaN,
"learning_rate": 8.345653031794292e-05,
"loss": 0.0,
"step": 34
},
{
"epoch": 0.0021744532803180915,
"grad_norm": NaN,
"learning_rate": 8.213938048432697e-05,
"loss": 0.0,
"step": 35
},
{
"epoch": 0.0022365805168986083,
"grad_norm": NaN,
"learning_rate": 8.07830737662829e-05,
"loss": 0.0,
"step": 36
},
{
"epoch": 0.0022365805168986083,
"eval_loss": NaN,
"eval_runtime": 378.6339,
"eval_samples_per_second": 35.8,
"eval_steps_per_second": 4.477,
"step": 36
},
{
"epoch": 0.002298707753479125,
"grad_norm": NaN,
"learning_rate": 7.938926261462366e-05,
"loss": 0.0,
"step": 37
},
{
"epoch": 0.002360834990059642,
"grad_norm": NaN,
"learning_rate": 7.795964517353735e-05,
"loss": 0.0,
"step": 38
},
{
"epoch": 0.002422962226640159,
"grad_norm": NaN,
"learning_rate": 7.649596321166024e-05,
"loss": 0.0,
"step": 39
},
{
"epoch": 0.002485089463220676,
"grad_norm": NaN,
"learning_rate": 7.500000000000001e-05,
"loss": 0.0,
"step": 40
},
{
"epoch": 0.002547216699801193,
"grad_norm": NaN,
"learning_rate": 7.347357813929454e-05,
"loss": 0.0,
"step": 41
},
{
"epoch": 0.0026093439363817097,
"grad_norm": NaN,
"learning_rate": 7.191855733945387e-05,
"loss": 0.0,
"step": 42
},
{
"epoch": 0.0026714711729622265,
"grad_norm": NaN,
"learning_rate": 7.033683215379002e-05,
"loss": 0.0,
"step": 43
},
{
"epoch": 0.0027335984095427437,
"grad_norm": NaN,
"learning_rate": 6.873032967079561e-05,
"loss": 0.0,
"step": 44
},
{
"epoch": 0.0027957256461232606,
"grad_norm": NaN,
"learning_rate": 6.710100716628344e-05,
"loss": 0.0,
"step": 45
},
{
"epoch": 0.0027957256461232606,
"eval_loss": NaN,
"eval_runtime": 379.5125,
"eval_samples_per_second": 35.717,
"eval_steps_per_second": 4.466,
"step": 45
},
{
"epoch": 0.0028578528827037774,
"grad_norm": NaN,
"learning_rate": 6.545084971874738e-05,
"loss": 0.0,
"step": 46
},
{
"epoch": 0.0029199801192842942,
"grad_norm": NaN,
"learning_rate": 6.378186779084995e-05,
"loss": 0.0,
"step": 47
},
{
"epoch": 0.002982107355864811,
"grad_norm": NaN,
"learning_rate": 6.209609477998338e-05,
"loss": 0.0,
"step": 48
},
{
"epoch": 0.003044234592445328,
"grad_norm": NaN,
"learning_rate": 6.0395584540887963e-05,
"loss": 0.0,
"step": 49
},
{
"epoch": 0.003106361829025845,
"grad_norm": NaN,
"learning_rate": 5.868240888334653e-05,
"loss": 0.0,
"step": 50
},
{
"epoch": 0.003168489065606362,
"grad_norm": NaN,
"learning_rate": 5.695865504800327e-05,
"loss": 0.0,
"step": 51
},
{
"epoch": 0.0032306163021868788,
"grad_norm": NaN,
"learning_rate": 5.522642316338268e-05,
"loss": 0.0,
"step": 52
},
{
"epoch": 0.0032927435387673956,
"grad_norm": NaN,
"learning_rate": 5.348782368720626e-05,
"loss": 0.0,
"step": 53
},
{
"epoch": 0.0033548707753479124,
"grad_norm": NaN,
"learning_rate": 5.174497483512506e-05,
"loss": 0.0,
"step": 54
},
{
"epoch": 0.0033548707753479124,
"eval_loss": NaN,
"eval_runtime": 379.2175,
"eval_samples_per_second": 35.745,
"eval_steps_per_second": 4.47,
"step": 54
},
{
"epoch": 0.0034169980119284292,
"grad_norm": NaN,
"learning_rate": 5e-05,
"loss": 0.0,
"step": 55
},
{
"epoch": 0.0034791252485089465,
"grad_norm": NaN,
"learning_rate": 4.825502516487497e-05,
"loss": 0.0,
"step": 56
},
{
"epoch": 0.0035412524850894633,
"grad_norm": NaN,
"learning_rate": 4.6512176312793736e-05,
"loss": 0.0,
"step": 57
},
{
"epoch": 0.00360337972166998,
"grad_norm": NaN,
"learning_rate": 4.477357683661734e-05,
"loss": 0.0,
"step": 58
},
{
"epoch": 0.003665506958250497,
"grad_norm": NaN,
"learning_rate": 4.3041344951996746e-05,
"loss": 0.0,
"step": 59
},
{
"epoch": 0.003727634194831014,
"grad_norm": NaN,
"learning_rate": 4.131759111665349e-05,
"loss": 0.0,
"step": 60
},
{
"epoch": 0.0037897614314115306,
"grad_norm": NaN,
"learning_rate": 3.960441545911204e-05,
"loss": 0.0,
"step": 61
},
{
"epoch": 0.003851888667992048,
"grad_norm": NaN,
"learning_rate": 3.790390522001662e-05,
"loss": 0.0,
"step": 62
},
{
"epoch": 0.003914015904572564,
"grad_norm": NaN,
"learning_rate": 3.6218132209150045e-05,
"loss": 0.0,
"step": 63
},
{
"epoch": 0.003914015904572564,
"eval_loss": NaN,
"eval_runtime": 378.879,
"eval_samples_per_second": 35.777,
"eval_steps_per_second": 4.474,
"step": 63
},
{
"epoch": 0.003976143141153081,
"grad_norm": NaN,
"learning_rate": 3.4549150281252636e-05,
"loss": 0.0,
"step": 64
},
{
"epoch": 0.004038270377733599,
"grad_norm": NaN,
"learning_rate": 3.289899283371657e-05,
"loss": 0.0,
"step": 65
},
{
"epoch": 0.004100397614314116,
"grad_norm": NaN,
"learning_rate": 3.12696703292044e-05,
"loss": 0.0,
"step": 66
},
{
"epoch": 0.0041625248508946324,
"grad_norm": NaN,
"learning_rate": 2.9663167846209998e-05,
"loss": 0.0,
"step": 67
},
{
"epoch": 0.004224652087475149,
"grad_norm": NaN,
"learning_rate": 2.8081442660546125e-05,
"loss": 0.0,
"step": 68
},
{
"epoch": 0.004286779324055666,
"grad_norm": NaN,
"learning_rate": 2.6526421860705473e-05,
"loss": 0.0,
"step": 69
},
{
"epoch": 0.004348906560636183,
"grad_norm": NaN,
"learning_rate": 2.500000000000001e-05,
"loss": 0.0,
"step": 70
},
{
"epoch": 0.0044110337972167,
"grad_norm": NaN,
"learning_rate": 2.350403678833976e-05,
"loss": 0.0,
"step": 71
},
{
"epoch": 0.0044731610337972166,
"grad_norm": NaN,
"learning_rate": 2.2040354826462668e-05,
"loss": 0.0,
"step": 72
},
{
"epoch": 0.0044731610337972166,
"eval_loss": NaN,
"eval_runtime": 378.6574,
"eval_samples_per_second": 35.798,
"eval_steps_per_second": 4.476,
"step": 72
},
{
"epoch": 0.004535288270377733,
"grad_norm": NaN,
"learning_rate": 2.061073738537635e-05,
"loss": 0.0,
"step": 73
},
{
"epoch": 0.00459741550695825,
"grad_norm": NaN,
"learning_rate": 1.9216926233717085e-05,
"loss": 0.0,
"step": 74
},
{
"epoch": 0.004659542743538767,
"grad_norm": NaN,
"learning_rate": 1.7860619515673033e-05,
"loss": 0.0,
"step": 75
},
{
"epoch": 0.004721669980119284,
"grad_norm": NaN,
"learning_rate": 1.6543469682057106e-05,
"loss": 0.0,
"step": 76
},
{
"epoch": 0.0047837972166998016,
"grad_norm": NaN,
"learning_rate": 1.526708147705013e-05,
"loss": 0.0,
"step": 77
},
{
"epoch": 0.004845924453280318,
"grad_norm": NaN,
"learning_rate": 1.4033009983067452e-05,
"loss": 0.0,
"step": 78
},
{
"epoch": 0.004908051689860835,
"grad_norm": NaN,
"learning_rate": 1.2842758726130283e-05,
"loss": 0.0,
"step": 79
},
{
"epoch": 0.004970178926441352,
"grad_norm": NaN,
"learning_rate": 1.1697777844051105e-05,
"loss": 0.0,
"step": 80
},
{
"epoch": 0.005032306163021869,
"grad_norm": NaN,
"learning_rate": 1.0599462319663905e-05,
"loss": 0.0,
"step": 81
},
{
"epoch": 0.005032306163021869,
"eval_loss": NaN,
"eval_runtime": 378.4891,
"eval_samples_per_second": 35.813,
"eval_steps_per_second": 4.478,
"step": 81
},
{
"epoch": 0.005094433399602386,
"grad_norm": NaN,
"learning_rate": 9.549150281252633e-06,
"loss": 0.0,
"step": 82
},
{
"epoch": 0.0051565606361829025,
"grad_norm": NaN,
"learning_rate": 8.548121372247918e-06,
"loss": 0.0,
"step": 83
},
{
"epoch": 0.005218687872763419,
"grad_norm": NaN,
"learning_rate": 7.597595192178702e-06,
"loss": 0.0,
"step": 84
},
{
"epoch": 0.005280815109343936,
"grad_norm": NaN,
"learning_rate": 6.698729810778065e-06,
"loss": 0.0,
"step": 85
},
{
"epoch": 0.005342942345924453,
"grad_norm": NaN,
"learning_rate": 5.852620357053651e-06,
"loss": 0.0,
"step": 86
},
{
"epoch": 0.00540506958250497,
"grad_norm": NaN,
"learning_rate": 5.060297685041659e-06,
"loss": 0.0,
"step": 87
},
{
"epoch": 0.0054671968190854875,
"grad_norm": NaN,
"learning_rate": 4.322727117869951e-06,
"loss": 0.0,
"step": 88
},
{
"epoch": 0.005529324055666004,
"grad_norm": NaN,
"learning_rate": 3.6408072716606346e-06,
"loss": 0.0,
"step": 89
},
{
"epoch": 0.005591451292246521,
"grad_norm": NaN,
"learning_rate": 3.0153689607045845e-06,
"loss": 0.0,
"step": 90
},
{
"epoch": 0.005591451292246521,
"eval_loss": NaN,
"eval_runtime": 378.3851,
"eval_samples_per_second": 35.823,
"eval_steps_per_second": 4.48,
"step": 90
},
{
"epoch": 0.005653578528827038,
"grad_norm": NaN,
"learning_rate": 2.4471741852423237e-06,
"loss": 0.0,
"step": 91
},
{
"epoch": 0.005715705765407555,
"grad_norm": NaN,
"learning_rate": 1.9369152030840556e-06,
"loss": 0.0,
"step": 92
},
{
"epoch": 0.005777833001988072,
"grad_norm": NaN,
"learning_rate": 1.4852136862001764e-06,
"loss": 0.0,
"step": 93
},
{
"epoch": 0.0058399602385685884,
"grad_norm": NaN,
"learning_rate": 1.0926199633097157e-06,
"loss": 0.0,
"step": 94
},
{
"epoch": 0.005902087475149105,
"grad_norm": NaN,
"learning_rate": 7.596123493895991e-07,
"loss": 0.0,
"step": 95
},
{
"epoch": 0.005964214711729622,
"grad_norm": NaN,
"learning_rate": 4.865965629214819e-07,
"loss": 0.0,
"step": 96
},
{
"epoch": 0.006026341948310139,
"grad_norm": NaN,
"learning_rate": 2.7390523158633554e-07,
"loss": 0.0,
"step": 97
},
{
"epoch": 0.006088469184890656,
"grad_norm": NaN,
"learning_rate": 1.2179748700879012e-07,
"loss": 0.0,
"step": 98
},
{
"epoch": 0.0061505964214711726,
"grad_norm": NaN,
"learning_rate": 3.04586490452119e-08,
"loss": 0.0,
"step": 99
},
{
"epoch": 0.0061505964214711726,
"eval_loss": NaN,
"eval_runtime": 378.4304,
"eval_samples_per_second": 35.819,
"eval_steps_per_second": 4.479,
"step": 99
},
{
"epoch": 0.00621272365805169,
"grad_norm": NaN,
"learning_rate": 0.0,
"loss": 0.0,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2184077692108800.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}