|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9933184855233854, |
|
"eval_steps": 100, |
|
"global_step": 294, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.10181355392936685, |
|
"grad_norm": 7.969674587249756, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0773, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.2036271078587337, |
|
"grad_norm": 5.803192138671875, |
|
"learning_rate": 2e-05, |
|
"loss": 2.8134, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.30544066178810053, |
|
"grad_norm": 4.133590221405029, |
|
"learning_rate": 3e-05, |
|
"loss": 2.5663, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.4072542157174674, |
|
"grad_norm": 0.8438643217086792, |
|
"learning_rate": 4e-05, |
|
"loss": 2.0167, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.5090677696468342, |
|
"grad_norm": 0.6160888671875, |
|
"learning_rate": 5e-05, |
|
"loss": 1.9501, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6108813235762011, |
|
"grad_norm": 0.48306068778038025, |
|
"learning_rate": 6e-05, |
|
"loss": 1.9072, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.7126948775055679, |
|
"grad_norm": 0.4817599654197693, |
|
"learning_rate": 7e-05, |
|
"loss": 1.8151, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.8145084314349348, |
|
"grad_norm": 0.5117820501327515, |
|
"learning_rate": 8e-05, |
|
"loss": 1.8332, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.9163219853643016, |
|
"grad_norm": 0.5870208740234375, |
|
"learning_rate": 9e-05, |
|
"loss": 1.8128, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.0181355392936684, |
|
"grad_norm": 0.5354984402656555, |
|
"learning_rate": 0.0001, |
|
"loss": 1.9072, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0181355392936684, |
|
"eval_loss": 1.6765505075454712, |
|
"eval_runtime": 215.7529, |
|
"eval_samples_per_second": 1.622, |
|
"eval_steps_per_second": 1.622, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.1199490932230354, |
|
"grad_norm": 0.6077488660812378, |
|
"learning_rate": 9.484536082474227e-05, |
|
"loss": 1.7815, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.2217626471524021, |
|
"grad_norm": 0.6680741906166077, |
|
"learning_rate": 8.969072164948454e-05, |
|
"loss": 1.6772, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.323576201081769, |
|
"grad_norm": 0.6348243355751038, |
|
"learning_rate": 8.453608247422681e-05, |
|
"loss": 1.7182, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.4253897550111359, |
|
"grad_norm": 0.6397783160209656, |
|
"learning_rate": 7.938144329896907e-05, |
|
"loss": 1.7319, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.5272033089405026, |
|
"grad_norm": 0.6410454511642456, |
|
"learning_rate": 7.422680412371135e-05, |
|
"loss": 1.7373, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.6290168628698696, |
|
"grad_norm": 0.6685994863510132, |
|
"learning_rate": 6.907216494845361e-05, |
|
"loss": 1.7746, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.7308304167992365, |
|
"grad_norm": 0.680564284324646, |
|
"learning_rate": 6.391752577319587e-05, |
|
"loss": 1.7363, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.8326439707286033, |
|
"grad_norm": 0.7003260254859924, |
|
"learning_rate": 5.876288659793815e-05, |
|
"loss": 1.6904, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.93445752465797, |
|
"grad_norm": 0.7016832232475281, |
|
"learning_rate": 5.360824742268041e-05, |
|
"loss": 1.6566, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.036271078587337, |
|
"grad_norm": 0.7320058941841125, |
|
"learning_rate": 4.845360824742268e-05, |
|
"loss": 1.8043, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.036271078587337, |
|
"eval_loss": 1.614027500152588, |
|
"eval_runtime": 215.7027, |
|
"eval_samples_per_second": 1.623, |
|
"eval_steps_per_second": 1.623, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.138084632516704, |
|
"grad_norm": 0.7479106783866882, |
|
"learning_rate": 4.329896907216495e-05, |
|
"loss": 1.6897, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.2398981864460707, |
|
"grad_norm": 0.7098692655563354, |
|
"learning_rate": 3.8144329896907216e-05, |
|
"loss": 1.6457, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.3417117403754375, |
|
"grad_norm": 0.7282701134681702, |
|
"learning_rate": 3.2989690721649485e-05, |
|
"loss": 1.7046, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.4435252943048043, |
|
"grad_norm": 0.706913948059082, |
|
"learning_rate": 2.7835051546391755e-05, |
|
"loss": 1.68, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.545338848234171, |
|
"grad_norm": 0.789571225643158, |
|
"learning_rate": 2.268041237113402e-05, |
|
"loss": 1.6881, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.647152402163538, |
|
"grad_norm": 0.7536965012550354, |
|
"learning_rate": 1.7525773195876288e-05, |
|
"loss": 1.6628, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.748965956092905, |
|
"grad_norm": 0.7617937326431274, |
|
"learning_rate": 1.2371134020618558e-05, |
|
"loss": 1.6595, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.8507795100222717, |
|
"grad_norm": 0.7684347629547119, |
|
"learning_rate": 7.216494845360824e-06, |
|
"loss": 1.6476, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.9525930639516385, |
|
"grad_norm": 0.7330012321472168, |
|
"learning_rate": 2.061855670103093e-06, |
|
"loss": 1.6293, |
|
"step": 290 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 294, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.169518958921646e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|