{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9991993594875901,
  "eval_steps": 4,
  "global_step": 78,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.051240992794235385,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 1.1101,
      "step": 4
    },
    {
      "epoch": 0.10248198558847077,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 1.085,
      "step": 8
    },
    {
      "epoch": 0.15372297838270615,
      "grad_norm": 0.19277238845825195,
      "learning_rate": 0.002,
      "loss": 1.1055,
      "step": 12
    },
    {
      "epoch": 0.20496397117694154,
      "grad_norm": 0.2435273677110672,
      "learning_rate": 0.0011547005383792518,
      "loss": 1.0199,
      "step": 16
    },
    {
      "epoch": 0.2562049639711769,
      "grad_norm": 1.447454810142517,
      "learning_rate": 0.0008944271909999159,
      "loss": 1.1178,
      "step": 20
    },
    {
      "epoch": 0.3074459567654123,
      "grad_norm": 0.34243044257164,
      "learning_rate": 0.0006666666666666666,
      "loss": 0.8637,
      "step": 24
    },
    {
      "epoch": 0.3586869495596477,
      "grad_norm": 0.18111398816108704,
      "learning_rate": 0.0005547001962252292,
      "loss": 0.8076,
      "step": 28
    },
    {
      "epoch": 0.4099279423538831,
      "grad_norm": 0.16578800976276398,
      "learning_rate": 0.0004850712500726659,
      "loss": 0.7863,
      "step": 32
    },
    {
      "epoch": 0.4611689351481185,
      "grad_norm": 0.18639568984508514,
      "learning_rate": 0.0004364357804719848,
      "loss": 0.7413,
      "step": 36
    },
    {
      "epoch": 0.5124099279423538,
      "grad_norm": 0.29591241478919983,
      "learning_rate": 0.0004,
      "loss": 0.7666,
      "step": 40
    },
    {
      "epoch": 0.5636509207365893,
      "grad_norm": 0.20059671998023987,
      "learning_rate": 0.0003713906763541037,
      "loss": 0.8008,
      "step": 44
    },
    {
      "epoch": 0.6148919135308246,
      "grad_norm": 0.16686993837356567,
      "learning_rate": 0.0003481553119113957,
      "loss": 0.7543,
      "step": 48
    },
    {
      "epoch": 0.6661329063250601,
      "grad_norm": 0.16397202014923096,
      "learning_rate": 0.0003287979746107146,
      "loss": 0.7338,
      "step": 52
    },
    {
      "epoch": 0.7173738991192954,
      "grad_norm": 0.1775909960269928,
      "learning_rate": 0.0003123475237772121,
      "loss": 0.7323,
      "step": 56
    },
    {
      "epoch": 0.7686148919135308,
      "grad_norm": 0.17287716269493103,
      "learning_rate": 0.00029814239699997195,
      "loss": 0.7414,
      "step": 60
    },
    {
      "epoch": 0.8198558847077662,
      "grad_norm": 0.19447582960128784,
      "learning_rate": 0.0002857142857142857,
      "loss": 0.7489,
      "step": 64
    },
    {
      "epoch": 0.8710968775020016,
      "grad_norm": 0.15788140892982483,
      "learning_rate": 0.0002747211278973781,
      "loss": 0.7467,
      "step": 68
    },
    {
      "epoch": 0.922337870296237,
      "grad_norm": 0.18027225136756897,
      "learning_rate": 0.00026490647141300875,
      "loss": 0.734,
      "step": 72
    },
    {
      "epoch": 0.9735788630904724,
      "grad_norm": 0.1561153680086136,
      "learning_rate": 0.000256073759865792,
      "loss": 0.7291,
      "step": 76
    },
    {
      "epoch": 0.9991993594875901,
      "step": 78,
      "total_flos": 7.922041751265608e+17,
      "train_loss": 0.8468386912957216,
      "train_runtime": 778.0462,
      "train_samples_per_second": 12.84,
      "train_steps_per_second": 0.1
    }
  ],
  "logging_steps": 4,
  "max_steps": 78,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 4,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.922041751265608e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}