|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 42.10526315789474, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 2.1052631578947367, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00019876883405951377, |
|
"loss": 0.9472, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 4.2105263157894735, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 0.744, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 6.315789473684211, |
|
"grad_norm": 0.1695556640625, |
|
"learning_rate": 0.0001891006524188368, |
|
"loss": 0.6417, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 8.421052631578947, |
|
"grad_norm": 0.128173828125, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 0.5931, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 10.526315789473685, |
|
"grad_norm": 0.1119384765625, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 0.5781, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 12.631578947368421, |
|
"grad_norm": 0.12359619140625, |
|
"learning_rate": 0.00015877852522924732, |
|
"loss": 0.5593, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 14.736842105263158, |
|
"grad_norm": 0.1312255859375, |
|
"learning_rate": 0.00014539904997395468, |
|
"loss": 0.5535, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 16.842105263157894, |
|
"grad_norm": 0.1319580078125, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 0.5372, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 18.94736842105263, |
|
"grad_norm": 0.1417236328125, |
|
"learning_rate": 0.0001156434465040231, |
|
"loss": 0.5316, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 21.05263157894737, |
|
"grad_norm": 0.1668701171875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5192, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 23.157894736842106, |
|
"grad_norm": 0.19091796875, |
|
"learning_rate": 8.435655349597689e-05, |
|
"loss": 0.5119, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 25.263157894736842, |
|
"grad_norm": 0.1781005859375, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 0.5092, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 27.36842105263158, |
|
"grad_norm": 0.174560546875, |
|
"learning_rate": 5.4600950026045326e-05, |
|
"loss": 0.4983, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 29.473684210526315, |
|
"grad_norm": 0.186767578125, |
|
"learning_rate": 4.12214747707527e-05, |
|
"loss": 0.4911, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 31.57894736842105, |
|
"grad_norm": 0.185302734375, |
|
"learning_rate": 2.9289321881345254e-05, |
|
"loss": 0.4875, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 33.68421052631579, |
|
"grad_norm": 0.19384765625, |
|
"learning_rate": 1.9098300562505266e-05, |
|
"loss": 0.4846, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 35.78947368421053, |
|
"grad_norm": 0.1925048828125, |
|
"learning_rate": 1.0899347581163221e-05, |
|
"loss": 0.4835, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 37.89473684210526, |
|
"grad_norm": 0.1820068359375, |
|
"learning_rate": 4.8943483704846475e-06, |
|
"loss": 0.4818, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 0.1949462890625, |
|
"learning_rate": 1.231165940486234e-06, |
|
"loss": 0.481, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 42.10526315789474, |
|
"grad_norm": 0.1903076171875, |
|
"learning_rate": 0.0, |
|
"loss": 0.4806, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 42.10526315789474, |
|
"step": 200, |
|
"total_flos": 4.049973067697357e+16, |
|
"train_loss": 0.5557145524024963, |
|
"train_runtime": 584.5958, |
|
"train_samples_per_second": 10.948, |
|
"train_steps_per_second": 0.342 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"total_flos": 4.049973067697357e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|