|
{ |
|
"best_metric": 2.6847217082977295, |
|
"best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy_random-removal-num-adj-earlystop-bpe_seed-1024_1e-3/checkpoint-44800", |
|
"epoch": 19.999721059972106, |
|
"eval_steps": 500, |
|
"global_step": 44800, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.44630404463040446, |
|
"grad_norm": 0.4016287326812744, |
|
"learning_rate": 3.125e-05, |
|
"loss": 5.5919, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.8926080892608089, |
|
"grad_norm": 0.597842812538147, |
|
"learning_rate": 6.25e-05, |
|
"loss": 4.0912, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.999721059972106, |
|
"eval_accuracy": 0.3612549924386366, |
|
"eval_loss": 3.811737298965454, |
|
"eval_runtime": 73.9561, |
|
"eval_samples_per_second": 820.487, |
|
"eval_steps_per_second": 12.832, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.3391910739191073, |
|
"grad_norm": 0.5500933527946472, |
|
"learning_rate": 9.375e-05, |
|
"loss": 3.7019, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.7854951185495118, |
|
"grad_norm": 0.548509418964386, |
|
"learning_rate": 0.000125, |
|
"loss": 3.4432, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.999721059972106, |
|
"eval_accuracy": 0.41017882301239544, |
|
"eval_loss": 3.298159122467041, |
|
"eval_runtime": 74.1751, |
|
"eval_samples_per_second": 818.064, |
|
"eval_steps_per_second": 12.794, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 2.2320781032078103, |
|
"grad_norm": 0.4507504999637604, |
|
"learning_rate": 0.00015625, |
|
"loss": 3.2537, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.6783821478382146, |
|
"grad_norm": 0.4841326177120209, |
|
"learning_rate": 0.0001875, |
|
"loss": 3.1317, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.999721059972106, |
|
"eval_accuracy": 0.43030058035079555, |
|
"eval_loss": 3.0934953689575195, |
|
"eval_runtime": 74.0323, |
|
"eval_samples_per_second": 819.643, |
|
"eval_steps_per_second": 12.819, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 3.1249651324965133, |
|
"grad_norm": 0.41781139373779297, |
|
"learning_rate": 0.00021875, |
|
"loss": 3.0446, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.5712691771269176, |
|
"grad_norm": 0.3777638077735901, |
|
"learning_rate": 0.00025, |
|
"loss": 2.9687, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.999721059972106, |
|
"eval_accuracy": 0.4410938772344798, |
|
"eval_loss": 2.9876036643981934, |
|
"eval_runtime": 73.6099, |
|
"eval_samples_per_second": 824.345, |
|
"eval_steps_per_second": 12.892, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 4.017852161785216, |
|
"grad_norm": 0.3835380971431732, |
|
"learning_rate": 0.00028125000000000003, |
|
"loss": 2.9265, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.464156206415621, |
|
"grad_norm": 0.3553270697593689, |
|
"learning_rate": 0.0003125, |
|
"loss": 2.8657, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.910460251046025, |
|
"grad_norm": 0.33925196528434753, |
|
"learning_rate": 0.00034375, |
|
"loss": 2.8476, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 4.9997210599721065, |
|
"eval_accuracy": 0.4473544922253674, |
|
"eval_loss": 2.9233009815216064, |
|
"eval_runtime": 72.7662, |
|
"eval_samples_per_second": 833.903, |
|
"eval_steps_per_second": 13.042, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 5.357043235704324, |
|
"grad_norm": 0.32550615072250366, |
|
"learning_rate": 0.000375, |
|
"loss": 2.8001, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 5.803347280334728, |
|
"grad_norm": 0.28298550844192505, |
|
"learning_rate": 0.00040625000000000004, |
|
"loss": 2.7908, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 5.9997210599721065, |
|
"eval_accuracy": 0.4512933162717955, |
|
"eval_loss": 2.8863840103149414, |
|
"eval_runtime": 72.8889, |
|
"eval_samples_per_second": 832.5, |
|
"eval_steps_per_second": 13.02, |
|
"step": 13440 |
|
}, |
|
{ |
|
"epoch": 6.249930264993027, |
|
"grad_norm": 0.3096221387386322, |
|
"learning_rate": 0.0004375, |
|
"loss": 2.7589, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 6.696234309623431, |
|
"grad_norm": 0.27621880173683167, |
|
"learning_rate": 0.00046871875, |
|
"loss": 2.7475, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 6.9997210599721065, |
|
"eval_accuracy": 0.4543552160481859, |
|
"eval_loss": 2.858733892440796, |
|
"eval_runtime": 72.719, |
|
"eval_samples_per_second": 834.445, |
|
"eval_steps_per_second": 13.05, |
|
"step": 15680 |
|
}, |
|
{ |
|
"epoch": 7.14281729428173, |
|
"grad_norm": 0.27353551983833313, |
|
"learning_rate": 0.00049996875, |
|
"loss": 2.7279, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 7.589121338912134, |
|
"grad_norm": 0.2563088536262512, |
|
"learning_rate": 0.00053121875, |
|
"loss": 2.7106, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 7.9997210599721065, |
|
"eval_accuracy": 0.45666395233109724, |
|
"eval_loss": 2.8396852016448975, |
|
"eval_runtime": 72.6733, |
|
"eval_samples_per_second": 834.969, |
|
"eval_steps_per_second": 13.058, |
|
"step": 17920 |
|
}, |
|
{ |
|
"epoch": 8.035704323570432, |
|
"grad_norm": 0.25674745440483093, |
|
"learning_rate": 0.0005624375, |
|
"loss": 2.7122, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 8.482008368200837, |
|
"grad_norm": 0.2463446408510208, |
|
"learning_rate": 0.0005936875, |
|
"loss": 2.6802, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 8.928312412831241, |
|
"grad_norm": 0.23965463042259216, |
|
"learning_rate": 0.00062490625, |
|
"loss": 2.6915, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 8.999721059972106, |
|
"eval_accuracy": 0.4583195677743741, |
|
"eval_loss": 2.8250818252563477, |
|
"eval_runtime": 72.4715, |
|
"eval_samples_per_second": 837.295, |
|
"eval_steps_per_second": 13.095, |
|
"step": 20160 |
|
}, |
|
{ |
|
"epoch": 9.37489539748954, |
|
"grad_norm": 0.2294812798500061, |
|
"learning_rate": 0.000656125, |
|
"loss": 2.6611, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 9.821199442119944, |
|
"grad_norm": 0.2103538066148758, |
|
"learning_rate": 0.0006873749999999999, |
|
"loss": 2.6701, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 9.999721059972106, |
|
"eval_accuracy": 0.4596804839272558, |
|
"eval_loss": 2.813282012939453, |
|
"eval_runtime": 72.8372, |
|
"eval_samples_per_second": 833.091, |
|
"eval_steps_per_second": 13.029, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 10.267782426778243, |
|
"grad_norm": 0.22618016600608826, |
|
"learning_rate": 0.000718625, |
|
"loss": 2.6493, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 10.714086471408647, |
|
"grad_norm": 0.21487073600292206, |
|
"learning_rate": 0.000749875, |
|
"loss": 2.6549, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 10.999721059972106, |
|
"eval_accuracy": 0.4606486615740561, |
|
"eval_loss": 2.8065173625946045, |
|
"eval_runtime": 73.172, |
|
"eval_samples_per_second": 829.279, |
|
"eval_steps_per_second": 12.969, |
|
"step": 24640 |
|
}, |
|
{ |
|
"epoch": 11.160669456066946, |
|
"grad_norm": 0.20713554322719574, |
|
"learning_rate": 0.00078109375, |
|
"loss": 2.6464, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 11.60697350069735, |
|
"grad_norm": 0.19391033053398132, |
|
"learning_rate": 0.00081234375, |
|
"loss": 2.6374, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 11.999721059972106, |
|
"eval_accuracy": 0.46137410006850466, |
|
"eval_loss": 2.7962806224823, |
|
"eval_runtime": 72.7002, |
|
"eval_samples_per_second": 834.661, |
|
"eval_steps_per_second": 13.054, |
|
"step": 26880 |
|
}, |
|
{ |
|
"epoch": 12.05355648535565, |
|
"grad_norm": 0.19129841029644012, |
|
"learning_rate": 0.0008435625, |
|
"loss": 2.6449, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 12.499860529986053, |
|
"grad_norm": 0.1857786625623703, |
|
"learning_rate": 0.0008748125, |
|
"loss": 2.6218, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 12.946164574616457, |
|
"grad_norm": 0.17947685718536377, |
|
"learning_rate": 0.0009060312499999999, |
|
"loss": 2.6417, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 12.999721059972106, |
|
"eval_accuracy": 0.4621159538304445, |
|
"eval_loss": 2.791673183441162, |
|
"eval_runtime": 72.7171, |
|
"eval_samples_per_second": 834.467, |
|
"eval_steps_per_second": 13.051, |
|
"step": 29120 |
|
}, |
|
{ |
|
"epoch": 13.392747559274756, |
|
"grad_norm": 0.19629885256290436, |
|
"learning_rate": 0.00093728125, |
|
"loss": 2.6136, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 13.83905160390516, |
|
"grad_norm": 0.18094076216220856, |
|
"learning_rate": 0.0009685000000000001, |
|
"loss": 2.6321, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 13.999721059972106, |
|
"eval_accuracy": 0.46257415952537906, |
|
"eval_loss": 2.7885537147521973, |
|
"eval_runtime": 72.7689, |
|
"eval_samples_per_second": 833.872, |
|
"eval_steps_per_second": 13.041, |
|
"step": 31360 |
|
}, |
|
{ |
|
"epoch": 14.28563458856346, |
|
"grad_norm": 0.2021656334400177, |
|
"learning_rate": 0.00099975, |
|
"loss": 2.6148, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 14.731938633193863, |
|
"grad_norm": 0.17096078395843506, |
|
"learning_rate": 0.0009225, |
|
"loss": 2.6147, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 14.999721059972106, |
|
"eval_accuracy": 0.46528216164514585, |
|
"eval_loss": 2.767446756362915, |
|
"eval_runtime": 72.4235, |
|
"eval_samples_per_second": 837.849, |
|
"eval_steps_per_second": 13.103, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 15.178521617852162, |
|
"grad_norm": 0.1746244877576828, |
|
"learning_rate": 0.000844453125, |
|
"loss": 2.5895, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 15.624825662482566, |
|
"grad_norm": 0.1760057806968689, |
|
"learning_rate": 0.0007663281250000001, |
|
"loss": 2.571, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 15.999721059972106, |
|
"eval_accuracy": 0.4683800586813499, |
|
"eval_loss": 2.7426321506500244, |
|
"eval_runtime": 73.1881, |
|
"eval_samples_per_second": 829.097, |
|
"eval_steps_per_second": 12.967, |
|
"step": 35840 |
|
}, |
|
{ |
|
"epoch": 16.071408647140863, |
|
"grad_norm": 0.18071866035461426, |
|
"learning_rate": 0.00068828125, |
|
"loss": 2.5606, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 16.51771269177127, |
|
"grad_norm": 0.19238583743572235, |
|
"learning_rate": 0.000610234375, |
|
"loss": 2.516, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 16.964016736401675, |
|
"grad_norm": 0.17937716841697693, |
|
"learning_rate": 0.000532109375, |
|
"loss": 2.5232, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 16.999721059972106, |
|
"eval_accuracy": 0.4715372187108199, |
|
"eval_loss": 2.721198558807373, |
|
"eval_runtime": 72.9354, |
|
"eval_samples_per_second": 831.969, |
|
"eval_steps_per_second": 13.012, |
|
"step": 38080 |
|
}, |
|
{ |
|
"epoch": 17.410599721059974, |
|
"grad_norm": 0.18727989494800568, |
|
"learning_rate": 0.000453984375, |
|
"loss": 2.4628, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 17.856903765690376, |
|
"grad_norm": 0.17880840599536896, |
|
"learning_rate": 0.000375859375, |
|
"loss": 2.4666, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 17.999721059972106, |
|
"eval_accuracy": 0.47424567321984823, |
|
"eval_loss": 2.7036237716674805, |
|
"eval_runtime": 72.7126, |
|
"eval_samples_per_second": 834.518, |
|
"eval_steps_per_second": 13.051, |
|
"step": 40320 |
|
}, |
|
{ |
|
"epoch": 18.303486750348675, |
|
"grad_norm": 0.19291794300079346, |
|
"learning_rate": 0.0002978125, |
|
"loss": 2.4153, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 18.74979079497908, |
|
"grad_norm": 0.18789054453372955, |
|
"learning_rate": 0.00021968750000000002, |
|
"loss": 2.4024, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 18.999721059972106, |
|
"eval_accuracy": 0.47697532539713317, |
|
"eval_loss": 2.6877355575561523, |
|
"eval_runtime": 72.7902, |
|
"eval_samples_per_second": 833.629, |
|
"eval_steps_per_second": 13.037, |
|
"step": 42560 |
|
}, |
|
{ |
|
"epoch": 19.196373779637376, |
|
"grad_norm": 0.20727203786373138, |
|
"learning_rate": 0.000141640625, |
|
"loss": 2.3681, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 19.642677824267782, |
|
"grad_norm": 0.20505329966545105, |
|
"learning_rate": 6.351562500000001e-05, |
|
"loss": 2.3301, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 19.999721059972106, |
|
"eval_accuracy": 0.4785579769152222, |
|
"eval_loss": 2.6847217082977295, |
|
"eval_runtime": 72.8209, |
|
"eval_samples_per_second": 833.277, |
|
"eval_steps_per_second": 13.032, |
|
"step": 44800 |
|
}, |
|
{ |
|
"epoch": 19.999721059972106, |
|
"step": 44800, |
|
"total_flos": 1.498591326633984e+18, |
|
"train_loss": 2.809299095698765, |
|
"train_runtime": 30285.2709, |
|
"train_samples_per_second": 378.795, |
|
"train_steps_per_second": 1.479 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 44800, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.498591326633984e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|