kanishka's picture
End of training
7e63acd verified
{
"best_metric": 2.6847217082977295,
"best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy_random-removal-num-adj-earlystop-bpe_seed-1024_1e-3/checkpoint-44800",
"epoch": 19.999721059972106,
"eval_steps": 500,
"global_step": 44800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.44630404463040446,
"grad_norm": 0.4016287326812744,
"learning_rate": 3.125e-05,
"loss": 5.5919,
"step": 1000
},
{
"epoch": 0.8926080892608089,
"grad_norm": 0.597842812538147,
"learning_rate": 6.25e-05,
"loss": 4.0912,
"step": 2000
},
{
"epoch": 0.999721059972106,
"eval_accuracy": 0.3612549924386366,
"eval_loss": 3.811737298965454,
"eval_runtime": 73.9561,
"eval_samples_per_second": 820.487,
"eval_steps_per_second": 12.832,
"step": 2240
},
{
"epoch": 1.3391910739191073,
"grad_norm": 0.5500933527946472,
"learning_rate": 9.375e-05,
"loss": 3.7019,
"step": 3000
},
{
"epoch": 1.7854951185495118,
"grad_norm": 0.548509418964386,
"learning_rate": 0.000125,
"loss": 3.4432,
"step": 4000
},
{
"epoch": 1.999721059972106,
"eval_accuracy": 0.41017882301239544,
"eval_loss": 3.298159122467041,
"eval_runtime": 74.1751,
"eval_samples_per_second": 818.064,
"eval_steps_per_second": 12.794,
"step": 4480
},
{
"epoch": 2.2320781032078103,
"grad_norm": 0.4507504999637604,
"learning_rate": 0.00015625,
"loss": 3.2537,
"step": 5000
},
{
"epoch": 2.6783821478382146,
"grad_norm": 0.4841326177120209,
"learning_rate": 0.0001875,
"loss": 3.1317,
"step": 6000
},
{
"epoch": 2.999721059972106,
"eval_accuracy": 0.43030058035079555,
"eval_loss": 3.0934953689575195,
"eval_runtime": 74.0323,
"eval_samples_per_second": 819.643,
"eval_steps_per_second": 12.819,
"step": 6720
},
{
"epoch": 3.1249651324965133,
"grad_norm": 0.41781139373779297,
"learning_rate": 0.00021875,
"loss": 3.0446,
"step": 7000
},
{
"epoch": 3.5712691771269176,
"grad_norm": 0.3777638077735901,
"learning_rate": 0.00025,
"loss": 2.9687,
"step": 8000
},
{
"epoch": 3.999721059972106,
"eval_accuracy": 0.4410938772344798,
"eval_loss": 2.9876036643981934,
"eval_runtime": 73.6099,
"eval_samples_per_second": 824.345,
"eval_steps_per_second": 12.892,
"step": 8960
},
{
"epoch": 4.017852161785216,
"grad_norm": 0.3835380971431732,
"learning_rate": 0.00028125000000000003,
"loss": 2.9265,
"step": 9000
},
{
"epoch": 4.464156206415621,
"grad_norm": 0.3553270697593689,
"learning_rate": 0.0003125,
"loss": 2.8657,
"step": 10000
},
{
"epoch": 4.910460251046025,
"grad_norm": 0.33925196528434753,
"learning_rate": 0.00034375,
"loss": 2.8476,
"step": 11000
},
{
"epoch": 4.9997210599721065,
"eval_accuracy": 0.4473544922253674,
"eval_loss": 2.9233009815216064,
"eval_runtime": 72.7662,
"eval_samples_per_second": 833.903,
"eval_steps_per_second": 13.042,
"step": 11200
},
{
"epoch": 5.357043235704324,
"grad_norm": 0.32550615072250366,
"learning_rate": 0.000375,
"loss": 2.8001,
"step": 12000
},
{
"epoch": 5.803347280334728,
"grad_norm": 0.28298550844192505,
"learning_rate": 0.00040625000000000004,
"loss": 2.7908,
"step": 13000
},
{
"epoch": 5.9997210599721065,
"eval_accuracy": 0.4512933162717955,
"eval_loss": 2.8863840103149414,
"eval_runtime": 72.8889,
"eval_samples_per_second": 832.5,
"eval_steps_per_second": 13.02,
"step": 13440
},
{
"epoch": 6.249930264993027,
"grad_norm": 0.3096221387386322,
"learning_rate": 0.0004375,
"loss": 2.7589,
"step": 14000
},
{
"epoch": 6.696234309623431,
"grad_norm": 0.27621880173683167,
"learning_rate": 0.00046871875,
"loss": 2.7475,
"step": 15000
},
{
"epoch": 6.9997210599721065,
"eval_accuracy": 0.4543552160481859,
"eval_loss": 2.858733892440796,
"eval_runtime": 72.719,
"eval_samples_per_second": 834.445,
"eval_steps_per_second": 13.05,
"step": 15680
},
{
"epoch": 7.14281729428173,
"grad_norm": 0.27353551983833313,
"learning_rate": 0.00049996875,
"loss": 2.7279,
"step": 16000
},
{
"epoch": 7.589121338912134,
"grad_norm": 0.2563088536262512,
"learning_rate": 0.00053121875,
"loss": 2.7106,
"step": 17000
},
{
"epoch": 7.9997210599721065,
"eval_accuracy": 0.45666395233109724,
"eval_loss": 2.8396852016448975,
"eval_runtime": 72.6733,
"eval_samples_per_second": 834.969,
"eval_steps_per_second": 13.058,
"step": 17920
},
{
"epoch": 8.035704323570432,
"grad_norm": 0.25674745440483093,
"learning_rate": 0.0005624375,
"loss": 2.7122,
"step": 18000
},
{
"epoch": 8.482008368200837,
"grad_norm": 0.2463446408510208,
"learning_rate": 0.0005936875,
"loss": 2.6802,
"step": 19000
},
{
"epoch": 8.928312412831241,
"grad_norm": 0.23965463042259216,
"learning_rate": 0.00062490625,
"loss": 2.6915,
"step": 20000
},
{
"epoch": 8.999721059972106,
"eval_accuracy": 0.4583195677743741,
"eval_loss": 2.8250818252563477,
"eval_runtime": 72.4715,
"eval_samples_per_second": 837.295,
"eval_steps_per_second": 13.095,
"step": 20160
},
{
"epoch": 9.37489539748954,
"grad_norm": 0.2294812798500061,
"learning_rate": 0.000656125,
"loss": 2.6611,
"step": 21000
},
{
"epoch": 9.821199442119944,
"grad_norm": 0.2103538066148758,
"learning_rate": 0.0006873749999999999,
"loss": 2.6701,
"step": 22000
},
{
"epoch": 9.999721059972106,
"eval_accuracy": 0.4596804839272558,
"eval_loss": 2.813282012939453,
"eval_runtime": 72.8372,
"eval_samples_per_second": 833.091,
"eval_steps_per_second": 13.029,
"step": 22400
},
{
"epoch": 10.267782426778243,
"grad_norm": 0.22618016600608826,
"learning_rate": 0.000718625,
"loss": 2.6493,
"step": 23000
},
{
"epoch": 10.714086471408647,
"grad_norm": 0.21487073600292206,
"learning_rate": 0.000749875,
"loss": 2.6549,
"step": 24000
},
{
"epoch": 10.999721059972106,
"eval_accuracy": 0.4606486615740561,
"eval_loss": 2.8065173625946045,
"eval_runtime": 73.172,
"eval_samples_per_second": 829.279,
"eval_steps_per_second": 12.969,
"step": 24640
},
{
"epoch": 11.160669456066946,
"grad_norm": 0.20713554322719574,
"learning_rate": 0.00078109375,
"loss": 2.6464,
"step": 25000
},
{
"epoch": 11.60697350069735,
"grad_norm": 0.19391033053398132,
"learning_rate": 0.00081234375,
"loss": 2.6374,
"step": 26000
},
{
"epoch": 11.999721059972106,
"eval_accuracy": 0.46137410006850466,
"eval_loss": 2.7962806224823,
"eval_runtime": 72.7002,
"eval_samples_per_second": 834.661,
"eval_steps_per_second": 13.054,
"step": 26880
},
{
"epoch": 12.05355648535565,
"grad_norm": 0.19129841029644012,
"learning_rate": 0.0008435625,
"loss": 2.6449,
"step": 27000
},
{
"epoch": 12.499860529986053,
"grad_norm": 0.1857786625623703,
"learning_rate": 0.0008748125,
"loss": 2.6218,
"step": 28000
},
{
"epoch": 12.946164574616457,
"grad_norm": 0.17947685718536377,
"learning_rate": 0.0009060312499999999,
"loss": 2.6417,
"step": 29000
},
{
"epoch": 12.999721059972106,
"eval_accuracy": 0.4621159538304445,
"eval_loss": 2.791673183441162,
"eval_runtime": 72.7171,
"eval_samples_per_second": 834.467,
"eval_steps_per_second": 13.051,
"step": 29120
},
{
"epoch": 13.392747559274756,
"grad_norm": 0.19629885256290436,
"learning_rate": 0.00093728125,
"loss": 2.6136,
"step": 30000
},
{
"epoch": 13.83905160390516,
"grad_norm": 0.18094076216220856,
"learning_rate": 0.0009685000000000001,
"loss": 2.6321,
"step": 31000
},
{
"epoch": 13.999721059972106,
"eval_accuracy": 0.46257415952537906,
"eval_loss": 2.7885537147521973,
"eval_runtime": 72.7689,
"eval_samples_per_second": 833.872,
"eval_steps_per_second": 13.041,
"step": 31360
},
{
"epoch": 14.28563458856346,
"grad_norm": 0.2021656334400177,
"learning_rate": 0.00099975,
"loss": 2.6148,
"step": 32000
},
{
"epoch": 14.731938633193863,
"grad_norm": 0.17096078395843506,
"learning_rate": 0.0009225,
"loss": 2.6147,
"step": 33000
},
{
"epoch": 14.999721059972106,
"eval_accuracy": 0.46528216164514585,
"eval_loss": 2.767446756362915,
"eval_runtime": 72.4235,
"eval_samples_per_second": 837.849,
"eval_steps_per_second": 13.103,
"step": 33600
},
{
"epoch": 15.178521617852162,
"grad_norm": 0.1746244877576828,
"learning_rate": 0.000844453125,
"loss": 2.5895,
"step": 34000
},
{
"epoch": 15.624825662482566,
"grad_norm": 0.1760057806968689,
"learning_rate": 0.0007663281250000001,
"loss": 2.571,
"step": 35000
},
{
"epoch": 15.999721059972106,
"eval_accuracy": 0.4683800586813499,
"eval_loss": 2.7426321506500244,
"eval_runtime": 73.1881,
"eval_samples_per_second": 829.097,
"eval_steps_per_second": 12.967,
"step": 35840
},
{
"epoch": 16.071408647140863,
"grad_norm": 0.18071866035461426,
"learning_rate": 0.00068828125,
"loss": 2.5606,
"step": 36000
},
{
"epoch": 16.51771269177127,
"grad_norm": 0.19238583743572235,
"learning_rate": 0.000610234375,
"loss": 2.516,
"step": 37000
},
{
"epoch": 16.964016736401675,
"grad_norm": 0.17937716841697693,
"learning_rate": 0.000532109375,
"loss": 2.5232,
"step": 38000
},
{
"epoch": 16.999721059972106,
"eval_accuracy": 0.4715372187108199,
"eval_loss": 2.721198558807373,
"eval_runtime": 72.9354,
"eval_samples_per_second": 831.969,
"eval_steps_per_second": 13.012,
"step": 38080
},
{
"epoch": 17.410599721059974,
"grad_norm": 0.18727989494800568,
"learning_rate": 0.000453984375,
"loss": 2.4628,
"step": 39000
},
{
"epoch": 17.856903765690376,
"grad_norm": 0.17880840599536896,
"learning_rate": 0.000375859375,
"loss": 2.4666,
"step": 40000
},
{
"epoch": 17.999721059972106,
"eval_accuracy": 0.47424567321984823,
"eval_loss": 2.7036237716674805,
"eval_runtime": 72.7126,
"eval_samples_per_second": 834.518,
"eval_steps_per_second": 13.051,
"step": 40320
},
{
"epoch": 18.303486750348675,
"grad_norm": 0.19291794300079346,
"learning_rate": 0.0002978125,
"loss": 2.4153,
"step": 41000
},
{
"epoch": 18.74979079497908,
"grad_norm": 0.18789054453372955,
"learning_rate": 0.00021968750000000002,
"loss": 2.4024,
"step": 42000
},
{
"epoch": 18.999721059972106,
"eval_accuracy": 0.47697532539713317,
"eval_loss": 2.6877355575561523,
"eval_runtime": 72.7902,
"eval_samples_per_second": 833.629,
"eval_steps_per_second": 13.037,
"step": 42560
},
{
"epoch": 19.196373779637376,
"grad_norm": 0.20727203786373138,
"learning_rate": 0.000141640625,
"loss": 2.3681,
"step": 43000
},
{
"epoch": 19.642677824267782,
"grad_norm": 0.20505329966545105,
"learning_rate": 6.351562500000001e-05,
"loss": 2.3301,
"step": 44000
},
{
"epoch": 19.999721059972106,
"eval_accuracy": 0.4785579769152222,
"eval_loss": 2.6847217082977295,
"eval_runtime": 72.8209,
"eval_samples_per_second": 833.277,
"eval_steps_per_second": 13.032,
"step": 44800
},
{
"epoch": 19.999721059972106,
"step": 44800,
"total_flos": 1.498591326633984e+18,
"train_loss": 2.809299095698765,
"train_runtime": 30285.2709,
"train_samples_per_second": 378.795,
"train_steps_per_second": 1.479
}
],
"logging_steps": 1000,
"max_steps": 44800,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.498591326633984e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}