kanishka's picture
End of training
67157a3 verified
raw
history blame
13.6 kB
{
"best_metric": 2.6889467239379883,
"best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy_no-num-adj-earlystop-bpe_seed-42_1e-3/checkpoint-44500",
"epoch": 19.99134928659701,
"eval_steps": 500,
"global_step": 44500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.4493877092461521,
"grad_norm": 0.4442262053489685,
"learning_rate": 3.125e-05,
"loss": 5.5692,
"step": 1000
},
{
"epoch": 0.8987754184923042,
"grad_norm": 0.5283451676368713,
"learning_rate": 6.25e-05,
"loss": 4.0804,
"step": 2000
},
{
"epoch": 1.0,
"eval_accuracy": 0.35990021325653254,
"eval_loss": 3.8291473388671875,
"eval_runtime": 71.8623,
"eval_samples_per_second": 844.699,
"eval_steps_per_second": 13.206,
"step": 2226
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.5473753809928894,
"learning_rate": 9.375e-05,
"loss": 3.6828,
"step": 3000
},
{
"epoch": 1.7972137962026737,
"grad_norm": 0.6101972460746765,
"learning_rate": 0.000125,
"loss": 3.431,
"step": 4000
},
{
"epoch": 2.0,
"eval_accuracy": 0.4075588167460322,
"eval_loss": 3.316692590713501,
"eval_runtime": 71.9883,
"eval_samples_per_second": 843.22,
"eval_steps_per_second": 13.183,
"step": 4452
},
{
"epoch": 2.2462644646668912,
"grad_norm": 0.5061201453208923,
"learning_rate": 0.00015625,
"loss": 3.2333,
"step": 5000
},
{
"epoch": 2.6956521739130435,
"grad_norm": 0.4999963343143463,
"learning_rate": 0.0001875,
"loss": 3.1165,
"step": 6000
},
{
"epoch": 3.0,
"eval_accuracy": 0.4289709096382779,
"eval_loss": 3.102813482284546,
"eval_runtime": 71.4078,
"eval_samples_per_second": 850.075,
"eval_steps_per_second": 13.29,
"step": 6678
},
{
"epoch": 3.144702842377261,
"grad_norm": 0.4476725161075592,
"learning_rate": 0.00021875,
"loss": 3.0211,
"step": 7000
},
{
"epoch": 3.594090551623413,
"grad_norm": 0.3835226595401764,
"learning_rate": 0.00025,
"loss": 2.9545,
"step": 8000
},
{
"epoch": 4.0,
"eval_accuracy": 0.4395799214549251,
"eval_loss": 2.998249053955078,
"eval_runtime": 71.272,
"eval_samples_per_second": 851.695,
"eval_steps_per_second": 13.315,
"step": 8904
},
{
"epoch": 4.04314122008763,
"grad_norm": 0.36295273900032043,
"learning_rate": 0.00028125000000000003,
"loss": 2.9068,
"step": 9000
},
{
"epoch": 4.4925289293337825,
"grad_norm": 0.368268221616745,
"learning_rate": 0.0003125,
"loss": 2.8533,
"step": 10000
},
{
"epoch": 4.941916638579935,
"grad_norm": 0.3496946096420288,
"learning_rate": 0.00034375,
"loss": 2.8357,
"step": 11000
},
{
"epoch": 5.0,
"eval_accuracy": 0.4463052869660269,
"eval_loss": 2.9296414852142334,
"eval_runtime": 70.9885,
"eval_samples_per_second": 855.097,
"eval_steps_per_second": 13.368,
"step": 11130
},
{
"epoch": 5.390967307044153,
"grad_norm": 0.32746192812919617,
"learning_rate": 0.000375,
"loss": 2.7838,
"step": 12000
},
{
"epoch": 5.840355016290305,
"grad_norm": 0.3100210726261139,
"learning_rate": 0.00040625000000000004,
"loss": 2.7805,
"step": 13000
},
{
"epoch": 6.0,
"eval_accuracy": 0.4504751272852721,
"eval_loss": 2.892799139022827,
"eval_runtime": 70.9528,
"eval_samples_per_second": 855.527,
"eval_steps_per_second": 13.375,
"step": 13356
},
{
"epoch": 6.289405684754522,
"grad_norm": 0.28876417875289917,
"learning_rate": 0.0004375,
"loss": 2.7419,
"step": 14000
},
{
"epoch": 6.738793394000674,
"grad_norm": 0.2749331295490265,
"learning_rate": 0.00046871875,
"loss": 2.7354,
"step": 15000
},
{
"epoch": 7.0,
"eval_accuracy": 0.45374956150296436,
"eval_loss": 2.8629629611968994,
"eval_runtime": 70.9856,
"eval_samples_per_second": 855.131,
"eval_steps_per_second": 13.369,
"step": 15582
},
{
"epoch": 7.187844062464891,
"grad_norm": 0.2688666582107544,
"learning_rate": 0.00049996875,
"loss": 2.7138,
"step": 16000
},
{
"epoch": 7.637231771711043,
"grad_norm": 0.25027596950531006,
"learning_rate": 0.00053121875,
"loss": 2.702,
"step": 17000
},
{
"epoch": 8.0,
"eval_accuracy": 0.45609111952250175,
"eval_loss": 2.8440542221069336,
"eval_runtime": 70.7062,
"eval_samples_per_second": 858.51,
"eval_steps_per_second": 13.422,
"step": 17808
},
{
"epoch": 8.08628244017526,
"grad_norm": 0.2555592656135559,
"learning_rate": 0.0005624375,
"loss": 2.6924,
"step": 18000
},
{
"epoch": 8.535670149421414,
"grad_norm": 0.24662208557128906,
"learning_rate": 0.0005936875,
"loss": 2.6741,
"step": 19000
},
{
"epoch": 8.985057858667565,
"grad_norm": 0.2206139713525772,
"learning_rate": 0.00062490625,
"loss": 2.684,
"step": 20000
},
{
"epoch": 9.0,
"eval_accuracy": 0.4570897622005542,
"eval_loss": 2.835329294204712,
"eval_runtime": 70.6801,
"eval_samples_per_second": 858.827,
"eval_steps_per_second": 13.427,
"step": 20034
},
{
"epoch": 9.434108527131784,
"grad_norm": 0.21738700568675995,
"learning_rate": 0.0006561562500000001,
"loss": 2.6454,
"step": 21000
},
{
"epoch": 9.883496236377935,
"grad_norm": 0.2152646780014038,
"learning_rate": 0.0006873749999999999,
"loss": 2.6647,
"step": 22000
},
{
"epoch": 10.0,
"eval_accuracy": 0.45841717267447984,
"eval_loss": 2.819920778274536,
"eval_runtime": 70.5746,
"eval_samples_per_second": 860.111,
"eval_steps_per_second": 13.447,
"step": 22260
},
{
"epoch": 10.332546904842152,
"grad_norm": 0.2241608053445816,
"learning_rate": 0.000718625,
"loss": 2.6349,
"step": 23000
},
{
"epoch": 10.781934614088305,
"grad_norm": 0.20968548953533173,
"learning_rate": 0.0007498437500000001,
"loss": 2.6475,
"step": 24000
},
{
"epoch": 11.0,
"eval_accuracy": 0.45958520603061825,
"eval_loss": 2.81199049949646,
"eval_runtime": 70.7302,
"eval_samples_per_second": 858.219,
"eval_steps_per_second": 13.417,
"step": 24486
},
{
"epoch": 11.230985282552522,
"grad_norm": 0.21336565911769867,
"learning_rate": 0.00078109375,
"loss": 2.6285,
"step": 25000
},
{
"epoch": 11.680372991798674,
"grad_norm": 0.19908477365970612,
"learning_rate": 0.0008123125,
"loss": 2.6325,
"step": 26000
},
{
"epoch": 12.0,
"eval_accuracy": 0.46038157479063585,
"eval_loss": 2.805652618408203,
"eval_runtime": 70.672,
"eval_samples_per_second": 858.926,
"eval_steps_per_second": 13.428,
"step": 26712
},
{
"epoch": 12.129423660262892,
"grad_norm": 0.1926439106464386,
"learning_rate": 0.0008435625,
"loss": 2.6264,
"step": 27000
},
{
"epoch": 12.578811369509044,
"grad_norm": 0.19238470494747162,
"learning_rate": 0.00087478125,
"loss": 2.6204,
"step": 28000
},
{
"epoch": 13.0,
"eval_accuracy": 0.46109079327424685,
"eval_loss": 2.7990734577178955,
"eval_runtime": 71.4566,
"eval_samples_per_second": 849.495,
"eval_steps_per_second": 13.281,
"step": 28938
},
{
"epoch": 13.02786203797326,
"grad_norm": 0.19909153878688812,
"learning_rate": 0.0009060312499999999,
"loss": 2.6278,
"step": 29000
},
{
"epoch": 13.477249747219414,
"grad_norm": 0.17719709873199463,
"learning_rate": 0.00093725,
"loss": 2.6029,
"step": 30000
},
{
"epoch": 13.926637456465565,
"grad_norm": 0.18759088218212128,
"learning_rate": 0.00096846875,
"loss": 2.6273,
"step": 31000
},
{
"epoch": 14.0,
"eval_accuracy": 0.4612619282499333,
"eval_loss": 2.797557830810547,
"eval_runtime": 71.6456,
"eval_samples_per_second": 847.253,
"eval_steps_per_second": 13.246,
"step": 31164
},
{
"epoch": 14.375688124929782,
"grad_norm": 0.18803943693637848,
"learning_rate": 0.00099971875,
"loss": 2.5963,
"step": 32000
},
{
"epoch": 14.825075834175935,
"grad_norm": 0.17913933098316193,
"learning_rate": 0.00092072,
"loss": 2.6134,
"step": 33000
},
{
"epoch": 15.0,
"eval_accuracy": 0.4638461374467747,
"eval_loss": 2.778160810470581,
"eval_runtime": 71.3541,
"eval_samples_per_second": 850.715,
"eval_steps_per_second": 13.3,
"step": 33390
},
{
"epoch": 15.274126502640152,
"grad_norm": 0.1831241399049759,
"learning_rate": 0.00084072,
"loss": 2.5729,
"step": 34000
},
{
"epoch": 15.723514211886306,
"grad_norm": 0.1739753931760788,
"learning_rate": 0.0007608000000000001,
"loss": 2.5675,
"step": 35000
},
{
"epoch": 16.0,
"eval_accuracy": 0.4674098666516786,
"eval_loss": 2.749758720397949,
"eval_runtime": 71.8388,
"eval_samples_per_second": 844.975,
"eval_steps_per_second": 13.21,
"step": 35616
},
{
"epoch": 16.17256488035052,
"grad_norm": 0.17786382138729095,
"learning_rate": 0.0006808,
"loss": 2.5367,
"step": 36000
},
{
"epoch": 16.621952589596674,
"grad_norm": 0.17862257361412048,
"learning_rate": 0.0006008,
"loss": 2.5158,
"step": 37000
},
{
"epoch": 17.0,
"eval_accuracy": 0.47079748640255414,
"eval_loss": 2.7294299602508545,
"eval_runtime": 71.4426,
"eval_samples_per_second": 849.661,
"eval_steps_per_second": 13.283,
"step": 37842
},
{
"epoch": 17.07100325806089,
"grad_norm": 0.17703622579574585,
"learning_rate": 0.00052088,
"loss": 2.5007,
"step": 38000
},
{
"epoch": 17.520390967307044,
"grad_norm": 0.1765511929988861,
"learning_rate": 0.00044088,
"loss": 2.4562,
"step": 39000
},
{
"epoch": 17.969778676553197,
"grad_norm": 0.18510127067565918,
"learning_rate": 0.00036088,
"loss": 2.4606,
"step": 40000
},
{
"epoch": 18.0,
"eval_accuracy": 0.47380801485366314,
"eval_loss": 2.705484628677368,
"eval_runtime": 71.4487,
"eval_samples_per_second": 849.589,
"eval_steps_per_second": 13.282,
"step": 40068
},
{
"epoch": 18.418829345017414,
"grad_norm": 0.19680210947990417,
"learning_rate": 0.00028095999999999997,
"loss": 2.3926,
"step": 41000
},
{
"epoch": 18.868217054263567,
"grad_norm": 0.1969345360994339,
"learning_rate": 0.00020096,
"loss": 2.3953,
"step": 42000
},
{
"epoch": 19.0,
"eval_accuracy": 0.47649991827642724,
"eval_loss": 2.692018985748291,
"eval_runtime": 71.5249,
"eval_samples_per_second": 848.683,
"eval_steps_per_second": 13.268,
"step": 42294
},
{
"epoch": 19.317267722727784,
"grad_norm": 0.21539689600467682,
"learning_rate": 0.00012096,
"loss": 2.341,
"step": 43000
},
{
"epoch": 19.766655431973934,
"grad_norm": 0.2021603137254715,
"learning_rate": 4.096e-05,
"loss": 2.3225,
"step": 44000
},
{
"epoch": 19.99134928659701,
"eval_accuracy": 0.4780591265203653,
"eval_loss": 2.6889467239379883,
"eval_runtime": 71.2633,
"eval_samples_per_second": 851.799,
"eval_steps_per_second": 13.317,
"step": 44500
},
{
"epoch": 19.99134928659701,
"step": 44500,
"total_flos": 1.487793172119552e+18,
"train_loss": 2.799591330538975,
"train_runtime": 29586.3987,
"train_samples_per_second": 385.072,
"train_steps_per_second": 1.504
}
],
"logging_steps": 1000,
"max_steps": 44500,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.487793172119552e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}