{
  "best_metric": 2.6889467239379883,
  "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy_no-num-adj-earlystop-bpe_seed-42_1e-3/checkpoint-44500",
  "epoch": 19.99134928659701,
  "eval_steps": 500,
  "global_step": 44500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.4493877092461521,
      "grad_norm": 0.4442262053489685,
      "learning_rate": 3.125e-05,
      "loss": 5.5692,
      "step": 1000
    },
    {
      "epoch": 0.8987754184923042,
      "grad_norm": 0.5283451676368713,
      "learning_rate": 6.25e-05,
      "loss": 4.0804,
      "step": 2000
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.35990021325653254,
      "eval_loss": 3.8291473388671875,
      "eval_runtime": 71.8623,
      "eval_samples_per_second": 844.699,
      "eval_steps_per_second": 13.206,
      "step": 2226
    },
    {
      "epoch": 1.3478260869565217,
      "grad_norm": 0.5473753809928894,
      "learning_rate": 9.375e-05,
      "loss": 3.6828,
      "step": 3000
    },
    {
      "epoch": 1.7972137962026737,
      "grad_norm": 0.6101972460746765,
      "learning_rate": 0.000125,
      "loss": 3.431,
      "step": 4000
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.4075588167460322,
      "eval_loss": 3.316692590713501,
      "eval_runtime": 71.9883,
      "eval_samples_per_second": 843.22,
      "eval_steps_per_second": 13.183,
      "step": 4452
    },
    {
      "epoch": 2.2462644646668912,
      "grad_norm": 0.5061201453208923,
      "learning_rate": 0.00015625,
      "loss": 3.2333,
      "step": 5000
    },
    {
      "epoch": 2.6956521739130435,
      "grad_norm": 0.4999963343143463,
      "learning_rate": 0.0001875,
      "loss": 3.1165,
      "step": 6000
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.4289709096382779,
      "eval_loss": 3.102813482284546,
      "eval_runtime": 71.4078,
      "eval_samples_per_second": 850.075,
      "eval_steps_per_second": 13.29,
      "step": 6678
    },
    {
      "epoch": 3.144702842377261,
      "grad_norm": 0.4476725161075592,
      "learning_rate": 0.00021875,
      "loss": 3.0211,
      "step": 7000
    },
    {
      "epoch": 3.594090551623413,
      "grad_norm": 0.3835226595401764,
      "learning_rate": 0.00025,
      "loss": 2.9545,
      "step": 8000
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.4395799214549251,
      "eval_loss": 2.998249053955078,
      "eval_runtime": 71.272,
      "eval_samples_per_second": 851.695,
      "eval_steps_per_second": 13.315,
      "step": 8904
    },
    {
      "epoch": 4.04314122008763,
      "grad_norm": 0.36295273900032043,
      "learning_rate": 0.00028125000000000003,
      "loss": 2.9068,
      "step": 9000
    },
    {
      "epoch": 4.4925289293337825,
      "grad_norm": 0.368268221616745,
      "learning_rate": 0.0003125,
      "loss": 2.8533,
      "step": 10000
    },
    {
      "epoch": 4.941916638579935,
      "grad_norm": 0.3496946096420288,
      "learning_rate": 0.00034375,
      "loss": 2.8357,
      "step": 11000
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.4463052869660269,
      "eval_loss": 2.9296414852142334,
      "eval_runtime": 70.9885,
      "eval_samples_per_second": 855.097,
      "eval_steps_per_second": 13.368,
      "step": 11130
    },
    {
      "epoch": 5.390967307044153,
      "grad_norm": 0.32746192812919617,
      "learning_rate": 0.000375,
      "loss": 2.7838,
      "step": 12000
    },
    {
      "epoch": 5.840355016290305,
      "grad_norm": 0.3100210726261139,
      "learning_rate": 0.00040625000000000004,
      "loss": 2.7805,
      "step": 13000
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.4504751272852721,
      "eval_loss": 2.892799139022827,
      "eval_runtime": 70.9528,
      "eval_samples_per_second": 855.527,
      "eval_steps_per_second": 13.375,
      "step": 13356
    },
    {
      "epoch": 6.289405684754522,
      "grad_norm": 0.28876417875289917,
      "learning_rate": 0.0004375,
      "loss": 2.7419,
      "step": 14000
    },
    {
      "epoch": 6.738793394000674,
      "grad_norm": 0.2749331295490265,
      "learning_rate": 0.00046871875,
      "loss": 2.7354,
      "step": 15000
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.45374956150296436,
      "eval_loss": 2.8629629611968994,
      "eval_runtime": 70.9856,
      "eval_samples_per_second": 855.131,
      "eval_steps_per_second": 13.369,
      "step": 15582
    },
    {
      "epoch": 7.187844062464891,
      "grad_norm": 0.2688666582107544,
      "learning_rate": 0.00049996875,
      "loss": 2.7138,
      "step": 16000
    },
    {
      "epoch": 7.637231771711043,
      "grad_norm": 0.25027596950531006,
      "learning_rate": 0.00053121875,
      "loss": 2.702,
      "step": 17000
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.45609111952250175,
      "eval_loss": 2.8440542221069336,
      "eval_runtime": 70.7062,
      "eval_samples_per_second": 858.51,
      "eval_steps_per_second": 13.422,
      "step": 17808
    },
    {
      "epoch": 8.08628244017526,
      "grad_norm": 0.2555592656135559,
      "learning_rate": 0.0005624375,
      "loss": 2.6924,
      "step": 18000
    },
    {
      "epoch": 8.535670149421414,
      "grad_norm": 0.24662208557128906,
      "learning_rate": 0.0005936875,
      "loss": 2.6741,
      "step": 19000
    },
    {
      "epoch": 8.985057858667565,
      "grad_norm": 0.2206139713525772,
      "learning_rate": 0.00062490625,
      "loss": 2.684,
      "step": 20000
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.4570897622005542,
      "eval_loss": 2.835329294204712,
      "eval_runtime": 70.6801,
      "eval_samples_per_second": 858.827,
      "eval_steps_per_second": 13.427,
      "step": 20034
    },
    {
      "epoch": 9.434108527131784,
      "grad_norm": 0.21738700568675995,
      "learning_rate": 0.0006561562500000001,
      "loss": 2.6454,
      "step": 21000
    },
    {
      "epoch": 9.883496236377935,
      "grad_norm": 0.2152646780014038,
      "learning_rate": 0.0006873749999999999,
      "loss": 2.6647,
      "step": 22000
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.45841717267447984,
      "eval_loss": 2.819920778274536,
      "eval_runtime": 70.5746,
      "eval_samples_per_second": 860.111,
      "eval_steps_per_second": 13.447,
      "step": 22260
    },
    {
      "epoch": 10.332546904842152,
      "grad_norm": 0.2241608053445816,
      "learning_rate": 0.000718625,
      "loss": 2.6349,
      "step": 23000
    },
    {
      "epoch": 10.781934614088305,
      "grad_norm": 0.20968548953533173,
      "learning_rate": 0.0007498437500000001,
      "loss": 2.6475,
      "step": 24000
    },
    {
      "epoch": 11.0,
      "eval_accuracy": 0.45958520603061825,
      "eval_loss": 2.81199049949646,
      "eval_runtime": 70.7302,
      "eval_samples_per_second": 858.219,
      "eval_steps_per_second": 13.417,
      "step": 24486
    },
    {
      "epoch": 11.230985282552522,
      "grad_norm": 0.21336565911769867,
      "learning_rate": 0.00078109375,
      "loss": 2.6285,
      "step": 25000
    },
    {
      "epoch": 11.680372991798674,
      "grad_norm": 0.19908477365970612,
      "learning_rate": 0.0008123125,
      "loss": 2.6325,
      "step": 26000
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.46038157479063585,
      "eval_loss": 2.805652618408203,
      "eval_runtime": 70.672,
      "eval_samples_per_second": 858.926,
      "eval_steps_per_second": 13.428,
      "step": 26712
    },
    {
      "epoch": 12.129423660262892,
      "grad_norm": 0.1926439106464386,
      "learning_rate": 0.0008435625,
      "loss": 2.6264,
      "step": 27000
    },
    {
      "epoch": 12.578811369509044,
      "grad_norm": 0.19238470494747162,
      "learning_rate": 0.00087478125,
      "loss": 2.6204,
      "step": 28000
    },
    {
      "epoch": 13.0,
      "eval_accuracy": 0.46109079327424685,
      "eval_loss": 2.7990734577178955,
      "eval_runtime": 71.4566,
      "eval_samples_per_second": 849.495,
      "eval_steps_per_second": 13.281,
      "step": 28938
    },
    {
      "epoch": 13.02786203797326,
      "grad_norm": 0.19909153878688812,
      "learning_rate": 0.0009060312499999999,
      "loss": 2.6278,
      "step": 29000
    },
    {
      "epoch": 13.477249747219414,
      "grad_norm": 0.17719709873199463,
      "learning_rate": 0.00093725,
      "loss": 2.6029,
      "step": 30000
    },
    {
      "epoch": 13.926637456465565,
      "grad_norm": 0.18759088218212128,
      "learning_rate": 0.00096846875,
      "loss": 2.6273,
      "step": 31000
    },
    {
      "epoch": 14.0,
      "eval_accuracy": 0.4612619282499333,
      "eval_loss": 2.797557830810547,
      "eval_runtime": 71.6456,
      "eval_samples_per_second": 847.253,
      "eval_steps_per_second": 13.246,
      "step": 31164
    },
    {
      "epoch": 14.375688124929782,
      "grad_norm": 0.18803943693637848,
      "learning_rate": 0.00099971875,
      "loss": 2.5963,
      "step": 32000
    },
    {
      "epoch": 14.825075834175935,
      "grad_norm": 0.17913933098316193,
      "learning_rate": 0.00092072,
      "loss": 2.6134,
      "step": 33000
    },
    {
      "epoch": 15.0,
      "eval_accuracy": 0.4638461374467747,
      "eval_loss": 2.778160810470581,
      "eval_runtime": 71.3541,
      "eval_samples_per_second": 850.715,
      "eval_steps_per_second": 13.3,
      "step": 33390
    },
    {
      "epoch": 15.274126502640152,
      "grad_norm": 0.1831241399049759,
      "learning_rate": 0.00084072,
      "loss": 2.5729,
      "step": 34000
    },
    {
      "epoch": 15.723514211886306,
      "grad_norm": 0.1739753931760788,
      "learning_rate": 0.0007608000000000001,
      "loss": 2.5675,
      "step": 35000
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.4674098666516786,
      "eval_loss": 2.749758720397949,
      "eval_runtime": 71.8388,
      "eval_samples_per_second": 844.975,
      "eval_steps_per_second": 13.21,
      "step": 35616
    },
    {
      "epoch": 16.17256488035052,
      "grad_norm": 0.17786382138729095,
      "learning_rate": 0.0006808,
      "loss": 2.5367,
      "step": 36000
    },
    {
      "epoch": 16.621952589596674,
      "grad_norm": 0.17862257361412048,
      "learning_rate": 0.0006008,
      "loss": 2.5158,
      "step": 37000
    },
    {
      "epoch": 17.0,
      "eval_accuracy": 0.47079748640255414,
      "eval_loss": 2.7294299602508545,
      "eval_runtime": 71.4426,
      "eval_samples_per_second": 849.661,
      "eval_steps_per_second": 13.283,
      "step": 37842
    },
    {
      "epoch": 17.07100325806089,
      "grad_norm": 0.17703622579574585,
      "learning_rate": 0.00052088,
      "loss": 2.5007,
      "step": 38000
    },
    {
      "epoch": 17.520390967307044,
      "grad_norm": 0.1765511929988861,
      "learning_rate": 0.00044088,
      "loss": 2.4562,
      "step": 39000
    },
    {
      "epoch": 17.969778676553197,
      "grad_norm": 0.18510127067565918,
      "learning_rate": 0.00036088,
      "loss": 2.4606,
      "step": 40000
    },
    {
      "epoch": 18.0,
      "eval_accuracy": 0.47380801485366314,
      "eval_loss": 2.705484628677368,
      "eval_runtime": 71.4487,
      "eval_samples_per_second": 849.589,
      "eval_steps_per_second": 13.282,
      "step": 40068
    },
    {
      "epoch": 18.418829345017414,
      "grad_norm": 0.19680210947990417,
      "learning_rate": 0.00028095999999999997,
      "loss": 2.3926,
      "step": 41000
    },
    {
      "epoch": 18.868217054263567,
      "grad_norm": 0.1969345360994339,
      "learning_rate": 0.00020096,
      "loss": 2.3953,
      "step": 42000
    },
    {
      "epoch": 19.0,
      "eval_accuracy": 0.47649991827642724,
      "eval_loss": 2.692018985748291,
      "eval_runtime": 71.5249,
      "eval_samples_per_second": 848.683,
      "eval_steps_per_second": 13.268,
      "step": 42294
    },
    {
      "epoch": 19.317267722727784,
      "grad_norm": 0.21539689600467682,
      "learning_rate": 0.00012096,
      "loss": 2.341,
      "step": 43000
    },
    {
      "epoch": 19.766655431973934,
      "grad_norm": 0.2021603137254715,
      "learning_rate": 4.096e-05,
      "loss": 2.3225,
      "step": 44000
    },
    {
      "epoch": 19.99134928659701,
      "eval_accuracy": 0.4780591265203653,
      "eval_loss": 2.6889467239379883,
      "eval_runtime": 71.2633,
      "eval_samples_per_second": 851.799,
      "eval_steps_per_second": 13.317,
      "step": 44500
    },
    {
      "epoch": 19.99134928659701,
      "step": 44500,
      "total_flos": 1.487793172119552e+18,
      "train_loss": 2.799591330538975,
      "train_runtime": 29586.3987,
      "train_samples_per_second": 385.072,
      "train_steps_per_second": 1.504
    }
  ],
  "logging_steps": 1000,
  "max_steps": 44500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.487793172119552e+18,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}