{ "best_metric": 2.6889467239379883, "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy_no-num-adj-earlystop-bpe_seed-42_1e-3/checkpoint-44500", "epoch": 19.99134928659701, "eval_steps": 500, "global_step": 44500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.4493877092461521, "grad_norm": 0.4442262053489685, "learning_rate": 3.125e-05, "loss": 5.5692, "step": 1000 }, { "epoch": 0.8987754184923042, "grad_norm": 0.5283451676368713, "learning_rate": 6.25e-05, "loss": 4.0804, "step": 2000 }, { "epoch": 1.0, "eval_accuracy": 0.35990021325653254, "eval_loss": 3.8291473388671875, "eval_runtime": 71.8623, "eval_samples_per_second": 844.699, "eval_steps_per_second": 13.206, "step": 2226 }, { "epoch": 1.3478260869565217, "grad_norm": 0.5473753809928894, "learning_rate": 9.375e-05, "loss": 3.6828, "step": 3000 }, { "epoch": 1.7972137962026737, "grad_norm": 0.6101972460746765, "learning_rate": 0.000125, "loss": 3.431, "step": 4000 }, { "epoch": 2.0, "eval_accuracy": 0.4075588167460322, "eval_loss": 3.316692590713501, "eval_runtime": 71.9883, "eval_samples_per_second": 843.22, "eval_steps_per_second": 13.183, "step": 4452 }, { "epoch": 2.2462644646668912, "grad_norm": 0.5061201453208923, "learning_rate": 0.00015625, "loss": 3.2333, "step": 5000 }, { "epoch": 2.6956521739130435, "grad_norm": 0.4999963343143463, "learning_rate": 0.0001875, "loss": 3.1165, "step": 6000 }, { "epoch": 3.0, "eval_accuracy": 0.4289709096382779, "eval_loss": 3.102813482284546, "eval_runtime": 71.4078, "eval_samples_per_second": 850.075, "eval_steps_per_second": 13.29, "step": 6678 }, { "epoch": 3.144702842377261, "grad_norm": 0.4476725161075592, "learning_rate": 0.00021875, "loss": 3.0211, "step": 7000 }, { "epoch": 3.594090551623413, "grad_norm": 0.3835226595401764, "learning_rate": 0.00025, "loss": 2.9545, "step": 8000 }, { "epoch": 4.0, "eval_accuracy": 0.4395799214549251, "eval_loss": 2.998249053955078, "eval_runtime": 71.272, "eval_samples_per_second": 851.695, "eval_steps_per_second": 13.315, "step": 8904 }, { "epoch": 4.04314122008763, "grad_norm": 0.36295273900032043, "learning_rate": 0.00028125000000000003, "loss": 2.9068, "step": 9000 }, { "epoch": 4.4925289293337825, "grad_norm": 0.368268221616745, "learning_rate": 0.0003125, "loss": 2.8533, "step": 10000 }, { "epoch": 4.941916638579935, "grad_norm": 0.3496946096420288, "learning_rate": 0.00034375, "loss": 2.8357, "step": 11000 }, { "epoch": 5.0, "eval_accuracy": 0.4463052869660269, "eval_loss": 2.9296414852142334, "eval_runtime": 70.9885, "eval_samples_per_second": 855.097, "eval_steps_per_second": 13.368, "step": 11130 }, { "epoch": 5.390967307044153, "grad_norm": 0.32746192812919617, "learning_rate": 0.000375, "loss": 2.7838, "step": 12000 }, { "epoch": 5.840355016290305, "grad_norm": 0.3100210726261139, "learning_rate": 0.00040625000000000004, "loss": 2.7805, "step": 13000 }, { "epoch": 6.0, "eval_accuracy": 0.4504751272852721, "eval_loss": 2.892799139022827, "eval_runtime": 70.9528, "eval_samples_per_second": 855.527, "eval_steps_per_second": 13.375, "step": 13356 }, { "epoch": 6.289405684754522, "grad_norm": 0.28876417875289917, "learning_rate": 0.0004375, "loss": 2.7419, "step": 14000 }, { "epoch": 6.738793394000674, "grad_norm": 0.2749331295490265, "learning_rate": 0.00046871875, "loss": 2.7354, "step": 15000 }, { "epoch": 7.0, "eval_accuracy": 0.45374956150296436, "eval_loss": 2.8629629611968994, "eval_runtime": 70.9856, "eval_samples_per_second": 855.131, "eval_steps_per_second": 13.369, "step": 15582 }, { "epoch": 7.187844062464891, "grad_norm": 0.2688666582107544, "learning_rate": 0.00049996875, "loss": 2.7138, "step": 16000 }, { "epoch": 7.637231771711043, "grad_norm": 0.25027596950531006, "learning_rate": 0.00053121875, "loss": 2.702, "step": 17000 }, { "epoch": 8.0, "eval_accuracy": 0.45609111952250175, "eval_loss": 2.8440542221069336, "eval_runtime": 70.7062, "eval_samples_per_second": 858.51, "eval_steps_per_second": 13.422, "step": 17808 }, { "epoch": 8.08628244017526, "grad_norm": 0.2555592656135559, "learning_rate": 0.0005624375, "loss": 2.6924, "step": 18000 }, { "epoch": 8.535670149421414, "grad_norm": 0.24662208557128906, "learning_rate": 0.0005936875, "loss": 2.6741, "step": 19000 }, { "epoch": 8.985057858667565, "grad_norm": 0.2206139713525772, "learning_rate": 0.00062490625, "loss": 2.684, "step": 20000 }, { "epoch": 9.0, "eval_accuracy": 0.4570897622005542, "eval_loss": 2.835329294204712, "eval_runtime": 70.6801, "eval_samples_per_second": 858.827, "eval_steps_per_second": 13.427, "step": 20034 }, { "epoch": 9.434108527131784, "grad_norm": 0.21738700568675995, "learning_rate": 0.0006561562500000001, "loss": 2.6454, "step": 21000 }, { "epoch": 9.883496236377935, "grad_norm": 0.2152646780014038, "learning_rate": 0.0006873749999999999, "loss": 2.6647, "step": 22000 }, { "epoch": 10.0, "eval_accuracy": 0.45841717267447984, "eval_loss": 2.819920778274536, "eval_runtime": 70.5746, "eval_samples_per_second": 860.111, "eval_steps_per_second": 13.447, "step": 22260 }, { "epoch": 10.332546904842152, "grad_norm": 0.2241608053445816, "learning_rate": 0.000718625, "loss": 2.6349, "step": 23000 }, { "epoch": 10.781934614088305, "grad_norm": 0.20968548953533173, "learning_rate": 0.0007498437500000001, "loss": 2.6475, "step": 24000 }, { "epoch": 11.0, "eval_accuracy": 0.45958520603061825, "eval_loss": 2.81199049949646, "eval_runtime": 70.7302, "eval_samples_per_second": 858.219, "eval_steps_per_second": 13.417, "step": 24486 }, { "epoch": 11.230985282552522, "grad_norm": 0.21336565911769867, "learning_rate": 0.00078109375, "loss": 2.6285, "step": 25000 }, { "epoch": 11.680372991798674, "grad_norm": 0.19908477365970612, "learning_rate": 0.0008123125, "loss": 2.6325, "step": 26000 }, { "epoch": 12.0, "eval_accuracy": 0.46038157479063585, "eval_loss": 2.805652618408203, "eval_runtime": 70.672, "eval_samples_per_second": 858.926, "eval_steps_per_second": 13.428, "step": 26712 }, { "epoch": 12.129423660262892, "grad_norm": 0.1926439106464386, "learning_rate": 0.0008435625, "loss": 2.6264, "step": 27000 }, { "epoch": 12.578811369509044, "grad_norm": 0.19238470494747162, "learning_rate": 0.00087478125, "loss": 2.6204, "step": 28000 }, { "epoch": 13.0, "eval_accuracy": 0.46109079327424685, "eval_loss": 2.7990734577178955, "eval_runtime": 71.4566, "eval_samples_per_second": 849.495, "eval_steps_per_second": 13.281, "step": 28938 }, { "epoch": 13.02786203797326, "grad_norm": 0.19909153878688812, "learning_rate": 0.0009060312499999999, "loss": 2.6278, "step": 29000 }, { "epoch": 13.477249747219414, "grad_norm": 0.17719709873199463, "learning_rate": 0.00093725, "loss": 2.6029, "step": 30000 }, { "epoch": 13.926637456465565, "grad_norm": 0.18759088218212128, "learning_rate": 0.00096846875, "loss": 2.6273, "step": 31000 }, { "epoch": 14.0, "eval_accuracy": 0.4612619282499333, "eval_loss": 2.797557830810547, "eval_runtime": 71.6456, "eval_samples_per_second": 847.253, "eval_steps_per_second": 13.246, "step": 31164 }, { "epoch": 14.375688124929782, "grad_norm": 0.18803943693637848, "learning_rate": 0.00099971875, "loss": 2.5963, "step": 32000 }, { "epoch": 14.825075834175935, "grad_norm": 0.17913933098316193, "learning_rate": 0.00092072, "loss": 2.6134, "step": 33000 }, { "epoch": 15.0, "eval_accuracy": 0.4638461374467747, "eval_loss": 2.778160810470581, "eval_runtime": 71.3541, "eval_samples_per_second": 850.715, "eval_steps_per_second": 13.3, "step": 33390 }, { "epoch": 15.274126502640152, "grad_norm": 0.1831241399049759, "learning_rate": 0.00084072, "loss": 2.5729, "step": 34000 }, { "epoch": 15.723514211886306, "grad_norm": 0.1739753931760788, "learning_rate": 0.0007608000000000001, "loss": 2.5675, "step": 35000 }, { "epoch": 16.0, "eval_accuracy": 0.4674098666516786, "eval_loss": 2.749758720397949, "eval_runtime": 71.8388, "eval_samples_per_second": 844.975, "eval_steps_per_second": 13.21, "step": 35616 }, { "epoch": 16.17256488035052, "grad_norm": 0.17786382138729095, "learning_rate": 0.0006808, "loss": 2.5367, "step": 36000 }, { "epoch": 16.621952589596674, "grad_norm": 0.17862257361412048, "learning_rate": 0.0006008, "loss": 2.5158, "step": 37000 }, { "epoch": 17.0, "eval_accuracy": 0.47079748640255414, "eval_loss": 2.7294299602508545, "eval_runtime": 71.4426, "eval_samples_per_second": 849.661, "eval_steps_per_second": 13.283, "step": 37842 }, { "epoch": 17.07100325806089, "grad_norm": 0.17703622579574585, "learning_rate": 0.00052088, "loss": 2.5007, "step": 38000 }, { "epoch": 17.520390967307044, "grad_norm": 0.1765511929988861, "learning_rate": 0.00044088, "loss": 2.4562, "step": 39000 }, { "epoch": 17.969778676553197, "grad_norm": 0.18510127067565918, "learning_rate": 0.00036088, "loss": 2.4606, "step": 40000 }, { "epoch": 18.0, "eval_accuracy": 0.47380801485366314, "eval_loss": 2.705484628677368, "eval_runtime": 71.4487, "eval_samples_per_second": 849.589, "eval_steps_per_second": 13.282, "step": 40068 }, { "epoch": 18.418829345017414, "grad_norm": 0.19680210947990417, "learning_rate": 0.00028095999999999997, "loss": 2.3926, "step": 41000 }, { "epoch": 18.868217054263567, "grad_norm": 0.1969345360994339, "learning_rate": 0.00020096, "loss": 2.3953, "step": 42000 }, { "epoch": 19.0, "eval_accuracy": 0.47649991827642724, "eval_loss": 2.692018985748291, "eval_runtime": 71.5249, "eval_samples_per_second": 848.683, "eval_steps_per_second": 13.268, "step": 42294 }, { "epoch": 19.317267722727784, "grad_norm": 0.21539689600467682, "learning_rate": 0.00012096, "loss": 2.341, "step": 43000 }, { "epoch": 19.766655431973934, "grad_norm": 0.2021603137254715, "learning_rate": 4.096e-05, "loss": 2.3225, "step": 44000 }, { "epoch": 19.99134928659701, "eval_accuracy": 0.4780591265203653, "eval_loss": 2.6889467239379883, "eval_runtime": 71.2633, "eval_samples_per_second": 851.799, "eval_steps_per_second": 13.317, "step": 44500 }, { "epoch": 19.99134928659701, "step": 44500, "total_flos": 1.487793172119552e+18, "train_loss": 2.799591330538975, "train_runtime": 29586.3987, "train_samples_per_second": 385.072, "train_steps_per_second": 1.504 } ], "logging_steps": 1000, "max_steps": 44500, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.487793172119552e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }