{ "best_metric": 2.690006971359253, "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy_no-num-adj-earlystop-bpe_seed-42_1e-3/checkpoint-44480", "epoch": 19.99134539732494, "eval_steps": 500, "global_step": 44480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.44958974935371476, "grad_norm": 0.5849416255950928, "learning_rate": 3.125e-05, "loss": 5.5771, "step": 1000 }, { "epoch": 0.8991794987074295, "grad_norm": 0.6677811145782471, "learning_rate": 6.25e-05, "loss": 4.0896, "step": 2000 }, { "epoch": 1.0, "eval_accuracy": 0.35931850789096126, "eval_loss": 3.830660104751587, "eval_runtime": 73.9335, "eval_samples_per_second": 820.738, "eval_steps_per_second": 12.836, "step": 2225 }, { "epoch": 1.348432055749129, "grad_norm": 0.5427906513214111, "learning_rate": 9.375e-05, "loss": 3.6885, "step": 3000 }, { "epoch": 1.7980218051028438, "grad_norm": 0.5098850727081299, "learning_rate": 0.000125, "loss": 3.4325, "step": 4000 }, { "epoch": 2.0, "eval_accuracy": 0.40814210192976336, "eval_loss": 3.3144962787628174, "eval_runtime": 73.7109, "eval_samples_per_second": 823.216, "eval_steps_per_second": 12.875, "step": 4450 }, { "epoch": 2.2472743621445432, "grad_norm": 0.47539061307907104, "learning_rate": 0.00015625, "loss": 3.2344, "step": 5000 }, { "epoch": 2.696864111498258, "grad_norm": 0.4557252824306488, "learning_rate": 0.0001875, "loss": 3.1208, "step": 6000 }, { "epoch": 3.0, "eval_accuracy": 0.42951116108935333, "eval_loss": 3.1043691635131836, "eval_runtime": 73.4454, "eval_samples_per_second": 826.192, "eval_steps_per_second": 12.921, "step": 6675 }, { "epoch": 3.1461166685399573, "grad_norm": 0.4797649085521698, "learning_rate": 0.00021875, "loss": 3.0251, "step": 7000 }, { "epoch": 3.595706417893672, "grad_norm": 0.39974284172058105, "learning_rate": 0.00025, "loss": 2.957, "step": 8000 }, { "epoch": 4.0, "eval_accuracy": 0.43958987682086675, "eval_loss": 2.997297763824463, "eval_runtime": 72.7828, "eval_samples_per_second": 833.713, "eval_steps_per_second": 13.039, "step": 8900 }, { "epoch": 4.044958974935372, "grad_norm": 0.37127092480659485, "learning_rate": 0.00028125000000000003, "loss": 2.91, "step": 9000 }, { "epoch": 4.4945487242890865, "grad_norm": 0.35168108344078064, "learning_rate": 0.0003125, "loss": 2.8553, "step": 10000 }, { "epoch": 4.944138473642801, "grad_norm": 0.33759039640426636, "learning_rate": 0.00034375, "loss": 2.8381, "step": 11000 }, { "epoch": 5.0, "eval_accuracy": 0.4463711272247858, "eval_loss": 2.9338483810424805, "eval_runtime": 73.014, "eval_samples_per_second": 831.073, "eval_steps_per_second": 12.998, "step": 11125 }, { "epoch": 5.3933910306845005, "grad_norm": 0.3382190465927124, "learning_rate": 0.000375, "loss": 2.7866, "step": 12000 }, { "epoch": 5.842980780038215, "grad_norm": 0.30114060640335083, "learning_rate": 0.00040625000000000004, "loss": 2.7819, "step": 13000 }, { "epoch": 6.0, "eval_accuracy": 0.4507991133170473, "eval_loss": 2.8903539180755615, "eval_runtime": 72.6589, "eval_samples_per_second": 835.135, "eval_steps_per_second": 13.061, "step": 13350 }, { "epoch": 6.292233337079915, "grad_norm": 0.29889941215515137, "learning_rate": 0.0004375, "loss": 2.7435, "step": 14000 }, { "epoch": 6.741823086433629, "grad_norm": 0.2726060450077057, "learning_rate": 0.00046871875, "loss": 2.7385, "step": 15000 }, { "epoch": 7.0, "eval_accuracy": 0.4530601548463815, "eval_loss": 2.8666210174560547, "eval_runtime": 72.57, "eval_samples_per_second": 836.158, "eval_steps_per_second": 13.077, "step": 15575 }, { "epoch": 7.191075643475329, "grad_norm": 0.2712298333644867, "learning_rate": 0.00049996875, "loss": 2.7141, "step": 16000 }, { "epoch": 7.640665392829043, "grad_norm": 0.2567969560623169, "learning_rate": 0.00053121875, "loss": 2.7061, "step": 17000 }, { "epoch": 8.0, "eval_accuracy": 0.4559095609239081, "eval_loss": 2.8456263542175293, "eval_runtime": 72.544, "eval_samples_per_second": 836.458, "eval_steps_per_second": 13.082, "step": 17800 }, { "epoch": 8.089917949870744, "grad_norm": 0.24305634200572968, "learning_rate": 0.0005624687499999999, "loss": 2.6951, "step": 18000 }, { "epoch": 8.539507699224458, "grad_norm": 0.23454323410987854, "learning_rate": 0.0005936875, "loss": 2.6745, "step": 19000 }, { "epoch": 8.989097448578173, "grad_norm": 0.2262556552886963, "learning_rate": 0.0006249375000000001, "loss": 2.6855, "step": 20000 }, { "epoch": 9.0, "eval_accuracy": 0.4575118590613569, "eval_loss": 2.833211898803711, "eval_runtime": 72.5848, "eval_samples_per_second": 835.988, "eval_steps_per_second": 13.074, "step": 20025 }, { "epoch": 9.438350005619872, "grad_norm": 0.2205825001001358, "learning_rate": 0.0006561562500000001, "loss": 2.6456, "step": 21000 }, { "epoch": 9.887939754973587, "grad_norm": 0.21933791041374207, "learning_rate": 0.00068740625, "loss": 2.6669, "step": 22000 }, { "epoch": 10.0, "eval_accuracy": 0.45864309072343507, "eval_loss": 2.819795608520508, "eval_runtime": 72.9477, "eval_samples_per_second": 831.828, "eval_steps_per_second": 13.009, "step": 22250 }, { "epoch": 10.337192312015286, "grad_norm": 0.21199771761894226, "learning_rate": 0.000718625, "loss": 2.636, "step": 23000 }, { "epoch": 10.786782061369001, "grad_norm": 0.20166124403476715, "learning_rate": 0.000749875, "loss": 2.6499, "step": 24000 }, { "epoch": 11.0, "eval_accuracy": 0.4597305698812155, "eval_loss": 2.8118433952331543, "eval_runtime": 73.1507, "eval_samples_per_second": 829.521, "eval_steps_per_second": 12.973, "step": 24475 }, { "epoch": 11.2360346184107, "grad_norm": 0.20014500617980957, "learning_rate": 0.000781125, "loss": 2.6298, "step": 25000 }, { "epoch": 11.685624367764415, "grad_norm": 0.19325494766235352, "learning_rate": 0.00081234375, "loss": 2.6351, "step": 26000 }, { "epoch": 12.0, "eval_accuracy": 0.4601314513940052, "eval_loss": 2.807219982147217, "eval_runtime": 72.6854, "eval_samples_per_second": 834.831, "eval_steps_per_second": 13.056, "step": 26700 }, { "epoch": 12.134876924806115, "grad_norm": 0.19165903329849243, "learning_rate": 0.00084359375, "loss": 2.6265, "step": 27000 }, { "epoch": 12.58446667415983, "grad_norm": 0.1863769292831421, "learning_rate": 0.0008748437500000001, "loss": 2.6204, "step": 28000 }, { "epoch": 13.0, "eval_accuracy": 0.4611660010081818, "eval_loss": 2.802619218826294, "eval_runtime": 72.6835, "eval_samples_per_second": 834.852, "eval_steps_per_second": 13.057, "step": 28925 }, { "epoch": 13.033719231201529, "grad_norm": 0.19991189241409302, "learning_rate": 0.00090609375, "loss": 2.6286, "step": 29000 }, { "epoch": 13.483308980555243, "grad_norm": 0.18545052409172058, "learning_rate": 0.0009373125, "loss": 2.6068, "step": 30000 }, { "epoch": 13.932898729908958, "grad_norm": 0.17478196322917938, "learning_rate": 0.0009685625, "loss": 2.6277, "step": 31000 }, { "epoch": 14.0, "eval_accuracy": 0.4612847208758256, "eval_loss": 2.801252841949463, "eval_runtime": 72.4155, "eval_samples_per_second": 837.942, "eval_steps_per_second": 13.105, "step": 31150 }, { "epoch": 14.382151286950657, "grad_norm": 0.17678463459014893, "learning_rate": 0.00099978125, "loss": 2.5975, "step": 32000 }, { "epoch": 14.831741036304372, "grad_norm": 0.17033128440380096, "learning_rate": 0.0009204326923076923, "loss": 2.6136, "step": 33000 }, { "epoch": 15.0, "eval_accuracy": 0.46383548541367764, "eval_loss": 2.779118061065674, "eval_runtime": 72.723, "eval_samples_per_second": 834.399, "eval_steps_per_second": 13.05, "step": 33375 }, { "epoch": 15.280993593346071, "grad_norm": 0.1788545548915863, "learning_rate": 0.0008403044871794871, "loss": 2.5726, "step": 34000 }, { "epoch": 15.730583342699786, "grad_norm": 0.17410264909267426, "learning_rate": 0.0007602564102564103, "loss": 2.5687, "step": 35000 }, { "epoch": 16.0, "eval_accuracy": 0.4676111908177905, "eval_loss": 2.75136661529541, "eval_runtime": 72.8808, "eval_samples_per_second": 832.592, "eval_steps_per_second": 13.021, "step": 35600 }, { "epoch": 16.179835899741487, "grad_norm": 0.17518466711044312, "learning_rate": 0.0006801282051282051, "loss": 2.5356, "step": 36000 }, { "epoch": 16.6294256490952, "grad_norm": 0.17421123385429382, "learning_rate": 0.0006000801282051283, "loss": 2.5184, "step": 37000 }, { "epoch": 17.0, "eval_accuracy": 0.4707787558002766, "eval_loss": 2.728271245956421, "eval_runtime": 72.8051, "eval_samples_per_second": 833.458, "eval_steps_per_second": 13.035, "step": 37825 }, { "epoch": 17.0786782061369, "grad_norm": 0.18264968693256378, "learning_rate": 0.0005199519230769231, "loss": 2.4989, "step": 38000 }, { "epoch": 17.528267955490616, "grad_norm": 0.18594865500926971, "learning_rate": 0.00043990384615384616, "loss": 2.4571, "step": 39000 }, { "epoch": 17.97785770484433, "grad_norm": 0.17989173531532288, "learning_rate": 0.00035977564102564105, "loss": 2.4613, "step": 40000 }, { "epoch": 18.0, "eval_accuracy": 0.47402006023239884, "eval_loss": 2.705965518951416, "eval_runtime": 72.7008, "eval_samples_per_second": 834.654, "eval_steps_per_second": 13.054, "step": 40050 }, { "epoch": 18.42711026188603, "grad_norm": 0.19071288406848907, "learning_rate": 0.0002797275641025641, "loss": 2.3913, "step": 41000 }, { "epoch": 18.876700011239745, "grad_norm": 0.19211626052856445, "learning_rate": 0.00019959935897435898, "loss": 2.3966, "step": 42000 }, { "epoch": 19.0, "eval_accuracy": 0.4765565421949927, "eval_loss": 2.6947293281555176, "eval_runtime": 72.857, "eval_samples_per_second": 832.864, "eval_steps_per_second": 13.026, "step": 42275 }, { "epoch": 19.325952568281444, "grad_norm": 0.19902721047401428, "learning_rate": 0.00011947115384615386, "loss": 2.3404, "step": 43000 }, { "epoch": 19.775542317635157, "grad_norm": 0.20530302822589874, "learning_rate": 3.942307692307692e-05, "loss": 2.3227, "step": 44000 }, { "epoch": 19.99134539732494, "eval_accuracy": 0.4781093360218181, "eval_loss": 2.690006971359253, "eval_runtime": 72.887, "eval_samples_per_second": 832.521, "eval_steps_per_second": 13.02, "step": 44480 }, { "epoch": 19.99134539732494, "step": 44480, "total_flos": 1.487139158163456e+18, "train_loss": 2.8016022716494775, "train_runtime": 30047.0507, "train_samples_per_second": 379.002, "train_steps_per_second": 1.48 } ], "logging_steps": 1000, "max_steps": 44480, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.487139158163456e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }