{ "best_metric": 0.12822793424129486, "best_model_checkpoint": "vit-large-ai-or-not/checkpoint-1600", "epoch": 4.0, "eval_steps": 200, "global_step": 7448, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10741138560687433, "grad_norm": 0.16569846868515015, "learning_rate": 9.731471535982815e-06, "loss": 0.1089, "step": 200 }, { "epoch": 0.10741138560687433, "eval_accuracy": 0.9497851772287863, "eval_loss": 0.17561669647693634, "eval_runtime": 60.3674, "eval_samples_per_second": 61.689, "eval_steps_per_second": 7.719, "step": 200 }, { "epoch": 0.21482277121374865, "grad_norm": 0.006833943538367748, "learning_rate": 9.462943071965628e-06, "loss": 0.041, "step": 400 }, { "epoch": 0.21482277121374865, "eval_accuracy": 0.9503222341568206, "eval_loss": 0.21512643992900848, "eval_runtime": 62.0849, "eval_samples_per_second": 59.982, "eval_steps_per_second": 7.506, "step": 400 }, { "epoch": 0.322234156820623, "grad_norm": 0.010011733509600163, "learning_rate": 9.194414607948443e-06, "loss": 0.0566, "step": 600 }, { "epoch": 0.322234156820623, "eval_accuracy": 0.9511278195488722, "eval_loss": 0.2124553769826889, "eval_runtime": 61.5803, "eval_samples_per_second": 60.474, "eval_steps_per_second": 7.567, "step": 600 }, { "epoch": 0.4296455424274973, "grad_norm": 22.38845443725586, "learning_rate": 8.927228786251343e-06, "loss": 0.1028, "step": 800 }, { "epoch": 0.4296455424274973, "eval_accuracy": 0.9449516648764769, "eval_loss": 0.20836062729358673, "eval_runtime": 61.4329, "eval_samples_per_second": 60.619, "eval_steps_per_second": 7.586, "step": 800 }, { "epoch": 0.5370569280343717, "grad_norm": 0.2941912114620209, "learning_rate": 8.658700322234156e-06, "loss": 0.1722, "step": 1000 }, { "epoch": 0.5370569280343717, "eval_accuracy": 0.9556928034371643, "eval_loss": 0.16581448912620544, "eval_runtime": 61.8304, "eval_samples_per_second": 60.229, "eval_steps_per_second": 7.537, "step": 1000 }, { "epoch": 0.644468313641246, "grad_norm": 2.7486720085144043, "learning_rate": 8.390171858216972e-06, "loss": 0.1486, "step": 1200 }, { "epoch": 0.644468313641246, "eval_accuracy": 0.959452201933405, "eval_loss": 0.13117314875125885, "eval_runtime": 61.611, "eval_samples_per_second": 60.444, "eval_steps_per_second": 7.564, "step": 1200 }, { "epoch": 0.7518796992481203, "grad_norm": 7.503540515899658, "learning_rate": 8.121643394199787e-06, "loss": 0.1446, "step": 1400 }, { "epoch": 0.7518796992481203, "eval_accuracy": 0.9564983888292159, "eval_loss": 0.16343578696250916, "eval_runtime": 61.6178, "eval_samples_per_second": 60.437, "eval_steps_per_second": 7.563, "step": 1400 }, { "epoch": 0.8592910848549946, "grad_norm": 15.931679725646973, "learning_rate": 7.8531149301826e-06, "loss": 0.1281, "step": 1600 }, { "epoch": 0.8592910848549946, "eval_accuracy": 0.965359828141783, "eval_loss": 0.12822793424129486, "eval_runtime": 62.167, "eval_samples_per_second": 59.903, "eval_steps_per_second": 7.496, "step": 1600 }, { "epoch": 0.966702470461869, "grad_norm": 38.17241668701172, "learning_rate": 7.5845864661654145e-06, "loss": 0.1584, "step": 1800 }, { "epoch": 0.966702470461869, "eval_accuracy": 0.966702470461869, "eval_loss": 0.12948955595493317, "eval_runtime": 61.1637, "eval_samples_per_second": 60.886, "eval_steps_per_second": 7.619, "step": 1800 }, { "epoch": 1.0741138560687433, "grad_norm": 0.005669532343745232, "learning_rate": 7.316058002148228e-06, "loss": 0.0549, "step": 2000 }, { "epoch": 1.0741138560687433, "eval_accuracy": 0.9669709989258861, "eval_loss": 0.16132378578186035, "eval_runtime": 61.0423, "eval_samples_per_second": 61.007, "eval_steps_per_second": 7.634, "step": 2000 }, { "epoch": 1.1815252416756177, "grad_norm": 0.003028369741514325, "learning_rate": 7.047529538131043e-06, "loss": 0.0373, "step": 2200 }, { "epoch": 1.1815252416756177, "eval_accuracy": 0.9723415682062299, "eval_loss": 0.1343841254711151, "eval_runtime": 62.4644, "eval_samples_per_second": 59.618, "eval_steps_per_second": 7.46, "step": 2200 }, { "epoch": 1.2889366272824918, "grad_norm": 0.009722361341118813, "learning_rate": 6.779001074113857e-06, "loss": 0.0293, "step": 2400 }, { "epoch": 1.2889366272824918, "eval_accuracy": 0.9699248120300752, "eval_loss": 0.15843084454536438, "eval_runtime": 61.4796, "eval_samples_per_second": 60.573, "eval_steps_per_second": 7.58, "step": 2400 }, { "epoch": 1.3963480128893662, "grad_norm": 0.01564161479473114, "learning_rate": 6.510472610096671e-06, "loss": 0.0251, "step": 2600 }, { "epoch": 1.3963480128893662, "eval_accuracy": 0.9656283566058003, "eval_loss": 0.17043128609657288, "eval_runtime": 61.4294, "eval_samples_per_second": 60.622, "eval_steps_per_second": 7.586, "step": 2600 }, { "epoch": 1.5037593984962405, "grad_norm": 0.004944147542119026, "learning_rate": 6.241944146079485e-06, "loss": 0.0249, "step": 2800 }, { "epoch": 1.5037593984962405, "eval_accuracy": 0.9699248120300752, "eval_loss": 0.1586124300956726, "eval_runtime": 63.1105, "eval_samples_per_second": 59.008, "eval_steps_per_second": 7.384, "step": 2800 }, { "epoch": 1.6111707841031149, "grad_norm": 0.004947973415255547, "learning_rate": 5.9734156820622995e-06, "loss": 0.0383, "step": 3000 }, { "epoch": 1.6111707841031149, "eval_accuracy": 0.9715359828141783, "eval_loss": 0.1466864049434662, "eval_runtime": 61.3469, "eval_samples_per_second": 60.704, "eval_steps_per_second": 7.596, "step": 3000 }, { "epoch": 1.7185821697099892, "grad_norm": 0.00027971353847533464, "learning_rate": 5.704887218045113e-06, "loss": 0.0213, "step": 3200 }, { "epoch": 1.7185821697099892, "eval_accuracy": 0.9734156820622986, "eval_loss": 0.15455451607704163, "eval_runtime": 61.872, "eval_samples_per_second": 60.189, "eval_steps_per_second": 7.532, "step": 3200 }, { "epoch": 1.8259935553168636, "grad_norm": 0.0011353939771652222, "learning_rate": 5.436358754027927e-06, "loss": 0.0544, "step": 3400 }, { "epoch": 1.8259935553168636, "eval_accuracy": 0.9685821697099892, "eval_loss": 0.16710036993026733, "eval_runtime": 62.8836, "eval_samples_per_second": 59.22, "eval_steps_per_second": 7.411, "step": 3400 }, { "epoch": 1.933404940923738, "grad_norm": 0.021713044494390488, "learning_rate": 5.1678302900107415e-06, "loss": 0.0401, "step": 3600 }, { "epoch": 1.933404940923738, "eval_accuracy": 0.9656283566058003, "eval_loss": 0.18700723350048065, "eval_runtime": 62.2418, "eval_samples_per_second": 59.831, "eval_steps_per_second": 7.487, "step": 3600 }, { "epoch": 2.0408163265306123, "grad_norm": 0.010888410732150078, "learning_rate": 4.900644468313642e-06, "loss": 0.0288, "step": 3800 }, { "epoch": 2.0408163265306123, "eval_accuracy": 0.9599892588614393, "eval_loss": 0.19811120629310608, "eval_runtime": 62.2807, "eval_samples_per_second": 59.794, "eval_steps_per_second": 7.482, "step": 3800 }, { "epoch": 2.1482277121374866, "grad_norm": 0.0020080420654267073, "learning_rate": 4.632116004296456e-06, "loss": 0.0078, "step": 4000 }, { "epoch": 2.1482277121374866, "eval_accuracy": 0.9747583243823845, "eval_loss": 0.14224101603031158, "eval_runtime": 61.9174, "eval_samples_per_second": 60.145, "eval_steps_per_second": 7.526, "step": 4000 }, { "epoch": 2.255639097744361, "grad_norm": 0.0014250120148062706, "learning_rate": 4.36358754027927e-06, "loss": 0.0037, "step": 4200 }, { "epoch": 2.255639097744361, "eval_accuracy": 0.9704618689581096, "eval_loss": 0.17747652530670166, "eval_runtime": 62.7966, "eval_samples_per_second": 59.303, "eval_steps_per_second": 7.421, "step": 4200 }, { "epoch": 2.3630504833512354, "grad_norm": 0.0010213665664196014, "learning_rate": 4.095059076262084e-06, "loss": 0.0035, "step": 4400 }, { "epoch": 2.3630504833512354, "eval_accuracy": 0.9704618689581096, "eval_loss": 0.18454258143901825, "eval_runtime": 62.428, "eval_samples_per_second": 59.653, "eval_steps_per_second": 7.465, "step": 4400 }, { "epoch": 2.4704618689581097, "grad_norm": 0.0008432798786088824, "learning_rate": 3.826530612244898e-06, "loss": 0.0043, "step": 4600 }, { "epoch": 2.4704618689581097, "eval_accuracy": 0.9709989258861439, "eval_loss": 0.20010964572429657, "eval_runtime": 61.7124, "eval_samples_per_second": 60.344, "eval_steps_per_second": 7.551, "step": 4600 }, { "epoch": 2.5778732545649836, "grad_norm": 0.0009041269076988101, "learning_rate": 3.5580021482277123e-06, "loss": 0.0049, "step": 4800 }, { "epoch": 2.5778732545649836, "eval_accuracy": 0.9688506981740065, "eval_loss": 0.21447789669036865, "eval_runtime": 63.1489, "eval_samples_per_second": 58.972, "eval_steps_per_second": 7.379, "step": 4800 }, { "epoch": 2.685284640171858, "grad_norm": 0.0005904084537178278, "learning_rate": 3.289473684210527e-06, "loss": 0.01, "step": 5000 }, { "epoch": 2.685284640171858, "eval_accuracy": 0.9750268528464017, "eval_loss": 0.1444501280784607, "eval_runtime": 61.7502, "eval_samples_per_second": 60.308, "eval_steps_per_second": 7.547, "step": 5000 }, { "epoch": 2.7926960257787323, "grad_norm": 0.002195934997871518, "learning_rate": 3.020945220193341e-06, "loss": 0.0039, "step": 5200 }, { "epoch": 2.7926960257787323, "eval_accuracy": 0.9747583243823845, "eval_loss": 0.15087123215198517, "eval_runtime": 61.9759, "eval_samples_per_second": 60.088, "eval_steps_per_second": 7.519, "step": 5200 }, { "epoch": 2.9001074113856067, "grad_norm": 0.013723284937441349, "learning_rate": 2.752416756176155e-06, "loss": 0.0055, "step": 5400 }, { "epoch": 2.9001074113856067, "eval_accuracy": 0.9747583243823845, "eval_loss": 0.16743087768554688, "eval_runtime": 61.6792, "eval_samples_per_second": 60.377, "eval_steps_per_second": 7.555, "step": 5400 }, { "epoch": 3.007518796992481, "grad_norm": 0.00023605262686032802, "learning_rate": 2.483888292158969e-06, "loss": 0.0094, "step": 5600 }, { "epoch": 3.007518796992481, "eval_accuracy": 0.9747583243823845, "eval_loss": 0.15689106285572052, "eval_runtime": 62.4872, "eval_samples_per_second": 59.596, "eval_steps_per_second": 7.458, "step": 5600 }, { "epoch": 3.1149301825993554, "grad_norm": 0.031025564298033714, "learning_rate": 2.215359828141783e-06, "loss": 0.0018, "step": 5800 }, { "epoch": 3.1149301825993554, "eval_accuracy": 0.9752953813104189, "eval_loss": 0.15795429050922394, "eval_runtime": 61.1051, "eval_samples_per_second": 60.944, "eval_steps_per_second": 7.626, "step": 5800 }, { "epoch": 3.2223415682062297, "grad_norm": 0.00042387741268612444, "learning_rate": 1.9468313641245973e-06, "loss": 0.0, "step": 6000 }, { "epoch": 3.2223415682062297, "eval_accuracy": 0.9761009667024705, "eval_loss": 0.16977229714393616, "eval_runtime": 61.4203, "eval_samples_per_second": 60.631, "eval_steps_per_second": 7.587, "step": 6000 }, { "epoch": 3.329752953813104, "grad_norm": 0.0005808643181808293, "learning_rate": 1.6783029001074116e-06, "loss": 0.0003, "step": 6200 }, { "epoch": 3.329752953813104, "eval_accuracy": 0.9761009667024705, "eval_loss": 0.1605655997991562, "eval_runtime": 61.8621, "eval_samples_per_second": 60.198, "eval_steps_per_second": 7.533, "step": 6200 }, { "epoch": 3.4371643394199785, "grad_norm": 5.516281453310512e-05, "learning_rate": 1.4097744360902258e-06, "loss": 0.0034, "step": 6400 }, { "epoch": 3.4371643394199785, "eval_accuracy": 0.9736842105263158, "eval_loss": 0.18701837956905365, "eval_runtime": 61.6799, "eval_samples_per_second": 60.376, "eval_steps_per_second": 7.555, "step": 6400 }, { "epoch": 3.544575725026853, "grad_norm": 0.005959503818303347, "learning_rate": 1.1412459720730397e-06, "loss": 0.0, "step": 6600 }, { "epoch": 3.544575725026853, "eval_accuracy": 0.9755639097744361, "eval_loss": 0.16974356770515442, "eval_runtime": 61.5705, "eval_samples_per_second": 60.483, "eval_steps_per_second": 7.569, "step": 6600 }, { "epoch": 3.651987110633727, "grad_norm": 0.0005512916832230985, "learning_rate": 8.727175080558539e-07, "loss": 0.0, "step": 6800 }, { "epoch": 3.651987110633727, "eval_accuracy": 0.9750268528464017, "eval_loss": 0.1672970950603485, "eval_runtime": 61.3615, "eval_samples_per_second": 60.69, "eval_steps_per_second": 7.594, "step": 6800 }, { "epoch": 3.7593984962406015, "grad_norm": 0.0007423324859701097, "learning_rate": 6.041890440386682e-07, "loss": 0.0053, "step": 7000 }, { "epoch": 3.7593984962406015, "eval_accuracy": 0.9752953813104189, "eval_loss": 0.16435429453849792, "eval_runtime": 63.0316, "eval_samples_per_second": 59.081, "eval_steps_per_second": 7.393, "step": 7000 }, { "epoch": 3.866809881847476, "grad_norm": 0.0015081085730344057, "learning_rate": 3.356605800214823e-07, "loss": 0.0, "step": 7200 }, { "epoch": 3.866809881847476, "eval_accuracy": 0.9752953813104189, "eval_loss": 0.16764488816261292, "eval_runtime": 61.9177, "eval_samples_per_second": 60.144, "eval_steps_per_second": 7.526, "step": 7200 }, { "epoch": 3.9742212674543502, "grad_norm": 0.00019354032701812685, "learning_rate": 6.713211600429646e-08, "loss": 0.0013, "step": 7400 }, { "epoch": 3.9742212674543502, "eval_accuracy": 0.9761009667024705, "eval_loss": 0.1640813946723938, "eval_runtime": 61.6171, "eval_samples_per_second": 60.438, "eval_steps_per_second": 7.563, "step": 7400 }, { "epoch": 4.0, "step": 7448, "total_flos": 1.668149581703322e+19, "train_loss": 0.03976387239464436, "train_runtime": 7938.8524, "train_samples_per_second": 7.504, "train_steps_per_second": 0.938 } ], "logging_steps": 200, "max_steps": 7448, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.668149581703322e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }