|
{ |
|
"best_metric": 0.12822793424129486, |
|
"best_model_checkpoint": "vit-large-ai-or-not/checkpoint-1600", |
|
"epoch": 4.0, |
|
"eval_steps": 200, |
|
"global_step": 7448, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.10741138560687433, |
|
"grad_norm": 0.16569846868515015, |
|
"learning_rate": 9.731471535982815e-06, |
|
"loss": 0.1089, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.10741138560687433, |
|
"eval_accuracy": 0.9497851772287863, |
|
"eval_loss": 0.17561669647693634, |
|
"eval_runtime": 60.3674, |
|
"eval_samples_per_second": 61.689, |
|
"eval_steps_per_second": 7.719, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.21482277121374865, |
|
"grad_norm": 0.006833943538367748, |
|
"learning_rate": 9.462943071965628e-06, |
|
"loss": 0.041, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.21482277121374865, |
|
"eval_accuracy": 0.9503222341568206, |
|
"eval_loss": 0.21512643992900848, |
|
"eval_runtime": 62.0849, |
|
"eval_samples_per_second": 59.982, |
|
"eval_steps_per_second": 7.506, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.322234156820623, |
|
"grad_norm": 0.010011733509600163, |
|
"learning_rate": 9.194414607948443e-06, |
|
"loss": 0.0566, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.322234156820623, |
|
"eval_accuracy": 0.9511278195488722, |
|
"eval_loss": 0.2124553769826889, |
|
"eval_runtime": 61.5803, |
|
"eval_samples_per_second": 60.474, |
|
"eval_steps_per_second": 7.567, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4296455424274973, |
|
"grad_norm": 22.38845443725586, |
|
"learning_rate": 8.927228786251343e-06, |
|
"loss": 0.1028, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4296455424274973, |
|
"eval_accuracy": 0.9449516648764769, |
|
"eval_loss": 0.20836062729358673, |
|
"eval_runtime": 61.4329, |
|
"eval_samples_per_second": 60.619, |
|
"eval_steps_per_second": 7.586, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5370569280343717, |
|
"grad_norm": 0.2941912114620209, |
|
"learning_rate": 8.658700322234156e-06, |
|
"loss": 0.1722, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5370569280343717, |
|
"eval_accuracy": 0.9556928034371643, |
|
"eval_loss": 0.16581448912620544, |
|
"eval_runtime": 61.8304, |
|
"eval_samples_per_second": 60.229, |
|
"eval_steps_per_second": 7.537, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.644468313641246, |
|
"grad_norm": 2.7486720085144043, |
|
"learning_rate": 8.390171858216972e-06, |
|
"loss": 0.1486, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.644468313641246, |
|
"eval_accuracy": 0.959452201933405, |
|
"eval_loss": 0.13117314875125885, |
|
"eval_runtime": 61.611, |
|
"eval_samples_per_second": 60.444, |
|
"eval_steps_per_second": 7.564, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.7518796992481203, |
|
"grad_norm": 7.503540515899658, |
|
"learning_rate": 8.121643394199787e-06, |
|
"loss": 0.1446, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.7518796992481203, |
|
"eval_accuracy": 0.9564983888292159, |
|
"eval_loss": 0.16343578696250916, |
|
"eval_runtime": 61.6178, |
|
"eval_samples_per_second": 60.437, |
|
"eval_steps_per_second": 7.563, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.8592910848549946, |
|
"grad_norm": 15.931679725646973, |
|
"learning_rate": 7.8531149301826e-06, |
|
"loss": 0.1281, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.8592910848549946, |
|
"eval_accuracy": 0.965359828141783, |
|
"eval_loss": 0.12822793424129486, |
|
"eval_runtime": 62.167, |
|
"eval_samples_per_second": 59.903, |
|
"eval_steps_per_second": 7.496, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.966702470461869, |
|
"grad_norm": 38.17241668701172, |
|
"learning_rate": 7.5845864661654145e-06, |
|
"loss": 0.1584, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.966702470461869, |
|
"eval_accuracy": 0.966702470461869, |
|
"eval_loss": 0.12948955595493317, |
|
"eval_runtime": 61.1637, |
|
"eval_samples_per_second": 60.886, |
|
"eval_steps_per_second": 7.619, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.0741138560687433, |
|
"grad_norm": 0.005669532343745232, |
|
"learning_rate": 7.316058002148228e-06, |
|
"loss": 0.0549, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.0741138560687433, |
|
"eval_accuracy": 0.9669709989258861, |
|
"eval_loss": 0.16132378578186035, |
|
"eval_runtime": 61.0423, |
|
"eval_samples_per_second": 61.007, |
|
"eval_steps_per_second": 7.634, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.1815252416756177, |
|
"grad_norm": 0.003028369741514325, |
|
"learning_rate": 7.047529538131043e-06, |
|
"loss": 0.0373, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.1815252416756177, |
|
"eval_accuracy": 0.9723415682062299, |
|
"eval_loss": 0.1343841254711151, |
|
"eval_runtime": 62.4644, |
|
"eval_samples_per_second": 59.618, |
|
"eval_steps_per_second": 7.46, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.2889366272824918, |
|
"grad_norm": 0.009722361341118813, |
|
"learning_rate": 6.779001074113857e-06, |
|
"loss": 0.0293, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.2889366272824918, |
|
"eval_accuracy": 0.9699248120300752, |
|
"eval_loss": 0.15843084454536438, |
|
"eval_runtime": 61.4796, |
|
"eval_samples_per_second": 60.573, |
|
"eval_steps_per_second": 7.58, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.3963480128893662, |
|
"grad_norm": 0.01564161479473114, |
|
"learning_rate": 6.510472610096671e-06, |
|
"loss": 0.0251, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.3963480128893662, |
|
"eval_accuracy": 0.9656283566058003, |
|
"eval_loss": 0.17043128609657288, |
|
"eval_runtime": 61.4294, |
|
"eval_samples_per_second": 60.622, |
|
"eval_steps_per_second": 7.586, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.5037593984962405, |
|
"grad_norm": 0.004944147542119026, |
|
"learning_rate": 6.241944146079485e-06, |
|
"loss": 0.0249, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.5037593984962405, |
|
"eval_accuracy": 0.9699248120300752, |
|
"eval_loss": 0.1586124300956726, |
|
"eval_runtime": 63.1105, |
|
"eval_samples_per_second": 59.008, |
|
"eval_steps_per_second": 7.384, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.6111707841031149, |
|
"grad_norm": 0.004947973415255547, |
|
"learning_rate": 5.9734156820622995e-06, |
|
"loss": 0.0383, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.6111707841031149, |
|
"eval_accuracy": 0.9715359828141783, |
|
"eval_loss": 0.1466864049434662, |
|
"eval_runtime": 61.3469, |
|
"eval_samples_per_second": 60.704, |
|
"eval_steps_per_second": 7.596, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.7185821697099892, |
|
"grad_norm": 0.00027971353847533464, |
|
"learning_rate": 5.704887218045113e-06, |
|
"loss": 0.0213, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.7185821697099892, |
|
"eval_accuracy": 0.9734156820622986, |
|
"eval_loss": 0.15455451607704163, |
|
"eval_runtime": 61.872, |
|
"eval_samples_per_second": 60.189, |
|
"eval_steps_per_second": 7.532, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.8259935553168636, |
|
"grad_norm": 0.0011353939771652222, |
|
"learning_rate": 5.436358754027927e-06, |
|
"loss": 0.0544, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.8259935553168636, |
|
"eval_accuracy": 0.9685821697099892, |
|
"eval_loss": 0.16710036993026733, |
|
"eval_runtime": 62.8836, |
|
"eval_samples_per_second": 59.22, |
|
"eval_steps_per_second": 7.411, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.933404940923738, |
|
"grad_norm": 0.021713044494390488, |
|
"learning_rate": 5.1678302900107415e-06, |
|
"loss": 0.0401, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.933404940923738, |
|
"eval_accuracy": 0.9656283566058003, |
|
"eval_loss": 0.18700723350048065, |
|
"eval_runtime": 62.2418, |
|
"eval_samples_per_second": 59.831, |
|
"eval_steps_per_second": 7.487, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.0408163265306123, |
|
"grad_norm": 0.010888410732150078, |
|
"learning_rate": 4.900644468313642e-06, |
|
"loss": 0.0288, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.0408163265306123, |
|
"eval_accuracy": 0.9599892588614393, |
|
"eval_loss": 0.19811120629310608, |
|
"eval_runtime": 62.2807, |
|
"eval_samples_per_second": 59.794, |
|
"eval_steps_per_second": 7.482, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.1482277121374866, |
|
"grad_norm": 0.0020080420654267073, |
|
"learning_rate": 4.632116004296456e-06, |
|
"loss": 0.0078, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.1482277121374866, |
|
"eval_accuracy": 0.9747583243823845, |
|
"eval_loss": 0.14224101603031158, |
|
"eval_runtime": 61.9174, |
|
"eval_samples_per_second": 60.145, |
|
"eval_steps_per_second": 7.526, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.255639097744361, |
|
"grad_norm": 0.0014250120148062706, |
|
"learning_rate": 4.36358754027927e-06, |
|
"loss": 0.0037, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.255639097744361, |
|
"eval_accuracy": 0.9704618689581096, |
|
"eval_loss": 0.17747652530670166, |
|
"eval_runtime": 62.7966, |
|
"eval_samples_per_second": 59.303, |
|
"eval_steps_per_second": 7.421, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.3630504833512354, |
|
"grad_norm": 0.0010213665664196014, |
|
"learning_rate": 4.095059076262084e-06, |
|
"loss": 0.0035, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.3630504833512354, |
|
"eval_accuracy": 0.9704618689581096, |
|
"eval_loss": 0.18454258143901825, |
|
"eval_runtime": 62.428, |
|
"eval_samples_per_second": 59.653, |
|
"eval_steps_per_second": 7.465, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.4704618689581097, |
|
"grad_norm": 0.0008432798786088824, |
|
"learning_rate": 3.826530612244898e-06, |
|
"loss": 0.0043, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.4704618689581097, |
|
"eval_accuracy": 0.9709989258861439, |
|
"eval_loss": 0.20010964572429657, |
|
"eval_runtime": 61.7124, |
|
"eval_samples_per_second": 60.344, |
|
"eval_steps_per_second": 7.551, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.5778732545649836, |
|
"grad_norm": 0.0009041269076988101, |
|
"learning_rate": 3.5580021482277123e-06, |
|
"loss": 0.0049, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.5778732545649836, |
|
"eval_accuracy": 0.9688506981740065, |
|
"eval_loss": 0.21447789669036865, |
|
"eval_runtime": 63.1489, |
|
"eval_samples_per_second": 58.972, |
|
"eval_steps_per_second": 7.379, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.685284640171858, |
|
"grad_norm": 0.0005904084537178278, |
|
"learning_rate": 3.289473684210527e-06, |
|
"loss": 0.01, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.685284640171858, |
|
"eval_accuracy": 0.9750268528464017, |
|
"eval_loss": 0.1444501280784607, |
|
"eval_runtime": 61.7502, |
|
"eval_samples_per_second": 60.308, |
|
"eval_steps_per_second": 7.547, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.7926960257787323, |
|
"grad_norm": 0.002195934997871518, |
|
"learning_rate": 3.020945220193341e-06, |
|
"loss": 0.0039, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.7926960257787323, |
|
"eval_accuracy": 0.9747583243823845, |
|
"eval_loss": 0.15087123215198517, |
|
"eval_runtime": 61.9759, |
|
"eval_samples_per_second": 60.088, |
|
"eval_steps_per_second": 7.519, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.9001074113856067, |
|
"grad_norm": 0.013723284937441349, |
|
"learning_rate": 2.752416756176155e-06, |
|
"loss": 0.0055, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.9001074113856067, |
|
"eval_accuracy": 0.9747583243823845, |
|
"eval_loss": 0.16743087768554688, |
|
"eval_runtime": 61.6792, |
|
"eval_samples_per_second": 60.377, |
|
"eval_steps_per_second": 7.555, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 3.007518796992481, |
|
"grad_norm": 0.00023605262686032802, |
|
"learning_rate": 2.483888292158969e-06, |
|
"loss": 0.0094, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 3.007518796992481, |
|
"eval_accuracy": 0.9747583243823845, |
|
"eval_loss": 0.15689106285572052, |
|
"eval_runtime": 62.4872, |
|
"eval_samples_per_second": 59.596, |
|
"eval_steps_per_second": 7.458, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 3.1149301825993554, |
|
"grad_norm": 0.031025564298033714, |
|
"learning_rate": 2.215359828141783e-06, |
|
"loss": 0.0018, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 3.1149301825993554, |
|
"eval_accuracy": 0.9752953813104189, |
|
"eval_loss": 0.15795429050922394, |
|
"eval_runtime": 61.1051, |
|
"eval_samples_per_second": 60.944, |
|
"eval_steps_per_second": 7.626, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 3.2223415682062297, |
|
"grad_norm": 0.00042387741268612444, |
|
"learning_rate": 1.9468313641245973e-06, |
|
"loss": 0.0, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.2223415682062297, |
|
"eval_accuracy": 0.9761009667024705, |
|
"eval_loss": 0.16977229714393616, |
|
"eval_runtime": 61.4203, |
|
"eval_samples_per_second": 60.631, |
|
"eval_steps_per_second": 7.587, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.329752953813104, |
|
"grad_norm": 0.0005808643181808293, |
|
"learning_rate": 1.6783029001074116e-06, |
|
"loss": 0.0003, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 3.329752953813104, |
|
"eval_accuracy": 0.9761009667024705, |
|
"eval_loss": 0.1605655997991562, |
|
"eval_runtime": 61.8621, |
|
"eval_samples_per_second": 60.198, |
|
"eval_steps_per_second": 7.533, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 3.4371643394199785, |
|
"grad_norm": 5.516281453310512e-05, |
|
"learning_rate": 1.4097744360902258e-06, |
|
"loss": 0.0034, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 3.4371643394199785, |
|
"eval_accuracy": 0.9736842105263158, |
|
"eval_loss": 0.18701837956905365, |
|
"eval_runtime": 61.6799, |
|
"eval_samples_per_second": 60.376, |
|
"eval_steps_per_second": 7.555, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 3.544575725026853, |
|
"grad_norm": 0.005959503818303347, |
|
"learning_rate": 1.1412459720730397e-06, |
|
"loss": 0.0, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 3.544575725026853, |
|
"eval_accuracy": 0.9755639097744361, |
|
"eval_loss": 0.16974356770515442, |
|
"eval_runtime": 61.5705, |
|
"eval_samples_per_second": 60.483, |
|
"eval_steps_per_second": 7.569, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 3.651987110633727, |
|
"grad_norm": 0.0005512916832230985, |
|
"learning_rate": 8.727175080558539e-07, |
|
"loss": 0.0, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 3.651987110633727, |
|
"eval_accuracy": 0.9750268528464017, |
|
"eval_loss": 0.1672970950603485, |
|
"eval_runtime": 61.3615, |
|
"eval_samples_per_second": 60.69, |
|
"eval_steps_per_second": 7.594, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 3.7593984962406015, |
|
"grad_norm": 0.0007423324859701097, |
|
"learning_rate": 6.041890440386682e-07, |
|
"loss": 0.0053, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.7593984962406015, |
|
"eval_accuracy": 0.9752953813104189, |
|
"eval_loss": 0.16435429453849792, |
|
"eval_runtime": 63.0316, |
|
"eval_samples_per_second": 59.081, |
|
"eval_steps_per_second": 7.393, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.866809881847476, |
|
"grad_norm": 0.0015081085730344057, |
|
"learning_rate": 3.356605800214823e-07, |
|
"loss": 0.0, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 3.866809881847476, |
|
"eval_accuracy": 0.9752953813104189, |
|
"eval_loss": 0.16764488816261292, |
|
"eval_runtime": 61.9177, |
|
"eval_samples_per_second": 60.144, |
|
"eval_steps_per_second": 7.526, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 3.9742212674543502, |
|
"grad_norm": 0.00019354032701812685, |
|
"learning_rate": 6.713211600429646e-08, |
|
"loss": 0.0013, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 3.9742212674543502, |
|
"eval_accuracy": 0.9761009667024705, |
|
"eval_loss": 0.1640813946723938, |
|
"eval_runtime": 61.6171, |
|
"eval_samples_per_second": 60.438, |
|
"eval_steps_per_second": 7.563, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 7448, |
|
"total_flos": 1.668149581703322e+19, |
|
"train_loss": 0.03976387239464436, |
|
"train_runtime": 7938.8524, |
|
"train_samples_per_second": 7.504, |
|
"train_steps_per_second": 0.938 |
|
} |
|
], |
|
"logging_steps": 200, |
|
"max_steps": 7448, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.668149581703322e+19, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|