{ "best_metric": 0.34654033184051514, "best_model_checkpoint": "./vit-base-images/checkpoint-1000", "epoch": 4.0, "eval_steps": 100, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 1.5996569395065308, "learning_rate": 0.00019800000000000002, "loss": 1.0932, "step": 10 }, { "epoch": 0.08, "grad_norm": 1.9198634624481201, "learning_rate": 0.000196, "loss": 0.9522, "step": 20 }, { "epoch": 0.12, "grad_norm": 1.2456237077713013, "learning_rate": 0.000194, "loss": 0.6875, "step": 30 }, { "epoch": 0.16, "grad_norm": 1.7687036991119385, "learning_rate": 0.000192, "loss": 0.9009, "step": 40 }, { "epoch": 0.2, "grad_norm": 2.880617141723633, "learning_rate": 0.00019, "loss": 0.7155, "step": 50 }, { "epoch": 0.24, "grad_norm": 1.5424929857254028, "learning_rate": 0.000188, "loss": 0.8144, "step": 60 }, { "epoch": 0.28, "grad_norm": 1.816297173500061, "learning_rate": 0.00018600000000000002, "loss": 0.6641, "step": 70 }, { "epoch": 0.32, "grad_norm": 2.0054569244384766, "learning_rate": 0.00018400000000000003, "loss": 0.5917, "step": 80 }, { "epoch": 0.36, "grad_norm": 2.2372283935546875, "learning_rate": 0.000182, "loss": 0.7852, "step": 90 }, { "epoch": 0.4, "grad_norm": 2.542130708694458, "learning_rate": 0.00018, "loss": 0.7334, "step": 100 }, { "epoch": 0.4, "eval_accuracy": 0.779, "eval_loss": 0.6142178773880005, "eval_runtime": 18.2795, "eval_samples_per_second": 54.706, "eval_steps_per_second": 6.838, "step": 100 }, { "epoch": 0.44, "grad_norm": 1.4493112564086914, "learning_rate": 0.00017800000000000002, "loss": 0.5289, "step": 110 }, { "epoch": 0.48, "grad_norm": 1.905771017074585, "learning_rate": 0.00017600000000000002, "loss": 0.6191, "step": 120 }, { "epoch": 0.52, "grad_norm": 2.2236440181732178, "learning_rate": 0.000174, "loss": 0.5111, "step": 130 }, { "epoch": 0.56, "grad_norm": 2.113398551940918, "learning_rate": 0.000172, "loss": 0.6606, "step": 140 }, { "epoch": 0.6, "grad_norm": 2.4624953269958496, "learning_rate": 0.00017, "loss": 0.5002, "step": 150 }, { "epoch": 0.64, "grad_norm": 2.324570417404175, "learning_rate": 0.000168, "loss": 0.9353, "step": 160 }, { "epoch": 0.68, "grad_norm": 5.384814262390137, "learning_rate": 0.000166, "loss": 0.6604, "step": 170 }, { "epoch": 0.72, "grad_norm": 0.8541224598884583, "learning_rate": 0.000164, "loss": 0.4894, "step": 180 }, { "epoch": 0.76, "grad_norm": 3.017305612564087, "learning_rate": 0.000162, "loss": 0.6219, "step": 190 }, { "epoch": 0.8, "grad_norm": 1.9483362436294556, "learning_rate": 0.00016, "loss": 0.6032, "step": 200 }, { "epoch": 0.8, "eval_accuracy": 0.808, "eval_loss": 0.5516341328620911, "eval_runtime": 14.5864, "eval_samples_per_second": 68.557, "eval_steps_per_second": 8.57, "step": 200 }, { "epoch": 0.84, "grad_norm": 2.7376227378845215, "learning_rate": 0.00015800000000000002, "loss": 0.4968, "step": 210 }, { "epoch": 0.88, "grad_norm": 1.563944697380066, "learning_rate": 0.00015600000000000002, "loss": 0.4505, "step": 220 }, { "epoch": 0.92, "grad_norm": 1.3606369495391846, "learning_rate": 0.000154, "loss": 0.5368, "step": 230 }, { "epoch": 0.96, "grad_norm": 1.3428421020507812, "learning_rate": 0.000152, "loss": 0.4932, "step": 240 }, { "epoch": 1.0, "grad_norm": 1.9562724828720093, "learning_rate": 0.00015000000000000001, "loss": 0.4884, "step": 250 }, { "epoch": 1.04, "grad_norm": 0.947496771812439, "learning_rate": 0.000148, "loss": 0.381, "step": 260 }, { "epoch": 1.08, "grad_norm": 1.6039777994155884, "learning_rate": 0.000146, "loss": 0.6633, "step": 270 }, { "epoch": 1.12, "grad_norm": 1.8116464614868164, "learning_rate": 0.000144, "loss": 0.3728, "step": 280 }, { "epoch": 1.16, "grad_norm": 1.6644967794418335, "learning_rate": 0.000142, "loss": 0.3299, "step": 290 }, { "epoch": 1.2, "grad_norm": 1.5359082221984863, "learning_rate": 0.00014, "loss": 0.4725, "step": 300 }, { "epoch": 1.2, "eval_accuracy": 0.854, "eval_loss": 0.43897509574890137, "eval_runtime": 14.256, "eval_samples_per_second": 70.146, "eval_steps_per_second": 8.768, "step": 300 }, { "epoch": 1.24, "grad_norm": 2.018160581588745, "learning_rate": 0.000138, "loss": 0.3064, "step": 310 }, { "epoch": 1.28, "grad_norm": 1.5475637912750244, "learning_rate": 0.00013600000000000003, "loss": 0.2928, "step": 320 }, { "epoch": 1.32, "grad_norm": 2.780301809310913, "learning_rate": 0.000134, "loss": 0.2959, "step": 330 }, { "epoch": 1.3599999999999999, "grad_norm": 1.0915693044662476, "learning_rate": 0.000132, "loss": 0.3152, "step": 340 }, { "epoch": 1.4, "grad_norm": 2.1470773220062256, "learning_rate": 0.00013000000000000002, "loss": 0.4123, "step": 350 }, { "epoch": 1.44, "grad_norm": 4.054312705993652, "learning_rate": 0.00012800000000000002, "loss": 0.5676, "step": 360 }, { "epoch": 1.48, "grad_norm": 1.8798813819885254, "learning_rate": 0.000126, "loss": 0.3909, "step": 370 }, { "epoch": 1.52, "grad_norm": 2.3789453506469727, "learning_rate": 0.000124, "loss": 0.419, "step": 380 }, { "epoch": 1.56, "grad_norm": 1.7660586833953857, "learning_rate": 0.000122, "loss": 0.516, "step": 390 }, { "epoch": 1.6, "grad_norm": 3.304502010345459, "learning_rate": 0.00012, "loss": 0.3638, "step": 400 }, { "epoch": 1.6, "eval_accuracy": 0.822, "eval_loss": 0.4622470438480377, "eval_runtime": 14.2766, "eval_samples_per_second": 70.045, "eval_steps_per_second": 8.756, "step": 400 }, { "epoch": 1.6400000000000001, "grad_norm": 3.906277656555176, "learning_rate": 0.000118, "loss": 0.3608, "step": 410 }, { "epoch": 1.6800000000000002, "grad_norm": 2.591684103012085, "learning_rate": 0.000116, "loss": 0.4414, "step": 420 }, { "epoch": 1.72, "grad_norm": 0.6823468804359436, "learning_rate": 0.00011399999999999999, "loss": 0.3937, "step": 430 }, { "epoch": 1.76, "grad_norm": 2.4249002933502197, "learning_rate": 0.00011200000000000001, "loss": 0.2984, "step": 440 }, { "epoch": 1.8, "grad_norm": 2.575287103652954, "learning_rate": 0.00011000000000000002, "loss": 0.4073, "step": 450 }, { "epoch": 1.8399999999999999, "grad_norm": 0.8557507395744324, "learning_rate": 0.00010800000000000001, "loss": 0.3573, "step": 460 }, { "epoch": 1.88, "grad_norm": 3.4324100017547607, "learning_rate": 0.00010600000000000002, "loss": 0.3758, "step": 470 }, { "epoch": 1.92, "grad_norm": 2.3825552463531494, "learning_rate": 0.00010400000000000001, "loss": 0.2375, "step": 480 }, { "epoch": 1.96, "grad_norm": 0.9951996207237244, "learning_rate": 0.00010200000000000001, "loss": 0.2496, "step": 490 }, { "epoch": 2.0, "grad_norm": 2.2203187942504883, "learning_rate": 0.0001, "loss": 0.3279, "step": 500 }, { "epoch": 2.0, "eval_accuracy": 0.876, "eval_loss": 0.3772076666355133, "eval_runtime": 14.2674, "eval_samples_per_second": 70.09, "eval_steps_per_second": 8.761, "step": 500 }, { "epoch": 2.04, "grad_norm": 1.8857389688491821, "learning_rate": 9.8e-05, "loss": 0.1633, "step": 510 }, { "epoch": 2.08, "grad_norm": 5.698770046234131, "learning_rate": 9.6e-05, "loss": 0.2812, "step": 520 }, { "epoch": 2.12, "grad_norm": 1.7683120965957642, "learning_rate": 9.4e-05, "loss": 0.1895, "step": 530 }, { "epoch": 2.16, "grad_norm": 0.6420239806175232, "learning_rate": 9.200000000000001e-05, "loss": 0.1732, "step": 540 }, { "epoch": 2.2, "grad_norm": 0.8955737948417664, "learning_rate": 9e-05, "loss": 0.1557, "step": 550 }, { "epoch": 2.24, "grad_norm": 2.202012300491333, "learning_rate": 8.800000000000001e-05, "loss": 0.2851, "step": 560 }, { "epoch": 2.2800000000000002, "grad_norm": 3.6105308532714844, "learning_rate": 8.6e-05, "loss": 0.1645, "step": 570 }, { "epoch": 2.32, "grad_norm": 3.514596462249756, "learning_rate": 8.4e-05, "loss": 0.1399, "step": 580 }, { "epoch": 2.36, "grad_norm": 4.36515474319458, "learning_rate": 8.2e-05, "loss": 0.2495, "step": 590 }, { "epoch": 2.4, "grad_norm": 0.10514427721500397, "learning_rate": 8e-05, "loss": 0.1337, "step": 600 }, { "epoch": 2.4, "eval_accuracy": 0.869, "eval_loss": 0.45184341073036194, "eval_runtime": 14.683, "eval_samples_per_second": 68.106, "eval_steps_per_second": 8.513, "step": 600 }, { "epoch": 2.44, "grad_norm": 1.140317440032959, "learning_rate": 7.800000000000001e-05, "loss": 0.1493, "step": 610 }, { "epoch": 2.48, "grad_norm": 0.3709057569503784, "learning_rate": 7.6e-05, "loss": 0.164, "step": 620 }, { "epoch": 2.52, "grad_norm": 3.097055196762085, "learning_rate": 7.4e-05, "loss": 0.208, "step": 630 }, { "epoch": 2.56, "grad_norm": 3.960178852081299, "learning_rate": 7.2e-05, "loss": 0.2337, "step": 640 }, { "epoch": 2.6, "grad_norm": 2.339881420135498, "learning_rate": 7e-05, "loss": 0.167, "step": 650 }, { "epoch": 2.64, "grad_norm": 3.97763729095459, "learning_rate": 6.800000000000001e-05, "loss": 0.2026, "step": 660 }, { "epoch": 2.68, "grad_norm": 0.5411188006401062, "learning_rate": 6.6e-05, "loss": 0.0895, "step": 670 }, { "epoch": 2.7199999999999998, "grad_norm": 0.25824812054634094, "learning_rate": 6.400000000000001e-05, "loss": 0.0933, "step": 680 }, { "epoch": 2.76, "grad_norm": 0.2557239830493927, "learning_rate": 6.2e-05, "loss": 0.1155, "step": 690 }, { "epoch": 2.8, "grad_norm": 5.947152137756348, "learning_rate": 6e-05, "loss": 0.236, "step": 700 }, { "epoch": 2.8, "eval_accuracy": 0.878, "eval_loss": 0.37660717964172363, "eval_runtime": 14.2296, "eval_samples_per_second": 70.276, "eval_steps_per_second": 8.785, "step": 700 }, { "epoch": 2.84, "grad_norm": 2.6783535480499268, "learning_rate": 5.8e-05, "loss": 0.1437, "step": 710 }, { "epoch": 2.88, "grad_norm": 1.7082568407058716, "learning_rate": 5.6000000000000006e-05, "loss": 0.1498, "step": 720 }, { "epoch": 2.92, "grad_norm": 0.3654639720916748, "learning_rate": 5.4000000000000005e-05, "loss": 0.1544, "step": 730 }, { "epoch": 2.96, "grad_norm": 2.7878735065460205, "learning_rate": 5.2000000000000004e-05, "loss": 0.2657, "step": 740 }, { "epoch": 3.0, "grad_norm": 3.31339430809021, "learning_rate": 5e-05, "loss": 0.103, "step": 750 }, { "epoch": 3.04, "grad_norm": 0.41359299421310425, "learning_rate": 4.8e-05, "loss": 0.0904, "step": 760 }, { "epoch": 3.08, "grad_norm": 0.11081337183713913, "learning_rate": 4.600000000000001e-05, "loss": 0.0475, "step": 770 }, { "epoch": 3.12, "grad_norm": 0.6292364001274109, "learning_rate": 4.4000000000000006e-05, "loss": 0.0613, "step": 780 }, { "epoch": 3.16, "grad_norm": 0.06634623557329178, "learning_rate": 4.2e-05, "loss": 0.0419, "step": 790 }, { "epoch": 3.2, "grad_norm": 3.720346212387085, "learning_rate": 4e-05, "loss": 0.0275, "step": 800 }, { "epoch": 3.2, "eval_accuracy": 0.891, "eval_loss": 0.3517528176307678, "eval_runtime": 14.2729, "eval_samples_per_second": 70.063, "eval_steps_per_second": 8.758, "step": 800 }, { "epoch": 3.24, "grad_norm": 0.15002816915512085, "learning_rate": 3.8e-05, "loss": 0.0425, "step": 810 }, { "epoch": 3.2800000000000002, "grad_norm": 0.08299177885055542, "learning_rate": 3.6e-05, "loss": 0.0465, "step": 820 }, { "epoch": 3.32, "grad_norm": 0.41334620118141174, "learning_rate": 3.4000000000000007e-05, "loss": 0.0434, "step": 830 }, { "epoch": 3.36, "grad_norm": 0.5403936505317688, "learning_rate": 3.2000000000000005e-05, "loss": 0.0301, "step": 840 }, { "epoch": 3.4, "grad_norm": 0.08261027932167053, "learning_rate": 3e-05, "loss": 0.072, "step": 850 }, { "epoch": 3.44, "grad_norm": 1.0293442010879517, "learning_rate": 2.8000000000000003e-05, "loss": 0.082, "step": 860 }, { "epoch": 3.48, "grad_norm": 1.7797234058380127, "learning_rate": 2.6000000000000002e-05, "loss": 0.0748, "step": 870 }, { "epoch": 3.52, "grad_norm": 3.523738145828247, "learning_rate": 2.4e-05, "loss": 0.1751, "step": 880 }, { "epoch": 3.56, "grad_norm": 0.06309465318918228, "learning_rate": 2.2000000000000003e-05, "loss": 0.0383, "step": 890 }, { "epoch": 3.6, "grad_norm": 2.1426751613616943, "learning_rate": 2e-05, "loss": 0.0427, "step": 900 }, { "epoch": 3.6, "eval_accuracy": 0.896, "eval_loss": 0.3709311783313751, "eval_runtime": 14.359, "eval_samples_per_second": 69.643, "eval_steps_per_second": 8.705, "step": 900 }, { "epoch": 3.64, "grad_norm": 1.3229968547821045, "learning_rate": 1.8e-05, "loss": 0.0352, "step": 910 }, { "epoch": 3.68, "grad_norm": 0.08263090997934341, "learning_rate": 1.6000000000000003e-05, "loss": 0.0192, "step": 920 }, { "epoch": 3.7199999999999998, "grad_norm": 0.1414523720741272, "learning_rate": 1.4000000000000001e-05, "loss": 0.0724, "step": 930 }, { "epoch": 3.76, "grad_norm": 0.05866268649697304, "learning_rate": 1.2e-05, "loss": 0.0289, "step": 940 }, { "epoch": 3.8, "grad_norm": 0.08174656331539154, "learning_rate": 1e-05, "loss": 0.0264, "step": 950 }, { "epoch": 3.84, "grad_norm": 0.07566811144351959, "learning_rate": 8.000000000000001e-06, "loss": 0.0225, "step": 960 }, { "epoch": 3.88, "grad_norm": 0.06544584035873413, "learning_rate": 6e-06, "loss": 0.0488, "step": 970 }, { "epoch": 3.92, "grad_norm": 0.2268047034740448, "learning_rate": 4.000000000000001e-06, "loss": 0.0423, "step": 980 }, { "epoch": 3.96, "grad_norm": 0.05503053963184357, "learning_rate": 2.0000000000000003e-06, "loss": 0.0506, "step": 990 }, { "epoch": 4.0, "grad_norm": 0.3238757252693176, "learning_rate": 0.0, "loss": 0.0363, "step": 1000 }, { "epoch": 4.0, "eval_accuracy": 0.905, "eval_loss": 0.34654033184051514, "eval_runtime": 14.8487, "eval_samples_per_second": 67.346, "eval_steps_per_second": 8.418, "step": 1000 }, { "epoch": 4.0, "step": 1000, "total_flos": 1.239905171570688e+18, "train_loss": 0.3179253642559052, "train_runtime": 607.1163, "train_samples_per_second": 26.354, "train_steps_per_second": 1.647 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.239905171570688e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }