vit-base-images / trainer_state.json
JYL480's picture
End of training
d5a355b verified
raw
history blame
No virus
18.5 kB
{
"best_metric": 0.34654033184051514,
"best_model_checkpoint": "./vit-base-images/checkpoint-1000",
"epoch": 4.0,
"eval_steps": 100,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04,
"grad_norm": 1.5996569395065308,
"learning_rate": 0.00019800000000000002,
"loss": 1.0932,
"step": 10
},
{
"epoch": 0.08,
"grad_norm": 1.9198634624481201,
"learning_rate": 0.000196,
"loss": 0.9522,
"step": 20
},
{
"epoch": 0.12,
"grad_norm": 1.2456237077713013,
"learning_rate": 0.000194,
"loss": 0.6875,
"step": 30
},
{
"epoch": 0.16,
"grad_norm": 1.7687036991119385,
"learning_rate": 0.000192,
"loss": 0.9009,
"step": 40
},
{
"epoch": 0.2,
"grad_norm": 2.880617141723633,
"learning_rate": 0.00019,
"loss": 0.7155,
"step": 50
},
{
"epoch": 0.24,
"grad_norm": 1.5424929857254028,
"learning_rate": 0.000188,
"loss": 0.8144,
"step": 60
},
{
"epoch": 0.28,
"grad_norm": 1.816297173500061,
"learning_rate": 0.00018600000000000002,
"loss": 0.6641,
"step": 70
},
{
"epoch": 0.32,
"grad_norm": 2.0054569244384766,
"learning_rate": 0.00018400000000000003,
"loss": 0.5917,
"step": 80
},
{
"epoch": 0.36,
"grad_norm": 2.2372283935546875,
"learning_rate": 0.000182,
"loss": 0.7852,
"step": 90
},
{
"epoch": 0.4,
"grad_norm": 2.542130708694458,
"learning_rate": 0.00018,
"loss": 0.7334,
"step": 100
},
{
"epoch": 0.4,
"eval_accuracy": 0.779,
"eval_loss": 0.6142178773880005,
"eval_runtime": 18.2795,
"eval_samples_per_second": 54.706,
"eval_steps_per_second": 6.838,
"step": 100
},
{
"epoch": 0.44,
"grad_norm": 1.4493112564086914,
"learning_rate": 0.00017800000000000002,
"loss": 0.5289,
"step": 110
},
{
"epoch": 0.48,
"grad_norm": 1.905771017074585,
"learning_rate": 0.00017600000000000002,
"loss": 0.6191,
"step": 120
},
{
"epoch": 0.52,
"grad_norm": 2.2236440181732178,
"learning_rate": 0.000174,
"loss": 0.5111,
"step": 130
},
{
"epoch": 0.56,
"grad_norm": 2.113398551940918,
"learning_rate": 0.000172,
"loss": 0.6606,
"step": 140
},
{
"epoch": 0.6,
"grad_norm": 2.4624953269958496,
"learning_rate": 0.00017,
"loss": 0.5002,
"step": 150
},
{
"epoch": 0.64,
"grad_norm": 2.324570417404175,
"learning_rate": 0.000168,
"loss": 0.9353,
"step": 160
},
{
"epoch": 0.68,
"grad_norm": 5.384814262390137,
"learning_rate": 0.000166,
"loss": 0.6604,
"step": 170
},
{
"epoch": 0.72,
"grad_norm": 0.8541224598884583,
"learning_rate": 0.000164,
"loss": 0.4894,
"step": 180
},
{
"epoch": 0.76,
"grad_norm": 3.017305612564087,
"learning_rate": 0.000162,
"loss": 0.6219,
"step": 190
},
{
"epoch": 0.8,
"grad_norm": 1.9483362436294556,
"learning_rate": 0.00016,
"loss": 0.6032,
"step": 200
},
{
"epoch": 0.8,
"eval_accuracy": 0.808,
"eval_loss": 0.5516341328620911,
"eval_runtime": 14.5864,
"eval_samples_per_second": 68.557,
"eval_steps_per_second": 8.57,
"step": 200
},
{
"epoch": 0.84,
"grad_norm": 2.7376227378845215,
"learning_rate": 0.00015800000000000002,
"loss": 0.4968,
"step": 210
},
{
"epoch": 0.88,
"grad_norm": 1.563944697380066,
"learning_rate": 0.00015600000000000002,
"loss": 0.4505,
"step": 220
},
{
"epoch": 0.92,
"grad_norm": 1.3606369495391846,
"learning_rate": 0.000154,
"loss": 0.5368,
"step": 230
},
{
"epoch": 0.96,
"grad_norm": 1.3428421020507812,
"learning_rate": 0.000152,
"loss": 0.4932,
"step": 240
},
{
"epoch": 1.0,
"grad_norm": 1.9562724828720093,
"learning_rate": 0.00015000000000000001,
"loss": 0.4884,
"step": 250
},
{
"epoch": 1.04,
"grad_norm": 0.947496771812439,
"learning_rate": 0.000148,
"loss": 0.381,
"step": 260
},
{
"epoch": 1.08,
"grad_norm": 1.6039777994155884,
"learning_rate": 0.000146,
"loss": 0.6633,
"step": 270
},
{
"epoch": 1.12,
"grad_norm": 1.8116464614868164,
"learning_rate": 0.000144,
"loss": 0.3728,
"step": 280
},
{
"epoch": 1.16,
"grad_norm": 1.6644967794418335,
"learning_rate": 0.000142,
"loss": 0.3299,
"step": 290
},
{
"epoch": 1.2,
"grad_norm": 1.5359082221984863,
"learning_rate": 0.00014,
"loss": 0.4725,
"step": 300
},
{
"epoch": 1.2,
"eval_accuracy": 0.854,
"eval_loss": 0.43897509574890137,
"eval_runtime": 14.256,
"eval_samples_per_second": 70.146,
"eval_steps_per_second": 8.768,
"step": 300
},
{
"epoch": 1.24,
"grad_norm": 2.018160581588745,
"learning_rate": 0.000138,
"loss": 0.3064,
"step": 310
},
{
"epoch": 1.28,
"grad_norm": 1.5475637912750244,
"learning_rate": 0.00013600000000000003,
"loss": 0.2928,
"step": 320
},
{
"epoch": 1.32,
"grad_norm": 2.780301809310913,
"learning_rate": 0.000134,
"loss": 0.2959,
"step": 330
},
{
"epoch": 1.3599999999999999,
"grad_norm": 1.0915693044662476,
"learning_rate": 0.000132,
"loss": 0.3152,
"step": 340
},
{
"epoch": 1.4,
"grad_norm": 2.1470773220062256,
"learning_rate": 0.00013000000000000002,
"loss": 0.4123,
"step": 350
},
{
"epoch": 1.44,
"grad_norm": 4.054312705993652,
"learning_rate": 0.00012800000000000002,
"loss": 0.5676,
"step": 360
},
{
"epoch": 1.48,
"grad_norm": 1.8798813819885254,
"learning_rate": 0.000126,
"loss": 0.3909,
"step": 370
},
{
"epoch": 1.52,
"grad_norm": 2.3789453506469727,
"learning_rate": 0.000124,
"loss": 0.419,
"step": 380
},
{
"epoch": 1.56,
"grad_norm": 1.7660586833953857,
"learning_rate": 0.000122,
"loss": 0.516,
"step": 390
},
{
"epoch": 1.6,
"grad_norm": 3.304502010345459,
"learning_rate": 0.00012,
"loss": 0.3638,
"step": 400
},
{
"epoch": 1.6,
"eval_accuracy": 0.822,
"eval_loss": 0.4622470438480377,
"eval_runtime": 14.2766,
"eval_samples_per_second": 70.045,
"eval_steps_per_second": 8.756,
"step": 400
},
{
"epoch": 1.6400000000000001,
"grad_norm": 3.906277656555176,
"learning_rate": 0.000118,
"loss": 0.3608,
"step": 410
},
{
"epoch": 1.6800000000000002,
"grad_norm": 2.591684103012085,
"learning_rate": 0.000116,
"loss": 0.4414,
"step": 420
},
{
"epoch": 1.72,
"grad_norm": 0.6823468804359436,
"learning_rate": 0.00011399999999999999,
"loss": 0.3937,
"step": 430
},
{
"epoch": 1.76,
"grad_norm": 2.4249002933502197,
"learning_rate": 0.00011200000000000001,
"loss": 0.2984,
"step": 440
},
{
"epoch": 1.8,
"grad_norm": 2.575287103652954,
"learning_rate": 0.00011000000000000002,
"loss": 0.4073,
"step": 450
},
{
"epoch": 1.8399999999999999,
"grad_norm": 0.8557507395744324,
"learning_rate": 0.00010800000000000001,
"loss": 0.3573,
"step": 460
},
{
"epoch": 1.88,
"grad_norm": 3.4324100017547607,
"learning_rate": 0.00010600000000000002,
"loss": 0.3758,
"step": 470
},
{
"epoch": 1.92,
"grad_norm": 2.3825552463531494,
"learning_rate": 0.00010400000000000001,
"loss": 0.2375,
"step": 480
},
{
"epoch": 1.96,
"grad_norm": 0.9951996207237244,
"learning_rate": 0.00010200000000000001,
"loss": 0.2496,
"step": 490
},
{
"epoch": 2.0,
"grad_norm": 2.2203187942504883,
"learning_rate": 0.0001,
"loss": 0.3279,
"step": 500
},
{
"epoch": 2.0,
"eval_accuracy": 0.876,
"eval_loss": 0.3772076666355133,
"eval_runtime": 14.2674,
"eval_samples_per_second": 70.09,
"eval_steps_per_second": 8.761,
"step": 500
},
{
"epoch": 2.04,
"grad_norm": 1.8857389688491821,
"learning_rate": 9.8e-05,
"loss": 0.1633,
"step": 510
},
{
"epoch": 2.08,
"grad_norm": 5.698770046234131,
"learning_rate": 9.6e-05,
"loss": 0.2812,
"step": 520
},
{
"epoch": 2.12,
"grad_norm": 1.7683120965957642,
"learning_rate": 9.4e-05,
"loss": 0.1895,
"step": 530
},
{
"epoch": 2.16,
"grad_norm": 0.6420239806175232,
"learning_rate": 9.200000000000001e-05,
"loss": 0.1732,
"step": 540
},
{
"epoch": 2.2,
"grad_norm": 0.8955737948417664,
"learning_rate": 9e-05,
"loss": 0.1557,
"step": 550
},
{
"epoch": 2.24,
"grad_norm": 2.202012300491333,
"learning_rate": 8.800000000000001e-05,
"loss": 0.2851,
"step": 560
},
{
"epoch": 2.2800000000000002,
"grad_norm": 3.6105308532714844,
"learning_rate": 8.6e-05,
"loss": 0.1645,
"step": 570
},
{
"epoch": 2.32,
"grad_norm": 3.514596462249756,
"learning_rate": 8.4e-05,
"loss": 0.1399,
"step": 580
},
{
"epoch": 2.36,
"grad_norm": 4.36515474319458,
"learning_rate": 8.2e-05,
"loss": 0.2495,
"step": 590
},
{
"epoch": 2.4,
"grad_norm": 0.10514427721500397,
"learning_rate": 8e-05,
"loss": 0.1337,
"step": 600
},
{
"epoch": 2.4,
"eval_accuracy": 0.869,
"eval_loss": 0.45184341073036194,
"eval_runtime": 14.683,
"eval_samples_per_second": 68.106,
"eval_steps_per_second": 8.513,
"step": 600
},
{
"epoch": 2.44,
"grad_norm": 1.140317440032959,
"learning_rate": 7.800000000000001e-05,
"loss": 0.1493,
"step": 610
},
{
"epoch": 2.48,
"grad_norm": 0.3709057569503784,
"learning_rate": 7.6e-05,
"loss": 0.164,
"step": 620
},
{
"epoch": 2.52,
"grad_norm": 3.097055196762085,
"learning_rate": 7.4e-05,
"loss": 0.208,
"step": 630
},
{
"epoch": 2.56,
"grad_norm": 3.960178852081299,
"learning_rate": 7.2e-05,
"loss": 0.2337,
"step": 640
},
{
"epoch": 2.6,
"grad_norm": 2.339881420135498,
"learning_rate": 7e-05,
"loss": 0.167,
"step": 650
},
{
"epoch": 2.64,
"grad_norm": 3.97763729095459,
"learning_rate": 6.800000000000001e-05,
"loss": 0.2026,
"step": 660
},
{
"epoch": 2.68,
"grad_norm": 0.5411188006401062,
"learning_rate": 6.6e-05,
"loss": 0.0895,
"step": 670
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.25824812054634094,
"learning_rate": 6.400000000000001e-05,
"loss": 0.0933,
"step": 680
},
{
"epoch": 2.76,
"grad_norm": 0.2557239830493927,
"learning_rate": 6.2e-05,
"loss": 0.1155,
"step": 690
},
{
"epoch": 2.8,
"grad_norm": 5.947152137756348,
"learning_rate": 6e-05,
"loss": 0.236,
"step": 700
},
{
"epoch": 2.8,
"eval_accuracy": 0.878,
"eval_loss": 0.37660717964172363,
"eval_runtime": 14.2296,
"eval_samples_per_second": 70.276,
"eval_steps_per_second": 8.785,
"step": 700
},
{
"epoch": 2.84,
"grad_norm": 2.6783535480499268,
"learning_rate": 5.8e-05,
"loss": 0.1437,
"step": 710
},
{
"epoch": 2.88,
"grad_norm": 1.7082568407058716,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.1498,
"step": 720
},
{
"epoch": 2.92,
"grad_norm": 0.3654639720916748,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.1544,
"step": 730
},
{
"epoch": 2.96,
"grad_norm": 2.7878735065460205,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.2657,
"step": 740
},
{
"epoch": 3.0,
"grad_norm": 3.31339430809021,
"learning_rate": 5e-05,
"loss": 0.103,
"step": 750
},
{
"epoch": 3.04,
"grad_norm": 0.41359299421310425,
"learning_rate": 4.8e-05,
"loss": 0.0904,
"step": 760
},
{
"epoch": 3.08,
"grad_norm": 0.11081337183713913,
"learning_rate": 4.600000000000001e-05,
"loss": 0.0475,
"step": 770
},
{
"epoch": 3.12,
"grad_norm": 0.6292364001274109,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.0613,
"step": 780
},
{
"epoch": 3.16,
"grad_norm": 0.06634623557329178,
"learning_rate": 4.2e-05,
"loss": 0.0419,
"step": 790
},
{
"epoch": 3.2,
"grad_norm": 3.720346212387085,
"learning_rate": 4e-05,
"loss": 0.0275,
"step": 800
},
{
"epoch": 3.2,
"eval_accuracy": 0.891,
"eval_loss": 0.3517528176307678,
"eval_runtime": 14.2729,
"eval_samples_per_second": 70.063,
"eval_steps_per_second": 8.758,
"step": 800
},
{
"epoch": 3.24,
"grad_norm": 0.15002816915512085,
"learning_rate": 3.8e-05,
"loss": 0.0425,
"step": 810
},
{
"epoch": 3.2800000000000002,
"grad_norm": 0.08299177885055542,
"learning_rate": 3.6e-05,
"loss": 0.0465,
"step": 820
},
{
"epoch": 3.32,
"grad_norm": 0.41334620118141174,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.0434,
"step": 830
},
{
"epoch": 3.36,
"grad_norm": 0.5403936505317688,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.0301,
"step": 840
},
{
"epoch": 3.4,
"grad_norm": 0.08261027932167053,
"learning_rate": 3e-05,
"loss": 0.072,
"step": 850
},
{
"epoch": 3.44,
"grad_norm": 1.0293442010879517,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.082,
"step": 860
},
{
"epoch": 3.48,
"grad_norm": 1.7797234058380127,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.0748,
"step": 870
},
{
"epoch": 3.52,
"grad_norm": 3.523738145828247,
"learning_rate": 2.4e-05,
"loss": 0.1751,
"step": 880
},
{
"epoch": 3.56,
"grad_norm": 0.06309465318918228,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.0383,
"step": 890
},
{
"epoch": 3.6,
"grad_norm": 2.1426751613616943,
"learning_rate": 2e-05,
"loss": 0.0427,
"step": 900
},
{
"epoch": 3.6,
"eval_accuracy": 0.896,
"eval_loss": 0.3709311783313751,
"eval_runtime": 14.359,
"eval_samples_per_second": 69.643,
"eval_steps_per_second": 8.705,
"step": 900
},
{
"epoch": 3.64,
"grad_norm": 1.3229968547821045,
"learning_rate": 1.8e-05,
"loss": 0.0352,
"step": 910
},
{
"epoch": 3.68,
"grad_norm": 0.08263090997934341,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.0192,
"step": 920
},
{
"epoch": 3.7199999999999998,
"grad_norm": 0.1414523720741272,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.0724,
"step": 930
},
{
"epoch": 3.76,
"grad_norm": 0.05866268649697304,
"learning_rate": 1.2e-05,
"loss": 0.0289,
"step": 940
},
{
"epoch": 3.8,
"grad_norm": 0.08174656331539154,
"learning_rate": 1e-05,
"loss": 0.0264,
"step": 950
},
{
"epoch": 3.84,
"grad_norm": 0.07566811144351959,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0225,
"step": 960
},
{
"epoch": 3.88,
"grad_norm": 0.06544584035873413,
"learning_rate": 6e-06,
"loss": 0.0488,
"step": 970
},
{
"epoch": 3.92,
"grad_norm": 0.2268047034740448,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0423,
"step": 980
},
{
"epoch": 3.96,
"grad_norm": 0.05503053963184357,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0506,
"step": 990
},
{
"epoch": 4.0,
"grad_norm": 0.3238757252693176,
"learning_rate": 0.0,
"loss": 0.0363,
"step": 1000
},
{
"epoch": 4.0,
"eval_accuracy": 0.905,
"eval_loss": 0.34654033184051514,
"eval_runtime": 14.8487,
"eval_samples_per_second": 67.346,
"eval_steps_per_second": 8.418,
"step": 1000
},
{
"epoch": 4.0,
"step": 1000,
"total_flos": 1.239905171570688e+18,
"train_loss": 0.3179253642559052,
"train_runtime": 607.1163,
"train_samples_per_second": 26.354,
"train_steps_per_second": 1.647
}
],
"logging_steps": 10,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.239905171570688e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}