JYL480 committed on
Commit
4185a7d
1 Parent(s): d5a355b

End of training

Browse files
README.md CHANGED
@@ -2,6 +2,7 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
 
5
  - generated_from_trainer
6
  metrics:
7
  - accuracy
@@ -15,10 +16,10 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # vit-base-images
17
 
18
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.3465
21
- - Accuracy: 0.905
22
 
23
  ## Model description
24
 
@@ -50,16 +51,16 @@ The following hyperparameters were used during training:
50
 
51
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
52
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
53
- | 0.7334 | 0.4 | 100 | 0.6142 | 0.779 |
54
- | 0.6032 | 0.8 | 200 | 0.5516 | 0.808 |
55
- | 0.4725 | 1.2 | 300 | 0.4390 | 0.854 |
56
- | 0.3638 | 1.6 | 400 | 0.4622 | 0.822 |
57
- | 0.3279 | 2.0 | 500 | 0.3772 | 0.876 |
58
- | 0.1337 | 2.4 | 600 | 0.4518 | 0.869 |
59
- | 0.236 | 2.8 | 700 | 0.3766 | 0.878 |
60
- | 0.0275 | 3.2 | 800 | 0.3518 | 0.891 |
61
- | 0.0427 | 3.6 | 900 | 0.3709 | 0.896 |
62
- | 0.0363 | 4.0 | 1000 | 0.3465 | 0.905 |
63
 
64
 
65
  ### Framework versions
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  metrics:
8
  - accuracy
 
16
 
17
  # vit-base-images
18
 
19
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the marmal88/skin_cancer dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.0918
22
+ - Accuracy: 0.981
23
 
24
  ## Model description
25
 
 
51
 
52
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
53
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
54
+ | 0.8785 | 0.4 | 100 | 0.7795 | 0.711 |
55
+ | 0.7076 | 0.8 | 200 | 0.5421 | 0.818 |
56
+ | 0.4283 | 1.2 | 300 | 0.3951 | 0.876 |
57
+ | 0.4251 | 1.6 | 400 | 0.3818 | 0.864 |
58
+ | 0.335 | 2.0 | 500 | 0.2474 | 0.924 |
59
+ | 0.2286 | 2.4 | 600 | 0.1675 | 0.952 |
60
+ | 0.1523 | 2.8 | 700 | 0.1641 | 0.954 |
61
+ | 0.1346 | 3.2 | 800 | 0.1120 | 0.969 |
62
+ | 0.0638 | 3.6 | 900 | 0.1025 | 0.978 |
63
+ | 0.0574 | 4.0 | 1000 | 0.0918 | 0.981 |
64
 
65
 
66
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 4.0,
3
- "eval_accuracy": 0.905,
4
- "eval_loss": 0.34654033184051514,
5
- "eval_runtime": 21.2431,
6
- "eval_samples_per_second": 47.074,
7
- "eval_steps_per_second": 5.884,
8
  "total_flos": 1.239905171570688e+18,
9
- "train_loss": 0.3179253642559052,
10
- "train_runtime": 607.1163,
11
- "train_samples_per_second": 26.354,
12
- "train_steps_per_second": 1.647
13
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "eval_accuracy": 0.981,
4
+ "eval_loss": 0.09177211672067642,
5
+ "eval_runtime": 15.1575,
6
+ "eval_samples_per_second": 65.974,
7
+ "eval_steps_per_second": 8.247,
8
  "total_flos": 1.239905171570688e+18,
9
+ "train_loss": 0.3459155881404877,
10
+ "train_runtime": 562.8746,
11
+ "train_samples_per_second": 28.426,
12
+ "train_steps_per_second": 1.777
13
  }
config.json CHANGED
@@ -9,21 +9,21 @@
9
  "hidden_dropout_prob": 0.0,
10
  "hidden_size": 768,
11
  "id2label": {
12
- "0": "basal_cell_carcinoma",
13
- "1": "benign_keratosis-like_lesions",
14
- "2": "melanocytic_Nevi",
15
  "3": "dermatofibroma",
16
- "4": "actinic_keratoses"
17
  },
18
  "image_size": 224,
19
  "initializer_range": 0.02,
20
  "intermediate_size": 3072,
21
  "label2id": {
22
- "actinic_keratoses": "4",
23
- "basal_cell_carcinoma": "0",
24
- "benign_keratosis-like_lesions": "1",
25
  "dermatofibroma": "3",
26
- "melanocytic_Nevi": "2"
27
  },
28
  "layer_norm_eps": 1e-12,
29
  "model_type": "vit",
 
9
  "hidden_dropout_prob": 0.0,
10
  "hidden_size": 768,
11
  "id2label": {
12
+ "0": "benign_keratosis-like_lesions",
13
+ "1": "basal_cell_carcinoma",
14
+ "2": "actinic_keratoses",
15
  "3": "dermatofibroma",
16
+ "4": "melanocytic_Nevi"
17
  },
18
  "image_size": 224,
19
  "initializer_range": 0.02,
20
  "intermediate_size": 3072,
21
  "label2id": {
22
+ "actinic_keratoses": "2",
23
+ "basal_cell_carcinoma": "1",
24
+ "benign_keratosis-like_lesions": "0",
25
  "dermatofibroma": "3",
26
+ "melanocytic_Nevi": "4"
27
  },
28
  "layer_norm_eps": 1e-12,
29
  "model_type": "vit",
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
- "eval_accuracy": 0.905,
4
- "eval_loss": 0.34654033184051514,
5
- "eval_runtime": 21.2431,
6
- "eval_samples_per_second": 47.074,
7
- "eval_steps_per_second": 5.884
8
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "eval_accuracy": 0.981,
4
+ "eval_loss": 0.09177211672067642,
5
+ "eval_runtime": 15.1575,
6
+ "eval_samples_per_second": 65.974,
7
+ "eval_steps_per_second": 8.247
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65bf56253dc41fb2232cd24bf6b31aabbe82f9a83128a3285ee7d833751a62a9
3
  size 343233204
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a857093be94afc442b9b7de17ca6322749916293529da0a7d0c659ac6aae89e0
3
  size 343233204
runs/Aug01_12-35-15_3c8cb0b5712e/events.out.tfevents.1722515747.3c8cb0b5712e.503.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b76bef5ca525d67c1ea03da9dde2800a9daed78991fc38127a05ec85f52dd32c
3
+ size 29704
runs/Aug01_12-35-15_3c8cb0b5712e/events.out.tfevents.1722516355.3c8cb0b5712e.503.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7b156180a1f6f187a556dc3a746cc4aba1daa990ceb96032c97066cd12133e0
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
  "total_flos": 1.239905171570688e+18,
4
- "train_loss": 0.3179253642559052,
5
- "train_runtime": 607.1163,
6
- "train_samples_per_second": 26.354,
7
- "train_steps_per_second": 1.647
8
  }
 
1
  {
2
  "epoch": 4.0,
3
  "total_flos": 1.239905171570688e+18,
4
+ "train_loss": 0.3459155881404877,
5
+ "train_runtime": 562.8746,
6
+ "train_samples_per_second": 28.426,
7
+ "train_steps_per_second": 1.777
8
  }
trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 0.34654033184051514,
3
  "best_model_checkpoint": "./vit-base-images/checkpoint-1000",
4
  "epoch": 4.0,
5
  "eval_steps": 100,
@@ -10,802 +10,802 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.04,
13
- "grad_norm": 1.5996569395065308,
14
  "learning_rate": 0.00019800000000000002,
15
- "loss": 1.0932,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.08,
20
- "grad_norm": 1.9198634624481201,
21
  "learning_rate": 0.000196,
22
- "loss": 0.9522,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.12,
27
- "grad_norm": 1.2456237077713013,
28
  "learning_rate": 0.000194,
29
- "loss": 0.6875,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.16,
34
- "grad_norm": 1.7687036991119385,
35
  "learning_rate": 0.000192,
36
- "loss": 0.9009,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.2,
41
- "grad_norm": 2.880617141723633,
42
  "learning_rate": 0.00019,
43
- "loss": 0.7155,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.24,
48
- "grad_norm": 1.5424929857254028,
49
  "learning_rate": 0.000188,
50
- "loss": 0.8144,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.28,
55
- "grad_norm": 1.816297173500061,
56
  "learning_rate": 0.00018600000000000002,
57
- "loss": 0.6641,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.32,
62
- "grad_norm": 2.0054569244384766,
63
  "learning_rate": 0.00018400000000000003,
64
- "loss": 0.5917,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.36,
69
- "grad_norm": 2.2372283935546875,
70
  "learning_rate": 0.000182,
71
- "loss": 0.7852,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.4,
76
- "grad_norm": 2.542130708694458,
77
  "learning_rate": 0.00018,
78
- "loss": 0.7334,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.4,
83
- "eval_accuracy": 0.779,
84
- "eval_loss": 0.6142178773880005,
85
- "eval_runtime": 18.2795,
86
- "eval_samples_per_second": 54.706,
87
- "eval_steps_per_second": 6.838,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.44,
92
- "grad_norm": 1.4493112564086914,
93
  "learning_rate": 0.00017800000000000002,
94
- "loss": 0.5289,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 0.48,
99
- "grad_norm": 1.905771017074585,
100
  "learning_rate": 0.00017600000000000002,
101
- "loss": 0.6191,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 0.52,
106
- "grad_norm": 2.2236440181732178,
107
  "learning_rate": 0.000174,
108
- "loss": 0.5111,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.56,
113
- "grad_norm": 2.113398551940918,
114
  "learning_rate": 0.000172,
115
- "loss": 0.6606,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 0.6,
120
- "grad_norm": 2.4624953269958496,
121
  "learning_rate": 0.00017,
122
- "loss": 0.5002,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 0.64,
127
- "grad_norm": 2.324570417404175,
128
  "learning_rate": 0.000168,
129
- "loss": 0.9353,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 0.68,
134
- "grad_norm": 5.384814262390137,
135
  "learning_rate": 0.000166,
136
- "loss": 0.6604,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 0.72,
141
- "grad_norm": 0.8541224598884583,
142
  "learning_rate": 0.000164,
143
- "loss": 0.4894,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 0.76,
148
- "grad_norm": 3.017305612564087,
149
  "learning_rate": 0.000162,
150
- "loss": 0.6219,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 0.8,
155
- "grad_norm": 1.9483362436294556,
156
  "learning_rate": 0.00016,
157
- "loss": 0.6032,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 0.8,
162
- "eval_accuracy": 0.808,
163
- "eval_loss": 0.5516341328620911,
164
- "eval_runtime": 14.5864,
165
- "eval_samples_per_second": 68.557,
166
- "eval_steps_per_second": 8.57,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 0.84,
171
- "grad_norm": 2.7376227378845215,
172
  "learning_rate": 0.00015800000000000002,
173
- "loss": 0.4968,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 0.88,
178
- "grad_norm": 1.563944697380066,
179
  "learning_rate": 0.00015600000000000002,
180
- "loss": 0.4505,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 0.92,
185
- "grad_norm": 1.3606369495391846,
186
  "learning_rate": 0.000154,
187
- "loss": 0.5368,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 0.96,
192
- "grad_norm": 1.3428421020507812,
193
  "learning_rate": 0.000152,
194
- "loss": 0.4932,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 1.0,
199
- "grad_norm": 1.9562724828720093,
200
  "learning_rate": 0.00015000000000000001,
201
- "loss": 0.4884,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 1.04,
206
- "grad_norm": 0.947496771812439,
207
  "learning_rate": 0.000148,
208
- "loss": 0.381,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 1.08,
213
- "grad_norm": 1.6039777994155884,
214
  "learning_rate": 0.000146,
215
- "loss": 0.6633,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 1.12,
220
- "grad_norm": 1.8116464614868164,
221
  "learning_rate": 0.000144,
222
- "loss": 0.3728,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 1.16,
227
- "grad_norm": 1.6644967794418335,
228
  "learning_rate": 0.000142,
229
- "loss": 0.3299,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 1.2,
234
- "grad_norm": 1.5359082221984863,
235
  "learning_rate": 0.00014,
236
- "loss": 0.4725,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 1.2,
241
- "eval_accuracy": 0.854,
242
- "eval_loss": 0.43897509574890137,
243
- "eval_runtime": 14.256,
244
- "eval_samples_per_second": 70.146,
245
- "eval_steps_per_second": 8.768,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 1.24,
250
- "grad_norm": 2.018160581588745,
251
  "learning_rate": 0.000138,
252
- "loss": 0.3064,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 1.28,
257
- "grad_norm": 1.5475637912750244,
258
  "learning_rate": 0.00013600000000000003,
259
- "loss": 0.2928,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 1.32,
264
- "grad_norm": 2.780301809310913,
265
  "learning_rate": 0.000134,
266
- "loss": 0.2959,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 1.3599999999999999,
271
- "grad_norm": 1.0915693044662476,
272
  "learning_rate": 0.000132,
273
- "loss": 0.3152,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 1.4,
278
- "grad_norm": 2.1470773220062256,
279
  "learning_rate": 0.00013000000000000002,
280
- "loss": 0.4123,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 1.44,
285
- "grad_norm": 4.054312705993652,
286
  "learning_rate": 0.00012800000000000002,
287
- "loss": 0.5676,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 1.48,
292
- "grad_norm": 1.8798813819885254,
293
  "learning_rate": 0.000126,
294
- "loss": 0.3909,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 1.52,
299
- "grad_norm": 2.3789453506469727,
300
  "learning_rate": 0.000124,
301
- "loss": 0.419,
302
  "step": 380
303
  },
304
  {
305
  "epoch": 1.56,
306
- "grad_norm": 1.7660586833953857,
307
  "learning_rate": 0.000122,
308
- "loss": 0.516,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 1.6,
313
- "grad_norm": 3.304502010345459,
314
  "learning_rate": 0.00012,
315
- "loss": 0.3638,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 1.6,
320
- "eval_accuracy": 0.822,
321
- "eval_loss": 0.4622470438480377,
322
- "eval_runtime": 14.2766,
323
- "eval_samples_per_second": 70.045,
324
- "eval_steps_per_second": 8.756,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 1.6400000000000001,
329
- "grad_norm": 3.906277656555176,
330
  "learning_rate": 0.000118,
331
- "loss": 0.3608,
332
  "step": 410
333
  },
334
  {
335
  "epoch": 1.6800000000000002,
336
- "grad_norm": 2.591684103012085,
337
  "learning_rate": 0.000116,
338
- "loss": 0.4414,
339
  "step": 420
340
  },
341
  {
342
  "epoch": 1.72,
343
- "grad_norm": 0.6823468804359436,
344
  "learning_rate": 0.00011399999999999999,
345
- "loss": 0.3937,
346
  "step": 430
347
  },
348
  {
349
  "epoch": 1.76,
350
- "grad_norm": 2.4249002933502197,
351
  "learning_rate": 0.00011200000000000001,
352
- "loss": 0.2984,
353
  "step": 440
354
  },
355
  {
356
  "epoch": 1.8,
357
- "grad_norm": 2.575287103652954,
358
  "learning_rate": 0.00011000000000000002,
359
- "loss": 0.4073,
360
  "step": 450
361
  },
362
  {
363
  "epoch": 1.8399999999999999,
364
- "grad_norm": 0.8557507395744324,
365
  "learning_rate": 0.00010800000000000001,
366
- "loss": 0.3573,
367
  "step": 460
368
  },
369
  {
370
  "epoch": 1.88,
371
- "grad_norm": 3.4324100017547607,
372
  "learning_rate": 0.00010600000000000002,
373
- "loss": 0.3758,
374
  "step": 470
375
  },
376
  {
377
  "epoch": 1.92,
378
- "grad_norm": 2.3825552463531494,
379
  "learning_rate": 0.00010400000000000001,
380
- "loss": 0.2375,
381
  "step": 480
382
  },
383
  {
384
  "epoch": 1.96,
385
- "grad_norm": 0.9951996207237244,
386
  "learning_rate": 0.00010200000000000001,
387
- "loss": 0.2496,
388
  "step": 490
389
  },
390
  {
391
  "epoch": 2.0,
392
- "grad_norm": 2.2203187942504883,
393
  "learning_rate": 0.0001,
394
- "loss": 0.3279,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 2.0,
399
- "eval_accuracy": 0.876,
400
- "eval_loss": 0.3772076666355133,
401
- "eval_runtime": 14.2674,
402
- "eval_samples_per_second": 70.09,
403
- "eval_steps_per_second": 8.761,
404
  "step": 500
405
  },
406
  {
407
  "epoch": 2.04,
408
- "grad_norm": 1.8857389688491821,
409
  "learning_rate": 9.8e-05,
410
- "loss": 0.1633,
411
  "step": 510
412
  },
413
  {
414
  "epoch": 2.08,
415
- "grad_norm": 5.698770046234131,
416
  "learning_rate": 9.6e-05,
417
- "loss": 0.2812,
418
  "step": 520
419
  },
420
  {
421
  "epoch": 2.12,
422
- "grad_norm": 1.7683120965957642,
423
  "learning_rate": 9.4e-05,
424
- "loss": 0.1895,
425
  "step": 530
426
  },
427
  {
428
  "epoch": 2.16,
429
- "grad_norm": 0.6420239806175232,
430
  "learning_rate": 9.200000000000001e-05,
431
- "loss": 0.1732,
432
  "step": 540
433
  },
434
  {
435
  "epoch": 2.2,
436
- "grad_norm": 0.8955737948417664,
437
  "learning_rate": 9e-05,
438
- "loss": 0.1557,
439
  "step": 550
440
  },
441
  {
442
  "epoch": 2.24,
443
- "grad_norm": 2.202012300491333,
444
  "learning_rate": 8.800000000000001e-05,
445
- "loss": 0.2851,
446
  "step": 560
447
  },
448
  {
449
  "epoch": 2.2800000000000002,
450
- "grad_norm": 3.6105308532714844,
451
  "learning_rate": 8.6e-05,
452
- "loss": 0.1645,
453
  "step": 570
454
  },
455
  {
456
  "epoch": 2.32,
457
- "grad_norm": 3.514596462249756,
458
  "learning_rate": 8.4e-05,
459
- "loss": 0.1399,
460
  "step": 580
461
  },
462
  {
463
  "epoch": 2.36,
464
- "grad_norm": 4.36515474319458,
465
  "learning_rate": 8.2e-05,
466
- "loss": 0.2495,
467
  "step": 590
468
  },
469
  {
470
  "epoch": 2.4,
471
- "grad_norm": 0.10514427721500397,
472
  "learning_rate": 8e-05,
473
- "loss": 0.1337,
474
  "step": 600
475
  },
476
  {
477
  "epoch": 2.4,
478
- "eval_accuracy": 0.869,
479
- "eval_loss": 0.45184341073036194,
480
- "eval_runtime": 14.683,
481
- "eval_samples_per_second": 68.106,
482
- "eval_steps_per_second": 8.513,
483
  "step": 600
484
  },
485
  {
486
  "epoch": 2.44,
487
- "grad_norm": 1.140317440032959,
488
  "learning_rate": 7.800000000000001e-05,
489
- "loss": 0.1493,
490
  "step": 610
491
  },
492
  {
493
  "epoch": 2.48,
494
- "grad_norm": 0.3709057569503784,
495
  "learning_rate": 7.6e-05,
496
- "loss": 0.164,
497
  "step": 620
498
  },
499
  {
500
  "epoch": 2.52,
501
- "grad_norm": 3.097055196762085,
502
  "learning_rate": 7.4e-05,
503
- "loss": 0.208,
504
  "step": 630
505
  },
506
  {
507
  "epoch": 2.56,
508
- "grad_norm": 3.960178852081299,
509
  "learning_rate": 7.2e-05,
510
- "loss": 0.2337,
511
  "step": 640
512
  },
513
  {
514
  "epoch": 2.6,
515
- "grad_norm": 2.339881420135498,
516
  "learning_rate": 7e-05,
517
- "loss": 0.167,
518
  "step": 650
519
  },
520
  {
521
  "epoch": 2.64,
522
- "grad_norm": 3.97763729095459,
523
  "learning_rate": 6.800000000000001e-05,
524
- "loss": 0.2026,
525
  "step": 660
526
  },
527
  {
528
  "epoch": 2.68,
529
- "grad_norm": 0.5411188006401062,
530
  "learning_rate": 6.6e-05,
531
- "loss": 0.0895,
532
  "step": 670
533
  },
534
  {
535
  "epoch": 2.7199999999999998,
536
- "grad_norm": 0.25824812054634094,
537
  "learning_rate": 6.400000000000001e-05,
538
- "loss": 0.0933,
539
  "step": 680
540
  },
541
  {
542
  "epoch": 2.76,
543
- "grad_norm": 0.2557239830493927,
544
  "learning_rate": 6.2e-05,
545
- "loss": 0.1155,
546
  "step": 690
547
  },
548
  {
549
  "epoch": 2.8,
550
- "grad_norm": 5.947152137756348,
551
  "learning_rate": 6e-05,
552
- "loss": 0.236,
553
  "step": 700
554
  },
555
  {
556
  "epoch": 2.8,
557
- "eval_accuracy": 0.878,
558
- "eval_loss": 0.37660717964172363,
559
- "eval_runtime": 14.2296,
560
- "eval_samples_per_second": 70.276,
561
- "eval_steps_per_second": 8.785,
562
  "step": 700
563
  },
564
  {
565
  "epoch": 2.84,
566
- "grad_norm": 2.6783535480499268,
567
  "learning_rate": 5.8e-05,
568
- "loss": 0.1437,
569
  "step": 710
570
  },
571
  {
572
  "epoch": 2.88,
573
- "grad_norm": 1.7082568407058716,
574
  "learning_rate": 5.6000000000000006e-05,
575
- "loss": 0.1498,
576
  "step": 720
577
  },
578
  {
579
  "epoch": 2.92,
580
- "grad_norm": 0.3654639720916748,
581
  "learning_rate": 5.4000000000000005e-05,
582
- "loss": 0.1544,
583
  "step": 730
584
  },
585
  {
586
  "epoch": 2.96,
587
- "grad_norm": 2.7878735065460205,
588
  "learning_rate": 5.2000000000000004e-05,
589
- "loss": 0.2657,
590
  "step": 740
591
  },
592
  {
593
  "epoch": 3.0,
594
- "grad_norm": 3.31339430809021,
595
  "learning_rate": 5e-05,
596
- "loss": 0.103,
597
  "step": 750
598
  },
599
  {
600
  "epoch": 3.04,
601
- "grad_norm": 0.41359299421310425,
602
  "learning_rate": 4.8e-05,
603
- "loss": 0.0904,
604
  "step": 760
605
  },
606
  {
607
  "epoch": 3.08,
608
- "grad_norm": 0.11081337183713913,
609
  "learning_rate": 4.600000000000001e-05,
610
- "loss": 0.0475,
611
  "step": 770
612
  },
613
  {
614
  "epoch": 3.12,
615
- "grad_norm": 0.6292364001274109,
616
  "learning_rate": 4.4000000000000006e-05,
617
- "loss": 0.0613,
618
  "step": 780
619
  },
620
  {
621
  "epoch": 3.16,
622
- "grad_norm": 0.06634623557329178,
623
  "learning_rate": 4.2e-05,
624
- "loss": 0.0419,
625
  "step": 790
626
  },
627
  {
628
  "epoch": 3.2,
629
- "grad_norm": 3.720346212387085,
630
  "learning_rate": 4e-05,
631
- "loss": 0.0275,
632
  "step": 800
633
  },
634
  {
635
  "epoch": 3.2,
636
- "eval_accuracy": 0.891,
637
- "eval_loss": 0.3517528176307678,
638
- "eval_runtime": 14.2729,
639
- "eval_samples_per_second": 70.063,
640
- "eval_steps_per_second": 8.758,
641
  "step": 800
642
  },
643
  {
644
  "epoch": 3.24,
645
- "grad_norm": 0.15002816915512085,
646
  "learning_rate": 3.8e-05,
647
- "loss": 0.0425,
648
  "step": 810
649
  },
650
  {
651
  "epoch": 3.2800000000000002,
652
- "grad_norm": 0.08299177885055542,
653
  "learning_rate": 3.6e-05,
654
- "loss": 0.0465,
655
  "step": 820
656
  },
657
  {
658
  "epoch": 3.32,
659
- "grad_norm": 0.41334620118141174,
660
  "learning_rate": 3.4000000000000007e-05,
661
- "loss": 0.0434,
662
  "step": 830
663
  },
664
  {
665
  "epoch": 3.36,
666
- "grad_norm": 0.5403936505317688,
667
  "learning_rate": 3.2000000000000005e-05,
668
- "loss": 0.0301,
669
  "step": 840
670
  },
671
  {
672
  "epoch": 3.4,
673
- "grad_norm": 0.08261027932167053,
674
  "learning_rate": 3e-05,
675
- "loss": 0.072,
676
  "step": 850
677
  },
678
  {
679
  "epoch": 3.44,
680
- "grad_norm": 1.0293442010879517,
681
  "learning_rate": 2.8000000000000003e-05,
682
- "loss": 0.082,
683
  "step": 860
684
  },
685
  {
686
  "epoch": 3.48,
687
- "grad_norm": 1.7797234058380127,
688
  "learning_rate": 2.6000000000000002e-05,
689
- "loss": 0.0748,
690
  "step": 870
691
  },
692
  {
693
  "epoch": 3.52,
694
- "grad_norm": 3.523738145828247,
695
  "learning_rate": 2.4e-05,
696
- "loss": 0.1751,
697
  "step": 880
698
  },
699
  {
700
  "epoch": 3.56,
701
- "grad_norm": 0.06309465318918228,
702
  "learning_rate": 2.2000000000000003e-05,
703
- "loss": 0.0383,
704
  "step": 890
705
  },
706
  {
707
  "epoch": 3.6,
708
- "grad_norm": 2.1426751613616943,
709
  "learning_rate": 2e-05,
710
- "loss": 0.0427,
711
  "step": 900
712
  },
713
  {
714
  "epoch": 3.6,
715
- "eval_accuracy": 0.896,
716
- "eval_loss": 0.3709311783313751,
717
- "eval_runtime": 14.359,
718
- "eval_samples_per_second": 69.643,
719
- "eval_steps_per_second": 8.705,
720
  "step": 900
721
  },
722
  {
723
  "epoch": 3.64,
724
- "grad_norm": 1.3229968547821045,
725
  "learning_rate": 1.8e-05,
726
- "loss": 0.0352,
727
  "step": 910
728
  },
729
  {
730
  "epoch": 3.68,
731
- "grad_norm": 0.08263090997934341,
732
  "learning_rate": 1.6000000000000003e-05,
733
- "loss": 0.0192,
734
  "step": 920
735
  },
736
  {
737
  "epoch": 3.7199999999999998,
738
- "grad_norm": 0.1414523720741272,
739
  "learning_rate": 1.4000000000000001e-05,
740
- "loss": 0.0724,
741
  "step": 930
742
  },
743
  {
744
  "epoch": 3.76,
745
- "grad_norm": 0.05866268649697304,
746
  "learning_rate": 1.2e-05,
747
- "loss": 0.0289,
748
  "step": 940
749
  },
750
  {
751
  "epoch": 3.8,
752
- "grad_norm": 0.08174656331539154,
753
  "learning_rate": 1e-05,
754
- "loss": 0.0264,
755
  "step": 950
756
  },
757
  {
758
  "epoch": 3.84,
759
- "grad_norm": 0.07566811144351959,
760
  "learning_rate": 8.000000000000001e-06,
761
- "loss": 0.0225,
762
  "step": 960
763
  },
764
  {
765
  "epoch": 3.88,
766
- "grad_norm": 0.06544584035873413,
767
  "learning_rate": 6e-06,
768
- "loss": 0.0488,
769
  "step": 970
770
  },
771
  {
772
  "epoch": 3.92,
773
- "grad_norm": 0.2268047034740448,
774
  "learning_rate": 4.000000000000001e-06,
775
- "loss": 0.0423,
776
  "step": 980
777
  },
778
  {
779
  "epoch": 3.96,
780
- "grad_norm": 0.05503053963184357,
781
  "learning_rate": 2.0000000000000003e-06,
782
- "loss": 0.0506,
783
  "step": 990
784
  },
785
  {
786
  "epoch": 4.0,
787
- "grad_norm": 0.3238757252693176,
788
  "learning_rate": 0.0,
789
- "loss": 0.0363,
790
  "step": 1000
791
  },
792
  {
793
  "epoch": 4.0,
794
- "eval_accuracy": 0.905,
795
- "eval_loss": 0.34654033184051514,
796
- "eval_runtime": 14.8487,
797
- "eval_samples_per_second": 67.346,
798
- "eval_steps_per_second": 8.418,
799
  "step": 1000
800
  },
801
  {
802
  "epoch": 4.0,
803
  "step": 1000,
804
  "total_flos": 1.239905171570688e+18,
805
- "train_loss": 0.3179253642559052,
806
- "train_runtime": 607.1163,
807
- "train_samples_per_second": 26.354,
808
- "train_steps_per_second": 1.647
809
  }
810
  ],
811
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.09177211672067642,
3
  "best_model_checkpoint": "./vit-base-images/checkpoint-1000",
4
  "epoch": 4.0,
5
  "eval_steps": 100,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.04,
13
+ "grad_norm": 2.1739912033081055,
14
  "learning_rate": 0.00019800000000000002,
15
+ "loss": 1.2716,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.08,
20
+ "grad_norm": 2.4669973850250244,
21
  "learning_rate": 0.000196,
22
+ "loss": 1.0636,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.12,
27
+ "grad_norm": 1.3152661323547363,
28
  "learning_rate": 0.000194,
29
+ "loss": 0.8686,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.16,
34
+ "grad_norm": 1.30520761013031,
35
  "learning_rate": 0.000192,
36
+ "loss": 0.8709,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.2,
41
+ "grad_norm": 3.462522506713867,
42
  "learning_rate": 0.00019,
43
+ "loss": 0.8241,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.24,
48
+ "grad_norm": 2.773179531097412,
49
  "learning_rate": 0.000188,
50
+ "loss": 0.743,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.28,
55
+ "grad_norm": 0.8740523457527161,
56
  "learning_rate": 0.00018600000000000002,
57
+ "loss": 0.7111,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.32,
62
+ "grad_norm": 3.179422378540039,
63
  "learning_rate": 0.00018400000000000003,
64
+ "loss": 0.7327,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.36,
69
+ "grad_norm": 2.8877387046813965,
70
  "learning_rate": 0.000182,
71
+ "loss": 0.8438,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.4,
76
+ "grad_norm": 2.021406888961792,
77
  "learning_rate": 0.00018,
78
+ "loss": 0.8785,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.4,
83
+ "eval_accuracy": 0.711,
84
+ "eval_loss": 0.779495358467102,
85
+ "eval_runtime": 14.6516,
86
+ "eval_samples_per_second": 68.252,
87
+ "eval_steps_per_second": 8.531,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.44,
92
+ "grad_norm": 1.4613615274429321,
93
  "learning_rate": 0.00017800000000000002,
94
+ "loss": 0.767,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 0.48,
99
+ "grad_norm": 1.6331909894943237,
100
  "learning_rate": 0.00017600000000000002,
101
+ "loss": 0.9023,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 0.52,
106
+ "grad_norm": 1.2065010070800781,
107
  "learning_rate": 0.000174,
108
+ "loss": 0.7347,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.56,
113
+ "grad_norm": 1.609735369682312,
114
  "learning_rate": 0.000172,
115
+ "loss": 0.5344,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 0.6,
120
+ "grad_norm": 3.425642251968384,
121
  "learning_rate": 0.00017,
122
+ "loss": 0.5897,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 0.64,
127
+ "grad_norm": 0.7095292210578918,
128
  "learning_rate": 0.000168,
129
+ "loss": 0.6487,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 0.68,
134
+ "grad_norm": 1.6857857704162598,
135
  "learning_rate": 0.000166,
136
+ "loss": 0.479,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 0.72,
141
+ "grad_norm": 3.3737733364105225,
142
  "learning_rate": 0.000164,
143
+ "loss": 0.6547,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 0.76,
148
+ "grad_norm": 2.8827691078186035,
149
  "learning_rate": 0.000162,
150
+ "loss": 0.667,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 0.8,
155
+ "grad_norm": 0.998505711555481,
156
  "learning_rate": 0.00016,
157
+ "loss": 0.7076,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 0.8,
162
+ "eval_accuracy": 0.818,
163
+ "eval_loss": 0.5420999526977539,
164
+ "eval_runtime": 14.8716,
165
+ "eval_samples_per_second": 67.242,
166
+ "eval_steps_per_second": 8.405,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 0.84,
171
+ "grad_norm": 1.6601933240890503,
172
  "learning_rate": 0.00015800000000000002,
173
+ "loss": 0.5418,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 0.88,
178
+ "grad_norm": 2.71398663520813,
179
  "learning_rate": 0.00015600000000000002,
180
+ "loss": 0.6696,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 0.92,
185
+ "grad_norm": 3.1173503398895264,
186
  "learning_rate": 0.000154,
187
+ "loss": 0.5191,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 0.96,
192
+ "grad_norm": 1.7604912519454956,
193
  "learning_rate": 0.000152,
194
+ "loss": 0.5596,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 1.0,
199
+ "grad_norm": 1.8370306491851807,
200
  "learning_rate": 0.00015000000000000001,
201
+ "loss": 0.4941,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 1.04,
206
+ "grad_norm": 3.0072226524353027,
207
  "learning_rate": 0.000148,
208
+ "loss": 0.417,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 1.08,
213
+ "grad_norm": 2.4395639896392822,
214
  "learning_rate": 0.000146,
215
+ "loss": 0.5287,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 1.12,
220
+ "grad_norm": 1.1612118482589722,
221
  "learning_rate": 0.000144,
222
+ "loss": 0.4771,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 1.16,
227
+ "grad_norm": 1.5060698986053467,
228
  "learning_rate": 0.000142,
229
+ "loss": 0.4211,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 1.2,
234
+ "grad_norm": 2.4940316677093506,
235
  "learning_rate": 0.00014,
236
+ "loss": 0.4283,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 1.2,
241
+ "eval_accuracy": 0.876,
242
+ "eval_loss": 0.3951060175895691,
243
+ "eval_runtime": 14.0948,
244
+ "eval_samples_per_second": 70.948,
245
+ "eval_steps_per_second": 8.869,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 1.24,
250
+ "grad_norm": 3.643969774246216,
251
  "learning_rate": 0.000138,
252
+ "loss": 0.4377,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 1.28,
257
+ "grad_norm": 1.954455852508545,
258
  "learning_rate": 0.00013600000000000003,
259
+ "loss": 0.4311,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 1.32,
264
+ "grad_norm": 1.4906481504440308,
265
  "learning_rate": 0.000134,
266
+ "loss": 0.3393,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 1.3599999999999999,
271
+ "grad_norm": 2.0885210037231445,
272
  "learning_rate": 0.000132,
273
+ "loss": 0.4909,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 1.4,
278
+ "grad_norm": 4.9808173179626465,
279
  "learning_rate": 0.00013000000000000002,
280
+ "loss": 0.4071,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 1.44,
285
+ "grad_norm": 2.143996477127075,
286
  "learning_rate": 0.00012800000000000002,
287
+ "loss": 0.2979,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 1.48,
292
+ "grad_norm": 5.164979457855225,
293
  "learning_rate": 0.000126,
294
+ "loss": 0.4576,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 1.52,
299
+ "grad_norm": 6.777462482452393,
300
  "learning_rate": 0.000124,
301
+ "loss": 0.446,
302
  "step": 380
303
  },
304
  {
305
  "epoch": 1.56,
306
+ "grad_norm": 1.1243666410446167,
307
  "learning_rate": 0.000122,
308
+ "loss": 0.3795,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 1.6,
313
+ "grad_norm": 3.663288116455078,
314
  "learning_rate": 0.00012,
315
+ "loss": 0.4251,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 1.6,
320
+ "eval_accuracy": 0.864,
321
+ "eval_loss": 0.38176068663597107,
322
+ "eval_runtime": 14.993,
323
+ "eval_samples_per_second": 66.698,
324
+ "eval_steps_per_second": 8.337,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 1.6400000000000001,
329
+ "grad_norm": 2.137402296066284,
330
  "learning_rate": 0.000118,
331
+ "loss": 0.4443,
332
  "step": 410
333
  },
334
  {
335
  "epoch": 1.6800000000000002,
336
+ "grad_norm": 3.767395257949829,
337
  "learning_rate": 0.000116,
338
+ "loss": 0.5193,
339
  "step": 420
340
  },
341
  {
342
  "epoch": 1.72,
343
+ "grad_norm": 1.152035117149353,
344
  "learning_rate": 0.00011399999999999999,
345
+ "loss": 0.3036,
346
  "step": 430
347
  },
348
  {
349
  "epoch": 1.76,
350
+ "grad_norm": 5.046035289764404,
351
  "learning_rate": 0.00011200000000000001,
352
+ "loss": 0.2667,
353
  "step": 440
354
  },
355
  {
356
  "epoch": 1.8,
357
+ "grad_norm": 1.6602838039398193,
358
  "learning_rate": 0.00011000000000000002,
359
+ "loss": 0.3035,
360
  "step": 450
361
  },
362
  {
363
  "epoch": 1.8399999999999999,
364
+ "grad_norm": 2.508718729019165,
365
  "learning_rate": 0.00010800000000000001,
366
+ "loss": 0.4932,
367
  "step": 460
368
  },
369
  {
370
  "epoch": 1.88,
371
+ "grad_norm": 2.9392433166503906,
372
  "learning_rate": 0.00010600000000000002,
373
+ "loss": 0.4191,
374
  "step": 470
375
  },
376
  {
377
  "epoch": 1.92,
378
+ "grad_norm": 2.791663885116577,
379
  "learning_rate": 0.00010400000000000001,
380
+ "loss": 0.3038,
381
  "step": 480
382
  },
383
  {
384
  "epoch": 1.96,
385
+ "grad_norm": 2.3044326305389404,
386
  "learning_rate": 0.00010200000000000001,
387
+ "loss": 0.4298,
388
  "step": 490
389
  },
390
  {
391
  "epoch": 2.0,
392
+ "grad_norm": 3.1489250659942627,
393
  "learning_rate": 0.0001,
394
+ "loss": 0.335,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 2.0,
399
+ "eval_accuracy": 0.924,
400
+ "eval_loss": 0.24736037850379944,
401
+ "eval_runtime": 15.0667,
402
+ "eval_samples_per_second": 66.372,
403
+ "eval_steps_per_second": 8.296,
404
  "step": 500
405
  },
406
  {
407
  "epoch": 2.04,
408
+ "grad_norm": 3.3036346435546875,
409
  "learning_rate": 9.8e-05,
410
+ "loss": 0.1649,
411
  "step": 510
412
  },
413
  {
414
  "epoch": 2.08,
415
+ "grad_norm": 2.8792152404785156,
416
  "learning_rate": 9.6e-05,
417
+ "loss": 0.1627,
418
  "step": 520
419
  },
420
  {
421
  "epoch": 2.12,
422
+ "grad_norm": 0.28887757658958435,
423
  "learning_rate": 9.4e-05,
424
+ "loss": 0.3599,
425
  "step": 530
426
  },
427
  {
428
  "epoch": 2.16,
429
+ "grad_norm": 5.17996883392334,
430
  "learning_rate": 9.200000000000001e-05,
431
+ "loss": 0.3233,
432
  "step": 540
433
  },
434
  {
435
  "epoch": 2.2,
436
+ "grad_norm": 2.1433322429656982,
437
  "learning_rate": 9e-05,
438
+ "loss": 0.222,
439
  "step": 550
440
  },
441
  {
442
  "epoch": 2.24,
443
+ "grad_norm": 3.143852949142456,
444
  "learning_rate": 8.800000000000001e-05,
445
+ "loss": 0.2713,
446
  "step": 560
447
  },
448
  {
449
  "epoch": 2.2800000000000002,
450
+ "grad_norm": 0.4215773344039917,
451
  "learning_rate": 8.6e-05,
452
+ "loss": 0.1708,
453
  "step": 570
454
  },
455
  {
456
  "epoch": 2.32,
457
+ "grad_norm": 1.9217822551727295,
458
  "learning_rate": 8.4e-05,
459
+ "loss": 0.198,
460
  "step": 580
461
  },
462
  {
463
  "epoch": 2.36,
464
+ "grad_norm": 2.4554295539855957,
465
  "learning_rate": 8.2e-05,
466
+ "loss": 0.1617,
467
  "step": 590
468
  },
469
  {
470
  "epoch": 2.4,
471
+ "grad_norm": 1.2291343212127686,
472
  "learning_rate": 8e-05,
473
+ "loss": 0.2286,
474
  "step": 600
475
  },
476
  {
477
  "epoch": 2.4,
478
+ "eval_accuracy": 0.952,
479
+ "eval_loss": 0.16752035915851593,
480
+ "eval_runtime": 13.8882,
481
+ "eval_samples_per_second": 72.003,
482
+ "eval_steps_per_second": 9.0,
483
  "step": 600
484
  },
485
  {
486
  "epoch": 2.44,
487
+ "grad_norm": 0.3957996964454651,
488
  "learning_rate": 7.800000000000001e-05,
489
+ "loss": 0.1758,
490
  "step": 610
491
  },
492
  {
493
  "epoch": 2.48,
494
+ "grad_norm": 2.093458414077759,
495
  "learning_rate": 7.6e-05,
496
+ "loss": 0.138,
497
  "step": 620
498
  },
499
  {
500
  "epoch": 2.52,
501
+ "grad_norm": 1.130835771560669,
502
  "learning_rate": 7.4e-05,
503
+ "loss": 0.1194,
504
  "step": 630
505
  },
506
  {
507
  "epoch": 2.56,
508
+ "grad_norm": 1.3829611539840698,
509
  "learning_rate": 7.2e-05,
510
+ "loss": 0.1629,
511
  "step": 640
512
  },
513
  {
514
  "epoch": 2.6,
515
+ "grad_norm": 0.17628225684165955,
516
  "learning_rate": 7e-05,
517
+ "loss": 0.0954,
518
  "step": 650
519
  },
520
  {
521
  "epoch": 2.64,
522
+ "grad_norm": 1.7156352996826172,
523
  "learning_rate": 6.800000000000001e-05,
524
+ "loss": 0.172,
525
  "step": 660
526
  },
527
  {
528
  "epoch": 2.68,
529
+ "grad_norm": 3.950498342514038,
530
  "learning_rate": 6.6e-05,
531
+ "loss": 0.1414,
532
  "step": 670
533
  },
534
  {
535
  "epoch": 2.7199999999999998,
536
+ "grad_norm": 3.134085178375244,
537
  "learning_rate": 6.400000000000001e-05,
538
+ "loss": 0.1734,
539
  "step": 680
540
  },
541
  {
542
  "epoch": 2.76,
543
+ "grad_norm": 4.362244129180908,
544
  "learning_rate": 6.2e-05,
545
+ "loss": 0.1806,
546
  "step": 690
547
  },
548
  {
549
  "epoch": 2.8,
550
+ "grad_norm": 5.472875118255615,
551
  "learning_rate": 6e-05,
552
+ "loss": 0.1523,
553
  "step": 700
554
  },
555
  {
556
  "epoch": 2.8,
557
+ "eval_accuracy": 0.954,
558
+ "eval_loss": 0.1640758216381073,
559
+ "eval_runtime": 14.3922,
560
+ "eval_samples_per_second": 69.482,
561
+ "eval_steps_per_second": 8.685,
562
  "step": 700
563
  },
564
  {
565
  "epoch": 2.84,
566
+ "grad_norm": 1.1265697479248047,
567
  "learning_rate": 5.8e-05,
568
+ "loss": 0.1939,
569
  "step": 710
570
  },
571
  {
572
  "epoch": 2.88,
573
+ "grad_norm": 3.1513569355010986,
574
  "learning_rate": 5.6000000000000006e-05,
575
+ "loss": 0.1369,
576
  "step": 720
577
  },
578
  {
579
  "epoch": 2.92,
580
+ "grad_norm": 0.14964208006858826,
581
  "learning_rate": 5.4000000000000005e-05,
582
+ "loss": 0.1372,
583
  "step": 730
584
  },
585
  {
586
  "epoch": 2.96,
587
+ "grad_norm": 0.1409606784582138,
588
  "learning_rate": 5.2000000000000004e-05,
589
+ "loss": 0.1503,
590
  "step": 740
591
  },
592
  {
593
  "epoch": 3.0,
594
+ "grad_norm": 0.1397903859615326,
595
  "learning_rate": 5e-05,
596
+ "loss": 0.1655,
597
  "step": 750
598
  },
599
  {
600
  "epoch": 3.04,
601
+ "grad_norm": 0.6654576063156128,
602
  "learning_rate": 4.8e-05,
603
+ "loss": 0.0662,
604
  "step": 760
605
  },
606
  {
607
  "epoch": 3.08,
608
+ "grad_norm": 0.11212094128131866,
609
  "learning_rate": 4.600000000000001e-05,
610
+ "loss": 0.0362,
611
  "step": 770
612
  },
613
  {
614
  "epoch": 3.12,
615
+ "grad_norm": 0.8599291443824768,
616
  "learning_rate": 4.4000000000000006e-05,
617
+ "loss": 0.0545,
618
  "step": 780
619
  },
620
  {
621
  "epoch": 3.16,
622
+ "grad_norm": 0.0967254787683487,
623
  "learning_rate": 4.2e-05,
624
+ "loss": 0.0428,
625
  "step": 790
626
  },
627
  {
628
  "epoch": 3.2,
629
+ "grad_norm": 1.8969135284423828,
630
  "learning_rate": 4e-05,
631
+ "loss": 0.1346,
632
  "step": 800
633
  },
634
  {
635
  "epoch": 3.2,
636
+ "eval_accuracy": 0.969,
637
+ "eval_loss": 0.11203579604625702,
638
+ "eval_runtime": 14.1593,
639
+ "eval_samples_per_second": 70.625,
640
+ "eval_steps_per_second": 8.828,
641
  "step": 800
642
  },
643
  {
644
  "epoch": 3.24,
645
+ "grad_norm": 0.32526883482933044,
646
  "learning_rate": 3.8e-05,
647
+ "loss": 0.0576,
648
  "step": 810
649
  },
650
  {
651
  "epoch": 3.2800000000000002,
652
+ "grad_norm": 3.220393419265747,
653
  "learning_rate": 3.6e-05,
654
+ "loss": 0.0532,
655
  "step": 820
656
  },
657
  {
658
  "epoch": 3.32,
659
+ "grad_norm": 0.09663262218236923,
660
  "learning_rate": 3.4000000000000007e-05,
661
+ "loss": 0.0832,
662
  "step": 830
663
  },
664
  {
665
  "epoch": 3.36,
666
+ "grad_norm": 0.07815458625555038,
667
  "learning_rate": 3.2000000000000005e-05,
668
+ "loss": 0.0653,
669
  "step": 840
670
  },
671
  {
672
  "epoch": 3.4,
673
+ "grad_norm": 0.09688442200422287,
674
  "learning_rate": 3e-05,
675
+ "loss": 0.0763,
676
  "step": 850
677
  },
678
  {
679
  "epoch": 3.44,
680
+ "grad_norm": 0.07274580001831055,
681
  "learning_rate": 2.8000000000000003e-05,
682
+ "loss": 0.04,
683
  "step": 860
684
  },
685
  {
686
  "epoch": 3.48,
687
+ "grad_norm": 0.06821909546852112,
688
  "learning_rate": 2.6000000000000002e-05,
689
+ "loss": 0.0778,
690
  "step": 870
691
  },
692
  {
693
  "epoch": 3.52,
694
+ "grad_norm": 1.8587623834609985,
695
  "learning_rate": 2.4e-05,
696
+ "loss": 0.0563,
697
  "step": 880
698
  },
699
  {
700
  "epoch": 3.56,
701
+ "grad_norm": 0.09389644116163254,
702
  "learning_rate": 2.2000000000000003e-05,
703
+ "loss": 0.1213,
704
  "step": 890
705
  },
706
  {
707
  "epoch": 3.6,
708
+ "grad_norm": 6.0940117835998535,
709
  "learning_rate": 2e-05,
710
+ "loss": 0.0638,
711
  "step": 900
712
  },
713
  {
714
  "epoch": 3.6,
715
+ "eval_accuracy": 0.978,
716
+ "eval_loss": 0.10251828283071518,
717
+ "eval_runtime": 14.4118,
718
+ "eval_samples_per_second": 69.388,
719
+ "eval_steps_per_second": 8.673,
720
  "step": 900
721
  },
722
  {
723
  "epoch": 3.64,
724
+ "grad_norm": 0.1492830514907837,
725
  "learning_rate": 1.8e-05,
726
+ "loss": 0.0821,
727
  "step": 910
728
  },
729
  {
730
  "epoch": 3.68,
731
+ "grad_norm": 0.07156217098236084,
732
  "learning_rate": 1.6000000000000003e-05,
733
+ "loss": 0.023,
734
  "step": 920
735
  },
736
  {
737
  "epoch": 3.7199999999999998,
738
+ "grad_norm": 0.07530596107244492,
739
  "learning_rate": 1.4000000000000001e-05,
740
+ "loss": 0.0821,
741
  "step": 930
742
  },
743
  {
744
  "epoch": 3.76,
745
+ "grad_norm": 0.061225246638059616,
746
  "learning_rate": 1.2e-05,
747
+ "loss": 0.0518,
748
  "step": 940
749
  },
750
  {
751
  "epoch": 3.8,
752
+ "grad_norm": 0.15823562443256378,
753
  "learning_rate": 1e-05,
754
+ "loss": 0.0286,
755
  "step": 950
756
  },
757
  {
758
  "epoch": 3.84,
759
+ "grad_norm": 0.09221441298723221,
760
  "learning_rate": 8.000000000000001e-06,
761
+ "loss": 0.0364,
762
  "step": 960
763
  },
764
  {
765
  "epoch": 3.88,
766
+ "grad_norm": 0.05617209151387215,
767
  "learning_rate": 6e-06,
768
+ "loss": 0.0498,
769
  "step": 970
770
  },
771
  {
772
  "epoch": 3.92,
773
+ "grad_norm": 0.08119833469390869,
774
  "learning_rate": 4.000000000000001e-06,
775
+ "loss": 0.049,
776
  "step": 980
777
  },
778
  {
779
  "epoch": 3.96,
780
+ "grad_norm": 0.08950098603963852,
781
  "learning_rate": 2.0000000000000003e-06,
782
+ "loss": 0.0642,
783
  "step": 990
784
  },
785
  {
786
  "epoch": 4.0,
787
+ "grad_norm": 0.07529381662607193,
788
  "learning_rate": 0.0,
789
+ "loss": 0.0574,
790
  "step": 1000
791
  },
792
  {
793
  "epoch": 4.0,
794
+ "eval_accuracy": 0.981,
795
+ "eval_loss": 0.09177211672067642,
796
+ "eval_runtime": 15.0707,
797
+ "eval_samples_per_second": 66.354,
798
+ "eval_steps_per_second": 8.294,
799
  "step": 1000
800
  },
801
  {
802
  "epoch": 4.0,
803
  "step": 1000,
804
  "total_flos": 1.239905171570688e+18,
805
+ "train_loss": 0.3459155881404877,
806
+ "train_runtime": 562.8746,
807
+ "train_samples_per_second": 28.426,
808
+ "train_steps_per_second": 1.777
809
  }
810
  ],
811
  "logging_steps": 10,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50dacd679584c943310851e04614287d2d560396915ec76b9daca95419e88685
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40dc00985c0a75ad675d49b33b6ad4495940384eabfc0ead05db15735440fc4
3
  size 5112