sheepy928 commited on
Commit
0c4b78c
1 Parent(s): ea3273a

Training in progress, step 500, checkpoint

Browse files
checkpoint-500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf34a6733d5227eb85d55def447818a330bc1eae397c5506bc1e75b3797e4fda
3
  size 997351674
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9b5331a049bf007e3fe60e6b2ee0194b68959b298abbf3a9c2ee94807a4dd2d
3
  size 997351674
checkpoint-500/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:340c33a4987169203c067a0a28b9c570a1f740be29bc9ad9b441418693a54d36
3
  size 498661166
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58f17f1f744378c30c2b3f5533b9d7bedf71b3d71f7f094f101e878c5d4bf595
3
  size 498661166
checkpoint-500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b422432f050f31232fd8bfedc63b986c036bf46b6cf0b466f2e329905094359
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd8ed62c04c41b6e0c29c4bf4ad972a544ebceea21938ff9f3a06432ffd506cd
3
  size 14244
checkpoint-500/trainer_state.json CHANGED
@@ -1,409 +1,544 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.3762227238525207,
5
- "eval_steps": 50,
6
  "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.01,
13
  "learning_rate": 5e-07,
14
- "loss": 1.1001,
15
  "step": 10
16
  },
17
  {
18
- "epoch": 0.02,
19
  "learning_rate": 1e-06,
20
- "loss": 1.0661,
 
 
 
 
 
 
 
 
 
21
  "step": 20
22
  },
23
  {
24
- "epoch": 0.02,
25
  "learning_rate": 1.5e-06,
26
- "loss": 1.0782,
27
  "step": 30
28
  },
29
  {
30
- "epoch": 0.03,
31
  "learning_rate": 2e-06,
32
- "loss": 1.0578,
33
  "step": 40
34
  },
35
  {
36
- "epoch": 0.04,
37
- "learning_rate": 2.5e-06,
38
- "loss": 1.0395,
39
- "step": 50
 
 
 
40
  },
41
  {
42
- "epoch": 0.04,
43
- "eval_accuracy": 0.4754781102152329,
44
- "eval_loss": 1.0322505235671997,
45
- "eval_runtime": 49.5848,
46
- "eval_samples_per_second": 605.307,
47
- "eval_steps_per_second": 1.19,
48
  "step": 50
49
  },
50
  {
51
- "epoch": 0.05,
52
  "learning_rate": 3e-06,
53
- "loss": 1.0308,
 
 
 
 
 
 
 
 
 
54
  "step": 60
55
  },
56
  {
57
- "epoch": 0.05,
58
  "learning_rate": 3.5e-06,
59
- "loss": 1.0044,
60
  "step": 70
61
  },
62
  {
63
- "epoch": 0.06,
64
  "learning_rate": 4e-06,
65
- "loss": 1.0398,
 
 
 
 
 
 
 
 
 
66
  "step": 80
67
  },
68
  {
69
- "epoch": 0.07,
70
  "learning_rate": 4.5e-06,
71
- "loss": 1.0116,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.08,
76
  "learning_rate": 5e-06,
77
- "loss": 1.0067,
78
  "step": 100
79
  },
80
  {
81
- "epoch": 0.08,
82
- "eval_accuracy": 0.5313187179316319,
83
- "eval_loss": 0.9863238334655762,
84
- "eval_runtime": 49.2356,
85
- "eval_samples_per_second": 609.599,
86
- "eval_steps_per_second": 1.198,
87
  "step": 100
88
  },
89
  {
90
- "epoch": 0.08,
91
  "learning_rate": 5.5e-06,
92
- "loss": 1.0131,
93
  "step": 110
94
  },
95
  {
96
- "epoch": 0.09,
97
  "learning_rate": 6e-06,
98
- "loss": 1.0235,
 
 
 
 
 
 
 
 
 
99
  "step": 120
100
  },
101
  {
102
- "epoch": 0.1,
103
  "learning_rate": 6.5e-06,
104
- "loss": 1.0073,
105
  "step": 130
106
  },
107
  {
108
- "epoch": 0.11,
109
  "learning_rate": 7e-06,
110
- "loss": 0.9856,
111
  "step": 140
112
  },
113
  {
114
- "epoch": 0.11,
115
- "learning_rate": 7.5e-06,
116
- "loss": 1.0062,
117
- "step": 150
 
 
 
118
  },
119
  {
120
- "epoch": 0.11,
121
- "eval_accuracy": 0.5358499366962084,
122
- "eval_loss": 0.9735665917396545,
123
- "eval_runtime": 49.4568,
124
- "eval_samples_per_second": 606.874,
125
- "eval_steps_per_second": 1.193,
126
  "step": 150
127
  },
128
  {
129
- "epoch": 0.12,
130
  "learning_rate": 8e-06,
131
- "loss": 0.9622,
 
 
 
 
 
 
 
 
 
132
  "step": 160
133
  },
134
  {
135
- "epoch": 0.13,
136
  "learning_rate": 8.500000000000002e-06,
137
- "loss": 0.9583,
138
  "step": 170
139
  },
140
  {
141
- "epoch": 0.14,
142
  "learning_rate": 9e-06,
143
- "loss": 0.987,
144
  "step": 180
145
  },
146
  {
147
- "epoch": 0.14,
 
 
 
 
 
 
 
 
 
148
  "learning_rate": 9.5e-06,
149
- "loss": 0.9995,
150
  "step": 190
151
  },
152
  {
153
- "epoch": 0.15,
154
  "learning_rate": 1e-05,
155
- "loss": 0.9953,
156
  "step": 200
157
  },
158
  {
159
- "epoch": 0.15,
160
- "eval_accuracy": 0.5406476977410541,
161
- "eval_loss": 0.9847397804260254,
162
- "eval_runtime": 49.5469,
163
- "eval_samples_per_second": 605.769,
164
- "eval_steps_per_second": 1.191,
165
  "step": 200
166
  },
167
  {
168
- "epoch": 0.16,
169
  "learning_rate": 1.0500000000000001e-05,
170
- "loss": 0.9894,
171
  "step": 210
172
  },
173
  {
174
- "epoch": 0.17,
175
  "learning_rate": 1.1e-05,
176
- "loss": 0.9857,
 
 
 
 
 
 
 
 
 
177
  "step": 220
178
  },
179
  {
180
- "epoch": 0.17,
181
  "learning_rate": 1.15e-05,
182
- "loss": 0.9806,
183
  "step": 230
184
  },
185
  {
186
- "epoch": 0.18,
187
  "learning_rate": 1.2e-05,
188
- "loss": 0.9802,
189
  "step": 240
190
  },
191
  {
192
- "epoch": 0.19,
193
- "learning_rate": 1.25e-05,
194
- "loss": 0.9559,
195
- "step": 250
 
 
 
196
  },
197
  {
198
- "epoch": 0.19,
199
- "eval_accuracy": 0.5136269740787632,
200
- "eval_loss": 1.0206754207611084,
201
- "eval_runtime": 49.5108,
202
- "eval_samples_per_second": 606.212,
203
- "eval_steps_per_second": 1.192,
204
  "step": 250
205
  },
206
  {
207
- "epoch": 0.2,
208
  "learning_rate": 1.3e-05,
209
- "loss": 0.9877,
 
 
 
 
 
 
 
 
 
210
  "step": 260
211
  },
212
  {
213
- "epoch": 0.2,
214
  "learning_rate": 1.35e-05,
215
- "loss": 0.9878,
216
  "step": 270
217
  },
218
  {
219
- "epoch": 0.21,
220
  "learning_rate": 1.4e-05,
221
- "loss": 0.9646,
 
 
 
 
 
 
 
 
 
222
  "step": 280
223
  },
224
  {
225
- "epoch": 0.22,
226
  "learning_rate": 1.4500000000000002e-05,
227
- "loss": 0.9598,
228
  "step": 290
229
  },
230
  {
231
- "epoch": 0.23,
232
  "learning_rate": 1.5e-05,
233
- "loss": 0.9567,
234
  "step": 300
235
  },
236
  {
237
- "epoch": 0.23,
238
- "eval_accuracy": 0.5693676284400613,
239
- "eval_loss": 0.9298574924468994,
240
- "eval_runtime": 49.5798,
241
- "eval_samples_per_second": 605.367,
242
- "eval_steps_per_second": 1.19,
243
  "step": 300
244
  },
245
  {
246
- "epoch": 0.23,
247
  "learning_rate": 1.55e-05,
248
- "loss": 0.9659,
249
  "step": 310
250
  },
251
  {
252
- "epoch": 0.24,
253
  "learning_rate": 1.6e-05,
254
- "loss": 0.9803,
 
 
 
 
 
 
 
 
 
255
  "step": 320
256
  },
257
  {
258
- "epoch": 0.25,
259
  "learning_rate": 1.65e-05,
260
- "loss": 0.959,
261
  "step": 330
262
  },
263
  {
264
- "epoch": 0.26,
265
  "learning_rate": 1.7000000000000003e-05,
266
- "loss": 0.9298,
267
  "step": 340
268
  },
269
  {
270
- "epoch": 0.26,
271
- "learning_rate": 1.7500000000000002e-05,
272
- "loss": 0.8923,
273
- "step": 350
 
 
 
274
  },
275
  {
276
- "epoch": 0.26,
277
- "eval_accuracy": 0.603285133604318,
278
- "eval_loss": 0.8731092810630798,
279
- "eval_runtime": 49.5126,
280
- "eval_samples_per_second": 606.189,
281
- "eval_steps_per_second": 1.192,
282
  "step": 350
283
  },
284
  {
285
- "epoch": 0.27,
286
  "learning_rate": 1.8e-05,
287
- "loss": 0.9255,
 
 
 
 
 
 
 
 
 
288
  "step": 360
289
  },
290
  {
291
- "epoch": 0.28,
292
  "learning_rate": 1.85e-05,
293
- "loss": 0.9567,
294
  "step": 370
295
  },
296
  {
297
- "epoch": 0.29,
298
  "learning_rate": 1.9e-05,
299
- "loss": 0.9135,
 
 
 
 
 
 
 
 
 
300
  "step": 380
301
  },
302
  {
303
- "epoch": 0.29,
304
  "learning_rate": 1.95e-05,
305
- "loss": 0.8978,
306
  "step": 390
307
  },
308
  {
309
- "epoch": 0.3,
310
  "learning_rate": 2e-05,
311
- "loss": 0.9617,
312
  "step": 400
313
  },
314
  {
315
- "epoch": 0.3,
316
- "eval_accuracy": 0.5401479309655494,
317
- "eval_loss": 0.9500759840011597,
318
- "eval_runtime": 49.5248,
319
- "eval_samples_per_second": 606.04,
320
- "eval_steps_per_second": 1.191,
321
  "step": 400
322
  },
323
  {
324
- "epoch": 0.31,
325
  "learning_rate": 2.05e-05,
326
- "loss": 0.9387,
327
  "step": 410
328
  },
329
  {
330
- "epoch": 0.32,
331
  "learning_rate": 2.1000000000000002e-05,
332
- "loss": 0.8832,
 
 
 
 
 
 
 
 
 
333
  "step": 420
334
  },
335
  {
336
- "epoch": 0.32,
337
  "learning_rate": 2.1499999999999997e-05,
338
- "loss": 0.9087,
339
  "step": 430
340
  },
341
  {
342
- "epoch": 0.33,
343
  "learning_rate": 2.2e-05,
344
- "loss": 0.9192,
345
  "step": 440
346
  },
347
  {
348
- "epoch": 0.34,
349
- "learning_rate": 2.2499999999999998e-05,
350
- "loss": 0.8456,
351
- "step": 450
 
 
 
352
  },
353
  {
354
- "epoch": 0.34,
355
- "eval_accuracy": 0.6245085626707536,
356
- "eval_loss": 0.8298193216323853,
357
- "eval_runtime": 49.5226,
358
- "eval_samples_per_second": 606.066,
359
- "eval_steps_per_second": 1.191,
360
  "step": 450
361
  },
362
  {
363
- "epoch": 0.35,
364
  "learning_rate": 2.3e-05,
365
- "loss": 0.8511,
 
 
 
 
 
 
 
 
 
366
  "step": 460
367
  },
368
  {
369
- "epoch": 0.35,
370
  "learning_rate": 2.3500000000000002e-05,
371
- "loss": 0.9211,
372
  "step": 470
373
  },
374
  {
375
- "epoch": 0.36,
376
  "learning_rate": 2.4e-05,
377
- "loss": 0.8416,
378
  "step": 480
379
  },
380
  {
381
- "epoch": 0.37,
 
 
 
 
 
 
 
 
 
382
  "learning_rate": 2.4500000000000003e-05,
383
- "loss": 0.849,
384
  "step": 490
385
  },
386
  {
387
- "epoch": 0.38,
388
  "learning_rate": 2.5e-05,
389
- "loss": 0.8098,
390
  "step": 500
391
  },
392
  {
393
- "epoch": 0.38,
394
- "eval_accuracy": 0.6523622309588859,
395
- "eval_loss": 0.771039605140686,
396
- "eval_runtime": 49.4859,
397
- "eval_samples_per_second": 606.517,
398
- "eval_steps_per_second": 1.192,
399
  "step": 500
400
  }
401
  ],
402
  "logging_steps": 10,
403
- "max_steps": 3987,
404
  "num_train_epochs": 3,
405
- "save_steps": 250,
406
- "total_flos": 1.6839258144768e+16,
407
  "trial_name": null,
408
  "trial_params": null
409
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.5048908954100828,
5
+ "eval_steps": 20,
6
  "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03,
13
  "learning_rate": 5e-07,
14
+ "loss": 1.1046,
15
  "step": 10
16
  },
17
  {
18
+ "epoch": 0.06,
19
  "learning_rate": 1e-06,
20
+ "loss": 1.0664,
21
+ "step": 20
22
+ },
23
+ {
24
+ "epoch": 0.06,
25
+ "eval_accuracy": 0.44479243019924036,
26
+ "eval_loss": 1.065091609954834,
27
+ "eval_runtime": 50.348,
28
+ "eval_samples_per_second": 596.131,
29
+ "eval_steps_per_second": 1.172,
30
  "step": 20
31
  },
32
  {
33
+ "epoch": 0.09,
34
  "learning_rate": 1.5e-06,
35
+ "loss": 1.0672,
36
  "step": 30
37
  },
38
  {
39
+ "epoch": 0.12,
40
  "learning_rate": 2e-06,
41
+ "loss": 1.0423,
42
  "step": 40
43
  },
44
  {
45
+ "epoch": 0.12,
46
+ "eval_accuracy": 0.5033650962883988,
47
+ "eval_loss": 1.0188277959823608,
48
+ "eval_runtime": 49.614,
49
+ "eval_samples_per_second": 604.95,
50
+ "eval_steps_per_second": 1.189,
51
+ "step": 40
52
  },
53
  {
54
+ "epoch": 0.15,
55
+ "learning_rate": 2.5e-06,
56
+ "loss": 1.027,
 
 
 
57
  "step": 50
58
  },
59
  {
60
+ "epoch": 0.18,
61
  "learning_rate": 3e-06,
62
+ "loss": 1.0137,
63
+ "step": 60
64
+ },
65
+ {
66
+ "epoch": 0.18,
67
+ "eval_accuracy": 0.5279203038581995,
68
+ "eval_loss": 0.9871189594268799,
69
+ "eval_runtime": 49.5201,
70
+ "eval_samples_per_second": 606.098,
71
+ "eval_steps_per_second": 1.191,
72
  "step": 60
73
  },
74
  {
75
+ "epoch": 0.21,
76
  "learning_rate": 3.5e-06,
77
+ "loss": 1.0005,
78
  "step": 70
79
  },
80
  {
81
+ "epoch": 0.24,
82
  "learning_rate": 4e-06,
83
+ "loss": 1.0027,
84
+ "step": 80
85
+ },
86
+ {
87
+ "epoch": 0.24,
88
+ "eval_accuracy": 0.5308189511561271,
89
+ "eval_loss": 0.9888613224029541,
90
+ "eval_runtime": 49.4469,
91
+ "eval_samples_per_second": 606.994,
92
+ "eval_steps_per_second": 1.193,
93
  "step": 80
94
  },
95
  {
96
+ "epoch": 0.27,
97
  "learning_rate": 4.5e-06,
98
+ "loss": 0.9937,
99
  "step": 90
100
  },
101
  {
102
+ "epoch": 0.3,
103
  "learning_rate": 5e-06,
104
+ "loss": 0.9914,
105
  "step": 100
106
  },
107
  {
108
+ "epoch": 0.3,
109
+ "eval_accuracy": 0.5307856333710935,
110
+ "eval_loss": 0.9762536883354187,
111
+ "eval_runtime": 49.4804,
112
+ "eval_samples_per_second": 606.583,
113
+ "eval_steps_per_second": 1.192,
114
  "step": 100
115
  },
116
  {
117
+ "epoch": 0.33,
118
  "learning_rate": 5.5e-06,
119
+ "loss": 0.9884,
120
  "step": 110
121
  },
122
  {
123
+ "epoch": 0.36,
124
  "learning_rate": 6e-06,
125
+ "loss": 0.9826,
126
+ "step": 120
127
+ },
128
+ {
129
+ "epoch": 0.36,
130
+ "eval_accuracy": 0.5387819017791697,
131
+ "eval_loss": 0.9713281989097595,
132
+ "eval_runtime": 49.4983,
133
+ "eval_samples_per_second": 606.364,
134
+ "eval_steps_per_second": 1.192,
135
  "step": 120
136
  },
137
  {
138
+ "epoch": 0.39,
139
  "learning_rate": 6.5e-06,
140
+ "loss": 0.9882,
141
  "step": 130
142
  },
143
  {
144
+ "epoch": 0.42,
145
  "learning_rate": 7e-06,
146
+ "loss": 0.9788,
147
  "step": 140
148
  },
149
  {
150
+ "epoch": 0.42,
151
+ "eval_accuracy": 0.5312520823615646,
152
+ "eval_loss": 0.9766249656677246,
153
+ "eval_runtime": 49.5213,
154
+ "eval_samples_per_second": 606.083,
155
+ "eval_steps_per_second": 1.191,
156
+ "step": 140
157
  },
158
  {
159
+ "epoch": 0.45,
160
+ "learning_rate": 7.5e-06,
161
+ "loss": 0.9854,
 
 
 
162
  "step": 150
163
  },
164
  {
165
+ "epoch": 0.48,
166
  "learning_rate": 8e-06,
167
+ "loss": 0.984,
168
+ "step": 160
169
+ },
170
+ {
171
+ "epoch": 0.48,
172
+ "eval_accuracy": 0.5398147531152129,
173
+ "eval_loss": 0.9589501619338989,
174
+ "eval_runtime": 49.4977,
175
+ "eval_samples_per_second": 606.372,
176
+ "eval_steps_per_second": 1.192,
177
  "step": 160
178
  },
179
  {
180
+ "epoch": 0.51,
181
  "learning_rate": 8.500000000000002e-06,
182
+ "loss": 0.9795,
183
  "step": 170
184
  },
185
  {
186
+ "epoch": 0.54,
187
  "learning_rate": 9e-06,
188
+ "loss": 0.9694,
189
  "step": 180
190
  },
191
  {
192
+ "epoch": 0.54,
193
+ "eval_accuracy": 0.5423469047777704,
194
+ "eval_loss": 0.953514814376831,
195
+ "eval_runtime": 49.4511,
196
+ "eval_samples_per_second": 606.943,
197
+ "eval_steps_per_second": 1.193,
198
+ "step": 180
199
+ },
200
+ {
201
+ "epoch": 0.57,
202
  "learning_rate": 9.5e-06,
203
+ "loss": 0.9757,
204
  "step": 190
205
  },
206
  {
207
+ "epoch": 0.6,
208
  "learning_rate": 1e-05,
209
+ "loss": 0.9676,
210
  "step": 200
211
  },
212
  {
213
+ "epoch": 0.6,
214
+ "eval_accuracy": 0.567201972412874,
215
+ "eval_loss": 0.9273685812950134,
216
+ "eval_runtime": 49.4122,
217
+ "eval_samples_per_second": 607.421,
218
+ "eval_steps_per_second": 1.194,
219
  "step": 200
220
  },
221
  {
222
+ "epoch": 0.63,
223
  "learning_rate": 1.0500000000000001e-05,
224
+ "loss": 0.9708,
225
  "step": 210
226
  },
227
  {
228
+ "epoch": 0.66,
229
  "learning_rate": 1.1e-05,
230
+ "loss": 0.9753,
231
+ "step": 220
232
+ },
233
+ {
234
+ "epoch": 0.66,
235
+ "eval_accuracy": 0.573598987139335,
236
+ "eval_loss": 0.912590503692627,
237
+ "eval_runtime": 49.2827,
238
+ "eval_samples_per_second": 609.017,
239
+ "eval_steps_per_second": 1.197,
240
  "step": 220
241
  },
242
  {
243
+ "epoch": 0.69,
244
  "learning_rate": 1.15e-05,
245
+ "loss": 0.9586,
246
  "step": 230
247
  },
248
  {
249
+ "epoch": 0.72,
250
  "learning_rate": 1.2e-05,
251
+ "loss": 0.9557,
252
  "step": 240
253
  },
254
  {
255
+ "epoch": 0.72,
256
+ "eval_accuracy": 0.5759645498767242,
257
+ "eval_loss": 0.9052607417106628,
258
+ "eval_runtime": 49.4457,
259
+ "eval_samples_per_second": 607.01,
260
+ "eval_steps_per_second": 1.193,
261
+ "step": 240
262
  },
263
  {
264
+ "epoch": 0.75,
265
+ "learning_rate": 1.25e-05,
266
+ "loss": 0.9518,
 
 
 
267
  "step": 250
268
  },
269
  {
270
+ "epoch": 0.78,
271
  "learning_rate": 1.3e-05,
272
+ "loss": 0.9508,
273
+ "step": 260
274
+ },
275
+ {
276
+ "epoch": 0.78,
277
+ "eval_accuracy": 0.5766975411474645,
278
+ "eval_loss": 0.9178985953330994,
279
+ "eval_runtime": 49.3132,
280
+ "eval_samples_per_second": 608.641,
281
+ "eval_steps_per_second": 1.196,
282
  "step": 260
283
  },
284
  {
285
+ "epoch": 0.81,
286
  "learning_rate": 1.35e-05,
287
+ "loss": 0.9405,
288
  "step": 270
289
  },
290
  {
291
+ "epoch": 0.84,
292
  "learning_rate": 1.4e-05,
293
+ "loss": 0.9355,
294
+ "step": 280
295
+ },
296
+ {
297
+ "epoch": 0.84,
298
+ "eval_accuracy": 0.58915839275005,
299
+ "eval_loss": 0.8937407732009888,
300
+ "eval_runtime": 49.4342,
301
+ "eval_samples_per_second": 607.15,
302
+ "eval_steps_per_second": 1.194,
303
  "step": 280
304
  },
305
  {
306
+ "epoch": 0.87,
307
  "learning_rate": 1.4500000000000002e-05,
308
+ "loss": 0.9235,
309
  "step": 290
310
  },
311
  {
312
+ "epoch": 0.9,
313
  "learning_rate": 1.5e-05,
314
+ "loss": 0.9,
315
  "step": 300
316
  },
317
  {
318
+ "epoch": 0.9,
319
+ "eval_accuracy": 0.613013926834144,
320
+ "eval_loss": 0.8468813300132751,
321
+ "eval_runtime": 49.5046,
322
+ "eval_samples_per_second": 606.287,
323
+ "eval_steps_per_second": 1.192,
324
  "step": 300
325
  },
326
  {
327
+ "epoch": 0.93,
328
  "learning_rate": 1.55e-05,
329
+ "loss": 0.8857,
330
  "step": 310
331
  },
332
  {
333
+ "epoch": 0.96,
334
  "learning_rate": 1.6e-05,
335
+ "loss": 0.993,
336
+ "step": 320
337
+ },
338
+ {
339
+ "epoch": 0.96,
340
+ "eval_accuracy": 0.6046511627906976,
341
+ "eval_loss": 0.8615403771400452,
342
+ "eval_runtime": 49.5041,
343
+ "eval_samples_per_second": 606.294,
344
+ "eval_steps_per_second": 1.192,
345
  "step": 320
346
  },
347
  {
348
+ "epoch": 0.99,
349
  "learning_rate": 1.65e-05,
350
+ "loss": 0.8418,
351
  "step": 330
352
  },
353
  {
354
+ "epoch": 1.02,
355
  "learning_rate": 1.7000000000000003e-05,
356
+ "loss": 0.8527,
357
  "step": 340
358
  },
359
  {
360
+ "epoch": 1.02,
361
+ "eval_accuracy": 0.6439328313453722,
362
+ "eval_loss": 0.7896137237548828,
363
+ "eval_runtime": 49.4448,
364
+ "eval_samples_per_second": 607.02,
365
+ "eval_steps_per_second": 1.193,
366
+ "step": 340
367
  },
368
  {
369
+ "epoch": 1.05,
370
+ "learning_rate": 1.7500000000000002e-05,
371
+ "loss": 0.8982,
 
 
 
372
  "step": 350
373
  },
374
  {
375
+ "epoch": 1.08,
376
  "learning_rate": 1.8e-05,
377
+ "loss": 0.966,
378
+ "step": 360
379
+ },
380
+ {
381
+ "epoch": 1.08,
382
+ "eval_accuracy": 0.5315852602119011,
383
+ "eval_loss": 1.0123510360717773,
384
+ "eval_runtime": 49.4838,
385
+ "eval_samples_per_second": 606.542,
386
+ "eval_steps_per_second": 1.192,
387
  "step": 360
388
  },
389
  {
390
+ "epoch": 1.11,
391
  "learning_rate": 1.85e-05,
392
+ "loss": 0.9413,
393
  "step": 370
394
  },
395
  {
396
+ "epoch": 1.14,
397
  "learning_rate": 1.9e-05,
398
+ "loss": 0.8441,
399
+ "step": 380
400
+ },
401
+ {
402
+ "epoch": 1.14,
403
+ "eval_accuracy": 0.6488638635303525,
404
+ "eval_loss": 0.791083574295044,
405
+ "eval_runtime": 49.4701,
406
+ "eval_samples_per_second": 606.71,
407
+ "eval_steps_per_second": 1.193,
408
  "step": 380
409
  },
410
  {
411
+ "epoch": 1.17,
412
  "learning_rate": 1.95e-05,
413
+ "loss": 0.8223,
414
  "step": 390
415
  },
416
  {
417
+ "epoch": 1.2,
418
  "learning_rate": 2e-05,
419
+ "loss": 0.8226,
420
  "step": 400
421
  },
422
  {
423
+ "epoch": 1.2,
424
+ "eval_accuracy": 0.6699873392416872,
425
+ "eval_loss": 0.7472424507141113,
426
+ "eval_runtime": 49.4938,
427
+ "eval_samples_per_second": 606.419,
428
+ "eval_steps_per_second": 1.192,
429
  "step": 400
430
  },
431
  {
432
+ "epoch": 1.23,
433
  "learning_rate": 2.05e-05,
434
+ "loss": 0.7924,
435
  "step": 410
436
  },
437
  {
438
+ "epoch": 1.26,
439
  "learning_rate": 2.1000000000000002e-05,
440
+ "loss": 0.7948,
441
+ "step": 420
442
+ },
443
+ {
444
+ "epoch": 1.26,
445
+ "eval_accuracy": 0.6580595721996402,
446
+ "eval_loss": 0.7663838863372803,
447
+ "eval_runtime": 49.5064,
448
+ "eval_samples_per_second": 606.265,
449
+ "eval_steps_per_second": 1.192,
450
  "step": 420
451
  },
452
  {
453
+ "epoch": 1.29,
454
  "learning_rate": 2.1499999999999997e-05,
455
+ "loss": 0.776,
456
  "step": 430
457
  },
458
  {
459
+ "epoch": 1.32,
460
  "learning_rate": 2.2e-05,
461
+ "loss": 0.7428,
462
  "step": 440
463
  },
464
  {
465
+ "epoch": 1.32,
466
+ "eval_accuracy": 0.6991737189311654,
467
+ "eval_loss": 0.6993714570999146,
468
+ "eval_runtime": 49.4888,
469
+ "eval_samples_per_second": 606.481,
470
+ "eval_steps_per_second": 1.192,
471
+ "step": 440
472
  },
473
  {
474
+ "epoch": 1.35,
475
+ "learning_rate": 2.2499999999999998e-05,
476
+ "loss": 0.7512,
 
 
 
477
  "step": 450
478
  },
479
  {
480
+ "epoch": 1.38,
481
  "learning_rate": 2.3e-05,
482
+ "loss": 0.7109,
483
+ "step": 460
484
+ },
485
+ {
486
+ "epoch": 1.38,
487
+ "eval_accuracy": 0.7283600986206437,
488
+ "eval_loss": 0.6510820984840393,
489
+ "eval_runtime": 49.5999,
490
+ "eval_samples_per_second": 605.122,
491
+ "eval_steps_per_second": 1.19,
492
  "step": 460
493
  },
494
  {
495
+ "epoch": 1.41,
496
  "learning_rate": 2.3500000000000002e-05,
497
+ "loss": 0.701,
498
  "step": 470
499
  },
500
  {
501
+ "epoch": 1.44,
502
  "learning_rate": 2.4e-05,
503
+ "loss": 0.6882,
504
  "step": 480
505
  },
506
  {
507
+ "epoch": 1.44,
508
+ "eval_accuracy": 0.7576797494502565,
509
+ "eval_loss": 0.5987845063209534,
510
+ "eval_runtime": 49.5123,
511
+ "eval_samples_per_second": 606.192,
512
+ "eval_steps_per_second": 1.192,
513
+ "step": 480
514
+ },
515
+ {
516
+ "epoch": 1.47,
517
  "learning_rate": 2.4500000000000003e-05,
518
+ "loss": 0.651,
519
  "step": 490
520
  },
521
  {
522
+ "epoch": 1.5,
523
  "learning_rate": 2.5e-05,
524
+ "loss": 0.7296,
525
  "step": 500
526
  },
527
  {
528
+ "epoch": 1.5,
529
+ "eval_accuracy": 0.7564136736189778,
530
+ "eval_loss": 0.5992804765701294,
531
+ "eval_runtime": 49.5641,
532
+ "eval_samples_per_second": 605.56,
533
+ "eval_steps_per_second": 1.19,
534
  "step": 500
535
  }
536
  ],
537
  "logging_steps": 10,
538
+ "max_steps": 996,
539
  "num_train_epochs": 3,
540
+ "save_steps": 100,
541
+ "total_flos": 6.73549277766615e+16,
542
  "trial_name": null,
543
  "trial_params": null
544
  }
checkpoint-500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:842788c44aca7ccbca2d57d8869e43eb15da3880b4cadb0b091bb46a3e020fcc
3
  size 4536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f1be80ebe52f6e43af0b8aa087e72fad77310d5998b6e0b8f66a6a1d53be7b7
3
  size 4536