kanishka committed on
Commit
34e82f8
·
verified ·
1 Parent(s): 9dc51d6

End of training

Browse files
Files changed (5) hide show
  1. README.md +14 -2
  2. all_results.json +14 -14
  3. eval_results.json +8 -8
  4. train_results.json +7 -7
  5. trainer_state.json +279 -279
README.md CHANGED
@@ -2,11 +2,23 @@
2
  library_name: transformers
3
  tags:
4
  - generated_from_trainer
 
 
5
  metrics:
6
  - accuracy
7
  model-index:
8
  - name: opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3
9
- results: []
 
 
 
 
 
 
 
 
 
 
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -14,7 +26,7 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  # opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3
16
 
17
- This model was trained from scratch on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
  - Loss: 2.6840
20
  - Accuracy: 0.4787
 
2
  library_name: transformers
3
  tags:
4
  - generated_from_trainer
5
+ datasets:
6
+ - kanishka/babylm2-rewritten-clean-spacy
7
  metrics:
8
  - accuracy
9
  model-index:
10
  - name: opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3
11
+ results:
12
+ - task:
13
+ name: Causal Language Modeling
14
+ type: text-generation
15
+ dataset:
16
+ name: kanishka/babylm2-rewritten-clean-spacy
17
+ type: kanishka/babylm2-rewritten-clean-spacy
18
+ metrics:
19
+ - name: Accuracy
20
+ type: accuracy
21
+ value: 0.47868057440510814
22
  ---
23
 
24
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
26
 
27
  # opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3
28
 
29
+ This model was trained from scratch on the kanishka/babylm2-rewritten-clean-spacy dataset.
30
  It achieves the following results on the evaluation set:
31
  - Loss: 2.6840
32
  - Accuracy: 0.4787
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_accuracy": 0.47877642614021604,
4
- "eval_loss": 2.6820449829101562,
5
- "eval_runtime": 71.9233,
6
- "eval_samples": 60701,
7
- "eval_samples_per_second": 843.969,
8
- "eval_steps_per_second": 13.195,
9
- "perplexity": 14.614950080315884,
10
- "total_flos": 1.50902942072832e+18,
11
- "train_loss": 2.805498681169875,
12
- "train_runtime": 30524.4807,
13
- "train_samples": 577526,
14
- "train_samples_per_second": 378.402,
15
- "train_steps_per_second": 1.478
16
  }
 
1
  {
2
+ "epoch": 19.991464360935595,
3
+ "eval_accuracy": 0.47868057440510814,
4
+ "eval_loss": 2.6840312480926514,
5
+ "eval_runtime": 71.5689,
6
+ "eval_samples": 60680,
7
+ "eval_samples_per_second": 847.854,
8
+ "eval_steps_per_second": 13.26,
9
+ "perplexity": 14.644008095713023,
10
+ "total_flos": 1.507910045663232e+18,
11
+ "train_loss": 2.8050402250099604,
12
+ "train_runtime": 30336.2728,
13
+ "train_samples": 577344,
14
+ "train_samples_per_second": 380.629,
15
+ "train_steps_per_second": 1.487
16
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_accuracy": 0.47877642614021604,
4
- "eval_loss": 2.6820449829101562,
5
- "eval_runtime": 71.9233,
6
- "eval_samples": 60701,
7
- "eval_samples_per_second": 843.969,
8
- "eval_steps_per_second": 13.195,
9
- "perplexity": 14.614950080315884
10
  }
 
1
  {
2
+ "epoch": 19.991464360935595,
3
+ "eval_accuracy": 0.47868057440510814,
4
+ "eval_loss": 2.6840312480926514,
5
+ "eval_runtime": 71.5689,
6
+ "eval_samples": 60680,
7
+ "eval_samples_per_second": 847.854,
8
+ "eval_steps_per_second": 13.26,
9
+ "perplexity": 14.644008095713023
10
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 20.0,
3
- "total_flos": 1.50902942072832e+18,
4
- "train_loss": 2.805498681169875,
5
- "train_runtime": 30524.4807,
6
- "train_samples": 577526,
7
- "train_samples_per_second": 378.402,
8
- "train_steps_per_second": 1.478
9
  }
 
1
  {
2
+ "epoch": 19.991464360935595,
3
+ "total_flos": 1.507910045663232e+18,
4
+ "train_loss": 2.8050402250099604,
5
+ "train_runtime": 30336.2728,
6
+ "train_samples": 577344,
7
+ "train_samples_per_second": 380.629,
8
+ "train_steps_per_second": 1.487
9
  }
trainer_state.json CHANGED
@@ -1,520 +1,520 @@
1
  {
2
- "best_metric": 2.6820449829101562,
3
- "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3/checkpoint-45120",
4
- "epoch": 20.0,
5
  "eval_steps": 500,
6
- "global_step": 45120,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.4432624113475177,
13
- "grad_norm": 0.469483345746994,
14
  "learning_rate": 3.125e-05,
15
- "loss": 5.5946,
16
  "step": 1000
17
  },
18
  {
19
- "epoch": 0.8865248226950354,
20
- "grad_norm": 0.6834925413131714,
21
  "learning_rate": 6.25e-05,
22
- "loss": 4.101,
23
  "step": 2000
24
  },
25
  {
26
  "epoch": 1.0,
27
- "eval_accuracy": 0.36108013855119486,
28
- "eval_loss": 3.813789129257202,
29
- "eval_runtime": 74.5004,
30
- "eval_samples_per_second": 814.774,
31
- "eval_steps_per_second": 12.738,
32
  "step": 2256
33
  },
34
  {
35
- "epoch": 1.3297872340425532,
36
- "grad_norm": 0.5719049572944641,
37
  "learning_rate": 9.375e-05,
38
- "loss": 3.7004,
39
  "step": 3000
40
  },
41
  {
42
- "epoch": 1.773049645390071,
43
- "grad_norm": 0.5357337594032288,
44
  "learning_rate": 0.000125,
45
- "loss": 3.445,
46
  "step": 4000
47
  },
48
  {
49
  "epoch": 2.0,
50
- "eval_accuracy": 0.4097865752122829,
51
- "eval_loss": 3.29646635055542,
52
- "eval_runtime": 74.2471,
53
- "eval_samples_per_second": 817.554,
54
- "eval_steps_per_second": 12.782,
55
  "step": 4512
56
  },
57
  {
58
- "epoch": 2.2163120567375887,
59
- "grad_norm": 0.5515570640563965,
60
  "learning_rate": 0.00015625,
61
- "loss": 3.2498,
62
  "step": 5000
63
  },
64
  {
65
- "epoch": 2.6595744680851063,
66
- "grad_norm": 0.5060694813728333,
67
  "learning_rate": 0.0001875,
68
- "loss": 3.1279,
69
  "step": 6000
70
  },
71
  {
72
  "epoch": 3.0,
73
- "eval_accuracy": 0.43082928827286177,
74
- "eval_loss": 3.0860400199890137,
75
- "eval_runtime": 74.3001,
76
- "eval_samples_per_second": 816.97,
77
- "eval_steps_per_second": 12.773,
78
  "step": 6768
79
  },
80
  {
81
- "epoch": 3.102836879432624,
82
- "grad_norm": 0.4440550208091736,
83
  "learning_rate": 0.00021875,
84
- "loss": 3.0384,
85
  "step": 7000
86
  },
87
  {
88
- "epoch": 3.546099290780142,
89
- "grad_norm": 0.4073255956172943,
90
  "learning_rate": 0.00025,
91
- "loss": 2.9651,
92
  "step": 8000
93
  },
94
  {
95
- "epoch": 3.9893617021276597,
96
- "grad_norm": 0.3835934102535248,
97
  "learning_rate": 0.00028125000000000003,
98
- "loss": 2.9218,
99
  "step": 9000
100
  },
101
  {
102
  "epoch": 4.0,
103
- "eval_accuracy": 0.4411697193992669,
104
- "eval_loss": 2.9823455810546875,
105
- "eval_runtime": 73.4266,
106
- "eval_samples_per_second": 826.69,
107
- "eval_steps_per_second": 12.924,
108
  "step": 9024
109
  },
110
  {
111
- "epoch": 4.432624113475177,
112
- "grad_norm": 0.35862067341804504,
113
  "learning_rate": 0.0003125,
114
- "loss": 2.8625,
115
  "step": 10000
116
  },
117
  {
118
- "epoch": 4.875886524822695,
119
- "grad_norm": 0.33855435252189636,
120
  "learning_rate": 0.00034375,
121
- "loss": 2.8441,
122
  "step": 11000
123
  },
124
  {
125
  "epoch": 5.0,
126
- "eval_accuracy": 0.44725845198790215,
127
- "eval_loss": 2.9202077388763428,
128
- "eval_runtime": 73.099,
129
- "eval_samples_per_second": 830.394,
130
- "eval_steps_per_second": 12.982,
131
  "step": 11280
132
  },
133
  {
134
- "epoch": 5.319148936170213,
135
- "grad_norm": 0.3236384987831116,
136
  "learning_rate": 0.000375,
137
- "loss": 2.7995,
138
  "step": 12000
139
  },
140
  {
141
- "epoch": 5.76241134751773,
142
- "grad_norm": 0.3051661252975464,
143
  "learning_rate": 0.00040625000000000004,
144
- "loss": 2.7865,
145
  "step": 13000
146
  },
147
  {
148
  "epoch": 6.0,
149
- "eval_accuracy": 0.4512637482794966,
150
- "eval_loss": 2.8827643394470215,
151
- "eval_runtime": 72.7776,
152
- "eval_samples_per_second": 834.061,
153
- "eval_steps_per_second": 13.04,
154
  "step": 13536
155
  },
156
  {
157
- "epoch": 6.205673758865248,
158
- "grad_norm": 0.29630789160728455,
159
  "learning_rate": 0.0004375,
160
- "loss": 2.759,
161
  "step": 14000
162
  },
163
  {
164
- "epoch": 6.648936170212766,
165
- "grad_norm": 0.27569055557250977,
166
  "learning_rate": 0.00046871875,
167
- "loss": 2.7432,
168
  "step": 15000
169
  },
170
  {
171
  "epoch": 7.0,
172
- "eval_accuracy": 0.4541487994350967,
173
- "eval_loss": 2.8589529991149902,
174
- "eval_runtime": 72.8865,
175
- "eval_samples_per_second": 832.815,
176
- "eval_steps_per_second": 13.02,
177
  "step": 15792
178
  },
179
  {
180
- "epoch": 7.092198581560283,
181
- "grad_norm": 0.2677430808544159,
182
  "learning_rate": 0.00049996875,
183
- "loss": 2.7319,
184
  "step": 16000
185
  },
186
  {
187
- "epoch": 7.535460992907802,
188
- "grad_norm": 0.2510625123977661,
189
- "learning_rate": 0.0005311875000000001,
190
- "loss": 2.7061,
191
  "step": 17000
192
  },
193
  {
194
- "epoch": 7.9787234042553195,
195
- "grad_norm": 0.23760418593883514,
196
- "learning_rate": 0.0005624375,
197
- "loss": 2.7146,
198
  "step": 18000
199
  },
200
  {
201
  "epoch": 8.0,
202
- "eval_accuracy": 0.4565537086154539,
203
- "eval_loss": 2.8377726078033447,
204
- "eval_runtime": 72.7887,
205
- "eval_samples_per_second": 833.934,
206
- "eval_steps_per_second": 13.038,
207
  "step": 18048
208
  },
209
  {
210
- "epoch": 8.421985815602836,
211
- "grad_norm": 0.23823712766170502,
212
- "learning_rate": 0.00059365625,
213
- "loss": 2.675,
214
  "step": 19000
215
  },
216
  {
217
- "epoch": 8.865248226950355,
218
- "grad_norm": 0.23026619851589203,
219
- "learning_rate": 0.00062490625,
220
- "loss": 2.6906,
221
  "step": 20000
222
  },
223
  {
224
  "epoch": 9.0,
225
- "eval_accuracy": 0.45826760614791046,
226
- "eval_loss": 2.8225581645965576,
227
- "eval_runtime": 72.5363,
228
- "eval_samples_per_second": 836.836,
229
- "eval_steps_per_second": 13.083,
230
  "step": 20304
231
  },
232
  {
233
- "epoch": 9.308510638297872,
234
- "grad_norm": 0.22488652169704437,
235
- "learning_rate": 0.000656125,
236
- "loss": 2.6598,
237
  "step": 21000
238
  },
239
  {
240
- "epoch": 9.75177304964539,
241
- "grad_norm": 0.20617271959781647,
242
- "learning_rate": 0.0006873749999999999,
243
- "loss": 2.6681,
244
  "step": 22000
245
  },
246
  {
247
  "epoch": 10.0,
248
- "eval_accuracy": 0.459518546549771,
249
- "eval_loss": 2.813441753387451,
250
- "eval_runtime": 72.4442,
251
- "eval_samples_per_second": 837.9,
252
- "eval_steps_per_second": 13.1,
253
  "step": 22560
254
  },
255
  {
256
- "epoch": 10.195035460992909,
257
- "grad_norm": 0.20597966015338898,
258
- "learning_rate": 0.000718625,
259
- "loss": 2.6559,
260
  "step": 23000
261
  },
262
  {
263
- "epoch": 10.638297872340425,
264
- "grad_norm": 0.21323370933532715,
265
- "learning_rate": 0.0007498437500000001,
266
- "loss": 2.6498,
267
  "step": 24000
268
  },
269
  {
270
  "epoch": 11.0,
271
- "eval_accuracy": 0.4604723054276652,
272
- "eval_loss": 2.8047826290130615,
273
- "eval_runtime": 72.8612,
274
- "eval_samples_per_second": 833.105,
275
- "eval_steps_per_second": 13.025,
276
  "step": 24816
277
  },
278
  {
279
- "epoch": 11.081560283687944,
280
- "grad_norm": 0.21533997356891632,
281
- "learning_rate": 0.00078109375,
282
- "loss": 2.6487,
283
  "step": 25000
284
  },
285
  {
286
- "epoch": 11.52482269503546,
287
- "grad_norm": 0.21542951464653015,
288
- "learning_rate": 0.0008123125,
289
- "loss": 2.6323,
290
  "step": 26000
291
  },
292
  {
293
- "epoch": 11.96808510638298,
294
- "grad_norm": 0.19053979218006134,
295
- "learning_rate": 0.0008435625,
296
- "loss": 2.6497,
297
  "step": 27000
298
  },
299
  {
300
  "epoch": 12.0,
301
- "eval_accuracy": 0.46156735473880167,
302
- "eval_loss": 2.7969932556152344,
303
- "eval_runtime": 72.5419,
304
- "eval_samples_per_second": 836.771,
305
- "eval_steps_per_second": 13.082,
306
  "step": 27072
307
  },
308
  {
309
- "epoch": 12.411347517730496,
310
- "grad_norm": 0.18872858583927155,
311
- "learning_rate": 0.00087478125,
312
- "loss": 2.6167,
313
  "step": 28000
314
  },
315
  {
316
- "epoch": 12.854609929078014,
317
- "grad_norm": 0.1738893836736679,
318
- "learning_rate": 0.0009060312499999999,
319
- "loss": 2.6375,
320
  "step": 29000
321
  },
322
  {
323
  "epoch": 13.0,
324
- "eval_accuracy": 0.4621078374843455,
325
- "eval_loss": 2.7915232181549072,
326
- "eval_runtime": 72.7216,
327
- "eval_samples_per_second": 834.704,
328
- "eval_steps_per_second": 13.05,
329
  "step": 29328
330
  },
331
  {
332
- "epoch": 13.297872340425531,
333
- "grad_norm": 0.1877707690000534,
334
- "learning_rate": 0.00093725,
335
- "loss": 2.6153,
336
  "step": 30000
337
  },
338
  {
339
- "epoch": 13.74113475177305,
340
- "grad_norm": 0.186727836728096,
341
- "learning_rate": 0.0009685000000000001,
342
- "loss": 2.6278,
343
  "step": 31000
344
  },
345
  {
346
  "epoch": 14.0,
347
- "eval_accuracy": 0.462822946677559,
348
- "eval_loss": 2.786189317703247,
349
- "eval_runtime": 72.4588,
350
- "eval_samples_per_second": 837.731,
351
- "eval_steps_per_second": 13.097,
352
  "step": 31584
353
  },
354
  {
355
- "epoch": 14.184397163120567,
356
- "grad_norm": 0.20270851254463196,
357
- "learning_rate": 0.00099971875,
358
- "loss": 2.6168,
359
  "step": 32000
360
  },
361
  {
362
- "epoch": 14.627659574468085,
363
- "grad_norm": 0.17972639203071594,
364
- "learning_rate": 0.0009244664634146341,
365
- "loss": 2.6102,
366
  "step": 33000
367
  },
368
  {
369
  "epoch": 15.0,
370
- "eval_accuracy": 0.46529976086578023,
371
- "eval_loss": 2.763315200805664,
372
- "eval_runtime": 72.4263,
373
- "eval_samples_per_second": 838.107,
374
- "eval_steps_per_second": 13.103,
375
  "step": 33840
376
  },
377
  {
378
- "epoch": 15.070921985815604,
379
- "grad_norm": 0.17892582714557648,
380
- "learning_rate": 0.0008483231707317073,
381
- "loss": 2.6,
382
  "step": 34000
383
  },
384
  {
385
- "epoch": 15.51418439716312,
386
- "grad_norm": 0.16611941158771515,
387
- "learning_rate": 0.0007721036585365854,
388
- "loss": 2.5602,
389
  "step": 35000
390
  },
391
  {
392
- "epoch": 15.957446808510639,
393
- "grad_norm": 0.1676749587059021,
394
- "learning_rate": 0.0006960365853658537,
395
- "loss": 2.5668,
396
  "step": 36000
397
  },
398
  {
399
  "epoch": 16.0,
400
- "eval_accuracy": 0.46856268478957125,
401
- "eval_loss": 2.739426374435425,
402
- "eval_runtime": 72.5666,
403
- "eval_samples_per_second": 836.486,
404
- "eval_steps_per_second": 13.078,
405
  "step": 36096
406
  },
407
  {
408
- "epoch": 16.400709219858157,
409
- "grad_norm": 0.17558415234088898,
410
- "learning_rate": 0.0006198170731707318,
411
- "loss": 2.5115,
412
  "step": 37000
413
  },
414
  {
415
- "epoch": 16.843971631205672,
416
- "grad_norm": 0.1874464899301529,
417
- "learning_rate": 0.0005435975609756098,
418
- "loss": 2.5178,
419
  "step": 38000
420
  },
421
  {
422
  "epoch": 17.0,
423
- "eval_accuracy": 0.47171248592021775,
424
- "eval_loss": 2.7182633876800537,
425
- "eval_runtime": 72.699,
426
- "eval_samples_per_second": 834.964,
427
- "eval_steps_per_second": 13.054,
428
  "step": 38352
429
  },
430
  {
431
- "epoch": 17.28723404255319,
432
- "grad_norm": 0.18552443385124207,
433
- "learning_rate": 0.00046745426829268295,
434
- "loss": 2.4731,
435
  "step": 39000
436
  },
437
  {
438
- "epoch": 17.73049645390071,
439
- "grad_norm": 0.1822243332862854,
440
- "learning_rate": 0.000391234756097561,
441
- "loss": 2.462,
442
  "step": 40000
443
  },
444
  {
445
  "epoch": 18.0,
446
- "eval_accuracy": 0.4747758459901975,
447
- "eval_loss": 2.697719097137451,
448
- "eval_runtime": 72.2911,
449
- "eval_samples_per_second": 839.675,
450
- "eval_steps_per_second": 13.127,
451
  "step": 40608
452
  },
453
  {
454
- "epoch": 18.173758865248228,
455
- "grad_norm": 0.18331420421600342,
456
- "learning_rate": 0.000315015243902439,
457
- "loss": 2.4319,
458
  "step": 41000
459
  },
460
  {
461
- "epoch": 18.617021276595743,
462
- "grad_norm": 0.19214719533920288,
463
- "learning_rate": 0.00023879573170731708,
464
- "loss": 2.3974,
465
  "step": 42000
466
  },
467
  {
468
  "epoch": 19.0,
469
- "eval_accuracy": 0.4773433651479076,
470
- "eval_loss": 2.683954954147339,
471
- "eval_runtime": 72.4404,
472
- "eval_samples_per_second": 837.944,
473
- "eval_steps_per_second": 13.1,
474
  "step": 42864
475
  },
476
  {
477
- "epoch": 19.06028368794326,
478
- "grad_norm": 0.19565586745738983,
479
- "learning_rate": 0.00016265243902439025,
480
- "loss": 2.3863,
481
  "step": 43000
482
  },
483
  {
484
- "epoch": 19.50354609929078,
485
- "grad_norm": 0.19951286911964417,
486
- "learning_rate": 8.643292682926828e-05,
487
- "loss": 2.3273,
488
  "step": 44000
489
  },
490
  {
491
- "epoch": 19.9468085106383,
492
- "grad_norm": 0.19867576658725739,
493
- "learning_rate": 1.0213414634146342e-05,
494
- "loss": 2.3259,
495
  "step": 45000
496
  },
497
  {
498
- "epoch": 20.0,
499
- "eval_accuracy": 0.47877642614021604,
500
- "eval_loss": 2.6820449829101562,
501
- "eval_runtime": 72.6976,
502
- "eval_samples_per_second": 834.98,
503
- "eval_steps_per_second": 13.054,
504
- "step": 45120
505
  },
506
  {
507
- "epoch": 20.0,
508
- "step": 45120,
509
- "total_flos": 1.50902942072832e+18,
510
- "train_loss": 2.805498681169875,
511
- "train_runtime": 30524.4807,
512
- "train_samples_per_second": 378.402,
513
- "train_steps_per_second": 1.478
514
  }
515
  ],
516
  "logging_steps": 1000,
517
- "max_steps": 45120,
518
  "num_input_tokens_seen": 0,
519
  "num_train_epochs": 20,
520
  "save_steps": 500,
@@ -539,7 +539,7 @@
539
  "attributes": {}
540
  }
541
  },
542
- "total_flos": 1.50902942072832e+18,
543
  "train_batch_size": 32,
544
  "trial_name": null,
545
  "trial_params": null
 
1
  {
2
+ "best_metric": 2.6840312480926514,
3
+ "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3/checkpoint-45100",
4
+ "epoch": 19.991464360935595,
5
  "eval_steps": 500,
6
+ "global_step": 45100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.4434098215275468,
13
+ "grad_norm": 0.4883245825767517,
14
  "learning_rate": 3.125e-05,
15
+ "loss": 5.5896,
16
  "step": 1000
17
  },
18
  {
19
+ "epoch": 0.8868196430550936,
20
+ "grad_norm": 0.6184232831001282,
21
  "learning_rate": 6.25e-05,
22
+ "loss": 4.1044,
23
  "step": 2000
24
  },
25
  {
26
  "epoch": 1.0,
27
+ "eval_accuracy": 0.3604034665942844,
28
+ "eval_loss": 3.820427417755127,
29
+ "eval_runtime": 74.0095,
30
+ "eval_samples_per_second": 819.894,
31
+ "eval_steps_per_second": 12.823,
32
  "step": 2256
33
  },
34
  {
35
+ "epoch": 1.3298969072164948,
36
+ "grad_norm": 0.5272237658500671,
37
  "learning_rate": 9.375e-05,
38
+ "loss": 3.6981,
39
  "step": 3000
40
  },
41
  {
42
+ "epoch": 1.7733067287440418,
43
+ "grad_norm": 0.580745279788971,
44
  "learning_rate": 0.000125,
45
+ "loss": 3.4457,
46
  "step": 4000
47
  },
48
  {
49
  "epoch": 2.0,
50
+ "eval_accuracy": 0.4093216099887549,
51
+ "eval_loss": 3.304572105407715,
52
+ "eval_runtime": 74.3697,
53
+ "eval_samples_per_second": 815.923,
54
+ "eval_steps_per_second": 12.761,
55
  "step": 4512
56
  },
57
  {
58
+ "epoch": 2.2163839929054427,
59
+ "grad_norm": 0.5752166509628296,
60
  "learning_rate": 0.00015625,
61
+ "loss": 3.2482,
62
  "step": 5000
63
  },
64
  {
65
+ "epoch": 2.6597938144329896,
66
+ "grad_norm": 0.45855629444122314,
67
  "learning_rate": 0.0001875,
68
+ "loss": 3.13,
69
  "step": 6000
70
  },
71
  {
72
  "epoch": 3.0,
73
+ "eval_accuracy": 0.42987378339602156,
74
+ "eval_loss": 3.0944786071777344,
75
+ "eval_runtime": 73.3184,
76
+ "eval_samples_per_second": 827.624,
77
+ "eval_steps_per_second": 12.944,
78
  "step": 6768
79
  },
80
  {
81
+ "epoch": 3.102871078594391,
82
+ "grad_norm": 0.4158306419849396,
83
  "learning_rate": 0.00021875,
84
+ "loss": 3.0338,
85
  "step": 7000
86
  },
87
  {
88
+ "epoch": 3.5462809001219378,
89
+ "grad_norm": 0.3917515277862549,
90
  "learning_rate": 0.00025,
91
+ "loss": 2.9667,
92
  "step": 8000
93
  },
94
  {
95
+ "epoch": 3.9896907216494846,
96
+ "grad_norm": 0.3958011865615845,
97
  "learning_rate": 0.00028125000000000003,
98
+ "loss": 2.9219,
99
  "step": 9000
100
  },
101
  {
102
  "epoch": 4.0,
103
+ "eval_accuracy": 0.4403801362337948,
104
+ "eval_loss": 2.988952398300171,
105
+ "eval_runtime": 72.5536,
106
+ "eval_samples_per_second": 836.347,
107
+ "eval_steps_per_second": 13.08,
108
  "step": 9024
109
  },
110
  {
111
+ "epoch": 4.4327679858108855,
112
+ "grad_norm": 0.3376877009868622,
113
  "learning_rate": 0.0003125,
114
+ "loss": 2.8585,
115
  "step": 10000
116
  },
117
  {
118
+ "epoch": 4.876177807338433,
119
+ "grad_norm": 0.32727962732315063,
120
  "learning_rate": 0.00034375,
121
+ "loss": 2.8444,
122
  "step": 11000
123
  },
124
  {
125
  "epoch": 5.0,
126
+ "eval_accuracy": 0.44664624452285856,
127
+ "eval_loss": 2.928157091140747,
128
+ "eval_runtime": 72.5793,
129
+ "eval_samples_per_second": 836.051,
130
+ "eval_steps_per_second": 13.075,
131
  "step": 11280
132
  },
133
  {
134
+ "epoch": 5.319255071499834,
135
+ "grad_norm": 0.32956644892692566,
136
  "learning_rate": 0.000375,
137
+ "loss": 2.7978,
138
  "step": 12000
139
  },
140
  {
141
+ "epoch": 5.762664893027381,
142
+ "grad_norm": 0.3080673813819885,
143
  "learning_rate": 0.00040625000000000004,
144
+ "loss": 2.7883,
145
  "step": 13000
146
  },
147
  {
148
  "epoch": 6.0,
149
+ "eval_accuracy": 0.4507549730505254,
150
+ "eval_loss": 2.8910350799560547,
151
+ "eval_runtime": 72.5685,
152
+ "eval_samples_per_second": 836.175,
153
+ "eval_steps_per_second": 13.077,
154
  "step": 13536
155
  },
156
  {
157
+ "epoch": 6.205742157188782,
158
+ "grad_norm": 0.2959093153476715,
159
  "learning_rate": 0.0004375,
160
+ "loss": 2.7566,
161
  "step": 14000
162
  },
163
  {
164
+ "epoch": 6.649151978716328,
165
+ "grad_norm": 0.29388415813446045,
166
  "learning_rate": 0.00046871875,
167
+ "loss": 2.7434,
168
  "step": 15000
169
  },
170
  {
171
  "epoch": 7.0,
172
+ "eval_accuracy": 0.4544950043300115,
173
+ "eval_loss": 2.8579459190368652,
174
+ "eval_runtime": 72.4898,
175
+ "eval_samples_per_second": 837.083,
176
+ "eval_steps_per_second": 13.091,
177
  "step": 15792
178
  },
179
  {
180
+ "epoch": 7.09222924287773,
181
+ "grad_norm": 0.27015742659568787,
182
  "learning_rate": 0.00049996875,
183
+ "loss": 2.7294,
184
  "step": 16000
185
  },
186
  {
187
+ "epoch": 7.535639064405276,
188
+ "grad_norm": 0.2585032880306244,
189
+ "learning_rate": 0.00053121875,
190
+ "loss": 2.7057,
191
  "step": 17000
192
  },
193
  {
194
+ "epoch": 7.979048885932824,
195
+ "grad_norm": 0.26894038915634155,
196
+ "learning_rate": 0.0005624687499999999,
197
+ "loss": 2.7158,
198
  "step": 18000
199
  },
200
  {
201
  "epoch": 8.0,
202
+ "eval_accuracy": 0.4559880181472721,
203
+ "eval_loss": 2.842834949493408,
204
+ "eval_runtime": 72.6498,
205
+ "eval_samples_per_second": 835.24,
206
+ "eval_steps_per_second": 13.063,
207
  "step": 18048
208
  },
209
  {
210
+ "epoch": 8.422126150094225,
211
+ "grad_norm": 0.24038437008857727,
212
+ "learning_rate": 0.0005936875,
213
+ "loss": 2.6733,
214
  "step": 19000
215
  },
216
  {
217
+ "epoch": 8.865535971621771,
218
+ "grad_norm": 0.22421102225780487,
219
+ "learning_rate": 0.0006249375000000001,
220
+ "loss": 2.6905,
221
  "step": 20000
222
  },
223
  {
224
  "epoch": 9.0,
225
+ "eval_accuracy": 0.4572794602349839,
226
+ "eval_loss": 2.8298442363739014,
227
+ "eval_runtime": 72.6455,
228
+ "eval_samples_per_second": 835.29,
229
+ "eval_steps_per_second": 13.063,
230
  "step": 20304
231
  },
232
  {
233
+ "epoch": 9.308613235783172,
234
+ "grad_norm": 0.22955693304538727,
235
+ "learning_rate": 0.0006561562500000001,
236
+ "loss": 2.6582,
237
  "step": 21000
238
  },
239
  {
240
+ "epoch": 9.75202305731072,
241
+ "grad_norm": 0.20538607239723206,
242
+ "learning_rate": 0.00068740625,
243
+ "loss": 2.6697,
244
  "step": 22000
245
  },
246
  {
247
  "epoch": 10.0,
248
+ "eval_accuracy": 0.45919856010960747,
249
+ "eval_loss": 2.816859006881714,
250
+ "eval_runtime": 72.615,
251
+ "eval_samples_per_second": 835.64,
252
+ "eval_steps_per_second": 13.069,
253
  "step": 22560
254
  },
255
  {
256
+ "epoch": 10.19510032147212,
257
+ "grad_norm": 0.2177572250366211,
258
+ "learning_rate": 0.00071865625,
259
+ "loss": 2.6506,
260
  "step": 23000
261
  },
262
  {
263
+ "epoch": 10.638510142999667,
264
+ "grad_norm": 0.2029583603143692,
265
+ "learning_rate": 0.000749875,
266
+ "loss": 2.6509,
267
  "step": 24000
268
  },
269
  {
270
  "epoch": 11.0,
271
+ "eval_accuracy": 0.46011432522910284,
272
+ "eval_loss": 2.807971477508545,
273
+ "eval_runtime": 72.7484,
274
+ "eval_samples_per_second": 834.107,
275
+ "eval_steps_per_second": 13.045,
276
  "step": 24816
277
  },
278
  {
279
+ "epoch": 11.081587407161068,
280
+ "grad_norm": 0.2110850214958191,
281
+ "learning_rate": 0.000781125,
282
+ "loss": 2.6497,
283
  "step": 25000
284
  },
285
  {
286
+ "epoch": 11.524997228688616,
287
+ "grad_norm": 0.19248805940151215,
288
+ "learning_rate": 0.000812375,
289
+ "loss": 2.6322,
290
  "step": 26000
291
  },
292
  {
293
+ "epoch": 11.968407050216163,
294
+ "grad_norm": 0.18789444863796234,
295
+ "learning_rate": 0.00084359375,
296
+ "loss": 2.6494,
297
  "step": 27000
298
  },
299
  {
300
  "epoch": 12.0,
301
+ "eval_accuracy": 0.4606821383794124,
302
+ "eval_loss": 2.8019886016845703,
303
+ "eval_runtime": 72.5563,
304
+ "eval_samples_per_second": 836.316,
305
+ "eval_steps_per_second": 13.079,
306
  "step": 27072
307
  },
308
  {
309
+ "epoch": 12.411484314377564,
310
+ "grad_norm": 0.1861707717180252,
311
+ "learning_rate": 0.0008748437500000001,
312
+ "loss": 2.6148,
313
  "step": 28000
314
  },
315
  {
316
+ "epoch": 12.854894135905111,
317
+ "grad_norm": 0.18803346157073975,
318
+ "learning_rate": 0.0009060625,
319
+ "loss": 2.6384,
320
  "step": 29000
321
  },
322
  {
323
  "epoch": 13.0,
324
+ "eval_accuracy": 0.46163558106169295,
325
+ "eval_loss": 2.7958271503448486,
326
+ "eval_runtime": 72.8339,
327
+ "eval_samples_per_second": 833.128,
328
+ "eval_steps_per_second": 13.03,
329
  "step": 29328
330
  },
331
  {
332
+ "epoch": 13.297971400066512,
333
+ "grad_norm": 0.1761549860239029,
334
+ "learning_rate": 0.0009373125,
335
+ "loss": 2.6142,
336
  "step": 30000
337
  },
338
  {
339
+ "epoch": 13.741381221594057,
340
+ "grad_norm": 0.1844184547662735,
341
+ "learning_rate": 0.00096853125,
342
+ "loss": 2.6297,
343
  "step": 31000
344
  },
345
  {
346
  "epoch": 14.0,
347
+ "eval_accuracy": 0.46196660074708856,
348
+ "eval_loss": 2.7939445972442627,
349
+ "eval_runtime": 72.6804,
350
+ "eval_samples_per_second": 834.888,
351
+ "eval_steps_per_second": 13.057,
352
  "step": 31584
353
  },
354
  {
355
+ "epoch": 14.18445848575546,
356
+ "grad_norm": 0.1886565387248993,
357
+ "learning_rate": 0.00099978125,
358
+ "loss": 2.6147,
359
  "step": 32000
360
  },
361
  {
362
+ "epoch": 14.627868307283006,
363
+ "grad_norm": 0.17688792943954468,
364
+ "learning_rate": 0.0009241984732824427,
365
+ "loss": 2.612,
366
  "step": 33000
367
  },
368
  {
369
  "epoch": 15.0,
370
+ "eval_accuracy": 0.4653180942779221,
371
+ "eval_loss": 2.764906167984009,
372
+ "eval_runtime": 72.6592,
373
+ "eval_samples_per_second": 835.131,
374
+ "eval_steps_per_second": 13.061,
375
  "step": 33840
376
  },
377
  {
378
+ "epoch": 15.070945571444408,
379
+ "grad_norm": 0.19411760568618774,
380
+ "learning_rate": 0.00084793893129771,
381
+ "loss": 2.5952,
382
  "step": 34000
383
  },
384
  {
385
+ "epoch": 15.514355392971954,
386
+ "grad_norm": 0.17588546872138977,
387
+ "learning_rate": 0.0007716793893129771,
388
+ "loss": 2.5635,
389
  "step": 35000
390
  },
391
  {
392
+ "epoch": 15.957765214499501,
393
+ "grad_norm": 0.17366230487823486,
394
+ "learning_rate": 0.0006953435114503817,
395
+ "loss": 2.5667,
396
  "step": 36000
397
  },
398
  {
399
  "epoch": 16.0,
400
+ "eval_accuracy": 0.4685543578011297,
401
+ "eval_loss": 2.7425177097320557,
402
+ "eval_runtime": 72.8257,
403
+ "eval_samples_per_second": 833.222,
404
+ "eval_steps_per_second": 13.031,
405
  "step": 36096
406
  },
407
  {
408
+ "epoch": 16.400842478660902,
409
+ "grad_norm": 0.18736566603183746,
410
+ "learning_rate": 0.0006190076335877863,
411
+ "loss": 2.5093,
412
  "step": 37000
413
  },
414
  {
415
+ "epoch": 16.84425230018845,
416
+ "grad_norm": 0.18060249090194702,
417
+ "learning_rate": 0.0005426717557251909,
418
+ "loss": 2.5177,
419
  "step": 38000
420
  },
421
  {
422
  "epoch": 17.0,
423
+ "eval_accuracy": 0.4714397611384699,
424
+ "eval_loss": 2.7205777168273926,
425
+ "eval_runtime": 72.8204,
426
+ "eval_samples_per_second": 833.283,
427
+ "eval_steps_per_second": 13.032,
428
  "step": 38352
429
  },
430
  {
431
+ "epoch": 17.28732956434985,
432
+ "grad_norm": 0.1875220090150833,
433
+ "learning_rate": 0.000466412213740458,
434
+ "loss": 2.4733,
435
  "step": 39000
436
  },
437
  {
438
+ "epoch": 17.730739385877396,
439
+ "grad_norm": 0.18848279118537903,
440
+ "learning_rate": 0.00039007633587786263,
441
+ "loss": 2.4607,
442
  "step": 40000
443
  },
444
  {
445
  "epoch": 18.0,
446
+ "eval_accuracy": 0.47464791190042266,
447
+ "eval_loss": 2.699930429458618,
448
+ "eval_runtime": 72.4963,
449
+ "eval_samples_per_second": 837.008,
450
+ "eval_steps_per_second": 13.09,
451
  "step": 40608
452
  },
453
  {
454
+ "epoch": 18.1738166500388,
455
+ "grad_norm": 0.19309544563293457,
456
+ "learning_rate": 0.0003138167938931298,
457
+ "loss": 2.43,
458
  "step": 41000
459
  },
460
  {
461
+ "epoch": 18.617226471566344,
462
+ "grad_norm": 0.1929185390472412,
463
+ "learning_rate": 0.00023748091603053434,
464
+ "loss": 2.397,
465
  "step": 42000
466
  },
467
  {
468
  "epoch": 19.0,
469
+ "eval_accuracy": 0.4773406620393708,
470
+ "eval_loss": 2.6864736080169678,
471
+ "eval_runtime": 72.792,
472
+ "eval_samples_per_second": 833.608,
473
+ "eval_steps_per_second": 13.037,
474
  "step": 42864
475
  },
476
  {
477
+ "epoch": 19.060303735727746,
478
+ "grad_norm": 0.20593929290771484,
479
+ "learning_rate": 0.00016114503816793893,
480
+ "loss": 2.3837,
481
  "step": 43000
482
  },
483
  {
484
+ "epoch": 19.503713557255292,
485
+ "grad_norm": 0.2001897543668747,
486
+ "learning_rate": 8.480916030534351e-05,
487
+ "loss": 2.3276,
488
  "step": 44000
489
  },
490
  {
491
+ "epoch": 19.94712337878284,
492
+ "grad_norm": 0.19933941960334778,
493
+ "learning_rate": 8.549618320610688e-06,
494
+ "loss": 2.3241,
495
  "step": 45000
496
  },
497
  {
498
+ "epoch": 19.991464360935595,
499
+ "eval_accuracy": 0.47868057440510814,
500
+ "eval_loss": 2.6840312480926514,
501
+ "eval_runtime": 72.9711,
502
+ "eval_samples_per_second": 831.562,
503
+ "eval_steps_per_second": 13.005,
504
+ "step": 45100
505
  },
506
  {
507
+ "epoch": 19.991464360935595,
508
+ "step": 45100,
509
+ "total_flos": 1.507910045663232e+18,
510
+ "train_loss": 2.8050402250099604,
511
+ "train_runtime": 30336.2728,
512
+ "train_samples_per_second": 380.629,
513
+ "train_steps_per_second": 1.487
514
  }
515
  ],
516
  "logging_steps": 1000,
517
+ "max_steps": 45100,
518
  "num_input_tokens_seen": 0,
519
  "num_train_epochs": 20,
520
  "save_steps": 500,
 
539
  "attributes": {}
540
  }
541
  },
542
+ "total_flos": 1.507910045663232e+18,
543
  "train_batch_size": 32,
544
  "trial_name": null,
545
  "trial_params": null