Fill-Mask
Transformers
PyTorch
English
bert
Inference Endpoints
sequoiaandrade commited on
Commit
dbe0879
·
1 Parent(s): 6d3e2ce

updated model files

Browse files
config.json CHANGED
@@ -19,7 +19,7 @@
19
  "pad_token_id": 0,
20
  "position_embedding_type": "absolute",
21
  "torch_dtype": "float32",
22
- "transformers_version": "4.23.1",
23
  "type_vocab_size": 2,
24
  "use_cache": true,
25
  "vocab_size": 30522
 
19
  "pad_token_id": 0,
20
  "position_embedding_type": "absolute",
21
  "torch_dtype": "float32",
22
+ "transformers_version": "4.33.3",
23
  "type_vocab_size": 2,
24
  "use_cache": true,
25
  "vocab_size": 30522
generation_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "pad_token_id": 0,
4
+ "transformers_version": "4.33.3"
5
+ }
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4bae74914920b452d71e40aadcab35331f74b320cf51e9c2b55082954b42eb89
3
- size 1554433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3f31febe1dd0cc26397d30f312c151db6acc0fd9b12d444e3ae813896e2da0e
3
+ size 1555717
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9154208c846c82d1c5f71a458961aa6d9645432eb51b7a80dd55c9cc21697f59
3
- size 438128811
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:526e1af0a8d0f41e583eff223781a02d05475392c007c54436302e577c644a7b
3
+ size 438126133
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b7bf20eb436b3197418c8b18cde94c14592eda2d8826cc6ffe4c4e48eba4cbb
3
- size 16543
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d11b1e775e8d24ee8aa73e2be08c7d3442a71a9b3343c20d8cea19f6c0499c28
3
+ size 14575
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c8642e2ea00a020f1667e4c75ffeb51b55a74217527808f110fd5e8eb1a30a72
3
- size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab3169029a398e76d6013cadd402227ef5f345365c777edfd7ad382de6b99270
3
+ size 627
tokenizer.json CHANGED
@@ -1,21 +1,7 @@
1
  {
2
  "version": "1.0",
3
- "truncation": {
4
- "direction": "Right",
5
- "max_length": 512,
6
- "strategy": "LongestFirst",
7
- "stride": 0
8
- },
9
- "padding": {
10
- "strategy": {
11
- "Fixed": 512
12
- },
13
- "direction": "Right",
14
- "pad_to_multiple_of": null,
15
- "pad_id": 0,
16
- "pad_type_id": 0,
17
- "pad_token": "[PAD]"
18
- },
19
  "added_tokens": [
20
  {
21
  "id": 0,
 
1
  {
2
  "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
tokenizer_config.json CHANGED
@@ -1,14 +1,13 @@
1
  {
 
2
  "cls_token": "[CLS]",
3
  "do_lower_case": true,
4
  "mask_token": "[MASK]",
5
  "model_max_length": 512,
6
- "name_or_path": "bert-base-uncased",
7
  "pad_token": "[PAD]",
8
  "padding": "max_length",
9
  "return_special_tokens_mask": true,
10
  "sep_token": "[SEP]",
11
- "special_tokens_map_file": null,
12
  "strip_accents": null,
13
  "tokenize_chinese_chars": true,
14
  "tokenizer_class": "BertTokenizer",
 
1
  {
2
+ "clean_up_tokenization_spaces": true,
3
  "cls_token": "[CLS]",
4
  "do_lower_case": true,
5
  "mask_token": "[MASK]",
6
  "model_max_length": 512,
 
7
  "pad_token": "[PAD]",
8
  "padding": "max_length",
9
  "return_special_tokens_mask": true,
10
  "sep_token": "[SEP]",
 
11
  "strip_accents": null,
12
  "tokenize_chinese_chars": true,
13
  "tokenizer_class": "BertTokenizer",
trainer_state.json CHANGED
@@ -1,796 +1,2457 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.187168922895821,
5
- "global_step": 13005,
 
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.02,
12
- "learning_rate": 9.986209216279854e-06,
13
- "loss": 1.5736,
14
  "step": 100
15
  },
16
  {
17
- "epoch": 0.03,
18
- "learning_rate": 9.974100235452405e-06,
19
- "loss": 2.2457,
20
  "step": 200
21
  },
22
  {
23
- "epoch": 0.05,
24
- "learning_rate": 9.961823074335689e-06,
25
- "loss": 1.5859,
26
  "step": 300
27
  },
28
  {
29
- "epoch": 0.07,
30
- "learning_rate": 9.948705011772622e-06,
31
- "loss": 1.728,
32
  "step": 400
33
  },
34
  {
35
- "epoch": 0.08,
36
- "learning_rate": 9.938109653548606e-06,
37
- "loss": 1.7661,
38
  "step": 500
39
  },
40
  {
41
- "epoch": 0.1,
42
- "learning_rate": 9.927009754456779e-06,
43
- "loss": 2.3678209948440125e+23,
 
 
 
 
 
 
 
 
44
  "step": 600
45
  },
46
  {
47
- "epoch": 0.12,
48
- "learning_rate": 9.910191725529768e-06,
49
- "loss": 0.0,
50
  "step": 700
51
  },
52
  {
53
- "epoch": 0.13,
54
- "learning_rate": 9.893373696602759e-06,
55
- "loss": 0.0,
56
  "step": 800
57
  },
58
  {
59
- "epoch": 0.15,
60
- "learning_rate": 9.876555667675748e-06,
61
- "loss": 0.0,
62
  "step": 900
63
  },
64
  {
65
- "epoch": 0.17,
66
- "learning_rate": 9.85973763874874e-06,
67
- "loss": 0.0,
68
  "step": 1000
69
  },
70
  {
71
- "epoch": 0.18,
72
- "learning_rate": 9.84291960982173e-06,
73
- "loss": 0.0,
 
 
 
 
 
 
 
 
74
  "step": 1100
75
  },
76
  {
77
- "epoch": 0.2,
78
- "learning_rate": 9.826101580894721e-06,
79
- "loss": 0.0,
80
  "step": 1200
81
  },
82
  {
83
- "epoch": 0.22,
84
- "learning_rate": 9.80928355196771e-06,
85
- "loss": 0.0,
86
  "step": 1300
87
  },
88
  {
89
- "epoch": 0.24,
90
- "learning_rate": 9.792465523040701e-06,
91
- "loss": 0.0,
92
  "step": 1400
93
  },
94
  {
95
- "epoch": 0.25,
96
- "learning_rate": 9.77564749411369e-06,
97
- "loss": 0.0,
98
  "step": 1500
99
  },
100
  {
101
- "epoch": 0.27,
102
- "learning_rate": 9.758829465186681e-06,
103
- "loss": 0.0,
 
 
 
 
 
 
 
 
104
  "step": 1600
105
  },
106
  {
107
- "epoch": 0.29,
108
- "learning_rate": 9.74201143625967e-06,
109
- "loss": 0.0,
110
  "step": 1700
111
  },
112
  {
113
- "epoch": 0.3,
114
- "learning_rate": 9.725193407332661e-06,
115
- "loss": 0.0,
116
  "step": 1800
117
  },
118
  {
119
- "epoch": 0.32,
120
- "learning_rate": 9.70837537840565e-06,
121
- "loss": 0.0,
122
  "step": 1900
123
  },
124
  {
125
- "epoch": 0.34,
126
- "learning_rate": 9.691557349478641e-06,
127
- "loss": 0.0,
128
  "step": 2000
129
  },
130
  {
131
- "epoch": 0.35,
132
- "learning_rate": 9.674739320551632e-06,
133
- "loss": 0.0,
 
 
 
 
 
 
 
 
134
  "step": 2100
135
  },
136
  {
137
- "epoch": 0.37,
138
- "learning_rate": 9.657921291624623e-06,
139
- "loss": 0.0,
140
  "step": 2200
141
  },
142
  {
143
- "epoch": 0.39,
144
- "learning_rate": 9.641103262697614e-06,
145
- "loss": 0.0,
146
  "step": 2300
147
  },
148
  {
149
- "epoch": 0.4,
150
- "learning_rate": 9.624285233770603e-06,
151
- "loss": 0.0,
152
  "step": 2400
153
  },
154
  {
155
- "epoch": 0.42,
156
- "learning_rate": 9.607467204843594e-06,
157
- "loss": 0.0,
158
  "step": 2500
159
  },
160
  {
161
- "epoch": 0.44,
162
- "learning_rate": 9.590649175916583e-06,
163
- "loss": 0.0,
 
 
 
 
 
 
 
 
164
  "step": 2600
165
  },
166
  {
167
- "epoch": 0.45,
168
- "learning_rate": 9.573831146989574e-06,
169
- "loss": 0.0,
170
  "step": 2700
171
  },
172
  {
173
- "epoch": 0.47,
174
- "learning_rate": 9.557013118062563e-06,
175
- "loss": 0.0,
176
  "step": 2800
177
  },
178
  {
179
- "epoch": 0.49,
180
- "learning_rate": 9.540195089135554e-06,
181
- "loss": 0.0,
182
  "step": 2900
183
  },
184
  {
185
- "epoch": 0.5,
186
- "learning_rate": 9.523377060208543e-06,
187
- "loss": 0.0,
188
  "step": 3000
189
  },
190
  {
191
- "epoch": 0.52,
192
- "learning_rate": 9.506559031281534e-06,
193
- "loss": 0.0,
 
 
 
 
 
 
 
 
194
  "step": 3100
195
  },
196
  {
197
- "epoch": 0.54,
198
- "learning_rate": 9.489741002354525e-06,
199
- "loss": 0.0,
200
  "step": 3200
201
  },
202
  {
203
- "epoch": 0.55,
204
- "learning_rate": 9.472922973427516e-06,
205
- "loss": 0.0,
206
  "step": 3300
207
  },
208
  {
209
- "epoch": 0.57,
210
- "learning_rate": 9.456104944500505e-06,
211
- "loss": 0.0,
212
  "step": 3400
213
  },
214
  {
215
- "epoch": 0.59,
216
- "learning_rate": 9.439286915573496e-06,
217
- "loss": 0.0,
218
  "step": 3500
219
  },
220
  {
221
- "epoch": 0.61,
222
- "learning_rate": 9.422468886646485e-06,
223
- "loss": 0.0,
 
 
 
 
 
 
 
 
224
  "step": 3600
225
  },
226
  {
227
- "epoch": 0.62,
228
- "learning_rate": 9.405650857719476e-06,
229
- "loss": 0.0,
230
  "step": 3700
231
  },
232
  {
233
- "epoch": 0.64,
234
- "learning_rate": 9.388832828792467e-06,
235
- "loss": 0.0,
236
  "step": 3800
237
  },
238
  {
239
- "epoch": 0.66,
240
- "learning_rate": 9.372014799865456e-06,
241
- "loss": 0.0,
242
  "step": 3900
243
  },
244
  {
245
- "epoch": 0.67,
246
- "learning_rate": 9.355196770938447e-06,
247
- "loss": 0.0,
248
  "step": 4000
249
  },
250
  {
251
- "epoch": 0.69,
252
- "learning_rate": 9.338378742011436e-06,
253
- "loss": 0.0,
 
 
 
 
 
 
 
 
254
  "step": 4100
255
  },
256
  {
257
- "epoch": 0.71,
258
- "learning_rate": 9.321560713084427e-06,
259
- "loss": 0.0,
260
  "step": 4200
261
  },
262
  {
263
- "epoch": 0.72,
264
- "learning_rate": 9.304742684157418e-06,
265
- "loss": 0.0,
266
  "step": 4300
267
  },
268
  {
269
- "epoch": 0.74,
270
- "learning_rate": 9.287924655230409e-06,
271
- "loss": 0.0,
272
  "step": 4400
273
  },
274
  {
275
- "epoch": 0.76,
276
- "learning_rate": 9.271106626303398e-06,
277
- "loss": 0.0,
278
  "step": 4500
279
  },
280
  {
281
- "epoch": 0.77,
282
- "learning_rate": 9.254288597376389e-06,
283
- "loss": 0.0,
 
 
 
 
 
 
 
 
284
  "step": 4600
285
  },
286
  {
287
- "epoch": 0.79,
288
- "learning_rate": 9.237470568449378e-06,
289
- "loss": 0.0,
290
  "step": 4700
291
  },
292
  {
293
- "epoch": 0.81,
294
- "learning_rate": 9.220652539522369e-06,
295
- "loss": 0.0,
296
  "step": 4800
297
  },
298
  {
299
- "epoch": 0.82,
300
- "learning_rate": 9.203834510595358e-06,
301
- "loss": 0.0,
302
  "step": 4900
303
  },
304
  {
305
- "epoch": 0.84,
306
- "learning_rate": 9.187016481668349e-06,
307
- "loss": 0.0,
308
  "step": 5000
309
  },
310
  {
311
- "epoch": 0.86,
312
- "learning_rate": 9.170198452741338e-06,
313
- "loss": 0.0,
 
 
 
 
 
 
 
 
314
  "step": 5100
315
  },
316
  {
317
- "epoch": 0.87,
318
- "learning_rate": 9.153380423814329e-06,
319
- "loss": 0.0,
320
  "step": 5200
321
  },
322
  {
323
- "epoch": 0.89,
324
- "learning_rate": 9.13656239488732e-06,
325
- "loss": 0.0,
326
  "step": 5300
327
  },
328
  {
329
- "epoch": 0.91,
330
- "learning_rate": 9.119744365960311e-06,
331
- "loss": 0.0,
332
  "step": 5400
333
  },
334
  {
335
- "epoch": 0.92,
336
- "learning_rate": 9.102926337033302e-06,
337
- "loss": 0.0,
338
  "step": 5500
339
  },
340
  {
341
- "epoch": 0.94,
342
- "learning_rate": 9.086108308106291e-06,
343
- "loss": 0.0,
 
 
 
 
 
 
 
 
344
  "step": 5600
345
  },
346
  {
347
- "epoch": 0.96,
348
- "learning_rate": 9.069290279179282e-06,
349
- "loss": 0.0,
350
  "step": 5700
351
  },
352
  {
353
- "epoch": 0.98,
354
- "learning_rate": 9.052472250252271e-06,
355
- "loss": 0.0,
356
  "step": 5800
357
  },
358
  {
359
- "epoch": 0.99,
360
- "learning_rate": 9.035654221325262e-06,
361
- "loss": 0.0,
362
  "step": 5900
363
  },
364
  {
365
- "epoch": 1.01,
366
- "learning_rate": 9.018836192398251e-06,
367
- "loss": 0.0,
368
  "step": 6000
369
  },
370
  {
371
- "epoch": 1.03,
372
- "learning_rate": 9.002018163471242e-06,
373
- "loss": 0.0,
 
 
 
 
 
 
 
 
374
  "step": 6100
375
  },
376
  {
377
- "epoch": 1.04,
378
- "learning_rate": 8.985200134544231e-06,
379
- "loss": 0.0,
380
  "step": 6200
381
  },
382
  {
383
- "epoch": 1.06,
384
- "learning_rate": 8.968382105617222e-06,
385
- "loss": 0.0,
386
  "step": 6300
387
  },
388
  {
389
- "epoch": 1.08,
390
- "learning_rate": 8.951564076690213e-06,
391
- "loss": 0.0,
392
  "step": 6400
393
  },
394
  {
395
- "epoch": 1.09,
396
- "learning_rate": 8.934746047763204e-06,
397
- "loss": 0.0,
398
  "step": 6500
399
  },
400
  {
401
- "epoch": 1.11,
402
- "learning_rate": 8.917928018836193e-06,
403
- "loss": 0.0,
 
 
 
 
 
 
 
 
404
  "step": 6600
405
  },
406
  {
407
- "epoch": 1.13,
408
- "learning_rate": 8.901109989909184e-06,
409
- "loss": 0.0,
410
  "step": 6700
411
  },
412
  {
413
- "epoch": 1.14,
414
- "learning_rate": 8.884291960982173e-06,
415
- "loss": 0.0,
416
  "step": 6800
417
  },
418
  {
419
- "epoch": 1.16,
420
- "learning_rate": 8.867473932055164e-06,
421
- "loss": 0.0,
422
  "step": 6900
423
  },
424
  {
425
- "epoch": 1.18,
426
- "learning_rate": 8.850655903128153e-06,
427
- "loss": 0.0,
428
  "step": 7000
429
  },
430
  {
431
- "epoch": 1.19,
432
- "learning_rate": 8.833837874201144e-06,
433
- "loss": 0.0,
 
 
 
 
 
 
 
 
434
  "step": 7100
435
  },
436
  {
437
- "epoch": 1.21,
438
- "learning_rate": 8.817019845274135e-06,
439
- "loss": 0.0,
440
  "step": 7200
441
  },
442
  {
443
- "epoch": 1.23,
444
- "learning_rate": 8.800201816347124e-06,
445
- "loss": 0.0,
446
  "step": 7300
447
  },
448
  {
449
- "epoch": 1.24,
450
- "learning_rate": 8.783383787420115e-06,
451
- "loss": 0.0,
452
  "step": 7400
453
  },
454
  {
455
- "epoch": 1.26,
456
- "learning_rate": 8.766565758493106e-06,
457
- "loss": 0.0,
458
  "step": 7500
459
  },
460
  {
461
- "epoch": 1.28,
462
- "learning_rate": 8.749747729566097e-06,
463
- "loss": 0.0,
 
 
 
 
 
 
 
 
464
  "step": 7600
465
  },
466
  {
467
- "epoch": 1.29,
468
- "learning_rate": 8.732929700639086e-06,
469
- "loss": 0.0,
470
  "step": 7700
471
  },
472
  {
473
- "epoch": 1.31,
474
- "learning_rate": 8.716111671712077e-06,
475
- "loss": 0.0,
476
  "step": 7800
477
  },
478
  {
479
- "epoch": 1.33,
480
- "learning_rate": 8.699293642785066e-06,
481
- "loss": 0.0,
482
  "step": 7900
483
  },
484
  {
485
- "epoch": 1.35,
486
- "learning_rate": 8.682475613858057e-06,
487
- "loss": 0.0,
488
  "step": 8000
489
  },
490
  {
491
- "epoch": 1.36,
492
- "learning_rate": 8.665657584931046e-06,
493
- "loss": 0.0,
 
 
 
 
 
 
 
 
494
  "step": 8100
495
  },
496
  {
497
- "epoch": 1.38,
498
- "learning_rate": 8.648839556004037e-06,
499
- "loss": 0.0,
500
  "step": 8200
501
  },
502
  {
503
- "epoch": 1.4,
504
- "learning_rate": 8.632021527077026e-06,
505
- "loss": 0.0,
506
  "step": 8300
507
  },
508
  {
509
- "epoch": 1.41,
510
- "learning_rate": 8.615203498150017e-06,
511
- "loss": 0.0,
512
  "step": 8400
513
  },
514
  {
515
- "epoch": 1.43,
516
- "learning_rate": 8.598385469223008e-06,
517
- "loss": 0.0,
518
  "step": 8500
519
  },
520
  {
521
- "epoch": 1.45,
522
- "learning_rate": 8.581567440295999e-06,
523
- "loss": 0.0,
 
 
 
 
 
 
 
 
524
  "step": 8600
525
  },
526
  {
527
- "epoch": 1.46,
528
- "learning_rate": 8.564749411368988e-06,
529
- "loss": 0.0,
530
  "step": 8700
531
  },
532
  {
533
- "epoch": 1.48,
534
- "learning_rate": 8.547931382441979e-06,
535
- "loss": 0.0,
536
  "step": 8800
537
  },
538
  {
539
- "epoch": 1.5,
540
- "learning_rate": 8.53111335351497e-06,
541
- "loss": 0.0,
542
  "step": 8900
543
  },
544
  {
545
- "epoch": 1.51,
546
- "learning_rate": 8.514295324587959e-06,
547
- "loss": 0.0,
548
  "step": 9000
549
  },
550
  {
551
- "epoch": 1.53,
552
- "learning_rate": 8.49747729566095e-06,
553
- "loss": 0.0,
 
 
 
 
 
 
 
 
554
  "step": 9100
555
  },
556
  {
557
- "epoch": 1.55,
558
- "learning_rate": 8.480659266733939e-06,
559
- "loss": 0.0,
560
  "step": 9200
561
  },
562
  {
563
- "epoch": 1.56,
564
- "learning_rate": 8.46384123780693e-06,
565
- "loss": 0.0,
566
  "step": 9300
567
  },
568
  {
569
- "epoch": 1.58,
570
- "learning_rate": 8.447023208879919e-06,
571
- "loss": 0.0,
572
  "step": 9400
573
  },
574
  {
575
- "epoch": 1.6,
576
- "learning_rate": 8.43020517995291e-06,
577
- "loss": 0.0,
578
  "step": 9500
579
  },
580
  {
581
- "epoch": 1.61,
582
- "learning_rate": 8.413387151025901e-06,
583
- "loss": 0.0,
 
 
 
 
 
 
 
 
584
  "step": 9600
585
  },
586
  {
587
- "epoch": 1.63,
588
- "learning_rate": 8.396569122098892e-06,
589
- "loss": 0.0,
590
  "step": 9700
591
  },
592
  {
593
- "epoch": 1.65,
594
- "learning_rate": 8.379751093171881e-06,
595
- "loss": 0.0,
596
  "step": 9800
597
  },
598
  {
599
- "epoch": 1.66,
600
- "learning_rate": 8.362933064244872e-06,
601
- "loss": 0.0,
602
  "step": 9900
603
  },
604
  {
605
- "epoch": 1.68,
606
- "learning_rate": 8.346115035317861e-06,
607
- "loss": 0.0,
608
  "step": 10000
609
  },
610
  {
611
- "epoch": 1.7,
612
- "learning_rate": 8.329297006390852e-06,
613
- "loss": 0.0,
 
 
 
 
 
 
 
 
614
  "step": 10100
615
  },
616
  {
617
- "epoch": 1.72,
618
- "learning_rate": 8.312478977463841e-06,
619
- "loss": 0.0,
620
  "step": 10200
621
  },
622
  {
623
- "epoch": 1.73,
624
- "learning_rate": 8.295660948536832e-06,
625
- "loss": 0.0,
626
  "step": 10300
627
  },
628
  {
629
- "epoch": 1.75,
630
- "learning_rate": 8.278842919609821e-06,
631
- "loss": 0.0,
632
  "step": 10400
633
  },
634
  {
635
- "epoch": 1.77,
636
- "learning_rate": 8.262024890682812e-06,
637
- "loss": 0.0,
638
  "step": 10500
639
  },
640
  {
641
- "epoch": 1.78,
642
- "learning_rate": 8.245206861755803e-06,
643
- "loss": 0.0,
 
 
 
 
 
 
 
 
644
  "step": 10600
645
  },
646
  {
647
- "epoch": 1.8,
648
- "learning_rate": 8.228388832828794e-06,
649
- "loss": 0.0,
650
  "step": 10700
651
  },
652
  {
653
- "epoch": 1.82,
654
- "learning_rate": 8.211570803901785e-06,
655
- "loss": 0.0,
656
  "step": 10800
657
  },
658
  {
659
- "epoch": 1.83,
660
- "learning_rate": 8.194752774974774e-06,
661
- "loss": 0.0,
662
  "step": 10900
663
  },
664
  {
665
- "epoch": 1.85,
666
- "learning_rate": 8.177934746047765e-06,
667
- "loss": 0.0,
668
  "step": 11000
669
  },
670
  {
671
- "epoch": 1.87,
672
- "learning_rate": 8.161116717120754e-06,
673
- "loss": 0.0,
674
- "step": 11100
675
- },
 
 
676
  {
677
- "epoch": 1.88,
678
- "learning_rate": 8.144298688193745e-06,
679
- "loss": 0.0,
 
 
 
 
 
 
680
  "step": 11200
681
  },
682
  {
683
- "epoch": 1.9,
684
- "learning_rate": 8.127480659266734e-06,
685
- "loss": 0.0,
686
  "step": 11300
687
  },
688
  {
689
- "epoch": 1.92,
690
- "learning_rate": 8.110662630339725e-06,
691
- "loss": 0.0,
692
  "step": 11400
693
  },
694
  {
695
- "epoch": 1.93,
696
- "learning_rate": 8.093844601412714e-06,
697
- "loss": 0.0,
698
  "step": 11500
699
  },
700
  {
701
- "epoch": 1.95,
702
- "learning_rate": 8.077026572485705e-06,
703
- "loss": 0.0,
 
 
 
 
 
 
 
 
704
  "step": 11600
705
  },
706
  {
707
- "epoch": 1.97,
708
- "learning_rate": 8.060208543558696e-06,
709
- "loss": 0.0,
710
  "step": 11700
711
  },
712
  {
713
- "epoch": 1.98,
714
- "learning_rate": 8.043390514631687e-06,
715
- "loss": 0.0,
716
  "step": 11800
717
  },
718
  {
719
- "epoch": 2.0,
720
- "learning_rate": 8.026572485704676e-06,
721
- "loss": 0.0,
722
  "step": 11900
723
  },
724
  {
725
- "epoch": 2.02,
726
- "learning_rate": 8.009754456777667e-06,
727
- "loss": 0.0,
 
 
 
 
 
 
 
 
728
  "step": 12000
729
  },
730
  {
731
- "epoch": 2.03,
732
- "learning_rate": 7.992936427850656e-06,
733
- "loss": 0.0,
734
  "step": 12100
735
  },
736
  {
737
- "epoch": 2.05,
738
- "learning_rate": 7.976118398923647e-06,
739
- "loss": 0.0,
740
  "step": 12200
741
  },
742
  {
743
- "epoch": 2.07,
744
- "learning_rate": 7.959300369996638e-06,
745
- "loss": 0.0,
746
  "step": 12300
747
  },
748
  {
749
- "epoch": 2.09,
750
- "learning_rate": 7.942482341069627e-06,
751
- "loss": 0.0,
752
  "step": 12400
753
  },
754
  {
755
- "epoch": 2.1,
756
- "learning_rate": 7.925664312142618e-06,
757
- "loss": 0.0,
 
 
 
 
 
 
 
 
758
  "step": 12500
759
  },
760
  {
761
- "epoch": 2.12,
762
- "learning_rate": 7.908846283215607e-06,
763
- "loss": 0.0,
764
  "step": 12600
765
  },
766
  {
767
- "epoch": 2.14,
768
- "learning_rate": 7.892028254288598e-06,
769
- "loss": 0.0,
770
  "step": 12700
771
  },
772
  {
773
- "epoch": 2.15,
774
- "learning_rate": 7.875210225361589e-06,
775
- "loss": 0.0,
776
  "step": 12800
777
  },
778
  {
779
- "epoch": 2.17,
780
- "learning_rate": 7.85839219643458e-06,
781
- "loss": 0.0,
782
  "step": 12900
783
  },
784
  {
785
- "epoch": 2.19,
786
- "learning_rate": 7.841574167507569e-06,
787
- "loss": 0.0,
 
 
 
 
 
 
 
 
788
  "step": 13000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
789
  }
790
  ],
791
- "max_steps": 59460,
792
- "num_train_epochs": 10,
793
- "total_flos": 1.314423778738176e+18,
 
 
794
  "trial_name": null,
795
  "trial_params": null
796
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.9993226593794158,
5
+ "eval_steps": 500,
6
+ "global_step": 32100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.01,
13
+ "learning_rate": 9.969479912799752e-06,
14
+ "loss": 5.7053,
15
  "step": 100
16
  },
17
  {
18
+ "epoch": 0.01,
19
+ "learning_rate": 9.938336966677049e-06,
20
+ "loss": 4.2137,
21
  "step": 200
22
  },
23
  {
24
+ "epoch": 0.02,
25
+ "learning_rate": 9.907194020554345e-06,
26
+ "loss": 3.7382,
27
  "step": 300
28
  },
29
  {
30
+ "epoch": 0.02,
31
+ "learning_rate": 9.876051074431642e-06,
32
+ "loss": 3.768,
33
  "step": 400
34
  },
35
  {
36
+ "epoch": 0.03,
37
+ "learning_rate": 9.844908128308939e-06,
38
+ "loss": 4.0831,
39
  "step": 500
40
  },
41
  {
42
+ "epoch": 0.03,
43
+ "eval_loss": NaN,
44
+ "eval_runtime": 1241.4479,
45
+ "eval_samples_per_second": 183.933,
46
+ "eval_steps_per_second": 22.992,
47
+ "step": 500
48
+ },
49
+ {
50
+ "epoch": 0.04,
51
+ "learning_rate": 9.813765182186236e-06,
52
+ "loss": 3.9938,
53
  "step": 600
54
  },
55
  {
56
+ "epoch": 0.04,
57
+ "learning_rate": 9.782622236063532e-06,
58
+ "loss": 3.6619,
59
  "step": 700
60
  },
61
  {
62
+ "epoch": 0.05,
63
+ "learning_rate": 9.751790719402056e-06,
64
+ "loss": 3.9896,
65
  "step": 800
66
  },
67
  {
68
+ "epoch": 0.06,
69
+ "learning_rate": 9.720647773279352e-06,
70
+ "loss": 3.9662,
71
  "step": 900
72
  },
73
  {
74
+ "epoch": 0.06,
75
+ "learning_rate": 9.68950482715665e-06,
76
+ "loss": 4.0557,
77
  "step": 1000
78
  },
79
  {
80
+ "epoch": 0.06,
81
+ "eval_loss": NaN,
82
+ "eval_runtime": 1241.0879,
83
+ "eval_samples_per_second": 183.986,
84
+ "eval_steps_per_second": 22.998,
85
+ "step": 1000
86
+ },
87
+ {
88
+ "epoch": 0.07,
89
+ "learning_rate": 9.658361881033946e-06,
90
+ "loss": 3.4231,
91
  "step": 1100
92
  },
93
  {
94
+ "epoch": 0.07,
95
+ "learning_rate": 9.627218934911242e-06,
96
+ "loss": 3.4517,
97
  "step": 1200
98
  },
99
  {
100
+ "epoch": 0.08,
101
+ "learning_rate": 9.59607598878854e-06,
102
+ "loss": 4.235,
103
  "step": 1300
104
  },
105
  {
106
+ "epoch": 0.09,
107
+ "learning_rate": 9.564933042665836e-06,
108
+ "loss": 3.3104,
109
  "step": 1400
110
  },
111
  {
112
+ "epoch": 0.09,
113
+ "learning_rate": 9.533790096543134e-06,
114
+ "loss": 3.6169,
115
  "step": 1500
116
  },
117
  {
118
+ "epoch": 0.09,
119
+ "eval_loss": NaN,
120
+ "eval_runtime": 1240.8951,
121
+ "eval_samples_per_second": 184.015,
122
+ "eval_steps_per_second": 23.002,
123
+ "step": 1500
124
+ },
125
+ {
126
+ "epoch": 0.1,
127
+ "learning_rate": 9.50264715042043e-06,
128
+ "loss": 3.3399,
129
  "step": 1600
130
  },
131
  {
132
+ "epoch": 0.11,
133
+ "learning_rate": 9.471504204297728e-06,
134
+ "loss": 3.9392,
135
  "step": 1700
136
  },
137
  {
138
+ "epoch": 0.11,
139
+ "learning_rate": 9.440361258175024e-06,
140
+ "loss": 4.2941,
141
  "step": 1800
142
  },
143
  {
144
+ "epoch": 0.12,
145
+ "learning_rate": 9.40921831205232e-06,
146
+ "loss": 4.2293,
147
  "step": 1900
148
  },
149
  {
150
+ "epoch": 0.12,
151
+ "learning_rate": 9.378075365929618e-06,
152
+ "loss": 3.411,
153
  "step": 2000
154
  },
155
  {
156
+ "epoch": 0.12,
157
+ "eval_loss": NaN,
158
+ "eval_runtime": 1240.9958,
159
+ "eval_samples_per_second": 184.0,
160
+ "eval_steps_per_second": 23.0,
161
+ "step": 2000
162
+ },
163
+ {
164
+ "epoch": 0.13,
165
+ "learning_rate": 9.346932419806914e-06,
166
+ "loss": 3.1979,
167
  "step": 2100
168
  },
169
  {
170
+ "epoch": 0.14,
171
+ "learning_rate": 9.315789473684212e-06,
172
+ "loss": 3.303,
173
  "step": 2200
174
  },
175
  {
176
+ "epoch": 0.14,
177
+ "learning_rate": 9.284646527561508e-06,
178
+ "loss": 3.2433,
179
  "step": 2300
180
  },
181
  {
182
+ "epoch": 0.15,
183
+ "learning_rate": 9.253503581438806e-06,
184
+ "loss": 3.0607,
185
  "step": 2400
186
  },
187
  {
188
+ "epoch": 0.16,
189
+ "learning_rate": 9.222360635316102e-06,
190
+ "loss": 2.8986,
191
  "step": 2500
192
  },
193
  {
194
+ "epoch": 0.16,
195
+ "eval_loss": NaN,
196
+ "eval_runtime": 1242.4997,
197
+ "eval_samples_per_second": 183.777,
198
+ "eval_steps_per_second": 22.972,
199
+ "step": 2500
200
+ },
201
+ {
202
+ "epoch": 0.16,
203
+ "learning_rate": 9.191217689193398e-06,
204
+ "loss": 3.5778,
205
  "step": 2600
206
  },
207
  {
208
+ "epoch": 0.17,
209
+ "learning_rate": 9.160074743070694e-06,
210
+ "loss": 3.2627,
211
  "step": 2700
212
  },
213
  {
214
+ "epoch": 0.17,
215
+ "learning_rate": 9.128931796947992e-06,
216
+ "loss": 2.8115,
217
  "step": 2800
218
  },
219
  {
220
+ "epoch": 0.18,
221
+ "learning_rate": 9.097788850825288e-06,
222
+ "loss": 3.1633,
223
  "step": 2900
224
  },
225
  {
226
+ "epoch": 0.19,
227
+ "learning_rate": 9.066645904702586e-06,
228
+ "loss": 3.3414,
229
  "step": 3000
230
  },
231
  {
232
+ "epoch": 0.19,
233
+ "eval_loss": NaN,
234
+ "eval_runtime": 1241.065,
235
+ "eval_samples_per_second": 183.99,
236
+ "eval_steps_per_second": 22.999,
237
+ "step": 3000
238
+ },
239
+ {
240
+ "epoch": 0.19,
241
+ "learning_rate": 9.035502958579882e-06,
242
+ "loss": 3.1895,
243
  "step": 3100
244
  },
245
  {
246
+ "epoch": 0.2,
247
+ "learning_rate": 9.00436001245718e-06,
248
+ "loss": 3.2972,
249
  "step": 3200
250
  },
251
  {
252
+ "epoch": 0.21,
253
+ "learning_rate": 8.973217066334476e-06,
254
+ "loss": 3.3117,
255
  "step": 3300
256
  },
257
  {
258
+ "epoch": 0.21,
259
+ "learning_rate": 8.942074120211772e-06,
260
+ "loss": 3.8183,
261
  "step": 3400
262
  },
263
  {
264
+ "epoch": 0.22,
265
+ "learning_rate": 8.911242603550296e-06,
266
+ "loss": 3.0445,
267
  "step": 3500
268
  },
269
  {
270
+ "epoch": 0.22,
271
+ "eval_loss": NaN,
272
+ "eval_runtime": 1241.4692,
273
+ "eval_samples_per_second": 183.93,
274
+ "eval_steps_per_second": 22.991,
275
+ "step": 3500
276
+ },
277
+ {
278
+ "epoch": 0.22,
279
+ "learning_rate": 8.880099657427594e-06,
280
+ "loss": 2.716,
281
  "step": 3600
282
  },
283
  {
284
+ "epoch": 0.23,
285
+ "learning_rate": 8.84895671130489e-06,
286
+ "loss": 2.7579,
287
  "step": 3700
288
  },
289
  {
290
+ "epoch": 0.24,
291
+ "learning_rate": 8.817813765182188e-06,
292
+ "loss": 3.6821,
293
  "step": 3800
294
  },
295
  {
296
+ "epoch": 0.24,
297
+ "learning_rate": 8.786670819059484e-06,
298
+ "loss": 2.9784,
299
  "step": 3900
300
  },
301
  {
302
+ "epoch": 0.25,
303
+ "learning_rate": 8.755527872936782e-06,
304
+ "loss": 3.1284,
305
  "step": 4000
306
  },
307
  {
308
+ "epoch": 0.25,
309
+ "eval_loss": NaN,
310
+ "eval_runtime": 1241.3243,
311
+ "eval_samples_per_second": 183.951,
312
+ "eval_steps_per_second": 22.994,
313
+ "step": 4000
314
+ },
315
+ {
316
+ "epoch": 0.26,
317
+ "learning_rate": 8.724384926814076e-06,
318
+ "loss": 3.3944,
319
  "step": 4100
320
  },
321
  {
322
+ "epoch": 0.26,
323
+ "learning_rate": 8.693241980691374e-06,
324
+ "loss": 4.3378,
325
  "step": 4200
326
  },
327
  {
328
+ "epoch": 0.27,
329
+ "learning_rate": 8.66209903456867e-06,
330
+ "loss": 3.4281,
331
  "step": 4300
332
  },
333
  {
334
+ "epoch": 0.27,
335
+ "learning_rate": 8.630956088445968e-06,
336
+ "loss": 3.4173,
337
  "step": 4400
338
  },
339
  {
340
+ "epoch": 0.28,
341
+ "learning_rate": 8.599813142323264e-06,
342
+ "loss": 2.987,
343
  "step": 4500
344
  },
345
  {
346
+ "epoch": 0.28,
347
+ "eval_loss": NaN,
348
+ "eval_runtime": 1242.9007,
349
+ "eval_samples_per_second": 183.718,
350
+ "eval_steps_per_second": 22.965,
351
+ "step": 4500
352
+ },
353
+ {
354
+ "epoch": 0.29,
355
+ "learning_rate": 8.568670196200562e-06,
356
+ "loss": 2.9806,
357
  "step": 4600
358
  },
359
  {
360
+ "epoch": 0.29,
361
+ "learning_rate": 8.537527250077858e-06,
362
+ "loss": 3.2971,
363
  "step": 4700
364
  },
365
  {
366
+ "epoch": 0.3,
367
+ "learning_rate": 8.506384303955156e-06,
368
+ "loss": 2.9308,
369
  "step": 4800
370
  },
371
  {
372
+ "epoch": 0.31,
373
+ "learning_rate": 8.475241357832452e-06,
374
+ "loss": 3.95,
375
  "step": 4900
376
  },
377
  {
378
+ "epoch": 0.31,
379
+ "learning_rate": 8.444098411709748e-06,
380
+ "loss": 3.6195,
381
  "step": 5000
382
  },
383
  {
384
+ "epoch": 0.31,
385
+ "eval_loss": NaN,
386
+ "eval_runtime": 12249.9104,
387
+ "eval_samples_per_second": 18.64,
388
+ "eval_steps_per_second": 2.33,
389
+ "step": 5000
390
+ },
391
+ {
392
+ "epoch": 0.32,
393
+ "learning_rate": 8.412955465587044e-06,
394
+ "loss": 3.3623,
395
  "step": 5100
396
  },
397
  {
398
+ "epoch": 0.32,
399
+ "learning_rate": 8.381812519464342e-06,
400
+ "loss": 2.7048,
401
  "step": 5200
402
  },
403
  {
404
+ "epoch": 0.33,
405
+ "learning_rate": 8.350669573341638e-06,
406
+ "loss": 2.9451,
407
  "step": 5300
408
  },
409
  {
410
+ "epoch": 0.34,
411
+ "learning_rate": 8.319526627218936e-06,
412
+ "loss": 3.9476,
413
  "step": 5400
414
  },
415
  {
416
+ "epoch": 0.34,
417
+ "learning_rate": 8.288383681096232e-06,
418
+ "loss": 3.0161,
419
  "step": 5500
420
  },
421
  {
422
+ "epoch": 0.34,
423
+ "eval_loss": NaN,
424
+ "eval_runtime": 1242.1829,
425
+ "eval_samples_per_second": 183.824,
426
+ "eval_steps_per_second": 22.978,
427
+ "step": 5500
428
+ },
429
+ {
430
+ "epoch": 0.35,
431
+ "learning_rate": 8.25724073497353e-06,
432
+ "loss": 2.7617,
433
  "step": 5600
434
  },
435
  {
436
+ "epoch": 0.36,
437
+ "learning_rate": 8.226097788850826e-06,
438
+ "loss": 2.6001,
439
  "step": 5700
440
  },
441
  {
442
+ "epoch": 0.36,
443
+ "learning_rate": 8.194954842728122e-06,
444
+ "loss": 2.8024,
445
  "step": 5800
446
  },
447
  {
448
+ "epoch": 0.37,
449
+ "learning_rate": 8.16381189660542e-06,
450
+ "loss": 2.6335,
451
  "step": 5900
452
  },
453
  {
454
+ "epoch": 0.37,
455
+ "learning_rate": 8.132668950482716e-06,
456
+ "loss": 3.0251,
457
  "step": 6000
458
  },
459
  {
460
+ "epoch": 0.37,
461
+ "eval_loss": NaN,
462
+ "eval_runtime": 1241.5477,
463
+ "eval_samples_per_second": 183.918,
464
+ "eval_steps_per_second": 22.99,
465
+ "step": 6000
466
+ },
467
+ {
468
+ "epoch": 0.38,
469
+ "learning_rate": 8.101526004360014e-06,
470
+ "loss": 4.0077,
471
  "step": 6100
472
  },
473
  {
474
+ "epoch": 0.39,
475
+ "learning_rate": 8.07038305823731e-06,
476
+ "loss": 4.0494,
477
  "step": 6200
478
  },
479
  {
480
+ "epoch": 0.39,
481
+ "learning_rate": 8.039240112114608e-06,
482
+ "loss": 2.8992,
483
  "step": 6300
484
  },
485
  {
486
+ "epoch": 0.4,
487
+ "learning_rate": 8.008097165991904e-06,
488
+ "loss": 3.0616,
489
  "step": 6400
490
  },
491
  {
492
+ "epoch": 0.4,
493
+ "learning_rate": 7.9769542198692e-06,
494
+ "loss": 3.4257,
495
  "step": 6500
496
  },
497
  {
498
+ "epoch": 0.4,
499
+ "eval_loss": NaN,
500
+ "eval_runtime": 1242.4356,
501
+ "eval_samples_per_second": 183.787,
502
+ "eval_steps_per_second": 22.973,
503
+ "step": 6500
504
+ },
505
+ {
506
+ "epoch": 0.41,
507
+ "learning_rate": 7.945811273746496e-06,
508
+ "loss": 2.9658,
509
  "step": 6600
510
  },
511
  {
512
+ "epoch": 0.42,
513
+ "learning_rate": 7.914668327623794e-06,
514
+ "loss": 3.288,
515
  "step": 6700
516
  },
517
  {
518
+ "epoch": 0.42,
519
+ "learning_rate": 7.88352538150109e-06,
520
+ "loss": 3.1517,
521
  "step": 6800
522
  },
523
  {
524
+ "epoch": 0.43,
525
+ "learning_rate": 7.852382435378388e-06,
526
+ "loss": 3.3913,
527
  "step": 6900
528
  },
529
  {
530
+ "epoch": 0.44,
531
+ "learning_rate": 7.821239489255684e-06,
532
+ "loss": 2.7966,
533
  "step": 7000
534
  },
535
  {
536
+ "epoch": 0.44,
537
+ "eval_loss": NaN,
538
+ "eval_runtime": 1239.9554,
539
+ "eval_samples_per_second": 184.154,
540
+ "eval_steps_per_second": 23.019,
541
+ "step": 7000
542
+ },
543
+ {
544
+ "epoch": 0.44,
545
+ "learning_rate": 7.790096543132982e-06,
546
+ "loss": 3.372,
547
  "step": 7100
548
  },
549
  {
550
+ "epoch": 0.45,
551
+ "learning_rate": 7.758953597010278e-06,
552
+ "loss": 2.8074,
553
  "step": 7200
554
  },
555
  {
556
+ "epoch": 0.45,
557
+ "learning_rate": 7.727810650887576e-06,
558
+ "loss": 2.6364,
559
  "step": 7300
560
  },
561
  {
562
+ "epoch": 0.46,
563
+ "learning_rate": 7.696667704764872e-06,
564
+ "loss": 3.0556,
565
  "step": 7400
566
  },
567
  {
568
+ "epoch": 0.47,
569
+ "learning_rate": 7.665524758642168e-06,
570
+ "loss": 2.4966,
571
  "step": 7500
572
  },
573
  {
574
+ "epoch": 0.47,
575
+ "eval_loss": NaN,
576
+ "eval_runtime": 1238.3825,
577
+ "eval_samples_per_second": 184.388,
578
+ "eval_steps_per_second": 23.049,
579
+ "step": 7500
580
+ },
581
+ {
582
+ "epoch": 0.47,
583
+ "learning_rate": 7.634381812519464e-06,
584
+ "loss": 3.562,
585
  "step": 7600
586
  },
587
  {
588
+ "epoch": 0.48,
589
+ "learning_rate": 7.603238866396762e-06,
590
+ "loss": 4.0498,
591
  "step": 7700
592
  },
593
  {
594
+ "epoch": 0.49,
595
+ "learning_rate": 7.572095920274059e-06,
596
+ "loss": 2.8504,
597
  "step": 7800
598
  },
599
  {
600
+ "epoch": 0.49,
601
+ "learning_rate": 7.540952974151356e-06,
602
+ "loss": 2.7274,
603
  "step": 7900
604
  },
605
  {
606
+ "epoch": 0.5,
607
+ "learning_rate": 7.509810028028653e-06,
608
+ "loss": 2.9404,
609
  "step": 8000
610
  },
611
  {
612
+ "epoch": 0.5,
613
+ "eval_loss": NaN,
614
+ "eval_runtime": 1242.6775,
615
+ "eval_samples_per_second": 183.751,
616
+ "eval_steps_per_second": 22.969,
617
+ "step": 8000
618
+ },
619
+ {
620
+ "epoch": 0.5,
621
+ "learning_rate": 7.47866708190595e-06,
622
+ "loss": 2.6808,
623
  "step": 8100
624
  },
625
  {
626
+ "epoch": 0.51,
627
+ "learning_rate": 7.447835565244473e-06,
628
+ "loss": 2.7214,
629
  "step": 8200
630
  },
631
  {
632
+ "epoch": 0.52,
633
+ "learning_rate": 7.4166926191217695e-06,
634
+ "loss": 2.7306,
635
  "step": 8300
636
  },
637
  {
638
+ "epoch": 0.52,
639
+ "learning_rate": 7.3855496729990665e-06,
640
+ "loss": 2.7578,
641
  "step": 8400
642
  },
643
  {
644
+ "epoch": 0.53,
645
+ "learning_rate": 7.354406726876363e-06,
646
+ "loss": 2.6296,
647
  "step": 8500
648
  },
649
  {
650
+ "epoch": 0.53,
651
+ "eval_loss": NaN,
652
+ "eval_runtime": 1242.0617,
653
+ "eval_samples_per_second": 183.842,
654
+ "eval_steps_per_second": 22.98,
655
+ "step": 8500
656
+ },
657
+ {
658
+ "epoch": 0.54,
659
+ "learning_rate": 7.32326378075366e-06,
660
+ "loss": 2.7712,
661
  "step": 8600
662
  },
663
  {
664
+ "epoch": 0.54,
665
+ "learning_rate": 7.2921208346309565e-06,
666
+ "loss": 2.9979,
667
  "step": 8700
668
  },
669
  {
670
+ "epoch": 0.55,
671
+ "learning_rate": 7.2609778885082535e-06,
672
+ "loss": 2.8045,
673
  "step": 8800
674
  },
675
  {
676
+ "epoch": 0.55,
677
+ "learning_rate": 7.2298349423855505e-06,
678
+ "loss": 3.1612,
679
  "step": 8900
680
  },
681
  {
682
+ "epoch": 0.56,
683
+ "learning_rate": 7.198691996262847e-06,
684
+ "loss": 2.8809,
685
  "step": 9000
686
  },
687
  {
688
+ "epoch": 0.56,
689
+ "eval_loss": NaN,
690
+ "eval_runtime": 1242.2163,
691
+ "eval_samples_per_second": 183.819,
692
+ "eval_steps_per_second": 22.977,
693
+ "step": 9000
694
+ },
695
+ {
696
+ "epoch": 0.57,
697
+ "learning_rate": 7.1675490501401435e-06,
698
+ "loss": 2.8204,
699
  "step": 9100
700
  },
701
  {
702
+ "epoch": 0.57,
703
+ "learning_rate": 7.1364061040174405e-06,
704
+ "loss": 2.508,
705
  "step": 9200
706
  },
707
  {
708
+ "epoch": 0.58,
709
+ "learning_rate": 7.1052631578947375e-06,
710
+ "loss": 2.3787,
711
  "step": 9300
712
  },
713
  {
714
+ "epoch": 0.59,
715
+ "learning_rate": 7.0741202117720344e-06,
716
+ "loss": 2.5762,
717
  "step": 9400
718
  },
719
  {
720
+ "epoch": 0.59,
721
+ "learning_rate": 7.042977265649331e-06,
722
+ "loss": 2.9636,
723
  "step": 9500
724
  },
725
  {
726
+ "epoch": 0.59,
727
+ "eval_loss": NaN,
728
+ "eval_runtime": 1242.6873,
729
+ "eval_samples_per_second": 183.749,
730
+ "eval_steps_per_second": 22.969,
731
+ "step": 9500
732
+ },
733
+ {
734
+ "epoch": 0.6,
735
+ "learning_rate": 7.011834319526628e-06,
736
+ "loss": 2.6669,
737
  "step": 9600
738
  },
739
  {
740
+ "epoch": 0.6,
741
+ "learning_rate": 6.980691373403925e-06,
742
+ "loss": 2.9417,
743
  "step": 9700
744
  },
745
  {
746
+ "epoch": 0.61,
747
+ "learning_rate": 6.949548427281222e-06,
748
+ "loss": 3.1507,
749
  "step": 9800
750
  },
751
  {
752
+ "epoch": 0.62,
753
+ "learning_rate": 6.9184054811585175e-06,
754
+ "loss": 3.0434,
755
  "step": 9900
756
  },
757
  {
758
+ "epoch": 0.62,
759
+ "learning_rate": 6.8872625350358145e-06,
760
+ "loss": 3.0385,
761
  "step": 10000
762
  },
763
  {
764
+ "epoch": 0.62,
765
+ "eval_loss": NaN,
766
+ "eval_runtime": 1242.5564,
767
+ "eval_samples_per_second": 183.769,
768
+ "eval_steps_per_second": 22.971,
769
+ "step": 10000
770
+ },
771
+ {
772
+ "epoch": 0.63,
773
+ "learning_rate": 6.8561195889131115e-06,
774
+ "loss": 2.6385,
775
  "step": 10100
776
  },
777
  {
778
+ "epoch": 0.64,
779
+ "learning_rate": 6.8249766427904084e-06,
780
+ "loss": 2.5347,
781
  "step": 10200
782
  },
783
  {
784
+ "epoch": 0.64,
785
+ "learning_rate": 6.793833696667705e-06,
786
+ "loss": 3.0458,
787
  "step": 10300
788
  },
789
  {
790
+ "epoch": 0.65,
791
+ "learning_rate": 6.763002180006229e-06,
792
+ "loss": 3.0999,
793
  "step": 10400
794
  },
795
  {
796
+ "epoch": 0.65,
797
+ "learning_rate": 6.731859233883526e-06,
798
+ "loss": 2.5865,
799
  "step": 10500
800
  },
801
  {
802
+ "epoch": 0.65,
803
+ "eval_loss": NaN,
804
+ "eval_runtime": 1242.285,
805
+ "eval_samples_per_second": 183.809,
806
+ "eval_steps_per_second": 22.976,
807
+ "step": 10500
808
+ },
809
+ {
810
+ "epoch": 0.66,
811
+ "learning_rate": 6.700716287760822e-06,
812
+ "loss": 2.7499,
813
  "step": 10600
814
  },
815
  {
816
+ "epoch": 0.67,
817
+ "learning_rate": 6.669573341638119e-06,
818
+ "loss": 2.9551,
819
  "step": 10700
820
  },
821
  {
822
+ "epoch": 0.67,
823
+ "learning_rate": 6.638430395515416e-06,
824
+ "loss": 2.8254,
825
  "step": 10800
826
  },
827
  {
828
+ "epoch": 0.68,
829
+ "learning_rate": 6.607287449392713e-06,
830
+ "loss": 2.8981,
831
  "step": 10900
832
  },
833
  {
834
+ "epoch": 0.69,
835
+ "learning_rate": 6.57614450327001e-06,
836
+ "loss": 2.7491,
837
  "step": 11000
838
  },
839
  {
840
+ "epoch": 0.69,
841
+ "eval_loss": NaN,
842
+ "eval_runtime": 1242.9348,
843
+ "eval_samples_per_second": 183.713,
844
+ "eval_steps_per_second": 22.964,
845
+ "step": 11000
846
+ },
847
  {
848
+ "epoch": 0.69,
849
+ "learning_rate": 6.545001557147307e-06,
850
+ "loss": 2.8125,
851
+ "step": 11100
852
+ },
853
+ {
854
+ "epoch": 0.7,
855
+ "learning_rate": 6.513858611024604e-06,
856
+ "loss": 2.6231,
857
  "step": 11200
858
  },
859
  {
860
+ "epoch": 0.7,
861
+ "learning_rate": 6.482715664901901e-06,
862
+ "loss": 2.8729,
863
  "step": 11300
864
  },
865
  {
866
+ "epoch": 0.71,
867
+ "learning_rate": 6.451572718779196e-06,
868
+ "loss": 2.9655,
869
  "step": 11400
870
  },
871
  {
872
+ "epoch": 0.72,
873
+ "learning_rate": 6.420429772656493e-06,
874
+ "loss": 3.1339,
875
  "step": 11500
876
  },
877
  {
878
+ "epoch": 0.72,
879
+ "eval_loss": NaN,
880
+ "eval_runtime": 1242.7737,
881
+ "eval_samples_per_second": 183.737,
882
+ "eval_steps_per_second": 22.967,
883
+ "step": 11500
884
+ },
885
+ {
886
+ "epoch": 0.72,
887
+ "learning_rate": 6.38928682653379e-06,
888
+ "loss": 3.2655,
889
  "step": 11600
890
  },
891
  {
892
+ "epoch": 0.73,
893
+ "learning_rate": 6.358143880411087e-06,
894
+ "loss": 2.7888,
895
  "step": 11700
896
  },
897
  {
898
+ "epoch": 0.73,
899
+ "learning_rate": 6.327000934288384e-06,
900
+ "loss": 3.7292,
901
  "step": 11800
902
  },
903
  {
904
+ "epoch": 0.74,
905
+ "learning_rate": 6.295857988165681e-06,
906
+ "loss": 2.6393,
907
  "step": 11900
908
  },
909
  {
910
+ "epoch": 0.75,
911
+ "learning_rate": 6.264715042042978e-06,
912
+ "loss": 2.9632,
913
+ "step": 12000
914
+ },
915
+ {
916
+ "epoch": 0.75,
917
+ "eval_loss": NaN,
918
+ "eval_runtime": 1242.1518,
919
+ "eval_samples_per_second": 183.829,
920
+ "eval_steps_per_second": 22.979,
921
  "step": 12000
922
  },
923
  {
924
+ "epoch": 0.75,
925
+ "learning_rate": 6.233572095920275e-06,
926
+ "loss": 2.4857,
927
  "step": 12100
928
  },
929
  {
930
+ "epoch": 0.76,
931
+ "learning_rate": 6.202429149797571e-06,
932
+ "loss": 2.9144,
933
  "step": 12200
934
  },
935
  {
936
+ "epoch": 0.77,
937
+ "learning_rate": 6.171286203674868e-06,
938
+ "loss": 2.7133,
939
  "step": 12300
940
  },
941
  {
942
+ "epoch": 0.77,
943
+ "learning_rate": 6.140143257552165e-06,
944
+ "loss": 3.8753,
945
  "step": 12400
946
  },
947
  {
948
+ "epoch": 0.78,
949
+ "learning_rate": 6.109000311429461e-06,
950
+ "loss": 3.0089,
951
+ "step": 12500
952
+ },
953
+ {
954
+ "epoch": 0.78,
955
+ "eval_loss": NaN,
956
+ "eval_runtime": 1242.2159,
957
+ "eval_samples_per_second": 183.819,
958
+ "eval_steps_per_second": 22.977,
959
  "step": 12500
960
  },
961
  {
962
+ "epoch": 0.78,
963
+ "learning_rate": 6.077857365306758e-06,
964
+ "loss": 2.9934,
965
  "step": 12600
966
  },
967
  {
968
+ "epoch": 0.79,
969
+ "learning_rate": 6.047025848645283e-06,
970
+ "loss": 2.5078,
971
  "step": 12700
972
  },
973
  {
974
+ "epoch": 0.8,
975
+ "learning_rate": 6.01588290252258e-06,
976
+ "loss": 2.7188,
977
  "step": 12800
978
  },
979
  {
980
+ "epoch": 0.8,
981
+ "learning_rate": 5.984739956399875e-06,
982
+ "loss": 2.6552,
983
  "step": 12900
984
  },
985
  {
986
+ "epoch": 0.81,
987
+ "learning_rate": 5.953597010277172e-06,
988
+ "loss": 2.3908,
989
+ "step": 13000
990
+ },
991
+ {
992
+ "epoch": 0.81,
993
+ "eval_loss": NaN,
994
+ "eval_runtime": 1242.3282,
995
+ "eval_samples_per_second": 183.802,
996
+ "eval_steps_per_second": 22.975,
997
  "step": 13000
998
+ },
999
+ {
1000
+ "epoch": 0.82,
1001
+ "learning_rate": 5.922454064154469e-06,
1002
+ "loss": 2.7332,
1003
+ "step": 13100
1004
+ },
1005
+ {
1006
+ "epoch": 0.82,
1007
+ "learning_rate": 5.891311118031766e-06,
1008
+ "loss": 2.3849,
1009
+ "step": 13200
1010
+ },
1011
+ {
1012
+ "epoch": 0.83,
1013
+ "learning_rate": 5.860168171909063e-06,
1014
+ "loss": 3.4628,
1015
+ "step": 13300
1016
+ },
1017
+ {
1018
+ "epoch": 0.83,
1019
+ "learning_rate": 5.82902522578636e-06,
1020
+ "loss": 2.6728,
1021
+ "step": 13400
1022
+ },
1023
+ {
1024
+ "epoch": 0.84,
1025
+ "learning_rate": 5.797882279663657e-06,
1026
+ "loss": 2.8807,
1027
+ "step": 13500
1028
+ },
1029
+ {
1030
+ "epoch": 0.84,
1031
+ "eval_loss": NaN,
1032
+ "eval_runtime": 1241.8007,
1033
+ "eval_samples_per_second": 183.881,
1034
+ "eval_steps_per_second": 22.985,
1035
+ "step": 13500
1036
+ },
1037
+ {
1038
+ "epoch": 0.85,
1039
+ "learning_rate": 5.766739333540954e-06,
1040
+ "loss": 2.9648,
1041
+ "step": 13600
1042
+ },
1043
+ {
1044
+ "epoch": 0.85,
1045
+ "learning_rate": 5.735596387418251e-06,
1046
+ "loss": 2.8685,
1047
+ "step": 13700
1048
+ },
1049
+ {
1050
+ "epoch": 0.86,
1051
+ "learning_rate": 5.704453441295547e-06,
1052
+ "loss": 2.7226,
1053
+ "step": 13800
1054
+ },
1055
+ {
1056
+ "epoch": 0.87,
1057
+ "learning_rate": 5.673310495172844e-06,
1058
+ "loss": 2.7493,
1059
+ "step": 13900
1060
+ },
1061
+ {
1062
+ "epoch": 0.87,
1063
+ "learning_rate": 5.642167549050141e-06,
1064
+ "loss": 2.4787,
1065
+ "step": 14000
1066
+ },
1067
+ {
1068
+ "epoch": 0.87,
1069
+ "eval_loss": NaN,
1070
+ "eval_runtime": 1242.1112,
1071
+ "eval_samples_per_second": 183.835,
1072
+ "eval_steps_per_second": 22.979,
1073
+ "step": 14000
1074
+ },
1075
+ {
1076
+ "epoch": 0.88,
1077
+ "learning_rate": 5.611024602927437e-06,
1078
+ "loss": 2.6621,
1079
+ "step": 14100
1080
+ },
1081
+ {
1082
+ "epoch": 0.88,
1083
+ "learning_rate": 5.580193086265962e-06,
1084
+ "loss": 2.5317,
1085
+ "step": 14200
1086
+ },
1087
+ {
1088
+ "epoch": 0.89,
1089
+ "learning_rate": 5.549050140143259e-06,
1090
+ "loss": 2.8094,
1091
+ "step": 14300
1092
+ },
1093
+ {
1094
+ "epoch": 0.9,
1095
+ "learning_rate": 5.5179071940205556e-06,
1096
+ "loss": 2.9184,
1097
+ "step": 14400
1098
+ },
1099
+ {
1100
+ "epoch": 0.9,
1101
+ "learning_rate": 5.486764247897851e-06,
1102
+ "loss": 3.0641,
1103
+ "step": 14500
1104
+ },
1105
+ {
1106
+ "epoch": 0.9,
1107
+ "eval_loss": NaN,
1108
+ "eval_runtime": 1242.2939,
1109
+ "eval_samples_per_second": 183.808,
1110
+ "eval_steps_per_second": 22.976,
1111
+ "step": 14500
1112
+ },
1113
+ {
1114
+ "epoch": 0.91,
1115
+ "learning_rate": 5.455621301775148e-06,
1116
+ "loss": 2.707,
1117
+ "step": 14600
1118
+ },
1119
+ {
1120
+ "epoch": 0.92,
1121
+ "learning_rate": 5.424478355652445e-06,
1122
+ "loss": 2.8484,
1123
+ "step": 14700
1124
+ },
1125
+ {
1126
+ "epoch": 0.92,
1127
+ "learning_rate": 5.393335409529742e-06,
1128
+ "loss": 2.7637,
1129
+ "step": 14800
1130
+ },
1131
+ {
1132
+ "epoch": 0.93,
1133
+ "learning_rate": 5.362192463407039e-06,
1134
+ "loss": 3.2172,
1135
+ "step": 14900
1136
+ },
1137
+ {
1138
+ "epoch": 0.93,
1139
+ "learning_rate": 5.331049517284336e-06,
1140
+ "loss": 3.8572,
1141
+ "step": 15000
1142
+ },
1143
+ {
1144
+ "epoch": 0.93,
1145
+ "eval_loss": NaN,
1146
+ "eval_runtime": 1242.3426,
1147
+ "eval_samples_per_second": 183.8,
1148
+ "eval_steps_per_second": 22.975,
1149
+ "step": 15000
1150
+ },
1151
+ {
1152
+ "epoch": 0.94,
1153
+ "learning_rate": 5.299906571161633e-06,
1154
+ "loss": 2.6719,
1155
+ "step": 15100
1156
+ },
1157
+ {
1158
+ "epoch": 0.95,
1159
+ "learning_rate": 5.2687636250389296e-06,
1160
+ "loss": 2.6299,
1161
+ "step": 15200
1162
+ },
1163
+ {
1164
+ "epoch": 0.95,
1165
+ "learning_rate": 5.237620678916226e-06,
1166
+ "loss": 2.7315,
1167
+ "step": 15300
1168
+ },
1169
+ {
1170
+ "epoch": 0.96,
1171
+ "learning_rate": 5.206477732793523e-06,
1172
+ "loss": 2.5421,
1173
+ "step": 15400
1174
+ },
1175
+ {
1176
+ "epoch": 0.97,
1177
+ "learning_rate": 5.17533478667082e-06,
1178
+ "loss": 2.6612,
1179
+ "step": 15500
1180
+ },
1181
+ {
1182
+ "epoch": 0.97,
1183
+ "eval_loss": NaN,
1184
+ "eval_runtime": 1242.7472,
1185
+ "eval_samples_per_second": 183.741,
1186
+ "eval_steps_per_second": 22.968,
1187
+ "step": 15500
1188
+ },
1189
+ {
1190
+ "epoch": 0.97,
1191
+ "learning_rate": 5.1441918405481166e-06,
1192
+ "loss": 2.4232,
1193
+ "step": 15600
1194
+ },
1195
+ {
1196
+ "epoch": 0.98,
1197
+ "learning_rate": 5.113048894425413e-06,
1198
+ "loss": 2.4607,
1199
+ "step": 15700
1200
+ },
1201
+ {
1202
+ "epoch": 0.98,
1203
+ "learning_rate": 5.08190594830271e-06,
1204
+ "loss": 2.5239,
1205
+ "step": 15800
1206
+ },
1207
+ {
1208
+ "epoch": 0.99,
1209
+ "learning_rate": 5.050763002180007e-06,
1210
+ "loss": 2.7479,
1211
+ "step": 15900
1212
+ },
1213
+ {
1214
+ "epoch": 1.0,
1215
+ "learning_rate": 5.0196200560573036e-06,
1216
+ "loss": 2.3672,
1217
+ "step": 16000
1218
+ },
1219
+ {
1220
+ "epoch": 1.0,
1221
+ "eval_loss": NaN,
1222
+ "eval_runtime": 1242.0429,
1223
+ "eval_samples_per_second": 183.845,
1224
+ "eval_steps_per_second": 22.981,
1225
+ "step": 16000
1226
+ },
1227
+ {
1228
+ "epoch": 1.0,
1229
+ "learning_rate": 4.9884771099346005e-06,
1230
+ "loss": 2.8632,
1231
+ "step": 16100
1232
+ },
1233
+ {
1234
+ "epoch": 1.01,
1235
+ "learning_rate": 4.9573341638118975e-06,
1236
+ "loss": 2.6653,
1237
+ "step": 16200
1238
+ },
1239
+ {
1240
+ "epoch": 1.02,
1241
+ "learning_rate": 4.9261912176891945e-06,
1242
+ "loss": 2.539,
1243
+ "step": 16300
1244
+ },
1245
+ {
1246
+ "epoch": 1.02,
1247
+ "learning_rate": 4.8950482715664906e-06,
1248
+ "loss": 2.8172,
1249
+ "step": 16400
1250
+ },
1251
+ {
1252
+ "epoch": 1.03,
1253
+ "learning_rate": 4.8639053254437875e-06,
1254
+ "loss": 2.5157,
1255
+ "step": 16500
1256
+ },
1257
+ {
1258
+ "epoch": 1.03,
1259
+ "eval_loss": NaN,
1260
+ "eval_runtime": 11659.1974,
1261
+ "eval_samples_per_second": 19.585,
1262
+ "eval_steps_per_second": 2.448,
1263
+ "step": 16500
1264
+ },
1265
+ {
1266
+ "epoch": 1.03,
1267
+ "learning_rate": 4.8327623793210845e-06,
1268
+ "loss": 2.5209,
1269
+ "step": 16600
1270
+ },
1271
+ {
1272
+ "epoch": 1.04,
1273
+ "learning_rate": 4.8016194331983815e-06,
1274
+ "loss": 2.9114,
1275
+ "step": 16700
1276
+ },
1277
+ {
1278
+ "epoch": 1.05,
1279
+ "learning_rate": 4.7704764870756776e-06,
1280
+ "loss": 2.6053,
1281
+ "step": 16800
1282
+ },
1283
+ {
1284
+ "epoch": 1.05,
1285
+ "learning_rate": 4.7393335409529745e-06,
1286
+ "loss": 2.8304,
1287
+ "step": 16900
1288
+ },
1289
+ {
1290
+ "epoch": 1.06,
1291
+ "learning_rate": 4.7081905948302715e-06,
1292
+ "loss": 2.7648,
1293
+ "step": 17000
1294
+ },
1295
+ {
1296
+ "epoch": 1.06,
1297
+ "eval_loss": NaN,
1298
+ "eval_runtime": 1241.8221,
1299
+ "eval_samples_per_second": 183.877,
1300
+ "eval_steps_per_second": 22.985,
1301
+ "step": 17000
1302
+ },
1303
+ {
1304
+ "epoch": 1.07,
1305
+ "learning_rate": 4.6770476487075685e-06,
1306
+ "loss": 2.9948,
1307
+ "step": 17100
1308
+ },
1309
+ {
1310
+ "epoch": 1.07,
1311
+ "learning_rate": 4.6459047025848646e-06,
1312
+ "loss": 2.7007,
1313
+ "step": 17200
1314
+ },
1315
+ {
1316
+ "epoch": 1.08,
1317
+ "learning_rate": 4.6147617564621615e-06,
1318
+ "loss": 3.0049,
1319
+ "step": 17300
1320
+ },
1321
+ {
1322
+ "epoch": 1.08,
1323
+ "learning_rate": 4.5836188103394585e-06,
1324
+ "loss": 2.4344,
1325
+ "step": 17400
1326
+ },
1327
+ {
1328
+ "epoch": 1.09,
1329
+ "learning_rate": 4.5524758642167555e-06,
1330
+ "loss": 2.474,
1331
+ "step": 17500
1332
+ },
1333
+ {
1334
+ "epoch": 1.09,
1335
+ "eval_loss": NaN,
1336
+ "eval_runtime": 1242.354,
1337
+ "eval_samples_per_second": 183.799,
1338
+ "eval_steps_per_second": 22.975,
1339
+ "step": 17500
1340
+ },
1341
+ {
1342
+ "epoch": 1.1,
1343
+ "learning_rate": 4.5213329180940516e-06,
1344
+ "loss": 2.653,
1345
+ "step": 17600
1346
+ },
1347
+ {
1348
+ "epoch": 1.1,
1349
+ "learning_rate": 4.4901899719713485e-06,
1350
+ "loss": 2.7398,
1351
+ "step": 17700
1352
+ },
1353
+ {
1354
+ "epoch": 1.11,
1355
+ "learning_rate": 4.4590470258486455e-06,
1356
+ "loss": 2.3064,
1357
+ "step": 17800
1358
+ },
1359
+ {
1360
+ "epoch": 1.11,
1361
+ "learning_rate": 4.4279040797259425e-06,
1362
+ "loss": 2.8268,
1363
+ "step": 17900
1364
+ },
1365
+ {
1366
+ "epoch": 1.12,
1367
+ "learning_rate": 4.3967611336032386e-06,
1368
+ "loss": 2.3197,
1369
+ "step": 18000
1370
+ },
1371
+ {
1372
+ "epoch": 1.12,
1373
+ "eval_loss": NaN,
1374
+ "eval_runtime": 1244.3942,
1375
+ "eval_samples_per_second": 183.497,
1376
+ "eval_steps_per_second": 22.937,
1377
+ "step": 18000
1378
+ },
1379
+ {
1380
+ "epoch": 1.13,
1381
+ "learning_rate": 4.3656181874805355e-06,
1382
+ "loss": 2.6969,
1383
+ "step": 18100
1384
+ },
1385
+ {
1386
+ "epoch": 1.13,
1387
+ "learning_rate": 4.33478667081906e-06,
1388
+ "loss": 2.6233,
1389
+ "step": 18200
1390
+ },
1391
+ {
1392
+ "epoch": 1.14,
1393
+ "learning_rate": 4.303643724696356e-06,
1394
+ "loss": 2.9522,
1395
+ "step": 18300
1396
+ },
1397
+ {
1398
+ "epoch": 1.15,
1399
+ "learning_rate": 4.272500778573653e-06,
1400
+ "loss": 2.5308,
1401
+ "step": 18400
1402
+ },
1403
+ {
1404
+ "epoch": 1.15,
1405
+ "learning_rate": 4.24135783245095e-06,
1406
+ "loss": 2.9766,
1407
+ "step": 18500
1408
+ },
1409
+ {
1410
+ "epoch": 1.15,
1411
+ "eval_loss": NaN,
1412
+ "eval_runtime": 1242.8298,
1413
+ "eval_samples_per_second": 183.728,
1414
+ "eval_steps_per_second": 22.966,
1415
+ "step": 18500
1416
+ },
1417
+ {
1418
+ "epoch": 1.16,
1419
+ "learning_rate": 4.210214886328247e-06,
1420
+ "loss": 2.4952,
1421
+ "step": 18600
1422
+ },
1423
+ {
1424
+ "epoch": 1.16,
1425
+ "learning_rate": 4.179071940205543e-06,
1426
+ "loss": 3.0581,
1427
+ "step": 18700
1428
+ },
1429
+ {
1430
+ "epoch": 1.17,
1431
+ "learning_rate": 4.14792899408284e-06,
1432
+ "loss": 2.3722,
1433
+ "step": 18800
1434
+ },
1435
+ {
1436
+ "epoch": 1.18,
1437
+ "learning_rate": 4.116786047960137e-06,
1438
+ "loss": 2.6995,
1439
+ "step": 18900
1440
+ },
1441
+ {
1442
+ "epoch": 1.18,
1443
+ "learning_rate": 4.085643101837434e-06,
1444
+ "loss": 2.827,
1445
+ "step": 19000
1446
+ },
1447
+ {
1448
+ "epoch": 1.18,
1449
+ "eval_loss": NaN,
1450
+ "eval_runtime": 1242.4071,
1451
+ "eval_samples_per_second": 183.791,
1452
+ "eval_steps_per_second": 22.974,
1453
+ "step": 19000
1454
+ },
1455
+ {
1456
+ "epoch": 1.19,
1457
+ "learning_rate": 4.054500155714731e-06,
1458
+ "loss": 2.7205,
1459
+ "step": 19100
1460
+ },
1461
+ {
1462
+ "epoch": 1.2,
1463
+ "learning_rate": 4.023357209592027e-06,
1464
+ "loss": 2.9195,
1465
+ "step": 19200
1466
+ },
1467
+ {
1468
+ "epoch": 1.2,
1469
+ "learning_rate": 3.992214263469324e-06,
1470
+ "loss": 2.8558,
1471
+ "step": 19300
1472
+ },
1473
+ {
1474
+ "epoch": 1.21,
1475
+ "learning_rate": 3.961071317346621e-06,
1476
+ "loss": 3.2538,
1477
+ "step": 19400
1478
+ },
1479
+ {
1480
+ "epoch": 1.21,
1481
+ "learning_rate": 3.929928371223918e-06,
1482
+ "loss": 2.7213,
1483
+ "step": 19500
1484
+ },
1485
+ {
1486
+ "epoch": 1.21,
1487
+ "eval_loss": NaN,
1488
+ "eval_runtime": 1242.5138,
1489
+ "eval_samples_per_second": 183.775,
1490
+ "eval_steps_per_second": 22.972,
1491
+ "step": 19500
1492
+ },
1493
+ {
1494
+ "epoch": 1.22,
1495
+ "learning_rate": 3.898785425101214e-06,
1496
+ "loss": 2.7368,
1497
+ "step": 19600
1498
+ },
1499
+ {
1500
+ "epoch": 1.23,
1501
+ "learning_rate": 3.867642478978511e-06,
1502
+ "loss": 2.6442,
1503
+ "step": 19700
1504
+ },
1505
+ {
1506
+ "epoch": 1.23,
1507
+ "learning_rate": 3.836499532855808e-06,
1508
+ "loss": 2.6891,
1509
+ "step": 19800
1510
+ },
1511
+ {
1512
+ "epoch": 1.24,
1513
+ "learning_rate": 3.8053565867331056e-06,
1514
+ "loss": 2.6657,
1515
+ "step": 19900
1516
+ },
1517
+ {
1518
+ "epoch": 1.25,
1519
+ "learning_rate": 3.7742136406104017e-06,
1520
+ "loss": 2.6653,
1521
+ "step": 20000
1522
+ },
1523
+ {
1524
+ "epoch": 1.25,
1525
+ "eval_loss": NaN,
1526
+ "eval_runtime": 1243.214,
1527
+ "eval_samples_per_second": 183.672,
1528
+ "eval_steps_per_second": 22.959,
1529
+ "step": 20000
1530
+ },
1531
+ {
1532
+ "epoch": 1.25,
1533
+ "learning_rate": 3.7430706944876987e-06,
1534
+ "loss": 3.0333,
1535
+ "step": 20100
1536
+ },
1537
+ {
1538
+ "epoch": 1.26,
1539
+ "learning_rate": 3.7119277483649957e-06,
1540
+ "loss": 2.2654,
1541
+ "step": 20200
1542
+ },
1543
+ {
1544
+ "epoch": 1.26,
1545
+ "learning_rate": 3.6807848022422926e-06,
1546
+ "loss": 2.8074,
1547
+ "step": 20300
1548
+ },
1549
+ {
1550
+ "epoch": 1.27,
1551
+ "learning_rate": 3.649641856119589e-06,
1552
+ "loss": 2.5416,
1553
+ "step": 20400
1554
+ },
1555
+ {
1556
+ "epoch": 1.28,
1557
+ "learning_rate": 3.618498909996886e-06,
1558
+ "loss": 2.671,
1559
+ "step": 20500
1560
+ },
1561
+ {
1562
+ "epoch": 1.28,
1563
+ "eval_loss": NaN,
1564
+ "eval_runtime": 1242.2133,
1565
+ "eval_samples_per_second": 183.819,
1566
+ "eval_steps_per_second": 22.978,
1567
+ "step": 20500
1568
+ },
1569
+ {
1570
+ "epoch": 1.28,
1571
+ "learning_rate": 3.5873559638741827e-06,
1572
+ "loss": 2.3197,
1573
+ "step": 20600
1574
+ },
1575
+ {
1576
+ "epoch": 1.29,
1577
+ "learning_rate": 3.5562130177514796e-06,
1578
+ "loss": 2.6825,
1579
+ "step": 20700
1580
+ },
1581
+ {
1582
+ "epoch": 1.3,
1583
+ "learning_rate": 3.525070071628776e-06,
1584
+ "loss": 2.9642,
1585
+ "step": 20800
1586
+ },
1587
+ {
1588
+ "epoch": 1.3,
1589
+ "learning_rate": 3.493927125506073e-06,
1590
+ "loss": 2.327,
1591
+ "step": 20900
1592
+ },
1593
+ {
1594
+ "epoch": 1.31,
1595
+ "learning_rate": 3.463095608844597e-06,
1596
+ "loss": 2.5401,
1597
+ "step": 21000
1598
+ },
1599
+ {
1600
+ "epoch": 1.31,
1601
+ "eval_loss": NaN,
1602
+ "eval_runtime": 1242.9072,
1603
+ "eval_samples_per_second": 183.717,
1604
+ "eval_steps_per_second": 22.965,
1605
+ "step": 21000
1606
+ },
1607
+ {
1608
+ "epoch": 1.31,
1609
+ "learning_rate": 3.4319526627218935e-06,
1610
+ "loss": 2.6376,
1611
+ "step": 21100
1612
+ },
1613
+ {
1614
+ "epoch": 1.32,
1615
+ "learning_rate": 3.4008097165991905e-06,
1616
+ "loss": 2.8178,
1617
+ "step": 21200
1618
+ },
1619
+ {
1620
+ "epoch": 1.33,
1621
+ "learning_rate": 3.3696667704764874e-06,
1622
+ "loss": 2.2995,
1623
+ "step": 21300
1624
+ },
1625
+ {
1626
+ "epoch": 1.33,
1627
+ "learning_rate": 3.3385238243537844e-06,
1628
+ "loss": 3.476,
1629
+ "step": 21400
1630
+ },
1631
+ {
1632
+ "epoch": 1.34,
1633
+ "learning_rate": 3.3073808782310805e-06,
1634
+ "loss": 2.7193,
1635
+ "step": 21500
1636
+ },
1637
+ {
1638
+ "epoch": 1.34,
1639
+ "eval_loss": NaN,
1640
+ "eval_runtime": 1242.302,
1641
+ "eval_samples_per_second": 183.806,
1642
+ "eval_steps_per_second": 22.976,
1643
+ "step": 21500
1644
+ },
1645
+ {
1646
+ "epoch": 1.35,
1647
+ "learning_rate": 3.2762379321083775e-06,
1648
+ "loss": 2.2606,
1649
+ "step": 21600
1650
+ },
1651
+ {
1652
+ "epoch": 1.35,
1653
+ "learning_rate": 3.2450949859856744e-06,
1654
+ "loss": 2.4979,
1655
+ "step": 21700
1656
+ },
1657
+ {
1658
+ "epoch": 1.36,
1659
+ "learning_rate": 3.2139520398629714e-06,
1660
+ "loss": 2.8394,
1661
+ "step": 21800
1662
+ },
1663
+ {
1664
+ "epoch": 1.36,
1665
+ "learning_rate": 3.1828090937402684e-06,
1666
+ "loss": 2.5935,
1667
+ "step": 21900
1668
+ },
1669
+ {
1670
+ "epoch": 1.37,
1671
+ "learning_rate": 3.151666147617565e-06,
1672
+ "loss": 2.5924,
1673
+ "step": 22000
1674
+ },
1675
+ {
1676
+ "epoch": 1.37,
1677
+ "eval_loss": NaN,
1678
+ "eval_runtime": 1239.6995,
1679
+ "eval_samples_per_second": 184.192,
1680
+ "eval_steps_per_second": 23.024,
1681
+ "step": 22000
1682
+ },
1683
+ {
1684
+ "epoch": 1.38,
1685
+ "learning_rate": 3.120523201494862e-06,
1686
+ "loss": 2.3212,
1687
+ "step": 22100
1688
+ },
1689
+ {
1690
+ "epoch": 1.38,
1691
+ "learning_rate": 3.0893802553721584e-06,
1692
+ "loss": 2.5372,
1693
+ "step": 22200
1694
+ },
1695
+ {
1696
+ "epoch": 1.39,
1697
+ "learning_rate": 3.0582373092494554e-06,
1698
+ "loss": 3.17,
1699
+ "step": 22300
1700
+ },
1701
+ {
1702
+ "epoch": 1.4,
1703
+ "learning_rate": 3.027094363126752e-06,
1704
+ "loss": 2.3103,
1705
+ "step": 22400
1706
+ },
1707
+ {
1708
+ "epoch": 1.4,
1709
+ "learning_rate": 2.995951417004049e-06,
1710
+ "loss": 2.5506,
1711
+ "step": 22500
1712
+ },
1713
+ {
1714
+ "epoch": 1.4,
1715
+ "eval_loss": NaN,
1716
+ "eval_runtime": 1238.6358,
1717
+ "eval_samples_per_second": 184.35,
1718
+ "eval_steps_per_second": 23.044,
1719
+ "step": 22500
1720
+ },
1721
+ {
1722
+ "epoch": 1.41,
1723
+ "learning_rate": 2.964808470881346e-06,
1724
+ "loss": 2.3131,
1725
+ "step": 22600
1726
+ },
1727
+ {
1728
+ "epoch": 1.41,
1729
+ "learning_rate": 2.9336655247586428e-06,
1730
+ "loss": 2.9797,
1731
+ "step": 22700
1732
+ },
1733
+ {
1734
+ "epoch": 1.42,
1735
+ "learning_rate": 2.902522578635939e-06,
1736
+ "loss": 3.3517,
1737
+ "step": 22800
1738
+ },
1739
+ {
1740
+ "epoch": 1.43,
1741
+ "learning_rate": 2.871379632513236e-06,
1742
+ "loss": 2.3309,
1743
+ "step": 22900
1744
+ },
1745
+ {
1746
+ "epoch": 1.43,
1747
+ "learning_rate": 2.840236686390533e-06,
1748
+ "loss": 2.9167,
1749
+ "step": 23000
1750
+ },
1751
+ {
1752
+ "epoch": 1.43,
1753
+ "eval_loss": NaN,
1754
+ "eval_runtime": 1239.5086,
1755
+ "eval_samples_per_second": 184.221,
1756
+ "eval_steps_per_second": 23.028,
1757
+ "step": 23000
1758
+ },
1759
+ {
1760
+ "epoch": 1.44,
1761
+ "learning_rate": 2.8090937402678298e-06,
1762
+ "loss": 2.612,
1763
+ "step": 23100
1764
+ },
1765
+ {
1766
+ "epoch": 1.44,
1767
+ "learning_rate": 2.7779507941451263e-06,
1768
+ "loss": 2.5327,
1769
+ "step": 23200
1770
+ },
1771
+ {
1772
+ "epoch": 1.45,
1773
+ "learning_rate": 2.746807848022423e-06,
1774
+ "loss": 2.4244,
1775
+ "step": 23300
1776
+ },
1777
+ {
1778
+ "epoch": 1.46,
1779
+ "learning_rate": 2.71566490189972e-06,
1780
+ "loss": 2.6675,
1781
+ "step": 23400
1782
+ },
1783
+ {
1784
+ "epoch": 1.46,
1785
+ "learning_rate": 2.6845219557770168e-06,
1786
+ "loss": 2.8272,
1787
+ "step": 23500
1788
+ },
1789
+ {
1790
+ "epoch": 1.46,
1791
+ "eval_loss": NaN,
1792
+ "eval_runtime": 1242.2478,
1793
+ "eval_samples_per_second": 183.814,
1794
+ "eval_steps_per_second": 22.977,
1795
+ "step": 23500
1796
+ },
1797
+ {
1798
+ "epoch": 1.47,
1799
+ "learning_rate": 2.6533790096543133e-06,
1800
+ "loss": 2.419,
1801
+ "step": 23600
1802
+ },
1803
+ {
1804
+ "epoch": 1.48,
1805
+ "learning_rate": 2.6222360635316103e-06,
1806
+ "loss": 2.4781,
1807
+ "step": 23700
1808
+ },
1809
+ {
1810
+ "epoch": 1.48,
1811
+ "learning_rate": 2.5910931174089072e-06,
1812
+ "loss": 3.1415,
1813
+ "step": 23800
1814
+ },
1815
+ {
1816
+ "epoch": 1.49,
1817
+ "learning_rate": 2.559950171286204e-06,
1818
+ "loss": 2.5226,
1819
+ "step": 23900
1820
+ },
1821
+ {
1822
+ "epoch": 1.49,
1823
+ "learning_rate": 2.5288072251635003e-06,
1824
+ "loss": 2.4586,
1825
+ "step": 24000
1826
+ },
1827
+ {
1828
+ "epoch": 1.49,
1829
+ "eval_loss": NaN,
1830
+ "eval_runtime": 1242.6605,
1831
+ "eval_samples_per_second": 183.753,
1832
+ "eval_steps_per_second": 22.969,
1833
+ "step": 24000
1834
+ },
1835
+ {
1836
+ "epoch": 1.5,
1837
+ "learning_rate": 2.4976642790407973e-06,
1838
+ "loss": 2.6241,
1839
+ "step": 24100
1840
+ },
1841
+ {
1842
+ "epoch": 1.51,
1843
+ "learning_rate": 2.4665213329180942e-06,
1844
+ "loss": 2.9088,
1845
+ "step": 24200
1846
+ },
1847
+ {
1848
+ "epoch": 1.51,
1849
+ "learning_rate": 2.435378386795391e-06,
1850
+ "loss": 2.3136,
1851
+ "step": 24300
1852
+ },
1853
+ {
1854
+ "epoch": 1.52,
1855
+ "learning_rate": 2.4042354406726877e-06,
1856
+ "loss": 2.5916,
1857
+ "step": 24400
1858
+ },
1859
+ {
1860
+ "epoch": 1.53,
1861
+ "learning_rate": 2.3730924945499847e-06,
1862
+ "loss": 2.5698,
1863
+ "step": 24500
1864
+ },
1865
+ {
1866
+ "epoch": 1.53,
1867
+ "eval_loss": NaN,
1868
+ "eval_runtime": 1243.0815,
1869
+ "eval_samples_per_second": 183.691,
1870
+ "eval_steps_per_second": 22.961,
1871
+ "step": 24500
1872
+ },
1873
+ {
1874
+ "epoch": 1.53,
1875
+ "learning_rate": 2.3419495484272812e-06,
1876
+ "loss": 2.9015,
1877
+ "step": 24600
1878
+ },
1879
+ {
1880
+ "epoch": 1.54,
1881
+ "learning_rate": 2.310806602304578e-06,
1882
+ "loss": 3.1771,
1883
+ "step": 24700
1884
+ },
1885
+ {
1886
+ "epoch": 1.54,
1887
+ "learning_rate": 2.279663656181875e-06,
1888
+ "loss": 3.388,
1889
+ "step": 24800
1890
+ },
1891
+ {
1892
+ "epoch": 1.55,
1893
+ "learning_rate": 2.2485207100591717e-06,
1894
+ "loss": 2.5991,
1895
+ "step": 24900
1896
+ },
1897
+ {
1898
+ "epoch": 1.56,
1899
+ "learning_rate": 2.2173777639364687e-06,
1900
+ "loss": 2.6171,
1901
+ "step": 25000
1902
+ },
1903
+ {
1904
+ "epoch": 1.56,
1905
+ "eval_loss": NaN,
1906
+ "eval_runtime": 1242.7545,
1907
+ "eval_samples_per_second": 183.739,
1908
+ "eval_steps_per_second": 22.968,
1909
+ "step": 25000
1910
+ },
1911
+ {
1912
+ "epoch": 1.56,
1913
+ "learning_rate": 2.1865462472749925e-06,
1914
+ "loss": 2.506,
1915
+ "step": 25100
1916
+ },
1917
+ {
1918
+ "epoch": 1.57,
1919
+ "learning_rate": 2.1554033011522895e-06,
1920
+ "loss": 2.6567,
1921
+ "step": 25200
1922
+ },
1923
+ {
1924
+ "epoch": 1.58,
1925
+ "learning_rate": 2.124260355029586e-06,
1926
+ "loss": 3.2163,
1927
+ "step": 25300
1928
+ },
1929
+ {
1930
+ "epoch": 1.58,
1931
+ "learning_rate": 2.09342883836811e-06,
1932
+ "loss": 2.8622,
1933
+ "step": 25400
1934
+ },
1935
+ {
1936
+ "epoch": 1.59,
1937
+ "learning_rate": 2.062285892245407e-06,
1938
+ "loss": 3.0785,
1939
+ "step": 25500
1940
+ },
1941
+ {
1942
+ "epoch": 1.59,
1943
+ "eval_loss": NaN,
1944
+ "eval_runtime": 1241.7925,
1945
+ "eval_samples_per_second": 183.882,
1946
+ "eval_steps_per_second": 22.985,
1947
+ "step": 25500
1948
+ },
1949
+ {
1950
+ "epoch": 1.59,
1951
+ "learning_rate": 2.0311429461227034e-06,
1952
+ "loss": 2.735,
1953
+ "step": 25600
1954
+ },
1955
+ {
1956
+ "epoch": 1.6,
1957
+ "learning_rate": 2.0000000000000003e-06,
1958
+ "loss": 2.61,
1959
+ "step": 25700
1960
+ },
1961
+ {
1962
+ "epoch": 1.61,
1963
+ "learning_rate": 1.968857053877297e-06,
1964
+ "loss": 2.3942,
1965
+ "step": 25800
1966
+ },
1967
+ {
1968
+ "epoch": 1.61,
1969
+ "learning_rate": 1.937714107754594e-06,
1970
+ "loss": 2.1723,
1971
+ "step": 25900
1972
+ },
1973
+ {
1974
+ "epoch": 1.62,
1975
+ "learning_rate": 1.9065711616318906e-06,
1976
+ "loss": 2.856,
1977
+ "step": 26000
1978
+ },
1979
+ {
1980
+ "epoch": 1.62,
1981
+ "eval_loss": NaN,
1982
+ "eval_runtime": 1242.4465,
1983
+ "eval_samples_per_second": 183.785,
1984
+ "eval_steps_per_second": 22.973,
1985
+ "step": 26000
1986
+ },
1987
+ {
1988
+ "epoch": 1.63,
1989
+ "learning_rate": 1.8757396449704144e-06,
1990
+ "loss": 2.9466,
1991
+ "step": 26100
1992
+ },
1993
+ {
1994
+ "epoch": 1.63,
1995
+ "learning_rate": 1.8445966988477112e-06,
1996
+ "loss": 3.1076,
1997
+ "step": 26200
1998
+ },
1999
+ {
2000
+ "epoch": 1.64,
2001
+ "learning_rate": 1.813453752725008e-06,
2002
+ "loss": 2.2859,
2003
+ "step": 26300
2004
+ },
2005
+ {
2006
+ "epoch": 1.64,
2007
+ "learning_rate": 1.782310806602305e-06,
2008
+ "loss": 3.0796,
2009
+ "step": 26400
2010
+ },
2011
+ {
2012
+ "epoch": 1.65,
2013
+ "learning_rate": 1.7511678604796014e-06,
2014
+ "loss": 2.3788,
2015
+ "step": 26500
2016
+ },
2017
+ {
2018
+ "epoch": 1.65,
2019
+ "eval_loss": NaN,
2020
+ "eval_runtime": 1242.137,
2021
+ "eval_samples_per_second": 183.831,
2022
+ "eval_steps_per_second": 22.979,
2023
+ "step": 26500
2024
+ },
2025
+ {
2026
+ "epoch": 1.66,
2027
+ "learning_rate": 1.7200249143568984e-06,
2028
+ "loss": 2.7557,
2029
+ "step": 26600
2030
+ },
2031
+ {
2032
+ "epoch": 1.66,
2033
+ "learning_rate": 1.688881968234195e-06,
2034
+ "loss": 2.328,
2035
+ "step": 26700
2036
+ },
2037
+ {
2038
+ "epoch": 1.67,
2039
+ "learning_rate": 1.657739022111492e-06,
2040
+ "loss": 2.7651,
2041
+ "step": 26800
2042
+ },
2043
+ {
2044
+ "epoch": 1.68,
2045
+ "learning_rate": 1.6265960759887886e-06,
2046
+ "loss": 3.1467,
2047
+ "step": 26900
2048
+ },
2049
+ {
2050
+ "epoch": 1.68,
2051
+ "learning_rate": 1.5954531298660856e-06,
2052
+ "loss": 2.5629,
2053
+ "step": 27000
2054
+ },
2055
+ {
2056
+ "epoch": 1.68,
2057
+ "eval_loss": NaN,
2058
+ "eval_runtime": 1243.6496,
2059
+ "eval_samples_per_second": 183.607,
2060
+ "eval_steps_per_second": 22.951,
2061
+ "step": 27000
2062
+ },
2063
+ {
2064
+ "epoch": 1.69,
2065
+ "learning_rate": 1.5643101837433821e-06,
2066
+ "loss": 2.6638,
2067
+ "step": 27100
2068
+ },
2069
+ {
2070
+ "epoch": 1.69,
2071
+ "learning_rate": 1.5331672376206791e-06,
2072
+ "loss": 2.7094,
2073
+ "step": 27200
2074
+ },
2075
+ {
2076
+ "epoch": 1.7,
2077
+ "learning_rate": 1.5020242914979756e-06,
2078
+ "loss": 2.6052,
2079
+ "step": 27300
2080
+ },
2081
+ {
2082
+ "epoch": 1.71,
2083
+ "learning_rate": 1.4708813453752726e-06,
2084
+ "loss": 2.6647,
2085
+ "step": 27400
2086
+ },
2087
+ {
2088
+ "epoch": 1.71,
2089
+ "learning_rate": 1.4397383992525694e-06,
2090
+ "loss": 2.5914,
2091
+ "step": 27500
2092
+ },
2093
+ {
2094
+ "epoch": 1.71,
2095
+ "eval_loss": NaN,
2096
+ "eval_runtime": 1242.5399,
2097
+ "eval_samples_per_second": 183.771,
2098
+ "eval_steps_per_second": 22.971,
2099
+ "step": 27500
2100
+ },
2101
+ {
2102
+ "epoch": 1.72,
2103
+ "learning_rate": 1.4085954531298663e-06,
2104
+ "loss": 2.6187,
2105
+ "step": 27600
2106
+ },
2107
+ {
2108
+ "epoch": 1.73,
2109
+ "learning_rate": 1.3774525070071629e-06,
2110
+ "loss": 2.2137,
2111
+ "step": 27700
2112
+ },
2113
+ {
2114
+ "epoch": 1.73,
2115
+ "learning_rate": 1.3463095608844598e-06,
2116
+ "loss": 2.5718,
2117
+ "step": 27800
2118
+ },
2119
+ {
2120
+ "epoch": 1.74,
2121
+ "learning_rate": 1.3151666147617564e-06,
2122
+ "loss": 2.3102,
2123
+ "step": 27900
2124
+ },
2125
+ {
2126
+ "epoch": 1.74,
2127
+ "learning_rate": 1.2840236686390533e-06,
2128
+ "loss": 2.5802,
2129
+ "step": 28000
2130
+ },
2131
+ {
2132
+ "epoch": 1.74,
2133
+ "eval_loss": NaN,
2134
+ "eval_runtime": 1242.0317,
2135
+ "eval_samples_per_second": 183.846,
2136
+ "eval_steps_per_second": 22.981,
2137
+ "step": 28000
2138
+ },
2139
+ {
2140
+ "epoch": 1.75,
2141
+ "learning_rate": 1.25288072251635e-06,
2142
+ "loss": 3.2864,
2143
+ "step": 28100
2144
+ },
2145
+ {
2146
+ "epoch": 1.76,
2147
+ "learning_rate": 1.221737776393647e-06,
2148
+ "loss": 2.4009,
2149
+ "step": 28200
2150
+ },
2151
+ {
2152
+ "epoch": 1.76,
2153
+ "learning_rate": 1.1905948302709438e-06,
2154
+ "loss": 2.2293,
2155
+ "step": 28300
2156
+ },
2157
+ {
2158
+ "epoch": 1.77,
2159
+ "learning_rate": 1.1594518841482405e-06,
2160
+ "loss": 2.5808,
2161
+ "step": 28400
2162
+ },
2163
+ {
2164
+ "epoch": 1.78,
2165
+ "learning_rate": 1.1283089380255373e-06,
2166
+ "loss": 2.2956,
2167
+ "step": 28500
2168
+ },
2169
+ {
2170
+ "epoch": 1.78,
2171
+ "eval_loss": NaN,
2172
+ "eval_runtime": 1242.0354,
2173
+ "eval_samples_per_second": 183.846,
2174
+ "eval_steps_per_second": 22.981,
2175
+ "step": 28500
2176
+ },
2177
+ {
2178
+ "epoch": 1.78,
2179
+ "learning_rate": 1.097165991902834e-06,
2180
+ "loss": 2.6273,
2181
+ "step": 28600
2182
+ },
2183
+ {
2184
+ "epoch": 1.79,
2185
+ "learning_rate": 1.0660230457801308e-06,
2186
+ "loss": 2.5466,
2187
+ "step": 28700
2188
+ },
2189
+ {
2190
+ "epoch": 1.79,
2191
+ "learning_rate": 1.0348800996574275e-06,
2192
+ "loss": 2.7805,
2193
+ "step": 28800
2194
+ },
2195
+ {
2196
+ "epoch": 1.8,
2197
+ "learning_rate": 1.0037371535347245e-06,
2198
+ "loss": 2.4265,
2199
+ "step": 28900
2200
+ },
2201
+ {
2202
+ "epoch": 1.81,
2203
+ "learning_rate": 9.725942074120212e-07,
2204
+ "loss": 2.6872,
2205
+ "step": 29000
2206
+ },
2207
+ {
2208
+ "epoch": 1.81,
2209
+ "eval_loss": NaN,
2210
+ "eval_runtime": 1241.1816,
2211
+ "eval_samples_per_second": 183.972,
2212
+ "eval_steps_per_second": 22.997,
2213
+ "step": 29000
2214
+ },
2215
+ {
2216
+ "epoch": 1.81,
2217
+ "learning_rate": 9.41451261289318e-07,
2218
+ "loss": 2.8077,
2219
+ "step": 29100
2220
+ },
2221
+ {
2222
+ "epoch": 1.82,
2223
+ "learning_rate": 9.103083151666147e-07,
2224
+ "loss": 2.7051,
2225
+ "step": 29200
2226
+ },
2227
+ {
2228
+ "epoch": 1.82,
2229
+ "learning_rate": 8.791653690439116e-07,
2230
+ "loss": 2.6449,
2231
+ "step": 29300
2232
+ },
2233
+ {
2234
+ "epoch": 1.83,
2235
+ "learning_rate": 8.480224229212085e-07,
2236
+ "loss": 2.2203,
2237
+ "step": 29400
2238
+ },
2239
+ {
2240
+ "epoch": 1.84,
2241
+ "learning_rate": 8.168794767985053e-07,
2242
+ "loss": 2.7376,
2243
+ "step": 29500
2244
+ },
2245
+ {
2246
+ "epoch": 1.84,
2247
+ "eval_loss": NaN,
2248
+ "eval_runtime": 1242.2627,
2249
+ "eval_samples_per_second": 183.812,
2250
+ "eval_steps_per_second": 22.977,
2251
+ "step": 29500
2252
+ },
2253
+ {
2254
+ "epoch": 1.84,
2255
+ "learning_rate": 7.857365306758021e-07,
2256
+ "loss": 2.3105,
2257
+ "step": 29600
2258
+ },
2259
+ {
2260
+ "epoch": 1.85,
2261
+ "learning_rate": 7.545935845530988e-07,
2262
+ "loss": 3.303,
2263
+ "step": 29700
2264
+ },
2265
+ {
2266
+ "epoch": 1.86,
2267
+ "learning_rate": 7.234506384303956e-07,
2268
+ "loss": 2.6327,
2269
+ "step": 29800
2270
+ },
2271
+ {
2272
+ "epoch": 1.86,
2273
+ "learning_rate": 6.923076923076924e-07,
2274
+ "loss": 2.4727,
2275
+ "step": 29900
2276
+ },
2277
+ {
2278
+ "epoch": 1.87,
2279
+ "learning_rate": 6.611647461849892e-07,
2280
+ "loss": 2.5736,
2281
+ "step": 30000
2282
+ },
2283
+ {
2284
+ "epoch": 1.87,
2285
+ "eval_loss": NaN,
2286
+ "eval_runtime": 1242.5556,
2287
+ "eval_samples_per_second": 183.769,
2288
+ "eval_steps_per_second": 22.971,
2289
+ "step": 30000
2290
+ },
2291
+ {
2292
+ "epoch": 1.87,
2293
+ "learning_rate": 6.300218000622859e-07,
2294
+ "loss": 2.687,
2295
+ "step": 30100
2296
+ },
2297
+ {
2298
+ "epoch": 1.88,
2299
+ "learning_rate": 5.988788539395828e-07,
2300
+ "loss": 2.664,
2301
+ "step": 30200
2302
+ },
2303
+ {
2304
+ "epoch": 1.89,
2305
+ "learning_rate": 5.677359078168795e-07,
2306
+ "loss": 2.4285,
2307
+ "step": 30300
2308
+ },
2309
+ {
2310
+ "epoch": 1.89,
2311
+ "learning_rate": 5.369043911554033e-07,
2312
+ "loss": 2.4898,
2313
+ "step": 30400
2314
+ },
2315
+ {
2316
+ "epoch": 1.9,
2317
+ "learning_rate": 5.057614450327001e-07,
2318
+ "loss": 2.3551,
2319
+ "step": 30500
2320
+ },
2321
+ {
2322
+ "epoch": 1.9,
2323
+ "eval_loss": NaN,
2324
+ "eval_runtime": 1242.2664,
2325
+ "eval_samples_per_second": 183.812,
2326
+ "eval_steps_per_second": 22.977,
2327
+ "step": 30500
2328
+ },
2329
+ {
2330
+ "epoch": 1.91,
2331
+ "learning_rate": 4.7461849890999693e-07,
2332
+ "loss": 2.8719,
2333
+ "step": 30600
2334
+ },
2335
+ {
2336
+ "epoch": 1.91,
2337
+ "learning_rate": 4.4347555278729373e-07,
2338
+ "loss": 2.6584,
2339
+ "step": 30700
2340
+ },
2341
+ {
2342
+ "epoch": 1.92,
2343
+ "learning_rate": 4.1233260666459054e-07,
2344
+ "loss": 2.3366,
2345
+ "step": 30800
2346
+ },
2347
+ {
2348
+ "epoch": 1.92,
2349
+ "learning_rate": 3.811896605418873e-07,
2350
+ "loss": 2.8612,
2351
+ "step": 30900
2352
+ },
2353
+ {
2354
+ "epoch": 1.93,
2355
+ "learning_rate": 3.500467144191841e-07,
2356
+ "loss": 2.7175,
2357
+ "step": 31000
2358
+ },
2359
+ {
2360
+ "epoch": 1.93,
2361
+ "eval_loss": NaN,
2362
+ "eval_runtime": 1242.8604,
2363
+ "eval_samples_per_second": 183.724,
2364
+ "eval_steps_per_second": 22.966,
2365
+ "step": 31000
2366
+ },
2367
+ {
2368
+ "epoch": 1.94,
2369
+ "learning_rate": 3.189037682964809e-07,
2370
+ "loss": 2.9036,
2371
+ "step": 31100
2372
+ },
2373
+ {
2374
+ "epoch": 1.94,
2375
+ "learning_rate": 2.8776082217377764e-07,
2376
+ "loss": 2.8631,
2377
+ "step": 31200
2378
+ },
2379
+ {
2380
+ "epoch": 1.95,
2381
+ "learning_rate": 2.5661787605107445e-07,
2382
+ "loss": 2.4594,
2383
+ "step": 31300
2384
+ },
2385
+ {
2386
+ "epoch": 1.96,
2387
+ "learning_rate": 2.2547492992837125e-07,
2388
+ "loss": 2.9569,
2389
+ "step": 31400
2390
+ },
2391
+ {
2392
+ "epoch": 1.96,
2393
+ "learning_rate": 1.9433198380566805e-07,
2394
+ "loss": 3.0516,
2395
+ "step": 31500
2396
+ },
2397
+ {
2398
+ "epoch": 1.96,
2399
+ "eval_loss": NaN,
2400
+ "eval_runtime": 1243.9849,
2401
+ "eval_samples_per_second": 183.558,
2402
+ "eval_steps_per_second": 22.945,
2403
+ "step": 31500
2404
+ },
2405
+ {
2406
+ "epoch": 1.97,
2407
+ "learning_rate": 1.6318903768296483e-07,
2408
+ "loss": 2.529,
2409
+ "step": 31600
2410
+ },
2411
+ {
2412
+ "epoch": 1.97,
2413
+ "learning_rate": 1.320460915602616e-07,
2414
+ "loss": 2.4768,
2415
+ "step": 31700
2416
+ },
2417
+ {
2418
+ "epoch": 1.98,
2419
+ "learning_rate": 1.009031454375584e-07,
2420
+ "loss": 2.5839,
2421
+ "step": 31800
2422
+ },
2423
+ {
2424
+ "epoch": 1.99,
2425
+ "learning_rate": 6.97601993148552e-08,
2426
+ "loss": 2.6676,
2427
+ "step": 31900
2428
+ },
2429
+ {
2430
+ "epoch": 1.99,
2431
+ "learning_rate": 3.861725319215198e-08,
2432
+ "loss": 3.2724,
2433
+ "step": 32000
2434
+ },
2435
+ {
2436
+ "epoch": 1.99,
2437
+ "eval_loss": NaN,
2438
+ "eval_runtime": 1242.7853,
2439
+ "eval_samples_per_second": 183.735,
2440
+ "eval_steps_per_second": 22.967,
2441
+ "step": 32000
2442
+ },
2443
+ {
2444
+ "epoch": 2.0,
2445
+ "learning_rate": 7.47430706944877e-09,
2446
+ "loss": 2.7629,
2447
+ "step": 32100
2448
  }
2449
  ],
2450
+ "logging_steps": 100,
2451
+ "max_steps": 32110,
2452
+ "num_train_epochs": 2,
2453
+ "save_steps": 100,
2454
+ "total_flos": 1.0814548820133888e+18,
2455
  "trial_name": null,
2456
  "trial_params": null
2457
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ef65b045ad4b2dc92fbc98d274f27e0862642b5ac670c481855351ab61a2199
3
- size 3439
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63248d3212eed9dc79dcf28db34d358c07675fbcbcd9e5b5d24121d0ccc9f00a
3
+ size 4027