ProgramInNonsense commited on
Commit
fcdf117
·
verified ·
1 Parent(s): cc08e57

Training in progress, step 150, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a0f7e66a5cf5e2fe250e8be921f24692b83ef6dc556fb1ad68f814e1c8e95fd
3
  size 205573472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae644db6dced204b3d4a7fc41f2488b972585d9fe6da1b714504390dc3f3276c
3
  size 205573472
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d74b30ee9a6aaf67c9615cf76ae290dd35040db63b258316a8bf7a766bc2ed2
3
  size 411372650
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2763dbdc475b39f3629e485ec0f7fe8dd8f62d1750d0a500141831377c4ebbc9
3
  size 411372650
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd2c8e97237e376105e407b05be1e33b22c026561b920b8fe9d134eb88ebbcfa
3
  size 14308
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1af5cedfa33a695e8e556db1ebc9eecca4a98e863bc6bdc732784653406a6d13
3
  size 14308
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0dfc2af941e2567517229a8f44267f5380076f9a37680e99372f149a8e0c635
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5f37e21fbe85e09b136734aed2deb2ce642b5bd3d64c65196a2c110d8c5ff3a
3
  size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,2046 +1,125 @@
1
  {
2
- "best_metric": 0.7622952461242676,
3
- "best_model_checkpoint": "./output/checkpoint-2700",
4
- "epoch": 0.017052128989882405,
5
  "eval_steps": 150,
6
- "global_step": 2700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 6.315603329586075e-05,
13
- "grad_norm": 24.43903923034668,
14
  "learning_rate": 5.500000000000001e-06,
15
- "loss": 1.7206,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.0001263120665917215,
20
- "grad_norm": 22.279735565185547,
21
  "learning_rate": 1.1000000000000001e-05,
22
- "loss": 1.4591,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.00018946809988758226,
27
- "grad_norm": 11.544012069702148,
28
  "learning_rate": 1.65e-05,
29
- "loss": 1.1749,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.000252624133183443,
34
- "grad_norm": 22.925434112548828,
35
  "learning_rate": 2.2000000000000003e-05,
36
- "loss": 1.0654,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.00031578016647930375,
41
- "grad_norm": 19.041534423828125,
42
  "learning_rate": 2.75e-05,
43
- "loss": 1.0842,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.00037893619977516453,
48
- "grad_norm": 17.149545669555664,
49
  "learning_rate": 3.3e-05,
50
- "loss": 0.9869,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.00044209223307102526,
55
- "grad_norm": 13.974715232849121,
56
  "learning_rate": 3.85e-05,
57
- "loss": 0.9986,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.000505248266366886,
62
- "grad_norm": 21.322072982788086,
63
  "learning_rate": 4.4000000000000006e-05,
64
- "loss": 0.9939,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.0005684042996627468,
69
- "grad_norm": 11.883322715759277,
70
  "learning_rate": 4.9500000000000004e-05,
71
- "loss": 1.0996,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.0006315603329586075,
76
- "grad_norm": 10.673542976379395,
77
  "learning_rate": 5.5e-05,
78
- "loss": 0.9707,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.0006947163662544683,
83
- "grad_norm": 14.419370651245117,
84
  "learning_rate": 5.4999434791355066e-05,
85
- "loss": 1.1121,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.0007578723995503291,
90
- "grad_norm": 17.156391143798828,
91
  "learning_rate": 5.4997739188653784e-05,
92
- "loss": 0.9994,
93
  "step": 120
94
  },
95
  {
96
- "epoch": 0.0008210284328461898,
97
- "grad_norm": 12.670730590820312,
98
  "learning_rate": 5.4994913261595724e-05,
99
- "loss": 0.9904,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 0.0008841844661420505,
104
- "grad_norm": 14.464807510375977,
105
  "learning_rate": 5.49909571263437e-05,
106
- "loss": 1.0876,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 0.0009473404994379113,
111
- "grad_norm": 12.979162216186523,
112
  "learning_rate": 5.498587094551892e-05,
113
- "loss": 0.9945,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 0.0009473404994379113,
118
- "eval_loss": 0.9104289412498474,
119
- "eval_runtime": 47.9241,
120
- "eval_samples_per_second": 10.433,
121
- "eval_steps_per_second": 10.433,
122
  "step": 150
123
- },
124
- {
125
- "epoch": 0.001010496532733772,
126
- "grad_norm": 14.477323532104492,
127
- "learning_rate": 5.497965492819436e-05,
128
- "loss": 1.0328,
129
- "step": 160
130
- },
131
- {
132
- "epoch": 0.001073652566029633,
133
- "grad_norm": 15.670412063598633,
134
- "learning_rate": 5.4972309329886156e-05,
135
- "loss": 1.0667,
136
- "step": 170
137
- },
138
- {
139
- "epoch": 0.0011368085993254935,
140
- "grad_norm": 12.536147117614746,
141
- "learning_rate": 5.496383445254307e-05,
142
- "loss": 1.0397,
143
- "step": 180
144
- },
145
- {
146
- "epoch": 0.0011999646326213544,
147
- "grad_norm": 16.962190628051758,
148
- "learning_rate": 5.495423064453413e-05,
149
- "loss": 1.084,
150
- "step": 190
151
- },
152
- {
153
- "epoch": 0.001263120665917215,
154
- "grad_norm": 9.591404914855957,
155
- "learning_rate": 5.4943498300634254e-05,
156
- "loss": 1.0233,
157
- "step": 200
158
- },
159
- {
160
- "epoch": 0.0013262766992130758,
161
- "grad_norm": 12.136938095092773,
162
- "learning_rate": 5.493163786200807e-05,
163
- "loss": 1.0611,
164
- "step": 210
165
- },
166
- {
167
- "epoch": 0.0013894327325089367,
168
- "grad_norm": 10.48437786102295,
169
- "learning_rate": 5.491864981619175e-05,
170
- "loss": 1.0624,
171
- "step": 220
172
- },
173
- {
174
- "epoch": 0.0014525887658047973,
175
- "grad_norm": 11.525893211364746,
176
- "learning_rate": 5.4904534697073e-05,
177
- "loss": 1.0334,
178
- "step": 230
179
- },
180
- {
181
- "epoch": 0.0015157447991006581,
182
- "grad_norm": 8.725274085998535,
183
- "learning_rate": 5.488929308486908e-05,
184
- "loss": 0.9931,
185
- "step": 240
186
- },
187
- {
188
- "epoch": 0.0015789008323965187,
189
- "grad_norm": 11.628905296325684,
190
- "learning_rate": 5.487292560610295e-05,
191
- "loss": 0.963,
192
- "step": 250
193
- },
194
- {
195
- "epoch": 0.0016420568656923796,
196
- "grad_norm": 16.11012840270996,
197
- "learning_rate": 5.485543293357758e-05,
198
- "loss": 0.9915,
199
- "step": 260
200
- },
201
- {
202
- "epoch": 0.0017052128989882404,
203
- "grad_norm": 10.494763374328613,
204
- "learning_rate": 5.483681578634821e-05,
205
- "loss": 1.0215,
206
- "step": 270
207
- },
208
- {
209
- "epoch": 0.001768368932284101,
210
- "grad_norm": 9.800888061523438,
211
- "learning_rate": 5.481707492969285e-05,
212
- "loss": 1.0794,
213
- "step": 280
214
- },
215
- {
216
- "epoch": 0.0018315249655799619,
217
- "grad_norm": 8.919413566589355,
218
- "learning_rate": 5.479621117508079e-05,
219
- "loss": 1.0463,
220
- "step": 290
221
- },
222
- {
223
- "epoch": 0.0018946809988758227,
224
- "grad_norm": 14.154871940612793,
225
- "learning_rate": 5.477422538013927e-05,
226
- "loss": 1.0916,
227
- "step": 300
228
- },
229
- {
230
- "epoch": 0.0018946809988758227,
231
- "eval_loss": 0.8587029576301575,
232
- "eval_runtime": 47.5564,
233
- "eval_samples_per_second": 10.514,
234
- "eval_steps_per_second": 10.514,
235
- "step": 300
236
- },
237
- {
238
- "epoch": 0.0019578370321716833,
239
- "grad_norm": 9.973740577697754,
240
- "learning_rate": 5.475111844861821e-05,
241
- "loss": 1.024,
242
- "step": 310
243
- },
244
- {
245
- "epoch": 0.002020993065467544,
246
- "grad_norm": 15.102771759033203,
247
- "learning_rate": 5.4726891330353056e-05,
248
- "loss": 1.0394,
249
- "step": 320
250
- },
251
- {
252
- "epoch": 0.002084149098763405,
253
- "grad_norm": 9.678414344787598,
254
- "learning_rate": 5.4701545021225746e-05,
255
- "loss": 1.0355,
256
- "step": 330
257
- },
258
- {
259
- "epoch": 0.002147305132059266,
260
- "grad_norm": 16.077899932861328,
261
- "learning_rate": 5.4675080563123786e-05,
262
- "loss": 0.9778,
263
- "step": 340
264
- },
265
- {
266
- "epoch": 0.0022104611653551262,
267
- "grad_norm": 11.449134826660156,
268
- "learning_rate": 5.4647499043897386e-05,
269
- "loss": 1.0076,
270
- "step": 350
271
- },
272
- {
273
- "epoch": 0.002273617198650987,
274
- "grad_norm": 9.525121688842773,
275
- "learning_rate": 5.461880159731476e-05,
276
- "loss": 0.9715,
277
- "step": 360
278
- },
279
- {
280
- "epoch": 0.002336773231946848,
281
- "grad_norm": 8.877346992492676,
282
- "learning_rate": 5.4588989403015564e-05,
283
- "loss": 1.0915,
284
- "step": 370
285
- },
286
- {
287
- "epoch": 0.0023999292652427087,
288
- "grad_norm": 9.848546981811523,
289
- "learning_rate": 5.4558063686462315e-05,
290
- "loss": 1.0043,
291
- "step": 380
292
- },
293
- {
294
- "epoch": 0.0024630852985385696,
295
- "grad_norm": 14.764713287353516,
296
- "learning_rate": 5.4526025718890104e-05,
297
- "loss": 1.0622,
298
- "step": 390
299
- },
300
- {
301
- "epoch": 0.00252624133183443,
302
- "grad_norm": 9.830604553222656,
303
- "learning_rate": 5.44928768172543e-05,
304
- "loss": 1.0403,
305
- "step": 400
306
- },
307
- {
308
- "epoch": 0.002589397365130291,
309
- "grad_norm": 11.697220802307129,
310
- "learning_rate": 5.44586183441764e-05,
311
- "loss": 1.0231,
312
- "step": 410
313
- },
314
- {
315
- "epoch": 0.0026525533984261516,
316
- "grad_norm": 11.017810821533203,
317
- "learning_rate": 5.442325170788806e-05,
318
- "loss": 0.9498,
319
- "step": 420
320
- },
321
- {
322
- "epoch": 0.0027157094317220125,
323
- "grad_norm": 9.533434867858887,
324
- "learning_rate": 5.438677836217317e-05,
325
- "loss": 1.0669,
326
- "step": 430
327
- },
328
- {
329
- "epoch": 0.0027788654650178733,
330
- "grad_norm": 8.570756912231445,
331
- "learning_rate": 5.434919980630811e-05,
332
- "loss": 1.0319,
333
- "step": 440
334
- },
335
- {
336
- "epoch": 0.0028420214983137337,
337
- "grad_norm": 10.60665512084961,
338
- "learning_rate": 5.431051758500015e-05,
339
- "loss": 0.9454,
340
- "step": 450
341
- },
342
- {
343
- "epoch": 0.0028420214983137337,
344
- "eval_loss": 0.8439372777938843,
345
- "eval_runtime": 46.3858,
346
- "eval_samples_per_second": 10.779,
347
- "eval_steps_per_second": 10.779,
348
- "step": 450
349
- },
350
- {
351
- "epoch": 0.0029051775316095946,
352
- "grad_norm": 11.22940731048584,
353
- "learning_rate": 5.427073328832388e-05,
354
- "loss": 1.0539,
355
- "step": 460
356
- },
357
- {
358
- "epoch": 0.0029683335649054554,
359
- "grad_norm": 10.116870880126953,
360
- "learning_rate": 5.422984855165592e-05,
361
- "loss": 1.0274,
362
- "step": 470
363
- },
364
- {
365
- "epoch": 0.0030314895982013162,
366
- "grad_norm": 12.770539283752441,
367
- "learning_rate": 5.418786505560766e-05,
368
- "loss": 1.0253,
369
- "step": 480
370
- },
371
- {
372
- "epoch": 0.003094645631497177,
373
- "grad_norm": 11.039621353149414,
374
- "learning_rate": 5.414478452595617e-05,
375
- "loss": 1.0877,
376
- "step": 490
377
- },
378
- {
379
- "epoch": 0.0031578016647930375,
380
- "grad_norm": 13.608321189880371,
381
- "learning_rate": 5.4100608733573315e-05,
382
- "loss": 0.965,
383
- "step": 500
384
- },
385
- {
386
- "epoch": 0.0032209576980888983,
387
- "grad_norm": 10.158246994018555,
388
- "learning_rate": 5.4055339494352874e-05,
389
- "loss": 0.9643,
390
- "step": 510
391
- },
392
- {
393
- "epoch": 0.003284113731384759,
394
- "grad_norm": 9.061509132385254,
395
- "learning_rate": 5.400897866913597e-05,
396
- "loss": 0.9278,
397
- "step": 520
398
- },
399
- {
400
- "epoch": 0.00334726976468062,
401
- "grad_norm": 9.303275108337402,
402
- "learning_rate": 5.3961528163634546e-05,
403
- "loss": 1.0179,
404
- "step": 530
405
- },
406
- {
407
- "epoch": 0.003410425797976481,
408
- "grad_norm": 12.215669631958008,
409
- "learning_rate": 5.391298992835303e-05,
410
- "loss": 0.982,
411
- "step": 540
412
- },
413
- {
414
- "epoch": 0.0034735818312723417,
415
- "grad_norm": 11.75090217590332,
416
- "learning_rate": 5.386336595850817e-05,
417
- "loss": 1.0544,
418
- "step": 550
419
- },
420
- {
421
- "epoch": 0.003536737864568202,
422
- "grad_norm": 11.43079662322998,
423
- "learning_rate": 5.3812658293946995e-05,
424
- "loss": 1.0241,
425
- "step": 560
426
- },
427
- {
428
- "epoch": 0.003599893897864063,
429
- "grad_norm": 10.177742004394531,
430
- "learning_rate": 5.376086901906299e-05,
431
- "loss": 0.9771,
432
- "step": 570
433
- },
434
- {
435
- "epoch": 0.0036630499311599237,
436
- "grad_norm": 15.041535377502441,
437
- "learning_rate": 5.37080002627104e-05,
438
- "loss": 0.9757,
439
- "step": 580
440
- },
441
- {
442
- "epoch": 0.0037262059644557846,
443
- "grad_norm": 11.840357780456543,
444
- "learning_rate": 5.365405419811673e-05,
445
- "loss": 1.0436,
446
- "step": 590
447
- },
448
- {
449
- "epoch": 0.0037893619977516454,
450
- "grad_norm": 17.893346786499023,
451
- "learning_rate": 5.359903304279339e-05,
452
- "loss": 1.0265,
453
- "step": 600
454
- },
455
- {
456
- "epoch": 0.0037893619977516454,
457
- "eval_loss": 0.8398363590240479,
458
- "eval_runtime": 47.7667,
459
- "eval_samples_per_second": 10.468,
460
- "eval_steps_per_second": 10.468,
461
- "step": 600
462
- },
463
- {
464
- "epoch": 0.003852518031047506,
465
- "grad_norm": 9.05345630645752,
466
- "learning_rate": 5.354293905844459e-05,
467
- "loss": 1.0507,
468
- "step": 610
469
- },
470
- {
471
- "epoch": 0.003915674064343367,
472
- "grad_norm": 7.9775872230529785,
473
- "learning_rate": 5.3485774550874306e-05,
474
- "loss": 0.995,
475
- "step": 620
476
- },
477
- {
478
- "epoch": 0.0039788300976392275,
479
- "grad_norm": 15.064658164978027,
480
- "learning_rate": 5.3427541869891556e-05,
481
- "loss": 1.017,
482
- "step": 630
483
- },
484
- {
485
- "epoch": 0.004041986130935088,
486
- "grad_norm": 10.115464210510254,
487
- "learning_rate": 5.336824340921377e-05,
488
- "loss": 0.9997,
489
- "step": 640
490
- },
491
- {
492
- "epoch": 0.004105142164230949,
493
- "grad_norm": 11.037529945373535,
494
- "learning_rate": 5.330788160636841e-05,
495
- "loss": 1.0244,
496
- "step": 650
497
- },
498
- {
499
- "epoch": 0.00416829819752681,
500
- "grad_norm": 14.279242515563965,
501
- "learning_rate": 5.3246458942592776e-05,
502
- "loss": 0.9978,
503
- "step": 660
504
- },
505
- {
506
- "epoch": 0.004231454230822671,
507
- "grad_norm": 12.71838092803955,
508
- "learning_rate": 5.318397794273199e-05,
509
- "loss": 0.9989,
510
- "step": 670
511
- },
512
- {
513
- "epoch": 0.004294610264118532,
514
- "grad_norm": 12.012393951416016,
515
- "learning_rate": 5.312044117513524e-05,
516
- "loss": 1.0078,
517
- "step": 680
518
- },
519
- {
520
- "epoch": 0.004357766297414392,
521
- "grad_norm": 15.37903118133545,
522
- "learning_rate": 5.305585125155018e-05,
523
- "loss": 1.0038,
524
- "step": 690
525
- },
526
- {
527
- "epoch": 0.0044209223307102525,
528
- "grad_norm": 9.483263969421387,
529
- "learning_rate": 5.29902108270156e-05,
530
- "loss": 1.015,
531
- "step": 700
532
- },
533
- {
534
- "epoch": 0.004484078364006113,
535
- "grad_norm": 10.519631385803223,
536
- "learning_rate": 5.2923522599752245e-05,
537
- "loss": 1.0532,
538
- "step": 710
539
- },
540
- {
541
- "epoch": 0.004547234397301974,
542
- "grad_norm": 10.06659984588623,
543
- "learning_rate": 5.2855789311051945e-05,
544
- "loss": 0.9932,
545
- "step": 720
546
- },
547
- {
548
- "epoch": 0.004610390430597835,
549
- "grad_norm": 10.307997703552246,
550
- "learning_rate": 5.27870137451649e-05,
551
- "loss": 1.0364,
552
- "step": 730
553
- },
554
- {
555
- "epoch": 0.004673546463893696,
556
- "grad_norm": 10.062649726867676,
557
- "learning_rate": 5.2717198729185245e-05,
558
- "loss": 0.9269,
559
- "step": 740
560
- },
561
- {
562
- "epoch": 0.004736702497189557,
563
- "grad_norm": 13.508529663085938,
564
- "learning_rate": 5.264634713293485e-05,
565
- "loss": 0.9577,
566
- "step": 750
567
- },
568
- {
569
- "epoch": 0.004736702497189557,
570
- "eval_loss": 0.8395355343818665,
571
- "eval_runtime": 47.7365,
572
- "eval_samples_per_second": 10.474,
573
- "eval_steps_per_second": 10.474,
574
- "step": 750
575
- },
576
- {
577
- "epoch": 0.0047998585304854175,
578
- "grad_norm": 12.114377975463867,
579
- "learning_rate": 5.2574461868845316e-05,
580
- "loss": 1.0364,
581
- "step": 760
582
- },
583
- {
584
- "epoch": 0.004863014563781278,
585
- "grad_norm": 9.41462516784668,
586
- "learning_rate": 5.2501545891838315e-05,
587
- "loss": 1.0971,
588
- "step": 770
589
- },
590
- {
591
- "epoch": 0.004926170597077139,
592
- "grad_norm": 12.119346618652344,
593
- "learning_rate": 5.242760219920405e-05,
594
- "loss": 1.0022,
595
- "step": 780
596
- },
597
- {
598
- "epoch": 0.004989326630372999,
599
- "grad_norm": 11.057748794555664,
600
- "learning_rate": 5.235263383047812e-05,
601
- "loss": 1.0155,
602
- "step": 790
603
- },
604
- {
605
- "epoch": 0.00505248266366886,
606
- "grad_norm": 13.1547212600708,
607
- "learning_rate": 5.2276643867316525e-05,
608
- "loss": 1.0677,
609
- "step": 800
610
- },
611
- {
612
- "epoch": 0.005115638696964721,
613
- "grad_norm": 10.418506622314453,
614
- "learning_rate": 5.219963543336902e-05,
615
- "loss": 0.9545,
616
- "step": 810
617
- },
618
- {
619
- "epoch": 0.005178794730260582,
620
- "grad_norm": 11.536872863769531,
621
- "learning_rate": 5.212161169415071e-05,
622
- "loss": 0.9849,
623
- "step": 820
624
- },
625
- {
626
- "epoch": 0.0052419507635564425,
627
- "grad_norm": 8.616244316101074,
628
- "learning_rate": 5.204257585691191e-05,
629
- "loss": 0.97,
630
- "step": 830
631
- },
632
- {
633
- "epoch": 0.005305106796852303,
634
- "grad_norm": 9.628042221069336,
635
- "learning_rate": 5.196253117050633e-05,
636
- "loss": 1.0998,
637
- "step": 840
638
- },
639
- {
640
- "epoch": 0.005368262830148164,
641
- "grad_norm": 13.618757247924805,
642
- "learning_rate": 5.188148092525751e-05,
643
- "loss": 1.143,
644
- "step": 850
645
- },
646
- {
647
- "epoch": 0.005431418863444025,
648
- "grad_norm": 11.812853813171387,
649
- "learning_rate": 5.179942845282357e-05,
650
- "loss": 0.9219,
651
- "step": 860
652
- },
653
- {
654
- "epoch": 0.005494574896739886,
655
- "grad_norm": 11.771102905273438,
656
- "learning_rate": 5.17163771260603e-05,
657
- "loss": 1.0431,
658
- "step": 870
659
- },
660
- {
661
- "epoch": 0.005557730930035747,
662
- "grad_norm": 11.519074440002441,
663
- "learning_rate": 5.163233035888244e-05,
664
- "loss": 1.0038,
665
- "step": 880
666
- },
667
- {
668
- "epoch": 0.0056208869633316075,
669
- "grad_norm": 13.609795570373535,
670
- "learning_rate": 5.154729160612338e-05,
671
- "loss": 0.9865,
672
- "step": 890
673
- },
674
- {
675
- "epoch": 0.0056840429966274674,
676
- "grad_norm": 9.950550079345703,
677
- "learning_rate": 5.146126436339321e-05,
678
- "loss": 0.9997,
679
- "step": 900
680
- },
681
- {
682
- "epoch": 0.0056840429966274674,
683
- "eval_loss": 0.8275522589683533,
684
- "eval_runtime": 46.5445,
685
- "eval_samples_per_second": 10.742,
686
- "eval_steps_per_second": 10.742,
687
- "step": 900
688
- },
689
- {
690
- "epoch": 0.005747199029923328,
691
- "grad_norm": 10.390827178955078,
692
- "learning_rate": 5.137425216693491e-05,
693
- "loss": 1.0279,
694
- "step": 910
695
- },
696
- {
697
- "epoch": 0.005810355063219189,
698
- "grad_norm": 11.740073204040527,
699
- "learning_rate": 5.128625859347907e-05,
700
- "loss": 1.059,
701
- "step": 920
702
- },
703
- {
704
- "epoch": 0.00587351109651505,
705
- "grad_norm": 10.699241638183594,
706
- "learning_rate": 5.1197287260096865e-05,
707
- "loss": 1.0401,
708
- "step": 930
709
- },
710
- {
711
- "epoch": 0.005936667129810911,
712
- "grad_norm": 10.624009132385254,
713
- "learning_rate": 5.110734182405132e-05,
714
- "loss": 1.0079,
715
- "step": 940
716
- },
717
- {
718
- "epoch": 0.005999823163106772,
719
- "grad_norm": 10.221404075622559,
720
- "learning_rate": 5.1016425982647025e-05,
721
- "loss": 0.9439,
722
- "step": 950
723
- },
724
- {
725
- "epoch": 0.0060629791964026325,
726
- "grad_norm": 10.964374542236328,
727
- "learning_rate": 5.092454347307812e-05,
728
- "loss": 1.0385,
729
- "step": 960
730
- },
731
- {
732
- "epoch": 0.006126135229698493,
733
- "grad_norm": 11.145320892333984,
734
- "learning_rate": 5.08316980722747e-05,
735
- "loss": 1.0424,
736
- "step": 970
737
- },
738
- {
739
- "epoch": 0.006189291262994354,
740
- "grad_norm": 14.886443138122559,
741
- "learning_rate": 5.0737893596747534e-05,
742
- "loss": 0.9933,
743
- "step": 980
744
- },
745
- {
746
- "epoch": 0.006252447296290215,
747
- "grad_norm": 11.04690933227539,
748
- "learning_rate": 5.064313390243121e-05,
749
- "loss": 1.1194,
750
- "step": 990
751
- },
752
- {
753
- "epoch": 0.006315603329586075,
754
- "grad_norm": 8.777559280395508,
755
- "learning_rate": 5.054742288452562e-05,
756
- "loss": 1.0695,
757
- "step": 1000
758
- },
759
- {
760
- "epoch": 0.006378759362881936,
761
- "grad_norm": 11.905736923217773,
762
- "learning_rate": 5.0450764477335825e-05,
763
- "loss": 0.9496,
764
- "step": 1010
765
- },
766
- {
767
- "epoch": 0.006441915396177797,
768
- "grad_norm": 10.9399995803833,
769
- "learning_rate": 5.035316265411036e-05,
770
- "loss": 0.9413,
771
- "step": 1020
772
- },
773
- {
774
- "epoch": 0.0065050714294736574,
775
- "grad_norm": 10.541971206665039,
776
- "learning_rate": 5.02546214268779e-05,
777
- "loss": 0.9718,
778
- "step": 1030
779
- },
780
- {
781
- "epoch": 0.006568227462769518,
782
- "grad_norm": 11.16507625579834,
783
- "learning_rate": 5.0155144846282345e-05,
784
- "loss": 1.0197,
785
- "step": 1040
786
- },
787
- {
788
- "epoch": 0.006631383496065379,
789
- "grad_norm": 10.459815979003906,
790
- "learning_rate": 5.005473700141629e-05,
791
- "loss": 1.0872,
792
- "step": 1050
793
- },
794
- {
795
- "epoch": 0.006631383496065379,
796
- "eval_loss": 0.8140906691551208,
797
- "eval_runtime": 47.6038,
798
- "eval_samples_per_second": 10.503,
799
- "eval_steps_per_second": 10.503,
800
- "step": 1050
801
- },
802
- {
803
- "epoch": 0.00669453952936124,
804
- "grad_norm": 12.183165550231934,
805
- "learning_rate": 4.995340201965296e-05,
806
- "loss": 0.9901,
807
- "step": 1060
808
- },
809
- {
810
- "epoch": 0.006757695562657101,
811
- "grad_norm": 9.4849853515625,
812
- "learning_rate": 4.985114406647658e-05,
813
- "loss": 0.9736,
814
- "step": 1070
815
- },
816
- {
817
- "epoch": 0.006820851595952962,
818
- "grad_norm": 10.753118515014648,
819
- "learning_rate": 4.9747967345311055e-05,
820
- "loss": 1.0449,
821
- "step": 1080
822
- },
823
- {
824
- "epoch": 0.0068840076292488225,
825
- "grad_norm": 13.308238983154297,
826
- "learning_rate": 4.9643876097347296e-05,
827
- "loss": 0.9135,
828
- "step": 1090
829
- },
830
- {
831
- "epoch": 0.006947163662544683,
832
- "grad_norm": 7.950455665588379,
833
- "learning_rate": 4.953887460136881e-05,
834
- "loss": 1.007,
835
- "step": 1100
836
- },
837
- {
838
- "epoch": 0.007010319695840543,
839
- "grad_norm": 11.71200942993164,
840
- "learning_rate": 4.943296717357583e-05,
841
- "loss": 1.0232,
842
- "step": 1110
843
- },
844
- {
845
- "epoch": 0.007073475729136404,
846
- "grad_norm": 11.027750968933105,
847
- "learning_rate": 4.93261581674079e-05,
848
- "loss": 1.0357,
849
- "step": 1120
850
- },
851
- {
852
- "epoch": 0.007136631762432265,
853
- "grad_norm": 11.876934051513672,
854
- "learning_rate": 4.921845197336491e-05,
855
- "loss": 1.0203,
856
- "step": 1130
857
- },
858
- {
859
- "epoch": 0.007199787795728126,
860
- "grad_norm": 9.920092582702637,
861
- "learning_rate": 4.910985301882667e-05,
862
- "loss": 1.0059,
863
- "step": 1140
864
- },
865
- {
866
- "epoch": 0.007262943829023987,
867
- "grad_norm": 15.140681266784668,
868
- "learning_rate": 4.9000365767870824e-05,
869
- "loss": 0.9476,
870
- "step": 1150
871
- },
872
- {
873
- "epoch": 0.0073260998623198475,
874
- "grad_norm": 13.791475296020508,
875
- "learning_rate": 4.8889994721089426e-05,
876
- "loss": 1.0171,
877
- "step": 1160
878
- },
879
- {
880
- "epoch": 0.007389255895615708,
881
- "grad_norm": 11.235550880432129,
882
- "learning_rate": 4.877874441540394e-05,
883
- "loss": 1.0835,
884
- "step": 1170
885
- },
886
- {
887
- "epoch": 0.007452411928911569,
888
- "grad_norm": 10.311321258544922,
889
- "learning_rate": 4.866661942387867e-05,
890
- "loss": 0.9844,
891
- "step": 1180
892
- },
893
- {
894
- "epoch": 0.00751556796220743,
895
- "grad_norm": 10.934793472290039,
896
- "learning_rate": 4.855362435553285e-05,
897
- "loss": 0.9778,
898
- "step": 1190
899
- },
900
- {
901
- "epoch": 0.007578723995503291,
902
- "grad_norm": 10.166644096374512,
903
- "learning_rate": 4.84397638551512e-05,
904
- "loss": 0.9602,
905
- "step": 1200
906
- },
907
- {
908
- "epoch": 0.007578723995503291,
909
- "eval_loss": 0.8132573366165161,
910
- "eval_runtime": 46.9842,
911
- "eval_samples_per_second": 10.642,
912
- "eval_steps_per_second": 10.642,
913
- "step": 1200
914
- },
915
- {
916
- "epoch": 0.007641880028799151,
917
- "grad_norm": 8.92570686340332,
918
- "learning_rate": 4.83250426030929e-05,
919
- "loss": 1.0157,
920
- "step": 1210
921
- },
922
- {
923
- "epoch": 0.007705036062095012,
924
- "grad_norm": 17.16946792602539,
925
- "learning_rate": 4.82094653150993e-05,
926
- "loss": 1.1113,
927
- "step": 1220
928
- },
929
- {
930
- "epoch": 0.0077681920953908724,
931
- "grad_norm": 10.54002571105957,
932
- "learning_rate": 4.8093036742100026e-05,
933
- "loss": 1.1527,
934
- "step": 1230
935
- },
936
- {
937
- "epoch": 0.007831348128686733,
938
- "grad_norm": 7.984756946563721,
939
- "learning_rate": 4.79757616700177e-05,
940
- "loss": 1.0058,
941
- "step": 1240
942
- },
943
- {
944
- "epoch": 0.007894504161982594,
945
- "grad_norm": 8.092245101928711,
946
- "learning_rate": 4.7857644919571176e-05,
947
- "loss": 1.0381,
948
- "step": 1250
949
- },
950
- {
951
- "epoch": 0.007957660195278455,
952
- "grad_norm": 9.762504577636719,
953
- "learning_rate": 4.773869134607747e-05,
954
- "loss": 1.0143,
955
- "step": 1260
956
- },
957
- {
958
- "epoch": 0.008020816228574316,
959
- "grad_norm": 12.095884323120117,
960
- "learning_rate": 4.761890583925204e-05,
961
- "loss": 0.9924,
962
- "step": 1270
963
- },
964
- {
965
- "epoch": 0.008083972261870177,
966
- "grad_norm": 8.758190155029297,
967
- "learning_rate": 4.749829332300792e-05,
968
- "loss": 1.0306,
969
- "step": 1280
970
- },
971
- {
972
- "epoch": 0.008147128295166037,
973
- "grad_norm": 10.170825958251953,
974
- "learning_rate": 4.737685875525327e-05,
975
- "loss": 1.0738,
976
- "step": 1290
977
- },
978
- {
979
- "epoch": 0.008210284328461898,
980
- "grad_norm": 12.792614936828613,
981
- "learning_rate": 4.725460712768751e-05,
982
- "loss": 1.0438,
983
- "step": 1300
984
- },
985
- {
986
- "epoch": 0.008273440361757759,
987
- "grad_norm": 9.970301628112793,
988
- "learning_rate": 4.7131543465596236e-05,
989
- "loss": 0.9807,
990
- "step": 1310
991
- },
992
- {
993
- "epoch": 0.00833659639505362,
994
- "grad_norm": 14.707938194274902,
995
- "learning_rate": 4.700767282764459e-05,
996
- "loss": 1.0162,
997
- "step": 1320
998
- },
999
- {
1000
- "epoch": 0.00839975242834948,
1001
- "grad_norm": 7.348803520202637,
1002
- "learning_rate": 4.688300030566933e-05,
1003
- "loss": 1.112,
1004
- "step": 1330
1005
- },
1006
- {
1007
- "epoch": 0.008462908461645342,
1008
- "grad_norm": 11.421042442321777,
1009
- "learning_rate": 4.6757531024469514e-05,
1010
- "loss": 1.0082,
1011
- "step": 1340
1012
- },
1013
- {
1014
- "epoch": 0.008526064494941202,
1015
- "grad_norm": 11.286333084106445,
1016
- "learning_rate": 4.663127014159588e-05,
1017
- "loss": 1.037,
1018
- "step": 1350
1019
- },
1020
- {
1021
- "epoch": 0.008526064494941202,
1022
- "eval_loss": 0.8081769347190857,
1023
- "eval_runtime": 46.791,
1024
- "eval_samples_per_second": 10.686,
1025
- "eval_steps_per_second": 10.686,
1026
- "step": 1350
1027
- },
1028
- {
1029
- "epoch": 0.008589220528237063,
1030
- "grad_norm": 12.930586814880371,
1031
- "learning_rate": 4.650422284713878e-05,
1032
- "loss": 0.9875,
1033
- "step": 1360
1034
- },
1035
- {
1036
- "epoch": 0.008652376561532922,
1037
- "grad_norm": 10.3340482711792,
1038
- "learning_rate": 4.637639436351489e-05,
1039
- "loss": 0.9905,
1040
- "step": 1370
1041
- },
1042
- {
1043
- "epoch": 0.008715532594828783,
1044
- "grad_norm": 16.0806941986084,
1045
- "learning_rate": 4.624778994525249e-05,
1046
- "loss": 1.0658,
1047
- "step": 1380
1048
- },
1049
- {
1050
- "epoch": 0.008778688628124644,
1051
- "grad_norm": 15.543879508972168,
1052
- "learning_rate": 4.6118414878775514e-05,
1053
- "loss": 0.9815,
1054
- "step": 1390
1055
- },
1056
- {
1057
- "epoch": 0.008841844661420505,
1058
- "grad_norm": 8.364940643310547,
1059
- "learning_rate": 4.5988274482186214e-05,
1060
- "loss": 0.9831,
1061
- "step": 1400
1062
- },
1063
- {
1064
- "epoch": 0.008905000694716366,
1065
- "grad_norm": 10.709568977355957,
1066
- "learning_rate": 4.5857374105046574e-05,
1067
- "loss": 0.9679,
1068
- "step": 1410
1069
- },
1070
- {
1071
- "epoch": 0.008968156728012227,
1072
- "grad_norm": 12.81725788116455,
1073
- "learning_rate": 4.572571912815838e-05,
1074
- "loss": 1.0276,
1075
- "step": 1420
1076
- },
1077
- {
1078
- "epoch": 0.009031312761308087,
1079
- "grad_norm": 10.538129806518555,
1080
- "learning_rate": 4.55933149633421e-05,
1081
- "loss": 1.0586,
1082
- "step": 1430
1083
- },
1084
- {
1085
- "epoch": 0.009094468794603948,
1086
- "grad_norm": 14.130047798156738,
1087
- "learning_rate": 4.5460167053214335e-05,
1088
- "loss": 1.0164,
1089
- "step": 1440
1090
- },
1091
- {
1092
- "epoch": 0.009157624827899809,
1093
- "grad_norm": 12.928654670715332,
1094
- "learning_rate": 4.532628087096419e-05,
1095
- "loss": 1.0102,
1096
- "step": 1450
1097
- },
1098
- {
1099
- "epoch": 0.00922078086119567,
1100
- "grad_norm": 13.670207023620605,
1101
- "learning_rate": 4.5191661920128194e-05,
1102
- "loss": 0.9835,
1103
- "step": 1460
1104
- },
1105
- {
1106
- "epoch": 0.00928393689449153,
1107
- "grad_norm": 13.250060081481934,
1108
- "learning_rate": 4.5056315734364154e-05,
1109
- "loss": 0.9599,
1110
- "step": 1470
1111
- },
1112
- {
1113
- "epoch": 0.009347092927787392,
1114
- "grad_norm": 13.03905963897705,
1115
- "learning_rate": 4.492024787722368e-05,
1116
- "loss": 1.0552,
1117
- "step": 1480
1118
- },
1119
- {
1120
- "epoch": 0.009410248961083252,
1121
- "grad_norm": 11.335886001586914,
1122
- "learning_rate": 4.47834639419234e-05,
1123
- "loss": 0.9254,
1124
- "step": 1490
1125
- },
1126
- {
1127
- "epoch": 0.009473404994379113,
1128
- "grad_norm": 11.690058708190918,
1129
- "learning_rate": 4.464596955111518e-05,
1130
- "loss": 1.0431,
1131
- "step": 1500
1132
- },
1133
- {
1134
- "epoch": 0.009473404994379113,
1135
- "eval_loss": 0.8068813681602478,
1136
- "eval_runtime": 46.6502,
1137
- "eval_samples_per_second": 10.718,
1138
- "eval_steps_per_second": 10.718,
1139
- "step": 1500
1140
- },
1141
- {
1142
- "epoch": 0.009536561027674974,
1143
- "grad_norm": 9.877214431762695,
1144
- "learning_rate": 4.450777035665487e-05,
1145
- "loss": 0.9958,
1146
- "step": 1510
1147
- },
1148
- {
1149
- "epoch": 0.009599717060970835,
1150
- "grad_norm": 8.600367546081543,
1151
- "learning_rate": 4.436887203937009e-05,
1152
- "loss": 1.01,
1153
- "step": 1520
1154
- },
1155
- {
1156
- "epoch": 0.009662873094266696,
1157
- "grad_norm": 10.299551963806152,
1158
- "learning_rate": 4.422928030882661e-05,
1159
- "loss": 0.9326,
1160
- "step": 1530
1161
- },
1162
- {
1163
- "epoch": 0.009726029127562557,
1164
- "grad_norm": 10.05881404876709,
1165
- "learning_rate": 4.4089000903093746e-05,
1166
- "loss": 0.9944,
1167
- "step": 1540
1168
- },
1169
- {
1170
- "epoch": 0.009789185160858417,
1171
- "grad_norm": 9.281290054321289,
1172
- "learning_rate": 4.394803958850844e-05,
1173
- "loss": 1.0549,
1174
- "step": 1550
1175
- },
1176
- {
1177
- "epoch": 0.009852341194154278,
1178
- "grad_norm": 12.724478721618652,
1179
- "learning_rate": 4.380640215943821e-05,
1180
- "loss": 1.0675,
1181
- "step": 1560
1182
- },
1183
- {
1184
- "epoch": 0.009915497227450139,
1185
- "grad_norm": 10.141554832458496,
1186
- "learning_rate": 4.366409443804301e-05,
1187
- "loss": 1.0266,
1188
- "step": 1570
1189
- },
1190
- {
1191
- "epoch": 0.009978653260745998,
1192
- "grad_norm": 10.95832347869873,
1193
- "learning_rate": 4.352112227403589e-05,
1194
- "loss": 1.0158,
1195
- "step": 1580
1196
- },
1197
- {
1198
- "epoch": 0.010041809294041859,
1199
- "grad_norm": 12.602045059204102,
1200
- "learning_rate": 4.337749154444254e-05,
1201
- "loss": 0.9915,
1202
- "step": 1590
1203
- },
1204
- {
1205
- "epoch": 0.01010496532733772,
1206
- "grad_norm": 13.776515007019043,
1207
- "learning_rate": 4.3233208153359665e-05,
1208
- "loss": 1.0467,
1209
- "step": 1600
1210
- },
1211
- {
1212
- "epoch": 0.01016812136063358,
1213
- "grad_norm": 11.861095428466797,
1214
- "learning_rate": 4.308827803171238e-05,
1215
- "loss": 0.9941,
1216
- "step": 1610
1217
- },
1218
- {
1219
- "epoch": 0.010231277393929442,
1220
- "grad_norm": 10.238848686218262,
1221
- "learning_rate": 4.294270713701031e-05,
1222
- "loss": 1.0242,
1223
- "step": 1620
1224
- },
1225
- {
1226
- "epoch": 0.010294433427225302,
1227
- "grad_norm": 10.419410705566406,
1228
- "learning_rate": 4.2796501453102784e-05,
1229
- "loss": 0.9683,
1230
- "step": 1630
1231
- },
1232
- {
1233
- "epoch": 0.010357589460521163,
1234
- "grad_norm": 10.017350196838379,
1235
- "learning_rate": 4.264966698993282e-05,
1236
- "loss": 1.0742,
1237
- "step": 1640
1238
- },
1239
- {
1240
- "epoch": 0.010420745493817024,
1241
- "grad_norm": 9.052342414855957,
1242
- "learning_rate": 4.2502209783290085e-05,
1243
- "loss": 0.9827,
1244
- "step": 1650
1245
- },
1246
- {
1247
- "epoch": 0.010420745493817024,
1248
- "eval_loss": 0.7950078248977661,
1249
- "eval_runtime": 47.5324,
1250
- "eval_samples_per_second": 10.519,
1251
- "eval_steps_per_second": 10.519,
1252
- "step": 1650
1253
- },
1254
- {
1255
- "epoch": 0.010483901527112885,
1256
- "grad_norm": 9.733206748962402,
1257
- "learning_rate": 4.235413589456281e-05,
1258
- "loss": 1.0076,
1259
- "step": 1660
1260
- },
1261
- {
1262
- "epoch": 0.010547057560408746,
1263
- "grad_norm": 13.072066307067871,
1264
- "learning_rate": 4.2205451410488565e-05,
1265
- "loss": 0.9684,
1266
- "step": 1670
1267
- },
1268
- {
1269
- "epoch": 0.010610213593704607,
1270
- "grad_norm": 10.068881034851074,
1271
- "learning_rate": 4.205616244290416e-05,
1272
- "loss": 1.0199,
1273
- "step": 1680
1274
- },
1275
- {
1276
- "epoch": 0.010673369627000467,
1277
- "grad_norm": 13.202546119689941,
1278
- "learning_rate": 4.1906275128494296e-05,
1279
- "loss": 1.0349,
1280
- "step": 1690
1281
- },
1282
- {
1283
- "epoch": 0.010736525660296328,
1284
- "grad_norm": 9.893623352050781,
1285
- "learning_rate": 4.175579562853945e-05,
1286
- "loss": 0.99,
1287
- "step": 1700
1288
- },
1289
- {
1290
- "epoch": 0.010799681693592189,
1291
- "grad_norm": 7.337754726409912,
1292
- "learning_rate": 4.160473012866242e-05,
1293
- "loss": 1.0124,
1294
- "step": 1710
1295
- },
1296
- {
1297
- "epoch": 0.01086283772688805,
1298
- "grad_norm": 8.579658508300781,
1299
- "learning_rate": 4.145308483857426e-05,
1300
- "loss": 1.0585,
1301
- "step": 1720
1302
- },
1303
- {
1304
- "epoch": 0.01092599376018391,
1305
- "grad_norm": 9.737414360046387,
1306
- "learning_rate": 4.1300865991818885e-05,
1307
- "loss": 1.0679,
1308
- "step": 1730
1309
- },
1310
- {
1311
- "epoch": 0.010989149793479772,
1312
- "grad_norm": 11.66639518737793,
1313
- "learning_rate": 4.114807984551688e-05,
1314
- "loss": 1.0104,
1315
- "step": 1740
1316
- },
1317
- {
1318
- "epoch": 0.011052305826775632,
1319
- "grad_norm": 8.344025611877441,
1320
- "learning_rate": 4.0994732680108296e-05,
1321
- "loss": 1.0121,
1322
- "step": 1750
1323
- },
1324
- {
1325
- "epoch": 0.011115461860071493,
1326
- "grad_norm": 9.461050987243652,
1327
- "learning_rate": 4.084083079909448e-05,
1328
- "loss": 0.9293,
1329
- "step": 1760
1330
- },
1331
- {
1332
- "epoch": 0.011178617893367354,
1333
- "grad_norm": 11.834268569946289,
1334
- "learning_rate": 4.068638052877899e-05,
1335
- "loss": 1.031,
1336
- "step": 1770
1337
- },
1338
- {
1339
- "epoch": 0.011241773926663215,
1340
- "grad_norm": 11.639049530029297,
1341
- "learning_rate": 4.0531388218007466e-05,
1342
- "loss": 1.0569,
1343
- "step": 1780
1344
- },
1345
- {
1346
- "epoch": 0.011304929959959074,
1347
- "grad_norm": 11.35422134399414,
1348
- "learning_rate": 4.037586023790676e-05,
1349
- "loss": 1.003,
1350
- "step": 1790
1351
- },
1352
- {
1353
- "epoch": 0.011368085993254935,
1354
- "grad_norm": 7.621947288513184,
1355
- "learning_rate": 4.0219802981622975e-05,
1356
- "loss": 1.0169,
1357
- "step": 1800
1358
- },
1359
- {
1360
- "epoch": 0.011368085993254935,
1361
- "eval_loss": 0.7995550632476807,
1362
- "eval_runtime": 47.0267,
1363
- "eval_samples_per_second": 10.632,
1364
- "eval_steps_per_second": 10.632,
1365
- "step": 1800
1366
- },
1367
- {
1368
- "epoch": 0.011431242026550796,
1369
- "grad_norm": 8.875388145446777,
1370
- "learning_rate": 4.006322286405867e-05,
1371
- "loss": 1.1443,
1372
- "step": 1810
1373
- },
1374
- {
1375
- "epoch": 0.011494398059846657,
1376
- "grad_norm": 9.980118751525879,
1377
- "learning_rate": 3.99061263216092e-05,
1378
- "loss": 0.9522,
1379
- "step": 1820
1380
- },
1381
- {
1382
- "epoch": 0.011557554093142517,
1383
- "grad_norm": 13.012611389160156,
1384
- "learning_rate": 3.974851981189813e-05,
1385
- "loss": 0.9845,
1386
- "step": 1830
1387
- },
1388
- {
1389
- "epoch": 0.011620710126438378,
1390
- "grad_norm": 8.837371826171875,
1391
- "learning_rate": 3.9590409813511765e-05,
1392
- "loss": 1.0351,
1393
- "step": 1840
1394
- },
1395
- {
1396
- "epoch": 0.011683866159734239,
1397
- "grad_norm": 7.165218830108643,
1398
- "learning_rate": 3.943180282573285e-05,
1399
- "loss": 1.0092,
1400
- "step": 1850
1401
- },
1402
- {
1403
- "epoch": 0.0117470221930301,
1404
- "grad_norm": 8.621559143066406,
1405
- "learning_rate": 3.927270536827346e-05,
1406
- "loss": 0.9187,
1407
- "step": 1860
1408
- },
1409
- {
1410
- "epoch": 0.01181017822632596,
1411
- "grad_norm": 11.009320259094238,
1412
- "learning_rate": 3.91131239810069e-05,
1413
- "loss": 0.9718,
1414
- "step": 1870
1415
- },
1416
- {
1417
- "epoch": 0.011873334259621822,
1418
- "grad_norm": 11.635251998901367,
1419
- "learning_rate": 3.895306522369898e-05,
1420
- "loss": 0.99,
1421
- "step": 1880
1422
- },
1423
- {
1424
- "epoch": 0.011936490292917682,
1425
- "grad_norm": 13.04053783416748,
1426
- "learning_rate": 3.87925356757383e-05,
1427
- "loss": 0.971,
1428
- "step": 1890
1429
- },
1430
- {
1431
- "epoch": 0.011999646326213543,
1432
- "grad_norm": 9.825182914733887,
1433
- "learning_rate": 3.863154193586583e-05,
1434
- "loss": 0.9937,
1435
- "step": 1900
1436
- },
1437
- {
1438
- "epoch": 0.012062802359509404,
1439
- "grad_norm": 10.418283462524414,
1440
- "learning_rate": 3.847009062190365e-05,
1441
- "loss": 0.9927,
1442
- "step": 1910
1443
- },
1444
- {
1445
- "epoch": 0.012125958392805265,
1446
- "grad_norm": 9.913063049316406,
1447
- "learning_rate": 3.83081883704829e-05,
1448
- "loss": 0.9518,
1449
- "step": 1920
1450
- },
1451
- {
1452
- "epoch": 0.012189114426101126,
1453
- "grad_norm": 14.064604759216309,
1454
- "learning_rate": 3.814584183677102e-05,
1455
- "loss": 1.0422,
1456
- "step": 1930
1457
- },
1458
- {
1459
- "epoch": 0.012252270459396987,
1460
- "grad_norm": 9.01578426361084,
1461
- "learning_rate": 3.7983057694198145e-05,
1462
- "loss": 0.928,
1463
- "step": 1940
1464
- },
1465
- {
1466
- "epoch": 0.012315426492692847,
1467
- "grad_norm": 8.885491371154785,
1468
- "learning_rate": 3.781984263418279e-05,
1469
- "loss": 0.9318,
1470
- "step": 1950
1471
- },
1472
- {
1473
- "epoch": 0.012315426492692847,
1474
- "eval_loss": 0.8084725737571716,
1475
- "eval_runtime": 45.7715,
1476
- "eval_samples_per_second": 10.924,
1477
- "eval_steps_per_second": 10.924,
1478
- "step": 1950
1479
- },
1480
- {
1481
- "epoch": 0.012378582525988708,
1482
- "grad_norm": 13.71338939666748,
1483
- "learning_rate": 3.76562033658568e-05,
1484
- "loss": 0.9817,
1485
- "step": 1960
1486
- },
1487
- {
1488
- "epoch": 0.012441738559284569,
1489
- "grad_norm": 8.36632251739502,
1490
- "learning_rate": 3.749214661578957e-05,
1491
- "loss": 0.9689,
1492
- "step": 1970
1493
- },
1494
- {
1495
- "epoch": 0.01250489459258043,
1496
- "grad_norm": 9.292037010192871,
1497
- "learning_rate": 3.732767912771153e-05,
1498
- "loss": 0.876,
1499
- "step": 1980
1500
- },
1501
- {
1502
- "epoch": 0.01256805062587629,
1503
- "grad_norm": 10.010106086730957,
1504
- "learning_rate": 3.716280766223693e-05,
1505
- "loss": 0.9178,
1506
- "step": 1990
1507
- },
1508
- {
1509
- "epoch": 0.01263120665917215,
1510
- "grad_norm": 12.617682456970215,
1511
- "learning_rate": 3.699753899658596e-05,
1512
- "loss": 0.9502,
1513
- "step": 2000
1514
- },
1515
- {
1516
- "epoch": 0.01269436269246801,
1517
- "grad_norm": 9.597683906555176,
1518
- "learning_rate": 3.683187992430616e-05,
1519
- "loss": 0.9964,
1520
- "step": 2010
1521
- },
1522
- {
1523
- "epoch": 0.012757518725763872,
1524
- "grad_norm": 11.982294082641602,
1525
- "learning_rate": 3.666583725499315e-05,
1526
- "loss": 0.8793,
1527
- "step": 2020
1528
- },
1529
- {
1530
- "epoch": 0.012820674759059732,
1531
- "grad_norm": 12.932259559631348,
1532
- "learning_rate": 3.6499417814010715e-05,
1533
- "loss": 1.0322,
1534
- "step": 2030
1535
- },
1536
- {
1537
- "epoch": 0.012883830792355593,
1538
- "grad_norm": 11.51771068572998,
1539
- "learning_rate": 3.6332628442210255e-05,
1540
- "loss": 0.9822,
1541
- "step": 2040
1542
- },
1543
- {
1544
- "epoch": 0.012946986825651454,
1545
- "grad_norm": 11.403252601623535,
1546
- "learning_rate": 3.616547599564958e-05,
1547
- "loss": 1.0164,
1548
- "step": 2050
1549
- },
1550
- {
1551
- "epoch": 0.013010142858947315,
1552
- "grad_norm": 9.606186866760254,
1553
- "learning_rate": 3.599796734531105e-05,
1554
- "loss": 1.0764,
1555
- "step": 2060
1556
- },
1557
- {
1558
- "epoch": 0.013073298892243176,
1559
- "grad_norm": 10.46757984161377,
1560
- "learning_rate": 3.5830109376819235e-05,
1561
- "loss": 1.0254,
1562
- "step": 2070
1563
- },
1564
- {
1565
- "epoch": 0.013136454925539037,
1566
- "grad_norm": 12.479496955871582,
1567
- "learning_rate": 3.566190899015774e-05,
1568
- "loss": 1.0147,
1569
- "step": 2080
1570
- },
1571
- {
1572
- "epoch": 0.013199610958834897,
1573
- "grad_norm": 11.903168678283691,
1574
- "learning_rate": 3.5493373099385677e-05,
1575
- "loss": 1.0082,
1576
- "step": 2090
1577
- },
1578
- {
1579
- "epoch": 0.013262766992130758,
1580
- "grad_norm": 9.391939163208008,
1581
- "learning_rate": 3.5324508632353394e-05,
1582
- "loss": 1.0489,
1583
- "step": 2100
1584
- },
1585
- {
1586
- "epoch": 0.013262766992130758,
1587
- "eval_loss": 0.7725129127502441,
1588
- "eval_runtime": 45.2632,
1589
- "eval_samples_per_second": 11.047,
1590
- "eval_steps_per_second": 11.047,
1591
- "step": 2100
1592
- },
1593
- {
1594
- "epoch": 0.013325923025426619,
1595
- "grad_norm": 10.084378242492676,
1596
- "learning_rate": 3.515532253041774e-05,
1597
- "loss": 1.0542,
1598
- "step": 2110
1599
- },
1600
- {
1601
- "epoch": 0.01338907905872248,
1602
- "grad_norm": 11.785043716430664,
1603
- "learning_rate": 3.498582174815671e-05,
1604
- "loss": 1.0488,
1605
- "step": 2120
1606
- },
1607
- {
1608
- "epoch": 0.01345223509201834,
1609
- "grad_norm": 10.521591186523438,
1610
- "learning_rate": 3.481601325308357e-05,
1611
- "loss": 0.9893,
1612
- "step": 2130
1613
- },
1614
- {
1615
- "epoch": 0.013515391125314202,
1616
- "grad_norm": 8.644886016845703,
1617
- "learning_rate": 3.4645904025360455e-05,
1618
- "loss": 0.9614,
1619
- "step": 2140
1620
- },
1621
- {
1622
- "epoch": 0.013578547158610062,
1623
- "grad_norm": 9.69764232635498,
1624
- "learning_rate": 3.447550105751145e-05,
1625
- "loss": 0.9104,
1626
- "step": 2150
1627
- },
1628
- {
1629
- "epoch": 0.013641703191905923,
1630
- "grad_norm": 14.97421646118164,
1631
- "learning_rate": 3.4304811354135145e-05,
1632
- "loss": 0.9324,
1633
- "step": 2160
1634
- },
1635
- {
1636
- "epoch": 0.013704859225201784,
1637
- "grad_norm": 9.63864803314209,
1638
- "learning_rate": 3.4133841931616696e-05,
1639
- "loss": 1.0068,
1640
- "step": 2170
1641
- },
1642
- {
1643
- "epoch": 0.013768015258497645,
1644
- "grad_norm": 9.305994033813477,
1645
- "learning_rate": 3.396259981783942e-05,
1646
- "loss": 0.9625,
1647
- "step": 2180
1648
- },
1649
- {
1650
- "epoch": 0.013831171291793506,
1651
- "grad_norm": 8.71728515625,
1652
- "learning_rate": 3.37910920518959e-05,
1653
- "loss": 1.0038,
1654
- "step": 2190
1655
- },
1656
- {
1657
- "epoch": 0.013894327325089367,
1658
- "grad_norm": 6.916072845458984,
1659
- "learning_rate": 3.3619325683798646e-05,
1660
- "loss": 1.017,
1661
- "step": 2200
1662
- },
1663
- {
1664
- "epoch": 0.013957483358385226,
1665
- "grad_norm": 10.644246101379395,
1666
- "learning_rate": 3.3447307774190296e-05,
1667
- "loss": 0.883,
1668
- "step": 2210
1669
- },
1670
- {
1671
- "epoch": 0.014020639391681087,
1672
- "grad_norm": 12.687426567077637,
1673
- "learning_rate": 3.327504539405335e-05,
1674
- "loss": 0.9776,
1675
- "step": 2220
1676
- },
1677
- {
1678
- "epoch": 0.014083795424976947,
1679
- "grad_norm": 13.301749229431152,
1680
- "learning_rate": 3.3102545624419583e-05,
1681
- "loss": 0.9652,
1682
- "step": 2230
1683
- },
1684
- {
1685
- "epoch": 0.014146951458272808,
1686
- "grad_norm": 11.929834365844727,
1687
- "learning_rate": 3.292981555607884e-05,
1688
- "loss": 0.9486,
1689
- "step": 2240
1690
- },
1691
- {
1692
- "epoch": 0.014210107491568669,
1693
- "grad_norm": 11.27684211730957,
1694
- "learning_rate": 3.2756862289287746e-05,
1695
- "loss": 0.971,
1696
- "step": 2250
1697
- },
1698
- {
1699
- "epoch": 0.014210107491568669,
1700
- "eval_loss": 0.7654532194137573,
1701
- "eval_runtime": 45.5502,
1702
- "eval_samples_per_second": 10.977,
1703
- "eval_steps_per_second": 10.977,
1704
- "step": 2250
1705
- },
1706
- {
1707
- "epoch": 0.01427326352486453,
1708
- "grad_norm": 7.769909381866455,
1709
- "learning_rate": 3.258369293347764e-05,
1710
- "loss": 0.9666,
1711
- "step": 2260
1712
- },
1713
- {
1714
- "epoch": 0.01433641955816039,
1715
- "grad_norm": 8.687183380126953,
1716
- "learning_rate": 3.241031460696251e-05,
1717
- "loss": 0.995,
1718
- "step": 2270
1719
- },
1720
- {
1721
- "epoch": 0.014399575591456252,
1722
- "grad_norm": 9.484212875366211,
1723
- "learning_rate": 3.223673443664627e-05,
1724
- "loss": 0.9212,
1725
- "step": 2280
1726
- },
1727
- {
1728
- "epoch": 0.014462731624752112,
1729
- "grad_norm": 8.085176467895508,
1730
- "learning_rate": 3.206295955772987e-05,
1731
- "loss": 0.9898,
1732
- "step": 2290
1733
- },
1734
- {
1735
- "epoch": 0.014525887658047973,
1736
- "grad_norm": 7.797507286071777,
1737
- "learning_rate": 3.188899711341793e-05,
1738
- "loss": 1.0636,
1739
- "step": 2300
1740
- },
1741
- {
1742
- "epoch": 0.014589043691343834,
1743
- "grad_norm": 9.53068733215332,
1744
- "learning_rate": 3.171485425462518e-05,
1745
- "loss": 0.9875,
1746
- "step": 2310
1747
- },
1748
- {
1749
- "epoch": 0.014652199724639695,
1750
- "grad_norm": 8.484146118164062,
1751
- "learning_rate": 3.15405381396825e-05,
1752
- "loss": 0.9965,
1753
- "step": 2320
1754
- },
1755
- {
1756
- "epoch": 0.014715355757935556,
1757
- "grad_norm": 9.026712417602539,
1758
- "learning_rate": 3.136605593404258e-05,
1759
- "loss": 1.0284,
1760
- "step": 2330
1761
- },
1762
- {
1763
- "epoch": 0.014778511791231417,
1764
- "grad_norm": 8.642735481262207,
1765
- "learning_rate": 3.119141480998553e-05,
1766
- "loss": 0.9835,
1767
- "step": 2340
1768
- },
1769
- {
1770
- "epoch": 0.014841667824527277,
1771
- "grad_norm": 9.15679931640625,
1772
- "learning_rate": 3.101662194632392e-05,
1773
- "loss": 1.065,
1774
- "step": 2350
1775
- },
1776
- {
1777
- "epoch": 0.014904823857823138,
1778
- "grad_norm": 12.24231243133545,
1779
- "learning_rate": 3.0841684528107766e-05,
1780
- "loss": 0.9253,
1781
- "step": 2360
1782
- },
1783
- {
1784
- "epoch": 0.014967979891118999,
1785
- "grad_norm": 10.09658432006836,
1786
- "learning_rate": 3.066660974632914e-05,
1787
- "loss": 1.0556,
1788
- "step": 2370
1789
- },
1790
- {
1791
- "epoch": 0.01503113592441486,
1792
- "grad_norm": 9.404873847961426,
1793
- "learning_rate": 3.0491404797626605e-05,
1794
- "loss": 0.9384,
1795
- "step": 2380
1796
- },
1797
- {
1798
- "epoch": 0.01509429195771072,
1799
- "grad_norm": 10.99118423461914,
1800
- "learning_rate": 3.031607688398936e-05,
1801
- "loss": 0.8505,
1802
- "step": 2390
1803
- },
1804
- {
1805
- "epoch": 0.015157447991006582,
1806
- "grad_norm": 8.370269775390625,
1807
- "learning_rate": 3.0140633212461248e-05,
1808
- "loss": 1.0307,
1809
- "step": 2400
1810
- },
1811
- {
1812
- "epoch": 0.015157447991006582,
1813
- "eval_loss": 0.7730175852775574,
1814
- "eval_runtime": 45.2705,
1815
- "eval_samples_per_second": 11.045,
1816
- "eval_steps_per_second": 11.045,
1817
- "step": 2400
1818
- },
1819
- {
1820
- "epoch": 0.015220604024302442,
1821
- "grad_norm": 10.080233573913574,
1822
- "learning_rate": 2.9965080994844422e-05,
1823
- "loss": 0.9019,
1824
- "step": 2410
1825
- },
1826
- {
1827
- "epoch": 0.015283760057598302,
1828
- "grad_norm": 8.396200180053711,
1829
- "learning_rate": 2.978942744740296e-05,
1830
- "loss": 0.9471,
1831
- "step": 2420
1832
- },
1833
- {
1834
- "epoch": 0.015346916090894162,
1835
- "grad_norm": 8.11946964263916,
1836
- "learning_rate": 2.961367979056621e-05,
1837
- "loss": 0.9168,
1838
- "step": 2430
1839
- },
1840
- {
1841
- "epoch": 0.015410072124190023,
1842
- "grad_norm": 9.612983703613281,
1843
- "learning_rate": 2.9437845248631984e-05,
1844
- "loss": 0.8768,
1845
- "step": 2440
1846
- },
1847
- {
1848
- "epoch": 0.015473228157485884,
1849
- "grad_norm": 8.1029634475708,
1850
- "learning_rate": 2.926193104946961e-05,
1851
- "loss": 0.9638,
1852
- "step": 2450
1853
- },
1854
- {
1855
- "epoch": 0.015536384190781745,
1856
- "grad_norm": 7.916716575622559,
1857
- "learning_rate": 2.90859444242228e-05,
1858
- "loss": 1.0279,
1859
- "step": 2460
1860
- },
1861
- {
1862
- "epoch": 0.015599540224077606,
1863
- "grad_norm": 9.679882049560547,
1864
- "learning_rate": 2.8909892607012427e-05,
1865
- "loss": 0.9395,
1866
- "step": 2470
1867
- },
1868
- {
1869
- "epoch": 0.015662696257373467,
1870
- "grad_norm": 10.735697746276855,
1871
- "learning_rate": 2.8733782834639165e-05,
1872
- "loss": 0.9673,
1873
- "step": 2480
1874
- },
1875
- {
1876
- "epoch": 0.015725852290669327,
1877
- "grad_norm": 9.609941482543945,
1878
- "learning_rate": 2.8557622346285957e-05,
1879
- "loss": 0.9712,
1880
- "step": 2490
1881
- },
1882
- {
1883
- "epoch": 0.015789008323965188,
1884
- "grad_norm": 10.74219799041748,
1885
- "learning_rate": 2.8381418383220526e-05,
1886
- "loss": 0.9714,
1887
- "step": 2500
1888
- },
1889
- {
1890
- "epoch": 0.01585216435726105,
1891
- "grad_norm": 8.981695175170898,
1892
- "learning_rate": 2.8205178188497627e-05,
1893
- "loss": 0.9977,
1894
- "step": 2510
1895
- },
1896
- {
1897
- "epoch": 0.01591532039055691,
1898
- "grad_norm": 9.596540451049805,
1899
- "learning_rate": 2.8028909006661396e-05,
1900
- "loss": 0.946,
1901
- "step": 2520
1902
- },
1903
- {
1904
- "epoch": 0.01597847642385277,
1905
- "grad_norm": 7.082404613494873,
1906
- "learning_rate": 2.78526180834475e-05,
1907
- "loss": 0.9852,
1908
- "step": 2530
1909
- },
1910
- {
1911
- "epoch": 0.01604163245714863,
1912
- "grad_norm": 8.827990531921387,
1913
- "learning_rate": 2.7676312665485307e-05,
1914
- "loss": 1.0499,
1915
- "step": 2540
1916
- },
1917
- {
1918
- "epoch": 0.016104788490444492,
1919
- "grad_norm": 10.917720794677734,
1920
- "learning_rate": 2.75e-05,
1921
- "loss": 0.9835,
1922
- "step": 2550
1923
- },
1924
- {
1925
- "epoch": 0.016104788490444492,
1926
- "eval_loss": 0.7639342546463013,
1927
- "eval_runtime": 45.562,
1928
- "eval_samples_per_second": 10.974,
1929
- "eval_steps_per_second": 10.974,
1930
- "step": 2550
1931
- },
1932
- {
1933
- "epoch": 0.016167944523740353,
1934
- "grad_norm": 11.584785461425781,
1935
- "learning_rate": 2.7323687334514695e-05,
1936
- "loss": 0.8422,
1937
- "step": 2560
1938
- },
1939
- {
1940
- "epoch": 0.016231100557036214,
1941
- "grad_norm": 9.246831893920898,
1942
- "learning_rate": 2.71473819165525e-05,
1943
- "loss": 1.0227,
1944
- "step": 2570
1945
- },
1946
- {
1947
- "epoch": 0.016294256590332075,
1948
- "grad_norm": 9.19963264465332,
1949
- "learning_rate": 2.6971090993338606e-05,
1950
- "loss": 0.9826,
1951
- "step": 2580
1952
- },
1953
- {
1954
- "epoch": 0.016357412623627936,
1955
- "grad_norm": 7.812788009643555,
1956
- "learning_rate": 2.679482181150238e-05,
1957
- "loss": 0.8062,
1958
- "step": 2590
1959
- },
1960
- {
1961
- "epoch": 0.016420568656923797,
1962
- "grad_norm": 6.75607967376709,
1963
- "learning_rate": 2.6618581616779483e-05,
1964
- "loss": 0.8495,
1965
- "step": 2600
1966
- },
1967
- {
1968
- "epoch": 0.016483724690219657,
1969
- "grad_norm": 8.447277069091797,
1970
- "learning_rate": 2.644237765371404e-05,
1971
- "loss": 1.1002,
1972
- "step": 2610
1973
- },
1974
- {
1975
- "epoch": 0.016546880723515518,
1976
- "grad_norm": 9.761106491088867,
1977
- "learning_rate": 2.626621716536085e-05,
1978
- "loss": 0.9549,
1979
- "step": 2620
1980
- },
1981
- {
1982
- "epoch": 0.01661003675681138,
1983
- "grad_norm": 10.971216201782227,
1984
- "learning_rate": 2.6090107392987575e-05,
1985
- "loss": 0.9771,
1986
- "step": 2630
1987
- },
1988
- {
1989
- "epoch": 0.01667319279010724,
1990
- "grad_norm": 11.389016151428223,
1991
- "learning_rate": 2.591405557577721e-05,
1992
- "loss": 0.9737,
1993
- "step": 2640
1994
- },
1995
- {
1996
- "epoch": 0.0167363488234031,
1997
- "grad_norm": 9.189516067504883,
1998
- "learning_rate": 2.5738068950530398e-05,
1999
- "loss": 0.9855,
2000
- "step": 2650
2001
- },
2002
- {
2003
- "epoch": 0.01679950485669896,
2004
- "grad_norm": 8.623804092407227,
2005
- "learning_rate": 2.5562154751368014e-05,
2006
- "loss": 1.0468,
2007
- "step": 2660
2008
- },
2009
- {
2010
- "epoch": 0.016862660889994822,
2011
- "grad_norm": 10.484329223632812,
2012
- "learning_rate": 2.5386320209433798e-05,
2013
- "loss": 1.0479,
2014
- "step": 2670
2015
- },
2016
- {
2017
- "epoch": 0.016925816923290683,
2018
- "grad_norm": 9.45596981048584,
2019
- "learning_rate": 2.5210572552597046e-05,
2020
- "loss": 0.9985,
2021
- "step": 2680
2022
- },
2023
- {
2024
- "epoch": 0.016988972956586544,
2025
- "grad_norm": 11.783865928649902,
2026
- "learning_rate": 2.5034919005155583e-05,
2027
- "loss": 1.0066,
2028
- "step": 2690
2029
- },
2030
- {
2031
- "epoch": 0.017052128989882405,
2032
- "grad_norm": 11.132217407226562,
2033
- "learning_rate": 2.4859366787538754e-05,
2034
- "loss": 0.8558,
2035
- "step": 2700
2036
- },
2037
- {
2038
- "epoch": 0.017052128989882405,
2039
- "eval_loss": 0.7622952461242676,
2040
- "eval_runtime": 45.0496,
2041
- "eval_samples_per_second": 11.099,
2042
- "eval_steps_per_second": 11.099,
2043
- "step": 2700
2044
  }
2045
  ],
2046
  "logging_steps": 10,
@@ -2060,8 +139,8 @@
2060
  "attributes": {}
2061
  }
2062
  },
2063
- "total_flos": 2.1658889386082304e+17,
2064
- "train_batch_size": 16,
2065
  "trial_name": null,
2066
  "trial_params": null
2067
  }
 
1
  {
2
+ "best_metric": 0.8098691701889038,
3
+ "best_model_checkpoint": "./output/checkpoint-150",
4
+ "epoch": 0.0004736702497189557,
5
  "eval_steps": 150,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 3.157801664793038e-05,
13
+ "grad_norm": 9.345338821411133,
14
  "learning_rate": 5.500000000000001e-06,
15
+ "loss": 0.9144,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 6.315603329586075e-05,
20
+ "grad_norm": 15.730955123901367,
21
  "learning_rate": 1.1000000000000001e-05,
22
+ "loss": 0.8917,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 9.473404994379113e-05,
27
+ "grad_norm": 11.728280067443848,
28
  "learning_rate": 1.65e-05,
29
+ "loss": 0.9514,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.0001263120665917215,
34
+ "grad_norm": 10.77907943725586,
35
  "learning_rate": 2.2000000000000003e-05,
36
+ "loss": 0.9063,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.00015789008323965187,
41
+ "grad_norm": 11.698442459106445,
42
  "learning_rate": 2.75e-05,
43
+ "loss": 0.8968,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.00018946809988758226,
48
+ "grad_norm": 14.10632038116455,
49
  "learning_rate": 3.3e-05,
50
+ "loss": 0.8452,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.00022104611653551263,
55
+ "grad_norm": 16.12811851501465,
56
  "learning_rate": 3.85e-05,
57
+ "loss": 0.8933,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.000252624133183443,
62
+ "grad_norm": 15.834749221801758,
63
  "learning_rate": 4.4000000000000006e-05,
64
+ "loss": 0.9842,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.0002842021498313734,
69
+ "grad_norm": 9.127525329589844,
70
  "learning_rate": 4.9500000000000004e-05,
71
+ "loss": 0.8358,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.00031578016647930375,
76
+ "grad_norm": 16.81833839416504,
77
  "learning_rate": 5.5e-05,
78
+ "loss": 0.8891,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.00034735818312723417,
83
+ "grad_norm": 15.804556846618652,
84
  "learning_rate": 5.4999434791355066e-05,
85
+ "loss": 0.9371,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.00037893619977516453,
90
+ "grad_norm": 18.529338836669922,
91
  "learning_rate": 5.4997739188653784e-05,
92
+ "loss": 1.0804,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 0.0004105142164230949,
97
+ "grad_norm": 13.449525833129883,
98
  "learning_rate": 5.4994913261595724e-05,
99
+ "loss": 0.9468,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 0.00044209223307102526,
104
+ "grad_norm": 20.342548370361328,
105
  "learning_rate": 5.49909571263437e-05,
106
+ "loss": 0.9881,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 0.0004736702497189557,
111
+ "grad_norm": 11.419866561889648,
112
  "learning_rate": 5.498587094551892e-05,
113
+ "loss": 1.0642,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 0.0004736702497189557,
118
+ "eval_loss": 0.8098691701889038,
119
+ "eval_runtime": 45.4214,
120
+ "eval_samples_per_second": 11.008,
121
+ "eval_steps_per_second": 11.008,
122
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  }
124
  ],
125
  "logging_steps": 10,
 
139
  "attributes": {}
140
  }
141
  },
142
+ "total_flos": 4816346273759232.0,
143
+ "train_batch_size": 8,
144
  "trial_name": null,
145
  "trial_params": null
146
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:80041c12037d496719b3517a1611698bc0fc97d06d1ad7685a650233b0d54843
3
  size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f2876111fb354dec6b4e9d27b5fc744ee66c6a7ed1060873d2769d459ef601a
3
  size 5496