yuweiiizz commited on
Commit
e3c1a8b
·
verified ·
1 Parent(s): 30e16c9

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3bfea1f8ccff1d3d104539ef9c86c38d6670980c839e6047b65be8f2eae783c8
3
  size 966995080
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3df59285835e0cc93eed8a07997106068a6025a1eecf29ca1883050640082bf5
3
  size 966995080
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca089eb0a1d5699b01f559f18d4bede6fbd50e2cda9b1cb1676c3c5548889ceb
3
  size 1925064044
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba8b8aca45c706933df0ac865eb25d0a5f5734f04a35d51d8b5f659916db1f8f
3
  size 1925064044
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7c93a397e9322e49f4ed50d18f810eaf2c39ecdb2985c95d248cd7a2fa2aa47
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1f09b1f1f9b06ad2afb12e89fc8695073b76afcf9ea0b3552c7069932117824
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd54311344b834087a4b1c20d06544579c7f43d33908960b6b3b61734dbde46d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c2e7a67c9c301b36183def727305bd60ef4c597b197ad54cdb0001ffc36e45a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,1177 +1,310 @@
1
  {
2
- "best_metric": 48.63818252226668,
3
- "best_model_checkpoint": "./whisper-small-taiwanese/checkpoint-4000",
4
- "epoch": 2.5806451612903225,
5
  "eval_steps": 1000,
6
- "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.016129032258064516,
13
- "grad_norm": 241.39755249023438,
14
- "learning_rate": 5.376344086021506e-07,
15
- "loss": 8.0646,
16
  "step": 25
17
  },
18
  {
19
- "epoch": 0.03225806451612903,
20
- "grad_norm": 52.91600799560547,
21
- "learning_rate": 1.0752688172043011e-06,
22
- "loss": 5.6903,
23
  "step": 50
24
  },
25
  {
26
- "epoch": 0.04838709677419355,
27
- "grad_norm": 32.09747314453125,
28
- "learning_rate": 1.6129032258064516e-06,
29
- "loss": 3.6353,
30
  "step": 75
31
  },
32
  {
33
- "epoch": 0.06451612903225806,
34
- "grad_norm": 31.451000213623047,
35
- "learning_rate": 2.1505376344086023e-06,
36
- "loss": 2.6364,
37
  "step": 100
38
  },
39
  {
40
- "epoch": 0.08064516129032258,
41
- "grad_norm": 29.471986770629883,
42
- "learning_rate": 2.688172043010753e-06,
43
- "loss": 2.3125,
44
  "step": 125
45
  },
46
  {
47
- "epoch": 0.0967741935483871,
48
- "grad_norm": 28.64345932006836,
49
- "learning_rate": 3.225806451612903e-06,
50
- "loss": 2.1281,
51
  "step": 150
52
  },
53
  {
54
- "epoch": 0.11290322580645161,
55
- "grad_norm": 28.750173568725586,
56
- "learning_rate": 3.763440860215054e-06,
57
- "loss": 1.9073,
58
  "step": 175
59
  },
60
  {
61
- "epoch": 0.12903225806451613,
62
- "grad_norm": 23.051420211791992,
63
- "learning_rate": 4.3010752688172045e-06,
64
- "loss": 1.5977,
65
  "step": 200
66
  },
67
  {
68
- "epoch": 0.14516129032258066,
69
- "grad_norm": 18.67135238647461,
70
- "learning_rate": 4.838709677419355e-06,
71
- "loss": 1.5081,
72
  "step": 225
73
  },
74
  {
75
- "epoch": 0.16129032258064516,
76
- "grad_norm": 15.335652351379395,
77
- "learning_rate": 5.376344086021506e-06,
78
- "loss": 1.4169,
79
  "step": 250
80
  },
81
  {
82
- "epoch": 0.1774193548387097,
83
- "grad_norm": 16.2917537689209,
84
- "learning_rate": 5.9139784946236566e-06,
85
- "loss": 1.3469,
86
  "step": 275
87
  },
88
  {
89
- "epoch": 0.1935483870967742,
90
- "grad_norm": 15.212031364440918,
91
- "learning_rate": 6.451612903225806e-06,
92
- "loss": 1.4059,
93
  "step": 300
94
  },
95
  {
96
- "epoch": 0.20967741935483872,
97
- "grad_norm": 15.661399841308594,
98
- "learning_rate": 6.989247311827958e-06,
99
- "loss": 1.333,
100
  "step": 325
101
  },
102
  {
103
- "epoch": 0.22580645161290322,
104
- "grad_norm": 16.841798782348633,
105
- "learning_rate": 7.526881720430108e-06,
106
- "loss": 1.2252,
107
  "step": 350
108
  },
109
  {
110
- "epoch": 0.24193548387096775,
111
- "grad_norm": 17.468032836914062,
112
- "learning_rate": 8.064516129032258e-06,
113
- "loss": 1.2996,
114
  "step": 375
115
  },
116
  {
117
- "epoch": 0.25806451612903225,
118
- "grad_norm": 16.684844970703125,
119
- "learning_rate": 8.602150537634409e-06,
120
- "loss": 1.2653,
121
  "step": 400
122
  },
123
  {
124
- "epoch": 0.27419354838709675,
125
- "grad_norm": 14.749136924743652,
126
- "learning_rate": 9.13978494623656e-06,
127
- "loss": 1.1967,
128
  "step": 425
129
  },
130
  {
131
- "epoch": 0.2903225806451613,
132
- "grad_norm": 13.751141548156738,
133
- "learning_rate": 9.67741935483871e-06,
134
- "loss": 1.1865,
135
  "step": 450
136
  },
137
  {
138
- "epoch": 0.3064516129032258,
139
- "grad_norm": 16.48873519897461,
140
- "learning_rate": 9.97610513739546e-06,
141
- "loss": 1.1636,
142
  "step": 475
143
  },
144
  {
145
- "epoch": 0.3225806451612903,
146
- "grad_norm": 14.694608688354492,
147
- "learning_rate": 9.916367980884111e-06,
148
- "loss": 1.1796,
149
  "step": 500
150
  },
151
  {
152
- "epoch": 0.3387096774193548,
153
- "grad_norm": 15.619414329528809,
154
- "learning_rate": 9.856630824372761e-06,
155
- "loss": 1.1655,
156
  "step": 525
157
  },
158
  {
159
- "epoch": 0.3548387096774194,
160
- "grad_norm": 13.177242279052734,
161
- "learning_rate": 9.79689366786141e-06,
162
- "loss": 1.143,
163
  "step": 550
164
  },
165
  {
166
- "epoch": 0.3709677419354839,
167
- "grad_norm": 15.957605361938477,
168
- "learning_rate": 9.737156511350062e-06,
169
- "loss": 1.1414,
170
  "step": 575
171
  },
172
  {
173
- "epoch": 0.3870967741935484,
174
- "grad_norm": 12.467620849609375,
175
- "learning_rate": 9.67741935483871e-06,
176
- "loss": 1.0964,
177
  "step": 600
178
  },
179
  {
180
- "epoch": 0.4032258064516129,
181
- "grad_norm": 15.435978889465332,
182
- "learning_rate": 9.61768219832736e-06,
183
- "loss": 1.1512,
184
  "step": 625
185
  },
186
  {
187
- "epoch": 0.41935483870967744,
188
- "grad_norm": 13.087624549865723,
189
- "learning_rate": 9.557945041816011e-06,
190
- "loss": 1.1338,
191
  "step": 650
192
  },
193
  {
194
- "epoch": 0.43548387096774194,
195
- "grad_norm": 15.716456413269043,
196
- "learning_rate": 9.49820788530466e-06,
197
- "loss": 1.0783,
198
  "step": 675
199
  },
200
  {
201
- "epoch": 0.45161290322580644,
202
- "grad_norm": 14.517507553100586,
203
- "learning_rate": 9.43847072879331e-06,
204
- "loss": 1.0728,
205
  "step": 700
206
  },
207
  {
208
- "epoch": 0.46774193548387094,
209
- "grad_norm": 17.37009620666504,
210
- "learning_rate": 9.37873357228196e-06,
211
- "loss": 1.0317,
212
  "step": 725
213
  },
214
  {
215
- "epoch": 0.4838709677419355,
216
- "grad_norm": 14.03701400756836,
217
- "learning_rate": 9.31899641577061e-06,
218
- "loss": 1.0347,
219
  "step": 750
220
  },
221
  {
222
- "epoch": 0.5,
223
- "grad_norm": 12.431659698486328,
224
- "learning_rate": 9.25925925925926e-06,
225
- "loss": 1.0524,
226
  "step": 775
227
  },
228
  {
229
- "epoch": 0.5161290322580645,
230
- "grad_norm": 12.746413230895996,
231
- "learning_rate": 9.19952210274791e-06,
232
- "loss": 1.0826,
233
  "step": 800
234
  },
235
  {
236
- "epoch": 0.532258064516129,
237
- "grad_norm": 15.521408081054688,
238
- "learning_rate": 9.13978494623656e-06,
239
- "loss": 1.0377,
240
  "step": 825
241
  },
242
  {
243
- "epoch": 0.5483870967741935,
244
- "grad_norm": 15.342901229858398,
245
- "learning_rate": 9.08004778972521e-06,
246
- "loss": 0.9762,
247
  "step": 850
248
  },
249
  {
250
- "epoch": 0.5645161290322581,
251
- "grad_norm": 16.137371063232422,
252
- "learning_rate": 9.02031063321386e-06,
253
- "loss": 1.0725,
254
  "step": 875
255
  },
256
  {
257
- "epoch": 0.5806451612903226,
258
- "grad_norm": 14.61146068572998,
259
- "learning_rate": 8.96057347670251e-06,
260
- "loss": 0.9554,
261
  "step": 900
262
  },
263
  {
264
- "epoch": 0.5967741935483871,
265
- "grad_norm": 13.561723709106445,
266
- "learning_rate": 8.90083632019116e-06,
267
- "loss": 1.0127,
268
  "step": 925
269
  },
270
  {
271
- "epoch": 0.6129032258064516,
272
- "grad_norm": 16.037729263305664,
273
- "learning_rate": 8.84109916367981e-06,
274
- "loss": 0.9621,
275
  "step": 950
276
  },
277
  {
278
- "epoch": 0.6290322580645161,
279
- "grad_norm": 13.945268630981445,
280
- "learning_rate": 8.78136200716846e-06,
281
- "loss": 0.9479,
282
  "step": 975
283
  },
284
  {
285
- "epoch": 0.6451612903225806,
286
- "grad_norm": 15.826567649841309,
287
- "learning_rate": 8.72162485065711e-06,
288
- "loss": 0.9789,
289
  "step": 1000
290
  },
291
  {
292
- "epoch": 0.6451612903225806,
293
- "eval_cer": 60.21685813863431,
294
- "eval_loss": 0.9020848870277405,
295
- "eval_runtime": 953.7359,
296
- "eval_samples_per_second": 2.392,
297
- "eval_steps_per_second": 0.3,
298
  "step": 1000
299
- },
300
- {
301
- "epoch": 0.6612903225806451,
302
- "grad_norm": 11.495616912841797,
303
- "learning_rate": 8.66188769414576e-06,
304
- "loss": 0.9695,
305
- "step": 1025
306
- },
307
- {
308
- "epoch": 0.6774193548387096,
309
- "grad_norm": 15.224388122558594,
310
- "learning_rate": 8.602150537634409e-06,
311
- "loss": 0.9488,
312
- "step": 1050
313
- },
314
- {
315
- "epoch": 0.6935483870967742,
316
- "grad_norm": 13.824469566345215,
317
- "learning_rate": 8.54241338112306e-06,
318
- "loss": 1.0474,
319
- "step": 1075
320
- },
321
- {
322
- "epoch": 0.7096774193548387,
323
- "grad_norm": 14.53409194946289,
324
- "learning_rate": 8.48267622461171e-06,
325
- "loss": 0.9866,
326
- "step": 1100
327
- },
328
- {
329
- "epoch": 0.7258064516129032,
330
- "grad_norm": 12.956225395202637,
331
- "learning_rate": 8.422939068100358e-06,
332
- "loss": 0.9072,
333
- "step": 1125
334
- },
335
- {
336
- "epoch": 0.7419354838709677,
337
- "grad_norm": 13.533162117004395,
338
- "learning_rate": 8.36320191158901e-06,
339
- "loss": 0.9428,
340
- "step": 1150
341
- },
342
- {
343
- "epoch": 0.7580645161290323,
344
- "grad_norm": 14.72665023803711,
345
- "learning_rate": 8.303464755077659e-06,
346
- "loss": 0.9387,
347
- "step": 1175
348
- },
349
- {
350
- "epoch": 0.7741935483870968,
351
- "grad_norm": 12.921445846557617,
352
- "learning_rate": 8.24372759856631e-06,
353
- "loss": 0.9442,
354
- "step": 1200
355
- },
356
- {
357
- "epoch": 0.7903225806451613,
358
- "grad_norm": 11.652874946594238,
359
- "learning_rate": 8.18399044205496e-06,
360
- "loss": 0.9359,
361
- "step": 1225
362
- },
363
- {
364
- "epoch": 0.8064516129032258,
365
- "grad_norm": 15.415846824645996,
366
- "learning_rate": 8.124253285543608e-06,
367
- "loss": 0.916,
368
- "step": 1250
369
- },
370
- {
371
- "epoch": 0.8225806451612904,
372
- "grad_norm": 18.422143936157227,
373
- "learning_rate": 8.064516129032258e-06,
374
- "loss": 0.9608,
375
- "step": 1275
376
- },
377
- {
378
- "epoch": 0.8387096774193549,
379
- "grad_norm": 11.93355941772461,
380
- "learning_rate": 8.004778972520909e-06,
381
- "loss": 0.9297,
382
- "step": 1300
383
- },
384
- {
385
- "epoch": 0.8548387096774194,
386
- "grad_norm": 16.42209243774414,
387
- "learning_rate": 7.945041816009559e-06,
388
- "loss": 0.8933,
389
- "step": 1325
390
- },
391
- {
392
- "epoch": 0.8709677419354839,
393
- "grad_norm": 14.272250175476074,
394
- "learning_rate": 7.88530465949821e-06,
395
- "loss": 0.9185,
396
- "step": 1350
397
- },
398
- {
399
- "epoch": 0.8870967741935484,
400
- "grad_norm": 12.172361373901367,
401
- "learning_rate": 7.825567502986858e-06,
402
- "loss": 0.8476,
403
- "step": 1375
404
- },
405
- {
406
- "epoch": 0.9032258064516129,
407
- "grad_norm": 14.475882530212402,
408
- "learning_rate": 7.765830346475508e-06,
409
- "loss": 0.925,
410
- "step": 1400
411
- },
412
- {
413
- "epoch": 0.9193548387096774,
414
- "grad_norm": 14.247998237609863,
415
- "learning_rate": 7.706093189964159e-06,
416
- "loss": 0.888,
417
- "step": 1425
418
- },
419
- {
420
- "epoch": 0.9354838709677419,
421
- "grad_norm": 12.855352401733398,
422
- "learning_rate": 7.646356033452809e-06,
423
- "loss": 0.888,
424
- "step": 1450
425
- },
426
- {
427
- "epoch": 0.9516129032258065,
428
- "grad_norm": 14.016806602478027,
429
- "learning_rate": 7.586618876941458e-06,
430
- "loss": 0.9237,
431
- "step": 1475
432
- },
433
- {
434
- "epoch": 0.967741935483871,
435
- "grad_norm": 13.113448143005371,
436
- "learning_rate": 7.526881720430108e-06,
437
- "loss": 0.8767,
438
- "step": 1500
439
- },
440
- {
441
- "epoch": 0.9838709677419355,
442
- "grad_norm": 15.823156356811523,
443
- "learning_rate": 7.467144563918758e-06,
444
- "loss": 0.8561,
445
- "step": 1525
446
- },
447
- {
448
- "epoch": 1.0,
449
- "grad_norm": 16.72173309326172,
450
- "learning_rate": 7.4074074074074075e-06,
451
- "loss": 0.853,
452
- "step": 1550
453
- },
454
- {
455
- "epoch": 1.0161290322580645,
456
- "grad_norm": 11.821678161621094,
457
- "learning_rate": 7.347670250896059e-06,
458
- "loss": 0.6325,
459
- "step": 1575
460
- },
461
- {
462
- "epoch": 1.032258064516129,
463
- "grad_norm": 10.003717422485352,
464
- "learning_rate": 7.287933094384708e-06,
465
- "loss": 0.5999,
466
- "step": 1600
467
- },
468
- {
469
- "epoch": 1.0483870967741935,
470
- "grad_norm": 11.098932266235352,
471
- "learning_rate": 7.2281959378733575e-06,
472
- "loss": 0.6439,
473
- "step": 1625
474
- },
475
- {
476
- "epoch": 1.064516129032258,
477
- "grad_norm": 13.39173412322998,
478
- "learning_rate": 7.168458781362008e-06,
479
- "loss": 0.6171,
480
- "step": 1650
481
- },
482
- {
483
- "epoch": 1.0806451612903225,
484
- "grad_norm": 12.478330612182617,
485
- "learning_rate": 7.108721624850657e-06,
486
- "loss": 0.6296,
487
- "step": 1675
488
- },
489
- {
490
- "epoch": 1.096774193548387,
491
- "grad_norm": 11.143562316894531,
492
- "learning_rate": 7.048984468339307e-06,
493
- "loss": 0.6252,
494
- "step": 1700
495
- },
496
- {
497
- "epoch": 1.1129032258064515,
498
- "grad_norm": 9.06653118133545,
499
- "learning_rate": 6.989247311827958e-06,
500
- "loss": 0.627,
501
- "step": 1725
502
- },
503
- {
504
- "epoch": 1.129032258064516,
505
- "grad_norm": 12.985542297363281,
506
- "learning_rate": 6.929510155316607e-06,
507
- "loss": 0.6145,
508
- "step": 1750
509
- },
510
- {
511
- "epoch": 1.1451612903225807,
512
- "grad_norm": 12.124594688415527,
513
- "learning_rate": 6.869772998805258e-06,
514
- "loss": 0.601,
515
- "step": 1775
516
- },
517
- {
518
- "epoch": 1.1612903225806452,
519
- "grad_norm": 11.50346851348877,
520
- "learning_rate": 6.810035842293907e-06,
521
- "loss": 0.5787,
522
- "step": 1800
523
- },
524
- {
525
- "epoch": 1.1774193548387097,
526
- "grad_norm": 11.256744384765625,
527
- "learning_rate": 6.7502986857825566e-06,
528
- "loss": 0.5949,
529
- "step": 1825
530
- },
531
- {
532
- "epoch": 1.1935483870967742,
533
- "grad_norm": 12.568142890930176,
534
- "learning_rate": 6.690561529271207e-06,
535
- "loss": 0.6396,
536
- "step": 1850
537
- },
538
- {
539
- "epoch": 1.2096774193548387,
540
- "grad_norm": 11.688636779785156,
541
- "learning_rate": 6.630824372759857e-06,
542
- "loss": 0.6106,
543
- "step": 1875
544
- },
545
- {
546
- "epoch": 1.2258064516129032,
547
- "grad_norm": 13.135574340820312,
548
- "learning_rate": 6.5710872162485075e-06,
549
- "loss": 0.6197,
550
- "step": 1900
551
- },
552
- {
553
- "epoch": 1.2419354838709677,
554
- "grad_norm": 14.128840446472168,
555
- "learning_rate": 6.511350059737157e-06,
556
- "loss": 0.6474,
557
- "step": 1925
558
- },
559
- {
560
- "epoch": 1.2580645161290323,
561
- "grad_norm": 11.889117240905762,
562
- "learning_rate": 6.451612903225806e-06,
563
- "loss": 0.5966,
564
- "step": 1950
565
- },
566
- {
567
- "epoch": 1.2741935483870968,
568
- "grad_norm": 12.298087120056152,
569
- "learning_rate": 6.391875746714457e-06,
570
- "loss": 0.6007,
571
- "step": 1975
572
- },
573
- {
574
- "epoch": 1.2903225806451613,
575
- "grad_norm": 13.969961166381836,
576
- "learning_rate": 6.332138590203107e-06,
577
- "loss": 0.61,
578
- "step": 2000
579
- },
580
- {
581
- "epoch": 1.2903225806451613,
582
- "eval_cer": 53.38840841616109,
583
- "eval_loss": 0.753625750541687,
584
- "eval_runtime": 951.328,
585
- "eval_samples_per_second": 2.398,
586
- "eval_steps_per_second": 0.301,
587
- "step": 2000
588
- },
589
- {
590
- "epoch": 1.3064516129032258,
591
- "grad_norm": 9.99063777923584,
592
- "learning_rate": 6.272401433691757e-06,
593
- "loss": 0.588,
594
- "step": 2025
595
- },
596
- {
597
- "epoch": 1.3225806451612903,
598
- "grad_norm": 13.123091697692871,
599
- "learning_rate": 6.212664277180407e-06,
600
- "loss": 0.5886,
601
- "step": 2050
602
- },
603
- {
604
- "epoch": 1.3387096774193548,
605
- "grad_norm": 10.930394172668457,
606
- "learning_rate": 6.152927120669057e-06,
607
- "loss": 0.6117,
608
- "step": 2075
609
- },
610
- {
611
- "epoch": 1.3548387096774195,
612
- "grad_norm": 12.531543731689453,
613
- "learning_rate": 6.0931899641577065e-06,
614
- "loss": 0.5931,
615
- "step": 2100
616
- },
617
- {
618
- "epoch": 1.370967741935484,
619
- "grad_norm": 13.16308307647705,
620
- "learning_rate": 6.033452807646356e-06,
621
- "loss": 0.598,
622
- "step": 2125
623
- },
624
- {
625
- "epoch": 1.3870967741935485,
626
- "grad_norm": 11.17799186706543,
627
- "learning_rate": 5.973715651135007e-06,
628
- "loss": 0.6141,
629
- "step": 2150
630
- },
631
- {
632
- "epoch": 1.403225806451613,
633
- "grad_norm": 10.640506744384766,
634
- "learning_rate": 5.9139784946236566e-06,
635
- "loss": 0.5682,
636
- "step": 2175
637
- },
638
- {
639
- "epoch": 1.4193548387096775,
640
- "grad_norm": 11.789594650268555,
641
- "learning_rate": 5.854241338112307e-06,
642
- "loss": 0.5598,
643
- "step": 2200
644
- },
645
- {
646
- "epoch": 1.435483870967742,
647
- "grad_norm": 11.937474250793457,
648
- "learning_rate": 5.794504181600956e-06,
649
- "loss": 0.6344,
650
- "step": 2225
651
- },
652
- {
653
- "epoch": 1.4516129032258065,
654
- "grad_norm": 14.106030464172363,
655
- "learning_rate": 5.734767025089606e-06,
656
- "loss": 0.5783,
657
- "step": 2250
658
- },
659
- {
660
- "epoch": 1.467741935483871,
661
- "grad_norm": 12.365781784057617,
662
- "learning_rate": 5.675029868578256e-06,
663
- "loss": 0.6335,
664
- "step": 2275
665
- },
666
- {
667
- "epoch": 1.4838709677419355,
668
- "grad_norm": 14.670917510986328,
669
- "learning_rate": 5.615292712066906e-06,
670
- "loss": 0.5988,
671
- "step": 2300
672
- },
673
- {
674
- "epoch": 1.5,
675
- "grad_norm": 10.45535659790039,
676
- "learning_rate": 5.555555555555557e-06,
677
- "loss": 0.5912,
678
- "step": 2325
679
- },
680
- {
681
- "epoch": 1.5161290322580645,
682
- "grad_norm": 15.059216499328613,
683
- "learning_rate": 5.495818399044206e-06,
684
- "loss": 0.5405,
685
- "step": 2350
686
- },
687
- {
688
- "epoch": 1.532258064516129,
689
- "grad_norm": 12.705628395080566,
690
- "learning_rate": 5.436081242532856e-06,
691
- "loss": 0.5816,
692
- "step": 2375
693
- },
694
- {
695
- "epoch": 1.5483870967741935,
696
- "grad_norm": 14.382452964782715,
697
- "learning_rate": 5.376344086021506e-06,
698
- "loss": 0.5437,
699
- "step": 2400
700
- },
701
- {
702
- "epoch": 1.564516129032258,
703
- "grad_norm": 10.80752944946289,
704
- "learning_rate": 5.316606929510155e-06,
705
- "loss": 0.5975,
706
- "step": 2425
707
- },
708
- {
709
- "epoch": 1.5806451612903225,
710
- "grad_norm": 12.146509170532227,
711
- "learning_rate": 5.2568697729988065e-06,
712
- "loss": 0.599,
713
- "step": 2450
714
- },
715
- {
716
- "epoch": 1.596774193548387,
717
- "grad_norm": 12.145088195800781,
718
- "learning_rate": 5.197132616487456e-06,
719
- "loss": 0.6506,
720
- "step": 2475
721
- },
722
- {
723
- "epoch": 1.6129032258064515,
724
- "grad_norm": 13.103174209594727,
725
- "learning_rate": 5.137395459976105e-06,
726
- "loss": 0.5649,
727
- "step": 2500
728
- },
729
- {
730
- "epoch": 1.629032258064516,
731
- "grad_norm": 13.602423667907715,
732
- "learning_rate": 5.077658303464756e-06,
733
- "loss": 0.5424,
734
- "step": 2525
735
- },
736
- {
737
- "epoch": 1.6451612903225805,
738
- "grad_norm": 14.787790298461914,
739
- "learning_rate": 5.017921146953405e-06,
740
- "loss": 0.5628,
741
- "step": 2550
742
- },
743
- {
744
- "epoch": 1.661290322580645,
745
- "grad_norm": 11.559283256530762,
746
- "learning_rate": 4.9581839904420555e-06,
747
- "loss": 0.6216,
748
- "step": 2575
749
- },
750
- {
751
- "epoch": 1.6774193548387095,
752
- "grad_norm": 13.20376968383789,
753
- "learning_rate": 4.898446833930705e-06,
754
- "loss": 0.5694,
755
- "step": 2600
756
- },
757
- {
758
- "epoch": 1.6935483870967742,
759
- "grad_norm": 9.632781982421875,
760
- "learning_rate": 4.838709677419355e-06,
761
- "loss": 0.5808,
762
- "step": 2625
763
- },
764
- {
765
- "epoch": 1.7096774193548387,
766
- "grad_norm": 12.304398536682129,
767
- "learning_rate": 4.7789725209080055e-06,
768
- "loss": 0.5777,
769
- "step": 2650
770
- },
771
- {
772
- "epoch": 1.7258064516129032,
773
- "grad_norm": 11.025238990783691,
774
- "learning_rate": 4.719235364396655e-06,
775
- "loss": 0.5964,
776
- "step": 2675
777
- },
778
- {
779
- "epoch": 1.7419354838709677,
780
- "grad_norm": 13.640275955200195,
781
- "learning_rate": 4.659498207885305e-06,
782
- "loss": 0.5936,
783
- "step": 2700
784
- },
785
- {
786
- "epoch": 1.7580645161290323,
787
- "grad_norm": 14.28750991821289,
788
- "learning_rate": 4.599761051373955e-06,
789
- "loss": 0.5814,
790
- "step": 2725
791
- },
792
- {
793
- "epoch": 1.7741935483870968,
794
- "grad_norm": 14.228248596191406,
795
- "learning_rate": 4.540023894862605e-06,
796
- "loss": 0.5881,
797
- "step": 2750
798
- },
799
- {
800
- "epoch": 1.7903225806451613,
801
- "grad_norm": 12.126937866210938,
802
- "learning_rate": 4.480286738351255e-06,
803
- "loss": 0.5568,
804
- "step": 2775
805
- },
806
- {
807
- "epoch": 1.8064516129032258,
808
- "grad_norm": 12.653525352478027,
809
- "learning_rate": 4.420549581839905e-06,
810
- "loss": 0.5988,
811
- "step": 2800
812
- },
813
- {
814
- "epoch": 1.8225806451612905,
815
- "grad_norm": 10.851930618286133,
816
- "learning_rate": 4.360812425328555e-06,
817
- "loss": 0.6073,
818
- "step": 2825
819
- },
820
- {
821
- "epoch": 1.838709677419355,
822
- "grad_norm": 12.00724983215332,
823
- "learning_rate": 4.3010752688172045e-06,
824
- "loss": 0.5739,
825
- "step": 2850
826
- },
827
- {
828
- "epoch": 1.8548387096774195,
829
- "grad_norm": 10.997614860534668,
830
- "learning_rate": 4.241338112305855e-06,
831
- "loss": 0.5663,
832
- "step": 2875
833
- },
834
- {
835
- "epoch": 1.870967741935484,
836
- "grad_norm": 12.384391784667969,
837
- "learning_rate": 4.181600955794505e-06,
838
- "loss": 0.5325,
839
- "step": 2900
840
- },
841
- {
842
- "epoch": 1.8870967741935485,
843
- "grad_norm": 10.200772285461426,
844
- "learning_rate": 4.121863799283155e-06,
845
- "loss": 0.5918,
846
- "step": 2925
847
- },
848
- {
849
- "epoch": 1.903225806451613,
850
- "grad_norm": 13.224651336669922,
851
- "learning_rate": 4.062126642771804e-06,
852
- "loss": 0.5399,
853
- "step": 2950
854
- },
855
- {
856
- "epoch": 1.9193548387096775,
857
- "grad_norm": 10.611023902893066,
858
- "learning_rate": 4.002389486260454e-06,
859
- "loss": 0.5593,
860
- "step": 2975
861
- },
862
- {
863
- "epoch": 1.935483870967742,
864
- "grad_norm": 10.110644340515137,
865
- "learning_rate": 3.942652329749105e-06,
866
- "loss": 0.5611,
867
- "step": 3000
868
- },
869
- {
870
- "epoch": 1.935483870967742,
871
- "eval_cer": 51.336001032657805,
872
- "eval_loss": 0.6702780723571777,
873
- "eval_runtime": 963.0475,
874
- "eval_samples_per_second": 2.369,
875
- "eval_steps_per_second": 0.297,
876
- "step": 3000
877
- },
878
- {
879
- "epoch": 1.9516129032258065,
880
- "grad_norm": 10.685456275939941,
881
- "learning_rate": 3.882915173237754e-06,
882
- "loss": 0.5326,
883
- "step": 3025
884
- },
885
- {
886
- "epoch": 1.967741935483871,
887
- "grad_norm": 14.404354095458984,
888
- "learning_rate": 3.823178016726404e-06,
889
- "loss": 0.5828,
890
- "step": 3050
891
- },
892
- {
893
- "epoch": 1.9838709677419355,
894
- "grad_norm": 13.997696876525879,
895
- "learning_rate": 3.763440860215054e-06,
896
- "loss": 0.5394,
897
- "step": 3075
898
- },
899
- {
900
- "epoch": 2.0,
901
- "grad_norm": 16.121444702148438,
902
- "learning_rate": 3.7037037037037037e-06,
903
- "loss": 0.5635,
904
- "step": 3100
905
- },
906
- {
907
- "epoch": 2.0161290322580645,
908
- "grad_norm": 9.237725257873535,
909
- "learning_rate": 3.643966547192354e-06,
910
- "loss": 0.3737,
911
- "step": 3125
912
- },
913
- {
914
- "epoch": 2.032258064516129,
915
- "grad_norm": 11.313372611999512,
916
- "learning_rate": 3.584229390681004e-06,
917
- "loss": 0.3934,
918
- "step": 3150
919
- },
920
- {
921
- "epoch": 2.0483870967741935,
922
- "grad_norm": 9.819090843200684,
923
- "learning_rate": 3.5244922341696534e-06,
924
- "loss": 0.3494,
925
- "step": 3175
926
- },
927
- {
928
- "epoch": 2.064516129032258,
929
- "grad_norm": 9.302324295043945,
930
- "learning_rate": 3.4647550776583037e-06,
931
- "loss": 0.3691,
932
- "step": 3200
933
- },
934
- {
935
- "epoch": 2.0806451612903225,
936
- "grad_norm": 11.517475128173828,
937
- "learning_rate": 3.4050179211469536e-06,
938
- "loss": 0.3652,
939
- "step": 3225
940
- },
941
- {
942
- "epoch": 2.096774193548387,
943
- "grad_norm": 7.707530975341797,
944
- "learning_rate": 3.3452807646356034e-06,
945
- "loss": 0.3566,
946
- "step": 3250
947
- },
948
- {
949
- "epoch": 2.1129032258064515,
950
- "grad_norm": 9.121161460876465,
951
- "learning_rate": 3.2855436081242537e-06,
952
- "loss": 0.3409,
953
- "step": 3275
954
- },
955
- {
956
- "epoch": 2.129032258064516,
957
- "grad_norm": 10.464853286743164,
958
- "learning_rate": 3.225806451612903e-06,
959
- "loss": 0.33,
960
- "step": 3300
961
- },
962
- {
963
- "epoch": 2.1451612903225805,
964
- "grad_norm": 8.300515174865723,
965
- "learning_rate": 3.1660692951015535e-06,
966
- "loss": 0.3436,
967
- "step": 3325
968
- },
969
- {
970
- "epoch": 2.161290322580645,
971
- "grad_norm": 7.577033519744873,
972
- "learning_rate": 3.1063321385902034e-06,
973
- "loss": 0.3441,
974
- "step": 3350
975
- },
976
- {
977
- "epoch": 2.1774193548387095,
978
- "grad_norm": 12.314337730407715,
979
- "learning_rate": 3.0465949820788532e-06,
980
- "loss": 0.387,
981
- "step": 3375
982
- },
983
- {
984
- "epoch": 2.193548387096774,
985
- "grad_norm": 8.03864860534668,
986
- "learning_rate": 2.9868578255675035e-06,
987
- "loss": 0.3533,
988
- "step": 3400
989
- },
990
- {
991
- "epoch": 2.2096774193548385,
992
- "grad_norm": 10.326530456542969,
993
- "learning_rate": 2.9271206690561534e-06,
994
- "loss": 0.351,
995
- "step": 3425
996
- },
997
- {
998
- "epoch": 2.225806451612903,
999
- "grad_norm": 8.268649101257324,
1000
- "learning_rate": 2.867383512544803e-06,
1001
- "loss": 0.3437,
1002
- "step": 3450
1003
- },
1004
- {
1005
- "epoch": 2.241935483870968,
1006
- "grad_norm": 9.62258529663086,
1007
- "learning_rate": 2.807646356033453e-06,
1008
- "loss": 0.3254,
1009
- "step": 3475
1010
- },
1011
- {
1012
- "epoch": 2.258064516129032,
1013
- "grad_norm": 8.58535099029541,
1014
- "learning_rate": 2.747909199522103e-06,
1015
- "loss": 0.3592,
1016
- "step": 3500
1017
- },
1018
- {
1019
- "epoch": 2.274193548387097,
1020
- "grad_norm": 10.211243629455566,
1021
- "learning_rate": 2.688172043010753e-06,
1022
- "loss": 0.3334,
1023
- "step": 3525
1024
- },
1025
- {
1026
- "epoch": 2.2903225806451615,
1027
- "grad_norm": 9.174546241760254,
1028
- "learning_rate": 2.6284348864994032e-06,
1029
- "loss": 0.3533,
1030
- "step": 3550
1031
- },
1032
- {
1033
- "epoch": 2.306451612903226,
1034
- "grad_norm": 9.889862060546875,
1035
- "learning_rate": 2.5686977299880527e-06,
1036
- "loss": 0.3263,
1037
- "step": 3575
1038
- },
1039
- {
1040
- "epoch": 2.3225806451612905,
1041
- "grad_norm": 10.23873519897461,
1042
- "learning_rate": 2.5089605734767026e-06,
1043
- "loss": 0.3601,
1044
- "step": 3600
1045
- },
1046
- {
1047
- "epoch": 2.338709677419355,
1048
- "grad_norm": 8.46229076385498,
1049
- "learning_rate": 2.4492234169653525e-06,
1050
- "loss": 0.335,
1051
- "step": 3625
1052
- },
1053
- {
1054
- "epoch": 2.3548387096774195,
1055
- "grad_norm": 8.364771842956543,
1056
- "learning_rate": 2.3894862604540028e-06,
1057
- "loss": 0.3447,
1058
- "step": 3650
1059
- },
1060
- {
1061
- "epoch": 2.370967741935484,
1062
- "grad_norm": 11.249506950378418,
1063
- "learning_rate": 2.3297491039426526e-06,
1064
- "loss": 0.3544,
1065
- "step": 3675
1066
- },
1067
- {
1068
- "epoch": 2.3870967741935485,
1069
- "grad_norm": 8.8016996383667,
1070
- "learning_rate": 2.2700119474313025e-06,
1071
- "loss": 0.3277,
1072
- "step": 3700
1073
- },
1074
- {
1075
- "epoch": 2.403225806451613,
1076
- "grad_norm": 9.774581909179688,
1077
- "learning_rate": 2.2102747909199524e-06,
1078
- "loss": 0.3346,
1079
- "step": 3725
1080
- },
1081
- {
1082
- "epoch": 2.4193548387096775,
1083
- "grad_norm": 8.027830123901367,
1084
- "learning_rate": 2.1505376344086023e-06,
1085
- "loss": 0.3291,
1086
- "step": 3750
1087
- },
1088
- {
1089
- "epoch": 2.435483870967742,
1090
- "grad_norm": 10.107059478759766,
1091
- "learning_rate": 2.0908004778972526e-06,
1092
- "loss": 0.3366,
1093
- "step": 3775
1094
- },
1095
- {
1096
- "epoch": 2.4516129032258065,
1097
- "grad_norm": 8.280789375305176,
1098
- "learning_rate": 2.031063321385902e-06,
1099
- "loss": 0.3473,
1100
- "step": 3800
1101
- },
1102
- {
1103
- "epoch": 2.467741935483871,
1104
- "grad_norm": 9.160382270812988,
1105
- "learning_rate": 1.9713261648745523e-06,
1106
- "loss": 0.3215,
1107
- "step": 3825
1108
- },
1109
- {
1110
- "epoch": 2.4838709677419355,
1111
- "grad_norm": 7.922098636627197,
1112
- "learning_rate": 1.911589008363202e-06,
1113
- "loss": 0.3285,
1114
- "step": 3850
1115
- },
1116
- {
1117
- "epoch": 2.5,
1118
- "grad_norm": 9.239423751831055,
1119
- "learning_rate": 1.8518518518518519e-06,
1120
- "loss": 0.3608,
1121
- "step": 3875
1122
- },
1123
- {
1124
- "epoch": 2.5161290322580645,
1125
- "grad_norm": 8.667262077331543,
1126
- "learning_rate": 1.792114695340502e-06,
1127
- "loss": 0.3389,
1128
- "step": 3900
1129
- },
1130
- {
1131
- "epoch": 2.532258064516129,
1132
- "grad_norm": 10.475480079650879,
1133
- "learning_rate": 1.7323775388291518e-06,
1134
- "loss": 0.3226,
1135
- "step": 3925
1136
- },
1137
- {
1138
- "epoch": 2.5483870967741935,
1139
- "grad_norm": 11.079362869262695,
1140
- "learning_rate": 1.6726403823178017e-06,
1141
- "loss": 0.3559,
1142
- "step": 3950
1143
- },
1144
- {
1145
- "epoch": 2.564516129032258,
1146
- "grad_norm": 10.680990219116211,
1147
- "learning_rate": 1.6129032258064516e-06,
1148
- "loss": 0.297,
1149
- "step": 3975
1150
- },
1151
- {
1152
- "epoch": 2.5806451612903225,
1153
- "grad_norm": 13.101299285888672,
1154
- "learning_rate": 1.5531660692951017e-06,
1155
- "loss": 0.3359,
1156
- "step": 4000
1157
- },
1158
- {
1159
- "epoch": 2.5806451612903225,
1160
- "eval_cer": 48.63818252226668,
1161
- "eval_loss": 0.6473900675773621,
1162
- "eval_runtime": 969.6918,
1163
- "eval_samples_per_second": 2.352,
1164
- "eval_steps_per_second": 0.295,
1165
- "step": 4000
1166
  }
1167
  ],
1168
  "logging_steps": 25,
1169
- "max_steps": 4650,
1170
  "num_input_tokens_seen": 0,
1171
- "num_train_epochs": 3,
1172
  "save_steps": 1000,
1173
- "total_flos": 1.84665797664768e+19,
1174
- "train_batch_size": 16,
1175
  "trial_name": null,
1176
  "trial_params": null
1177
  }
 
1
  {
2
+ "best_metric": 61.346116219917825,
3
+ "best_model_checkpoint": "./whisper-small-taiwanese/checkpoint-1000",
4
+ "epoch": 0.40024014408645187,
5
  "eval_steps": 1000,
6
+ "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.010006003602161296,
13
+ "grad_norm": 197.02195739746094,
14
+ "learning_rate": 5.000000000000001e-07,
15
+ "loss": 7.3864,
16
  "step": 25
17
  },
18
  {
19
+ "epoch": 0.020012007204322592,
20
+ "grad_norm": 41.90484619140625,
21
+ "learning_rate": 1.0000000000000002e-06,
22
+ "loss": 5.5537,
23
  "step": 50
24
  },
25
  {
26
+ "epoch": 0.03001801080648389,
27
+ "grad_norm": 32.08101272583008,
28
+ "learning_rate": 1.5e-06,
29
+ "loss": 3.7704,
30
  "step": 75
31
  },
32
  {
33
+ "epoch": 0.040024014408645184,
34
+ "grad_norm": 28.216585159301758,
35
+ "learning_rate": 2.0000000000000003e-06,
36
+ "loss": 2.6296,
37
  "step": 100
38
  },
39
  {
40
+ "epoch": 0.05003001801080648,
41
+ "grad_norm": 23.683828353881836,
42
+ "learning_rate": 2.5e-06,
43
+ "loss": 2.4474,
44
  "step": 125
45
  },
46
  {
47
+ "epoch": 0.06003602161296778,
48
+ "grad_norm": 21.883520126342773,
49
+ "learning_rate": 3e-06,
50
+ "loss": 2.2054,
51
  "step": 150
52
  },
53
  {
54
+ "epoch": 0.07004202521512908,
55
+ "grad_norm": 24.772098541259766,
56
+ "learning_rate": 3.5e-06,
57
+ "loss": 2.0695,
58
  "step": 175
59
  },
60
  {
61
+ "epoch": 0.08004802881729037,
62
+ "grad_norm": 24.105548858642578,
63
+ "learning_rate": 4.000000000000001e-06,
64
+ "loss": 1.9116,
65
  "step": 200
66
  },
67
  {
68
+ "epoch": 0.09005403241945167,
69
+ "grad_norm": 18.805519104003906,
70
+ "learning_rate": 4.5e-06,
71
+ "loss": 1.7643,
72
  "step": 225
73
  },
74
  {
75
+ "epoch": 0.10006003602161297,
76
+ "grad_norm": 15.599541664123535,
77
+ "learning_rate": 5e-06,
78
+ "loss": 1.6394,
79
  "step": 250
80
  },
81
  {
82
+ "epoch": 0.11006603962377426,
83
+ "grad_norm": 15.514196395874023,
84
+ "learning_rate": 5.500000000000001e-06,
85
+ "loss": 1.6016,
86
  "step": 275
87
  },
88
  {
89
+ "epoch": 0.12007204322593557,
90
+ "grad_norm": 15.5431547164917,
91
+ "learning_rate": 6e-06,
92
+ "loss": 1.5851,
93
  "step": 300
94
  },
95
  {
96
+ "epoch": 0.13007804682809687,
97
+ "grad_norm": 16.450502395629883,
98
+ "learning_rate": 6.5000000000000004e-06,
99
+ "loss": 1.5076,
100
  "step": 325
101
  },
102
  {
103
+ "epoch": 0.14008405043025815,
104
+ "grad_norm": 16.393997192382812,
105
+ "learning_rate": 7e-06,
106
+ "loss": 1.5487,
107
  "step": 350
108
  },
109
  {
110
+ "epoch": 0.15009005403241946,
111
+ "grad_norm": 14.165709495544434,
112
+ "learning_rate": 7.500000000000001e-06,
113
+ "loss": 1.5365,
114
  "step": 375
115
  },
116
  {
117
+ "epoch": 0.16009605763458074,
118
+ "grad_norm": 15.929381370544434,
119
+ "learning_rate": 8.000000000000001e-06,
120
+ "loss": 1.5023,
121
  "step": 400
122
  },
123
  {
124
+ "epoch": 0.17010206123674204,
125
+ "grad_norm": 14.422001838684082,
126
+ "learning_rate": 8.5e-06,
127
+ "loss": 1.3558,
128
  "step": 425
129
  },
130
  {
131
+ "epoch": 0.18010806483890335,
132
+ "grad_norm": 13.510339736938477,
133
+ "learning_rate": 9e-06,
134
+ "loss": 1.3898,
135
  "step": 450
136
  },
137
  {
138
+ "epoch": 0.19011406844106463,
139
+ "grad_norm": 14.485660552978516,
140
+ "learning_rate": 9.5e-06,
141
+ "loss": 1.4279,
142
  "step": 475
143
  },
144
  {
145
+ "epoch": 0.20012007204322593,
146
+ "grad_norm": 14.117327690124512,
147
+ "learning_rate": 1e-05,
148
+ "loss": 1.3455,
149
  "step": 500
150
  },
151
  {
152
+ "epoch": 0.21012607564538724,
153
+ "grad_norm": 16.12464714050293,
154
+ "learning_rate": 9.944395017793596e-06,
155
+ "loss": 1.4,
156
  "step": 525
157
  },
158
  {
159
+ "epoch": 0.22013207924754852,
160
+ "grad_norm": 15.304022789001465,
161
+ "learning_rate": 9.888790035587188e-06,
162
+ "loss": 1.4159,
163
  "step": 550
164
  },
165
  {
166
+ "epoch": 0.23013808284970982,
167
+ "grad_norm": 14.668664932250977,
168
+ "learning_rate": 9.833185053380784e-06,
169
+ "loss": 1.3445,
170
  "step": 575
171
  },
172
  {
173
+ "epoch": 0.24014408645187113,
174
+ "grad_norm": 13.041420936584473,
175
+ "learning_rate": 9.777580071174379e-06,
176
+ "loss": 1.3622,
177
  "step": 600
178
  },
179
  {
180
+ "epoch": 0.25015009005403244,
181
+ "grad_norm": 15.908055305480957,
182
+ "learning_rate": 9.721975088967973e-06,
183
+ "loss": 1.3234,
184
  "step": 625
185
  },
186
  {
187
+ "epoch": 0.26015609365619374,
188
+ "grad_norm": 13.73078727722168,
189
+ "learning_rate": 9.666370106761567e-06,
190
+ "loss": 1.2332,
191
  "step": 650
192
  },
193
  {
194
+ "epoch": 0.270162097258355,
195
+ "grad_norm": 14.327301979064941,
196
+ "learning_rate": 9.610765124555162e-06,
197
+ "loss": 1.3042,
198
  "step": 675
199
  },
200
  {
201
+ "epoch": 0.2801681008605163,
202
+ "grad_norm": 14.390907287597656,
203
+ "learning_rate": 9.555160142348756e-06,
204
+ "loss": 1.3216,
205
  "step": 700
206
  },
207
  {
208
+ "epoch": 0.2901741044626776,
209
+ "grad_norm": 13.917515754699707,
210
+ "learning_rate": 9.49955516014235e-06,
211
+ "loss": 1.2931,
212
  "step": 725
213
  },
214
  {
215
+ "epoch": 0.3001801080648389,
216
+ "grad_norm": 15.108023643493652,
217
+ "learning_rate": 9.443950177935945e-06,
218
+ "loss": 1.3286,
219
  "step": 750
220
  },
221
  {
222
+ "epoch": 0.3101861116670002,
223
+ "grad_norm": 13.692678451538086,
224
+ "learning_rate": 9.388345195729539e-06,
225
+ "loss": 1.3057,
226
  "step": 775
227
  },
228
  {
229
+ "epoch": 0.32019211526916147,
230
+ "grad_norm": 13.685354232788086,
231
+ "learning_rate": 9.332740213523132e-06,
232
+ "loss": 1.2402,
233
  "step": 800
234
  },
235
  {
236
+ "epoch": 0.3301981188713228,
237
+ "grad_norm": 14.591761589050293,
238
+ "learning_rate": 9.277135231316726e-06,
239
+ "loss": 1.2688,
240
  "step": 825
241
  },
242
  {
243
+ "epoch": 0.3402041224734841,
244
+ "grad_norm": 15.677751541137695,
245
+ "learning_rate": 9.221530249110321e-06,
246
+ "loss": 1.3076,
247
  "step": 850
248
  },
249
  {
250
+ "epoch": 0.3502101260756454,
251
+ "grad_norm": 15.109577178955078,
252
+ "learning_rate": 9.165925266903915e-06,
253
+ "loss": 1.2141,
254
  "step": 875
255
  },
256
  {
257
+ "epoch": 0.3602161296778067,
258
+ "grad_norm": 10.552845001220703,
259
+ "learning_rate": 9.110320284697509e-06,
260
+ "loss": 1.2393,
261
  "step": 900
262
  },
263
  {
264
+ "epoch": 0.370222133279968,
265
+ "grad_norm": 12.321894645690918,
266
+ "learning_rate": 9.054715302491104e-06,
267
+ "loss": 1.2417,
268
  "step": 925
269
  },
270
  {
271
+ "epoch": 0.38022813688212925,
272
+ "grad_norm": 13.729790687561035,
273
+ "learning_rate": 8.999110320284698e-06,
274
+ "loss": 1.2082,
275
  "step": 950
276
  },
277
  {
278
+ "epoch": 0.39023414048429056,
279
+ "grad_norm": 13.137016296386719,
280
+ "learning_rate": 8.943505338078292e-06,
281
+ "loss": 1.2048,
282
  "step": 975
283
  },
284
  {
285
+ "epoch": 0.40024014408645187,
286
+ "grad_norm": 12.194613456726074,
287
+ "learning_rate": 8.887900355871887e-06,
288
+ "loss": 1.2739,
289
  "step": 1000
290
  },
291
  {
292
+ "epoch": 0.40024014408645187,
293
+ "eval_cer": 61.346116219917825,
294
+ "eval_loss": 1.169872522354126,
295
+ "eval_runtime": 1744.6409,
296
+ "eval_samples_per_second": 2.274,
297
+ "eval_steps_per_second": 0.284,
298
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  }
300
  ],
301
  "logging_steps": 25,
302
+ "max_steps": 4996,
303
  "num_input_tokens_seen": 0,
304
+ "num_train_epochs": 2,
305
  "save_steps": 1000,
306
+ "total_flos": 4.61736640512e+18,
307
+ "train_batch_size": 8,
308
  "trial_name": null,
309
  "trial_params": null
310
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5780b3fe6cf6a2b7abc711d493a9d31fc1181c9fff73c0fc0a79ae423a23e2fb
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc814e8346759fc832d3d40fb2efbd9f7f5bf91489499603abb8463206368d6b
3
  size 5176