error577 commited on
Commit
7aab378
·
verified ·
1 Parent(s): a62ae41

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "up_proj",
24
- "down_proj",
25
  "q_proj",
26
- "gate_proj",
27
  "v_proj",
28
- "k_proj",
29
- "o_proj"
 
 
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
23
  "q_proj",
 
24
  "v_proj",
25
+ "gate_proj",
26
+ "o_proj",
27
+ "down_proj",
28
+ "up_proj",
29
+ "k_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a323fc60e398f07df76076effccdfc93386e7f2c06a4d210f5fe11ef1e793cb
3
  size 83945296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cac2011fb82e5057de30ac0f4ffa50549541a1374f6529f80528efcca6fa182
3
  size 83945296
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:782565b9486e1f287d668be0d2421dc3074e2b1b1e242aff627dee195273fa4c
3
  size 43122580
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b41a67e3cf04aab2b4ab76df5e332334d0306c62a9b9e823e5c35bdcca406ab
3
  size 43122580
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cdc4595eb36291107783340eb4d984b80178a3bbeee1592461d47747aabcdadd
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6df819ca8e305cb78b68e11c597c4be358949e29b2d90d1c61804b773554cf08
3
  size 14244
last-checkpoint/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.007842983470912335,
5
  "eval_steps": 500,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
@@ -9,703 +9,703 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 7.842983470912334e-05,
13
- "grad_norm": 8.825519561767578,
14
  "learning_rate": 1e-05,
15
- "loss": 2.8016,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.0001568596694182467,
20
- "grad_norm": 12.083088874816895,
21
  "learning_rate": 2e-05,
22
- "loss": 3.6411,
23
  "step": 2
24
  },
25
  {
26
- "epoch": 0.00023528950412737005,
27
- "grad_norm": 15.9275541305542,
28
  "learning_rate": 3e-05,
29
- "loss": 4.9024,
30
  "step": 3
31
  },
32
  {
33
- "epoch": 0.0003137193388364934,
34
- "grad_norm": 21.615602493286133,
35
  "learning_rate": 4e-05,
36
- "loss": 6.0083,
37
  "step": 4
38
  },
39
  {
40
- "epoch": 0.00039214917354561674,
41
- "grad_norm": 22.12832260131836,
42
  "learning_rate": 5e-05,
43
- "loss": 5.5202,
44
  "step": 5
45
  },
46
  {
47
- "epoch": 0.0004705790082547401,
48
- "grad_norm": 9.948811531066895,
49
  "learning_rate": 6e-05,
50
- "loss": 2.9518,
51
  "step": 6
52
  },
53
  {
54
- "epoch": 0.0005490088429638635,
55
- "grad_norm": 15.534398078918457,
56
  "learning_rate": 7e-05,
57
- "loss": 4.1157,
58
  "step": 7
59
  },
60
  {
61
- "epoch": 0.0006274386776729868,
62
- "grad_norm": 16.559049606323242,
63
  "learning_rate": 8e-05,
64
- "loss": 5.0582,
65
  "step": 8
66
  },
67
  {
68
- "epoch": 0.0007058685123821102,
69
- "grad_norm": 42.421512603759766,
70
  "learning_rate": 9e-05,
71
- "loss": 6.5426,
72
  "step": 9
73
  },
74
  {
75
- "epoch": 0.0007842983470912335,
76
- "grad_norm": 20.62925148010254,
77
  "learning_rate": 0.0001,
78
- "loss": 4.6693,
79
  "step": 10
80
  },
81
  {
82
- "epoch": 0.0008627281818003569,
83
- "grad_norm": 15.661595344543457,
84
  "learning_rate": 9.99695413509548e-05,
85
- "loss": 3.6362,
86
  "step": 11
87
  },
88
  {
89
- "epoch": 0.0009411580165094802,
90
- "grad_norm": 14.2806978225708,
91
  "learning_rate": 9.987820251299122e-05,
92
- "loss": 2.3931,
93
  "step": 12
94
  },
95
  {
96
- "epoch": 0.0010195878512186036,
97
- "grad_norm": 28.024402618408203,
98
  "learning_rate": 9.972609476841367e-05,
99
- "loss": 3.4275,
100
  "step": 13
101
  },
102
  {
103
- "epoch": 0.001098017685927727,
104
- "grad_norm": 16.453929901123047,
105
  "learning_rate": 9.951340343707852e-05,
106
- "loss": 2.1056,
107
  "step": 14
108
  },
109
  {
110
- "epoch": 0.0011764475206368502,
111
- "grad_norm": 22.603105545043945,
112
  "learning_rate": 9.924038765061042e-05,
113
- "loss": 2.0804,
114
  "step": 15
115
  },
116
  {
117
- "epoch": 0.0012548773553459735,
118
- "grad_norm": 25.013137817382812,
119
  "learning_rate": 9.890738003669029e-05,
120
- "loss": 2.4932,
121
  "step": 16
122
  },
123
  {
124
- "epoch": 0.001333307190055097,
125
- "grad_norm": 22.514392852783203,
126
  "learning_rate": 9.851478631379982e-05,
127
- "loss": 2.5763,
128
  "step": 17
129
  },
130
  {
131
- "epoch": 0.0014117370247642203,
132
- "grad_norm": 29.10135841369629,
133
  "learning_rate": 9.806308479691595e-05,
134
- "loss": 1.8406,
135
  "step": 18
136
  },
137
  {
138
- "epoch": 0.0014901668594733436,
139
- "grad_norm": 12.27782917022705,
140
  "learning_rate": 9.755282581475769e-05,
141
- "loss": 1.4448,
142
  "step": 19
143
  },
144
  {
145
- "epoch": 0.001568596694182467,
146
- "grad_norm": 36.448997497558594,
147
  "learning_rate": 9.698463103929542e-05,
148
- "loss": 2.9484,
149
  "step": 20
150
  },
151
  {
152
- "epoch": 0.0016470265288915905,
153
- "grad_norm": 11.227051734924316,
154
  "learning_rate": 9.635919272833938e-05,
155
- "loss": 1.8091,
156
  "step": 21
157
  },
158
  {
159
- "epoch": 0.0017254563636007138,
160
- "grad_norm": 14.083976745605469,
161
  "learning_rate": 9.567727288213005e-05,
162
- "loss": 1.3116,
163
  "step": 22
164
  },
165
  {
166
- "epoch": 0.001803886198309837,
167
- "grad_norm": 14.545560836791992,
168
  "learning_rate": 9.493970231495835e-05,
169
- "loss": 1.6795,
170
  "step": 23
171
  },
172
  {
173
- "epoch": 0.0018823160330189604,
174
- "grad_norm": 15.083060264587402,
175
  "learning_rate": 9.414737964294636e-05,
176
- "loss": 1.7426,
177
  "step": 24
178
  },
179
  {
180
- "epoch": 0.0019607458677280837,
181
- "grad_norm": 14.212522506713867,
182
  "learning_rate": 9.330127018922194e-05,
183
- "loss": 1.732,
184
  "step": 25
185
  },
186
  {
187
- "epoch": 0.002039175702437207,
188
- "grad_norm": 11.611138343811035,
189
  "learning_rate": 9.24024048078213e-05,
190
- "loss": 1.2554,
191
  "step": 26
192
  },
193
  {
194
- "epoch": 0.0021176055371463303,
195
- "grad_norm": 10.914994239807129,
196
  "learning_rate": 9.145187862775209e-05,
197
- "loss": 1.394,
198
  "step": 27
199
  },
200
  {
201
- "epoch": 0.002196035371855454,
202
- "grad_norm": 20.15048599243164,
203
  "learning_rate": 9.045084971874738e-05,
204
- "loss": 2.1326,
205
  "step": 28
206
  },
207
  {
208
- "epoch": 0.0022744652065645773,
209
- "grad_norm": 14.226958274841309,
210
  "learning_rate": 8.940053768033609e-05,
211
- "loss": 1.411,
212
  "step": 29
213
  },
214
  {
215
- "epoch": 0.0023528950412737004,
216
- "grad_norm": 15.308096885681152,
217
  "learning_rate": 8.83022221559489e-05,
218
- "loss": 2.2317,
219
  "step": 30
220
  },
221
  {
222
- "epoch": 0.002431324875982824,
223
- "grad_norm": 37.11000442504883,
224
  "learning_rate": 8.715724127386972e-05,
225
- "loss": 3.0994,
226
  "step": 31
227
  },
228
  {
229
- "epoch": 0.002509754710691947,
230
- "grad_norm": 13.671127319335938,
231
  "learning_rate": 8.596699001693255e-05,
232
- "loss": 2.3658,
233
  "step": 32
234
  },
235
  {
236
- "epoch": 0.0025881845454010705,
237
- "grad_norm": 11.981795310974121,
238
  "learning_rate": 8.473291852294987e-05,
239
- "loss": 1.7347,
240
  "step": 33
241
  },
242
  {
243
- "epoch": 0.002666614380110194,
244
- "grad_norm": 17.430723190307617,
245
  "learning_rate": 8.345653031794292e-05,
246
- "loss": 2.2345,
247
  "step": 34
248
  },
249
  {
250
- "epoch": 0.002745044214819317,
251
- "grad_norm": 14.712089538574219,
252
  "learning_rate": 8.213938048432697e-05,
253
- "loss": 2.2161,
254
  "step": 35
255
  },
256
  {
257
- "epoch": 0.0028234740495284407,
258
- "grad_norm": 12.411870956420898,
259
  "learning_rate": 8.07830737662829e-05,
260
- "loss": 1.6571,
261
  "step": 36
262
  },
263
  {
264
- "epoch": 0.0029019038842375638,
265
- "grad_norm": 11.750947952270508,
266
  "learning_rate": 7.938926261462366e-05,
267
- "loss": 1.3471,
268
  "step": 37
269
  },
270
  {
271
- "epoch": 0.0029803337189466873,
272
- "grad_norm": 18.219520568847656,
273
  "learning_rate": 7.795964517353735e-05,
274
- "loss": 2.2844,
275
  "step": 38
276
  },
277
  {
278
- "epoch": 0.003058763553655811,
279
- "grad_norm": 8.215274810791016,
280
  "learning_rate": 7.649596321166024e-05,
281
- "loss": 1.379,
282
  "step": 39
283
  },
284
  {
285
- "epoch": 0.003137193388364934,
286
- "grad_norm": 10.532532691955566,
287
  "learning_rate": 7.500000000000001e-05,
288
- "loss": 1.8841,
289
  "step": 40
290
  },
291
  {
292
- "epoch": 0.0032156232230740574,
293
- "grad_norm": 12.455815315246582,
294
  "learning_rate": 7.347357813929454e-05,
295
- "loss": 1.7141,
296
  "step": 41
297
  },
298
  {
299
- "epoch": 0.003294053057783181,
300
- "grad_norm": 10.054532051086426,
301
  "learning_rate": 7.191855733945387e-05,
302
- "loss": 1.6398,
303
  "step": 42
304
  },
305
  {
306
- "epoch": 0.003372482892492304,
307
- "grad_norm": 10.613759994506836,
308
  "learning_rate": 7.033683215379002e-05,
309
- "loss": 1.7214,
310
  "step": 43
311
  },
312
  {
313
- "epoch": 0.0034509127272014275,
314
- "grad_norm": 10.810108184814453,
315
  "learning_rate": 6.873032967079561e-05,
316
- "loss": 2.3159,
317
  "step": 44
318
  },
319
  {
320
- "epoch": 0.0035293425619105506,
321
- "grad_norm": 11.261944770812988,
322
  "learning_rate": 6.710100716628344e-05,
323
- "loss": 1.463,
324
  "step": 45
325
  },
326
  {
327
- "epoch": 0.003607772396619674,
328
- "grad_norm": 11.900640487670898,
329
  "learning_rate": 6.545084971874738e-05,
330
- "loss": 1.5287,
331
  "step": 46
332
  },
333
  {
334
- "epoch": 0.0036862022313287977,
335
- "grad_norm": 14.910116195678711,
336
  "learning_rate": 6.378186779084995e-05,
337
- "loss": 2.1966,
338
  "step": 47
339
  },
340
  {
341
- "epoch": 0.0037646320660379208,
342
- "grad_norm": 11.18796157836914,
343
  "learning_rate": 6.209609477998338e-05,
344
- "loss": 1.7811,
345
  "step": 48
346
  },
347
  {
348
- "epoch": 0.0038430619007470443,
349
- "grad_norm": 15.256511688232422,
350
  "learning_rate": 6.0395584540887963e-05,
351
- "loss": 1.3157,
352
  "step": 49
353
  },
354
  {
355
- "epoch": 0.003921491735456167,
356
- "grad_norm": 10.210933685302734,
357
  "learning_rate": 5.868240888334653e-05,
358
- "loss": 1.5167,
359
  "step": 50
360
  },
361
  {
362
- "epoch": 0.003999921570165291,
363
- "grad_norm": 14.052650451660156,
364
  "learning_rate": 5.695865504800327e-05,
365
- "loss": 1.7569,
366
  "step": 51
367
  },
368
  {
369
- "epoch": 0.004078351404874414,
370
- "grad_norm": 14.927163124084473,
371
  "learning_rate": 5.522642316338268e-05,
372
- "loss": 1.4792,
373
  "step": 52
374
  },
375
  {
376
- "epoch": 0.004156781239583538,
377
- "grad_norm": 18.198444366455078,
378
  "learning_rate": 5.348782368720626e-05,
379
- "loss": 3.5506,
380
  "step": 53
381
  },
382
  {
383
- "epoch": 0.004235211074292661,
384
- "grad_norm": 13.294053077697754,
385
  "learning_rate": 5.174497483512506e-05,
386
- "loss": 1.4432,
387
  "step": 54
388
  },
389
  {
390
- "epoch": 0.004313640909001784,
391
- "grad_norm": 11.814456939697266,
392
  "learning_rate": 5e-05,
393
- "loss": 1.4172,
394
  "step": 55
395
  },
396
  {
397
- "epoch": 0.004392070743710908,
398
- "grad_norm": 8.617324829101562,
399
  "learning_rate": 4.825502516487497e-05,
400
- "loss": 1.3786,
401
  "step": 56
402
  },
403
  {
404
- "epoch": 0.004470500578420031,
405
- "grad_norm": 14.102591514587402,
406
  "learning_rate": 4.6512176312793736e-05,
407
- "loss": 1.9422,
408
  "step": 57
409
  },
410
  {
411
- "epoch": 0.004548930413129155,
412
- "grad_norm": 10.432605743408203,
413
  "learning_rate": 4.477357683661734e-05,
414
- "loss": 1.3672,
415
  "step": 58
416
  },
417
  {
418
- "epoch": 0.004627360247838277,
419
- "grad_norm": 11.157849311828613,
420
  "learning_rate": 4.3041344951996746e-05,
421
- "loss": 1.5081,
422
  "step": 59
423
  },
424
  {
425
- "epoch": 0.004705790082547401,
426
- "grad_norm": 10.752006530761719,
427
  "learning_rate": 4.131759111665349e-05,
428
- "loss": 1.763,
429
  "step": 60
430
  },
431
  {
432
- "epoch": 0.004784219917256524,
433
- "grad_norm": 7.7650017738342285,
434
  "learning_rate": 3.960441545911204e-05,
435
- "loss": 1.338,
436
  "step": 61
437
  },
438
  {
439
- "epoch": 0.004862649751965648,
440
- "grad_norm": 9.609498977661133,
441
  "learning_rate": 3.790390522001662e-05,
442
- "loss": 1.5723,
443
  "step": 62
444
  },
445
  {
446
- "epoch": 0.004941079586674771,
447
- "grad_norm": 10.62063980102539,
448
  "learning_rate": 3.6218132209150045e-05,
449
- "loss": 1.6656,
450
  "step": 63
451
  },
452
  {
453
- "epoch": 0.005019509421383894,
454
- "grad_norm": 8.951350212097168,
455
  "learning_rate": 3.4549150281252636e-05,
456
- "loss": 1.2429,
457
  "step": 64
458
  },
459
  {
460
- "epoch": 0.005097939256093018,
461
- "grad_norm": 18.992971420288086,
462
  "learning_rate": 3.289899283371657e-05,
463
- "loss": 3.1177,
464
  "step": 65
465
  },
466
  {
467
- "epoch": 0.005176369090802141,
468
- "grad_norm": 13.394461631774902,
469
  "learning_rate": 3.12696703292044e-05,
470
- "loss": 1.8502,
471
  "step": 66
472
  },
473
  {
474
- "epoch": 0.005254798925511265,
475
- "grad_norm": 12.403133392333984,
476
  "learning_rate": 2.9663167846209998e-05,
477
- "loss": 1.1488,
478
  "step": 67
479
  },
480
  {
481
- "epoch": 0.005333228760220388,
482
- "grad_norm": 7.903024673461914,
483
  "learning_rate": 2.8081442660546125e-05,
484
- "loss": 1.2091,
485
  "step": 68
486
  },
487
  {
488
- "epoch": 0.005411658594929511,
489
- "grad_norm": 11.113040924072266,
490
  "learning_rate": 2.6526421860705473e-05,
491
- "loss": 1.6937,
492
  "step": 69
493
  },
494
  {
495
- "epoch": 0.005490088429638634,
496
- "grad_norm": 15.218375205993652,
497
  "learning_rate": 2.500000000000001e-05,
498
- "loss": 1.9523,
499
  "step": 70
500
  },
501
  {
502
- "epoch": 0.005568518264347758,
503
- "grad_norm": 10.67128849029541,
504
  "learning_rate": 2.350403678833976e-05,
505
- "loss": 1.3664,
506
  "step": 71
507
  },
508
  {
509
- "epoch": 0.005646948099056881,
510
- "grad_norm": 9.806090354919434,
511
  "learning_rate": 2.2040354826462668e-05,
512
- "loss": 1.4883,
513
  "step": 72
514
  },
515
  {
516
- "epoch": 0.005725377933766005,
517
- "grad_norm": 11.227718353271484,
518
  "learning_rate": 2.061073738537635e-05,
519
- "loss": 1.1364,
520
  "step": 73
521
  },
522
  {
523
- "epoch": 0.0058038077684751275,
524
- "grad_norm": 17.53663444519043,
525
  "learning_rate": 1.9216926233717085e-05,
526
- "loss": 1.8685,
527
  "step": 74
528
  },
529
  {
530
- "epoch": 0.005882237603184251,
531
- "grad_norm": 6.851722240447998,
532
  "learning_rate": 1.7860619515673033e-05,
533
- "loss": 1.0575,
534
  "step": 75
535
  },
536
  {
537
- "epoch": 0.0059606674378933746,
538
- "grad_norm": 9.295647621154785,
539
  "learning_rate": 1.6543469682057106e-05,
540
- "loss": 1.0835,
541
  "step": 76
542
  },
543
  {
544
- "epoch": 0.006039097272602498,
545
- "grad_norm": 9.07209300994873,
546
  "learning_rate": 1.526708147705013e-05,
547
- "loss": 1.1428,
548
  "step": 77
549
  },
550
  {
551
- "epoch": 0.006117527107311622,
552
- "grad_norm": 8.919276237487793,
553
  "learning_rate": 1.4033009983067452e-05,
554
- "loss": 1.6744,
555
  "step": 78
556
  },
557
  {
558
- "epoch": 0.006195956942020745,
559
- "grad_norm": 14.970587730407715,
560
  "learning_rate": 1.2842758726130283e-05,
561
- "loss": 1.7913,
562
  "step": 79
563
  },
564
  {
565
- "epoch": 0.006274386776729868,
566
- "grad_norm": 7.171314239501953,
567
  "learning_rate": 1.1697777844051105e-05,
568
- "loss": 1.5177,
569
  "step": 80
570
  },
571
  {
572
- "epoch": 0.006352816611438991,
573
- "grad_norm": 7.902921676635742,
574
  "learning_rate": 1.0599462319663905e-05,
575
- "loss": 1.3037,
576
  "step": 81
577
  },
578
  {
579
- "epoch": 0.006431246446148115,
580
- "grad_norm": 6.716002941131592,
581
  "learning_rate": 9.549150281252633e-06,
582
- "loss": 0.962,
583
  "step": 82
584
  },
585
  {
586
- "epoch": 0.006509676280857238,
587
- "grad_norm": 10.144266128540039,
588
  "learning_rate": 8.548121372247918e-06,
589
- "loss": 1.4771,
590
  "step": 83
591
  },
592
  {
593
- "epoch": 0.006588106115566362,
594
- "grad_norm": 8.199270248413086,
595
  "learning_rate": 7.597595192178702e-06,
596
- "loss": 1.2546,
597
  "step": 84
598
  },
599
  {
600
- "epoch": 0.0066665359502754845,
601
- "grad_norm": 11.108482360839844,
602
  "learning_rate": 6.698729810778065e-06,
603
- "loss": 1.3973,
604
  "step": 85
605
  },
606
  {
607
- "epoch": 0.006744965784984608,
608
- "grad_norm": 8.652900695800781,
609
  "learning_rate": 5.852620357053651e-06,
610
- "loss": 1.3762,
611
  "step": 86
612
  },
613
  {
614
- "epoch": 0.0068233956196937316,
615
- "grad_norm": 14.958466529846191,
616
  "learning_rate": 5.060297685041659e-06,
617
- "loss": 1.7316,
618
  "step": 87
619
  },
620
  {
621
- "epoch": 0.006901825454402855,
622
- "grad_norm": 6.1512227058410645,
623
  "learning_rate": 4.322727117869951e-06,
624
- "loss": 1.1393,
625
  "step": 88
626
  },
627
  {
628
- "epoch": 0.006980255289111979,
629
- "grad_norm": 9.555928230285645,
630
  "learning_rate": 3.6408072716606346e-06,
631
- "loss": 1.3302,
632
  "step": 89
633
  },
634
  {
635
- "epoch": 0.007058685123821101,
636
- "grad_norm": 9.107611656188965,
637
  "learning_rate": 3.0153689607045845e-06,
638
- "loss": 1.2227,
639
  "step": 90
640
  },
641
  {
642
- "epoch": 0.007137114958530225,
643
- "grad_norm": 5.744083881378174,
644
  "learning_rate": 2.4471741852423237e-06,
645
- "loss": 1.0086,
646
  "step": 91
647
  },
648
  {
649
- "epoch": 0.007215544793239348,
650
- "grad_norm": 13.552300453186035,
651
  "learning_rate": 1.9369152030840556e-06,
652
- "loss": 1.7774,
653
  "step": 92
654
  },
655
  {
656
- "epoch": 0.007293974627948472,
657
- "grad_norm": 16.62940788269043,
658
  "learning_rate": 1.4852136862001764e-06,
659
- "loss": 1.7008,
660
  "step": 93
661
  },
662
  {
663
- "epoch": 0.007372404462657595,
664
- "grad_norm": 12.544093132019043,
665
  "learning_rate": 1.0926199633097157e-06,
666
- "loss": 1.6784,
667
  "step": 94
668
  },
669
  {
670
- "epoch": 0.007450834297366718,
671
- "grad_norm": 10.599544525146484,
672
  "learning_rate": 7.596123493895991e-07,
673
- "loss": 1.5247,
674
  "step": 95
675
  },
676
  {
677
- "epoch": 0.0075292641320758415,
678
- "grad_norm": 11.64220142364502,
679
  "learning_rate": 4.865965629214819e-07,
680
- "loss": 1.5947,
681
  "step": 96
682
  },
683
  {
684
- "epoch": 0.007607693966784965,
685
- "grad_norm": 7.192104339599609,
686
  "learning_rate": 2.7390523158633554e-07,
687
- "loss": 1.3408,
688
  "step": 97
689
  },
690
  {
691
- "epoch": 0.0076861238014940885,
692
- "grad_norm": 12.79780101776123,
693
  "learning_rate": 1.2179748700879012e-07,
694
- "loss": 2.0013,
695
  "step": 98
696
  },
697
  {
698
- "epoch": 0.007764553636203212,
699
- "grad_norm": 8.97789478302002,
700
  "learning_rate": 3.04586490452119e-08,
701
- "loss": 1.5413,
702
  "step": 99
703
  },
704
  {
705
- "epoch": 0.007842983470912335,
706
- "grad_norm": 13.65710735321045,
707
  "learning_rate": 0.0,
708
- "loss": 2.3571,
709
  "step": 100
710
  }
711
  ],
@@ -726,7 +726,7 @@
726
  "attributes": {}
727
  }
728
  },
729
- "total_flos": 9442519318265856.0,
730
  "train_batch_size": 1,
731
  "trial_name": null,
732
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.007763824459928961,
5
  "eval_steps": 500,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 7.763824459928961e-05,
13
+ "grad_norm": 15.173023223876953,
14
  "learning_rate": 1e-05,
15
+ "loss": 3.6366,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.00015527648919857922,
20
+ "grad_norm": 18.251972198486328,
21
  "learning_rate": 2e-05,
22
+ "loss": 5.1138,
23
  "step": 2
24
  },
25
  {
26
+ "epoch": 0.00023291473379786882,
27
+ "grad_norm": 18.701696395874023,
28
  "learning_rate": 3e-05,
29
+ "loss": 4.9591,
30
  "step": 3
31
  },
32
  {
33
+ "epoch": 0.00031055297839715845,
34
+ "grad_norm": 19.8187198638916,
35
  "learning_rate": 4e-05,
36
+ "loss": 4.3723,
37
  "step": 4
38
  },
39
  {
40
+ "epoch": 0.00038819122299644805,
41
+ "grad_norm": 18.29945945739746,
42
  "learning_rate": 5e-05,
43
+ "loss": 5.9455,
44
  "step": 5
45
  },
46
  {
47
+ "epoch": 0.00046582946759573764,
48
+ "grad_norm": 32.43329620361328,
49
  "learning_rate": 6e-05,
50
+ "loss": 8.4791,
51
  "step": 6
52
  },
53
  {
54
+ "epoch": 0.0005434677121950273,
55
+ "grad_norm": 20.790515899658203,
56
  "learning_rate": 7e-05,
57
+ "loss": 5.3114,
58
  "step": 7
59
  },
60
  {
61
+ "epoch": 0.0006211059567943169,
62
+ "grad_norm": 17.102371215820312,
63
  "learning_rate": 8e-05,
64
+ "loss": 4.3752,
65
  "step": 8
66
  },
67
  {
68
+ "epoch": 0.0006987442013936065,
69
+ "grad_norm": 16.773916244506836,
70
  "learning_rate": 9e-05,
71
+ "loss": 4.1246,
72
  "step": 9
73
  },
74
  {
75
+ "epoch": 0.0007763824459928961,
76
+ "grad_norm": 12.671195030212402,
77
  "learning_rate": 0.0001,
78
+ "loss": 2.7808,
79
  "step": 10
80
  },
81
  {
82
+ "epoch": 0.0008540206905921857,
83
+ "grad_norm": 18.52830696105957,
84
  "learning_rate": 9.99695413509548e-05,
85
+ "loss": 3.1277,
86
  "step": 11
87
  },
88
  {
89
+ "epoch": 0.0009316589351914753,
90
+ "grad_norm": 18.569049835205078,
91
  "learning_rate": 9.987820251299122e-05,
92
+ "loss": 3.1262,
93
  "step": 12
94
  },
95
  {
96
+ "epoch": 0.0010092971797907649,
97
+ "grad_norm": 19.517616271972656,
98
  "learning_rate": 9.972609476841367e-05,
99
+ "loss": 2.7361,
100
  "step": 13
101
  },
102
  {
103
+ "epoch": 0.0010869354243900546,
104
+ "grad_norm": 22.662939071655273,
105
  "learning_rate": 9.951340343707852e-05,
106
+ "loss": 2.5586,
107
  "step": 14
108
  },
109
  {
110
+ "epoch": 0.001164573668989344,
111
+ "grad_norm": 35.87885665893555,
112
  "learning_rate": 9.924038765061042e-05,
113
+ "loss": 2.4555,
114
  "step": 15
115
  },
116
  {
117
+ "epoch": 0.0012422119135886338,
118
+ "grad_norm": 18.5505428314209,
119
  "learning_rate": 9.890738003669029e-05,
120
+ "loss": 1.974,
121
  "step": 16
122
  },
123
  {
124
+ "epoch": 0.0013198501581879233,
125
+ "grad_norm": 13.761811256408691,
126
  "learning_rate": 9.851478631379982e-05,
127
+ "loss": 1.4696,
128
  "step": 17
129
  },
130
  {
131
+ "epoch": 0.001397488402787213,
132
+ "grad_norm": 14.877791404724121,
133
  "learning_rate": 9.806308479691595e-05,
134
+ "loss": 1.7532,
135
  "step": 18
136
  },
137
  {
138
+ "epoch": 0.0014751266473865027,
139
+ "grad_norm": 17.029918670654297,
140
  "learning_rate": 9.755282581475769e-05,
141
+ "loss": 1.7558,
142
  "step": 19
143
  },
144
  {
145
+ "epoch": 0.0015527648919857922,
146
+ "grad_norm": 17.873872756958008,
147
  "learning_rate": 9.698463103929542e-05,
148
+ "loss": 1.7163,
149
  "step": 20
150
  },
151
  {
152
+ "epoch": 0.0016304031365850819,
153
+ "grad_norm": 10.733744621276855,
154
  "learning_rate": 9.635919272833938e-05,
155
+ "loss": 1.2642,
156
  "step": 21
157
  },
158
  {
159
+ "epoch": 0.0017080413811843714,
160
+ "grad_norm": 16.097576141357422,
161
  "learning_rate": 9.567727288213005e-05,
162
+ "loss": 1.9344,
163
  "step": 22
164
  },
165
  {
166
+ "epoch": 0.001785679625783661,
167
+ "grad_norm": 14.290857315063477,
168
  "learning_rate": 9.493970231495835e-05,
169
+ "loss": 1.3434,
170
  "step": 23
171
  },
172
  {
173
+ "epoch": 0.0018633178703829506,
174
+ "grad_norm": 12.831293106079102,
175
  "learning_rate": 9.414737964294636e-05,
176
+ "loss": 1.7986,
177
  "step": 24
178
  },
179
  {
180
+ "epoch": 0.0019409561149822403,
181
+ "grad_norm": 25.20969581604004,
182
  "learning_rate": 9.330127018922194e-05,
183
+ "loss": 2.0613,
184
  "step": 25
185
  },
186
  {
187
+ "epoch": 0.0020185943595815298,
188
+ "grad_norm": 11.39193344116211,
189
  "learning_rate": 9.24024048078213e-05,
190
+ "loss": 1.3613,
191
  "step": 26
192
  },
193
  {
194
+ "epoch": 0.0020962326041808195,
195
+ "grad_norm": 20.78348731994629,
196
  "learning_rate": 9.145187862775209e-05,
197
+ "loss": 2.0145,
198
  "step": 27
199
  },
200
  {
201
+ "epoch": 0.002173870848780109,
202
+ "grad_norm": 13.54377555847168,
203
  "learning_rate": 9.045084971874738e-05,
204
+ "loss": 1.6814,
205
  "step": 28
206
  },
207
  {
208
+ "epoch": 0.002251509093379399,
209
+ "grad_norm": 9.71171760559082,
210
  "learning_rate": 8.940053768033609e-05,
211
+ "loss": 1.2467,
212
  "step": 29
213
  },
214
  {
215
+ "epoch": 0.002329147337978688,
216
+ "grad_norm": 10.445706367492676,
217
  "learning_rate": 8.83022221559489e-05,
218
+ "loss": 1.6891,
219
  "step": 30
220
  },
221
  {
222
+ "epoch": 0.002406785582577978,
223
+ "grad_norm": 16.003211975097656,
224
  "learning_rate": 8.715724127386972e-05,
225
+ "loss": 2.4213,
226
  "step": 31
227
  },
228
  {
229
+ "epoch": 0.0024844238271772676,
230
+ "grad_norm": 12.86924934387207,
231
  "learning_rate": 8.596699001693255e-05,
232
+ "loss": 1.8616,
233
  "step": 32
234
  },
235
  {
236
+ "epoch": 0.0025620620717765573,
237
+ "grad_norm": 8.061415672302246,
238
  "learning_rate": 8.473291852294987e-05,
239
+ "loss": 1.5254,
240
  "step": 33
241
  },
242
  {
243
+ "epoch": 0.0026397003163758466,
244
+ "grad_norm": 21.588794708251953,
245
  "learning_rate": 8.345653031794292e-05,
246
+ "loss": 2.4648,
247
  "step": 34
248
  },
249
  {
250
+ "epoch": 0.0027173385609751363,
251
+ "grad_norm": 7.5725603103637695,
252
  "learning_rate": 8.213938048432697e-05,
253
+ "loss": 1.3198,
254
  "step": 35
255
  },
256
  {
257
+ "epoch": 0.002794976805574426,
258
+ "grad_norm": 11.222454071044922,
259
  "learning_rate": 8.07830737662829e-05,
260
+ "loss": 1.7226,
261
  "step": 36
262
  },
263
  {
264
+ "epoch": 0.0028726150501737157,
265
+ "grad_norm": 12.333401679992676,
266
  "learning_rate": 7.938926261462366e-05,
267
+ "loss": 1.492,
268
  "step": 37
269
  },
270
  {
271
+ "epoch": 0.0029502532947730054,
272
+ "grad_norm": 14.928689002990723,
273
  "learning_rate": 7.795964517353735e-05,
274
+ "loss": 2.1283,
275
  "step": 38
276
  },
277
  {
278
+ "epoch": 0.0030278915393722947,
279
+ "grad_norm": 11.8944091796875,
280
  "learning_rate": 7.649596321166024e-05,
281
+ "loss": 1.4857,
282
  "step": 39
283
  },
284
  {
285
+ "epoch": 0.0031055297839715844,
286
+ "grad_norm": 9.366742134094238,
287
  "learning_rate": 7.500000000000001e-05,
288
+ "loss": 1.5347,
289
  "step": 40
290
  },
291
  {
292
+ "epoch": 0.003183168028570874,
293
+ "grad_norm": 9.03040599822998,
294
  "learning_rate": 7.347357813929454e-05,
295
+ "loss": 0.9668,
296
  "step": 41
297
  },
298
  {
299
+ "epoch": 0.0032608062731701638,
300
+ "grad_norm": 12.494256973266602,
301
  "learning_rate": 7.191855733945387e-05,
302
+ "loss": 1.4275,
303
  "step": 42
304
  },
305
  {
306
+ "epoch": 0.003338444517769453,
307
+ "grad_norm": 14.382235527038574,
308
  "learning_rate": 7.033683215379002e-05,
309
+ "loss": 1.625,
310
  "step": 43
311
  },
312
  {
313
+ "epoch": 0.0034160827623687428,
314
+ "grad_norm": 9.916275024414062,
315
  "learning_rate": 6.873032967079561e-05,
316
+ "loss": 1.4007,
317
  "step": 44
318
  },
319
  {
320
+ "epoch": 0.0034937210069680325,
321
+ "grad_norm": 7.911404132843018,
322
  "learning_rate": 6.710100716628344e-05,
323
+ "loss": 1.344,
324
  "step": 45
325
  },
326
  {
327
+ "epoch": 0.003571359251567322,
328
+ "grad_norm": 20.48213768005371,
329
  "learning_rate": 6.545084971874738e-05,
330
+ "loss": 1.4345,
331
  "step": 46
332
  },
333
  {
334
+ "epoch": 0.003648997496166612,
335
+ "grad_norm": 17.24418830871582,
336
  "learning_rate": 6.378186779084995e-05,
337
+ "loss": 2.0092,
338
  "step": 47
339
  },
340
  {
341
+ "epoch": 0.003726635740765901,
342
+ "grad_norm": 18.877300262451172,
343
  "learning_rate": 6.209609477998338e-05,
344
+ "loss": 1.8605,
345
  "step": 48
346
  },
347
  {
348
+ "epoch": 0.003804273985365191,
349
+ "grad_norm": 15.295802116394043,
350
  "learning_rate": 6.0395584540887963e-05,
351
+ "loss": 1.7475,
352
  "step": 49
353
  },
354
  {
355
+ "epoch": 0.0038819122299644806,
356
+ "grad_norm": 9.2744779586792,
357
  "learning_rate": 5.868240888334653e-05,
358
+ "loss": 1.5498,
359
  "step": 50
360
  },
361
  {
362
+ "epoch": 0.00395955047456377,
363
+ "grad_norm": 12.479969024658203,
364
  "learning_rate": 5.695865504800327e-05,
365
+ "loss": 1.6775,
366
  "step": 51
367
  },
368
  {
369
+ "epoch": 0.0040371887191630595,
370
+ "grad_norm": 19.23907470703125,
371
  "learning_rate": 5.522642316338268e-05,
372
+ "loss": 1.7539,
373
  "step": 52
374
  },
375
  {
376
+ "epoch": 0.00411482696376235,
377
+ "grad_norm": 12.233394622802734,
378
  "learning_rate": 5.348782368720626e-05,
379
+ "loss": 1.6477,
380
  "step": 53
381
  },
382
  {
383
+ "epoch": 0.004192465208361639,
384
+ "grad_norm": 7.942619800567627,
385
  "learning_rate": 5.174497483512506e-05,
386
+ "loss": 1.3514,
387
  "step": 54
388
  },
389
  {
390
+ "epoch": 0.004270103452960928,
391
+ "grad_norm": 7.320205211639404,
392
  "learning_rate": 5e-05,
393
+ "loss": 1.0451,
394
  "step": 55
395
  },
396
  {
397
+ "epoch": 0.004347741697560218,
398
+ "grad_norm": 18.8829288482666,
399
  "learning_rate": 4.825502516487497e-05,
400
+ "loss": 2.23,
401
  "step": 56
402
  },
403
  {
404
+ "epoch": 0.004425379942159508,
405
+ "grad_norm": 11.247135162353516,
406
  "learning_rate": 4.6512176312793736e-05,
407
+ "loss": 1.361,
408
  "step": 57
409
  },
410
  {
411
+ "epoch": 0.004503018186758798,
412
+ "grad_norm": 17.50565528869629,
413
  "learning_rate": 4.477357683661734e-05,
414
+ "loss": 1.7137,
415
  "step": 58
416
  },
417
  {
418
+ "epoch": 0.004580656431358087,
419
+ "grad_norm": 15.826338768005371,
420
  "learning_rate": 4.3041344951996746e-05,
421
+ "loss": 2.1369,
422
  "step": 59
423
  },
424
  {
425
+ "epoch": 0.004658294675957376,
426
+ "grad_norm": 12.451811790466309,
427
  "learning_rate": 4.131759111665349e-05,
428
+ "loss": 1.827,
429
  "step": 60
430
  },
431
  {
432
+ "epoch": 0.0047359329205566665,
433
+ "grad_norm": 7.233865737915039,
434
  "learning_rate": 3.960441545911204e-05,
435
+ "loss": 1.2302,
436
  "step": 61
437
  },
438
  {
439
+ "epoch": 0.004813571165155956,
440
+ "grad_norm": 15.002647399902344,
441
  "learning_rate": 3.790390522001662e-05,
442
+ "loss": 1.7143,
443
  "step": 62
444
  },
445
  {
446
+ "epoch": 0.004891209409755245,
447
+ "grad_norm": 12.043486595153809,
448
  "learning_rate": 3.6218132209150045e-05,
449
+ "loss": 1.6653,
450
  "step": 63
451
  },
452
  {
453
+ "epoch": 0.004968847654354535,
454
+ "grad_norm": 11.110363960266113,
455
  "learning_rate": 3.4549150281252636e-05,
456
+ "loss": 1.5404,
457
  "step": 64
458
  },
459
  {
460
+ "epoch": 0.0050464858989538244,
461
+ "grad_norm": 9.822565078735352,
462
  "learning_rate": 3.289899283371657e-05,
463
+ "loss": 1.4809,
464
  "step": 65
465
  },
466
  {
467
+ "epoch": 0.005124124143553115,
468
+ "grad_norm": 11.402060508728027,
469
  "learning_rate": 3.12696703292044e-05,
470
+ "loss": 1.6257,
471
  "step": 66
472
  },
473
  {
474
+ "epoch": 0.005201762388152404,
475
+ "grad_norm": 9.092423439025879,
476
  "learning_rate": 2.9663167846209998e-05,
477
+ "loss": 1.1908,
478
  "step": 67
479
  },
480
  {
481
+ "epoch": 0.005279400632751693,
482
+ "grad_norm": 16.00357437133789,
483
  "learning_rate": 2.8081442660546125e-05,
484
+ "loss": 2.0899,
485
  "step": 68
486
  },
487
  {
488
+ "epoch": 0.005357038877350983,
489
+ "grad_norm": 11.897706031799316,
490
  "learning_rate": 2.6526421860705473e-05,
491
+ "loss": 1.606,
492
  "step": 69
493
  },
494
  {
495
+ "epoch": 0.0054346771219502725,
496
+ "grad_norm": 12.209402084350586,
497
  "learning_rate": 2.500000000000001e-05,
498
+ "loss": 1.6851,
499
  "step": 70
500
  },
501
  {
502
+ "epoch": 0.005512315366549563,
503
+ "grad_norm": 10.662795066833496,
504
  "learning_rate": 2.350403678833976e-05,
505
+ "loss": 1.7668,
506
  "step": 71
507
  },
508
  {
509
+ "epoch": 0.005589953611148852,
510
+ "grad_norm": 8.787954330444336,
511
  "learning_rate": 2.2040354826462668e-05,
512
+ "loss": 1.4363,
513
  "step": 72
514
  },
515
  {
516
+ "epoch": 0.005667591855748141,
517
+ "grad_norm": 7.407290935516357,
518
  "learning_rate": 2.061073738537635e-05,
519
+ "loss": 1.4621,
520
  "step": 73
521
  },
522
  {
523
+ "epoch": 0.005745230100347431,
524
+ "grad_norm": 12.64970588684082,
525
  "learning_rate": 1.9216926233717085e-05,
526
+ "loss": 1.5502,
527
  "step": 74
528
  },
529
  {
530
+ "epoch": 0.005822868344946721,
531
+ "grad_norm": 15.698612213134766,
532
  "learning_rate": 1.7860619515673033e-05,
533
+ "loss": 1.7476,
534
  "step": 75
535
  },
536
  {
537
+ "epoch": 0.005900506589546011,
538
+ "grad_norm": 9.95603084564209,
539
  "learning_rate": 1.6543469682057106e-05,
540
+ "loss": 1.2277,
541
  "step": 76
542
  },
543
  {
544
+ "epoch": 0.0059781448341453,
545
+ "grad_norm": 21.470365524291992,
546
  "learning_rate": 1.526708147705013e-05,
547
+ "loss": 2.0358,
548
  "step": 77
549
  },
550
  {
551
+ "epoch": 0.006055783078744589,
552
+ "grad_norm": 8.216337203979492,
553
  "learning_rate": 1.4033009983067452e-05,
554
+ "loss": 0.8807,
555
  "step": 78
556
  },
557
  {
558
+ "epoch": 0.0061334213233438795,
559
+ "grad_norm": 8.098356246948242,
560
  "learning_rate": 1.2842758726130283e-05,
561
+ "loss": 1.2345,
562
  "step": 79
563
  },
564
  {
565
+ "epoch": 0.006211059567943169,
566
+ "grad_norm": 9.083595275878906,
567
  "learning_rate": 1.1697777844051105e-05,
568
+ "loss": 1.5091,
569
  "step": 80
570
  },
571
  {
572
+ "epoch": 0.006288697812542458,
573
+ "grad_norm": 16.64013671875,
574
  "learning_rate": 1.0599462319663905e-05,
575
+ "loss": 2.3048,
576
  "step": 81
577
  },
578
  {
579
+ "epoch": 0.006366336057141748,
580
+ "grad_norm": 8.369383811950684,
581
  "learning_rate": 9.549150281252633e-06,
582
+ "loss": 1.0547,
583
  "step": 82
584
  },
585
  {
586
+ "epoch": 0.006443974301741037,
587
+ "grad_norm": 7.26373291015625,
588
  "learning_rate": 8.548121372247918e-06,
589
+ "loss": 1.1388,
590
  "step": 83
591
  },
592
  {
593
+ "epoch": 0.0065216125463403276,
594
+ "grad_norm": 13.645014762878418,
595
  "learning_rate": 7.597595192178702e-06,
596
+ "loss": 1.0994,
597
  "step": 84
598
  },
599
  {
600
+ "epoch": 0.006599250790939617,
601
+ "grad_norm": 12.321860313415527,
602
  "learning_rate": 6.698729810778065e-06,
603
+ "loss": 1.5612,
604
  "step": 85
605
  },
606
  {
607
+ "epoch": 0.006676889035538906,
608
+ "grad_norm": 13.168002128601074,
609
  "learning_rate": 5.852620357053651e-06,
610
+ "loss": 1.3202,
611
  "step": 86
612
  },
613
  {
614
+ "epoch": 0.006754527280138196,
615
+ "grad_norm": 7.88428258895874,
616
  "learning_rate": 5.060297685041659e-06,
617
+ "loss": 1.0294,
618
  "step": 87
619
  },
620
  {
621
+ "epoch": 0.0068321655247374855,
622
+ "grad_norm": 10.311480522155762,
623
  "learning_rate": 4.322727117869951e-06,
624
+ "loss": 1.3795,
625
  "step": 88
626
  },
627
  {
628
+ "epoch": 0.006909803769336776,
629
+ "grad_norm": 10.419498443603516,
630
  "learning_rate": 3.6408072716606346e-06,
631
+ "loss": 1.1515,
632
  "step": 89
633
  },
634
  {
635
+ "epoch": 0.006987442013936065,
636
+ "grad_norm": 7.458017349243164,
637
  "learning_rate": 3.0153689607045845e-06,
638
+ "loss": 1.0061,
639
  "step": 90
640
  },
641
  {
642
+ "epoch": 0.007065080258535354,
643
+ "grad_norm": 14.859030723571777,
644
  "learning_rate": 2.4471741852423237e-06,
645
+ "loss": 2.0989,
646
  "step": 91
647
  },
648
  {
649
+ "epoch": 0.007142718503134644,
650
+ "grad_norm": 10.113492012023926,
651
  "learning_rate": 1.9369152030840556e-06,
652
+ "loss": 1.0713,
653
  "step": 92
654
  },
655
  {
656
+ "epoch": 0.007220356747733934,
657
+ "grad_norm": 12.123403549194336,
658
  "learning_rate": 1.4852136862001764e-06,
659
+ "loss": 1.1804,
660
  "step": 93
661
  },
662
  {
663
+ "epoch": 0.007297994992333224,
664
+ "grad_norm": 8.072012901306152,
665
  "learning_rate": 1.0926199633097157e-06,
666
+ "loss": 1.4086,
667
  "step": 94
668
  },
669
  {
670
+ "epoch": 0.007375633236932513,
671
+ "grad_norm": 13.728706359863281,
672
  "learning_rate": 7.596123493895991e-07,
673
+ "loss": 1.728,
674
  "step": 95
675
  },
676
  {
677
+ "epoch": 0.007453271481531802,
678
+ "grad_norm": 14.608238220214844,
679
  "learning_rate": 4.865965629214819e-07,
680
+ "loss": 1.2751,
681
  "step": 96
682
  },
683
  {
684
+ "epoch": 0.0075309097261310924,
685
+ "grad_norm": 10.540985107421875,
686
  "learning_rate": 2.7390523158633554e-07,
687
+ "loss": 1.1588,
688
  "step": 97
689
  },
690
  {
691
+ "epoch": 0.007608547970730382,
692
+ "grad_norm": 8.851415634155273,
693
  "learning_rate": 1.2179748700879012e-07,
694
+ "loss": 1.4473,
695
  "step": 98
696
  },
697
  {
698
+ "epoch": 0.007686186215329671,
699
+ "grad_norm": 9.513968467712402,
700
  "learning_rate": 3.04586490452119e-08,
701
+ "loss": 1.1629,
702
  "step": 99
703
  },
704
  {
705
+ "epoch": 0.007763824459928961,
706
+ "grad_norm": 14.992069244384766,
707
  "learning_rate": 0.0,
708
+ "loss": 1.3815,
709
  "step": 100
710
  }
711
  ],
 
726
  "attributes": {}
727
  }
728
  },
729
+ "total_flos": 9376794125795328.0,
730
  "train_batch_size": 1,
731
  "trial_name": null,
732
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe8eb4a561fe614bf9dfc40cf2e5c3a17aeb1eabb7efb2bc9d4d362489af9314
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f23284b418e64ba8eb6a0fb5f9fbe8b5e6a20ebb48a7ddd92ce031e4b2486d95
3
  size 6776