error577 commited on
Commit
37b2614
·
verified ·
1 Parent(s): b5774f4

Training in progress, step 50, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -16,17 +16,17 @@
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
- "r": 8,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "q_proj",
24
- "k_proj",
25
- "up_proj",
26
- "v_proj",
27
  "down_proj",
 
 
 
28
  "gate_proj",
29
- "o_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
+ "r": 32,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
 
 
23
  "down_proj",
24
+ "k_proj",
25
+ "o_proj",
26
+ "q_proj",
27
  "gate_proj",
28
+ "up_proj",
29
+ "v_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f2c54b3cff7229bba3a337321576ca3fbedcde46f10b6c700245830c01cb495
3
- size 80013120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43756c87d3c353cd42eeb6185a59e755a36f9b382d2d1141fdd67b110a8adbc4
3
+ size 319876032
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79bc646c7471cb3943c0b5456f615d091883e21fcd695a7c0aa6311ff2dd361a
3
- size 41119636
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2df7d962df4e50580d38c384c9f5830d70e0b87aebfd1ee3eca672bf2f43f95
3
+ size 639908666
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5da8984c55f90689ec5dc6254808c095ed22f24233bafba7be5034f696b9c85
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58ae2ba232b823e0685960e3c048b41a588f00aea0d1bea73208b1f786b5c9f9
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a9299ec7d0989f843c66221f6a5f12c76f22cfda8e3a2897dd9a527db5b37854
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f48d04a21be75a42496761f8a6d10bd6bbb09a3805770c41b54fa6f987df24ff
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,756 +1,407 @@
1
  {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 0.2,
5
  "eval_steps": 50,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.002,
13
- "grad_norm": 0.3986969590187073,
14
- "learning_rate": 0.0001,
15
- "loss": 2.7769,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.002,
20
  "eval_loss": 3.0125324726104736,
21
- "eval_runtime": 4.8013,
22
- "eval_samples_per_second": 4.374,
23
- "eval_steps_per_second": 4.374,
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.004,
28
- "grad_norm": 0.5986809730529785,
29
- "learning_rate": 0.0002,
30
- "loss": 2.9521,
31
  "step": 2
32
  },
33
  {
34
- "epoch": 0.006,
35
- "grad_norm": 0.595142662525177,
36
- "learning_rate": 0.0003,
37
- "loss": 2.955,
38
  "step": 3
39
  },
40
  {
41
- "epoch": 0.008,
42
- "grad_norm": 0.7013932466506958,
43
- "learning_rate": 0.0004,
44
- "loss": 2.9037,
45
  "step": 4
46
  },
47
  {
48
- "epoch": 0.01,
49
- "grad_norm": 1.5847638845443726,
50
- "learning_rate": 0.0005,
51
- "loss": 2.9706,
52
  "step": 5
53
  },
54
  {
55
- "epoch": 0.012,
56
- "grad_norm": 1.6309813261032104,
57
- "learning_rate": 0.0006,
58
- "loss": 2.75,
59
  "step": 6
60
  },
61
  {
62
- "epoch": 0.014,
63
- "grad_norm": 1.3442208766937256,
64
- "learning_rate": 0.0007,
65
- "loss": 2.5161,
66
  "step": 7
67
  },
68
  {
69
- "epoch": 0.016,
70
- "grad_norm": 0.900488018989563,
71
- "learning_rate": 0.0008,
72
- "loss": 2.2906,
73
  "step": 8
74
  },
75
  {
76
- "epoch": 0.018,
77
- "grad_norm": 2.340869903564453,
78
- "learning_rate": 0.0009000000000000001,
79
- "loss": 2.6079,
80
  "step": 9
81
  },
82
  {
83
- "epoch": 0.02,
84
- "grad_norm": 2.987302303314209,
85
- "learning_rate": 0.001,
86
- "loss": 2.5506,
87
  "step": 10
88
  },
89
  {
90
- "epoch": 0.022,
91
- "grad_norm": 1.844685673713684,
92
- "learning_rate": 0.0009996954135095479,
93
- "loss": 2.7146,
94
  "step": 11
95
  },
96
  {
97
- "epoch": 0.024,
98
- "grad_norm": 0.9662850499153137,
99
- "learning_rate": 0.0009987820251299122,
100
- "loss": 2.6323,
101
  "step": 12
102
  },
103
  {
104
- "epoch": 0.026,
105
- "grad_norm": 3.0721042156219482,
106
- "learning_rate": 0.0009972609476841367,
107
- "loss": 2.1718,
108
  "step": 13
109
  },
110
  {
111
- "epoch": 0.028,
112
- "grad_norm": 1.0009405612945557,
113
- "learning_rate": 0.0009951340343707852,
114
- "loss": 2.6348,
115
  "step": 14
116
  },
117
  {
118
- "epoch": 0.03,
119
- "grad_norm": 14.435264587402344,
120
- "learning_rate": 0.000992403876506104,
121
- "loss": 2.5352,
122
  "step": 15
123
  },
124
  {
125
- "epoch": 0.032,
126
- "grad_norm": 5.060039520263672,
127
- "learning_rate": 0.0009890738003669028,
128
- "loss": 2.708,
129
  "step": 16
130
  },
131
  {
132
- "epoch": 0.034,
133
- "grad_norm": 1.6351608037948608,
134
- "learning_rate": 0.0009851478631379982,
135
- "loss": 2.3905,
136
  "step": 17
137
  },
138
  {
139
- "epoch": 0.036,
140
- "grad_norm": 2.9582386016845703,
141
- "learning_rate": 0.0009806308479691594,
142
- "loss": 2.5147,
143
  "step": 18
144
  },
145
  {
146
- "epoch": 0.038,
147
- "grad_norm": 1.8205921649932861,
148
- "learning_rate": 0.0009755282581475768,
149
- "loss": 2.766,
150
  "step": 19
151
  },
152
  {
153
- "epoch": 0.04,
154
- "grad_norm": 1.1158825159072876,
155
- "learning_rate": 0.0009698463103929542,
156
- "loss": 2.7895,
157
  "step": 20
158
  },
159
  {
160
- "epoch": 0.042,
161
- "grad_norm": 1.1689060926437378,
162
- "learning_rate": 0.0009635919272833937,
163
- "loss": 2.6373,
164
  "step": 21
165
  },
166
  {
167
- "epoch": 0.044,
168
- "grad_norm": 0.8205438256263733,
169
- "learning_rate": 0.0009567727288213005,
170
- "loss": 2.4038,
171
  "step": 22
172
  },
173
  {
174
- "epoch": 0.046,
175
- "grad_norm": 1.2794568538665771,
176
- "learning_rate": 0.0009493970231495835,
177
- "loss": 2.3676,
178
  "step": 23
179
  },
180
  {
181
- "epoch": 0.048,
182
- "grad_norm": 0.822256863117218,
183
- "learning_rate": 0.0009414737964294635,
184
- "loss": 2.327,
185
  "step": 24
186
  },
187
  {
188
- "epoch": 0.05,
189
- "grad_norm": 1.986864447593689,
190
- "learning_rate": 0.0009330127018922195,
191
- "loss": 2.4431,
192
  "step": 25
193
  },
194
  {
195
- "epoch": 0.052,
196
- "grad_norm": 3.7959301471710205,
197
- "learning_rate": 0.0009240240480782129,
198
- "loss": 2.6657,
199
  "step": 26
200
  },
201
  {
202
- "epoch": 0.054,
203
- "grad_norm": 2.489267587661743,
204
- "learning_rate": 0.0009145187862775209,
205
- "loss": 2.5005,
206
  "step": 27
207
  },
208
  {
209
- "epoch": 0.056,
210
- "grad_norm": 2.1583516597747803,
211
- "learning_rate": 0.0009045084971874737,
212
- "loss": 2.5402,
213
  "step": 28
214
  },
215
  {
216
- "epoch": 0.058,
217
- "grad_norm": 4.524465084075928,
218
- "learning_rate": 0.0008940053768033609,
219
- "loss": 2.2461,
220
  "step": 29
221
  },
222
  {
223
- "epoch": 0.06,
224
- "grad_norm": 1.3595800399780273,
225
- "learning_rate": 0.000883022221559489,
226
- "loss": 2.331,
227
  "step": 30
228
  },
229
  {
230
- "epoch": 0.062,
231
- "grad_norm": 0.9844056367874146,
232
- "learning_rate": 0.0008715724127386971,
233
- "loss": 2.3781,
234
  "step": 31
235
  },
236
  {
237
- "epoch": 0.064,
238
- "grad_norm": 1.117148518562317,
239
- "learning_rate": 0.0008596699001693256,
240
- "loss": 2.4258,
241
  "step": 32
242
  },
243
  {
244
- "epoch": 0.066,
245
- "grad_norm": 0.7900739312171936,
246
- "learning_rate": 0.0008473291852294987,
247
- "loss": 2.437,
248
  "step": 33
249
  },
250
  {
251
- "epoch": 0.068,
252
- "grad_norm": 0.8672456741333008,
253
- "learning_rate": 0.0008345653031794292,
254
- "loss": 2.8025,
255
  "step": 34
256
  },
257
  {
258
- "epoch": 0.07,
259
- "grad_norm": 0.816504716873169,
260
- "learning_rate": 0.0008213938048432696,
261
- "loss": 2.5078,
262
  "step": 35
263
  },
264
  {
265
- "epoch": 0.072,
266
- "grad_norm": 1.0574641227722168,
267
- "learning_rate": 0.0008078307376628291,
268
- "loss": 2.6408,
269
  "step": 36
270
  },
271
  {
272
- "epoch": 0.074,
273
- "grad_norm": 0.6753240823745728,
274
- "learning_rate": 0.0007938926261462366,
275
- "loss": 2.2858,
276
  "step": 37
277
  },
278
  {
279
- "epoch": 0.076,
280
- "grad_norm": 0.9166250824928284,
281
- "learning_rate": 0.0007795964517353734,
282
- "loss": 2.7091,
283
  "step": 38
284
  },
285
  {
286
- "epoch": 0.078,
287
- "grad_norm": 0.9022424221038818,
288
- "learning_rate": 0.0007649596321166025,
289
- "loss": 2.6459,
290
  "step": 39
291
  },
292
  {
293
- "epoch": 0.08,
294
- "grad_norm": 0.7723848223686218,
295
- "learning_rate": 0.00075,
296
- "loss": 2.4329,
297
  "step": 40
298
  },
299
  {
300
- "epoch": 0.082,
301
- "grad_norm": 0.8669672012329102,
302
- "learning_rate": 0.0007347357813929454,
303
- "loss": 2.3661,
304
  "step": 41
305
  },
306
  {
307
- "epoch": 0.084,
308
- "grad_norm": 0.9701873660087585,
309
- "learning_rate": 0.0007191855733945387,
310
- "loss": 2.6723,
311
  "step": 42
312
  },
313
  {
314
- "epoch": 0.086,
315
- "grad_norm": 0.8038893342018127,
316
- "learning_rate": 0.0007033683215379002,
317
- "loss": 2.7652,
318
  "step": 43
319
  },
320
  {
321
- "epoch": 0.088,
322
- "grad_norm": 0.6812747716903687,
323
- "learning_rate": 0.0006873032967079561,
324
- "loss": 2.4019,
325
  "step": 44
326
  },
327
  {
328
- "epoch": 0.09,
329
- "grad_norm": 0.8909493088722229,
330
- "learning_rate": 0.0006710100716628344,
331
- "loss": 2.349,
332
  "step": 45
333
  },
334
  {
335
- "epoch": 0.092,
336
- "grad_norm": 0.9887206554412842,
337
- "learning_rate": 0.0006545084971874737,
338
- "loss": 2.5577,
339
  "step": 46
340
  },
341
  {
342
- "epoch": 0.094,
343
- "grad_norm": 0.7749077081680298,
344
- "learning_rate": 0.0006378186779084996,
345
- "loss": 2.2903,
346
  "step": 47
347
  },
348
  {
349
- "epoch": 0.096,
350
- "grad_norm": 1.0913500785827637,
351
- "learning_rate": 0.0006209609477998338,
352
- "loss": 2.3697,
353
  "step": 48
354
  },
355
  {
356
- "epoch": 0.098,
357
- "grad_norm": 0.894119381904602,
358
- "learning_rate": 0.0006039558454088796,
359
- "loss": 2.5167,
360
  "step": 49
361
  },
362
  {
363
- "epoch": 0.1,
364
- "grad_norm": 1.159035325050354,
365
- "learning_rate": 0.0005868240888334653,
366
- "loss": 2.4637,
367
  "step": 50
368
  },
369
  {
370
- "epoch": 0.1,
371
- "eval_loss": 2.5838444232940674,
372
- "eval_runtime": 4.8707,
373
- "eval_samples_per_second": 4.311,
374
- "eval_steps_per_second": 4.311,
375
  "step": 50
376
- },
377
- {
378
- "epoch": 0.102,
379
- "grad_norm": 0.6844251751899719,
380
- "learning_rate": 0.0005695865504800327,
381
- "loss": 2.4118,
382
- "step": 51
383
- },
384
- {
385
- "epoch": 0.104,
386
- "grad_norm": 1.1709848642349243,
387
- "learning_rate": 0.0005522642316338268,
388
- "loss": 2.444,
389
- "step": 52
390
- },
391
- {
392
- "epoch": 0.106,
393
- "grad_norm": 0.9435467720031738,
394
- "learning_rate": 0.0005348782368720626,
395
- "loss": 2.5568,
396
- "step": 53
397
- },
398
- {
399
- "epoch": 0.108,
400
- "grad_norm": 1.0800719261169434,
401
- "learning_rate": 0.0005174497483512506,
402
- "loss": 2.5766,
403
- "step": 54
404
- },
405
- {
406
- "epoch": 0.11,
407
- "grad_norm": 1.001356840133667,
408
- "learning_rate": 0.0005,
409
- "loss": 2.2205,
410
- "step": 55
411
- },
412
- {
413
- "epoch": 0.112,
414
- "grad_norm": 1.4582829475402832,
415
- "learning_rate": 0.0004825502516487497,
416
- "loss": 2.7271,
417
- "step": 56
418
- },
419
- {
420
- "epoch": 0.114,
421
- "grad_norm": 0.8312236666679382,
422
- "learning_rate": 0.00046512176312793734,
423
- "loss": 2.3204,
424
- "step": 57
425
- },
426
- {
427
- "epoch": 0.116,
428
- "grad_norm": 1.2127161026000977,
429
- "learning_rate": 0.00044773576836617336,
430
- "loss": 2.0169,
431
- "step": 58
432
- },
433
- {
434
- "epoch": 0.118,
435
- "grad_norm": 1.6428215503692627,
436
- "learning_rate": 0.0004304134495199674,
437
- "loss": 2.4521,
438
- "step": 59
439
- },
440
- {
441
- "epoch": 0.12,
442
- "grad_norm": 1.7682443857192993,
443
- "learning_rate": 0.00041317591116653486,
444
- "loss": 2.6753,
445
- "step": 60
446
- },
447
- {
448
- "epoch": 0.122,
449
- "grad_norm": 1.0919681787490845,
450
- "learning_rate": 0.0003960441545911204,
451
- "loss": 2.4022,
452
- "step": 61
453
- },
454
- {
455
- "epoch": 0.124,
456
- "grad_norm": 2.5304136276245117,
457
- "learning_rate": 0.0003790390522001662,
458
- "loss": 2.4325,
459
- "step": 62
460
- },
461
- {
462
- "epoch": 0.126,
463
- "grad_norm": 1.1737953424453735,
464
- "learning_rate": 0.00036218132209150044,
465
- "loss": 2.2653,
466
- "step": 63
467
- },
468
- {
469
- "epoch": 0.128,
470
- "grad_norm": 0.7943472862243652,
471
- "learning_rate": 0.00034549150281252633,
472
- "loss": 2.6079,
473
- "step": 64
474
- },
475
- {
476
- "epoch": 0.13,
477
- "grad_norm": 1.3269349336624146,
478
- "learning_rate": 0.0003289899283371657,
479
- "loss": 2.3745,
480
- "step": 65
481
- },
482
- {
483
- "epoch": 0.132,
484
- "grad_norm": 0.8898394107818604,
485
- "learning_rate": 0.00031269670329204396,
486
- "loss": 2.3862,
487
- "step": 66
488
- },
489
- {
490
- "epoch": 0.134,
491
- "grad_norm": 0.8309778571128845,
492
- "learning_rate": 0.0002966316784621,
493
- "loss": 2.5131,
494
- "step": 67
495
- },
496
- {
497
- "epoch": 0.136,
498
- "grad_norm": 1.2103646993637085,
499
- "learning_rate": 0.00028081442660546124,
500
- "loss": 2.5138,
501
- "step": 68
502
- },
503
- {
504
- "epoch": 0.138,
505
- "grad_norm": 0.9281813502311707,
506
- "learning_rate": 0.00026526421860705474,
507
- "loss": 2.5798,
508
- "step": 69
509
- },
510
- {
511
- "epoch": 0.14,
512
- "grad_norm": 0.8275775909423828,
513
- "learning_rate": 0.0002500000000000001,
514
- "loss": 2.5348,
515
- "step": 70
516
- },
517
- {
518
- "epoch": 0.142,
519
- "grad_norm": 1.5009329319000244,
520
- "learning_rate": 0.0002350403678833976,
521
- "loss": 2.5156,
522
- "step": 71
523
- },
524
- {
525
- "epoch": 0.144,
526
- "grad_norm": 1.4796998500823975,
527
- "learning_rate": 0.00022040354826462666,
528
- "loss": 2.3567,
529
- "step": 72
530
- },
531
- {
532
- "epoch": 0.146,
533
- "grad_norm": 0.7437081933021545,
534
- "learning_rate": 0.00020610737385376348,
535
- "loss": 2.4399,
536
- "step": 73
537
- },
538
- {
539
- "epoch": 0.148,
540
- "grad_norm": 0.7033576369285583,
541
- "learning_rate": 0.00019216926233717085,
542
- "loss": 2.3149,
543
- "step": 74
544
- },
545
- {
546
- "epoch": 0.15,
547
- "grad_norm": 0.9651651978492737,
548
- "learning_rate": 0.0001786061951567303,
549
- "loss": 2.5816,
550
- "step": 75
551
- },
552
- {
553
- "epoch": 0.152,
554
- "grad_norm": 1.0059478282928467,
555
- "learning_rate": 0.00016543469682057105,
556
- "loss": 2.6395,
557
- "step": 76
558
- },
559
- {
560
- "epoch": 0.154,
561
- "grad_norm": 1.6795697212219238,
562
- "learning_rate": 0.00015267081477050133,
563
- "loss": 2.3551,
564
- "step": 77
565
- },
566
- {
567
- "epoch": 0.156,
568
- "grad_norm": 0.7962441444396973,
569
- "learning_rate": 0.00014033009983067452,
570
- "loss": 2.2151,
571
- "step": 78
572
- },
573
- {
574
- "epoch": 0.158,
575
- "grad_norm": 0.880089282989502,
576
- "learning_rate": 0.00012842758726130281,
577
- "loss": 2.4376,
578
- "step": 79
579
- },
580
- {
581
- "epoch": 0.16,
582
- "grad_norm": 1.0629572868347168,
583
- "learning_rate": 0.00011697777844051105,
584
- "loss": 2.6063,
585
- "step": 80
586
- },
587
- {
588
- "epoch": 0.162,
589
- "grad_norm": 0.8691402077674866,
590
- "learning_rate": 0.00010599462319663906,
591
- "loss": 2.4764,
592
- "step": 81
593
- },
594
- {
595
- "epoch": 0.164,
596
- "grad_norm": 0.8258126378059387,
597
- "learning_rate": 9.549150281252633e-05,
598
- "loss": 2.3996,
599
- "step": 82
600
- },
601
- {
602
- "epoch": 0.166,
603
- "grad_norm": 2.253006935119629,
604
- "learning_rate": 8.548121372247918e-05,
605
- "loss": 2.7106,
606
- "step": 83
607
- },
608
- {
609
- "epoch": 0.168,
610
- "grad_norm": 0.9351361393928528,
611
- "learning_rate": 7.597595192178702e-05,
612
- "loss": 2.3613,
613
- "step": 84
614
- },
615
- {
616
- "epoch": 0.17,
617
- "grad_norm": 0.8624694347381592,
618
- "learning_rate": 6.698729810778065e-05,
619
- "loss": 2.4328,
620
- "step": 85
621
- },
622
- {
623
- "epoch": 0.172,
624
- "grad_norm": 0.6949071884155273,
625
- "learning_rate": 5.852620357053651e-05,
626
- "loss": 2.4157,
627
- "step": 86
628
- },
629
- {
630
- "epoch": 0.174,
631
- "grad_norm": 0.7830259203910828,
632
- "learning_rate": 5.060297685041659e-05,
633
- "loss": 2.2797,
634
- "step": 87
635
- },
636
- {
637
- "epoch": 0.176,
638
- "grad_norm": 1.3727121353149414,
639
- "learning_rate": 4.322727117869951e-05,
640
- "loss": 2.6155,
641
- "step": 88
642
- },
643
- {
644
- "epoch": 0.178,
645
- "grad_norm": 0.6731472611427307,
646
- "learning_rate": 3.6408072716606344e-05,
647
- "loss": 2.4149,
648
- "step": 89
649
- },
650
- {
651
- "epoch": 0.18,
652
- "grad_norm": 0.846976101398468,
653
- "learning_rate": 3.0153689607045842e-05,
654
- "loss": 2.3137,
655
- "step": 90
656
- },
657
- {
658
- "epoch": 0.182,
659
- "grad_norm": 0.9294453859329224,
660
- "learning_rate": 2.4471741852423235e-05,
661
- "loss": 2.5798,
662
- "step": 91
663
- },
664
- {
665
- "epoch": 0.184,
666
- "grad_norm": 0.766918957233429,
667
- "learning_rate": 1.9369152030840554e-05,
668
- "loss": 2.6766,
669
- "step": 92
670
- },
671
- {
672
- "epoch": 0.186,
673
- "grad_norm": 1.3079534769058228,
674
- "learning_rate": 1.4852136862001764e-05,
675
- "loss": 2.6047,
676
- "step": 93
677
- },
678
- {
679
- "epoch": 0.188,
680
- "grad_norm": 1.1351994276046753,
681
- "learning_rate": 1.0926199633097156e-05,
682
- "loss": 2.6034,
683
- "step": 94
684
- },
685
- {
686
- "epoch": 0.19,
687
- "grad_norm": 0.8010856509208679,
688
- "learning_rate": 7.59612349389599e-06,
689
- "loss": 2.2994,
690
- "step": 95
691
- },
692
- {
693
- "epoch": 0.192,
694
- "grad_norm": 0.9184717535972595,
695
- "learning_rate": 4.865965629214819e-06,
696
- "loss": 2.5489,
697
- "step": 96
698
- },
699
- {
700
- "epoch": 0.194,
701
- "grad_norm": 0.9543655514717102,
702
- "learning_rate": 2.739052315863355e-06,
703
- "loss": 2.5186,
704
- "step": 97
705
- },
706
- {
707
- "epoch": 0.196,
708
- "grad_norm": 0.9216803908348083,
709
- "learning_rate": 1.2179748700879012e-06,
710
- "loss": 2.5627,
711
- "step": 98
712
- },
713
- {
714
- "epoch": 0.198,
715
- "grad_norm": 0.8810911178588867,
716
- "learning_rate": 3.0458649045211895e-07,
717
- "loss": 2.6527,
718
- "step": 99
719
- },
720
- {
721
- "epoch": 0.2,
722
- "grad_norm": 0.7426478266716003,
723
- "learning_rate": 0.0,
724
- "loss": 2.1737,
725
- "step": 100
726
- },
727
- {
728
- "epoch": 0.2,
729
- "eval_loss": 2.527949094772339,
730
- "eval_runtime": 4.9855,
731
- "eval_samples_per_second": 4.212,
732
- "eval_steps_per_second": 4.212,
733
- "step": 100
734
  }
735
  ],
736
  "logging_steps": 1,
737
- "max_steps": 100,
738
  "num_input_tokens_seen": 0,
739
- "num_train_epochs": 1,
740
- "save_steps": 500,
741
  "stateful_callbacks": {
 
 
 
 
 
 
 
 
 
742
  "TrainerControl": {
743
  "args": {
744
  "should_epoch_stop": false,
745
  "should_evaluate": false,
746
  "should_log": false,
747
  "should_save": true,
748
- "should_training_stop": true
749
  },
750
  "attributes": {}
751
  }
752
  },
753
- "total_flos": 1.62874924204032e+16,
754
  "train_batch_size": 1,
755
  "trial_name": null,
756
  "trial_params": null
 
1
  {
2
+ "best_metric": 2.580864906311035,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-50",
4
+ "epoch": 0.05,
5
  "eval_steps": 50,
6
+ "global_step": 50,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.001,
13
+ "grad_norm": 0.2811749279499054,
14
+ "learning_rate": 2e-05,
15
+ "loss": 2.6895,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.001,
20
  "eval_loss": 3.0125324726104736,
21
+ "eval_runtime": 4.6936,
22
+ "eval_samples_per_second": 4.474,
23
+ "eval_steps_per_second": 4.474,
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.002,
28
+ "grad_norm": 0.2987586557865143,
29
+ "learning_rate": 4e-05,
30
+ "loss": 2.8912,
31
  "step": 2
32
  },
33
  {
34
+ "epoch": 0.003,
35
+ "grad_norm": 0.452608197927475,
36
+ "learning_rate": 6e-05,
37
+ "loss": 3.4357,
38
  "step": 3
39
  },
40
  {
41
+ "epoch": 0.004,
42
+ "grad_norm": 0.38785919547080994,
43
+ "learning_rate": 8e-05,
44
+ "loss": 2.5889,
45
  "step": 4
46
  },
47
  {
48
+ "epoch": 0.005,
49
+ "grad_norm": 0.38931822776794434,
50
+ "learning_rate": 0.0001,
51
+ "loss": 2.7513,
52
  "step": 5
53
  },
54
  {
55
+ "epoch": 0.006,
56
+ "grad_norm": 0.516417384147644,
57
+ "learning_rate": 0.00012,
58
+ "loss": 3.2128,
59
  "step": 6
60
  },
61
  {
62
+ "epoch": 0.007,
63
+ "grad_norm": 0.4206741750240326,
64
+ "learning_rate": 0.00014,
65
+ "loss": 2.9368,
66
  "step": 7
67
  },
68
  {
69
+ "epoch": 0.008,
70
+ "grad_norm": 0.48171964287757874,
71
+ "learning_rate": 0.00016,
72
+ "loss": 2.8618,
73
  "step": 8
74
  },
75
  {
76
+ "epoch": 0.009,
77
+ "grad_norm": 0.8544142842292786,
78
+ "learning_rate": 0.00018,
79
+ "loss": 3.0312,
80
  "step": 9
81
  },
82
  {
83
+ "epoch": 0.01,
84
+ "grad_norm": 0.848558247089386,
85
+ "learning_rate": 0.0002,
86
+ "loss": 2.9334,
87
  "step": 10
88
  },
89
  {
90
+ "epoch": 0.011,
91
+ "grad_norm": 0.8914313316345215,
92
+ "learning_rate": 0.00019999996900269505,
93
+ "loss": 2.7981,
94
  "step": 11
95
  },
96
  {
97
+ "epoch": 0.012,
98
+ "grad_norm": 0.6103464365005493,
99
+ "learning_rate": 0.0001999998760107994,
100
+ "loss": 2.7247,
101
  "step": 12
102
  },
103
  {
104
+ "epoch": 0.013,
105
+ "grad_norm": 0.7618600726127625,
106
+ "learning_rate": 0.00019999972102437074,
107
+ "loss": 2.472,
108
  "step": 13
109
  },
110
  {
111
+ "epoch": 0.014,
112
+ "grad_norm": 0.6825264692306519,
113
+ "learning_rate": 0.00019999950404350512,
114
+ "loss": 2.6008,
115
  "step": 14
116
  },
117
  {
118
+ "epoch": 0.015,
119
+ "grad_norm": 0.5940832495689392,
120
+ "learning_rate": 0.00019999922506833704,
121
+ "loss": 2.1996,
122
  "step": 15
123
  },
124
  {
125
+ "epoch": 0.016,
126
+ "grad_norm": 0.6273623108863831,
127
+ "learning_rate": 0.00019999888409903948,
128
+ "loss": 2.3565,
129
  "step": 16
130
  },
131
  {
132
+ "epoch": 0.017,
133
+ "grad_norm": 0.7437952160835266,
134
+ "learning_rate": 0.00019999848113582384,
135
+ "loss": 2.7232,
136
  "step": 17
137
  },
138
  {
139
+ "epoch": 0.018,
140
+ "grad_norm": 0.5971533060073853,
141
+ "learning_rate": 0.0001999980161789399,
142
+ "loss": 2.509,
143
  "step": 18
144
  },
145
  {
146
+ "epoch": 0.019,
147
+ "grad_norm": 0.5190719962120056,
148
+ "learning_rate": 0.00019999748922867592,
149
+ "loss": 2.3535,
150
  "step": 19
151
  },
152
  {
153
+ "epoch": 0.02,
154
+ "grad_norm": 0.9244285821914673,
155
+ "learning_rate": 0.00019999690028535855,
156
+ "loss": 2.7599,
157
  "step": 20
158
  },
159
  {
160
+ "epoch": 0.021,
161
+ "grad_norm": 0.8340674638748169,
162
+ "learning_rate": 0.00019999624934935296,
163
+ "loss": 3.0057,
164
  "step": 21
165
  },
166
  {
167
+ "epoch": 0.022,
168
+ "grad_norm": 1.0633089542388916,
169
+ "learning_rate": 0.00019999553642106266,
170
+ "loss": 2.2808,
171
  "step": 22
172
  },
173
  {
174
+ "epoch": 0.023,
175
+ "grad_norm": 4.8767266273498535,
176
+ "learning_rate": 0.00019999476150092967,
177
+ "loss": 2.8268,
178
  "step": 23
179
  },
180
  {
181
+ "epoch": 0.024,
182
+ "grad_norm": 2.7197344303131104,
183
+ "learning_rate": 0.00019999392458943432,
184
+ "loss": 2.6517,
185
  "step": 24
186
  },
187
  {
188
+ "epoch": 0.025,
189
+ "grad_norm": 0.9329593777656555,
190
+ "learning_rate": 0.00019999302568709547,
191
+ "loss": 2.212,
192
  "step": 25
193
  },
194
  {
195
+ "epoch": 0.026,
196
+ "grad_norm": 0.6679103374481201,
197
+ "learning_rate": 0.00019999206479447045,
198
+ "loss": 2.0117,
199
  "step": 26
200
  },
201
  {
202
+ "epoch": 0.027,
203
+ "grad_norm": 0.5428286790847778,
204
+ "learning_rate": 0.00019999104191215493,
205
+ "loss": 2.7582,
206
  "step": 27
207
  },
208
  {
209
+ "epoch": 0.028,
210
+ "grad_norm": 0.5552177429199219,
211
+ "learning_rate": 0.00019998995704078305,
212
+ "loss": 2.54,
213
  "step": 28
214
  },
215
  {
216
+ "epoch": 0.029,
217
+ "grad_norm": 0.5453671216964722,
218
+ "learning_rate": 0.00019998881018102737,
219
+ "loss": 2.5358,
220
  "step": 29
221
  },
222
  {
223
+ "epoch": 0.03,
224
+ "grad_norm": 0.47653189301490784,
225
+ "learning_rate": 0.00019998760133359885,
226
+ "loss": 2.2443,
227
  "step": 30
228
  },
229
  {
230
+ "epoch": 0.031,
231
+ "grad_norm": 0.755976140499115,
232
+ "learning_rate": 0.0001999863304992469,
233
+ "loss": 2.5519,
234
  "step": 31
235
  },
236
  {
237
+ "epoch": 0.032,
238
+ "grad_norm": 0.7680912017822266,
239
+ "learning_rate": 0.00019998499767875943,
240
+ "loss": 2.7503,
241
  "step": 32
242
  },
243
  {
244
+ "epoch": 0.033,
245
+ "grad_norm": 3.768080472946167,
246
+ "learning_rate": 0.0001999836028729627,
247
+ "loss": 2.6051,
248
  "step": 33
249
  },
250
  {
251
+ "epoch": 0.034,
252
+ "grad_norm": 0.5304062962532043,
253
+ "learning_rate": 0.00019998214608272136,
254
+ "loss": 2.2065,
255
  "step": 34
256
  },
257
  {
258
+ "epoch": 0.035,
259
+ "grad_norm": 1.1568998098373413,
260
+ "learning_rate": 0.00019998062730893862,
261
+ "loss": 2.444,
262
  "step": 35
263
  },
264
  {
265
+ "epoch": 0.036,
266
+ "grad_norm": 0.8356309533119202,
267
+ "learning_rate": 0.000199979046552556,
268
+ "loss": 2.5763,
269
  "step": 36
270
  },
271
  {
272
+ "epoch": 0.037,
273
+ "grad_norm": 0.5210471749305725,
274
+ "learning_rate": 0.00019997740381455346,
275
+ "loss": 2.8545,
276
  "step": 37
277
  },
278
  {
279
+ "epoch": 0.038,
280
+ "grad_norm": 1.550714373588562,
281
+ "learning_rate": 0.00019997569909594947,
282
+ "loss": 2.6236,
283
  "step": 38
284
  },
285
  {
286
+ "epoch": 0.039,
287
+ "grad_norm": 0.6044741868972778,
288
+ "learning_rate": 0.0001999739323978008,
289
+ "loss": 2.5349,
290
  "step": 39
291
  },
292
  {
293
+ "epoch": 0.04,
294
+ "grad_norm": 0.9703565239906311,
295
+ "learning_rate": 0.00019997210372120274,
296
+ "loss": 3.1004,
297
  "step": 40
298
  },
299
  {
300
+ "epoch": 0.041,
301
+ "grad_norm": 0.7796650528907776,
302
+ "learning_rate": 0.000199970213067289,
303
+ "loss": 2.5757,
304
  "step": 41
305
  },
306
  {
307
+ "epoch": 0.042,
308
+ "grad_norm": 0.6824871301651001,
309
+ "learning_rate": 0.00019996826043723162,
310
+ "loss": 2.6766,
311
  "step": 42
312
  },
313
  {
314
+ "epoch": 0.043,
315
+ "grad_norm": 0.8048773407936096,
316
+ "learning_rate": 0.00019996624583224114,
317
+ "loss": 2.3065,
318
  "step": 43
319
  },
320
  {
321
+ "epoch": 0.044,
322
+ "grad_norm": 0.5458154082298279,
323
+ "learning_rate": 0.00019996416925356652,
324
+ "loss": 2.4336,
325
  "step": 44
326
  },
327
  {
328
+ "epoch": 0.045,
329
+ "grad_norm": 0.623190701007843,
330
+ "learning_rate": 0.00019996203070249516,
331
+ "loss": 2.3835,
332
  "step": 45
333
  },
334
  {
335
+ "epoch": 0.046,
336
+ "grad_norm": 0.5928781032562256,
337
+ "learning_rate": 0.00019995983018035278,
338
+ "loss": 2.3408,
339
  "step": 46
340
  },
341
  {
342
+ "epoch": 0.047,
343
+ "grad_norm": 0.5790976881980896,
344
+ "learning_rate": 0.00019995756768850364,
345
+ "loss": 2.3878,
346
  "step": 47
347
  },
348
  {
349
+ "epoch": 0.048,
350
+ "grad_norm": 0.5648425817489624,
351
+ "learning_rate": 0.00019995524322835034,
352
+ "loss": 2.2885,
353
  "step": 48
354
  },
355
  {
356
+ "epoch": 0.049,
357
+ "grad_norm": 0.526339054107666,
358
+ "learning_rate": 0.00019995285680133394,
359
+ "loss": 2.408,
360
  "step": 49
361
  },
362
  {
363
+ "epoch": 0.05,
364
+ "grad_norm": 0.6333803534507751,
365
+ "learning_rate": 0.00019995040840893388,
366
+ "loss": 2.4391,
367
  "step": 50
368
  },
369
  {
370
+ "epoch": 0.05,
371
+ "eval_loss": 2.580864906311035,
372
+ "eval_runtime": 4.8038,
373
+ "eval_samples_per_second": 4.372,
374
+ "eval_steps_per_second": 4.372,
375
  "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  }
377
  ],
378
  "logging_steps": 1,
379
+ "max_steps": 4000,
380
  "num_input_tokens_seen": 0,
381
+ "num_train_epochs": 4,
382
+ "save_steps": 50,
383
  "stateful_callbacks": {
384
+ "EarlyStoppingCallback": {
385
+ "args": {
386
+ "early_stopping_patience": 2,
387
+ "early_stopping_threshold": 0.0
388
+ },
389
+ "attributes": {
390
+ "early_stopping_patience_counter": 0
391
+ }
392
+ },
393
  "TrainerControl": {
394
  "args": {
395
  "should_epoch_stop": false,
396
  "should_evaluate": false,
397
  "should_log": false,
398
  "should_save": true,
399
+ "should_training_stop": false
400
  },
401
  "attributes": {}
402
  }
403
  },
404
+ "total_flos": 4108715871436800.0,
405
  "train_batch_size": 1,
406
  "trial_name": null,
407
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8d9124138abd44af04b2c60a935bcab4ff5cdb3ea64e57559b87dc3f7e79065
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:587385ca4b3a6c2778a0b2f3cca66b2116c2d78e99e16d5766eaff5ef6eeb893
3
  size 6776