error577 commited on
Commit
20cf361
·
verified ·
1 Parent(s): 3579579

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "k_proj",
24
- "down_proj",
25
  "up_proj",
26
- "gate_proj",
27
- "o_proj",
28
  "v_proj",
29
- "q_proj"
 
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "q_proj",
24
  "k_proj",
 
25
  "up_proj",
 
 
26
  "v_proj",
27
+ "down_proj",
28
+ "gate_proj",
29
+ "o_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ee93c022f8ffc3acce2277304bf58df4df22be26d6abf74a5f3c20bafe90046
3
  size 80013120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f2c54b3cff7229bba3a337321576ca3fbedcde46f10b6c700245830c01cb495
3
  size 80013120
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58e6129f935b6e1eb106a4d35b353cecba548d9909665a3ed5a8128bafce50d7
3
  size 41119636
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79bc646c7471cb3943c0b5456f615d091883e21fcd695a7c0aa6311ff2dd361a
3
  size 41119636
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e33ec2d279a54b31bcbd59efbda4ed3412f13129417ee23eb1d555b2fb4c15c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5da8984c55f90689ec5dc6254808c095ed22f24233bafba7be5034f696b9c85
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49d60a69e2379be2053e816cbaff31e6c931b5922dd86c71c9eaf473299cbf62
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9299ec7d0989f843c66221f6a5f12c76f22cfda8e3a2897dd9a527db5b37854
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8379156847342236,
5
  "eval_steps": 50,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
@@ -9,727 +9,727 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.008379156847342237,
13
- "grad_norm": 0.5054947733879089,
14
- "learning_rate": 1e-05,
15
- "loss": 2.9505,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.008379156847342237,
20
- "eval_loss": 3.054915428161621,
21
- "eval_runtime": 43.2673,
22
- "eval_samples_per_second": 4.669,
23
- "eval_steps_per_second": 4.669,
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.016758313694684474,
28
- "grad_norm": 0.3709588348865509,
29
- "learning_rate": 2e-05,
30
- "loss": 2.8603,
31
  "step": 2
32
  },
33
  {
34
- "epoch": 0.02513747054202671,
35
- "grad_norm": 0.5003033876419067,
36
- "learning_rate": 3e-05,
37
- "loss": 2.8325,
38
  "step": 3
39
  },
40
  {
41
- "epoch": 0.03351662738936895,
42
- "grad_norm": 0.3898451328277588,
43
- "learning_rate": 4e-05,
44
- "loss": 2.7888,
45
  "step": 4
46
  },
47
  {
48
- "epoch": 0.041895784236711184,
49
- "grad_norm": 0.4698517918586731,
50
- "learning_rate": 5e-05,
51
- "loss": 2.8257,
52
  "step": 5
53
  },
54
  {
55
- "epoch": 0.05027494108405342,
56
- "grad_norm": 0.4781450927257538,
57
- "learning_rate": 6e-05,
58
- "loss": 2.9094,
59
  "step": 6
60
  },
61
  {
62
- "epoch": 0.05865409793139565,
63
- "grad_norm": 0.6419183015823364,
64
- "learning_rate": 7e-05,
65
- "loss": 2.8045,
66
  "step": 7
67
  },
68
  {
69
- "epoch": 0.0670332547787379,
70
- "grad_norm": 0.7329452633857727,
71
- "learning_rate": 8e-05,
72
- "loss": 2.7656,
73
  "step": 8
74
  },
75
  {
76
- "epoch": 0.07541241162608013,
77
- "grad_norm": 0.8872014284133911,
78
- "learning_rate": 9e-05,
79
- "loss": 2.9754,
80
  "step": 9
81
  },
82
  {
83
- "epoch": 0.08379156847342237,
84
- "grad_norm": 1.0049771070480347,
85
- "learning_rate": 0.0001,
86
- "loss": 2.7885,
87
  "step": 10
88
  },
89
  {
90
- "epoch": 0.0921707253207646,
91
- "grad_norm": 1.288948655128479,
92
- "learning_rate": 9.99695413509548e-05,
93
- "loss": 2.862,
94
  "step": 11
95
  },
96
  {
97
- "epoch": 0.10054988216810684,
98
- "grad_norm": 1.6090466976165771,
99
- "learning_rate": 9.987820251299122e-05,
100
- "loss": 2.9898,
101
  "step": 12
102
  },
103
  {
104
- "epoch": 0.10892903901544906,
105
- "grad_norm": 1.5656819343566895,
106
- "learning_rate": 9.972609476841367e-05,
107
- "loss": 2.8883,
108
  "step": 13
109
  },
110
  {
111
- "epoch": 0.1173081958627913,
112
- "grad_norm": 1.6467982530593872,
113
- "learning_rate": 9.951340343707852e-05,
114
- "loss": 2.7058,
115
  "step": 14
116
  },
117
  {
118
- "epoch": 0.12568735271013354,
119
- "grad_norm": 2.0958547592163086,
120
- "learning_rate": 9.924038765061042e-05,
121
- "loss": 2.7129,
122
  "step": 15
123
  },
124
  {
125
- "epoch": 0.1340665095574758,
126
- "grad_norm": 1.0730005502700806,
127
- "learning_rate": 9.890738003669029e-05,
128
- "loss": 2.7274,
129
  "step": 16
130
  },
131
  {
132
- "epoch": 0.142445666404818,
133
- "grad_norm": 1.0618668794631958,
134
- "learning_rate": 9.851478631379982e-05,
135
- "loss": 2.5979,
136
  "step": 17
137
  },
138
  {
139
- "epoch": 0.15082482325216026,
140
- "grad_norm": 0.4424041211605072,
141
- "learning_rate": 9.806308479691595e-05,
142
- "loss": 2.5467,
143
  "step": 18
144
  },
145
  {
146
- "epoch": 0.15920398009950248,
147
- "grad_norm": 0.5938422679901123,
148
- "learning_rate": 9.755282581475769e-05,
149
- "loss": 2.6563,
150
  "step": 19
151
  },
152
  {
153
- "epoch": 0.16758313694684474,
154
- "grad_norm": 0.5060924291610718,
155
- "learning_rate": 9.698463103929542e-05,
156
- "loss": 2.6239,
157
  "step": 20
158
  },
159
  {
160
- "epoch": 0.17596229379418696,
161
- "grad_norm": 0.5067716836929321,
162
- "learning_rate": 9.635919272833938e-05,
163
- "loss": 2.5558,
164
  "step": 21
165
  },
166
  {
167
- "epoch": 0.1843414506415292,
168
- "grad_norm": 0.5283418297767639,
169
- "learning_rate": 9.567727288213005e-05,
170
- "loss": 2.6283,
171
  "step": 22
172
  },
173
  {
174
- "epoch": 0.19272060748887143,
175
- "grad_norm": 0.5521147847175598,
176
- "learning_rate": 9.493970231495835e-05,
177
- "loss": 2.5582,
178
  "step": 23
179
  },
180
  {
181
- "epoch": 0.20109976433621368,
182
- "grad_norm": 0.6335211396217346,
183
- "learning_rate": 9.414737964294636e-05,
184
- "loss": 2.6768,
185
  "step": 24
186
  },
187
  {
188
- "epoch": 0.2094789211835559,
189
- "grad_norm": 1.2375539541244507,
190
- "learning_rate": 9.330127018922194e-05,
191
- "loss": 2.6038,
192
  "step": 25
193
  },
194
  {
195
- "epoch": 0.21785807803089813,
196
- "grad_norm": 1.3271223306655884,
197
- "learning_rate": 9.24024048078213e-05,
198
- "loss": 2.5917,
199
  "step": 26
200
  },
201
  {
202
- "epoch": 0.22623723487824038,
203
- "grad_norm": 0.4055769145488739,
204
- "learning_rate": 9.145187862775209e-05,
205
- "loss": 2.3628,
206
  "step": 27
207
  },
208
  {
209
- "epoch": 0.2346163917255826,
210
- "grad_norm": 0.964474081993103,
211
- "learning_rate": 9.045084971874738e-05,
212
- "loss": 2.5157,
213
  "step": 28
214
  },
215
  {
216
- "epoch": 0.24299554857292485,
217
- "grad_norm": 0.6849140524864197,
218
- "learning_rate": 8.940053768033609e-05,
219
- "loss": 2.6839,
220
  "step": 29
221
  },
222
  {
223
- "epoch": 0.2513747054202671,
224
- "grad_norm": 0.4390021562576294,
225
- "learning_rate": 8.83022221559489e-05,
226
- "loss": 2.5509,
227
  "step": 30
228
  },
229
  {
230
- "epoch": 0.2597538622676093,
231
- "grad_norm": 0.7394634485244751,
232
- "learning_rate": 8.715724127386972e-05,
233
- "loss": 2.5447,
234
  "step": 31
235
  },
236
  {
237
- "epoch": 0.2681330191149516,
238
- "grad_norm": 0.45920076966285706,
239
- "learning_rate": 8.596699001693255e-05,
240
- "loss": 2.5397,
241
  "step": 32
242
  },
243
  {
244
- "epoch": 0.27651217596229377,
245
- "grad_norm": 0.43246570229530334,
246
- "learning_rate": 8.473291852294987e-05,
247
- "loss": 2.6423,
248
  "step": 33
249
  },
250
  {
251
- "epoch": 0.284891332809636,
252
- "grad_norm": 0.38203856348991394,
253
- "learning_rate": 8.345653031794292e-05,
254
- "loss": 2.4129,
255
  "step": 34
256
  },
257
  {
258
- "epoch": 0.2932704896569783,
259
- "grad_norm": 0.37578803300857544,
260
- "learning_rate": 8.213938048432697e-05,
261
- "loss": 2.5367,
262
  "step": 35
263
  },
264
  {
265
- "epoch": 0.3016496465043205,
266
- "grad_norm": 0.6231028437614441,
267
- "learning_rate": 8.07830737662829e-05,
268
- "loss": 2.504,
269
  "step": 36
270
  },
271
  {
272
- "epoch": 0.3100288033516627,
273
- "grad_norm": 0.48215481638908386,
274
- "learning_rate": 7.938926261462366e-05,
275
- "loss": 2.5128,
276
  "step": 37
277
  },
278
  {
279
- "epoch": 0.31840796019900497,
280
- "grad_norm": 0.8526724576950073,
281
- "learning_rate": 7.795964517353735e-05,
282
- "loss": 2.4417,
283
  "step": 38
284
  },
285
  {
286
- "epoch": 0.3267871170463472,
287
- "grad_norm": 0.42325401306152344,
288
- "learning_rate": 7.649596321166024e-05,
289
- "loss": 2.484,
290
  "step": 39
291
  },
292
  {
293
- "epoch": 0.33516627389368947,
294
- "grad_norm": 0.45637720823287964,
295
- "learning_rate": 7.500000000000001e-05,
296
- "loss": 2.4911,
297
  "step": 40
298
  },
299
  {
300
- "epoch": 0.34354543074103167,
301
- "grad_norm": 0.5629859566688538,
302
- "learning_rate": 7.347357813929454e-05,
303
- "loss": 2.4716,
304
  "step": 41
305
  },
306
  {
307
- "epoch": 0.3519245875883739,
308
- "grad_norm": 0.6218668222427368,
309
- "learning_rate": 7.191855733945387e-05,
310
- "loss": 2.3725,
311
  "step": 42
312
  },
313
  {
314
- "epoch": 0.36030374443571617,
315
- "grad_norm": 0.5280422568321228,
316
- "learning_rate": 7.033683215379002e-05,
317
- "loss": 2.6807,
318
  "step": 43
319
  },
320
  {
321
- "epoch": 0.3686829012830584,
322
- "grad_norm": 0.4046926200389862,
323
- "learning_rate": 6.873032967079561e-05,
324
- "loss": 2.4806,
325
  "step": 44
326
  },
327
  {
328
- "epoch": 0.3770620581304006,
329
- "grad_norm": 0.3766598701477051,
330
- "learning_rate": 6.710100716628344e-05,
331
- "loss": 2.2953,
332
  "step": 45
333
  },
334
  {
335
- "epoch": 0.38544121497774286,
336
- "grad_norm": 0.5563957691192627,
337
- "learning_rate": 6.545084971874738e-05,
338
- "loss": 2.5368,
339
  "step": 46
340
  },
341
  {
342
- "epoch": 0.3938203718250851,
343
- "grad_norm": 0.403656542301178,
344
- "learning_rate": 6.378186779084995e-05,
345
- "loss": 2.4207,
346
  "step": 47
347
  },
348
  {
349
- "epoch": 0.40219952867242736,
350
- "grad_norm": 0.4429378807544708,
351
- "learning_rate": 6.209609477998338e-05,
352
- "loss": 2.637,
353
  "step": 48
354
  },
355
  {
356
- "epoch": 0.41057868551976956,
357
- "grad_norm": 0.46513986587524414,
358
- "learning_rate": 6.0395584540887963e-05,
359
- "loss": 2.3988,
360
  "step": 49
361
  },
362
  {
363
- "epoch": 0.4189578423671118,
364
- "grad_norm": 0.4939236640930176,
365
- "learning_rate": 5.868240888334653e-05,
366
- "loss": 2.6023,
367
  "step": 50
368
  },
369
  {
370
- "epoch": 0.4189578423671118,
371
- "eval_loss": 2.578394889831543,
372
- "eval_runtime": 42.4269,
373
- "eval_samples_per_second": 4.761,
374
- "eval_steps_per_second": 4.761,
375
  "step": 50
376
  },
377
  {
378
- "epoch": 0.42733699921445406,
379
- "grad_norm": 0.4209323823451996,
380
- "learning_rate": 5.695865504800327e-05,
381
- "loss": 2.4749,
382
  "step": 51
383
  },
384
  {
385
- "epoch": 0.43571615606179626,
386
- "grad_norm": 0.4266431927680969,
387
- "learning_rate": 5.522642316338268e-05,
388
- "loss": 2.4692,
389
  "step": 52
390
  },
391
  {
392
- "epoch": 0.4440953129091385,
393
- "grad_norm": 0.3953860402107239,
394
- "learning_rate": 5.348782368720626e-05,
395
- "loss": 2.3527,
396
  "step": 53
397
  },
398
  {
399
- "epoch": 0.45247446975648076,
400
- "grad_norm": 0.4726410210132599,
401
- "learning_rate": 5.174497483512506e-05,
402
- "loss": 2.4229,
403
  "step": 54
404
  },
405
  {
406
- "epoch": 0.460853626603823,
407
- "grad_norm": 0.4370991289615631,
408
- "learning_rate": 5e-05,
409
- "loss": 2.5421,
410
  "step": 55
411
  },
412
  {
413
- "epoch": 0.4692327834511652,
414
- "grad_norm": 0.4647808074951172,
415
- "learning_rate": 4.825502516487497e-05,
416
- "loss": 2.5684,
417
  "step": 56
418
  },
419
  {
420
- "epoch": 0.47761194029850745,
421
- "grad_norm": 0.4097454249858856,
422
- "learning_rate": 4.6512176312793736e-05,
423
- "loss": 2.4871,
424
  "step": 57
425
  },
426
  {
427
- "epoch": 0.4859910971458497,
428
- "grad_norm": 0.4504952132701874,
429
- "learning_rate": 4.477357683661734e-05,
430
- "loss": 2.4055,
431
  "step": 58
432
  },
433
  {
434
- "epoch": 0.49437025399319195,
435
- "grad_norm": 0.461972177028656,
436
- "learning_rate": 4.3041344951996746e-05,
437
- "loss": 2.5632,
438
  "step": 59
439
  },
440
  {
441
- "epoch": 0.5027494108405341,
442
- "grad_norm": 0.42533305287361145,
443
- "learning_rate": 4.131759111665349e-05,
444
- "loss": 2.4914,
445
  "step": 60
446
  },
447
  {
448
- "epoch": 0.5111285676878764,
449
- "grad_norm": 0.4510684311389923,
450
- "learning_rate": 3.960441545911204e-05,
451
- "loss": 2.5091,
452
  "step": 61
453
  },
454
  {
455
- "epoch": 0.5195077245352187,
456
- "grad_norm": 0.42407482862472534,
457
- "learning_rate": 3.790390522001662e-05,
458
- "loss": 2.3668,
459
  "step": 62
460
  },
461
  {
462
- "epoch": 0.5278868813825609,
463
- "grad_norm": 0.431485116481781,
464
- "learning_rate": 3.6218132209150045e-05,
465
- "loss": 2.4541,
466
  "step": 63
467
  },
468
  {
469
- "epoch": 0.5362660382299032,
470
- "grad_norm": 0.4215572774410248,
471
- "learning_rate": 3.4549150281252636e-05,
472
- "loss": 2.3595,
473
  "step": 64
474
  },
475
  {
476
- "epoch": 0.5446451950772454,
477
- "grad_norm": 0.5403610467910767,
478
- "learning_rate": 3.289899283371657e-05,
479
- "loss": 2.7011,
480
  "step": 65
481
  },
482
  {
483
- "epoch": 0.5530243519245875,
484
- "grad_norm": 0.43399372696876526,
485
- "learning_rate": 3.12696703292044e-05,
486
- "loss": 2.367,
487
  "step": 66
488
  },
489
  {
490
- "epoch": 0.5614035087719298,
491
- "grad_norm": 0.49265211820602417,
492
- "learning_rate": 2.9663167846209998e-05,
493
- "loss": 2.3943,
494
  "step": 67
495
  },
496
  {
497
- "epoch": 0.569782665619272,
498
- "grad_norm": 0.4823525547981262,
499
- "learning_rate": 2.8081442660546125e-05,
500
- "loss": 2.3512,
501
  "step": 68
502
  },
503
  {
504
- "epoch": 0.5781618224666143,
505
- "grad_norm": 0.4148399829864502,
506
- "learning_rate": 2.6526421860705473e-05,
507
- "loss": 2.4461,
508
  "step": 69
509
  },
510
  {
511
- "epoch": 0.5865409793139565,
512
- "grad_norm": 0.45690131187438965,
513
- "learning_rate": 2.500000000000001e-05,
514
- "loss": 2.3753,
515
  "step": 70
516
  },
517
  {
518
- "epoch": 0.5949201361612988,
519
- "grad_norm": 0.3864487111568451,
520
- "learning_rate": 2.350403678833976e-05,
521
- "loss": 2.3544,
522
  "step": 71
523
  },
524
  {
525
- "epoch": 0.603299293008641,
526
- "grad_norm": 0.4677547812461853,
527
- "learning_rate": 2.2040354826462668e-05,
528
- "loss": 2.5031,
529
  "step": 72
530
  },
531
  {
532
- "epoch": 0.6116784498559832,
533
- "grad_norm": 0.37837573885917664,
534
- "learning_rate": 2.061073738537635e-05,
535
- "loss": 2.3215,
536
  "step": 73
537
  },
538
  {
539
- "epoch": 0.6200576067033254,
540
- "grad_norm": 0.47641924023628235,
541
- "learning_rate": 1.9216926233717085e-05,
542
- "loss": 2.4183,
543
  "step": 74
544
  },
545
  {
546
- "epoch": 0.6284367635506677,
547
- "grad_norm": 0.5523366928100586,
548
- "learning_rate": 1.7860619515673033e-05,
549
- "loss": 2.5932,
550
  "step": 75
551
  },
552
  {
553
- "epoch": 0.6368159203980099,
554
- "grad_norm": 0.42463499307632446,
555
- "learning_rate": 1.6543469682057106e-05,
556
- "loss": 2.4165,
557
  "step": 76
558
  },
559
  {
560
- "epoch": 0.6451950772453522,
561
- "grad_norm": 0.5442836284637451,
562
- "learning_rate": 1.526708147705013e-05,
563
- "loss": 2.5349,
564
  "step": 77
565
  },
566
  {
567
- "epoch": 0.6535742340926944,
568
- "grad_norm": 0.40604889392852783,
569
- "learning_rate": 1.4033009983067452e-05,
570
- "loss": 2.4106,
571
  "step": 78
572
  },
573
  {
574
- "epoch": 0.6619533909400367,
575
- "grad_norm": 0.4163293242454529,
576
- "learning_rate": 1.2842758726130283e-05,
577
- "loss": 2.3373,
578
  "step": 79
579
  },
580
  {
581
- "epoch": 0.6703325477873789,
582
- "grad_norm": 0.48656710982322693,
583
- "learning_rate": 1.1697777844051105e-05,
584
- "loss": 2.5337,
585
  "step": 80
586
  },
587
  {
588
- "epoch": 0.6787117046347211,
589
- "grad_norm": 0.4434641897678375,
590
- "learning_rate": 1.0599462319663905e-05,
591
- "loss": 2.4307,
592
  "step": 81
593
  },
594
  {
595
- "epoch": 0.6870908614820633,
596
- "grad_norm": 0.4347304105758667,
597
- "learning_rate": 9.549150281252633e-06,
598
- "loss": 2.3949,
599
  "step": 82
600
  },
601
  {
602
- "epoch": 0.6954700183294056,
603
- "grad_norm": 0.4212183952331543,
604
- "learning_rate": 8.548121372247918e-06,
605
- "loss": 2.3767,
606
  "step": 83
607
  },
608
  {
609
- "epoch": 0.7038491751767478,
610
- "grad_norm": 0.44238415360450745,
611
- "learning_rate": 7.597595192178702e-06,
612
- "loss": 2.612,
613
  "step": 84
614
  },
615
  {
616
- "epoch": 0.7122283320240901,
617
- "grad_norm": 0.44845736026763916,
618
- "learning_rate": 6.698729810778065e-06,
619
- "loss": 2.2322,
620
  "step": 85
621
  },
622
  {
623
- "epoch": 0.7206074888714323,
624
- "grad_norm": 0.4716636836528778,
625
- "learning_rate": 5.852620357053651e-06,
626
- "loss": 2.5347,
627
  "step": 86
628
  },
629
  {
630
- "epoch": 0.7289866457187746,
631
- "grad_norm": 0.5085580945014954,
632
- "learning_rate": 5.060297685041659e-06,
633
- "loss": 2.4772,
634
  "step": 87
635
  },
636
  {
637
- "epoch": 0.7373658025661168,
638
- "grad_norm": 0.455152302980423,
639
- "learning_rate": 4.322727117869951e-06,
640
- "loss": 2.3143,
641
  "step": 88
642
  },
643
  {
644
- "epoch": 0.745744959413459,
645
- "grad_norm": 0.48934170603752136,
646
- "learning_rate": 3.6408072716606346e-06,
647
- "loss": 2.3811,
648
  "step": 89
649
  },
650
  {
651
- "epoch": 0.7541241162608012,
652
- "grad_norm": 0.4595421552658081,
653
- "learning_rate": 3.0153689607045845e-06,
654
- "loss": 2.4414,
655
  "step": 90
656
  },
657
  {
658
- "epoch": 0.7625032731081435,
659
- "grad_norm": 0.5183727741241455,
660
- "learning_rate": 2.4471741852423237e-06,
661
- "loss": 2.5257,
662
  "step": 91
663
  },
664
  {
665
- "epoch": 0.7708824299554857,
666
- "grad_norm": 0.42374536395072937,
667
- "learning_rate": 1.9369152030840556e-06,
668
- "loss": 2.4218,
669
  "step": 92
670
  },
671
  {
672
- "epoch": 0.779261586802828,
673
- "grad_norm": 0.45261675119400024,
674
- "learning_rate": 1.4852136862001764e-06,
675
- "loss": 2.4601,
676
  "step": 93
677
  },
678
  {
679
- "epoch": 0.7876407436501702,
680
- "grad_norm": 0.4387091100215912,
681
- "learning_rate": 1.0926199633097157e-06,
682
- "loss": 2.5303,
683
  "step": 94
684
  },
685
  {
686
- "epoch": 0.7960199004975125,
687
- "grad_norm": 0.42841577529907227,
688
- "learning_rate": 7.596123493895991e-07,
689
- "loss": 2.3097,
690
  "step": 95
691
  },
692
  {
693
- "epoch": 0.8043990573448547,
694
- "grad_norm": 0.47183293104171753,
695
- "learning_rate": 4.865965629214819e-07,
696
- "loss": 2.4747,
697
  "step": 96
698
  },
699
  {
700
- "epoch": 0.8127782141921969,
701
- "grad_norm": 0.5696967840194702,
702
- "learning_rate": 2.7390523158633554e-07,
703
- "loss": 2.4984,
704
  "step": 97
705
  },
706
  {
707
- "epoch": 0.8211573710395391,
708
- "grad_norm": 0.4509386718273163,
709
- "learning_rate": 1.2179748700879012e-07,
710
- "loss": 2.2953,
711
  "step": 98
712
  },
713
  {
714
- "epoch": 0.8295365278868814,
715
- "grad_norm": 0.5444111227989197,
716
- "learning_rate": 3.04586490452119e-08,
717
- "loss": 2.5414,
718
  "step": 99
719
  },
720
  {
721
- "epoch": 0.8379156847342236,
722
- "grad_norm": 0.4556747078895569,
723
  "learning_rate": 0.0,
724
- "loss": 2.4252,
725
  "step": 100
726
  },
727
  {
728
- "epoch": 0.8379156847342236,
729
- "eval_loss": 2.5602805614471436,
730
- "eval_runtime": 45.2034,
731
- "eval_samples_per_second": 4.469,
732
- "eval_steps_per_second": 4.469,
733
  "step": 100
734
  }
735
  ],
@@ -737,7 +737,7 @@
737
  "max_steps": 100,
738
  "num_input_tokens_seen": 0,
739
  "num_train_epochs": 1,
740
- "save_steps": 25,
741
  "stateful_callbacks": {
742
  "TrainerControl": {
743
  "args": {
@@ -750,7 +750,7 @@
750
  "attributes": {}
751
  }
752
  },
753
- "total_flos": 6.51499696816128e+16,
754
  "train_batch_size": 1,
755
  "trial_name": null,
756
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.2,
5
  "eval_steps": 50,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.002,
13
+ "grad_norm": 0.3986969590187073,
14
+ "learning_rate": 0.0001,
15
+ "loss": 2.7769,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.002,
20
+ "eval_loss": 3.0125324726104736,
21
+ "eval_runtime": 4.8013,
22
+ "eval_samples_per_second": 4.374,
23
+ "eval_steps_per_second": 4.374,
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.004,
28
+ "grad_norm": 0.5986809730529785,
29
+ "learning_rate": 0.0002,
30
+ "loss": 2.9521,
31
  "step": 2
32
  },
33
  {
34
+ "epoch": 0.006,
35
+ "grad_norm": 0.595142662525177,
36
+ "learning_rate": 0.0003,
37
+ "loss": 2.955,
38
  "step": 3
39
  },
40
  {
41
+ "epoch": 0.008,
42
+ "grad_norm": 0.7013932466506958,
43
+ "learning_rate": 0.0004,
44
+ "loss": 2.9037,
45
  "step": 4
46
  },
47
  {
48
+ "epoch": 0.01,
49
+ "grad_norm": 1.5847638845443726,
50
+ "learning_rate": 0.0005,
51
+ "loss": 2.9706,
52
  "step": 5
53
  },
54
  {
55
+ "epoch": 0.012,
56
+ "grad_norm": 1.6309813261032104,
57
+ "learning_rate": 0.0006,
58
+ "loss": 2.75,
59
  "step": 6
60
  },
61
  {
62
+ "epoch": 0.014,
63
+ "grad_norm": 1.3442208766937256,
64
+ "learning_rate": 0.0007,
65
+ "loss": 2.5161,
66
  "step": 7
67
  },
68
  {
69
+ "epoch": 0.016,
70
+ "grad_norm": 0.900488018989563,
71
+ "learning_rate": 0.0008,
72
+ "loss": 2.2906,
73
  "step": 8
74
  },
75
  {
76
+ "epoch": 0.018,
77
+ "grad_norm": 2.340869903564453,
78
+ "learning_rate": 0.0009000000000000001,
79
+ "loss": 2.6079,
80
  "step": 9
81
  },
82
  {
83
+ "epoch": 0.02,
84
+ "grad_norm": 2.987302303314209,
85
+ "learning_rate": 0.001,
86
+ "loss": 2.5506,
87
  "step": 10
88
  },
89
  {
90
+ "epoch": 0.022,
91
+ "grad_norm": 1.844685673713684,
92
+ "learning_rate": 0.0009996954135095479,
93
+ "loss": 2.7146,
94
  "step": 11
95
  },
96
  {
97
+ "epoch": 0.024,
98
+ "grad_norm": 0.9662850499153137,
99
+ "learning_rate": 0.0009987820251299122,
100
+ "loss": 2.6323,
101
  "step": 12
102
  },
103
  {
104
+ "epoch": 0.026,
105
+ "grad_norm": 3.0721042156219482,
106
+ "learning_rate": 0.0009972609476841367,
107
+ "loss": 2.1718,
108
  "step": 13
109
  },
110
  {
111
+ "epoch": 0.028,
112
+ "grad_norm": 1.0009405612945557,
113
+ "learning_rate": 0.0009951340343707852,
114
+ "loss": 2.6348,
115
  "step": 14
116
  },
117
  {
118
+ "epoch": 0.03,
119
+ "grad_norm": 14.435264587402344,
120
+ "learning_rate": 0.000992403876506104,
121
+ "loss": 2.5352,
122
  "step": 15
123
  },
124
  {
125
+ "epoch": 0.032,
126
+ "grad_norm": 5.060039520263672,
127
+ "learning_rate": 0.0009890738003669028,
128
+ "loss": 2.708,
129
  "step": 16
130
  },
131
  {
132
+ "epoch": 0.034,
133
+ "grad_norm": 1.6351608037948608,
134
+ "learning_rate": 0.0009851478631379982,
135
+ "loss": 2.3905,
136
  "step": 17
137
  },
138
  {
139
+ "epoch": 0.036,
140
+ "grad_norm": 2.9582386016845703,
141
+ "learning_rate": 0.0009806308479691594,
142
+ "loss": 2.5147,
143
  "step": 18
144
  },
145
  {
146
+ "epoch": 0.038,
147
+ "grad_norm": 1.8205921649932861,
148
+ "learning_rate": 0.0009755282581475768,
149
+ "loss": 2.766,
150
  "step": 19
151
  },
152
  {
153
+ "epoch": 0.04,
154
+ "grad_norm": 1.1158825159072876,
155
+ "learning_rate": 0.0009698463103929542,
156
+ "loss": 2.7895,
157
  "step": 20
158
  },
159
  {
160
+ "epoch": 0.042,
161
+ "grad_norm": 1.1689060926437378,
162
+ "learning_rate": 0.0009635919272833937,
163
+ "loss": 2.6373,
164
  "step": 21
165
  },
166
  {
167
+ "epoch": 0.044,
168
+ "grad_norm": 0.8205438256263733,
169
+ "learning_rate": 0.0009567727288213005,
170
+ "loss": 2.4038,
171
  "step": 22
172
  },
173
  {
174
+ "epoch": 0.046,
175
+ "grad_norm": 1.2794568538665771,
176
+ "learning_rate": 0.0009493970231495835,
177
+ "loss": 2.3676,
178
  "step": 23
179
  },
180
  {
181
+ "epoch": 0.048,
182
+ "grad_norm": 0.822256863117218,
183
+ "learning_rate": 0.0009414737964294635,
184
+ "loss": 2.327,
185
  "step": 24
186
  },
187
  {
188
+ "epoch": 0.05,
189
+ "grad_norm": 1.986864447593689,
190
+ "learning_rate": 0.0009330127018922195,
191
+ "loss": 2.4431,
192
  "step": 25
193
  },
194
  {
195
+ "epoch": 0.052,
196
+ "grad_norm": 3.7959301471710205,
197
+ "learning_rate": 0.0009240240480782129,
198
+ "loss": 2.6657,
199
  "step": 26
200
  },
201
  {
202
+ "epoch": 0.054,
203
+ "grad_norm": 2.489267587661743,
204
+ "learning_rate": 0.0009145187862775209,
205
+ "loss": 2.5005,
206
  "step": 27
207
  },
208
  {
209
+ "epoch": 0.056,
210
+ "grad_norm": 2.1583516597747803,
211
+ "learning_rate": 0.0009045084971874737,
212
+ "loss": 2.5402,
213
  "step": 28
214
  },
215
  {
216
+ "epoch": 0.058,
217
+ "grad_norm": 4.524465084075928,
218
+ "learning_rate": 0.0008940053768033609,
219
+ "loss": 2.2461,
220
  "step": 29
221
  },
222
  {
223
+ "epoch": 0.06,
224
+ "grad_norm": 1.3595800399780273,
225
+ "learning_rate": 0.000883022221559489,
226
+ "loss": 2.331,
227
  "step": 30
228
  },
229
  {
230
+ "epoch": 0.062,
231
+ "grad_norm": 0.9844056367874146,
232
+ "learning_rate": 0.0008715724127386971,
233
+ "loss": 2.3781,
234
  "step": 31
235
  },
236
  {
237
+ "epoch": 0.064,
238
+ "grad_norm": 1.117148518562317,
239
+ "learning_rate": 0.0008596699001693256,
240
+ "loss": 2.4258,
241
  "step": 32
242
  },
243
  {
244
+ "epoch": 0.066,
245
+ "grad_norm": 0.7900739312171936,
246
+ "learning_rate": 0.0008473291852294987,
247
+ "loss": 2.437,
248
  "step": 33
249
  },
250
  {
251
+ "epoch": 0.068,
252
+ "grad_norm": 0.8672456741333008,
253
+ "learning_rate": 0.0008345653031794292,
254
+ "loss": 2.8025,
255
  "step": 34
256
  },
257
  {
258
+ "epoch": 0.07,
259
+ "grad_norm": 0.816504716873169,
260
+ "learning_rate": 0.0008213938048432696,
261
+ "loss": 2.5078,
262
  "step": 35
263
  },
264
  {
265
+ "epoch": 0.072,
266
+ "grad_norm": 1.0574641227722168,
267
+ "learning_rate": 0.0008078307376628291,
268
+ "loss": 2.6408,
269
  "step": 36
270
  },
271
  {
272
+ "epoch": 0.074,
273
+ "grad_norm": 0.6753240823745728,
274
+ "learning_rate": 0.0007938926261462366,
275
+ "loss": 2.2858,
276
  "step": 37
277
  },
278
  {
279
+ "epoch": 0.076,
280
+ "grad_norm": 0.9166250824928284,
281
+ "learning_rate": 0.0007795964517353734,
282
+ "loss": 2.7091,
283
  "step": 38
284
  },
285
  {
286
+ "epoch": 0.078,
287
+ "grad_norm": 0.9022424221038818,
288
+ "learning_rate": 0.0007649596321166025,
289
+ "loss": 2.6459,
290
  "step": 39
291
  },
292
  {
293
+ "epoch": 0.08,
294
+ "grad_norm": 0.7723848223686218,
295
+ "learning_rate": 0.00075,
296
+ "loss": 2.4329,
297
  "step": 40
298
  },
299
  {
300
+ "epoch": 0.082,
301
+ "grad_norm": 0.8669672012329102,
302
+ "learning_rate": 0.0007347357813929454,
303
+ "loss": 2.3661,
304
  "step": 41
305
  },
306
  {
307
+ "epoch": 0.084,
308
+ "grad_norm": 0.9701873660087585,
309
+ "learning_rate": 0.0007191855733945387,
310
+ "loss": 2.6723,
311
  "step": 42
312
  },
313
  {
314
+ "epoch": 0.086,
315
+ "grad_norm": 0.8038893342018127,
316
+ "learning_rate": 0.0007033683215379002,
317
+ "loss": 2.7652,
318
  "step": 43
319
  },
320
  {
321
+ "epoch": 0.088,
322
+ "grad_norm": 0.6812747716903687,
323
+ "learning_rate": 0.0006873032967079561,
324
+ "loss": 2.4019,
325
  "step": 44
326
  },
327
  {
328
+ "epoch": 0.09,
329
+ "grad_norm": 0.8909493088722229,
330
+ "learning_rate": 0.0006710100716628344,
331
+ "loss": 2.349,
332
  "step": 45
333
  },
334
  {
335
+ "epoch": 0.092,
336
+ "grad_norm": 0.9887206554412842,
337
+ "learning_rate": 0.0006545084971874737,
338
+ "loss": 2.5577,
339
  "step": 46
340
  },
341
  {
342
+ "epoch": 0.094,
343
+ "grad_norm": 0.7749077081680298,
344
+ "learning_rate": 0.0006378186779084996,
345
+ "loss": 2.2903,
346
  "step": 47
347
  },
348
  {
349
+ "epoch": 0.096,
350
+ "grad_norm": 1.0913500785827637,
351
+ "learning_rate": 0.0006209609477998338,
352
+ "loss": 2.3697,
353
  "step": 48
354
  },
355
  {
356
+ "epoch": 0.098,
357
+ "grad_norm": 0.894119381904602,
358
+ "learning_rate": 0.0006039558454088796,
359
+ "loss": 2.5167,
360
  "step": 49
361
  },
362
  {
363
+ "epoch": 0.1,
364
+ "grad_norm": 1.159035325050354,
365
+ "learning_rate": 0.0005868240888334653,
366
+ "loss": 2.4637,
367
  "step": 50
368
  },
369
  {
370
+ "epoch": 0.1,
371
+ "eval_loss": 2.5838444232940674,
372
+ "eval_runtime": 4.8707,
373
+ "eval_samples_per_second": 4.311,
374
+ "eval_steps_per_second": 4.311,
375
  "step": 50
376
  },
377
  {
378
+ "epoch": 0.102,
379
+ "grad_norm": 0.6844251751899719,
380
+ "learning_rate": 0.0005695865504800327,
381
+ "loss": 2.4118,
382
  "step": 51
383
  },
384
  {
385
+ "epoch": 0.104,
386
+ "grad_norm": 1.1709848642349243,
387
+ "learning_rate": 0.0005522642316338268,
388
+ "loss": 2.444,
389
  "step": 52
390
  },
391
  {
392
+ "epoch": 0.106,
393
+ "grad_norm": 0.9435467720031738,
394
+ "learning_rate": 0.0005348782368720626,
395
+ "loss": 2.5568,
396
  "step": 53
397
  },
398
  {
399
+ "epoch": 0.108,
400
+ "grad_norm": 1.0800719261169434,
401
+ "learning_rate": 0.0005174497483512506,
402
+ "loss": 2.5766,
403
  "step": 54
404
  },
405
  {
406
+ "epoch": 0.11,
407
+ "grad_norm": 1.001356840133667,
408
+ "learning_rate": 0.0005,
409
+ "loss": 2.2205,
410
  "step": 55
411
  },
412
  {
413
+ "epoch": 0.112,
414
+ "grad_norm": 1.4582829475402832,
415
+ "learning_rate": 0.0004825502516487497,
416
+ "loss": 2.7271,
417
  "step": 56
418
  },
419
  {
420
+ "epoch": 0.114,
421
+ "grad_norm": 0.8312236666679382,
422
+ "learning_rate": 0.00046512176312793734,
423
+ "loss": 2.3204,
424
  "step": 57
425
  },
426
  {
427
+ "epoch": 0.116,
428
+ "grad_norm": 1.2127161026000977,
429
+ "learning_rate": 0.00044773576836617336,
430
+ "loss": 2.0169,
431
  "step": 58
432
  },
433
  {
434
+ "epoch": 0.118,
435
+ "grad_norm": 1.6428215503692627,
436
+ "learning_rate": 0.0004304134495199674,
437
+ "loss": 2.4521,
438
  "step": 59
439
  },
440
  {
441
+ "epoch": 0.12,
442
+ "grad_norm": 1.7682443857192993,
443
+ "learning_rate": 0.00041317591116653486,
444
+ "loss": 2.6753,
445
  "step": 60
446
  },
447
  {
448
+ "epoch": 0.122,
449
+ "grad_norm": 1.0919681787490845,
450
+ "learning_rate": 0.0003960441545911204,
451
+ "loss": 2.4022,
452
  "step": 61
453
  },
454
  {
455
+ "epoch": 0.124,
456
+ "grad_norm": 2.5304136276245117,
457
+ "learning_rate": 0.0003790390522001662,
458
+ "loss": 2.4325,
459
  "step": 62
460
  },
461
  {
462
+ "epoch": 0.126,
463
+ "grad_norm": 1.1737953424453735,
464
+ "learning_rate": 0.00036218132209150044,
465
+ "loss": 2.2653,
466
  "step": 63
467
  },
468
  {
469
+ "epoch": 0.128,
470
+ "grad_norm": 0.7943472862243652,
471
+ "learning_rate": 0.00034549150281252633,
472
+ "loss": 2.6079,
473
  "step": 64
474
  },
475
  {
476
+ "epoch": 0.13,
477
+ "grad_norm": 1.3269349336624146,
478
+ "learning_rate": 0.0003289899283371657,
479
+ "loss": 2.3745,
480
  "step": 65
481
  },
482
  {
483
+ "epoch": 0.132,
484
+ "grad_norm": 0.8898394107818604,
485
+ "learning_rate": 0.00031269670329204396,
486
+ "loss": 2.3862,
487
  "step": 66
488
  },
489
  {
490
+ "epoch": 0.134,
491
+ "grad_norm": 0.8309778571128845,
492
+ "learning_rate": 0.0002966316784621,
493
+ "loss": 2.5131,
494
  "step": 67
495
  },
496
  {
497
+ "epoch": 0.136,
498
+ "grad_norm": 1.2103646993637085,
499
+ "learning_rate": 0.00028081442660546124,
500
+ "loss": 2.5138,
501
  "step": 68
502
  },
503
  {
504
+ "epoch": 0.138,
505
+ "grad_norm": 0.9281813502311707,
506
+ "learning_rate": 0.00026526421860705474,
507
+ "loss": 2.5798,
508
  "step": 69
509
  },
510
  {
511
+ "epoch": 0.14,
512
+ "grad_norm": 0.8275775909423828,
513
+ "learning_rate": 0.0002500000000000001,
514
+ "loss": 2.5348,
515
  "step": 70
516
  },
517
  {
518
+ "epoch": 0.142,
519
+ "grad_norm": 1.5009329319000244,
520
+ "learning_rate": 0.0002350403678833976,
521
+ "loss": 2.5156,
522
  "step": 71
523
  },
524
  {
525
+ "epoch": 0.144,
526
+ "grad_norm": 1.4796998500823975,
527
+ "learning_rate": 0.00022040354826462666,
528
+ "loss": 2.3567,
529
  "step": 72
530
  },
531
  {
532
+ "epoch": 0.146,
533
+ "grad_norm": 0.7437081933021545,
534
+ "learning_rate": 0.00020610737385376348,
535
+ "loss": 2.4399,
536
  "step": 73
537
  },
538
  {
539
+ "epoch": 0.148,
540
+ "grad_norm": 0.7033576369285583,
541
+ "learning_rate": 0.00019216926233717085,
542
+ "loss": 2.3149,
543
  "step": 74
544
  },
545
  {
546
+ "epoch": 0.15,
547
+ "grad_norm": 0.9651651978492737,
548
+ "learning_rate": 0.0001786061951567303,
549
+ "loss": 2.5816,
550
  "step": 75
551
  },
552
  {
553
+ "epoch": 0.152,
554
+ "grad_norm": 1.0059478282928467,
555
+ "learning_rate": 0.00016543469682057105,
556
+ "loss": 2.6395,
557
  "step": 76
558
  },
559
  {
560
+ "epoch": 0.154,
561
+ "grad_norm": 1.6795697212219238,
562
+ "learning_rate": 0.00015267081477050133,
563
+ "loss": 2.3551,
564
  "step": 77
565
  },
566
  {
567
+ "epoch": 0.156,
568
+ "grad_norm": 0.7962441444396973,
569
+ "learning_rate": 0.00014033009983067452,
570
+ "loss": 2.2151,
571
  "step": 78
572
  },
573
  {
574
+ "epoch": 0.158,
575
+ "grad_norm": 0.880089282989502,
576
+ "learning_rate": 0.00012842758726130281,
577
+ "loss": 2.4376,
578
  "step": 79
579
  },
580
  {
581
+ "epoch": 0.16,
582
+ "grad_norm": 1.0629572868347168,
583
+ "learning_rate": 0.00011697777844051105,
584
+ "loss": 2.6063,
585
  "step": 80
586
  },
587
  {
588
+ "epoch": 0.162,
589
+ "grad_norm": 0.8691402077674866,
590
+ "learning_rate": 0.00010599462319663906,
591
+ "loss": 2.4764,
592
  "step": 81
593
  },
594
  {
595
+ "epoch": 0.164,
596
+ "grad_norm": 0.8258126378059387,
597
+ "learning_rate": 9.549150281252633e-05,
598
+ "loss": 2.3996,
599
  "step": 82
600
  },
601
  {
602
+ "epoch": 0.166,
603
+ "grad_norm": 2.253006935119629,
604
+ "learning_rate": 8.548121372247918e-05,
605
+ "loss": 2.7106,
606
  "step": 83
607
  },
608
  {
609
+ "epoch": 0.168,
610
+ "grad_norm": 0.9351361393928528,
611
+ "learning_rate": 7.597595192178702e-05,
612
+ "loss": 2.3613,
613
  "step": 84
614
  },
615
  {
616
+ "epoch": 0.17,
617
+ "grad_norm": 0.8624694347381592,
618
+ "learning_rate": 6.698729810778065e-05,
619
+ "loss": 2.4328,
620
  "step": 85
621
  },
622
  {
623
+ "epoch": 0.172,
624
+ "grad_norm": 0.6949071884155273,
625
+ "learning_rate": 5.852620357053651e-05,
626
+ "loss": 2.4157,
627
  "step": 86
628
  },
629
  {
630
+ "epoch": 0.174,
631
+ "grad_norm": 0.7830259203910828,
632
+ "learning_rate": 5.060297685041659e-05,
633
+ "loss": 2.2797,
634
  "step": 87
635
  },
636
  {
637
+ "epoch": 0.176,
638
+ "grad_norm": 1.3727121353149414,
639
+ "learning_rate": 4.322727117869951e-05,
640
+ "loss": 2.6155,
641
  "step": 88
642
  },
643
  {
644
+ "epoch": 0.178,
645
+ "grad_norm": 0.6731472611427307,
646
+ "learning_rate": 3.6408072716606344e-05,
647
+ "loss": 2.4149,
648
  "step": 89
649
  },
650
  {
651
+ "epoch": 0.18,
652
+ "grad_norm": 0.846976101398468,
653
+ "learning_rate": 3.0153689607045842e-05,
654
+ "loss": 2.3137,
655
  "step": 90
656
  },
657
  {
658
+ "epoch": 0.182,
659
+ "grad_norm": 0.9294453859329224,
660
+ "learning_rate": 2.4471741852423235e-05,
661
+ "loss": 2.5798,
662
  "step": 91
663
  },
664
  {
665
+ "epoch": 0.184,
666
+ "grad_norm": 0.766918957233429,
667
+ "learning_rate": 1.9369152030840554e-05,
668
+ "loss": 2.6766,
669
  "step": 92
670
  },
671
  {
672
+ "epoch": 0.186,
673
+ "grad_norm": 1.3079534769058228,
674
+ "learning_rate": 1.4852136862001764e-05,
675
+ "loss": 2.6047,
676
  "step": 93
677
  },
678
  {
679
+ "epoch": 0.188,
680
+ "grad_norm": 1.1351994276046753,
681
+ "learning_rate": 1.0926199633097156e-05,
682
+ "loss": 2.6034,
683
  "step": 94
684
  },
685
  {
686
+ "epoch": 0.19,
687
+ "grad_norm": 0.8010856509208679,
688
+ "learning_rate": 7.59612349389599e-06,
689
+ "loss": 2.2994,
690
  "step": 95
691
  },
692
  {
693
+ "epoch": 0.192,
694
+ "grad_norm": 0.9184717535972595,
695
+ "learning_rate": 4.865965629214819e-06,
696
+ "loss": 2.5489,
697
  "step": 96
698
  },
699
  {
700
+ "epoch": 0.194,
701
+ "grad_norm": 0.9543655514717102,
702
+ "learning_rate": 2.739052315863355e-06,
703
+ "loss": 2.5186,
704
  "step": 97
705
  },
706
  {
707
+ "epoch": 0.196,
708
+ "grad_norm": 0.9216803908348083,
709
+ "learning_rate": 1.2179748700879012e-06,
710
+ "loss": 2.5627,
711
  "step": 98
712
  },
713
  {
714
+ "epoch": 0.198,
715
+ "grad_norm": 0.8810911178588867,
716
+ "learning_rate": 3.0458649045211895e-07,
717
+ "loss": 2.6527,
718
  "step": 99
719
  },
720
  {
721
+ "epoch": 0.2,
722
+ "grad_norm": 0.7426478266716003,
723
  "learning_rate": 0.0,
724
+ "loss": 2.1737,
725
  "step": 100
726
  },
727
  {
728
+ "epoch": 0.2,
729
+ "eval_loss": 2.527949094772339,
730
+ "eval_runtime": 4.9855,
731
+ "eval_samples_per_second": 4.212,
732
+ "eval_steps_per_second": 4.212,
733
  "step": 100
734
  }
735
  ],
 
737
  "max_steps": 100,
738
  "num_input_tokens_seen": 0,
739
  "num_train_epochs": 1,
740
+ "save_steps": 500,
741
  "stateful_callbacks": {
742
  "TrainerControl": {
743
  "args": {
 
750
  "attributes": {}
751
  }
752
  },
753
+ "total_flos": 1.62874924204032e+16,
754
  "train_batch_size": 1,
755
  "trial_name": null,
756
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa520e0ab0f5bd3a71845480ebc126400483d3f8f3790806982217002921d912
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8d9124138abd44af04b2c60a935bcab4ff5cdb3ea64e57559b87dc3f7e79065
3
  size 6776