chiayisu commited on
Commit
36807d0
·
verified ·
1 Parent(s): c6199d8

Delete llama/use-seq

Browse files
llama/use-seq/adapter_config.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "base_model_name_or_path": "decapoda-research/llama-7b-hf",
3
- "bias": "none",
4
- "enable_lora": null,
5
- "fan_in_fan_out": false,
6
- "inference_mode": true,
7
- "init_lora_weights": true,
8
- "lora_alpha": 16,
9
- "lora_dropout": 0.05,
10
- "merge_weights": false,
11
- "modules_to_save": null,
12
- "peft_type": "LORA",
13
- "r": 8,
14
- "target_modules": [
15
- "q_proj",
16
- "v_proj"
17
- ],
18
- "task_type": "CAUSAL_LM"
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama/use-seq/checkpoint-1070/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "",
4
- "lstrip": false,
5
- "normalized": true,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "",
11
- "lstrip": false,
12
- "normalized": true,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": "<unk>",
17
- "unk_token": {
18
- "content": "",
19
- "lstrip": false,
20
- "normalized": true,
21
- "rstrip": false,
22
- "single_word": false
23
- }
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama/use-seq/checkpoint-1070/tokenizer_config.json DELETED
@@ -1,34 +0,0 @@
1
- {
2
- "add_bos_token": true,
3
- "add_eos_token": true,
4
- "bos_token": {
5
- "__type": "AddedToken",
6
- "content": "",
7
- "lstrip": false,
8
- "normalized": true,
9
- "rstrip": false,
10
- "single_word": false
11
- },
12
- "clean_up_tokenization_spaces": false,
13
- "eos_token": {
14
- "__type": "AddedToken",
15
- "content": "",
16
- "lstrip": false,
17
- "normalized": true,
18
- "rstrip": false,
19
- "single_word": false
20
- },
21
- "model_max_length": 1000000000000000019884624838656,
22
- "pad_token": null,
23
- "sp_model_kwargs": {},
24
- "special_tokens_map_file": "/home/chiayi/.cache/huggingface/hub/models--decapoda-research--llama-7b-hf/snapshots/5f98eefcc80e437ef68d457ad7bf167c2c6a1348/special_tokens_map.json",
25
- "tokenizer_class": "LlamaTokenizer",
26
- "unk_token": {
27
- "__type": "AddedToken",
28
- "content": "",
29
- "lstrip": false,
30
- "normalized": true,
31
- "rstrip": false,
32
- "single_word": false
33
- }
34
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama/use-seq/checkpoint-1070/trainer_state.json DELETED
@@ -1,1190 +0,0 @@
1
- {
2
- "best_metric": 0.7264513969421387,
3
- "best_model_checkpoint": "lora-alpaca-use-seq/checkpoint-1070",
4
- "epoch": 0.794449987238683,
5
- "global_step": 1070,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.01,
12
- "eval_loss": 2.4221816062927246,
13
- "eval_runtime": 0.1575,
14
- "eval_samples_per_second": 6.348,
15
- "eval_steps_per_second": 6.348,
16
- "step": 10
17
- },
18
- {
19
- "epoch": 0.01,
20
- "learning_rate": 5.9999999999999995e-05,
21
- "loss": 2.4934,
22
- "step": 20
23
- },
24
- {
25
- "epoch": 0.01,
26
- "eval_loss": 2.3228158950805664,
27
- "eval_runtime": 0.1587,
28
- "eval_samples_per_second": 6.302,
29
- "eval_steps_per_second": 6.302,
30
- "step": 20
31
- },
32
- {
33
- "epoch": 0.02,
34
- "eval_loss": 1.8552848100662231,
35
- "eval_runtime": 0.1566,
36
- "eval_samples_per_second": 6.386,
37
- "eval_steps_per_second": 6.386,
38
- "step": 30
39
- },
40
- {
41
- "epoch": 0.03,
42
- "learning_rate": 0.000117,
43
- "loss": 1.9804,
44
- "step": 40
45
- },
46
- {
47
- "epoch": 0.03,
48
- "eval_loss": 1.2810053825378418,
49
- "eval_runtime": 0.1559,
50
- "eval_samples_per_second": 6.416,
51
- "eval_steps_per_second": 6.416,
52
- "step": 40
53
- },
54
- {
55
- "epoch": 0.04,
56
- "eval_loss": 0.9789324998855591,
57
- "eval_runtime": 0.1567,
58
- "eval_samples_per_second": 6.382,
59
- "eval_steps_per_second": 6.382,
60
- "step": 50
61
- },
62
- {
63
- "epoch": 0.04,
64
- "learning_rate": 0.00017699999999999997,
65
- "loss": 1.165,
66
- "step": 60
67
- },
68
- {
69
- "epoch": 0.04,
70
- "eval_loss": 0.9216629266738892,
71
- "eval_runtime": 0.1577,
72
- "eval_samples_per_second": 6.343,
73
- "eval_steps_per_second": 6.343,
74
- "step": 60
75
- },
76
- {
77
- "epoch": 0.05,
78
- "eval_loss": 0.8872115612030029,
79
- "eval_runtime": 0.1579,
80
- "eval_samples_per_second": 6.333,
81
- "eval_steps_per_second": 6.333,
82
- "step": 70
83
- },
84
- {
85
- "epoch": 0.06,
86
- "learning_rate": 0.000237,
87
- "loss": 0.9931,
88
- "step": 80
89
- },
90
- {
91
- "epoch": 0.06,
92
- "eval_loss": 0.8584132194519043,
93
- "eval_runtime": 0.1599,
94
- "eval_samples_per_second": 6.254,
95
- "eval_steps_per_second": 6.254,
96
- "step": 80
97
- },
98
- {
99
- "epoch": 0.07,
100
- "eval_loss": 0.9114644527435303,
101
- "eval_runtime": 0.1598,
102
- "eval_samples_per_second": 6.259,
103
- "eval_steps_per_second": 6.259,
104
- "step": 90
105
- },
106
- {
107
- "epoch": 0.07,
108
- "learning_rate": 0.00029699999999999996,
109
- "loss": 0.9545,
110
- "step": 100
111
- },
112
- {
113
- "epoch": 0.07,
114
- "eval_loss": 0.8909071087837219,
115
- "eval_runtime": 0.1603,
116
- "eval_samples_per_second": 6.24,
117
- "eval_steps_per_second": 6.24,
118
- "step": 100
119
- },
120
- {
121
- "epoch": 0.08,
122
- "eval_loss": 0.9121037721633911,
123
- "eval_runtime": 0.1598,
124
- "eval_samples_per_second": 6.259,
125
- "eval_steps_per_second": 6.259,
126
- "step": 110
127
- },
128
- {
129
- "epoch": 0.09,
130
- "learning_rate": 0.0002954253611556982,
131
- "loss": 0.8947,
132
- "step": 120
133
- },
134
- {
135
- "epoch": 0.09,
136
- "eval_loss": 0.8340358734130859,
137
- "eval_runtime": 0.1605,
138
- "eval_samples_per_second": 6.229,
139
- "eval_steps_per_second": 6.229,
140
- "step": 120
141
- },
142
- {
143
- "epoch": 0.1,
144
- "eval_loss": 0.8849254846572876,
145
- "eval_runtime": 0.158,
146
- "eval_samples_per_second": 6.33,
147
- "eval_steps_per_second": 6.33,
148
- "step": 130
149
- },
150
- {
151
- "epoch": 0.1,
152
- "learning_rate": 0.0002906099518459069,
153
- "loss": 0.8711,
154
- "step": 140
155
- },
156
- {
157
- "epoch": 0.1,
158
- "eval_loss": 0.8834758996963501,
159
- "eval_runtime": 0.157,
160
- "eval_samples_per_second": 6.368,
161
- "eval_steps_per_second": 6.368,
162
- "step": 140
163
- },
164
- {
165
- "epoch": 0.11,
166
- "eval_loss": 0.8293902277946472,
167
- "eval_runtime": 0.1583,
168
- "eval_samples_per_second": 6.316,
169
- "eval_steps_per_second": 6.316,
170
- "step": 150
171
- },
172
- {
173
- "epoch": 0.12,
174
- "learning_rate": 0.00028579454253611556,
175
- "loss": 0.8688,
176
- "step": 160
177
- },
178
- {
179
- "epoch": 0.12,
180
- "eval_loss": 0.8478549718856812,
181
- "eval_runtime": 0.1581,
182
- "eval_samples_per_second": 6.324,
183
- "eval_steps_per_second": 6.324,
184
- "step": 160
185
- },
186
- {
187
- "epoch": 0.13,
188
- "eval_loss": 0.8488869667053223,
189
- "eval_runtime": 0.159,
190
- "eval_samples_per_second": 6.288,
191
- "eval_steps_per_second": 6.288,
192
- "step": 170
193
- },
194
- {
195
- "epoch": 0.13,
196
- "learning_rate": 0.0002809791332263242,
197
- "loss": 0.8551,
198
- "step": 180
199
- },
200
- {
201
- "epoch": 0.13,
202
- "eval_loss": 0.814437747001648,
203
- "eval_runtime": 0.1575,
204
- "eval_samples_per_second": 6.349,
205
- "eval_steps_per_second": 6.349,
206
- "step": 180
207
- },
208
- {
209
- "epoch": 0.14,
210
- "eval_loss": 0.8222396969795227,
211
- "eval_runtime": 0.1583,
212
- "eval_samples_per_second": 6.317,
213
- "eval_steps_per_second": 6.317,
214
- "step": 190
215
- },
216
- {
217
- "epoch": 0.15,
218
- "learning_rate": 0.0002761637239165329,
219
- "loss": 0.8471,
220
- "step": 200
221
- },
222
- {
223
- "epoch": 0.15,
224
- "eval_loss": 0.8183712959289551,
225
- "eval_runtime": 0.1584,
226
- "eval_samples_per_second": 6.315,
227
- "eval_steps_per_second": 6.315,
228
- "step": 200
229
- },
230
- {
231
- "epoch": 0.16,
232
- "eval_loss": 0.8207969665527344,
233
- "eval_runtime": 0.1584,
234
- "eval_samples_per_second": 6.315,
235
- "eval_steps_per_second": 6.315,
236
- "step": 210
237
- },
238
- {
239
- "epoch": 0.16,
240
- "learning_rate": 0.00027134831460674157,
241
- "loss": 0.8578,
242
- "step": 220
243
- },
244
- {
245
- "epoch": 0.16,
246
- "eval_loss": 0.8103113174438477,
247
- "eval_runtime": 0.1584,
248
- "eval_samples_per_second": 6.315,
249
- "eval_steps_per_second": 6.315,
250
- "step": 220
251
- },
252
- {
253
- "epoch": 0.17,
254
- "eval_loss": 0.808212399482727,
255
- "eval_runtime": 0.1584,
256
- "eval_samples_per_second": 6.311,
257
- "eval_steps_per_second": 6.311,
258
- "step": 230
259
- },
260
- {
261
- "epoch": 0.18,
262
- "learning_rate": 0.00026653290529695024,
263
- "loss": 0.8389,
264
- "step": 240
265
- },
266
- {
267
- "epoch": 0.18,
268
- "eval_loss": 0.7917136549949646,
269
- "eval_runtime": 0.1575,
270
- "eval_samples_per_second": 6.351,
271
- "eval_steps_per_second": 6.351,
272
- "step": 240
273
- },
274
- {
275
- "epoch": 0.19,
276
- "eval_loss": 0.8259432315826416,
277
- "eval_runtime": 0.1603,
278
- "eval_samples_per_second": 6.24,
279
- "eval_steps_per_second": 6.24,
280
- "step": 250
281
- },
282
- {
283
- "epoch": 0.19,
284
- "learning_rate": 0.0002617174959871589,
285
- "loss": 0.8343,
286
- "step": 260
287
- },
288
- {
289
- "epoch": 0.19,
290
- "eval_loss": 0.7860772609710693,
291
- "eval_runtime": 0.1577,
292
- "eval_samples_per_second": 6.34,
293
- "eval_steps_per_second": 6.34,
294
- "step": 260
295
- },
296
- {
297
- "epoch": 0.2,
298
- "eval_loss": 0.8120028972625732,
299
- "eval_runtime": 0.1576,
300
- "eval_samples_per_second": 6.347,
301
- "eval_steps_per_second": 6.347,
302
- "step": 270
303
- },
304
- {
305
- "epoch": 0.21,
306
- "learning_rate": 0.0002569020866773676,
307
- "loss": 0.831,
308
- "step": 280
309
- },
310
- {
311
- "epoch": 0.21,
312
- "eval_loss": 0.7905743718147278,
313
- "eval_runtime": 0.157,
314
- "eval_samples_per_second": 6.368,
315
- "eval_steps_per_second": 6.368,
316
- "step": 280
317
- },
318
- {
319
- "epoch": 0.22,
320
- "eval_loss": 0.7759386301040649,
321
- "eval_runtime": 0.1576,
322
- "eval_samples_per_second": 6.344,
323
- "eval_steps_per_second": 6.344,
324
- "step": 290
325
- },
326
- {
327
- "epoch": 0.22,
328
- "learning_rate": 0.00025208667736757625,
329
- "loss": 0.8325,
330
- "step": 300
331
- },
332
- {
333
- "epoch": 0.22,
334
- "eval_loss": 0.8241894245147705,
335
- "eval_runtime": 0.1572,
336
- "eval_samples_per_second": 6.363,
337
- "eval_steps_per_second": 6.363,
338
- "step": 300
339
- },
340
- {
341
- "epoch": 0.23,
342
- "eval_loss": 0.7761509418487549,
343
- "eval_runtime": 0.1576,
344
- "eval_samples_per_second": 6.346,
345
- "eval_steps_per_second": 6.346,
346
- "step": 310
347
- },
348
- {
349
- "epoch": 0.24,
350
- "learning_rate": 0.0002472712680577849,
351
- "loss": 0.8147,
352
- "step": 320
353
- },
354
- {
355
- "epoch": 0.24,
356
- "eval_loss": 0.7789342403411865,
357
- "eval_runtime": 0.1572,
358
- "eval_samples_per_second": 6.363,
359
- "eval_steps_per_second": 6.363,
360
- "step": 320
361
- },
362
- {
363
- "epoch": 0.25,
364
- "eval_loss": 0.7757179737091064,
365
- "eval_runtime": 0.1578,
366
- "eval_samples_per_second": 6.339,
367
- "eval_steps_per_second": 6.339,
368
- "step": 330
369
- },
370
- {
371
- "epoch": 0.25,
372
- "learning_rate": 0.00024245585874799357,
373
- "loss": 0.8204,
374
- "step": 340
375
- },
376
- {
377
- "epoch": 0.25,
378
- "eval_loss": 0.7954628467559814,
379
- "eval_runtime": 0.1576,
380
- "eval_samples_per_second": 6.344,
381
- "eval_steps_per_second": 6.344,
382
- "step": 340
383
- },
384
- {
385
- "epoch": 0.26,
386
- "eval_loss": 0.7957539558410645,
387
- "eval_runtime": 0.1584,
388
- "eval_samples_per_second": 6.315,
389
- "eval_steps_per_second": 6.315,
390
- "step": 350
391
- },
392
- {
393
- "epoch": 0.27,
394
- "learning_rate": 0.00023764044943820224,
395
- "loss": 0.8232,
396
- "step": 360
397
- },
398
- {
399
- "epoch": 0.27,
400
- "eval_loss": 0.7863476276397705,
401
- "eval_runtime": 0.1571,
402
- "eval_samples_per_second": 6.367,
403
- "eval_steps_per_second": 6.367,
404
- "step": 360
405
- },
406
- {
407
- "epoch": 0.27,
408
- "eval_loss": 0.7858812808990479,
409
- "eval_runtime": 0.1583,
410
- "eval_samples_per_second": 6.317,
411
- "eval_steps_per_second": 6.317,
412
- "step": 370
413
- },
414
- {
415
- "epoch": 0.28,
416
- "learning_rate": 0.00023282504012841088,
417
- "loss": 0.8129,
418
- "step": 380
419
- },
420
- {
421
- "epoch": 0.28,
422
- "eval_loss": 0.785178005695343,
423
- "eval_runtime": 0.1577,
424
- "eval_samples_per_second": 6.342,
425
- "eval_steps_per_second": 6.342,
426
- "step": 380
427
- },
428
- {
429
- "epoch": 0.29,
430
- "eval_loss": 0.8090522289276123,
431
- "eval_runtime": 0.1583,
432
- "eval_samples_per_second": 6.318,
433
- "eval_steps_per_second": 6.318,
434
- "step": 390
435
- },
436
- {
437
- "epoch": 0.3,
438
- "learning_rate": 0.00022800963081861955,
439
- "loss": 0.8193,
440
- "step": 400
441
- },
442
- {
443
- "epoch": 0.3,
444
- "eval_loss": 0.7978605031967163,
445
- "eval_runtime": 0.1573,
446
- "eval_samples_per_second": 6.357,
447
- "eval_steps_per_second": 6.357,
448
- "step": 400
449
- },
450
- {
451
- "epoch": 0.3,
452
- "eval_loss": 0.7799659371376038,
453
- "eval_runtime": 0.1581,
454
- "eval_samples_per_second": 6.327,
455
- "eval_steps_per_second": 6.327,
456
- "step": 410
457
- },
458
- {
459
- "epoch": 0.31,
460
- "learning_rate": 0.00022319422150882823,
461
- "loss": 0.8072,
462
- "step": 420
463
- },
464
- {
465
- "epoch": 0.31,
466
- "eval_loss": 0.7800132036209106,
467
- "eval_runtime": 0.1573,
468
- "eval_samples_per_second": 6.359,
469
- "eval_steps_per_second": 6.359,
470
- "step": 420
471
- },
472
- {
473
- "epoch": 0.32,
474
- "eval_loss": 0.7845722436904907,
475
- "eval_runtime": 0.158,
476
- "eval_samples_per_second": 6.329,
477
- "eval_steps_per_second": 6.329,
478
- "step": 430
479
- },
480
- {
481
- "epoch": 0.33,
482
- "learning_rate": 0.0002183788121990369,
483
- "loss": 0.8152,
484
- "step": 440
485
- },
486
- {
487
- "epoch": 0.33,
488
- "eval_loss": 0.7644073963165283,
489
- "eval_runtime": 0.1571,
490
- "eval_samples_per_second": 6.367,
491
- "eval_steps_per_second": 6.367,
492
- "step": 440
493
- },
494
- {
495
- "epoch": 0.33,
496
- "eval_loss": 0.7579188346862793,
497
- "eval_runtime": 0.1571,
498
- "eval_samples_per_second": 6.367,
499
- "eval_steps_per_second": 6.367,
500
- "step": 450
501
- },
502
- {
503
- "epoch": 0.34,
504
- "learning_rate": 0.00021356340288924557,
505
- "loss": 0.8074,
506
- "step": 460
507
- },
508
- {
509
- "epoch": 0.34,
510
- "eval_loss": 0.7676059007644653,
511
- "eval_runtime": 0.1577,
512
- "eval_samples_per_second": 6.341,
513
- "eval_steps_per_second": 6.341,
514
- "step": 460
515
- },
516
- {
517
- "epoch": 0.35,
518
- "eval_loss": 0.7583163976669312,
519
- "eval_runtime": 0.1573,
520
- "eval_samples_per_second": 6.358,
521
- "eval_steps_per_second": 6.358,
522
- "step": 470
523
- },
524
- {
525
- "epoch": 0.36,
526
- "learning_rate": 0.00020874799357945424,
527
- "loss": 0.8118,
528
- "step": 480
529
- },
530
- {
531
- "epoch": 0.36,
532
- "eval_loss": 0.7768086194992065,
533
- "eval_runtime": 0.1577,
534
- "eval_samples_per_second": 6.341,
535
- "eval_steps_per_second": 6.341,
536
- "step": 480
537
- },
538
- {
539
- "epoch": 0.36,
540
- "eval_loss": 0.7975252270698547,
541
- "eval_runtime": 0.157,
542
- "eval_samples_per_second": 6.37,
543
- "eval_steps_per_second": 6.37,
544
- "step": 490
545
- },
546
- {
547
- "epoch": 0.37,
548
- "learning_rate": 0.0002039325842696629,
549
- "loss": 0.8098,
550
- "step": 500
551
- },
552
- {
553
- "epoch": 0.37,
554
- "eval_loss": 0.8035451173782349,
555
- "eval_runtime": 0.1565,
556
- "eval_samples_per_second": 6.388,
557
- "eval_steps_per_second": 6.388,
558
- "step": 500
559
- },
560
- {
561
- "epoch": 0.38,
562
- "eval_loss": 0.7718653678894043,
563
- "eval_runtime": 0.1575,
564
- "eval_samples_per_second": 6.347,
565
- "eval_steps_per_second": 6.347,
566
- "step": 510
567
- },
568
- {
569
- "epoch": 0.39,
570
- "learning_rate": 0.00019911717495987158,
571
- "loss": 0.8167,
572
- "step": 520
573
- },
574
- {
575
- "epoch": 0.39,
576
- "eval_loss": 0.771480917930603,
577
- "eval_runtime": 0.1568,
578
- "eval_samples_per_second": 6.379,
579
- "eval_steps_per_second": 6.379,
580
- "step": 520
581
- },
582
- {
583
- "epoch": 0.39,
584
- "eval_loss": 0.7757036685943604,
585
- "eval_runtime": 0.1571,
586
- "eval_samples_per_second": 6.364,
587
- "eval_steps_per_second": 6.364,
588
- "step": 530
589
- },
590
- {
591
- "epoch": 0.4,
592
- "learning_rate": 0.00019430176565008025,
593
- "loss": 0.8071,
594
- "step": 540
595
- },
596
- {
597
- "epoch": 0.4,
598
- "eval_loss": 0.8048112392425537,
599
- "eval_runtime": 0.1563,
600
- "eval_samples_per_second": 6.396,
601
- "eval_steps_per_second": 6.396,
602
- "step": 540
603
- },
604
- {
605
- "epoch": 0.41,
606
- "eval_loss": 0.7987676858901978,
607
- "eval_runtime": 0.1562,
608
- "eval_samples_per_second": 6.402,
609
- "eval_steps_per_second": 6.402,
610
- "step": 550
611
- },
612
- {
613
- "epoch": 0.42,
614
- "learning_rate": 0.00018948635634028892,
615
- "loss": 0.8143,
616
- "step": 560
617
- },
618
- {
619
- "epoch": 0.42,
620
- "eval_loss": 0.7960355281829834,
621
- "eval_runtime": 0.159,
622
- "eval_samples_per_second": 6.29,
623
- "eval_steps_per_second": 6.29,
624
- "step": 560
625
- },
626
- {
627
- "epoch": 0.42,
628
- "eval_loss": 0.7673896551132202,
629
- "eval_runtime": 0.1569,
630
- "eval_samples_per_second": 6.372,
631
- "eval_steps_per_second": 6.372,
632
- "step": 570
633
- },
634
- {
635
- "epoch": 0.43,
636
- "learning_rate": 0.0001846709470304976,
637
- "loss": 0.7989,
638
- "step": 580
639
- },
640
- {
641
- "epoch": 0.43,
642
- "eval_loss": 0.7655194997787476,
643
- "eval_runtime": 0.1578,
644
- "eval_samples_per_second": 6.336,
645
- "eval_steps_per_second": 6.336,
646
- "step": 580
647
- },
648
- {
649
- "epoch": 0.44,
650
- "eval_loss": 0.7695807218551636,
651
- "eval_runtime": 0.1571,
652
- "eval_samples_per_second": 6.365,
653
- "eval_steps_per_second": 6.365,
654
- "step": 590
655
- },
656
- {
657
- "epoch": 0.45,
658
- "learning_rate": 0.00017985553772070626,
659
- "loss": 0.8121,
660
- "step": 600
661
- },
662
- {
663
- "epoch": 0.45,
664
- "eval_loss": 0.7599303722381592,
665
- "eval_runtime": 0.1566,
666
- "eval_samples_per_second": 6.385,
667
- "eval_steps_per_second": 6.385,
668
- "step": 600
669
- },
670
- {
671
- "epoch": 0.45,
672
- "eval_loss": 0.7409216165542603,
673
- "eval_runtime": 0.1565,
674
- "eval_samples_per_second": 6.389,
675
- "eval_steps_per_second": 6.389,
676
- "step": 610
677
- },
678
- {
679
- "epoch": 0.46,
680
- "learning_rate": 0.00017504012841091494,
681
- "loss": 0.8105,
682
- "step": 620
683
- },
684
- {
685
- "epoch": 0.46,
686
- "eval_loss": 0.7620519995689392,
687
- "eval_runtime": 0.158,
688
- "eval_samples_per_second": 6.331,
689
- "eval_steps_per_second": 6.331,
690
- "step": 620
691
- },
692
- {
693
- "epoch": 0.47,
694
- "eval_loss": 0.7642089128494263,
695
- "eval_runtime": 0.1573,
696
- "eval_samples_per_second": 6.356,
697
- "eval_steps_per_second": 6.356,
698
- "step": 630
699
- },
700
- {
701
- "epoch": 0.48,
702
- "learning_rate": 0.0001702247191011236,
703
- "loss": 0.8073,
704
- "step": 640
705
- },
706
- {
707
- "epoch": 0.48,
708
- "eval_loss": 0.7464691400527954,
709
- "eval_runtime": 0.156,
710
- "eval_samples_per_second": 6.409,
711
- "eval_steps_per_second": 6.409,
712
- "step": 640
713
- },
714
- {
715
- "epoch": 0.48,
716
- "eval_loss": 0.7520545721054077,
717
- "eval_runtime": 0.158,
718
- "eval_samples_per_second": 6.328,
719
- "eval_steps_per_second": 6.328,
720
- "step": 650
721
- },
722
- {
723
- "epoch": 0.49,
724
- "learning_rate": 0.00016540930979133222,
725
- "loss": 0.8115,
726
- "step": 660
727
- },
728
- {
729
- "epoch": 0.49,
730
- "eval_loss": 0.7851120233535767,
731
- "eval_runtime": 0.1573,
732
- "eval_samples_per_second": 6.358,
733
- "eval_steps_per_second": 6.358,
734
- "step": 660
735
- },
736
- {
737
- "epoch": 0.5,
738
- "eval_loss": 0.7845426797866821,
739
- "eval_runtime": 0.1567,
740
- "eval_samples_per_second": 6.381,
741
- "eval_steps_per_second": 6.381,
742
- "step": 670
743
- },
744
- {
745
- "epoch": 0.5,
746
- "learning_rate": 0.0001605939004815409,
747
- "loss": 0.8213,
748
- "step": 680
749
- },
750
- {
751
- "epoch": 0.5,
752
- "eval_loss": 0.7726190686225891,
753
- "eval_runtime": 0.157,
754
- "eval_samples_per_second": 6.37,
755
- "eval_steps_per_second": 6.37,
756
- "step": 680
757
- },
758
- {
759
- "epoch": 0.51,
760
- "eval_loss": 0.7661857604980469,
761
- "eval_runtime": 0.1571,
762
- "eval_samples_per_second": 6.366,
763
- "eval_steps_per_second": 6.366,
764
- "step": 690
765
- },
766
- {
767
- "epoch": 0.52,
768
- "learning_rate": 0.00015577849117174957,
769
- "loss": 0.8155,
770
- "step": 700
771
- },
772
- {
773
- "epoch": 0.52,
774
- "eval_loss": 0.7846134305000305,
775
- "eval_runtime": 0.1564,
776
- "eval_samples_per_second": 6.394,
777
- "eval_steps_per_second": 6.394,
778
- "step": 700
779
- },
780
- {
781
- "epoch": 0.53,
782
- "eval_loss": 0.7888688445091248,
783
- "eval_runtime": 0.1581,
784
- "eval_samples_per_second": 6.324,
785
- "eval_steps_per_second": 6.324,
786
- "step": 710
787
- },
788
- {
789
- "epoch": 0.53,
790
- "learning_rate": 0.00015096308186195824,
791
- "loss": 0.803,
792
- "step": 720
793
- },
794
- {
795
- "epoch": 0.53,
796
- "eval_loss": 0.7981730103492737,
797
- "eval_runtime": 0.1566,
798
- "eval_samples_per_second": 6.385,
799
- "eval_steps_per_second": 6.385,
800
- "step": 720
801
- },
802
- {
803
- "epoch": 0.54,
804
- "eval_loss": 0.8129211664199829,
805
- "eval_runtime": 0.1562,
806
- "eval_samples_per_second": 6.401,
807
- "eval_steps_per_second": 6.401,
808
- "step": 730
809
- },
810
- {
811
- "epoch": 0.55,
812
- "learning_rate": 0.0001461476725521669,
813
- "loss": 0.8108,
814
- "step": 740
815
- },
816
- {
817
- "epoch": 0.55,
818
- "eval_loss": 0.7717241644859314,
819
- "eval_runtime": 0.1569,
820
- "eval_samples_per_second": 6.375,
821
- "eval_steps_per_second": 6.375,
822
- "step": 740
823
- },
824
- {
825
- "epoch": 0.56,
826
- "eval_loss": 0.7536574602127075,
827
- "eval_runtime": 0.1567,
828
- "eval_samples_per_second": 6.38,
829
- "eval_steps_per_second": 6.38,
830
- "step": 750
831
- },
832
- {
833
- "epoch": 0.56,
834
- "learning_rate": 0.00014133226324237558,
835
- "loss": 0.804,
836
- "step": 760
837
- },
838
- {
839
- "epoch": 0.56,
840
- "eval_loss": 0.7452749013900757,
841
- "eval_runtime": 0.1564,
842
- "eval_samples_per_second": 6.392,
843
- "eval_steps_per_second": 6.392,
844
- "step": 760
845
- },
846
- {
847
- "epoch": 0.57,
848
- "eval_loss": 0.7427541613578796,
849
- "eval_runtime": 0.1573,
850
- "eval_samples_per_second": 6.358,
851
- "eval_steps_per_second": 6.358,
852
- "step": 770
853
- },
854
- {
855
- "epoch": 0.58,
856
- "learning_rate": 0.00013651685393258425,
857
- "loss": 0.7983,
858
- "step": 780
859
- },
860
- {
861
- "epoch": 0.58,
862
- "eval_loss": 0.7469484210014343,
863
- "eval_runtime": 0.1562,
864
- "eval_samples_per_second": 6.404,
865
- "eval_steps_per_second": 6.404,
866
- "step": 780
867
- },
868
- {
869
- "epoch": 0.59,
870
- "eval_loss": 0.7667011022567749,
871
- "eval_runtime": 0.1568,
872
- "eval_samples_per_second": 6.376,
873
- "eval_steps_per_second": 6.376,
874
- "step": 790
875
- },
876
- {
877
- "epoch": 0.59,
878
- "learning_rate": 0.00013170144462279292,
879
- "loss": 0.8054,
880
- "step": 800
881
- },
882
- {
883
- "epoch": 0.59,
884
- "eval_loss": 0.7562040090560913,
885
- "eval_runtime": 0.1651,
886
- "eval_samples_per_second": 6.057,
887
- "eval_steps_per_second": 6.057,
888
- "step": 800
889
- },
890
- {
891
- "epoch": 0.6,
892
- "eval_loss": 0.7466799020767212,
893
- "eval_runtime": 0.1572,
894
- "eval_samples_per_second": 6.359,
895
- "eval_steps_per_second": 6.359,
896
- "step": 810
897
- },
898
- {
899
- "epoch": 0.61,
900
- "learning_rate": 0.0001268860353130016,
901
- "loss": 0.8143,
902
- "step": 820
903
- },
904
- {
905
- "epoch": 0.61,
906
- "eval_loss": 0.7601323127746582,
907
- "eval_runtime": 0.1768,
908
- "eval_samples_per_second": 5.657,
909
- "eval_steps_per_second": 5.657,
910
- "step": 820
911
- },
912
- {
913
- "epoch": 0.62,
914
- "eval_loss": 0.7430717349052429,
915
- "eval_runtime": 0.1575,
916
- "eval_samples_per_second": 6.349,
917
- "eval_steps_per_second": 6.349,
918
- "step": 830
919
- },
920
- {
921
- "epoch": 0.62,
922
- "learning_rate": 0.00012207062600321026,
923
- "loss": 0.8038,
924
- "step": 840
925
- },
926
- {
927
- "epoch": 0.62,
928
- "eval_loss": 0.7633467316627502,
929
- "eval_runtime": 0.1577,
930
- "eval_samples_per_second": 6.342,
931
- "eval_steps_per_second": 6.342,
932
- "step": 840
933
- },
934
- {
935
- "epoch": 0.63,
936
- "eval_loss": 0.7451119422912598,
937
- "eval_runtime": 0.1575,
938
- "eval_samples_per_second": 6.349,
939
- "eval_steps_per_second": 6.349,
940
- "step": 850
941
- },
942
- {
943
- "epoch": 0.64,
944
- "learning_rate": 0.00011725521669341892,
945
- "loss": 0.8053,
946
- "step": 860
947
- },
948
- {
949
- "epoch": 0.64,
950
- "eval_loss": 0.7628670930862427,
951
- "eval_runtime": 0.1578,
952
- "eval_samples_per_second": 6.338,
953
- "eval_steps_per_second": 6.338,
954
- "step": 860
955
- },
956
- {
957
- "epoch": 0.65,
958
- "eval_loss": 0.7490127086639404,
959
- "eval_runtime": 0.1571,
960
- "eval_samples_per_second": 6.364,
961
- "eval_steps_per_second": 6.364,
962
- "step": 870
963
- },
964
- {
965
- "epoch": 0.65,
966
- "learning_rate": 0.00011243980738362759,
967
- "loss": 0.8007,
968
- "step": 880
969
- },
970
- {
971
- "epoch": 0.65,
972
- "eval_loss": 0.767558753490448,
973
- "eval_runtime": 0.1564,
974
- "eval_samples_per_second": 6.392,
975
- "eval_steps_per_second": 6.392,
976
- "step": 880
977
- },
978
- {
979
- "epoch": 0.66,
980
- "eval_loss": 0.7646104097366333,
981
- "eval_runtime": 0.1571,
982
- "eval_samples_per_second": 6.365,
983
- "eval_steps_per_second": 6.365,
984
- "step": 890
985
- },
986
- {
987
- "epoch": 0.67,
988
- "learning_rate": 0.00010762439807383626,
989
- "loss": 0.7999,
990
- "step": 900
991
- },
992
- {
993
- "epoch": 0.67,
994
- "eval_loss": 0.7580606937408447,
995
- "eval_runtime": 0.1559,
996
- "eval_samples_per_second": 6.414,
997
- "eval_steps_per_second": 6.414,
998
- "step": 900
999
- },
1000
- {
1001
- "epoch": 0.68,
1002
- "eval_loss": 0.7789003849029541,
1003
- "eval_runtime": 0.1568,
1004
- "eval_samples_per_second": 6.377,
1005
- "eval_steps_per_second": 6.377,
1006
- "step": 910
1007
- },
1008
- {
1009
- "epoch": 0.68,
1010
- "learning_rate": 0.00010280898876404493,
1011
- "loss": 0.8007,
1012
- "step": 920
1013
- },
1014
- {
1015
- "epoch": 0.68,
1016
- "eval_loss": 0.7511205077171326,
1017
- "eval_runtime": 0.1571,
1018
- "eval_samples_per_second": 6.365,
1019
- "eval_steps_per_second": 6.365,
1020
- "step": 920
1021
- },
1022
- {
1023
- "epoch": 0.69,
1024
- "eval_loss": 0.7637555599212646,
1025
- "eval_runtime": 0.1567,
1026
- "eval_samples_per_second": 6.38,
1027
- "eval_steps_per_second": 6.38,
1028
- "step": 930
1029
- },
1030
- {
1031
- "epoch": 0.7,
1032
- "learning_rate": 9.79935794542536e-05,
1033
- "loss": 0.8006,
1034
- "step": 940
1035
- },
1036
- {
1037
- "epoch": 0.7,
1038
- "eval_loss": 0.7561322450637817,
1039
- "eval_runtime": 0.1582,
1040
- "eval_samples_per_second": 6.322,
1041
- "eval_steps_per_second": 6.322,
1042
- "step": 940
1043
- },
1044
- {
1045
- "epoch": 0.71,
1046
- "eval_loss": 0.753495454788208,
1047
- "eval_runtime": 0.1564,
1048
- "eval_samples_per_second": 6.395,
1049
- "eval_steps_per_second": 6.395,
1050
- "step": 950
1051
- },
1052
- {
1053
- "epoch": 0.71,
1054
- "learning_rate": 9.317817014446228e-05,
1055
- "loss": 0.8055,
1056
- "step": 960
1057
- },
1058
- {
1059
- "epoch": 0.71,
1060
- "eval_loss": 0.7372075319290161,
1061
- "eval_runtime": 0.1564,
1062
- "eval_samples_per_second": 6.396,
1063
- "eval_steps_per_second": 6.396,
1064
- "step": 960
1065
- },
1066
- {
1067
- "epoch": 0.72,
1068
- "eval_loss": 0.761570394039154,
1069
- "eval_runtime": 0.158,
1070
- "eval_samples_per_second": 6.329,
1071
- "eval_steps_per_second": 6.329,
1072
- "step": 970
1073
- },
1074
- {
1075
- "epoch": 0.73,
1076
- "learning_rate": 8.836276083467093e-05,
1077
- "loss": 0.7951,
1078
- "step": 980
1079
- },
1080
- {
1081
- "epoch": 0.73,
1082
- "eval_loss": 0.7430101037025452,
1083
- "eval_runtime": 0.1566,
1084
- "eval_samples_per_second": 6.386,
1085
- "eval_steps_per_second": 6.386,
1086
- "step": 980
1087
- },
1088
- {
1089
- "epoch": 0.74,
1090
- "eval_loss": 0.7443385124206543,
1091
- "eval_runtime": 0.1567,
1092
- "eval_samples_per_second": 6.381,
1093
- "eval_steps_per_second": 6.381,
1094
- "step": 990
1095
- },
1096
- {
1097
- "epoch": 0.74,
1098
- "learning_rate": 8.35473515248796e-05,
1099
- "loss": 0.8049,
1100
- "step": 1000
1101
- },
1102
- {
1103
- "epoch": 0.74,
1104
- "eval_loss": 0.7466243505477905,
1105
- "eval_runtime": 0.1568,
1106
- "eval_samples_per_second": 6.376,
1107
- "eval_steps_per_second": 6.376,
1108
- "step": 1000
1109
- },
1110
- {
1111
- "epoch": 0.75,
1112
- "eval_loss": 0.7397581934928894,
1113
- "eval_runtime": 0.1575,
1114
- "eval_samples_per_second": 6.347,
1115
- "eval_steps_per_second": 6.347,
1116
- "step": 1010
1117
- },
1118
- {
1119
- "epoch": 0.76,
1120
- "learning_rate": 7.873194221508827e-05,
1121
- "loss": 0.8131,
1122
- "step": 1020
1123
- },
1124
- {
1125
- "epoch": 0.76,
1126
- "eval_loss": 0.7271538972854614,
1127
- "eval_runtime": 0.1575,
1128
- "eval_samples_per_second": 6.35,
1129
- "eval_steps_per_second": 6.35,
1130
- "step": 1020
1131
- },
1132
- {
1133
- "epoch": 0.76,
1134
- "eval_loss": 0.7354034781455994,
1135
- "eval_runtime": 0.1578,
1136
- "eval_samples_per_second": 6.337,
1137
- "eval_steps_per_second": 6.337,
1138
- "step": 1030
1139
- },
1140
- {
1141
- "epoch": 0.77,
1142
- "learning_rate": 7.391653290529695e-05,
1143
- "loss": 0.7991,
1144
- "step": 1040
1145
- },
1146
- {
1147
- "epoch": 0.77,
1148
- "eval_loss": 0.7297641038894653,
1149
- "eval_runtime": 0.1764,
1150
- "eval_samples_per_second": 5.67,
1151
- "eval_steps_per_second": 5.67,
1152
- "step": 1040
1153
- },
1154
- {
1155
- "epoch": 0.78,
1156
- "eval_loss": 0.7396876811981201,
1157
- "eval_runtime": 0.158,
1158
- "eval_samples_per_second": 6.327,
1159
- "eval_steps_per_second": 6.327,
1160
- "step": 1050
1161
- },
1162
- {
1163
- "epoch": 0.79,
1164
- "learning_rate": 6.910112359550562e-05,
1165
- "loss": 0.8075,
1166
- "step": 1060
1167
- },
1168
- {
1169
- "epoch": 0.79,
1170
- "eval_loss": 0.7296563386917114,
1171
- "eval_runtime": 0.1577,
1172
- "eval_samples_per_second": 6.343,
1173
- "eval_steps_per_second": 6.343,
1174
- "step": 1060
1175
- },
1176
- {
1177
- "epoch": 0.79,
1178
- "eval_loss": 0.7264513969421387,
1179
- "eval_runtime": 0.1571,
1180
- "eval_samples_per_second": 6.366,
1181
- "eval_steps_per_second": 6.366,
1182
- "step": 1070
1183
- }
1184
- ],
1185
- "max_steps": 1346,
1186
- "num_train_epochs": 1,
1187
- "total_flos": 1.3908729346916352e+18,
1188
- "trial_name": null,
1189
- "trial_params": null
1190
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama/use-seq/checkpoint-1330/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "",
4
- "lstrip": false,
5
- "normalized": true,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "",
11
- "lstrip": false,
12
- "normalized": true,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": "<unk>",
17
- "unk_token": {
18
- "content": "",
19
- "lstrip": false,
20
- "normalized": true,
21
- "rstrip": false,
22
- "single_word": false
23
- }
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama/use-seq/checkpoint-1330/tokenizer_config.json DELETED
@@ -1,34 +0,0 @@
1
- {
2
- "add_bos_token": true,
3
- "add_eos_token": true,
4
- "bos_token": {
5
- "__type": "AddedToken",
6
- "content": "",
7
- "lstrip": false,
8
- "normalized": true,
9
- "rstrip": false,
10
- "single_word": false
11
- },
12
- "clean_up_tokenization_spaces": false,
13
- "eos_token": {
14
- "__type": "AddedToken",
15
- "content": "",
16
- "lstrip": false,
17
- "normalized": true,
18
- "rstrip": false,
19
- "single_word": false
20
- },
21
- "model_max_length": 1000000000000000019884624838656,
22
- "pad_token": null,
23
- "sp_model_kwargs": {},
24
- "special_tokens_map_file": "/home/chiayi/.cache/huggingface/hub/models--decapoda-research--llama-7b-hf/snapshots/5f98eefcc80e437ef68d457ad7bf167c2c6a1348/special_tokens_map.json",
25
- "tokenizer_class": "LlamaTokenizer",
26
- "unk_token": {
27
- "__type": "AddedToken",
28
- "content": "",
29
- "lstrip": false,
30
- "normalized": true,
31
- "rstrip": false,
32
- "single_word": false
33
- }
34
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama/use-seq/checkpoint-1330/trainer_state.json DELETED
@@ -1,1476 +0,0 @@
1
- {
2
- "best_metric": 0.7264513969421387,
3
- "best_model_checkpoint": "lora-alpaca-use-seq/checkpoint-1070",
4
- "epoch": 0.9874939093714471,
5
- "global_step": 1330,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.01,
12
- "eval_loss": 2.4221816062927246,
13
- "eval_runtime": 0.1575,
14
- "eval_samples_per_second": 6.348,
15
- "eval_steps_per_second": 6.348,
16
- "step": 10
17
- },
18
- {
19
- "epoch": 0.01,
20
- "learning_rate": 5.9999999999999995e-05,
21
- "loss": 2.4934,
22
- "step": 20
23
- },
24
- {
25
- "epoch": 0.01,
26
- "eval_loss": 2.3228158950805664,
27
- "eval_runtime": 0.1587,
28
- "eval_samples_per_second": 6.302,
29
- "eval_steps_per_second": 6.302,
30
- "step": 20
31
- },
32
- {
33
- "epoch": 0.02,
34
- "eval_loss": 1.8552848100662231,
35
- "eval_runtime": 0.1566,
36
- "eval_samples_per_second": 6.386,
37
- "eval_steps_per_second": 6.386,
38
- "step": 30
39
- },
40
- {
41
- "epoch": 0.03,
42
- "learning_rate": 0.000117,
43
- "loss": 1.9804,
44
- "step": 40
45
- },
46
- {
47
- "epoch": 0.03,
48
- "eval_loss": 1.2810053825378418,
49
- "eval_runtime": 0.1559,
50
- "eval_samples_per_second": 6.416,
51
- "eval_steps_per_second": 6.416,
52
- "step": 40
53
- },
54
- {
55
- "epoch": 0.04,
56
- "eval_loss": 0.9789324998855591,
57
- "eval_runtime": 0.1567,
58
- "eval_samples_per_second": 6.382,
59
- "eval_steps_per_second": 6.382,
60
- "step": 50
61
- },
62
- {
63
- "epoch": 0.04,
64
- "learning_rate": 0.00017699999999999997,
65
- "loss": 1.165,
66
- "step": 60
67
- },
68
- {
69
- "epoch": 0.04,
70
- "eval_loss": 0.9216629266738892,
71
- "eval_runtime": 0.1577,
72
- "eval_samples_per_second": 6.343,
73
- "eval_steps_per_second": 6.343,
74
- "step": 60
75
- },
76
- {
77
- "epoch": 0.05,
78
- "eval_loss": 0.8872115612030029,
79
- "eval_runtime": 0.1579,
80
- "eval_samples_per_second": 6.333,
81
- "eval_steps_per_second": 6.333,
82
- "step": 70
83
- },
84
- {
85
- "epoch": 0.06,
86
- "learning_rate": 0.000237,
87
- "loss": 0.9931,
88
- "step": 80
89
- },
90
- {
91
- "epoch": 0.06,
92
- "eval_loss": 0.8584132194519043,
93
- "eval_runtime": 0.1599,
94
- "eval_samples_per_second": 6.254,
95
- "eval_steps_per_second": 6.254,
96
- "step": 80
97
- },
98
- {
99
- "epoch": 0.07,
100
- "eval_loss": 0.9114644527435303,
101
- "eval_runtime": 0.1598,
102
- "eval_samples_per_second": 6.259,
103
- "eval_steps_per_second": 6.259,
104
- "step": 90
105
- },
106
- {
107
- "epoch": 0.07,
108
- "learning_rate": 0.00029699999999999996,
109
- "loss": 0.9545,
110
- "step": 100
111
- },
112
- {
113
- "epoch": 0.07,
114
- "eval_loss": 0.8909071087837219,
115
- "eval_runtime": 0.1603,
116
- "eval_samples_per_second": 6.24,
117
- "eval_steps_per_second": 6.24,
118
- "step": 100
119
- },
120
- {
121
- "epoch": 0.08,
122
- "eval_loss": 0.9121037721633911,
123
- "eval_runtime": 0.1598,
124
- "eval_samples_per_second": 6.259,
125
- "eval_steps_per_second": 6.259,
126
- "step": 110
127
- },
128
- {
129
- "epoch": 0.09,
130
- "learning_rate": 0.0002954253611556982,
131
- "loss": 0.8947,
132
- "step": 120
133
- },
134
- {
135
- "epoch": 0.09,
136
- "eval_loss": 0.8340358734130859,
137
- "eval_runtime": 0.1605,
138
- "eval_samples_per_second": 6.229,
139
- "eval_steps_per_second": 6.229,
140
- "step": 120
141
- },
142
- {
143
- "epoch": 0.1,
144
- "eval_loss": 0.8849254846572876,
145
- "eval_runtime": 0.158,
146
- "eval_samples_per_second": 6.33,
147
- "eval_steps_per_second": 6.33,
148
- "step": 130
149
- },
150
- {
151
- "epoch": 0.1,
152
- "learning_rate": 0.0002906099518459069,
153
- "loss": 0.8711,
154
- "step": 140
155
- },
156
- {
157
- "epoch": 0.1,
158
- "eval_loss": 0.8834758996963501,
159
- "eval_runtime": 0.157,
160
- "eval_samples_per_second": 6.368,
161
- "eval_steps_per_second": 6.368,
162
- "step": 140
163
- },
164
- {
165
- "epoch": 0.11,
166
- "eval_loss": 0.8293902277946472,
167
- "eval_runtime": 0.1583,
168
- "eval_samples_per_second": 6.316,
169
- "eval_steps_per_second": 6.316,
170
- "step": 150
171
- },
172
- {
173
- "epoch": 0.12,
174
- "learning_rate": 0.00028579454253611556,
175
- "loss": 0.8688,
176
- "step": 160
177
- },
178
- {
179
- "epoch": 0.12,
180
- "eval_loss": 0.8478549718856812,
181
- "eval_runtime": 0.1581,
182
- "eval_samples_per_second": 6.324,
183
- "eval_steps_per_second": 6.324,
184
- "step": 160
185
- },
186
- {
187
- "epoch": 0.13,
188
- "eval_loss": 0.8488869667053223,
189
- "eval_runtime": 0.159,
190
- "eval_samples_per_second": 6.288,
191
- "eval_steps_per_second": 6.288,
192
- "step": 170
193
- },
194
- {
195
- "epoch": 0.13,
196
- "learning_rate": 0.0002809791332263242,
197
- "loss": 0.8551,
198
- "step": 180
199
- },
200
- {
201
- "epoch": 0.13,
202
- "eval_loss": 0.814437747001648,
203
- "eval_runtime": 0.1575,
204
- "eval_samples_per_second": 6.349,
205
- "eval_steps_per_second": 6.349,
206
- "step": 180
207
- },
208
- {
209
- "epoch": 0.14,
210
- "eval_loss": 0.8222396969795227,
211
- "eval_runtime": 0.1583,
212
- "eval_samples_per_second": 6.317,
213
- "eval_steps_per_second": 6.317,
214
- "step": 190
215
- },
216
- {
217
- "epoch": 0.15,
218
- "learning_rate": 0.0002761637239165329,
219
- "loss": 0.8471,
220
- "step": 200
221
- },
222
- {
223
- "epoch": 0.15,
224
- "eval_loss": 0.8183712959289551,
225
- "eval_runtime": 0.1584,
226
- "eval_samples_per_second": 6.315,
227
- "eval_steps_per_second": 6.315,
228
- "step": 200
229
- },
230
- {
231
- "epoch": 0.16,
232
- "eval_loss": 0.8207969665527344,
233
- "eval_runtime": 0.1584,
234
- "eval_samples_per_second": 6.315,
235
- "eval_steps_per_second": 6.315,
236
- "step": 210
237
- },
238
- {
239
- "epoch": 0.16,
240
- "learning_rate": 0.00027134831460674157,
241
- "loss": 0.8578,
242
- "step": 220
243
- },
244
- {
245
- "epoch": 0.16,
246
- "eval_loss": 0.8103113174438477,
247
- "eval_runtime": 0.1584,
248
- "eval_samples_per_second": 6.315,
249
- "eval_steps_per_second": 6.315,
250
- "step": 220
251
- },
252
- {
253
- "epoch": 0.17,
254
- "eval_loss": 0.808212399482727,
255
- "eval_runtime": 0.1584,
256
- "eval_samples_per_second": 6.311,
257
- "eval_steps_per_second": 6.311,
258
- "step": 230
259
- },
260
- {
261
- "epoch": 0.18,
262
- "learning_rate": 0.00026653290529695024,
263
- "loss": 0.8389,
264
- "step": 240
265
- },
266
- {
267
- "epoch": 0.18,
268
- "eval_loss": 0.7917136549949646,
269
- "eval_runtime": 0.1575,
270
- "eval_samples_per_second": 6.351,
271
- "eval_steps_per_second": 6.351,
272
- "step": 240
273
- },
274
- {
275
- "epoch": 0.19,
276
- "eval_loss": 0.8259432315826416,
277
- "eval_runtime": 0.1603,
278
- "eval_samples_per_second": 6.24,
279
- "eval_steps_per_second": 6.24,
280
- "step": 250
281
- },
282
- {
283
- "epoch": 0.19,
284
- "learning_rate": 0.0002617174959871589,
285
- "loss": 0.8343,
286
- "step": 260
287
- },
288
- {
289
- "epoch": 0.19,
290
- "eval_loss": 0.7860772609710693,
291
- "eval_runtime": 0.1577,
292
- "eval_samples_per_second": 6.34,
293
- "eval_steps_per_second": 6.34,
294
- "step": 260
295
- },
296
- {
297
- "epoch": 0.2,
298
- "eval_loss": 0.8120028972625732,
299
- "eval_runtime": 0.1576,
300
- "eval_samples_per_second": 6.347,
301
- "eval_steps_per_second": 6.347,
302
- "step": 270
303
- },
304
- {
305
- "epoch": 0.21,
306
- "learning_rate": 0.0002569020866773676,
307
- "loss": 0.831,
308
- "step": 280
309
- },
310
- {
311
- "epoch": 0.21,
312
- "eval_loss": 0.7905743718147278,
313
- "eval_runtime": 0.157,
314
- "eval_samples_per_second": 6.368,
315
- "eval_steps_per_second": 6.368,
316
- "step": 280
317
- },
318
- {
319
- "epoch": 0.22,
320
- "eval_loss": 0.7759386301040649,
321
- "eval_runtime": 0.1576,
322
- "eval_samples_per_second": 6.344,
323
- "eval_steps_per_second": 6.344,
324
- "step": 290
325
- },
326
- {
327
- "epoch": 0.22,
328
- "learning_rate": 0.00025208667736757625,
329
- "loss": 0.8325,
330
- "step": 300
331
- },
332
- {
333
- "epoch": 0.22,
334
- "eval_loss": 0.8241894245147705,
335
- "eval_runtime": 0.1572,
336
- "eval_samples_per_second": 6.363,
337
- "eval_steps_per_second": 6.363,
338
- "step": 300
339
- },
340
- {
341
- "epoch": 0.23,
342
- "eval_loss": 0.7761509418487549,
343
- "eval_runtime": 0.1576,
344
- "eval_samples_per_second": 6.346,
345
- "eval_steps_per_second": 6.346,
346
- "step": 310
347
- },
348
- {
349
- "epoch": 0.24,
350
- "learning_rate": 0.0002472712680577849,
351
- "loss": 0.8147,
352
- "step": 320
353
- },
354
- {
355
- "epoch": 0.24,
356
- "eval_loss": 0.7789342403411865,
357
- "eval_runtime": 0.1572,
358
- "eval_samples_per_second": 6.363,
359
- "eval_steps_per_second": 6.363,
360
- "step": 320
361
- },
362
- {
363
- "epoch": 0.25,
364
- "eval_loss": 0.7757179737091064,
365
- "eval_runtime": 0.1578,
366
- "eval_samples_per_second": 6.339,
367
- "eval_steps_per_second": 6.339,
368
- "step": 330
369
- },
370
- {
371
- "epoch": 0.25,
372
- "learning_rate": 0.00024245585874799357,
373
- "loss": 0.8204,
374
- "step": 340
375
- },
376
- {
377
- "epoch": 0.25,
378
- "eval_loss": 0.7954628467559814,
379
- "eval_runtime": 0.1576,
380
- "eval_samples_per_second": 6.344,
381
- "eval_steps_per_second": 6.344,
382
- "step": 340
383
- },
384
- {
385
- "epoch": 0.26,
386
- "eval_loss": 0.7957539558410645,
387
- "eval_runtime": 0.1584,
388
- "eval_samples_per_second": 6.315,
389
- "eval_steps_per_second": 6.315,
390
- "step": 350
391
- },
392
- {
393
- "epoch": 0.27,
394
- "learning_rate": 0.00023764044943820224,
395
- "loss": 0.8232,
396
- "step": 360
397
- },
398
- {
399
- "epoch": 0.27,
400
- "eval_loss": 0.7863476276397705,
401
- "eval_runtime": 0.1571,
402
- "eval_samples_per_second": 6.367,
403
- "eval_steps_per_second": 6.367,
404
- "step": 360
405
- },
406
- {
407
- "epoch": 0.27,
408
- "eval_loss": 0.7858812808990479,
409
- "eval_runtime": 0.1583,
410
- "eval_samples_per_second": 6.317,
411
- "eval_steps_per_second": 6.317,
412
- "step": 370
413
- },
414
- {
415
- "epoch": 0.28,
416
- "learning_rate": 0.00023282504012841088,
417
- "loss": 0.8129,
418
- "step": 380
419
- },
420
- {
421
- "epoch": 0.28,
422
- "eval_loss": 0.785178005695343,
423
- "eval_runtime": 0.1577,
424
- "eval_samples_per_second": 6.342,
425
- "eval_steps_per_second": 6.342,
426
- "step": 380
427
- },
428
- {
429
- "epoch": 0.29,
430
- "eval_loss": 0.8090522289276123,
431
- "eval_runtime": 0.1583,
432
- "eval_samples_per_second": 6.318,
433
- "eval_steps_per_second": 6.318,
434
- "step": 390
435
- },
436
- {
437
- "epoch": 0.3,
438
- "learning_rate": 0.00022800963081861955,
439
- "loss": 0.8193,
440
- "step": 400
441
- },
442
- {
443
- "epoch": 0.3,
444
- "eval_loss": 0.7978605031967163,
445
- "eval_runtime": 0.1573,
446
- "eval_samples_per_second": 6.357,
447
- "eval_steps_per_second": 6.357,
448
- "step": 400
449
- },
450
- {
451
- "epoch": 0.3,
452
- "eval_loss": 0.7799659371376038,
453
- "eval_runtime": 0.1581,
454
- "eval_samples_per_second": 6.327,
455
- "eval_steps_per_second": 6.327,
456
- "step": 410
457
- },
458
- {
459
- "epoch": 0.31,
460
- "learning_rate": 0.00022319422150882823,
461
- "loss": 0.8072,
462
- "step": 420
463
- },
464
- {
465
- "epoch": 0.31,
466
- "eval_loss": 0.7800132036209106,
467
- "eval_runtime": 0.1573,
468
- "eval_samples_per_second": 6.359,
469
- "eval_steps_per_second": 6.359,
470
- "step": 420
471
- },
472
- {
473
- "epoch": 0.32,
474
- "eval_loss": 0.7845722436904907,
475
- "eval_runtime": 0.158,
476
- "eval_samples_per_second": 6.329,
477
- "eval_steps_per_second": 6.329,
478
- "step": 430
479
- },
480
- {
481
- "epoch": 0.33,
482
- "learning_rate": 0.0002183788121990369,
483
- "loss": 0.8152,
484
- "step": 440
485
- },
486
- {
487
- "epoch": 0.33,
488
- "eval_loss": 0.7644073963165283,
489
- "eval_runtime": 0.1571,
490
- "eval_samples_per_second": 6.367,
491
- "eval_steps_per_second": 6.367,
492
- "step": 440
493
- },
494
- {
495
- "epoch": 0.33,
496
- "eval_loss": 0.7579188346862793,
497
- "eval_runtime": 0.1571,
498
- "eval_samples_per_second": 6.367,
499
- "eval_steps_per_second": 6.367,
500
- "step": 450
501
- },
502
- {
503
- "epoch": 0.34,
504
- "learning_rate": 0.00021356340288924557,
505
- "loss": 0.8074,
506
- "step": 460
507
- },
508
- {
509
- "epoch": 0.34,
510
- "eval_loss": 0.7676059007644653,
511
- "eval_runtime": 0.1577,
512
- "eval_samples_per_second": 6.341,
513
- "eval_steps_per_second": 6.341,
514
- "step": 460
515
- },
516
- {
517
- "epoch": 0.35,
518
- "eval_loss": 0.7583163976669312,
519
- "eval_runtime": 0.1573,
520
- "eval_samples_per_second": 6.358,
521
- "eval_steps_per_second": 6.358,
522
- "step": 470
523
- },
524
- {
525
- "epoch": 0.36,
526
- "learning_rate": 0.00020874799357945424,
527
- "loss": 0.8118,
528
- "step": 480
529
- },
530
- {
531
- "epoch": 0.36,
532
- "eval_loss": 0.7768086194992065,
533
- "eval_runtime": 0.1577,
534
- "eval_samples_per_second": 6.341,
535
- "eval_steps_per_second": 6.341,
536
- "step": 480
537
- },
538
- {
539
- "epoch": 0.36,
540
- "eval_loss": 0.7975252270698547,
541
- "eval_runtime": 0.157,
542
- "eval_samples_per_second": 6.37,
543
- "eval_steps_per_second": 6.37,
544
- "step": 490
545
- },
546
- {
547
- "epoch": 0.37,
548
- "learning_rate": 0.0002039325842696629,
549
- "loss": 0.8098,
550
- "step": 500
551
- },
552
- {
553
- "epoch": 0.37,
554
- "eval_loss": 0.8035451173782349,
555
- "eval_runtime": 0.1565,
556
- "eval_samples_per_second": 6.388,
557
- "eval_steps_per_second": 6.388,
558
- "step": 500
559
- },
560
- {
561
- "epoch": 0.38,
562
- "eval_loss": 0.7718653678894043,
563
- "eval_runtime": 0.1575,
564
- "eval_samples_per_second": 6.347,
565
- "eval_steps_per_second": 6.347,
566
- "step": 510
567
- },
568
- {
569
- "epoch": 0.39,
570
- "learning_rate": 0.00019911717495987158,
571
- "loss": 0.8167,
572
- "step": 520
573
- },
574
- {
575
- "epoch": 0.39,
576
- "eval_loss": 0.771480917930603,
577
- "eval_runtime": 0.1568,
578
- "eval_samples_per_second": 6.379,
579
- "eval_steps_per_second": 6.379,
580
- "step": 520
581
- },
582
- {
583
- "epoch": 0.39,
584
- "eval_loss": 0.7757036685943604,
585
- "eval_runtime": 0.1571,
586
- "eval_samples_per_second": 6.364,
587
- "eval_steps_per_second": 6.364,
588
- "step": 530
589
- },
590
- {
591
- "epoch": 0.4,
592
- "learning_rate": 0.00019430176565008025,
593
- "loss": 0.8071,
594
- "step": 540
595
- },
596
- {
597
- "epoch": 0.4,
598
- "eval_loss": 0.8048112392425537,
599
- "eval_runtime": 0.1563,
600
- "eval_samples_per_second": 6.396,
601
- "eval_steps_per_second": 6.396,
602
- "step": 540
603
- },
604
- {
605
- "epoch": 0.41,
606
- "eval_loss": 0.7987676858901978,
607
- "eval_runtime": 0.1562,
608
- "eval_samples_per_second": 6.402,
609
- "eval_steps_per_second": 6.402,
610
- "step": 550
611
- },
612
- {
613
- "epoch": 0.42,
614
- "learning_rate": 0.00018948635634028892,
615
- "loss": 0.8143,
616
- "step": 560
617
- },
618
- {
619
- "epoch": 0.42,
620
- "eval_loss": 0.7960355281829834,
621
- "eval_runtime": 0.159,
622
- "eval_samples_per_second": 6.29,
623
- "eval_steps_per_second": 6.29,
624
- "step": 560
625
- },
626
- {
627
- "epoch": 0.42,
628
- "eval_loss": 0.7673896551132202,
629
- "eval_runtime": 0.1569,
630
- "eval_samples_per_second": 6.372,
631
- "eval_steps_per_second": 6.372,
632
- "step": 570
633
- },
634
- {
635
- "epoch": 0.43,
636
- "learning_rate": 0.0001846709470304976,
637
- "loss": 0.7989,
638
- "step": 580
639
- },
640
- {
641
- "epoch": 0.43,
642
- "eval_loss": 0.7655194997787476,
643
- "eval_runtime": 0.1578,
644
- "eval_samples_per_second": 6.336,
645
- "eval_steps_per_second": 6.336,
646
- "step": 580
647
- },
648
- {
649
- "epoch": 0.44,
650
- "eval_loss": 0.7695807218551636,
651
- "eval_runtime": 0.1571,
652
- "eval_samples_per_second": 6.365,
653
- "eval_steps_per_second": 6.365,
654
- "step": 590
655
- },
656
- {
657
- "epoch": 0.45,
658
- "learning_rate": 0.00017985553772070626,
659
- "loss": 0.8121,
660
- "step": 600
661
- },
662
- {
663
- "epoch": 0.45,
664
- "eval_loss": 0.7599303722381592,
665
- "eval_runtime": 0.1566,
666
- "eval_samples_per_second": 6.385,
667
- "eval_steps_per_second": 6.385,
668
- "step": 600
669
- },
670
- {
671
- "epoch": 0.45,
672
- "eval_loss": 0.7409216165542603,
673
- "eval_runtime": 0.1565,
674
- "eval_samples_per_second": 6.389,
675
- "eval_steps_per_second": 6.389,
676
- "step": 610
677
- },
678
- {
679
- "epoch": 0.46,
680
- "learning_rate": 0.00017504012841091494,
681
- "loss": 0.8105,
682
- "step": 620
683
- },
684
- {
685
- "epoch": 0.46,
686
- "eval_loss": 0.7620519995689392,
687
- "eval_runtime": 0.158,
688
- "eval_samples_per_second": 6.331,
689
- "eval_steps_per_second": 6.331,
690
- "step": 620
691
- },
692
- {
693
- "epoch": 0.47,
694
- "eval_loss": 0.7642089128494263,
695
- "eval_runtime": 0.1573,
696
- "eval_samples_per_second": 6.356,
697
- "eval_steps_per_second": 6.356,
698
- "step": 630
699
- },
700
- {
701
- "epoch": 0.48,
702
- "learning_rate": 0.0001702247191011236,
703
- "loss": 0.8073,
704
- "step": 640
705
- },
706
- {
707
- "epoch": 0.48,
708
- "eval_loss": 0.7464691400527954,
709
- "eval_runtime": 0.156,
710
- "eval_samples_per_second": 6.409,
711
- "eval_steps_per_second": 6.409,
712
- "step": 640
713
- },
714
- {
715
- "epoch": 0.48,
716
- "eval_loss": 0.7520545721054077,
717
- "eval_runtime": 0.158,
718
- "eval_samples_per_second": 6.328,
719
- "eval_steps_per_second": 6.328,
720
- "step": 650
721
- },
722
- {
723
- "epoch": 0.49,
724
- "learning_rate": 0.00016540930979133222,
725
- "loss": 0.8115,
726
- "step": 660
727
- },
728
- {
729
- "epoch": 0.49,
730
- "eval_loss": 0.7851120233535767,
731
- "eval_runtime": 0.1573,
732
- "eval_samples_per_second": 6.358,
733
- "eval_steps_per_second": 6.358,
734
- "step": 660
735
- },
736
- {
737
- "epoch": 0.5,
738
- "eval_loss": 0.7845426797866821,
739
- "eval_runtime": 0.1567,
740
- "eval_samples_per_second": 6.381,
741
- "eval_steps_per_second": 6.381,
742
- "step": 670
743
- },
744
- {
745
- "epoch": 0.5,
746
- "learning_rate": 0.0001605939004815409,
747
- "loss": 0.8213,
748
- "step": 680
749
- },
750
- {
751
- "epoch": 0.5,
752
- "eval_loss": 0.7726190686225891,
753
- "eval_runtime": 0.157,
754
- "eval_samples_per_second": 6.37,
755
- "eval_steps_per_second": 6.37,
756
- "step": 680
757
- },
758
- {
759
- "epoch": 0.51,
760
- "eval_loss": 0.7661857604980469,
761
- "eval_runtime": 0.1571,
762
- "eval_samples_per_second": 6.366,
763
- "eval_steps_per_second": 6.366,
764
- "step": 690
765
- },
766
- {
767
- "epoch": 0.52,
768
- "learning_rate": 0.00015577849117174957,
769
- "loss": 0.8155,
770
- "step": 700
771
- },
772
- {
773
- "epoch": 0.52,
774
- "eval_loss": 0.7846134305000305,
775
- "eval_runtime": 0.1564,
776
- "eval_samples_per_second": 6.394,
777
- "eval_steps_per_second": 6.394,
778
- "step": 700
779
- },
780
- {
781
- "epoch": 0.53,
782
- "eval_loss": 0.7888688445091248,
783
- "eval_runtime": 0.1581,
784
- "eval_samples_per_second": 6.324,
785
- "eval_steps_per_second": 6.324,
786
- "step": 710
787
- },
788
- {
789
- "epoch": 0.53,
790
- "learning_rate": 0.00015096308186195824,
791
- "loss": 0.803,
792
- "step": 720
793
- },
794
- {
795
- "epoch": 0.53,
796
- "eval_loss": 0.7981730103492737,
797
- "eval_runtime": 0.1566,
798
- "eval_samples_per_second": 6.385,
799
- "eval_steps_per_second": 6.385,
800
- "step": 720
801
- },
802
- {
803
- "epoch": 0.54,
804
- "eval_loss": 0.8129211664199829,
805
- "eval_runtime": 0.1562,
806
- "eval_samples_per_second": 6.401,
807
- "eval_steps_per_second": 6.401,
808
- "step": 730
809
- },
810
- {
811
- "epoch": 0.55,
812
- "learning_rate": 0.0001461476725521669,
813
- "loss": 0.8108,
814
- "step": 740
815
- },
816
- {
817
- "epoch": 0.55,
818
- "eval_loss": 0.7717241644859314,
819
- "eval_runtime": 0.1569,
820
- "eval_samples_per_second": 6.375,
821
- "eval_steps_per_second": 6.375,
822
- "step": 740
823
- },
824
- {
825
- "epoch": 0.56,
826
- "eval_loss": 0.7536574602127075,
827
- "eval_runtime": 0.1567,
828
- "eval_samples_per_second": 6.38,
829
- "eval_steps_per_second": 6.38,
830
- "step": 750
831
- },
832
- {
833
- "epoch": 0.56,
834
- "learning_rate": 0.00014133226324237558,
835
- "loss": 0.804,
836
- "step": 760
837
- },
838
- {
839
- "epoch": 0.56,
840
- "eval_loss": 0.7452749013900757,
841
- "eval_runtime": 0.1564,
842
- "eval_samples_per_second": 6.392,
843
- "eval_steps_per_second": 6.392,
844
- "step": 760
845
- },
846
- {
847
- "epoch": 0.57,
848
- "eval_loss": 0.7427541613578796,
849
- "eval_runtime": 0.1573,
850
- "eval_samples_per_second": 6.358,
851
- "eval_steps_per_second": 6.358,
852
- "step": 770
853
- },
854
- {
855
- "epoch": 0.58,
856
- "learning_rate": 0.00013651685393258425,
857
- "loss": 0.7983,
858
- "step": 780
859
- },
860
- {
861
- "epoch": 0.58,
862
- "eval_loss": 0.7469484210014343,
863
- "eval_runtime": 0.1562,
864
- "eval_samples_per_second": 6.404,
865
- "eval_steps_per_second": 6.404,
866
- "step": 780
867
- },
868
- {
869
- "epoch": 0.59,
870
- "eval_loss": 0.7667011022567749,
871
- "eval_runtime": 0.1568,
872
- "eval_samples_per_second": 6.376,
873
- "eval_steps_per_second": 6.376,
874
- "step": 790
875
- },
876
- {
877
- "epoch": 0.59,
878
- "learning_rate": 0.00013170144462279292,
879
- "loss": 0.8054,
880
- "step": 800
881
- },
882
- {
883
- "epoch": 0.59,
884
- "eval_loss": 0.7562040090560913,
885
- "eval_runtime": 0.1651,
886
- "eval_samples_per_second": 6.057,
887
- "eval_steps_per_second": 6.057,
888
- "step": 800
889
- },
890
- {
891
- "epoch": 0.6,
892
- "eval_loss": 0.7466799020767212,
893
- "eval_runtime": 0.1572,
894
- "eval_samples_per_second": 6.359,
895
- "eval_steps_per_second": 6.359,
896
- "step": 810
897
- },
898
- {
899
- "epoch": 0.61,
900
- "learning_rate": 0.0001268860353130016,
901
- "loss": 0.8143,
902
- "step": 820
903
- },
904
- {
905
- "epoch": 0.61,
906
- "eval_loss": 0.7601323127746582,
907
- "eval_runtime": 0.1768,
908
- "eval_samples_per_second": 5.657,
909
- "eval_steps_per_second": 5.657,
910
- "step": 820
911
- },
912
- {
913
- "epoch": 0.62,
914
- "eval_loss": 0.7430717349052429,
915
- "eval_runtime": 0.1575,
916
- "eval_samples_per_second": 6.349,
917
- "eval_steps_per_second": 6.349,
918
- "step": 830
919
- },
920
- {
921
- "epoch": 0.62,
922
- "learning_rate": 0.00012207062600321026,
923
- "loss": 0.8038,
924
- "step": 840
925
- },
926
- {
927
- "epoch": 0.62,
928
- "eval_loss": 0.7633467316627502,
929
- "eval_runtime": 0.1577,
930
- "eval_samples_per_second": 6.342,
931
- "eval_steps_per_second": 6.342,
932
- "step": 840
933
- },
934
- {
935
- "epoch": 0.63,
936
- "eval_loss": 0.7451119422912598,
937
- "eval_runtime": 0.1575,
938
- "eval_samples_per_second": 6.349,
939
- "eval_steps_per_second": 6.349,
940
- "step": 850
941
- },
942
- {
943
- "epoch": 0.64,
944
- "learning_rate": 0.00011725521669341892,
945
- "loss": 0.8053,
946
- "step": 860
947
- },
948
- {
949
- "epoch": 0.64,
950
- "eval_loss": 0.7628670930862427,
951
- "eval_runtime": 0.1578,
952
- "eval_samples_per_second": 6.338,
953
- "eval_steps_per_second": 6.338,
954
- "step": 860
955
- },
956
- {
957
- "epoch": 0.65,
958
- "eval_loss": 0.7490127086639404,
959
- "eval_runtime": 0.1571,
960
- "eval_samples_per_second": 6.364,
961
- "eval_steps_per_second": 6.364,
962
- "step": 870
963
- },
964
- {
965
- "epoch": 0.65,
966
- "learning_rate": 0.00011243980738362759,
967
- "loss": 0.8007,
968
- "step": 880
969
- },
970
- {
971
- "epoch": 0.65,
972
- "eval_loss": 0.767558753490448,
973
- "eval_runtime": 0.1564,
974
- "eval_samples_per_second": 6.392,
975
- "eval_steps_per_second": 6.392,
976
- "step": 880
977
- },
978
- {
979
- "epoch": 0.66,
980
- "eval_loss": 0.7646104097366333,
981
- "eval_runtime": 0.1571,
982
- "eval_samples_per_second": 6.365,
983
- "eval_steps_per_second": 6.365,
984
- "step": 890
985
- },
986
- {
987
- "epoch": 0.67,
988
- "learning_rate": 0.00010762439807383626,
989
- "loss": 0.7999,
990
- "step": 900
991
- },
992
- {
993
- "epoch": 0.67,
994
- "eval_loss": 0.7580606937408447,
995
- "eval_runtime": 0.1559,
996
- "eval_samples_per_second": 6.414,
997
- "eval_steps_per_second": 6.414,
998
- "step": 900
999
- },
1000
- {
1001
- "epoch": 0.68,
1002
- "eval_loss": 0.7789003849029541,
1003
- "eval_runtime": 0.1568,
1004
- "eval_samples_per_second": 6.377,
1005
- "eval_steps_per_second": 6.377,
1006
- "step": 910
1007
- },
1008
- {
1009
- "epoch": 0.68,
1010
- "learning_rate": 0.00010280898876404493,
1011
- "loss": 0.8007,
1012
- "step": 920
1013
- },
1014
- {
1015
- "epoch": 0.68,
1016
- "eval_loss": 0.7511205077171326,
1017
- "eval_runtime": 0.1571,
1018
- "eval_samples_per_second": 6.365,
1019
- "eval_steps_per_second": 6.365,
1020
- "step": 920
1021
- },
1022
- {
1023
- "epoch": 0.69,
1024
- "eval_loss": 0.7637555599212646,
1025
- "eval_runtime": 0.1567,
1026
- "eval_samples_per_second": 6.38,
1027
- "eval_steps_per_second": 6.38,
1028
- "step": 930
1029
- },
1030
- {
1031
- "epoch": 0.7,
1032
- "learning_rate": 9.79935794542536e-05,
1033
- "loss": 0.8006,
1034
- "step": 940
1035
- },
1036
- {
1037
- "epoch": 0.7,
1038
- "eval_loss": 0.7561322450637817,
1039
- "eval_runtime": 0.1582,
1040
- "eval_samples_per_second": 6.322,
1041
- "eval_steps_per_second": 6.322,
1042
- "step": 940
1043
- },
1044
- {
1045
- "epoch": 0.71,
1046
- "eval_loss": 0.753495454788208,
1047
- "eval_runtime": 0.1564,
1048
- "eval_samples_per_second": 6.395,
1049
- "eval_steps_per_second": 6.395,
1050
- "step": 950
1051
- },
1052
- {
1053
- "epoch": 0.71,
1054
- "learning_rate": 9.317817014446228e-05,
1055
- "loss": 0.8055,
1056
- "step": 960
1057
- },
1058
- {
1059
- "epoch": 0.71,
1060
- "eval_loss": 0.7372075319290161,
1061
- "eval_runtime": 0.1564,
1062
- "eval_samples_per_second": 6.396,
1063
- "eval_steps_per_second": 6.396,
1064
- "step": 960
1065
- },
1066
- {
1067
- "epoch": 0.72,
1068
- "eval_loss": 0.761570394039154,
1069
- "eval_runtime": 0.158,
1070
- "eval_samples_per_second": 6.329,
1071
- "eval_steps_per_second": 6.329,
1072
- "step": 970
1073
- },
1074
- {
1075
- "epoch": 0.73,
1076
- "learning_rate": 8.836276083467093e-05,
1077
- "loss": 0.7951,
1078
- "step": 980
1079
- },
1080
- {
1081
- "epoch": 0.73,
1082
- "eval_loss": 0.7430101037025452,
1083
- "eval_runtime": 0.1566,
1084
- "eval_samples_per_second": 6.386,
1085
- "eval_steps_per_second": 6.386,
1086
- "step": 980
1087
- },
1088
- {
1089
- "epoch": 0.74,
1090
- "eval_loss": 0.7443385124206543,
1091
- "eval_runtime": 0.1567,
1092
- "eval_samples_per_second": 6.381,
1093
- "eval_steps_per_second": 6.381,
1094
- "step": 990
1095
- },
1096
- {
1097
- "epoch": 0.74,
1098
- "learning_rate": 8.35473515248796e-05,
1099
- "loss": 0.8049,
1100
- "step": 1000
1101
- },
1102
- {
1103
- "epoch": 0.74,
1104
- "eval_loss": 0.7466243505477905,
1105
- "eval_runtime": 0.1568,
1106
- "eval_samples_per_second": 6.376,
1107
- "eval_steps_per_second": 6.376,
1108
- "step": 1000
1109
- },
1110
- {
1111
- "epoch": 0.75,
1112
- "eval_loss": 0.7397581934928894,
1113
- "eval_runtime": 0.1575,
1114
- "eval_samples_per_second": 6.347,
1115
- "eval_steps_per_second": 6.347,
1116
- "step": 1010
1117
- },
1118
- {
1119
- "epoch": 0.76,
1120
- "learning_rate": 7.873194221508827e-05,
1121
- "loss": 0.8131,
1122
- "step": 1020
1123
- },
1124
- {
1125
- "epoch": 0.76,
1126
- "eval_loss": 0.7271538972854614,
1127
- "eval_runtime": 0.1575,
1128
- "eval_samples_per_second": 6.35,
1129
- "eval_steps_per_second": 6.35,
1130
- "step": 1020
1131
- },
1132
- {
1133
- "epoch": 0.76,
1134
- "eval_loss": 0.7354034781455994,
1135
- "eval_runtime": 0.1578,
1136
- "eval_samples_per_second": 6.337,
1137
- "eval_steps_per_second": 6.337,
1138
- "step": 1030
1139
- },
1140
- {
1141
- "epoch": 0.77,
1142
- "learning_rate": 7.391653290529695e-05,
1143
- "loss": 0.7991,
1144
- "step": 1040
1145
- },
1146
- {
1147
- "epoch": 0.77,
1148
- "eval_loss": 0.7297641038894653,
1149
- "eval_runtime": 0.1764,
1150
- "eval_samples_per_second": 5.67,
1151
- "eval_steps_per_second": 5.67,
1152
- "step": 1040
1153
- },
1154
- {
1155
- "epoch": 0.78,
1156
- "eval_loss": 0.7396876811981201,
1157
- "eval_runtime": 0.158,
1158
- "eval_samples_per_second": 6.327,
1159
- "eval_steps_per_second": 6.327,
1160
- "step": 1050
1161
- },
1162
- {
1163
- "epoch": 0.79,
1164
- "learning_rate": 6.910112359550562e-05,
1165
- "loss": 0.8075,
1166
- "step": 1060
1167
- },
1168
- {
1169
- "epoch": 0.79,
1170
- "eval_loss": 0.7296563386917114,
1171
- "eval_runtime": 0.1577,
1172
- "eval_samples_per_second": 6.343,
1173
- "eval_steps_per_second": 6.343,
1174
- "step": 1060
1175
- },
1176
- {
1177
- "epoch": 0.79,
1178
- "eval_loss": 0.7264513969421387,
1179
- "eval_runtime": 0.1571,
1180
- "eval_samples_per_second": 6.366,
1181
- "eval_steps_per_second": 6.366,
1182
- "step": 1070
1183
- },
1184
- {
1185
- "epoch": 0.8,
1186
- "learning_rate": 6.428571428571427e-05,
1187
- "loss": 0.8075,
1188
- "step": 1080
1189
- },
1190
- {
1191
- "epoch": 0.8,
1192
- "eval_loss": 0.7289776802062988,
1193
- "eval_runtime": 0.1571,
1194
- "eval_samples_per_second": 6.367,
1195
- "eval_steps_per_second": 6.367,
1196
- "step": 1080
1197
- },
1198
- {
1199
- "epoch": 0.81,
1200
- "eval_loss": 0.7540558576583862,
1201
- "eval_runtime": 0.1571,
1202
- "eval_samples_per_second": 6.365,
1203
- "eval_steps_per_second": 6.365,
1204
- "step": 1090
1205
- },
1206
- {
1207
- "epoch": 0.82,
1208
- "learning_rate": 5.9470304975922945e-05,
1209
- "loss": 0.7916,
1210
- "step": 1100
1211
- },
1212
- {
1213
- "epoch": 0.82,
1214
- "eval_loss": 0.7357069253921509,
1215
- "eval_runtime": 0.1564,
1216
- "eval_samples_per_second": 6.393,
1217
- "eval_steps_per_second": 6.393,
1218
- "step": 1100
1219
- },
1220
- {
1221
- "epoch": 0.82,
1222
- "eval_loss": 0.7496874928474426,
1223
- "eval_runtime": 0.1575,
1224
- "eval_samples_per_second": 6.351,
1225
- "eval_steps_per_second": 6.351,
1226
- "step": 1110
1227
- },
1228
- {
1229
- "epoch": 0.83,
1230
- "learning_rate": 5.4654895666131616e-05,
1231
- "loss": 0.7899,
1232
- "step": 1120
1233
- },
1234
- {
1235
- "epoch": 0.83,
1236
- "eval_loss": 0.7296478748321533,
1237
- "eval_runtime": 0.1573,
1238
- "eval_samples_per_second": 6.357,
1239
- "eval_steps_per_second": 6.357,
1240
- "step": 1120
1241
- },
1242
- {
1243
- "epoch": 0.84,
1244
- "eval_loss": 0.7285305261611938,
1245
- "eval_runtime": 0.1576,
1246
- "eval_samples_per_second": 6.346,
1247
- "eval_steps_per_second": 6.346,
1248
- "step": 1130
1249
- },
1250
- {
1251
- "epoch": 0.85,
1252
- "learning_rate": 4.983948635634029e-05,
1253
- "loss": 0.7958,
1254
- "step": 1140
1255
- },
1256
- {
1257
- "epoch": 0.85,
1258
- "eval_loss": 0.7548575401306152,
1259
- "eval_runtime": 0.157,
1260
- "eval_samples_per_second": 6.368,
1261
- "eval_steps_per_second": 6.368,
1262
- "step": 1140
1263
- },
1264
- {
1265
- "epoch": 0.85,
1266
- "eval_loss": 0.75063556432724,
1267
- "eval_runtime": 0.1577,
1268
- "eval_samples_per_second": 6.343,
1269
- "eval_steps_per_second": 6.343,
1270
- "step": 1150
1271
- },
1272
- {
1273
- "epoch": 0.86,
1274
- "learning_rate": 4.502407704654895e-05,
1275
- "loss": 0.8004,
1276
- "step": 1160
1277
- },
1278
- {
1279
- "epoch": 0.86,
1280
- "eval_loss": 0.7478867769241333,
1281
- "eval_runtime": 0.1577,
1282
- "eval_samples_per_second": 6.341,
1283
- "eval_steps_per_second": 6.341,
1284
- "step": 1160
1285
- },
1286
- {
1287
- "epoch": 0.87,
1288
- "eval_loss": 0.7417320609092712,
1289
- "eval_runtime": 0.1572,
1290
- "eval_samples_per_second": 6.361,
1291
- "eval_steps_per_second": 6.361,
1292
- "step": 1170
1293
- },
1294
- {
1295
- "epoch": 0.88,
1296
- "learning_rate": 4.020866773675762e-05,
1297
- "loss": 0.7927,
1298
- "step": 1180
1299
- },
1300
- {
1301
- "epoch": 0.88,
1302
- "eval_loss": 0.7448097467422485,
1303
- "eval_runtime": 0.1578,
1304
- "eval_samples_per_second": 6.337,
1305
- "eval_steps_per_second": 6.337,
1306
- "step": 1180
1307
- },
1308
- {
1309
- "epoch": 0.88,
1310
- "eval_loss": 0.7570788860321045,
1311
- "eval_runtime": 0.1576,
1312
- "eval_samples_per_second": 6.346,
1313
- "eval_steps_per_second": 6.346,
1314
- "step": 1190
1315
- },
1316
- {
1317
- "epoch": 0.89,
1318
- "learning_rate": 3.539325842696629e-05,
1319
- "loss": 0.8013,
1320
- "step": 1200
1321
- },
1322
- {
1323
- "epoch": 0.89,
1324
- "eval_loss": 0.7594838738441467,
1325
- "eval_runtime": 0.1581,
1326
- "eval_samples_per_second": 6.324,
1327
- "eval_steps_per_second": 6.324,
1328
- "step": 1200
1329
- },
1330
- {
1331
- "epoch": 0.9,
1332
- "eval_loss": 0.7382398843765259,
1333
- "eval_runtime": 0.157,
1334
- "eval_samples_per_second": 6.369,
1335
- "eval_steps_per_second": 6.369,
1336
- "step": 1210
1337
- },
1338
- {
1339
- "epoch": 0.91,
1340
- "learning_rate": 3.057784911717496e-05,
1341
- "loss": 0.8002,
1342
- "step": 1220
1343
- },
1344
- {
1345
- "epoch": 0.91,
1346
- "eval_loss": 0.7314504384994507,
1347
- "eval_runtime": 0.1571,
1348
- "eval_samples_per_second": 6.366,
1349
- "eval_steps_per_second": 6.366,
1350
- "step": 1220
1351
- },
1352
- {
1353
- "epoch": 0.91,
1354
- "eval_loss": 0.780935525894165,
1355
- "eval_runtime": 0.1575,
1356
- "eval_samples_per_second": 6.348,
1357
- "eval_steps_per_second": 6.348,
1358
- "step": 1230
1359
- },
1360
- {
1361
- "epoch": 0.92,
1362
- "learning_rate": 2.5762439807383626e-05,
1363
- "loss": 0.7914,
1364
- "step": 1240
1365
- },
1366
- {
1367
- "epoch": 0.92,
1368
- "eval_loss": 0.7425116300582886,
1369
- "eval_runtime": 0.1574,
1370
- "eval_samples_per_second": 6.352,
1371
- "eval_steps_per_second": 6.352,
1372
- "step": 1240
1373
- },
1374
- {
1375
- "epoch": 0.93,
1376
- "eval_loss": 0.7363260984420776,
1377
- "eval_runtime": 0.1564,
1378
- "eval_samples_per_second": 6.394,
1379
- "eval_steps_per_second": 6.394,
1380
- "step": 1250
1381
- },
1382
- {
1383
- "epoch": 0.94,
1384
- "learning_rate": 2.0947030497592293e-05,
1385
- "loss": 0.7911,
1386
- "step": 1260
1387
- },
1388
- {
1389
- "epoch": 0.94,
1390
- "eval_loss": 0.7363616228103638,
1391
- "eval_runtime": 0.1566,
1392
- "eval_samples_per_second": 6.385,
1393
- "eval_steps_per_second": 6.385,
1394
- "step": 1260
1395
- },
1396
- {
1397
- "epoch": 0.94,
1398
- "eval_loss": 0.7710204124450684,
1399
- "eval_runtime": 0.1575,
1400
- "eval_samples_per_second": 6.349,
1401
- "eval_steps_per_second": 6.349,
1402
- "step": 1270
1403
- },
1404
- {
1405
- "epoch": 0.95,
1406
- "learning_rate": 1.613162118780096e-05,
1407
- "loss": 0.7973,
1408
- "step": 1280
1409
- },
1410
- {
1411
- "epoch": 0.95,
1412
- "eval_loss": 0.7362488508224487,
1413
- "eval_runtime": 0.1575,
1414
- "eval_samples_per_second": 6.349,
1415
- "eval_steps_per_second": 6.349,
1416
- "step": 1280
1417
- },
1418
- {
1419
- "epoch": 0.96,
1420
- "eval_loss": 0.7484357357025146,
1421
- "eval_runtime": 0.1581,
1422
- "eval_samples_per_second": 6.326,
1423
- "eval_steps_per_second": 6.326,
1424
- "step": 1290
1425
- },
1426
- {
1427
- "epoch": 0.97,
1428
- "learning_rate": 1.1316211878009629e-05,
1429
- "loss": 0.8008,
1430
- "step": 1300
1431
- },
1432
- {
1433
- "epoch": 0.97,
1434
- "eval_loss": 0.7393674254417419,
1435
- "eval_runtime": 0.1578,
1436
- "eval_samples_per_second": 6.338,
1437
- "eval_steps_per_second": 6.338,
1438
- "step": 1300
1439
- },
1440
- {
1441
- "epoch": 0.97,
1442
- "eval_loss": 0.738249659538269,
1443
- "eval_runtime": 0.1576,
1444
- "eval_samples_per_second": 6.344,
1445
- "eval_steps_per_second": 6.344,
1446
- "step": 1310
1447
- },
1448
- {
1449
- "epoch": 0.98,
1450
- "learning_rate": 6.500802568218298e-06,
1451
- "loss": 0.798,
1452
- "step": 1320
1453
- },
1454
- {
1455
- "epoch": 0.98,
1456
- "eval_loss": 0.7329107522964478,
1457
- "eval_runtime": 0.1575,
1458
- "eval_samples_per_second": 6.351,
1459
- "eval_steps_per_second": 6.351,
1460
- "step": 1320
1461
- },
1462
- {
1463
- "epoch": 0.99,
1464
- "eval_loss": 0.7391780614852905,
1465
- "eval_runtime": 0.1587,
1466
- "eval_samples_per_second": 6.302,
1467
- "eval_steps_per_second": 6.302,
1468
- "step": 1330
1469
- }
1470
- ],
1471
- "max_steps": 1346,
1472
- "num_train_epochs": 1,
1473
- "total_flos": 1.7288420590092288e+18,
1474
- "trial_name": null,
1475
- "trial_params": null
1476
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama/use-seq/checkpoint-1340/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "",
4
- "lstrip": false,
5
- "normalized": true,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "",
11
- "lstrip": false,
12
- "normalized": true,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": "<unk>",
17
- "unk_token": {
18
- "content": "",
19
- "lstrip": false,
20
- "normalized": true,
21
- "rstrip": false,
22
- "single_word": false
23
- }
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama/use-seq/checkpoint-1340/tokenizer_config.json DELETED
@@ -1,34 +0,0 @@
1
- {
2
- "add_bos_token": true,
3
- "add_eos_token": true,
4
- "bos_token": {
5
- "__type": "AddedToken",
6
- "content": "",
7
- "lstrip": false,
8
- "normalized": true,
9
- "rstrip": false,
10
- "single_word": false
11
- },
12
- "clean_up_tokenization_spaces": false,
13
- "eos_token": {
14
- "__type": "AddedToken",
15
- "content": "",
16
- "lstrip": false,
17
- "normalized": true,
18
- "rstrip": false,
19
- "single_word": false
20
- },
21
- "model_max_length": 1000000000000000019884624838656,
22
- "pad_token": null,
23
- "sp_model_kwargs": {},
24
- "special_tokens_map_file": "/home/chiayi/.cache/huggingface/hub/models--decapoda-research--llama-7b-hf/snapshots/5f98eefcc80e437ef68d457ad7bf167c2c6a1348/special_tokens_map.json",
25
- "tokenizer_class": "LlamaTokenizer",
26
- "unk_token": {
27
- "__type": "AddedToken",
28
- "content": "",
29
- "lstrip": false,
30
- "normalized": true,
31
- "rstrip": false,
32
- "single_word": false
33
- }
34
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama/use-seq/checkpoint-1340/trainer_state.json DELETED
@@ -1,1490 +0,0 @@
1
- {
2
- "best_metric": 0.7264513969421387,
3
- "best_model_checkpoint": "lora-alpaca-use-seq/checkpoint-1070",
4
- "epoch": 0.9949186756073227,
5
- "global_step": 1340,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.01,
12
- "eval_loss": 2.4221816062927246,
13
- "eval_runtime": 0.1575,
14
- "eval_samples_per_second": 6.348,
15
- "eval_steps_per_second": 6.348,
16
- "step": 10
17
- },
18
- {
19
- "epoch": 0.01,
20
- "learning_rate": 5.9999999999999995e-05,
21
- "loss": 2.4934,
22
- "step": 20
23
- },
24
- {
25
- "epoch": 0.01,
26
- "eval_loss": 2.3228158950805664,
27
- "eval_runtime": 0.1587,
28
- "eval_samples_per_second": 6.302,
29
- "eval_steps_per_second": 6.302,
30
- "step": 20
31
- },
32
- {
33
- "epoch": 0.02,
34
- "eval_loss": 1.8552848100662231,
35
- "eval_runtime": 0.1566,
36
- "eval_samples_per_second": 6.386,
37
- "eval_steps_per_second": 6.386,
38
- "step": 30
39
- },
40
- {
41
- "epoch": 0.03,
42
- "learning_rate": 0.000117,
43
- "loss": 1.9804,
44
- "step": 40
45
- },
46
- {
47
- "epoch": 0.03,
48
- "eval_loss": 1.2810053825378418,
49
- "eval_runtime": 0.1559,
50
- "eval_samples_per_second": 6.416,
51
- "eval_steps_per_second": 6.416,
52
- "step": 40
53
- },
54
- {
55
- "epoch": 0.04,
56
- "eval_loss": 0.9789324998855591,
57
- "eval_runtime": 0.1567,
58
- "eval_samples_per_second": 6.382,
59
- "eval_steps_per_second": 6.382,
60
- "step": 50
61
- },
62
- {
63
- "epoch": 0.04,
64
- "learning_rate": 0.00017699999999999997,
65
- "loss": 1.165,
66
- "step": 60
67
- },
68
- {
69
- "epoch": 0.04,
70
- "eval_loss": 0.9216629266738892,
71
- "eval_runtime": 0.1577,
72
- "eval_samples_per_second": 6.343,
73
- "eval_steps_per_second": 6.343,
74
- "step": 60
75
- },
76
- {
77
- "epoch": 0.05,
78
- "eval_loss": 0.8872115612030029,
79
- "eval_runtime": 0.1579,
80
- "eval_samples_per_second": 6.333,
81
- "eval_steps_per_second": 6.333,
82
- "step": 70
83
- },
84
- {
85
- "epoch": 0.06,
86
- "learning_rate": 0.000237,
87
- "loss": 0.9931,
88
- "step": 80
89
- },
90
- {
91
- "epoch": 0.06,
92
- "eval_loss": 0.8584132194519043,
93
- "eval_runtime": 0.1599,
94
- "eval_samples_per_second": 6.254,
95
- "eval_steps_per_second": 6.254,
96
- "step": 80
97
- },
98
- {
99
- "epoch": 0.07,
100
- "eval_loss": 0.9114644527435303,
101
- "eval_runtime": 0.1598,
102
- "eval_samples_per_second": 6.259,
103
- "eval_steps_per_second": 6.259,
104
- "step": 90
105
- },
106
- {
107
- "epoch": 0.07,
108
- "learning_rate": 0.00029699999999999996,
109
- "loss": 0.9545,
110
- "step": 100
111
- },
112
- {
113
- "epoch": 0.07,
114
- "eval_loss": 0.8909071087837219,
115
- "eval_runtime": 0.1603,
116
- "eval_samples_per_second": 6.24,
117
- "eval_steps_per_second": 6.24,
118
- "step": 100
119
- },
120
- {
121
- "epoch": 0.08,
122
- "eval_loss": 0.9121037721633911,
123
- "eval_runtime": 0.1598,
124
- "eval_samples_per_second": 6.259,
125
- "eval_steps_per_second": 6.259,
126
- "step": 110
127
- },
128
- {
129
- "epoch": 0.09,
130
- "learning_rate": 0.0002954253611556982,
131
- "loss": 0.8947,
132
- "step": 120
133
- },
134
- {
135
- "epoch": 0.09,
136
- "eval_loss": 0.8340358734130859,
137
- "eval_runtime": 0.1605,
138
- "eval_samples_per_second": 6.229,
139
- "eval_steps_per_second": 6.229,
140
- "step": 120
141
- },
142
- {
143
- "epoch": 0.1,
144
- "eval_loss": 0.8849254846572876,
145
- "eval_runtime": 0.158,
146
- "eval_samples_per_second": 6.33,
147
- "eval_steps_per_second": 6.33,
148
- "step": 130
149
- },
150
- {
151
- "epoch": 0.1,
152
- "learning_rate": 0.0002906099518459069,
153
- "loss": 0.8711,
154
- "step": 140
155
- },
156
- {
157
- "epoch": 0.1,
158
- "eval_loss": 0.8834758996963501,
159
- "eval_runtime": 0.157,
160
- "eval_samples_per_second": 6.368,
161
- "eval_steps_per_second": 6.368,
162
- "step": 140
163
- },
164
- {
165
- "epoch": 0.11,
166
- "eval_loss": 0.8293902277946472,
167
- "eval_runtime": 0.1583,
168
- "eval_samples_per_second": 6.316,
169
- "eval_steps_per_second": 6.316,
170
- "step": 150
171
- },
172
- {
173
- "epoch": 0.12,
174
- "learning_rate": 0.00028579454253611556,
175
- "loss": 0.8688,
176
- "step": 160
177
- },
178
- {
179
- "epoch": 0.12,
180
- "eval_loss": 0.8478549718856812,
181
- "eval_runtime": 0.1581,
182
- "eval_samples_per_second": 6.324,
183
- "eval_steps_per_second": 6.324,
184
- "step": 160
185
- },
186
- {
187
- "epoch": 0.13,
188
- "eval_loss": 0.8488869667053223,
189
- "eval_runtime": 0.159,
190
- "eval_samples_per_second": 6.288,
191
- "eval_steps_per_second": 6.288,
192
- "step": 170
193
- },
194
- {
195
- "epoch": 0.13,
196
- "learning_rate": 0.0002809791332263242,
197
- "loss": 0.8551,
198
- "step": 180
199
- },
200
- {
201
- "epoch": 0.13,
202
- "eval_loss": 0.814437747001648,
203
- "eval_runtime": 0.1575,
204
- "eval_samples_per_second": 6.349,
205
- "eval_steps_per_second": 6.349,
206
- "step": 180
207
- },
208
- {
209
- "epoch": 0.14,
210
- "eval_loss": 0.8222396969795227,
211
- "eval_runtime": 0.1583,
212
- "eval_samples_per_second": 6.317,
213
- "eval_steps_per_second": 6.317,
214
- "step": 190
215
- },
216
- {
217
- "epoch": 0.15,
218
- "learning_rate": 0.0002761637239165329,
219
- "loss": 0.8471,
220
- "step": 200
221
- },
222
- {
223
- "epoch": 0.15,
224
- "eval_loss": 0.8183712959289551,
225
- "eval_runtime": 0.1584,
226
- "eval_samples_per_second": 6.315,
227
- "eval_steps_per_second": 6.315,
228
- "step": 200
229
- },
230
- {
231
- "epoch": 0.16,
232
- "eval_loss": 0.8207969665527344,
233
- "eval_runtime": 0.1584,
234
- "eval_samples_per_second": 6.315,
235
- "eval_steps_per_second": 6.315,
236
- "step": 210
237
- },
238
- {
239
- "epoch": 0.16,
240
- "learning_rate": 0.00027134831460674157,
241
- "loss": 0.8578,
242
- "step": 220
243
- },
244
- {
245
- "epoch": 0.16,
246
- "eval_loss": 0.8103113174438477,
247
- "eval_runtime": 0.1584,
248
- "eval_samples_per_second": 6.315,
249
- "eval_steps_per_second": 6.315,
250
- "step": 220
251
- },
252
- {
253
- "epoch": 0.17,
254
- "eval_loss": 0.808212399482727,
255
- "eval_runtime": 0.1584,
256
- "eval_samples_per_second": 6.311,
257
- "eval_steps_per_second": 6.311,
258
- "step": 230
259
- },
260
- {
261
- "epoch": 0.18,
262
- "learning_rate": 0.00026653290529695024,
263
- "loss": 0.8389,
264
- "step": 240
265
- },
266
- {
267
- "epoch": 0.18,
268
- "eval_loss": 0.7917136549949646,
269
- "eval_runtime": 0.1575,
270
- "eval_samples_per_second": 6.351,
271
- "eval_steps_per_second": 6.351,
272
- "step": 240
273
- },
274
- {
275
- "epoch": 0.19,
276
- "eval_loss": 0.8259432315826416,
277
- "eval_runtime": 0.1603,
278
- "eval_samples_per_second": 6.24,
279
- "eval_steps_per_second": 6.24,
280
- "step": 250
281
- },
282
- {
283
- "epoch": 0.19,
284
- "learning_rate": 0.0002617174959871589,
285
- "loss": 0.8343,
286
- "step": 260
287
- },
288
- {
289
- "epoch": 0.19,
290
- "eval_loss": 0.7860772609710693,
291
- "eval_runtime": 0.1577,
292
- "eval_samples_per_second": 6.34,
293
- "eval_steps_per_second": 6.34,
294
- "step": 260
295
- },
296
- {
297
- "epoch": 0.2,
298
- "eval_loss": 0.8120028972625732,
299
- "eval_runtime": 0.1576,
300
- "eval_samples_per_second": 6.347,
301
- "eval_steps_per_second": 6.347,
302
- "step": 270
303
- },
304
- {
305
- "epoch": 0.21,
306
- "learning_rate": 0.0002569020866773676,
307
- "loss": 0.831,
308
- "step": 280
309
- },
310
- {
311
- "epoch": 0.21,
312
- "eval_loss": 0.7905743718147278,
313
- "eval_runtime": 0.157,
314
- "eval_samples_per_second": 6.368,
315
- "eval_steps_per_second": 6.368,
316
- "step": 280
317
- },
318
- {
319
- "epoch": 0.22,
320
- "eval_loss": 0.7759386301040649,
321
- "eval_runtime": 0.1576,
322
- "eval_samples_per_second": 6.344,
323
- "eval_steps_per_second": 6.344,
324
- "step": 290
325
- },
326
- {
327
- "epoch": 0.22,
328
- "learning_rate": 0.00025208667736757625,
329
- "loss": 0.8325,
330
- "step": 300
331
- },
332
- {
333
- "epoch": 0.22,
334
- "eval_loss": 0.8241894245147705,
335
- "eval_runtime": 0.1572,
336
- "eval_samples_per_second": 6.363,
337
- "eval_steps_per_second": 6.363,
338
- "step": 300
339
- },
340
- {
341
- "epoch": 0.23,
342
- "eval_loss": 0.7761509418487549,
343
- "eval_runtime": 0.1576,
344
- "eval_samples_per_second": 6.346,
345
- "eval_steps_per_second": 6.346,
346
- "step": 310
347
- },
348
- {
349
- "epoch": 0.24,
350
- "learning_rate": 0.0002472712680577849,
351
- "loss": 0.8147,
352
- "step": 320
353
- },
354
- {
355
- "epoch": 0.24,
356
- "eval_loss": 0.7789342403411865,
357
- "eval_runtime": 0.1572,
358
- "eval_samples_per_second": 6.363,
359
- "eval_steps_per_second": 6.363,
360
- "step": 320
361
- },
362
- {
363
- "epoch": 0.25,
364
- "eval_loss": 0.7757179737091064,
365
- "eval_runtime": 0.1578,
366
- "eval_samples_per_second": 6.339,
367
- "eval_steps_per_second": 6.339,
368
- "step": 330
369
- },
370
- {
371
- "epoch": 0.25,
372
- "learning_rate": 0.00024245585874799357,
373
- "loss": 0.8204,
374
- "step": 340
375
- },
376
- {
377
- "epoch": 0.25,
378
- "eval_loss": 0.7954628467559814,
379
- "eval_runtime": 0.1576,
380
- "eval_samples_per_second": 6.344,
381
- "eval_steps_per_second": 6.344,
382
- "step": 340
383
- },
384
- {
385
- "epoch": 0.26,
386
- "eval_loss": 0.7957539558410645,
387
- "eval_runtime": 0.1584,
388
- "eval_samples_per_second": 6.315,
389
- "eval_steps_per_second": 6.315,
390
- "step": 350
391
- },
392
- {
393
- "epoch": 0.27,
394
- "learning_rate": 0.00023764044943820224,
395
- "loss": 0.8232,
396
- "step": 360
397
- },
398
- {
399
- "epoch": 0.27,
400
- "eval_loss": 0.7863476276397705,
401
- "eval_runtime": 0.1571,
402
- "eval_samples_per_second": 6.367,
403
- "eval_steps_per_second": 6.367,
404
- "step": 360
405
- },
406
- {
407
- "epoch": 0.27,
408
- "eval_loss": 0.7858812808990479,
409
- "eval_runtime": 0.1583,
410
- "eval_samples_per_second": 6.317,
411
- "eval_steps_per_second": 6.317,
412
- "step": 370
413
- },
414
- {
415
- "epoch": 0.28,
416
- "learning_rate": 0.00023282504012841088,
417
- "loss": 0.8129,
418
- "step": 380
419
- },
420
- {
421
- "epoch": 0.28,
422
- "eval_loss": 0.785178005695343,
423
- "eval_runtime": 0.1577,
424
- "eval_samples_per_second": 6.342,
425
- "eval_steps_per_second": 6.342,
426
- "step": 380
427
- },
428
- {
429
- "epoch": 0.29,
430
- "eval_loss": 0.8090522289276123,
431
- "eval_runtime": 0.1583,
432
- "eval_samples_per_second": 6.318,
433
- "eval_steps_per_second": 6.318,
434
- "step": 390
435
- },
436
- {
437
- "epoch": 0.3,
438
- "learning_rate": 0.00022800963081861955,
439
- "loss": 0.8193,
440
- "step": 400
441
- },
442
- {
443
- "epoch": 0.3,
444
- "eval_loss": 0.7978605031967163,
445
- "eval_runtime": 0.1573,
446
- "eval_samples_per_second": 6.357,
447
- "eval_steps_per_second": 6.357,
448
- "step": 400
449
- },
450
- {
451
- "epoch": 0.3,
452
- "eval_loss": 0.7799659371376038,
453
- "eval_runtime": 0.1581,
454
- "eval_samples_per_second": 6.327,
455
- "eval_steps_per_second": 6.327,
456
- "step": 410
457
- },
458
- {
459
- "epoch": 0.31,
460
- "learning_rate": 0.00022319422150882823,
461
- "loss": 0.8072,
462
- "step": 420
463
- },
464
- {
465
- "epoch": 0.31,
466
- "eval_loss": 0.7800132036209106,
467
- "eval_runtime": 0.1573,
468
- "eval_samples_per_second": 6.359,
469
- "eval_steps_per_second": 6.359,
470
- "step": 420
471
- },
472
- {
473
- "epoch": 0.32,
474
- "eval_loss": 0.7845722436904907,
475
- "eval_runtime": 0.158,
476
- "eval_samples_per_second": 6.329,
477
- "eval_steps_per_second": 6.329,
478
- "step": 430
479
- },
480
- {
481
- "epoch": 0.33,
482
- "learning_rate": 0.0002183788121990369,
483
- "loss": 0.8152,
484
- "step": 440
485
- },
486
- {
487
- "epoch": 0.33,
488
- "eval_loss": 0.7644073963165283,
489
- "eval_runtime": 0.1571,
490
- "eval_samples_per_second": 6.367,
491
- "eval_steps_per_second": 6.367,
492
- "step": 440
493
- },
494
- {
495
- "epoch": 0.33,
496
- "eval_loss": 0.7579188346862793,
497
- "eval_runtime": 0.1571,
498
- "eval_samples_per_second": 6.367,
499
- "eval_steps_per_second": 6.367,
500
- "step": 450
501
- },
502
- {
503
- "epoch": 0.34,
504
- "learning_rate": 0.00021356340288924557,
505
- "loss": 0.8074,
506
- "step": 460
507
- },
508
- {
509
- "epoch": 0.34,
510
- "eval_loss": 0.7676059007644653,
511
- "eval_runtime": 0.1577,
512
- "eval_samples_per_second": 6.341,
513
- "eval_steps_per_second": 6.341,
514
- "step": 460
515
- },
516
- {
517
- "epoch": 0.35,
518
- "eval_loss": 0.7583163976669312,
519
- "eval_runtime": 0.1573,
520
- "eval_samples_per_second": 6.358,
521
- "eval_steps_per_second": 6.358,
522
- "step": 470
523
- },
524
- {
525
- "epoch": 0.36,
526
- "learning_rate": 0.00020874799357945424,
527
- "loss": 0.8118,
528
- "step": 480
529
- },
530
- {
531
- "epoch": 0.36,
532
- "eval_loss": 0.7768086194992065,
533
- "eval_runtime": 0.1577,
534
- "eval_samples_per_second": 6.341,
535
- "eval_steps_per_second": 6.341,
536
- "step": 480
537
- },
538
- {
539
- "epoch": 0.36,
540
- "eval_loss": 0.7975252270698547,
541
- "eval_runtime": 0.157,
542
- "eval_samples_per_second": 6.37,
543
- "eval_steps_per_second": 6.37,
544
- "step": 490
545
- },
546
- {
547
- "epoch": 0.37,
548
- "learning_rate": 0.0002039325842696629,
549
- "loss": 0.8098,
550
- "step": 500
551
- },
552
- {
553
- "epoch": 0.37,
554
- "eval_loss": 0.8035451173782349,
555
- "eval_runtime": 0.1565,
556
- "eval_samples_per_second": 6.388,
557
- "eval_steps_per_second": 6.388,
558
- "step": 500
559
- },
560
- {
561
- "epoch": 0.38,
562
- "eval_loss": 0.7718653678894043,
563
- "eval_runtime": 0.1575,
564
- "eval_samples_per_second": 6.347,
565
- "eval_steps_per_second": 6.347,
566
- "step": 510
567
- },
568
- {
569
- "epoch": 0.39,
570
- "learning_rate": 0.00019911717495987158,
571
- "loss": 0.8167,
572
- "step": 520
573
- },
574
- {
575
- "epoch": 0.39,
576
- "eval_loss": 0.771480917930603,
577
- "eval_runtime": 0.1568,
578
- "eval_samples_per_second": 6.379,
579
- "eval_steps_per_second": 6.379,
580
- "step": 520
581
- },
582
- {
583
- "epoch": 0.39,
584
- "eval_loss": 0.7757036685943604,
585
- "eval_runtime": 0.1571,
586
- "eval_samples_per_second": 6.364,
587
- "eval_steps_per_second": 6.364,
588
- "step": 530
589
- },
590
- {
591
- "epoch": 0.4,
592
- "learning_rate": 0.00019430176565008025,
593
- "loss": 0.8071,
594
- "step": 540
595
- },
596
- {
597
- "epoch": 0.4,
598
- "eval_loss": 0.8048112392425537,
599
- "eval_runtime": 0.1563,
600
- "eval_samples_per_second": 6.396,
601
- "eval_steps_per_second": 6.396,
602
- "step": 540
603
- },
604
- {
605
- "epoch": 0.41,
606
- "eval_loss": 0.7987676858901978,
607
- "eval_runtime": 0.1562,
608
- "eval_samples_per_second": 6.402,
609
- "eval_steps_per_second": 6.402,
610
- "step": 550
611
- },
612
- {
613
- "epoch": 0.42,
614
- "learning_rate": 0.00018948635634028892,
615
- "loss": 0.8143,
616
- "step": 560
617
- },
618
- {
619
- "epoch": 0.42,
620
- "eval_loss": 0.7960355281829834,
621
- "eval_runtime": 0.159,
622
- "eval_samples_per_second": 6.29,
623
- "eval_steps_per_second": 6.29,
624
- "step": 560
625
- },
626
- {
627
- "epoch": 0.42,
628
- "eval_loss": 0.7673896551132202,
629
- "eval_runtime": 0.1569,
630
- "eval_samples_per_second": 6.372,
631
- "eval_steps_per_second": 6.372,
632
- "step": 570
633
- },
634
- {
635
- "epoch": 0.43,
636
- "learning_rate": 0.0001846709470304976,
637
- "loss": 0.7989,
638
- "step": 580
639
- },
640
- {
641
- "epoch": 0.43,
642
- "eval_loss": 0.7655194997787476,
643
- "eval_runtime": 0.1578,
644
- "eval_samples_per_second": 6.336,
645
- "eval_steps_per_second": 6.336,
646
- "step": 580
647
- },
648
- {
649
- "epoch": 0.44,
650
- "eval_loss": 0.7695807218551636,
651
- "eval_runtime": 0.1571,
652
- "eval_samples_per_second": 6.365,
653
- "eval_steps_per_second": 6.365,
654
- "step": 590
655
- },
656
- {
657
- "epoch": 0.45,
658
- "learning_rate": 0.00017985553772070626,
659
- "loss": 0.8121,
660
- "step": 600
661
- },
662
- {
663
- "epoch": 0.45,
664
- "eval_loss": 0.7599303722381592,
665
- "eval_runtime": 0.1566,
666
- "eval_samples_per_second": 6.385,
667
- "eval_steps_per_second": 6.385,
668
- "step": 600
669
- },
670
- {
671
- "epoch": 0.45,
672
- "eval_loss": 0.7409216165542603,
673
- "eval_runtime": 0.1565,
674
- "eval_samples_per_second": 6.389,
675
- "eval_steps_per_second": 6.389,
676
- "step": 610
677
- },
678
- {
679
- "epoch": 0.46,
680
- "learning_rate": 0.00017504012841091494,
681
- "loss": 0.8105,
682
- "step": 620
683
- },
684
- {
685
- "epoch": 0.46,
686
- "eval_loss": 0.7620519995689392,
687
- "eval_runtime": 0.158,
688
- "eval_samples_per_second": 6.331,
689
- "eval_steps_per_second": 6.331,
690
- "step": 620
691
- },
692
- {
693
- "epoch": 0.47,
694
- "eval_loss": 0.7642089128494263,
695
- "eval_runtime": 0.1573,
696
- "eval_samples_per_second": 6.356,
697
- "eval_steps_per_second": 6.356,
698
- "step": 630
699
- },
700
- {
701
- "epoch": 0.48,
702
- "learning_rate": 0.0001702247191011236,
703
- "loss": 0.8073,
704
- "step": 640
705
- },
706
- {
707
- "epoch": 0.48,
708
- "eval_loss": 0.7464691400527954,
709
- "eval_runtime": 0.156,
710
- "eval_samples_per_second": 6.409,
711
- "eval_steps_per_second": 6.409,
712
- "step": 640
713
- },
714
- {
715
- "epoch": 0.48,
716
- "eval_loss": 0.7520545721054077,
717
- "eval_runtime": 0.158,
718
- "eval_samples_per_second": 6.328,
719
- "eval_steps_per_second": 6.328,
720
- "step": 650
721
- },
722
- {
723
- "epoch": 0.49,
724
- "learning_rate": 0.00016540930979133222,
725
- "loss": 0.8115,
726
- "step": 660
727
- },
728
- {
729
- "epoch": 0.49,
730
- "eval_loss": 0.7851120233535767,
731
- "eval_runtime": 0.1573,
732
- "eval_samples_per_second": 6.358,
733
- "eval_steps_per_second": 6.358,
734
- "step": 660
735
- },
736
- {
737
- "epoch": 0.5,
738
- "eval_loss": 0.7845426797866821,
739
- "eval_runtime": 0.1567,
740
- "eval_samples_per_second": 6.381,
741
- "eval_steps_per_second": 6.381,
742
- "step": 670
743
- },
744
- {
745
- "epoch": 0.5,
746
- "learning_rate": 0.0001605939004815409,
747
- "loss": 0.8213,
748
- "step": 680
749
- },
750
- {
751
- "epoch": 0.5,
752
- "eval_loss": 0.7726190686225891,
753
- "eval_runtime": 0.157,
754
- "eval_samples_per_second": 6.37,
755
- "eval_steps_per_second": 6.37,
756
- "step": 680
757
- },
758
- {
759
- "epoch": 0.51,
760
- "eval_loss": 0.7661857604980469,
761
- "eval_runtime": 0.1571,
762
- "eval_samples_per_second": 6.366,
763
- "eval_steps_per_second": 6.366,
764
- "step": 690
765
- },
766
- {
767
- "epoch": 0.52,
768
- "learning_rate": 0.00015577849117174957,
769
- "loss": 0.8155,
770
- "step": 700
771
- },
772
- {
773
- "epoch": 0.52,
774
- "eval_loss": 0.7846134305000305,
775
- "eval_runtime": 0.1564,
776
- "eval_samples_per_second": 6.394,
777
- "eval_steps_per_second": 6.394,
778
- "step": 700
779
- },
780
- {
781
- "epoch": 0.53,
782
- "eval_loss": 0.7888688445091248,
783
- "eval_runtime": 0.1581,
784
- "eval_samples_per_second": 6.324,
785
- "eval_steps_per_second": 6.324,
786
- "step": 710
787
- },
788
- {
789
- "epoch": 0.53,
790
- "learning_rate": 0.00015096308186195824,
791
- "loss": 0.803,
792
- "step": 720
793
- },
794
- {
795
- "epoch": 0.53,
796
- "eval_loss": 0.7981730103492737,
797
- "eval_runtime": 0.1566,
798
- "eval_samples_per_second": 6.385,
799
- "eval_steps_per_second": 6.385,
800
- "step": 720
801
- },
802
- {
803
- "epoch": 0.54,
804
- "eval_loss": 0.8129211664199829,
805
- "eval_runtime": 0.1562,
806
- "eval_samples_per_second": 6.401,
807
- "eval_steps_per_second": 6.401,
808
- "step": 730
809
- },
810
- {
811
- "epoch": 0.55,
812
- "learning_rate": 0.0001461476725521669,
813
- "loss": 0.8108,
814
- "step": 740
815
- },
816
- {
817
- "epoch": 0.55,
818
- "eval_loss": 0.7717241644859314,
819
- "eval_runtime": 0.1569,
820
- "eval_samples_per_second": 6.375,
821
- "eval_steps_per_second": 6.375,
822
- "step": 740
823
- },
824
- {
825
- "epoch": 0.56,
826
- "eval_loss": 0.7536574602127075,
827
- "eval_runtime": 0.1567,
828
- "eval_samples_per_second": 6.38,
829
- "eval_steps_per_second": 6.38,
830
- "step": 750
831
- },
832
- {
833
- "epoch": 0.56,
834
- "learning_rate": 0.00014133226324237558,
835
- "loss": 0.804,
836
- "step": 760
837
- },
838
- {
839
- "epoch": 0.56,
840
- "eval_loss": 0.7452749013900757,
841
- "eval_runtime": 0.1564,
842
- "eval_samples_per_second": 6.392,
843
- "eval_steps_per_second": 6.392,
844
- "step": 760
845
- },
846
- {
847
- "epoch": 0.57,
848
- "eval_loss": 0.7427541613578796,
849
- "eval_runtime": 0.1573,
850
- "eval_samples_per_second": 6.358,
851
- "eval_steps_per_second": 6.358,
852
- "step": 770
853
- },
854
- {
855
- "epoch": 0.58,
856
- "learning_rate": 0.00013651685393258425,
857
- "loss": 0.7983,
858
- "step": 780
859
- },
860
- {
861
- "epoch": 0.58,
862
- "eval_loss": 0.7469484210014343,
863
- "eval_runtime": 0.1562,
864
- "eval_samples_per_second": 6.404,
865
- "eval_steps_per_second": 6.404,
866
- "step": 780
867
- },
868
- {
869
- "epoch": 0.59,
870
- "eval_loss": 0.7667011022567749,
871
- "eval_runtime": 0.1568,
872
- "eval_samples_per_second": 6.376,
873
- "eval_steps_per_second": 6.376,
874
- "step": 790
875
- },
876
- {
877
- "epoch": 0.59,
878
- "learning_rate": 0.00013170144462279292,
879
- "loss": 0.8054,
880
- "step": 800
881
- },
882
- {
883
- "epoch": 0.59,
884
- "eval_loss": 0.7562040090560913,
885
- "eval_runtime": 0.1651,
886
- "eval_samples_per_second": 6.057,
887
- "eval_steps_per_second": 6.057,
888
- "step": 800
889
- },
890
- {
891
- "epoch": 0.6,
892
- "eval_loss": 0.7466799020767212,
893
- "eval_runtime": 0.1572,
894
- "eval_samples_per_second": 6.359,
895
- "eval_steps_per_second": 6.359,
896
- "step": 810
897
- },
898
- {
899
- "epoch": 0.61,
900
- "learning_rate": 0.0001268860353130016,
901
- "loss": 0.8143,
902
- "step": 820
903
- },
904
- {
905
- "epoch": 0.61,
906
- "eval_loss": 0.7601323127746582,
907
- "eval_runtime": 0.1768,
908
- "eval_samples_per_second": 5.657,
909
- "eval_steps_per_second": 5.657,
910
- "step": 820
911
- },
912
- {
913
- "epoch": 0.62,
914
- "eval_loss": 0.7430717349052429,
915
- "eval_runtime": 0.1575,
916
- "eval_samples_per_second": 6.349,
917
- "eval_steps_per_second": 6.349,
918
- "step": 830
919
- },
920
- {
921
- "epoch": 0.62,
922
- "learning_rate": 0.00012207062600321026,
923
- "loss": 0.8038,
924
- "step": 840
925
- },
926
- {
927
- "epoch": 0.62,
928
- "eval_loss": 0.7633467316627502,
929
- "eval_runtime": 0.1577,
930
- "eval_samples_per_second": 6.342,
931
- "eval_steps_per_second": 6.342,
932
- "step": 840
933
- },
934
- {
935
- "epoch": 0.63,
936
- "eval_loss": 0.7451119422912598,
937
- "eval_runtime": 0.1575,
938
- "eval_samples_per_second": 6.349,
939
- "eval_steps_per_second": 6.349,
940
- "step": 850
941
- },
942
- {
943
- "epoch": 0.64,
944
- "learning_rate": 0.00011725521669341892,
945
- "loss": 0.8053,
946
- "step": 860
947
- },
948
- {
949
- "epoch": 0.64,
950
- "eval_loss": 0.7628670930862427,
951
- "eval_runtime": 0.1578,
952
- "eval_samples_per_second": 6.338,
953
- "eval_steps_per_second": 6.338,
954
- "step": 860
955
- },
956
- {
957
- "epoch": 0.65,
958
- "eval_loss": 0.7490127086639404,
959
- "eval_runtime": 0.1571,
960
- "eval_samples_per_second": 6.364,
961
- "eval_steps_per_second": 6.364,
962
- "step": 870
963
- },
964
- {
965
- "epoch": 0.65,
966
- "learning_rate": 0.00011243980738362759,
967
- "loss": 0.8007,
968
- "step": 880
969
- },
970
- {
971
- "epoch": 0.65,
972
- "eval_loss": 0.767558753490448,
973
- "eval_runtime": 0.1564,
974
- "eval_samples_per_second": 6.392,
975
- "eval_steps_per_second": 6.392,
976
- "step": 880
977
- },
978
- {
979
- "epoch": 0.66,
980
- "eval_loss": 0.7646104097366333,
981
- "eval_runtime": 0.1571,
982
- "eval_samples_per_second": 6.365,
983
- "eval_steps_per_second": 6.365,
984
- "step": 890
985
- },
986
- {
987
- "epoch": 0.67,
988
- "learning_rate": 0.00010762439807383626,
989
- "loss": 0.7999,
990
- "step": 900
991
- },
992
- {
993
- "epoch": 0.67,
994
- "eval_loss": 0.7580606937408447,
995
- "eval_runtime": 0.1559,
996
- "eval_samples_per_second": 6.414,
997
- "eval_steps_per_second": 6.414,
998
- "step": 900
999
- },
1000
- {
1001
- "epoch": 0.68,
1002
- "eval_loss": 0.7789003849029541,
1003
- "eval_runtime": 0.1568,
1004
- "eval_samples_per_second": 6.377,
1005
- "eval_steps_per_second": 6.377,
1006
- "step": 910
1007
- },
1008
- {
1009
- "epoch": 0.68,
1010
- "learning_rate": 0.00010280898876404493,
1011
- "loss": 0.8007,
1012
- "step": 920
1013
- },
1014
- {
1015
- "epoch": 0.68,
1016
- "eval_loss": 0.7511205077171326,
1017
- "eval_runtime": 0.1571,
1018
- "eval_samples_per_second": 6.365,
1019
- "eval_steps_per_second": 6.365,
1020
- "step": 920
1021
- },
1022
- {
1023
- "epoch": 0.69,
1024
- "eval_loss": 0.7637555599212646,
1025
- "eval_runtime": 0.1567,
1026
- "eval_samples_per_second": 6.38,
1027
- "eval_steps_per_second": 6.38,
1028
- "step": 930
1029
- },
1030
- {
1031
- "epoch": 0.7,
1032
- "learning_rate": 9.79935794542536e-05,
1033
- "loss": 0.8006,
1034
- "step": 940
1035
- },
1036
- {
1037
- "epoch": 0.7,
1038
- "eval_loss": 0.7561322450637817,
1039
- "eval_runtime": 0.1582,
1040
- "eval_samples_per_second": 6.322,
1041
- "eval_steps_per_second": 6.322,
1042
- "step": 940
1043
- },
1044
- {
1045
- "epoch": 0.71,
1046
- "eval_loss": 0.753495454788208,
1047
- "eval_runtime": 0.1564,
1048
- "eval_samples_per_second": 6.395,
1049
- "eval_steps_per_second": 6.395,
1050
- "step": 950
1051
- },
1052
- {
1053
- "epoch": 0.71,
1054
- "learning_rate": 9.317817014446228e-05,
1055
- "loss": 0.8055,
1056
- "step": 960
1057
- },
1058
- {
1059
- "epoch": 0.71,
1060
- "eval_loss": 0.7372075319290161,
1061
- "eval_runtime": 0.1564,
1062
- "eval_samples_per_second": 6.396,
1063
- "eval_steps_per_second": 6.396,
1064
- "step": 960
1065
- },
1066
- {
1067
- "epoch": 0.72,
1068
- "eval_loss": 0.761570394039154,
1069
- "eval_runtime": 0.158,
1070
- "eval_samples_per_second": 6.329,
1071
- "eval_steps_per_second": 6.329,
1072
- "step": 970
1073
- },
1074
- {
1075
- "epoch": 0.73,
1076
- "learning_rate": 8.836276083467093e-05,
1077
- "loss": 0.7951,
1078
- "step": 980
1079
- },
1080
- {
1081
- "epoch": 0.73,
1082
- "eval_loss": 0.7430101037025452,
1083
- "eval_runtime": 0.1566,
1084
- "eval_samples_per_second": 6.386,
1085
- "eval_steps_per_second": 6.386,
1086
- "step": 980
1087
- },
1088
- {
1089
- "epoch": 0.74,
1090
- "eval_loss": 0.7443385124206543,
1091
- "eval_runtime": 0.1567,
1092
- "eval_samples_per_second": 6.381,
1093
- "eval_steps_per_second": 6.381,
1094
- "step": 990
1095
- },
1096
- {
1097
- "epoch": 0.74,
1098
- "learning_rate": 8.35473515248796e-05,
1099
- "loss": 0.8049,
1100
- "step": 1000
1101
- },
1102
- {
1103
- "epoch": 0.74,
1104
- "eval_loss": 0.7466243505477905,
1105
- "eval_runtime": 0.1568,
1106
- "eval_samples_per_second": 6.376,
1107
- "eval_steps_per_second": 6.376,
1108
- "step": 1000
1109
- },
1110
- {
1111
- "epoch": 0.75,
1112
- "eval_loss": 0.7397581934928894,
1113
- "eval_runtime": 0.1575,
1114
- "eval_samples_per_second": 6.347,
1115
- "eval_steps_per_second": 6.347,
1116
- "step": 1010
1117
- },
1118
- {
1119
- "epoch": 0.76,
1120
- "learning_rate": 7.873194221508827e-05,
1121
- "loss": 0.8131,
1122
- "step": 1020
1123
- },
1124
- {
1125
- "epoch": 0.76,
1126
- "eval_loss": 0.7271538972854614,
1127
- "eval_runtime": 0.1575,
1128
- "eval_samples_per_second": 6.35,
1129
- "eval_steps_per_second": 6.35,
1130
- "step": 1020
1131
- },
1132
- {
1133
- "epoch": 0.76,
1134
- "eval_loss": 0.7354034781455994,
1135
- "eval_runtime": 0.1578,
1136
- "eval_samples_per_second": 6.337,
1137
- "eval_steps_per_second": 6.337,
1138
- "step": 1030
1139
- },
1140
- {
1141
- "epoch": 0.77,
1142
- "learning_rate": 7.391653290529695e-05,
1143
- "loss": 0.7991,
1144
- "step": 1040
1145
- },
1146
- {
1147
- "epoch": 0.77,
1148
- "eval_loss": 0.7297641038894653,
1149
- "eval_runtime": 0.1764,
1150
- "eval_samples_per_second": 5.67,
1151
- "eval_steps_per_second": 5.67,
1152
- "step": 1040
1153
- },
1154
- {
1155
- "epoch": 0.78,
1156
- "eval_loss": 0.7396876811981201,
1157
- "eval_runtime": 0.158,
1158
- "eval_samples_per_second": 6.327,
1159
- "eval_steps_per_second": 6.327,
1160
- "step": 1050
1161
- },
1162
- {
1163
- "epoch": 0.79,
1164
- "learning_rate": 6.910112359550562e-05,
1165
- "loss": 0.8075,
1166
- "step": 1060
1167
- },
1168
- {
1169
- "epoch": 0.79,
1170
- "eval_loss": 0.7296563386917114,
1171
- "eval_runtime": 0.1577,
1172
- "eval_samples_per_second": 6.343,
1173
- "eval_steps_per_second": 6.343,
1174
- "step": 1060
1175
- },
1176
- {
1177
- "epoch": 0.79,
1178
- "eval_loss": 0.7264513969421387,
1179
- "eval_runtime": 0.1571,
1180
- "eval_samples_per_second": 6.366,
1181
- "eval_steps_per_second": 6.366,
1182
- "step": 1070
1183
- },
1184
- {
1185
- "epoch": 0.8,
1186
- "learning_rate": 6.428571428571427e-05,
1187
- "loss": 0.8075,
1188
- "step": 1080
1189
- },
1190
- {
1191
- "epoch": 0.8,
1192
- "eval_loss": 0.7289776802062988,
1193
- "eval_runtime": 0.1571,
1194
- "eval_samples_per_second": 6.367,
1195
- "eval_steps_per_second": 6.367,
1196
- "step": 1080
1197
- },
1198
- {
1199
- "epoch": 0.81,
1200
- "eval_loss": 0.7540558576583862,
1201
- "eval_runtime": 0.1571,
1202
- "eval_samples_per_second": 6.365,
1203
- "eval_steps_per_second": 6.365,
1204
- "step": 1090
1205
- },
1206
- {
1207
- "epoch": 0.82,
1208
- "learning_rate": 5.9470304975922945e-05,
1209
- "loss": 0.7916,
1210
- "step": 1100
1211
- },
1212
- {
1213
- "epoch": 0.82,
1214
- "eval_loss": 0.7357069253921509,
1215
- "eval_runtime": 0.1564,
1216
- "eval_samples_per_second": 6.393,
1217
- "eval_steps_per_second": 6.393,
1218
- "step": 1100
1219
- },
1220
- {
1221
- "epoch": 0.82,
1222
- "eval_loss": 0.7496874928474426,
1223
- "eval_runtime": 0.1575,
1224
- "eval_samples_per_second": 6.351,
1225
- "eval_steps_per_second": 6.351,
1226
- "step": 1110
1227
- },
1228
- {
1229
- "epoch": 0.83,
1230
- "learning_rate": 5.4654895666131616e-05,
1231
- "loss": 0.7899,
1232
- "step": 1120
1233
- },
1234
- {
1235
- "epoch": 0.83,
1236
- "eval_loss": 0.7296478748321533,
1237
- "eval_runtime": 0.1573,
1238
- "eval_samples_per_second": 6.357,
1239
- "eval_steps_per_second": 6.357,
1240
- "step": 1120
1241
- },
1242
- {
1243
- "epoch": 0.84,
1244
- "eval_loss": 0.7285305261611938,
1245
- "eval_runtime": 0.1576,
1246
- "eval_samples_per_second": 6.346,
1247
- "eval_steps_per_second": 6.346,
1248
- "step": 1130
1249
- },
1250
- {
1251
- "epoch": 0.85,
1252
- "learning_rate": 4.983948635634029e-05,
1253
- "loss": 0.7958,
1254
- "step": 1140
1255
- },
1256
- {
1257
- "epoch": 0.85,
1258
- "eval_loss": 0.7548575401306152,
1259
- "eval_runtime": 0.157,
1260
- "eval_samples_per_second": 6.368,
1261
- "eval_steps_per_second": 6.368,
1262
- "step": 1140
1263
- },
1264
- {
1265
- "epoch": 0.85,
1266
- "eval_loss": 0.75063556432724,
1267
- "eval_runtime": 0.1577,
1268
- "eval_samples_per_second": 6.343,
1269
- "eval_steps_per_second": 6.343,
1270
- "step": 1150
1271
- },
1272
- {
1273
- "epoch": 0.86,
1274
- "learning_rate": 4.502407704654895e-05,
1275
- "loss": 0.8004,
1276
- "step": 1160
1277
- },
1278
- {
1279
- "epoch": 0.86,
1280
- "eval_loss": 0.7478867769241333,
1281
- "eval_runtime": 0.1577,
1282
- "eval_samples_per_second": 6.341,
1283
- "eval_steps_per_second": 6.341,
1284
- "step": 1160
1285
- },
1286
- {
1287
- "epoch": 0.87,
1288
- "eval_loss": 0.7417320609092712,
1289
- "eval_runtime": 0.1572,
1290
- "eval_samples_per_second": 6.361,
1291
- "eval_steps_per_second": 6.361,
1292
- "step": 1170
1293
- },
1294
- {
1295
- "epoch": 0.88,
1296
- "learning_rate": 4.020866773675762e-05,
1297
- "loss": 0.7927,
1298
- "step": 1180
1299
- },
1300
- {
1301
- "epoch": 0.88,
1302
- "eval_loss": 0.7448097467422485,
1303
- "eval_runtime": 0.1578,
1304
- "eval_samples_per_second": 6.337,
1305
- "eval_steps_per_second": 6.337,
1306
- "step": 1180
1307
- },
1308
- {
1309
- "epoch": 0.88,
1310
- "eval_loss": 0.7570788860321045,
1311
- "eval_runtime": 0.1576,
1312
- "eval_samples_per_second": 6.346,
1313
- "eval_steps_per_second": 6.346,
1314
- "step": 1190
1315
- },
1316
- {
1317
- "epoch": 0.89,
1318
- "learning_rate": 3.539325842696629e-05,
1319
- "loss": 0.8013,
1320
- "step": 1200
1321
- },
1322
- {
1323
- "epoch": 0.89,
1324
- "eval_loss": 0.7594838738441467,
1325
- "eval_runtime": 0.1581,
1326
- "eval_samples_per_second": 6.324,
1327
- "eval_steps_per_second": 6.324,
1328
- "step": 1200
1329
- },
1330
- {
1331
- "epoch": 0.9,
1332
- "eval_loss": 0.7382398843765259,
1333
- "eval_runtime": 0.157,
1334
- "eval_samples_per_second": 6.369,
1335
- "eval_steps_per_second": 6.369,
1336
- "step": 1210
1337
- },
1338
- {
1339
- "epoch": 0.91,
1340
- "learning_rate": 3.057784911717496e-05,
1341
- "loss": 0.8002,
1342
- "step": 1220
1343
- },
1344
- {
1345
- "epoch": 0.91,
1346
- "eval_loss": 0.7314504384994507,
1347
- "eval_runtime": 0.1571,
1348
- "eval_samples_per_second": 6.366,
1349
- "eval_steps_per_second": 6.366,
1350
- "step": 1220
1351
- },
1352
- {
1353
- "epoch": 0.91,
1354
- "eval_loss": 0.780935525894165,
1355
- "eval_runtime": 0.1575,
1356
- "eval_samples_per_second": 6.348,
1357
- "eval_steps_per_second": 6.348,
1358
- "step": 1230
1359
- },
1360
- {
1361
- "epoch": 0.92,
1362
- "learning_rate": 2.5762439807383626e-05,
1363
- "loss": 0.7914,
1364
- "step": 1240
1365
- },
1366
- {
1367
- "epoch": 0.92,
1368
- "eval_loss": 0.7425116300582886,
1369
- "eval_runtime": 0.1574,
1370
- "eval_samples_per_second": 6.352,
1371
- "eval_steps_per_second": 6.352,
1372
- "step": 1240
1373
- },
1374
- {
1375
- "epoch": 0.93,
1376
- "eval_loss": 0.7363260984420776,
1377
- "eval_runtime": 0.1564,
1378
- "eval_samples_per_second": 6.394,
1379
- "eval_steps_per_second": 6.394,
1380
- "step": 1250
1381
- },
1382
- {
1383
- "epoch": 0.94,
1384
- "learning_rate": 2.0947030497592293e-05,
1385
- "loss": 0.7911,
1386
- "step": 1260
1387
- },
1388
- {
1389
- "epoch": 0.94,
1390
- "eval_loss": 0.7363616228103638,
1391
- "eval_runtime": 0.1566,
1392
- "eval_samples_per_second": 6.385,
1393
- "eval_steps_per_second": 6.385,
1394
- "step": 1260
1395
- },
1396
- {
1397
- "epoch": 0.94,
1398
- "eval_loss": 0.7710204124450684,
1399
- "eval_runtime": 0.1575,
1400
- "eval_samples_per_second": 6.349,
1401
- "eval_steps_per_second": 6.349,
1402
- "step": 1270
1403
- },
1404
- {
1405
- "epoch": 0.95,
1406
- "learning_rate": 1.613162118780096e-05,
1407
- "loss": 0.7973,
1408
- "step": 1280
1409
- },
1410
- {
1411
- "epoch": 0.95,
1412
- "eval_loss": 0.7362488508224487,
1413
- "eval_runtime": 0.1575,
1414
- "eval_samples_per_second": 6.349,
1415
- "eval_steps_per_second": 6.349,
1416
- "step": 1280
1417
- },
1418
- {
1419
- "epoch": 0.96,
1420
- "eval_loss": 0.7484357357025146,
1421
- "eval_runtime": 0.1581,
1422
- "eval_samples_per_second": 6.326,
1423
- "eval_steps_per_second": 6.326,
1424
- "step": 1290
1425
- },
1426
- {
1427
- "epoch": 0.97,
1428
- "learning_rate": 1.1316211878009629e-05,
1429
- "loss": 0.8008,
1430
- "step": 1300
1431
- },
1432
- {
1433
- "epoch": 0.97,
1434
- "eval_loss": 0.7393674254417419,
1435
- "eval_runtime": 0.1578,
1436
- "eval_samples_per_second": 6.338,
1437
- "eval_steps_per_second": 6.338,
1438
- "step": 1300
1439
- },
1440
- {
1441
- "epoch": 0.97,
1442
- "eval_loss": 0.738249659538269,
1443
- "eval_runtime": 0.1576,
1444
- "eval_samples_per_second": 6.344,
1445
- "eval_steps_per_second": 6.344,
1446
- "step": 1310
1447
- },
1448
- {
1449
- "epoch": 0.98,
1450
- "learning_rate": 6.500802568218298e-06,
1451
- "loss": 0.798,
1452
- "step": 1320
1453
- },
1454
- {
1455
- "epoch": 0.98,
1456
- "eval_loss": 0.7329107522964478,
1457
- "eval_runtime": 0.1575,
1458
- "eval_samples_per_second": 6.351,
1459
- "eval_steps_per_second": 6.351,
1460
- "step": 1320
1461
- },
1462
- {
1463
- "epoch": 0.99,
1464
- "eval_loss": 0.7391780614852905,
1465
- "eval_runtime": 0.1587,
1466
- "eval_samples_per_second": 6.302,
1467
- "eval_steps_per_second": 6.302,
1468
- "step": 1330
1469
- },
1470
- {
1471
- "epoch": 0.99,
1472
- "learning_rate": 1.6853932584269661e-06,
1473
- "loss": 0.7902,
1474
- "step": 1340
1475
- },
1476
- {
1477
- "epoch": 0.99,
1478
- "eval_loss": 0.7421530485153198,
1479
- "eval_runtime": 0.157,
1480
- "eval_samples_per_second": 6.37,
1481
- "eval_steps_per_second": 6.37,
1482
- "step": 1340
1483
- }
1484
- ],
1485
- "max_steps": 1346,
1486
- "num_train_epochs": 1,
1487
- "total_flos": 1.7418408714829824e+18,
1488
- "trial_name": null,
1489
- "trial_params": null
1490
- }