Training in progress, step 3438, checkpoint
Browse files
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 516810008
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:88b916425e9c0934a0f7062407f1b73535e218ac98e877fcb8ee9b8017fd579d
|
3 |
size 516810008
|
last-checkpoint/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14244
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9798aa39906f12cd89a0a56a12db91e16577b6948826e0e6863eded99960b381
|
3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:805c0bccc0907c9d87d7cceb385924edbfe5fd82910b2d6ec679bbe7866a153f
|
3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch": 1.
|
5 |
"eval_steps": 500,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
@@ -23437,6 +23437,650 @@
|
|
23437 |
"learning_rate": 1.8731449773342625e-07,
|
23438 |
"loss": 0.0,
|
23439 |
"step": 3346
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23440 |
}
|
23441 |
],
|
23442 |
"logging_steps": 1,
|
@@ -23451,12 +24095,12 @@
|
|
23451 |
"should_evaluate": false,
|
23452 |
"should_log": false,
|
23453 |
"should_save": true,
|
23454 |
-
"should_training_stop":
|
23455 |
},
|
23456 |
"attributes": {}
|
23457 |
}
|
23458 |
},
|
23459 |
-
"total_flos": 3.
|
23460 |
"train_batch_size": 2,
|
23461 |
"trial_name": null,
|
23462 |
"trial_params": null
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 1.9997091755125782,
|
5 |
"eval_steps": 500,
|
6 |
+
"global_step": 3438,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
|
|
23437 |
"learning_rate": 1.8731449773342625e-07,
|
23438 |
"loss": 0.0,
|
23439 |
"step": 3346
|
23440 |
+
},
|
23441 |
+
{
|
23442 |
+
"epoch": 1.946779118801803,
|
23443 |
+
"grad_norm": NaN,
|
23444 |
+
"learning_rate": 1.8326705005886624e-07,
|
23445 |
+
"loss": 0.0,
|
23446 |
+
"step": 3347
|
23447 |
+
},
|
23448 |
+
{
|
23449 |
+
"epoch": 1.947360767776647,
|
23450 |
+
"grad_norm": NaN,
|
23451 |
+
"learning_rate": 1.7926372916975965e-07,
|
23452 |
+
"loss": 0.0,
|
23453 |
+
"step": 3348
|
23454 |
+
},
|
23455 |
+
{
|
23456 |
+
"epoch": 1.9479424167514905,
|
23457 |
+
"grad_norm": NaN,
|
23458 |
+
"learning_rate": 1.7530453861218098e-07,
|
23459 |
+
"loss": 0.0,
|
23460 |
+
"step": 3349
|
23461 |
+
},
|
23462 |
+
{
|
23463 |
+
"epoch": 1.948524065726334,
|
23464 |
+
"grad_norm": NaN,
|
23465 |
+
"learning_rate": 1.7138948189311387e-07,
|
23466 |
+
"loss": 0.0,
|
23467 |
+
"step": 3350
|
23468 |
+
},
|
23469 |
+
{
|
23470 |
+
"epoch": 1.9491057147011779,
|
23471 |
+
"grad_norm": NaN,
|
23472 |
+
"learning_rate": 1.6751856248043985e-07,
|
23473 |
+
"loss": 0.0,
|
23474 |
+
"step": 3351
|
23475 |
+
},
|
23476 |
+
{
|
23477 |
+
"epoch": 1.9496873636760215,
|
23478 |
+
"grad_norm": NaN,
|
23479 |
+
"learning_rate": 1.6369178380294948e-07,
|
23480 |
+
"loss": 0.0,
|
23481 |
+
"step": 3352
|
23482 |
+
},
|
23483 |
+
{
|
23484 |
+
"epoch": 1.9502690126508653,
|
23485 |
+
"grad_norm": NaN,
|
23486 |
+
"learning_rate": 1.5990914925034794e-07,
|
23487 |
+
"loss": 0.0,
|
23488 |
+
"step": 3353
|
23489 |
+
},
|
23490 |
+
{
|
23491 |
+
"epoch": 1.9508506616257089,
|
23492 |
+
"grad_norm": NaN,
|
23493 |
+
"learning_rate": 1.561706621732162e-07,
|
23494 |
+
"loss": 0.0,
|
23495 |
+
"step": 3354
|
23496 |
+
},
|
23497 |
+
{
|
23498 |
+
"epoch": 1.9514323106005524,
|
23499 |
+
"grad_norm": NaN,
|
23500 |
+
"learning_rate": 1.5247632588304417e-07,
|
23501 |
+
"loss": 0.0,
|
23502 |
+
"step": 3355
|
23503 |
+
},
|
23504 |
+
{
|
23505 |
+
"epoch": 1.9520139595753963,
|
23506 |
+
"grad_norm": NaN,
|
23507 |
+
"learning_rate": 1.488261436522087e-07,
|
23508 |
+
"loss": 0.0,
|
23509 |
+
"step": 3356
|
23510 |
+
},
|
23511 |
+
{
|
23512 |
+
"epoch": 1.95259560855024,
|
23513 |
+
"grad_norm": NaN,
|
23514 |
+
"learning_rate": 1.45220118713979e-07,
|
23515 |
+
"loss": 0.0,
|
23516 |
+
"step": 3357
|
23517 |
+
},
|
23518 |
+
{
|
23519 |
+
"epoch": 1.9531772575250836,
|
23520 |
+
"grad_norm": NaN,
|
23521 |
+
"learning_rate": 1.4165825426250557e-07,
|
23522 |
+
"loss": 0.0,
|
23523 |
+
"step": 3358
|
23524 |
+
},
|
23525 |
+
{
|
23526 |
+
"epoch": 1.9537589064999272,
|
23527 |
+
"grad_norm": NaN,
|
23528 |
+
"learning_rate": 1.3814055345282573e-07,
|
23529 |
+
"loss": 0.0,
|
23530 |
+
"step": 3359
|
23531 |
+
},
|
23532 |
+
{
|
23533 |
+
"epoch": 1.9543405554747708,
|
23534 |
+
"grad_norm": NaN,
|
23535 |
+
"learning_rate": 1.3466701940085259e-07,
|
23536 |
+
"loss": 0.0,
|
23537 |
+
"step": 3360
|
23538 |
+
},
|
23539 |
+
{
|
23540 |
+
"epoch": 1.9549222044496146,
|
23541 |
+
"grad_norm": NaN,
|
23542 |
+
"learning_rate": 1.3123765518339716e-07,
|
23543 |
+
"loss": 0.0,
|
23544 |
+
"step": 3361
|
23545 |
+
},
|
23546 |
+
{
|
23547 |
+
"epoch": 1.9555038534244584,
|
23548 |
+
"grad_norm": NaN,
|
23549 |
+
"learning_rate": 1.2785246383811288e-07,
|
23550 |
+
"loss": 0.0,
|
23551 |
+
"step": 3362
|
23552 |
+
},
|
23553 |
+
{
|
23554 |
+
"epoch": 1.956085502399302,
|
23555 |
+
"grad_norm": NaN,
|
23556 |
+
"learning_rate": 1.2451144836355123e-07,
|
23557 |
+
"loss": 0.0,
|
23558 |
+
"step": 3363
|
23559 |
+
},
|
23560 |
+
{
|
23561 |
+
"epoch": 1.9566671513741456,
|
23562 |
+
"grad_norm": NaN,
|
23563 |
+
"learning_rate": 1.2121461171912262e-07,
|
23564 |
+
"loss": 0.0,
|
23565 |
+
"step": 3364
|
23566 |
+
},
|
23567 |
+
{
|
23568 |
+
"epoch": 1.9572488003489894,
|
23569 |
+
"grad_norm": NaN,
|
23570 |
+
"learning_rate": 1.179619568251078e-07,
|
23571 |
+
"loss": 0.0,
|
23572 |
+
"step": 3365
|
23573 |
+
},
|
23574 |
+
{
|
23575 |
+
"epoch": 1.9578304493238332,
|
23576 |
+
"grad_norm": NaN,
|
23577 |
+
"learning_rate": 1.147534865626465e-07,
|
23578 |
+
"loss": 0.0,
|
23579 |
+
"step": 3366
|
23580 |
+
},
|
23581 |
+
{
|
23582 |
+
"epoch": 1.9584120982986768,
|
23583 |
+
"grad_norm": NaN,
|
23584 |
+
"learning_rate": 1.1158920377375426e-07,
|
23585 |
+
"loss": 0.0,
|
23586 |
+
"step": 3367
|
23587 |
+
},
|
23588 |
+
{
|
23589 |
+
"epoch": 1.9589937472735204,
|
23590 |
+
"grad_norm": NaN,
|
23591 |
+
"learning_rate": 1.08469111261289e-07,
|
23592 |
+
"loss": 0.0,
|
23593 |
+
"step": 3368
|
23594 |
+
},
|
23595 |
+
{
|
23596 |
+
"epoch": 1.959575396248364,
|
23597 |
+
"grad_norm": NaN,
|
23598 |
+
"learning_rate": 1.0539321178897888e-07,
|
23599 |
+
"loss": 0.0,
|
23600 |
+
"step": 3369
|
23601 |
+
},
|
23602 |
+
{
|
23603 |
+
"epoch": 1.9601570452232078,
|
23604 |
+
"grad_norm": NaN,
|
23605 |
+
"learning_rate": 1.0236150808139999e-07,
|
23606 |
+
"loss": 0.0,
|
23607 |
+
"step": 3370
|
23608 |
+
},
|
23609 |
+
{
|
23610 |
+
"epoch": 1.9607386941980516,
|
23611 |
+
"grad_norm": NaN,
|
23612 |
+
"learning_rate": 9.937400282398201e-08,
|
23613 |
+
"loss": 0.0,
|
23614 |
+
"step": 3371
|
23615 |
+
},
|
23616 |
+
{
|
23617 |
+
"epoch": 1.9613203431728952,
|
23618 |
+
"grad_norm": NaN,
|
23619 |
+
"learning_rate": 9.643069866300259e-08,
|
23620 |
+
"loss": 0.0,
|
23621 |
+
"step": 3372
|
23622 |
+
},
|
23623 |
+
{
|
23624 |
+
"epoch": 1.9619019921477387,
|
23625 |
+
"grad_norm": NaN,
|
23626 |
+
"learning_rate": 9.353159820559287e-08,
|
23627 |
+
"loss": 0.0,
|
23628 |
+
"step": 3373
|
23629 |
+
},
|
23630 |
+
{
|
23631 |
+
"epoch": 1.9624836411225826,
|
23632 |
+
"grad_norm": NaN,
|
23633 |
+
"learning_rate": 9.067670401972095e-08,
|
23634 |
+
"loss": 0.0,
|
23635 |
+
"step": 3374
|
23636 |
+
},
|
23637 |
+
{
|
23638 |
+
"epoch": 1.9630652900974264,
|
23639 |
+
"grad_norm": NaN,
|
23640 |
+
"learning_rate": 8.786601863420286e-08,
|
23641 |
+
"loss": 0.0,
|
23642 |
+
"step": 3375
|
23643 |
+
},
|
23644 |
+
{
|
23645 |
+
"epoch": 1.96364693907227,
|
23646 |
+
"grad_norm": NaN,
|
23647 |
+
"learning_rate": 8.509954453869152e-08,
|
23648 |
+
"loss": 0.0,
|
23649 |
+
"step": 3376
|
23650 |
+
},
|
23651 |
+
{
|
23652 |
+
"epoch": 1.9642285880471135,
|
23653 |
+
"grad_norm": NaN,
|
23654 |
+
"learning_rate": 8.23772841836934e-08,
|
23655 |
+
"loss": 0.0,
|
23656 |
+
"step": 3377
|
23657 |
+
},
|
23658 |
+
{
|
23659 |
+
"epoch": 1.9648102370219571,
|
23660 |
+
"grad_norm": NaN,
|
23661 |
+
"learning_rate": 7.96992399805241e-08,
|
23662 |
+
"loss": 0.0,
|
23663 |
+
"step": 3378
|
23664 |
+
},
|
23665 |
+
{
|
23666 |
+
"epoch": 1.965391885996801,
|
23667 |
+
"grad_norm": NaN,
|
23668 |
+
"learning_rate": 7.706541430135273e-08,
|
23669 |
+
"loss": 0.0,
|
23670 |
+
"step": 3379
|
23671 |
+
},
|
23672 |
+
{
|
23673 |
+
"epoch": 1.9659735349716447,
|
23674 |
+
"grad_norm": NaN,
|
23675 |
+
"learning_rate": 7.447580947917975e-08,
|
23676 |
+
"loss": 0.0,
|
23677 |
+
"step": 3380
|
23678 |
+
},
|
23679 |
+
{
|
23680 |
+
"epoch": 1.9665551839464883,
|
23681 |
+
"grad_norm": NaN,
|
23682 |
+
"learning_rate": 7.193042780782588e-08,
|
23683 |
+
"loss": 0.0,
|
23684 |
+
"step": 3381
|
23685 |
+
},
|
23686 |
+
{
|
23687 |
+
"epoch": 1.967136832921332,
|
23688 |
+
"grad_norm": NaN,
|
23689 |
+
"learning_rate": 6.942927154194867e-08,
|
23690 |
+
"loss": 0.0,
|
23691 |
+
"step": 3382
|
23692 |
+
},
|
23693 |
+
{
|
23694 |
+
"epoch": 1.9677184818961757,
|
23695 |
+
"grad_norm": NaN,
|
23696 |
+
"learning_rate": 6.697234289703147e-08,
|
23697 |
+
"loss": 0.0,
|
23698 |
+
"step": 3383
|
23699 |
+
},
|
23700 |
+
{
|
23701 |
+
"epoch": 1.9683001308710193,
|
23702 |
+
"grad_norm": NaN,
|
23703 |
+
"learning_rate": 6.455964404937232e-08,
|
23704 |
+
"loss": 0.0,
|
23705 |
+
"step": 3384
|
23706 |
+
},
|
23707 |
+
{
|
23708 |
+
"epoch": 1.968881779845863,
|
23709 |
+
"grad_norm": NaN,
|
23710 |
+
"learning_rate": 6.219117713610056e-08,
|
23711 |
+
"loss": 0.0,
|
23712 |
+
"step": 3385
|
23713 |
+
},
|
23714 |
+
{
|
23715 |
+
"epoch": 1.9694634288207067,
|
23716 |
+
"grad_norm": NaN,
|
23717 |
+
"learning_rate": 5.986694425516026e-08,
|
23718 |
+
"loss": 0.0,
|
23719 |
+
"step": 3386
|
23720 |
+
},
|
23721 |
+
{
|
23722 |
+
"epoch": 1.9700450777955503,
|
23723 |
+
"grad_norm": NaN,
|
23724 |
+
"learning_rate": 5.7586947465315675e-08,
|
23725 |
+
"loss": 0.0,
|
23726 |
+
"step": 3387
|
23727 |
+
},
|
23728 |
+
{
|
23729 |
+
"epoch": 1.970626726770394,
|
23730 |
+
"grad_norm": NaN,
|
23731 |
+
"learning_rate": 5.535118878615131e-08,
|
23732 |
+
"loss": 0.0,
|
23733 |
+
"step": 3388
|
23734 |
+
},
|
23735 |
+
{
|
23736 |
+
"epoch": 1.9712083757452379,
|
23737 |
+
"grad_norm": NaN,
|
23738 |
+
"learning_rate": 5.315967019806078e-08,
|
23739 |
+
"loss": 0.0,
|
23740 |
+
"step": 3389
|
23741 |
+
},
|
23742 |
+
{
|
23743 |
+
"epoch": 1.9717900247200815,
|
23744 |
+
"grad_norm": NaN,
|
23745 |
+
"learning_rate": 5.101239364225796e-08,
|
23746 |
+
"loss": 0.0,
|
23747 |
+
"step": 3390
|
23748 |
+
},
|
23749 |
+
{
|
23750 |
+
"epoch": 1.972371673694925,
|
23751 |
+
"grad_norm": NaN,
|
23752 |
+
"learning_rate": 4.890936102075472e-08,
|
23753 |
+
"loss": 0.0,
|
23754 |
+
"step": 3391
|
23755 |
+
},
|
23756 |
+
{
|
23757 |
+
"epoch": 1.9729533226697686,
|
23758 |
+
"grad_norm": NaN,
|
23759 |
+
"learning_rate": 4.685057419638317e-08,
|
23760 |
+
"loss": 0.0,
|
23761 |
+
"step": 3392
|
23762 |
+
},
|
23763 |
+
{
|
23764 |
+
"epoch": 1.9735349716446124,
|
23765 |
+
"grad_norm": NaN,
|
23766 |
+
"learning_rate": 4.4836034992779e-08,
|
23767 |
+
"loss": 0.0,
|
23768 |
+
"step": 3393
|
23769 |
+
},
|
23770 |
+
{
|
23771 |
+
"epoch": 1.9741166206194563,
|
23772 |
+
"grad_norm": NaN,
|
23773 |
+
"learning_rate": 4.286574519438702e-08,
|
23774 |
+
"loss": 0.0,
|
23775 |
+
"step": 3394
|
23776 |
+
},
|
23777 |
+
{
|
23778 |
+
"epoch": 1.9746982695942998,
|
23779 |
+
"grad_norm": NaN,
|
23780 |
+
"learning_rate": 4.0939706546461175e-08,
|
23781 |
+
"loss": 0.0,
|
23782 |
+
"step": 3395
|
23783 |
+
},
|
23784 |
+
{
|
23785 |
+
"epoch": 1.9752799185691434,
|
23786 |
+
"grad_norm": NaN,
|
23787 |
+
"learning_rate": 3.905792075504233e-08,
|
23788 |
+
"loss": 0.0,
|
23789 |
+
"step": 3396
|
23790 |
+
},
|
23791 |
+
{
|
23792 |
+
"epoch": 1.9758615675439872,
|
23793 |
+
"grad_norm": NaN,
|
23794 |
+
"learning_rate": 3.722038948698603e-08,
|
23795 |
+
"loss": 0.0,
|
23796 |
+
"step": 3397
|
23797 |
+
},
|
23798 |
+
{
|
23799 |
+
"epoch": 1.976443216518831,
|
23800 |
+
"grad_norm": NaN,
|
23801 |
+
"learning_rate": 3.542711436995139e-08,
|
23802 |
+
"loss": 0.0,
|
23803 |
+
"step": 3398
|
23804 |
+
},
|
23805 |
+
{
|
23806 |
+
"epoch": 1.9770248654936746,
|
23807 |
+
"grad_norm": NaN,
|
23808 |
+
"learning_rate": 3.3678096992384446e-08,
|
23809 |
+
"loss": 0.0,
|
23810 |
+
"step": 3399
|
23811 |
+
},
|
23812 |
+
{
|
23813 |
+
"epoch": 1.9776065144685182,
|
23814 |
+
"grad_norm": NaN,
|
23815 |
+
"learning_rate": 3.197333890353482e-08,
|
23816 |
+
"loss": 0.0,
|
23817 |
+
"step": 3400
|
23818 |
+
},
|
23819 |
+
{
|
23820 |
+
"epoch": 1.9781881634433618,
|
23821 |
+
"grad_norm": NaN,
|
23822 |
+
"learning_rate": 3.031284161344461e-08,
|
23823 |
+
"loss": 0.0,
|
23824 |
+
"step": 3401
|
23825 |
+
},
|
23826 |
+
{
|
23827 |
+
"epoch": 1.9787698124182056,
|
23828 |
+
"grad_norm": NaN,
|
23829 |
+
"learning_rate": 2.8696606592959475e-08,
|
23830 |
+
"loss": 0.0,
|
23831 |
+
"step": 3402
|
23832 |
+
},
|
23833 |
+
{
|
23834 |
+
"epoch": 1.9793514613930494,
|
23835 |
+
"grad_norm": NaN,
|
23836 |
+
"learning_rate": 2.7124635273712006e-08,
|
23837 |
+
"loss": 0.0,
|
23838 |
+
"step": 3403
|
23839 |
+
},
|
23840 |
+
{
|
23841 |
+
"epoch": 1.979933110367893,
|
23842 |
+
"grad_norm": NaN,
|
23843 |
+
"learning_rate": 2.5596929048116168e-08,
|
23844 |
+
"loss": 0.0,
|
23845 |
+
"step": 3404
|
23846 |
+
},
|
23847 |
+
{
|
23848 |
+
"epoch": 1.9805147593427366,
|
23849 |
+
"grad_norm": NaN,
|
23850 |
+
"learning_rate": 2.41134892694006e-08,
|
23851 |
+
"loss": 0.0,
|
23852 |
+
"step": 3405
|
23853 |
+
},
|
23854 |
+
{
|
23855 |
+
"epoch": 1.9810964083175804,
|
23856 |
+
"grad_norm": NaN,
|
23857 |
+
"learning_rate": 2.2674317251558664e-08,
|
23858 |
+
"loss": 0.0,
|
23859 |
+
"step": 3406
|
23860 |
+
},
|
23861 |
+
{
|
23862 |
+
"epoch": 1.981678057292424,
|
23863 |
+
"grad_norm": NaN,
|
23864 |
+
"learning_rate": 2.127941426938729e-08,
|
23865 |
+
"loss": 0.0,
|
23866 |
+
"step": 3407
|
23867 |
+
},
|
23868 |
+
{
|
23869 |
+
"epoch": 1.9822597062672678,
|
23870 |
+
"grad_norm": NaN,
|
23871 |
+
"learning_rate": 1.9928781558475883e-08,
|
23872 |
+
"loss": 0.0,
|
23873 |
+
"step": 3408
|
23874 |
+
},
|
23875 |
+
{
|
23876 |
+
"epoch": 1.9828413552421114,
|
23877 |
+
"grad_norm": NaN,
|
23878 |
+
"learning_rate": 1.862242031517858e-08,
|
23879 |
+
"loss": 0.0,
|
23880 |
+
"step": 3409
|
23881 |
+
},
|
23882 |
+
{
|
23883 |
+
"epoch": 1.983423004216955,
|
23884 |
+
"grad_norm": NaN,
|
23885 |
+
"learning_rate": 1.7360331696653075e-08,
|
23886 |
+
"loss": 0.0,
|
23887 |
+
"step": 3410
|
23888 |
+
},
|
23889 |
+
{
|
23890 |
+
"epoch": 1.9840046531917988,
|
23891 |
+
"grad_norm": NaN,
|
23892 |
+
"learning_rate": 1.614251682083845e-08,
|
23893 |
+
"loss": 0.0,
|
23894 |
+
"step": 3411
|
23895 |
+
},
|
23896 |
+
{
|
23897 |
+
"epoch": 1.9845863021666426,
|
23898 |
+
"grad_norm": NaN,
|
23899 |
+
"learning_rate": 1.496897676644404e-08,
|
23900 |
+
"loss": 0.0,
|
23901 |
+
"step": 3412
|
23902 |
+
},
|
23903 |
+
{
|
23904 |
+
"epoch": 1.9851679511414861,
|
23905 |
+
"grad_norm": NaN,
|
23906 |
+
"learning_rate": 1.3839712572977227e-08,
|
23907 |
+
"loss": 0.0,
|
23908 |
+
"step": 3413
|
23909 |
+
},
|
23910 |
+
{
|
23911 |
+
"epoch": 1.9857496001163297,
|
23912 |
+
"grad_norm": NaN,
|
23913 |
+
"learning_rate": 1.275472524072674e-08,
|
23914 |
+
"loss": 0.0,
|
23915 |
+
"step": 3414
|
23916 |
+
},
|
23917 |
+
{
|
23918 |
+
"epoch": 1.9863312490911733,
|
23919 |
+
"grad_norm": NaN,
|
23920 |
+
"learning_rate": 1.1714015730740492e-08,
|
23921 |
+
"loss": 0.0,
|
23922 |
+
"step": 3415
|
23923 |
+
},
|
23924 |
+
{
|
23925 |
+
"epoch": 1.9869128980660171,
|
23926 |
+
"grad_norm": NaN,
|
23927 |
+
"learning_rate": 1.0717584964869964e-08,
|
23928 |
+
"loss": 0.0,
|
23929 |
+
"step": 3416
|
23930 |
+
},
|
23931 |
+
{
|
23932 |
+
"epoch": 1.987494547040861,
|
23933 |
+
"grad_norm": NaN,
|
23934 |
+
"learning_rate": 9.765433825736914e-09,
|
23935 |
+
"loss": 0.0,
|
23936 |
+
"step": 3417
|
23937 |
+
},
|
23938 |
+
{
|
23939 |
+
"epoch": 1.9880761960157045,
|
23940 |
+
"grad_norm": NaN,
|
23941 |
+
"learning_rate": 8.857563156738913e-09,
|
23942 |
+
"loss": 0.0,
|
23943 |
+
"step": 3418
|
23944 |
+
},
|
23945 |
+
{
|
23946 |
+
"epoch": 1.988657844990548,
|
23947 |
+
"grad_norm": NaN,
|
23948 |
+
"learning_rate": 7.993973762049356e-09,
|
23949 |
+
"loss": 0.0,
|
23950 |
+
"step": 3419
|
23951 |
+
},
|
23952 |
+
{
|
23953 |
+
"epoch": 1.989239493965392,
|
23954 |
+
"grad_norm": NaN,
|
23955 |
+
"learning_rate": 7.1746664066230094e-09,
|
23956 |
+
"loss": 0.0,
|
23957 |
+
"step": 3420
|
23958 |
+
},
|
23959 |
+
{
|
23960 |
+
"epoch": 1.9898211429402357,
|
23961 |
+
"grad_norm": NaN,
|
23962 |
+
"learning_rate": 6.399641816184909e-09,
|
23963 |
+
"loss": 0.0,
|
23964 |
+
"step": 3421
|
23965 |
+
},
|
23966 |
+
{
|
23967 |
+
"epoch": 1.9904027919150793,
|
23968 |
+
"grad_norm": NaN,
|
23969 |
+
"learning_rate": 5.668900677235911e-09,
|
23970 |
+
"loss": 0.0,
|
23971 |
+
"step": 3422
|
23972 |
+
},
|
23973 |
+
{
|
23974 |
+
"epoch": 1.9909844408899229,
|
23975 |
+
"grad_norm": NaN,
|
23976 |
+
"learning_rate": 4.982443637063794e-09,
|
23977 |
+
"loss": 0.0,
|
23978 |
+
"step": 3423
|
23979 |
+
},
|
23980 |
+
{
|
23981 |
+
"epoch": 1.9915660898647665,
|
23982 |
+
"grad_norm": NaN,
|
23983 |
+
"learning_rate": 4.340271303715504e-09,
|
23984 |
+
"loss": 0.0,
|
23985 |
+
"step": 3424
|
23986 |
+
},
|
23987 |
+
{
|
23988 |
+
"epoch": 1.9921477388396103,
|
23989 |
+
"grad_norm": NaN,
|
23990 |
+
"learning_rate": 3.742384246008257e-09,
|
23991 |
+
"loss": 0.0,
|
23992 |
+
"step": 3425
|
23993 |
+
},
|
23994 |
+
{
|
23995 |
+
"epoch": 1.992729387814454,
|
23996 |
+
"grad_norm": NaN,
|
23997 |
+
"learning_rate": 3.188782993551742e-09,
|
23998 |
+
"loss": 0.0,
|
23999 |
+
"step": 3426
|
24000 |
+
},
|
24001 |
+
{
|
24002 |
+
"epoch": 1.9933110367892977,
|
24003 |
+
"grad_norm": NaN,
|
24004 |
+
"learning_rate": 2.679468036709265e-09,
|
24005 |
+
"loss": 0.0,
|
24006 |
+
"step": 3427
|
24007 |
+
},
|
24008 |
+
{
|
24009 |
+
"epoch": 1.9938926857641412,
|
24010 |
+
"grad_norm": NaN,
|
24011 |
+
"learning_rate": 2.2144398266199518e-09,
|
24012 |
+
"loss": 0.0,
|
24013 |
+
"step": 3428
|
24014 |
+
},
|
24015 |
+
{
|
24016 |
+
"epoch": 1.994474334738985,
|
24017 |
+
"grad_norm": NaN,
|
24018 |
+
"learning_rate": 1.7936987752098511e-09,
|
24019 |
+
"loss": 0.0,
|
24020 |
+
"step": 3429
|
24021 |
+
},
|
24022 |
+
{
|
24023 |
+
"epoch": 1.9950559837138289,
|
24024 |
+
"grad_norm": NaN,
|
24025 |
+
"learning_rate": 1.417245255153077e-09,
|
24026 |
+
"loss": 0.0,
|
24027 |
+
"step": 3430
|
24028 |
+
},
|
24029 |
+
{
|
24030 |
+
"epoch": 1.9956376326886724,
|
24031 |
+
"grad_norm": NaN,
|
24032 |
+
"learning_rate": 1.0850795999051143e-09,
|
24033 |
+
"loss": 0.0,
|
24034 |
+
"step": 3431
|
24035 |
+
},
|
24036 |
+
{
|
24037 |
+
"epoch": 1.996219281663516,
|
24038 |
+
"grad_norm": NaN,
|
24039 |
+
"learning_rate": 7.972021036972699e-10,
|
24040 |
+
"loss": 0.0,
|
24041 |
+
"step": 3432
|
24042 |
+
},
|
24043 |
+
{
|
24044 |
+
"epoch": 1.9968009306383596,
|
24045 |
+
"grad_norm": NaN,
|
24046 |
+
"learning_rate": 5.536130215311186e-10,
|
24047 |
+
"loss": 0.0,
|
24048 |
+
"step": 3433
|
24049 |
+
},
|
24050 |
+
{
|
24051 |
+
"epoch": 1.9973825796132034,
|
24052 |
+
"grad_norm": NaN,
|
24053 |
+
"learning_rate": 3.5431256916185207e-10,
|
24054 |
+
"loss": 0.0,
|
24055 |
+
"step": 3434
|
24056 |
+
},
|
24057 |
+
{
|
24058 |
+
"epoch": 1.9979642285880472,
|
24059 |
+
"grad_norm": NaN,
|
24060 |
+
"learning_rate": 1.9930092313158455e-10,
|
24061 |
+
"loss": 0.0,
|
24062 |
+
"step": 3435
|
24063 |
+
},
|
24064 |
+
{
|
24065 |
+
"epoch": 1.9985458775628908,
|
24066 |
+
"grad_norm": NaN,
|
24067 |
+
"learning_rate": 8.857822075269973e-11,
|
24068 |
+
"loss": 0.0,
|
24069 |
+
"step": 3436
|
24070 |
+
},
|
24071 |
+
{
|
24072 |
+
"epoch": 1.9991275265377344,
|
24073 |
+
"grad_norm": NaN,
|
24074 |
+
"learning_rate": 2.2144560091197363e-11,
|
24075 |
+
"loss": 0.0,
|
24076 |
+
"step": 3437
|
24077 |
+
},
|
24078 |
+
{
|
24079 |
+
"epoch": 1.9997091755125782,
|
24080 |
+
"grad_norm": NaN,
|
24081 |
+
"learning_rate": 0.0,
|
24082 |
+
"loss": 0.0,
|
24083 |
+
"step": 3438
|
24084 |
}
|
24085 |
],
|
24086 |
"logging_steps": 1,
|
|
|
24095 |
"should_evaluate": false,
|
24096 |
"should_log": false,
|
24097 |
"should_save": true,
|
24098 |
+
"should_training_stop": true
|
24099 |
},
|
24100 |
"attributes": {}
|
24101 |
}
|
24102 |
},
|
24103 |
+
"total_flos": 3.203855633350656e+16,
|
24104 |
"train_batch_size": 2,
|
24105 |
"trial_name": null,
|
24106 |
"trial_params": null
|