seblaku commited on
Commit
84f03ee
·
verified ·
1 Parent(s): a773c62

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d469ea9df03972c343769dd3debf699740933ee629d873829a539d3c644ab553
3
  size 671149168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:515bd1fd2e800ff785b6479fdd6957fd4bf27e5b1c30a3f1cf8aae527f08a08b
3
  size 671149168
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8122a06e66c624c932a42178e5542adf5b26b376eb710b02aa90bcbdec3fb466
3
- size 341314196
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bd05b1422ea617e033a87629b7a8d0f6328816ebf140850f114f1f26f4cb7a8
3
+ size 341314644
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cdb60e2481fd56b78600b940df2781df5f95c30c8817875fe3c80f69f6a8ca9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:809cadbdaab075c11480d526dd51b8f21d82fd34b84730fb1208ace6f51dd4d6
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5de003083aa1583e9eb79bf332ee530b155db52c504be6ab21416c2870099d65
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5e2a8ca8cbdb222b4fcc6a743fadf9ab6adfaf2459d28805db388e0cbe4b5b9
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.36784711480140686,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-200",
4
- "epoch": 0.1568627450980392,
5
  "eval_steps": 100,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1431,6 +1431,714 @@
1431
  "eval_samples_per_second": 12.655,
1432
  "eval_steps_per_second": 3.164,
1433
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1434
  }
1435
  ],
1436
  "logging_steps": 1,
@@ -1459,7 +2167,7 @@
1459
  "attributes": {}
1460
  }
1461
  },
1462
- "total_flos": 3.030141694503813e+17,
1463
  "train_batch_size": 8,
1464
  "trial_name": null,
1465
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.35413047671318054,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-300",
4
+ "epoch": 0.23529411764705882,
5
  "eval_steps": 100,
6
+ "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1431
  "eval_samples_per_second": 12.655,
1432
  "eval_steps_per_second": 3.164,
1433
  "step": 200
1434
+ },
1435
+ {
1436
+ "epoch": 0.15764705882352942,
1437
+ "grad_norm": 2.9938836097717285,
1438
+ "learning_rate": 3.6127556076296055e-05,
1439
+ "loss": 3.0403,
1440
+ "step": 201
1441
+ },
1442
+ {
1443
+ "epoch": 0.1584313725490196,
1444
+ "grad_norm": 2.702965497970581,
1445
+ "learning_rate": 3.5845731083264626e-05,
1446
+ "loss": 1.9497,
1447
+ "step": 202
1448
+ },
1449
+ {
1450
+ "epoch": 0.1592156862745098,
1451
+ "grad_norm": 2.5172975063323975,
1452
+ "learning_rate": 3.5563851211955195e-05,
1453
+ "loss": 1.5842,
1454
+ "step": 203
1455
+ },
1456
+ {
1457
+ "epoch": 0.16,
1458
+ "grad_norm": 2.603217363357544,
1459
+ "learning_rate": 3.528193475314864e-05,
1460
+ "loss": 1.5727,
1461
+ "step": 204
1462
+ },
1463
+ {
1464
+ "epoch": 0.1607843137254902,
1465
+ "grad_norm": 2.615779399871826,
1466
+ "learning_rate": 3.5e-05,
1467
+ "loss": 1.3275,
1468
+ "step": 205
1469
+ },
1470
+ {
1471
+ "epoch": 0.16156862745098038,
1472
+ "grad_norm": 2.509752035140991,
1473
+ "learning_rate": 3.4718065246851354e-05,
1474
+ "loss": 1.2942,
1475
+ "step": 206
1476
+ },
1477
+ {
1478
+ "epoch": 0.1623529411764706,
1479
+ "grad_norm": 2.902695655822754,
1480
+ "learning_rate": 3.4436148788044806e-05,
1481
+ "loss": 1.1889,
1482
+ "step": 207
1483
+ },
1484
+ {
1485
+ "epoch": 0.1631372549019608,
1486
+ "grad_norm": 3.608950138092041,
1487
+ "learning_rate": 3.4154268916735375e-05,
1488
+ "loss": 1.4452,
1489
+ "step": 208
1490
+ },
1491
+ {
1492
+ "epoch": 0.16392156862745097,
1493
+ "grad_norm": 2.8737595081329346,
1494
+ "learning_rate": 3.387244392370395e-05,
1495
+ "loss": 1.2904,
1496
+ "step": 209
1497
+ },
1498
+ {
1499
+ "epoch": 0.16470588235294117,
1500
+ "grad_norm": 3.235945463180542,
1501
+ "learning_rate": 3.3590692096170476e-05,
1502
+ "loss": 1.2737,
1503
+ "step": 210
1504
+ },
1505
+ {
1506
+ "epoch": 0.16549019607843138,
1507
+ "grad_norm": 3.6993987560272217,
1508
+ "learning_rate": 3.3309031716607255e-05,
1509
+ "loss": 1.3786,
1510
+ "step": 211
1511
+ },
1512
+ {
1513
+ "epoch": 0.16627450980392156,
1514
+ "grad_norm": 3.325028419494629,
1515
+ "learning_rate": 3.302748106155268e-05,
1516
+ "loss": 1.1542,
1517
+ "step": 212
1518
+ },
1519
+ {
1520
+ "epoch": 0.16705882352941176,
1521
+ "grad_norm": 2.829763650894165,
1522
+ "learning_rate": 3.274605840042531e-05,
1523
+ "loss": 0.9143,
1524
+ "step": 213
1525
+ },
1526
+ {
1527
+ "epoch": 0.16784313725490196,
1528
+ "grad_norm": 4.127065181732178,
1529
+ "learning_rate": 3.2464781994338315e-05,
1530
+ "loss": 1.4021,
1531
+ "step": 214
1532
+ },
1533
+ {
1534
+ "epoch": 0.16862745098039217,
1535
+ "grad_norm": 3.2039248943328857,
1536
+ "learning_rate": 3.2183670094914595e-05,
1537
+ "loss": 1.1642,
1538
+ "step": 215
1539
+ },
1540
+ {
1541
+ "epoch": 0.16941176470588235,
1542
+ "grad_norm": 3.7868335247039795,
1543
+ "learning_rate": 3.190274094310245e-05,
1544
+ "loss": 1.3842,
1545
+ "step": 216
1546
+ },
1547
+ {
1548
+ "epoch": 0.17019607843137255,
1549
+ "grad_norm": 3.718687057495117,
1550
+ "learning_rate": 3.162201276799197e-05,
1551
+ "loss": 1.4267,
1552
+ "step": 217
1553
+ },
1554
+ {
1555
+ "epoch": 0.17098039215686275,
1556
+ "grad_norm": 3.7910690307617188,
1557
+ "learning_rate": 3.134150378563213e-05,
1558
+ "loss": 1.2455,
1559
+ "step": 218
1560
+ },
1561
+ {
1562
+ "epoch": 0.17176470588235293,
1563
+ "grad_norm": 3.5929932594299316,
1564
+ "learning_rate": 3.1061232197848805e-05,
1565
+ "loss": 1.2007,
1566
+ "step": 219
1567
+ },
1568
+ {
1569
+ "epoch": 0.17254901960784313,
1570
+ "grad_norm": 4.848661422729492,
1571
+ "learning_rate": 3.07812161910637e-05,
1572
+ "loss": 1.5243,
1573
+ "step": 220
1574
+ },
1575
+ {
1576
+ "epoch": 0.17333333333333334,
1577
+ "grad_norm": 4.240635395050049,
1578
+ "learning_rate": 3.050147393511419e-05,
1579
+ "loss": 1.2182,
1580
+ "step": 221
1581
+ },
1582
+ {
1583
+ "epoch": 0.17411764705882352,
1584
+ "grad_norm": 4.414323806762695,
1585
+ "learning_rate": 3.0222023582074396e-05,
1586
+ "loss": 1.3929,
1587
+ "step": 222
1588
+ },
1589
+ {
1590
+ "epoch": 0.17490196078431372,
1591
+ "grad_norm": 4.253678321838379,
1592
+ "learning_rate": 2.994288326507726e-05,
1593
+ "loss": 1.0231,
1594
+ "step": 223
1595
+ },
1596
+ {
1597
+ "epoch": 0.17568627450980392,
1598
+ "grad_norm": 4.7787089347839355,
1599
+ "learning_rate": 2.9664071097137908e-05,
1600
+ "loss": 1.3696,
1601
+ "step": 224
1602
+ },
1603
+ {
1604
+ "epoch": 0.17647058823529413,
1605
+ "grad_norm": 6.1173248291015625,
1606
+ "learning_rate": 2.9385605169978387e-05,
1607
+ "loss": 1.3486,
1608
+ "step": 225
1609
+ },
1610
+ {
1611
+ "epoch": 0.1772549019607843,
1612
+ "grad_norm": 5.057071685791016,
1613
+ "learning_rate": 2.9107503552853648e-05,
1614
+ "loss": 1.1463,
1615
+ "step": 226
1616
+ },
1617
+ {
1618
+ "epoch": 0.1780392156862745,
1619
+ "grad_norm": 4.796481132507324,
1620
+ "learning_rate": 2.8829784291379082e-05,
1621
+ "loss": 1.0947,
1622
+ "step": 227
1623
+ },
1624
+ {
1625
+ "epoch": 0.17882352941176471,
1626
+ "grad_norm": 4.333375930786133,
1627
+ "learning_rate": 2.8552465406359593e-05,
1628
+ "loss": 1.0557,
1629
+ "step": 228
1630
+ },
1631
+ {
1632
+ "epoch": 0.1796078431372549,
1633
+ "grad_norm": 4.937939167022705,
1634
+ "learning_rate": 2.8275564892620202e-05,
1635
+ "loss": 0.8979,
1636
+ "step": 229
1637
+ },
1638
+ {
1639
+ "epoch": 0.1803921568627451,
1640
+ "grad_norm": 5.160710334777832,
1641
+ "learning_rate": 2.799910071783845e-05,
1642
+ "loss": 0.9403,
1643
+ "step": 230
1644
+ },
1645
+ {
1646
+ "epoch": 0.1811764705882353,
1647
+ "grad_norm": 7.218094348907471,
1648
+ "learning_rate": 2.7723090821378425e-05,
1649
+ "loss": 0.899,
1650
+ "step": 231
1651
+ },
1652
+ {
1653
+ "epoch": 0.18196078431372548,
1654
+ "grad_norm": 4.6421332359313965,
1655
+ "learning_rate": 2.7447553113126786e-05,
1656
+ "loss": 0.934,
1657
+ "step": 232
1658
+ },
1659
+ {
1660
+ "epoch": 0.18274509803921568,
1661
+ "grad_norm": 7.592020034790039,
1662
+ "learning_rate": 2.7172505472330564e-05,
1663
+ "loss": 0.9639,
1664
+ "step": 233
1665
+ },
1666
+ {
1667
+ "epoch": 0.18352941176470589,
1668
+ "grad_norm": 6.882318496704102,
1669
+ "learning_rate": 2.6897965746436997e-05,
1670
+ "loss": 1.3508,
1671
+ "step": 234
1672
+ },
1673
+ {
1674
+ "epoch": 0.1843137254901961,
1675
+ "grad_norm": 6.335398197174072,
1676
+ "learning_rate": 2.6623951749935487e-05,
1677
+ "loss": 1.2642,
1678
+ "step": 235
1679
+ },
1680
+ {
1681
+ "epoch": 0.18509803921568627,
1682
+ "grad_norm": 5.449366569519043,
1683
+ "learning_rate": 2.635048126320156e-05,
1684
+ "loss": 1.4236,
1685
+ "step": 236
1686
+ },
1687
+ {
1688
+ "epoch": 0.18588235294117647,
1689
+ "grad_norm": 8.11070728302002,
1690
+ "learning_rate": 2.6077572031343227e-05,
1691
+ "loss": 1.3364,
1692
+ "step": 237
1693
+ },
1694
+ {
1695
+ "epoch": 0.18666666666666668,
1696
+ "grad_norm": 6.608270168304443,
1697
+ "learning_rate": 2.5805241763049455e-05,
1698
+ "loss": 1.4484,
1699
+ "step": 238
1700
+ },
1701
+ {
1702
+ "epoch": 0.18745098039215685,
1703
+ "grad_norm": 7.942669868469238,
1704
+ "learning_rate": 2.553350812944107e-05,
1705
+ "loss": 1.4957,
1706
+ "step": 239
1707
+ },
1708
+ {
1709
+ "epoch": 0.18823529411764706,
1710
+ "grad_norm": 7.074437618255615,
1711
+ "learning_rate": 2.5262388762924156e-05,
1712
+ "loss": 1.3078,
1713
+ "step": 240
1714
+ },
1715
+ {
1716
+ "epoch": 0.18901960784313726,
1717
+ "grad_norm": 7.799503326416016,
1718
+ "learning_rate": 2.499190125604588e-05,
1719
+ "loss": 1.5845,
1720
+ "step": 241
1721
+ },
1722
+ {
1723
+ "epoch": 0.18980392156862744,
1724
+ "grad_norm": 9.548646926879883,
1725
+ "learning_rate": 2.4722063160352926e-05,
1726
+ "loss": 2.075,
1727
+ "step": 242
1728
+ },
1729
+ {
1730
+ "epoch": 0.19058823529411764,
1731
+ "grad_norm": 9.75729751586914,
1732
+ "learning_rate": 2.4452891985252647e-05,
1733
+ "loss": 2.0484,
1734
+ "step": 243
1735
+ },
1736
+ {
1737
+ "epoch": 0.19137254901960785,
1738
+ "grad_norm": 8.452042579650879,
1739
+ "learning_rate": 2.4184405196876842e-05,
1740
+ "loss": 1.6378,
1741
+ "step": 244
1742
+ },
1743
+ {
1744
+ "epoch": 0.19215686274509805,
1745
+ "grad_norm": 7.993230819702148,
1746
+ "learning_rate": 2.3916620216948467e-05,
1747
+ "loss": 1.8914,
1748
+ "step": 245
1749
+ },
1750
+ {
1751
+ "epoch": 0.19294117647058823,
1752
+ "grad_norm": 20.330001831054688,
1753
+ "learning_rate": 2.3649554421651106e-05,
1754
+ "loss": 2.7448,
1755
+ "step": 246
1756
+ },
1757
+ {
1758
+ "epoch": 0.19372549019607843,
1759
+ "grad_norm": 11.52759838104248,
1760
+ "learning_rate": 2.338322514050151e-05,
1761
+ "loss": 1.4114,
1762
+ "step": 247
1763
+ },
1764
+ {
1765
+ "epoch": 0.19450980392156864,
1766
+ "grad_norm": 12.445416450500488,
1767
+ "learning_rate": 2.3117649655225048e-05,
1768
+ "loss": 2.3627,
1769
+ "step": 248
1770
+ },
1771
+ {
1772
+ "epoch": 0.1952941176470588,
1773
+ "grad_norm": 16.47171401977539,
1774
+ "learning_rate": 2.2852845198634395e-05,
1775
+ "loss": 3.235,
1776
+ "step": 249
1777
+ },
1778
+ {
1779
+ "epoch": 0.19607843137254902,
1780
+ "grad_norm": 14.980457305908203,
1781
+ "learning_rate": 2.258882895351125e-05,
1782
+ "loss": 2.4173,
1783
+ "step": 250
1784
+ },
1785
+ {
1786
+ "epoch": 0.19686274509803922,
1787
+ "grad_norm": 2.359992742538452,
1788
+ "learning_rate": 2.2325618051491415e-05,
1789
+ "loss": 3.2603,
1790
+ "step": 251
1791
+ },
1792
+ {
1793
+ "epoch": 0.1976470588235294,
1794
+ "grad_norm": 1.96599280834198,
1795
+ "learning_rate": 2.2063229571953114e-05,
1796
+ "loss": 1.6396,
1797
+ "step": 252
1798
+ },
1799
+ {
1800
+ "epoch": 0.1984313725490196,
1801
+ "grad_norm": 2.0115535259246826,
1802
+ "learning_rate": 2.1801680540908783e-05,
1803
+ "loss": 1.2482,
1804
+ "step": 253
1805
+ },
1806
+ {
1807
+ "epoch": 0.1992156862745098,
1808
+ "grad_norm": 2.1881866455078125,
1809
+ "learning_rate": 2.154098792990022e-05,
1810
+ "loss": 1.1678,
1811
+ "step": 254
1812
+ },
1813
+ {
1814
+ "epoch": 0.2,
1815
+ "grad_norm": 2.3712048530578613,
1816
+ "learning_rate": 2.1281168654897375e-05,
1817
+ "loss": 1.134,
1818
+ "step": 255
1819
+ },
1820
+ {
1821
+ "epoch": 0.2007843137254902,
1822
+ "grad_norm": 2.554553985595703,
1823
+ "learning_rate": 2.102223957520066e-05,
1824
+ "loss": 1.2021,
1825
+ "step": 256
1826
+ },
1827
+ {
1828
+ "epoch": 0.2015686274509804,
1829
+ "grad_norm": 2.6500658988952637,
1830
+ "learning_rate": 2.0764217492346998e-05,
1831
+ "loss": 1.0142,
1832
+ "step": 257
1833
+ },
1834
+ {
1835
+ "epoch": 0.2023529411764706,
1836
+ "grad_norm": 3.106962203979492,
1837
+ "learning_rate": 2.0507119149019585e-05,
1838
+ "loss": 1.3034,
1839
+ "step": 258
1840
+ },
1841
+ {
1842
+ "epoch": 0.20313725490196077,
1843
+ "grad_norm": 2.9345974922180176,
1844
+ "learning_rate": 2.0250961227961486e-05,
1845
+ "loss": 0.9994,
1846
+ "step": 259
1847
+ },
1848
+ {
1849
+ "epoch": 0.20392156862745098,
1850
+ "grad_norm": 3.326606035232544,
1851
+ "learning_rate": 1.9995760350893095e-05,
1852
+ "loss": 1.2394,
1853
+ "step": 260
1854
+ },
1855
+ {
1856
+ "epoch": 0.20470588235294118,
1857
+ "grad_norm": 2.8550965785980225,
1858
+ "learning_rate": 1.9741533077433642e-05,
1859
+ "loss": 1.1,
1860
+ "step": 261
1861
+ },
1862
+ {
1863
+ "epoch": 0.2054901960784314,
1864
+ "grad_norm": 3.312087297439575,
1865
+ "learning_rate": 1.9488295904026556e-05,
1866
+ "loss": 1.1038,
1867
+ "step": 262
1868
+ },
1869
+ {
1870
+ "epoch": 0.20627450980392156,
1871
+ "grad_norm": 3.6784210205078125,
1872
+ "learning_rate": 1.9236065262869107e-05,
1873
+ "loss": 1.1669,
1874
+ "step": 263
1875
+ },
1876
+ {
1877
+ "epoch": 0.20705882352941177,
1878
+ "grad_norm": 4.164744853973389,
1879
+ "learning_rate": 1.8984857520846164e-05,
1880
+ "loss": 1.5447,
1881
+ "step": 264
1882
+ },
1883
+ {
1884
+ "epoch": 0.20784313725490197,
1885
+ "grad_norm": 3.8596198558807373,
1886
+ "learning_rate": 1.87346889784681e-05,
1887
+ "loss": 1.3421,
1888
+ "step": 265
1889
+ },
1890
+ {
1891
+ "epoch": 0.20862745098039215,
1892
+ "grad_norm": 4.981409549713135,
1893
+ "learning_rate": 1.848557586881313e-05,
1894
+ "loss": 1.5625,
1895
+ "step": 266
1896
+ },
1897
+ {
1898
+ "epoch": 0.20941176470588235,
1899
+ "grad_norm": 5.043278694152832,
1900
+ "learning_rate": 1.823753435647393e-05,
1901
+ "loss": 1.1419,
1902
+ "step": 267
1903
+ },
1904
+ {
1905
+ "epoch": 0.21019607843137256,
1906
+ "grad_norm": 5.383882999420166,
1907
+ "learning_rate": 1.7990580536508787e-05,
1908
+ "loss": 1.6431,
1909
+ "step": 268
1910
+ },
1911
+ {
1912
+ "epoch": 0.21098039215686273,
1913
+ "grad_norm": 3.967205286026001,
1914
+ "learning_rate": 1.774473043339719e-05,
1915
+ "loss": 1.053,
1916
+ "step": 269
1917
+ },
1918
+ {
1919
+ "epoch": 0.21176470588235294,
1920
+ "grad_norm": 4.5731072425842285,
1921
+ "learning_rate": 1.7500000000000005e-05,
1922
+ "loss": 1.435,
1923
+ "step": 270
1924
+ },
1925
+ {
1926
+ "epoch": 0.21254901960784314,
1927
+ "grad_norm": 4.236204147338867,
1928
+ "learning_rate": 1.7256405116524344e-05,
1929
+ "loss": 0.9911,
1930
+ "step": 271
1931
+ },
1932
+ {
1933
+ "epoch": 0.21333333333333335,
1934
+ "grad_norm": 4.109267711639404,
1935
+ "learning_rate": 1.7013961589493098e-05,
1936
+ "loss": 0.8052,
1937
+ "step": 272
1938
+ },
1939
+ {
1940
+ "epoch": 0.21411764705882352,
1941
+ "grad_norm": 4.672120571136475,
1942
+ "learning_rate": 1.6772685150719313e-05,
1943
+ "loss": 1.013,
1944
+ "step": 273
1945
+ },
1946
+ {
1947
+ "epoch": 0.21490196078431373,
1948
+ "grad_norm": 6.016241550445557,
1949
+ "learning_rate": 1.653259145628535e-05,
1950
+ "loss": 1.5547,
1951
+ "step": 274
1952
+ },
1953
+ {
1954
+ "epoch": 0.21568627450980393,
1955
+ "grad_norm": 4.318469524383545,
1956
+ "learning_rate": 1.6293696085526958e-05,
1957
+ "loss": 1.0291,
1958
+ "step": 275
1959
+ },
1960
+ {
1961
+ "epoch": 0.2164705882352941,
1962
+ "grad_norm": 5.033075332641602,
1963
+ "learning_rate": 1.6056014540022405e-05,
1964
+ "loss": 0.8338,
1965
+ "step": 276
1966
+ },
1967
+ {
1968
+ "epoch": 0.2172549019607843,
1969
+ "grad_norm": 5.945756912231445,
1970
+ "learning_rate": 1.5819562242586556e-05,
1971
+ "loss": 1.2347,
1972
+ "step": 277
1973
+ },
1974
+ {
1975
+ "epoch": 0.21803921568627452,
1976
+ "grad_norm": 5.846539497375488,
1977
+ "learning_rate": 1.5584354536270135e-05,
1978
+ "loss": 0.9788,
1979
+ "step": 278
1980
+ },
1981
+ {
1982
+ "epoch": 0.2188235294117647,
1983
+ "grad_norm": 4.702358245849609,
1984
+ "learning_rate": 1.535040668336417e-05,
1985
+ "loss": 0.9638,
1986
+ "step": 279
1987
+ },
1988
+ {
1989
+ "epoch": 0.2196078431372549,
1990
+ "grad_norm": 5.319607734680176,
1991
+ "learning_rate": 1.511773386440955e-05,
1992
+ "loss": 1.1117,
1993
+ "step": 280
1994
+ },
1995
+ {
1996
+ "epoch": 0.2203921568627451,
1997
+ "grad_norm": 4.019665241241455,
1998
+ "learning_rate": 1.4886351177212074e-05,
1999
+ "loss": 0.6134,
2000
+ "step": 281
2001
+ },
2002
+ {
2003
+ "epoch": 0.2211764705882353,
2004
+ "grad_norm": 5.152523994445801,
2005
+ "learning_rate": 1.4656273635862728e-05,
2006
+ "loss": 1.053,
2007
+ "step": 282
2008
+ },
2009
+ {
2010
+ "epoch": 0.22196078431372548,
2011
+ "grad_norm": 6.206692218780518,
2012
+ "learning_rate": 1.4427516169763444e-05,
2013
+ "loss": 1.6169,
2014
+ "step": 283
2015
+ },
2016
+ {
2017
+ "epoch": 0.2227450980392157,
2018
+ "grad_norm": 5.860468864440918,
2019
+ "learning_rate": 1.4200093622658394e-05,
2020
+ "loss": 1.1937,
2021
+ "step": 284
2022
+ },
2023
+ {
2024
+ "epoch": 0.2235294117647059,
2025
+ "grad_norm": 5.6560773849487305,
2026
+ "learning_rate": 1.3974020751670732e-05,
2027
+ "loss": 1.4664,
2028
+ "step": 285
2029
+ },
2030
+ {
2031
+ "epoch": 0.22431372549019607,
2032
+ "grad_norm": 7.365577220916748,
2033
+ "learning_rate": 1.3749312226345108e-05,
2034
+ "loss": 1.4524,
2035
+ "step": 286
2036
+ },
2037
+ {
2038
+ "epoch": 0.22509803921568627,
2039
+ "grad_norm": 6.791868686676025,
2040
+ "learning_rate": 1.3525982627695691e-05,
2041
+ "loss": 0.9192,
2042
+ "step": 287
2043
+ },
2044
+ {
2045
+ "epoch": 0.22588235294117648,
2046
+ "grad_norm": 7.1987080574035645,
2047
+ "learning_rate": 1.3304046447260085e-05,
2048
+ "loss": 1.5558,
2049
+ "step": 288
2050
+ },
2051
+ {
2052
+ "epoch": 0.22666666666666666,
2053
+ "grad_norm": 8.44493293762207,
2054
+ "learning_rate": 1.308351808615902e-05,
2055
+ "loss": 1.6203,
2056
+ "step": 289
2057
+ },
2058
+ {
2059
+ "epoch": 0.22745098039215686,
2060
+ "grad_norm": 7.552881240844727,
2061
+ "learning_rate": 1.2864411854161799e-05,
2062
+ "loss": 1.6813,
2063
+ "step": 290
2064
+ },
2065
+ {
2066
+ "epoch": 0.22823529411764706,
2067
+ "grad_norm": 9.73569393157959,
2068
+ "learning_rate": 1.2646741968757827e-05,
2069
+ "loss": 1.6688,
2070
+ "step": 291
2071
+ },
2072
+ {
2073
+ "epoch": 0.22901960784313727,
2074
+ "grad_norm": 8.195075035095215,
2075
+ "learning_rate": 1.2430522554234036e-05,
2076
+ "loss": 1.8058,
2077
+ "step": 292
2078
+ },
2079
+ {
2080
+ "epoch": 0.22980392156862745,
2081
+ "grad_norm": 10.075333595275879,
2082
+ "learning_rate": 1.2215767640758367e-05,
2083
+ "loss": 1.983,
2084
+ "step": 293
2085
+ },
2086
+ {
2087
+ "epoch": 0.23058823529411765,
2088
+ "grad_norm": 9.949265480041504,
2089
+ "learning_rate": 1.2002491163469428e-05,
2090
+ "loss": 1.9262,
2091
+ "step": 294
2092
+ },
2093
+ {
2094
+ "epoch": 0.23137254901960785,
2095
+ "grad_norm": 9.431818962097168,
2096
+ "learning_rate": 1.1790706961572175e-05,
2097
+ "loss": 2.0562,
2098
+ "step": 295
2099
+ },
2100
+ {
2101
+ "epoch": 0.23215686274509803,
2102
+ "grad_norm": 21.495634078979492,
2103
+ "learning_rate": 1.1580428777439972e-05,
2104
+ "loss": 1.9353,
2105
+ "step": 296
2106
+ },
2107
+ {
2108
+ "epoch": 0.23294117647058823,
2109
+ "grad_norm": 9.102306365966797,
2110
+ "learning_rate": 1.1371670255722838e-05,
2111
+ "loss": 1.6257,
2112
+ "step": 297
2113
+ },
2114
+ {
2115
+ "epoch": 0.23372549019607844,
2116
+ "grad_norm": 14.28601360321045,
2117
+ "learning_rate": 1.1164444942462095e-05,
2118
+ "loss": 2.8148,
2119
+ "step": 298
2120
+ },
2121
+ {
2122
+ "epoch": 0.23450980392156862,
2123
+ "grad_norm": 18.514936447143555,
2124
+ "learning_rate": 1.0958766284211363e-05,
2125
+ "loss": 2.4219,
2126
+ "step": 299
2127
+ },
2128
+ {
2129
+ "epoch": 0.23529411764705882,
2130
+ "grad_norm": 17.440731048583984,
2131
+ "learning_rate": 1.0754647627164022e-05,
2132
+ "loss": 2.8099,
2133
+ "step": 300
2134
+ },
2135
+ {
2136
+ "epoch": 0.23529411764705882,
2137
+ "eval_loss": 0.35413047671318054,
2138
+ "eval_runtime": 168.8442,
2139
+ "eval_samples_per_second": 12.722,
2140
+ "eval_steps_per_second": 3.18,
2141
+ "step": 300
2142
  }
2143
  ],
2144
  "logging_steps": 1,
 
2167
  "attributes": {}
2168
  }
2169
  },
2170
+ "total_flos": 4.536268794133217e+17,
2171
  "train_batch_size": 8,
2172
  "trial_name": null,
2173
  "trial_params": null