beast33 commited on
Commit
c231399
·
verified ·
1 Parent(s): ca6eb4d

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:276a083327871c77b3c09ed9c0fb6f525928ca54af89e4a824708269ba98857c
3
  size 639691872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db1af205eff0d1c3981f58af318310bd969f0ccee8e313b79ebead6997c843b5
3
  size 639691872
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af81708a2d376045082da0d8c126f6a3ac9f12d62ba77bbdd2a849780e029ba2
3
- size 325339796
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cf0977855e53e7a960c7caf7aa4b8b6575b7d1cea78d007041fe7b74b13d404
3
+ size 325340244
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b352b4e246ff1febc6f766d4a4b35e365514cd519223a73fd27bbca4225f653a
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a30f2d89b79bfb2d9929f2dc3ffeef086eb1500788503382d980f99cbe057e80
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14ac5bb8cc10dadf945bc491fce3dbccaa909e9887500674e5b6e829eb38874a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:825ef3e3db682363455a0d008a860e7cc4412a53aa533791e6b37fda9dca6312
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 1.5925147533416748,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-200",
4
- "epoch": 0.554016620498615,
5
  "eval_steps": 100,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1431,6 +1431,714 @@
1431
  "eval_samples_per_second": 6.278,
1432
  "eval_steps_per_second": 1.569,
1433
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1434
  }
1435
  ],
1436
  "logging_steps": 1,
@@ -1459,7 +2167,7 @@
1459
  "attributes": {}
1460
  }
1461
  },
1462
- "total_flos": 2.854316616450048e+17,
1463
  "train_batch_size": 8,
1464
  "trial_name": null,
1465
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.556799054145813,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-300",
4
+ "epoch": 0.8310249307479224,
5
  "eval_steps": 100,
6
+ "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1431
  "eval_samples_per_second": 6.278,
1432
  "eval_steps_per_second": 1.569,
1433
  "step": 200
1434
+ },
1435
+ {
1436
+ "epoch": 0.556786703601108,
1437
+ "grad_norm": 0.4485991597175598,
1438
+ "learning_rate": 4.517077709232411e-05,
1439
+ "loss": 1.7589,
1440
+ "step": 201
1441
+ },
1442
+ {
1443
+ "epoch": 0.5595567867036011,
1444
+ "grad_norm": 0.4756779372692108,
1445
+ "learning_rate": 4.471249804477758e-05,
1446
+ "loss": 1.7589,
1447
+ "step": 202
1448
+ },
1449
+ {
1450
+ "epoch": 0.5623268698060941,
1451
+ "grad_norm": 0.445344477891922,
1452
+ "learning_rate": 4.4254667782358924e-05,
1453
+ "loss": 1.6403,
1454
+ "step": 203
1455
+ },
1456
+ {
1457
+ "epoch": 0.5650969529085873,
1458
+ "grad_norm": 0.448127806186676,
1459
+ "learning_rate": 4.379732516413897e-05,
1460
+ "loss": 1.6286,
1461
+ "step": 204
1462
+ },
1463
+ {
1464
+ "epoch": 0.5678670360110804,
1465
+ "grad_norm": 0.4261086881160736,
1466
+ "learning_rate": 4.334050900779893e-05,
1467
+ "loss": 1.6222,
1468
+ "step": 205
1469
+ },
1470
+ {
1471
+ "epoch": 0.5706371191135734,
1472
+ "grad_norm": 0.422758013010025,
1473
+ "learning_rate": 4.288425808633575e-05,
1474
+ "loss": 1.5906,
1475
+ "step": 206
1476
+ },
1477
+ {
1478
+ "epoch": 0.5734072022160664,
1479
+ "grad_norm": 0.39392560720443726,
1480
+ "learning_rate": 4.2428611124771184e-05,
1481
+ "loss": 1.6972,
1482
+ "step": 207
1483
+ },
1484
+ {
1485
+ "epoch": 0.5761772853185596,
1486
+ "grad_norm": 0.40653908252716064,
1487
+ "learning_rate": 4.1973606796864884e-05,
1488
+ "loss": 1.5745,
1489
+ "step": 208
1490
+ },
1491
+ {
1492
+ "epoch": 0.5789473684210527,
1493
+ "grad_norm": 0.4077155888080597,
1494
+ "learning_rate": 4.151928372183198e-05,
1495
+ "loss": 1.5567,
1496
+ "step": 209
1497
+ },
1498
+ {
1499
+ "epoch": 0.5817174515235457,
1500
+ "grad_norm": 0.4133901596069336,
1501
+ "learning_rate": 4.1065680461065194e-05,
1502
+ "loss": 1.5616,
1503
+ "step": 210
1504
+ },
1505
+ {
1506
+ "epoch": 0.5844875346260388,
1507
+ "grad_norm": 0.4191035330295563,
1508
+ "learning_rate": 4.061283551486185e-05,
1509
+ "loss": 1.6254,
1510
+ "step": 211
1511
+ },
1512
+ {
1513
+ "epoch": 0.5872576177285319,
1514
+ "grad_norm": 0.4195918142795563,
1515
+ "learning_rate": 4.016078731915608e-05,
1516
+ "loss": 1.5852,
1517
+ "step": 212
1518
+ },
1519
+ {
1520
+ "epoch": 0.590027700831025,
1521
+ "grad_norm": 0.4277445375919342,
1522
+ "learning_rate": 3.970957424225666e-05,
1523
+ "loss": 1.5839,
1524
+ "step": 213
1525
+ },
1526
+ {
1527
+ "epoch": 0.592797783933518,
1528
+ "grad_norm": 0.4163782596588135,
1529
+ "learning_rate": 3.925923458159023e-05,
1530
+ "loss": 1.6141,
1531
+ "step": 214
1532
+ },
1533
+ {
1534
+ "epoch": 0.5955678670360111,
1535
+ "grad_norm": 0.42547765374183655,
1536
+ "learning_rate": 3.880980656045087e-05,
1537
+ "loss": 1.5977,
1538
+ "step": 215
1539
+ },
1540
+ {
1541
+ "epoch": 0.5983379501385041,
1542
+ "grad_norm": 0.4501667320728302,
1543
+ "learning_rate": 3.8361328324755825e-05,
1544
+ "loss": 1.5311,
1545
+ "step": 216
1546
+ },
1547
+ {
1548
+ "epoch": 0.6011080332409973,
1549
+ "grad_norm": 0.43693989515304565,
1550
+ "learning_rate": 3.791383793980776e-05,
1551
+ "loss": 1.5779,
1552
+ "step": 217
1553
+ },
1554
+ {
1555
+ "epoch": 0.6038781163434903,
1556
+ "grad_norm": 0.4550263285636902,
1557
+ "learning_rate": 3.746737338706397e-05,
1558
+ "loss": 1.5961,
1559
+ "step": 218
1560
+ },
1561
+ {
1562
+ "epoch": 0.6066481994459834,
1563
+ "grad_norm": 0.4572041928768158,
1564
+ "learning_rate": 3.70219725609126e-05,
1565
+ "loss": 1.592,
1566
+ "step": 219
1567
+ },
1568
+ {
1569
+ "epoch": 0.6094182825484764,
1570
+ "grad_norm": 0.4633628726005554,
1571
+ "learning_rate": 3.65776732654563e-05,
1572
+ "loss": 1.5822,
1573
+ "step": 220
1574
+ },
1575
+ {
1576
+ "epoch": 0.6121883656509696,
1577
+ "grad_norm": 0.4722716808319092,
1578
+ "learning_rate": 3.6134513211303556e-05,
1579
+ "loss": 1.5399,
1580
+ "step": 221
1581
+ },
1582
+ {
1583
+ "epoch": 0.6149584487534626,
1584
+ "grad_norm": 0.477970689535141,
1585
+ "learning_rate": 3.5692530012367955e-05,
1586
+ "loss": 1.5923,
1587
+ "step": 222
1588
+ },
1589
+ {
1590
+ "epoch": 0.6177285318559557,
1591
+ "grad_norm": 0.4831191599369049,
1592
+ "learning_rate": 3.5251761182675625e-05,
1593
+ "loss": 1.559,
1594
+ "step": 223
1595
+ },
1596
+ {
1597
+ "epoch": 0.6204986149584487,
1598
+ "grad_norm": 0.46263793110847473,
1599
+ "learning_rate": 3.481224413318114e-05,
1600
+ "loss": 1.5255,
1601
+ "step": 224
1602
+ },
1603
+ {
1604
+ "epoch": 0.6232686980609419,
1605
+ "grad_norm": 0.4732118844985962,
1606
+ "learning_rate": 3.4374016168592296e-05,
1607
+ "loss": 1.5777,
1608
+ "step": 225
1609
+ },
1610
+ {
1611
+ "epoch": 0.6260387811634349,
1612
+ "grad_norm": 0.47649428248405457,
1613
+ "learning_rate": 3.393711448420372e-05,
1614
+ "loss": 1.4918,
1615
+ "step": 226
1616
+ },
1617
+ {
1618
+ "epoch": 0.628808864265928,
1619
+ "grad_norm": 0.4683854579925537,
1620
+ "learning_rate": 3.3501576162739904e-05,
1621
+ "loss": 1.4874,
1622
+ "step": 227
1623
+ },
1624
+ {
1625
+ "epoch": 0.631578947368421,
1626
+ "grad_norm": 0.4791397750377655,
1627
+ "learning_rate": 3.3067438171207766e-05,
1628
+ "loss": 1.55,
1629
+ "step": 228
1630
+ },
1631
+ {
1632
+ "epoch": 0.6343490304709142,
1633
+ "grad_norm": 0.474380224943161,
1634
+ "learning_rate": 3.263473735775899e-05,
1635
+ "loss": 1.5044,
1636
+ "step": 229
1637
+ },
1638
+ {
1639
+ "epoch": 0.6371191135734072,
1640
+ "grad_norm": 0.4925787150859833,
1641
+ "learning_rate": 3.220351044856247e-05,
1642
+ "loss": 1.5274,
1643
+ "step": 230
1644
+ },
1645
+ {
1646
+ "epoch": 0.6398891966759003,
1647
+ "grad_norm": 0.4613766670227051,
1648
+ "learning_rate": 3.177379404468715e-05,
1649
+ "loss": 1.4617,
1650
+ "step": 231
1651
+ },
1652
+ {
1653
+ "epoch": 0.6426592797783933,
1654
+ "grad_norm": 0.4807930588722229,
1655
+ "learning_rate": 3.134562461899545e-05,
1656
+ "loss": 1.4273,
1657
+ "step": 232
1658
+ },
1659
+ {
1660
+ "epoch": 0.6454293628808865,
1661
+ "grad_norm": 0.49378934502601624,
1662
+ "learning_rate": 3.091903851304751e-05,
1663
+ "loss": 1.5487,
1664
+ "step": 233
1665
+ },
1666
+ {
1667
+ "epoch": 0.6481994459833795,
1668
+ "grad_norm": 0.5002757906913757,
1669
+ "learning_rate": 3.0494071934016737e-05,
1670
+ "loss": 1.4499,
1671
+ "step": 234
1672
+ },
1673
+ {
1674
+ "epoch": 0.6509695290858726,
1675
+ "grad_norm": 0.49733448028564453,
1676
+ "learning_rate": 3.0070760951616618e-05,
1677
+ "loss": 1.4476,
1678
+ "step": 235
1679
+ },
1680
+ {
1681
+ "epoch": 0.6537396121883656,
1682
+ "grad_norm": 0.5034710764884949,
1683
+ "learning_rate": 2.9649141495039223e-05,
1684
+ "loss": 1.5502,
1685
+ "step": 236
1686
+ },
1687
+ {
1688
+ "epoch": 0.6565096952908587,
1689
+ "grad_norm": 0.49712780117988586,
1690
+ "learning_rate": 2.9229249349905684e-05,
1691
+ "loss": 1.4659,
1692
+ "step": 237
1693
+ },
1694
+ {
1695
+ "epoch": 0.6592797783933518,
1696
+ "grad_norm": 0.5008156895637512,
1697
+ "learning_rate": 2.8811120155228844e-05,
1698
+ "loss": 1.4596,
1699
+ "step": 238
1700
+ },
1701
+ {
1702
+ "epoch": 0.6620498614958449,
1703
+ "grad_norm": 0.5352665185928345,
1704
+ "learning_rate": 2.8394789400388328e-05,
1705
+ "loss": 1.5186,
1706
+ "step": 239
1707
+ },
1708
+ {
1709
+ "epoch": 0.6648199445983379,
1710
+ "grad_norm": 0.5389817357063293,
1711
+ "learning_rate": 2.798029242211828e-05,
1712
+ "loss": 1.5449,
1713
+ "step": 240
1714
+ },
1715
+ {
1716
+ "epoch": 0.667590027700831,
1717
+ "grad_norm": 0.5315467715263367,
1718
+ "learning_rate": 2.7567664401508225e-05,
1719
+ "loss": 1.4991,
1720
+ "step": 241
1721
+ },
1722
+ {
1723
+ "epoch": 0.6703601108033241,
1724
+ "grad_norm": 0.5424023270606995,
1725
+ "learning_rate": 2.7156940361016864e-05,
1726
+ "loss": 1.528,
1727
+ "step": 242
1728
+ },
1729
+ {
1730
+ "epoch": 0.6731301939058172,
1731
+ "grad_norm": 0.5434011816978455,
1732
+ "learning_rate": 2.6748155161499567e-05,
1733
+ "loss": 1.4446,
1734
+ "step": 243
1735
+ },
1736
+ {
1737
+ "epoch": 0.6759002770083102,
1738
+ "grad_norm": 0.5452317595481873,
1739
+ "learning_rate": 2.634134349924956e-05,
1740
+ "loss": 1.36,
1741
+ "step": 244
1742
+ },
1743
+ {
1744
+ "epoch": 0.6786703601108033,
1745
+ "grad_norm": 0.5486112833023071,
1746
+ "learning_rate": 2.5936539903052892e-05,
1747
+ "loss": 1.4738,
1748
+ "step": 245
1749
+ },
1750
+ {
1751
+ "epoch": 0.6814404432132964,
1752
+ "grad_norm": 0.582291841506958,
1753
+ "learning_rate": 2.5533778731257824e-05,
1754
+ "loss": 1.4956,
1755
+ "step": 246
1756
+ },
1757
+ {
1758
+ "epoch": 0.6842105263157895,
1759
+ "grad_norm": 0.5929666757583618,
1760
+ "learning_rate": 2.513309416885865e-05,
1761
+ "loss": 1.5308,
1762
+ "step": 247
1763
+ },
1764
+ {
1765
+ "epoch": 0.6869806094182825,
1766
+ "grad_norm": 0.5847265124320984,
1767
+ "learning_rate": 2.4734520224594093e-05,
1768
+ "loss": 1.5132,
1769
+ "step": 248
1770
+ },
1771
+ {
1772
+ "epoch": 0.6897506925207756,
1773
+ "grad_norm": 0.7136350274085999,
1774
+ "learning_rate": 2.433809072806081e-05,
1775
+ "loss": 1.8617,
1776
+ "step": 249
1777
+ },
1778
+ {
1779
+ "epoch": 0.6925207756232687,
1780
+ "grad_norm": 1.0075485706329346,
1781
+ "learning_rate": 2.3943839326842092e-05,
1782
+ "loss": 2.1937,
1783
+ "step": 250
1784
+ },
1785
+ {
1786
+ "epoch": 0.6952908587257618,
1787
+ "grad_norm": 0.42398756742477417,
1788
+ "learning_rate": 2.3551799483651894e-05,
1789
+ "loss": 1.7047,
1790
+ "step": 251
1791
+ },
1792
+ {
1793
+ "epoch": 0.6980609418282548,
1794
+ "grad_norm": 0.4284769296646118,
1795
+ "learning_rate": 2.3162004473494657e-05,
1796
+ "loss": 1.6634,
1797
+ "step": 252
1798
+ },
1799
+ {
1800
+ "epoch": 0.7008310249307479,
1801
+ "grad_norm": 0.4206549823284149,
1802
+ "learning_rate": 2.2774487380841115e-05,
1803
+ "loss": 1.6303,
1804
+ "step": 253
1805
+ },
1806
+ {
1807
+ "epoch": 0.703601108033241,
1808
+ "grad_norm": 0.41631007194519043,
1809
+ "learning_rate": 2.2389281096820075e-05,
1810
+ "loss": 1.6223,
1811
+ "step": 254
1812
+ },
1813
+ {
1814
+ "epoch": 0.7063711911357341,
1815
+ "grad_norm": 0.4172951579093933,
1816
+ "learning_rate": 2.2006418316426775e-05,
1817
+ "loss": 1.6146,
1818
+ "step": 255
1819
+ },
1820
+ {
1821
+ "epoch": 0.7091412742382271,
1822
+ "grad_norm": 0.4219145178794861,
1823
+ "learning_rate": 2.1625931535747964e-05,
1824
+ "loss": 1.5774,
1825
+ "step": 256
1826
+ },
1827
+ {
1828
+ "epoch": 0.7119113573407202,
1829
+ "grad_norm": 0.41025567054748535,
1830
+ "learning_rate": 2.1247853049203543e-05,
1831
+ "loss": 1.6285,
1832
+ "step": 257
1833
+ },
1834
+ {
1835
+ "epoch": 0.7146814404432132,
1836
+ "grad_norm": 0.4165142774581909,
1837
+ "learning_rate": 2.087221494680563e-05,
1838
+ "loss": 1.6049,
1839
+ "step": 258
1840
+ },
1841
+ {
1842
+ "epoch": 0.7174515235457064,
1843
+ "grad_norm": 0.4020419418811798,
1844
+ "learning_rate": 2.049904911143492e-05,
1845
+ "loss": 1.5822,
1846
+ "step": 259
1847
+ },
1848
+ {
1849
+ "epoch": 0.7202216066481995,
1850
+ "grad_norm": 0.39877504110336304,
1851
+ "learning_rate": 2.012838721613447e-05,
1852
+ "loss": 1.5962,
1853
+ "step": 260
1854
+ },
1855
+ {
1856
+ "epoch": 0.7229916897506925,
1857
+ "grad_norm": 0.404995858669281,
1858
+ "learning_rate": 1.9760260721421426e-05,
1859
+ "loss": 1.61,
1860
+ "step": 261
1861
+ },
1862
+ {
1863
+ "epoch": 0.7257617728531855,
1864
+ "grad_norm": 0.41909393668174744,
1865
+ "learning_rate": 1.9394700872616855e-05,
1866
+ "loss": 1.5494,
1867
+ "step": 262
1868
+ },
1869
+ {
1870
+ "epoch": 0.7285318559556787,
1871
+ "grad_norm": 0.42204612493515015,
1872
+ "learning_rate": 1.903173869719362e-05,
1873
+ "loss": 1.673,
1874
+ "step": 263
1875
+ },
1876
+ {
1877
+ "epoch": 0.7313019390581718,
1878
+ "grad_norm": 0.4202198088169098,
1879
+ "learning_rate": 1.8671405002142918e-05,
1880
+ "loss": 1.5661,
1881
+ "step": 264
1882
+ },
1883
+ {
1884
+ "epoch": 0.7340720221606648,
1885
+ "grad_norm": 0.42738965153694153,
1886
+ "learning_rate": 1.831373037135955e-05,
1887
+ "loss": 1.5709,
1888
+ "step": 265
1889
+ },
1890
+ {
1891
+ "epoch": 0.7368421052631579,
1892
+ "grad_norm": 0.4346977174282074,
1893
+ "learning_rate": 1.7958745163045986e-05,
1894
+ "loss": 1.5803,
1895
+ "step": 266
1896
+ },
1897
+ {
1898
+ "epoch": 0.739612188365651,
1899
+ "grad_norm": 0.4478447437286377,
1900
+ "learning_rate": 1.760647950713566e-05,
1901
+ "loss": 1.584,
1902
+ "step": 267
1903
+ },
1904
+ {
1905
+ "epoch": 0.7423822714681441,
1906
+ "grad_norm": 0.43284082412719727,
1907
+ "learning_rate": 1.725696330273575e-05,
1908
+ "loss": 1.5911,
1909
+ "step": 268
1910
+ },
1911
+ {
1912
+ "epoch": 0.7451523545706371,
1913
+ "grad_norm": 0.4424854815006256,
1914
+ "learning_rate": 1.6910226215589303e-05,
1915
+ "loss": 1.5178,
1916
+ "step": 269
1917
+ },
1918
+ {
1919
+ "epoch": 0.7479224376731302,
1920
+ "grad_norm": 0.4423474669456482,
1921
+ "learning_rate": 1.656629767555739e-05,
1922
+ "loss": 1.5337,
1923
+ "step": 270
1924
+ },
1925
+ {
1926
+ "epoch": 0.7506925207756233,
1927
+ "grad_norm": 0.450488418340683,
1928
+ "learning_rate": 1.6225206874121218e-05,
1929
+ "loss": 1.5476,
1930
+ "step": 271
1931
+ },
1932
+ {
1933
+ "epoch": 0.7534626038781164,
1934
+ "grad_norm": 0.4583964943885803,
1935
+ "learning_rate": 1.5886982761904377e-05,
1936
+ "loss": 1.559,
1937
+ "step": 272
1938
+ },
1939
+ {
1940
+ "epoch": 0.7562326869806094,
1941
+ "grad_norm": 0.45635777711868286,
1942
+ "learning_rate": 1.555165404621567e-05,
1943
+ "loss": 1.5798,
1944
+ "step": 273
1945
+ },
1946
+ {
1947
+ "epoch": 0.7590027700831025,
1948
+ "grad_norm": 0.4589531123638153,
1949
+ "learning_rate": 1.5219249188612556e-05,
1950
+ "loss": 1.5855,
1951
+ "step": 274
1952
+ },
1953
+ {
1954
+ "epoch": 0.7617728531855956,
1955
+ "grad_norm": 0.47425007820129395,
1956
+ "learning_rate": 1.488979640248534e-05,
1957
+ "loss": 1.565,
1958
+ "step": 275
1959
+ },
1960
+ {
1961
+ "epoch": 0.7645429362880887,
1962
+ "grad_norm": 0.4853668808937073,
1963
+ "learning_rate": 1.4563323650662586e-05,
1964
+ "loss": 1.5671,
1965
+ "step": 276
1966
+ },
1967
+ {
1968
+ "epoch": 0.7673130193905817,
1969
+ "grad_norm": 0.4773860275745392,
1970
+ "learning_rate": 1.4239858643037751e-05,
1971
+ "loss": 1.562,
1972
+ "step": 277
1973
+ },
1974
+ {
1975
+ "epoch": 0.7700831024930748,
1976
+ "grad_norm": 0.5074953436851501,
1977
+ "learning_rate": 1.3919428834217163e-05,
1978
+ "loss": 1.5222,
1979
+ "step": 278
1980
+ },
1981
+ {
1982
+ "epoch": 0.7728531855955678,
1983
+ "grad_norm": 0.4980280101299286,
1984
+ "learning_rate": 1.36020614211899e-05,
1985
+ "loss": 1.5357,
1986
+ "step": 279
1987
+ },
1988
+ {
1989
+ "epoch": 0.775623268698061,
1990
+ "grad_norm": 0.4977880120277405,
1991
+ "learning_rate": 1.3287783341019278e-05,
1992
+ "loss": 1.5099,
1993
+ "step": 280
1994
+ },
1995
+ {
1996
+ "epoch": 0.778393351800554,
1997
+ "grad_norm": 0.490857869386673,
1998
+ "learning_rate": 1.2976621268556571e-05,
1999
+ "loss": 1.4592,
2000
+ "step": 281
2001
+ },
2002
+ {
2003
+ "epoch": 0.7811634349030471,
2004
+ "grad_norm": 0.4895660877227783,
2005
+ "learning_rate": 1.2668601614177017e-05,
2006
+ "loss": 1.5262,
2007
+ "step": 282
2008
+ },
2009
+ {
2010
+ "epoch": 0.7839335180055401,
2011
+ "grad_norm": 0.5010288953781128,
2012
+ "learning_rate": 1.2363750521538064e-05,
2013
+ "loss": 1.4536,
2014
+ "step": 283
2015
+ },
2016
+ {
2017
+ "epoch": 0.7867036011080333,
2018
+ "grad_norm": 0.5021648406982422,
2019
+ "learning_rate": 1.2062093865360458e-05,
2020
+ "loss": 1.4938,
2021
+ "step": 284
2022
+ },
2023
+ {
2024
+ "epoch": 0.7894736842105263,
2025
+ "grad_norm": 0.5206820368766785,
2026
+ "learning_rate": 1.1763657249232107e-05,
2027
+ "loss": 1.5172,
2028
+ "step": 285
2029
+ },
2030
+ {
2031
+ "epoch": 0.7922437673130194,
2032
+ "grad_norm": 0.5139452815055847,
2033
+ "learning_rate": 1.146846600343488e-05,
2034
+ "loss": 1.4832,
2035
+ "step": 286
2036
+ },
2037
+ {
2038
+ "epoch": 0.7950138504155124,
2039
+ "grad_norm": 0.5119965672492981,
2040
+ "learning_rate": 1.1176545182794674e-05,
2041
+ "loss": 1.5057,
2042
+ "step": 287
2043
+ },
2044
+ {
2045
+ "epoch": 0.7977839335180056,
2046
+ "grad_norm": 0.5294527411460876,
2047
+ "learning_rate": 1.0887919564554894e-05,
2048
+ "loss": 1.4764,
2049
+ "step": 288
2050
+ },
2051
+ {
2052
+ "epoch": 0.8005540166204986,
2053
+ "grad_norm": 0.5366680026054382,
2054
+ "learning_rate": 1.0602613646273374e-05,
2055
+ "loss": 1.4931,
2056
+ "step": 289
2057
+ },
2058
+ {
2059
+ "epoch": 0.8033240997229917,
2060
+ "grad_norm": 0.5404147505760193,
2061
+ "learning_rate": 1.032065164374313e-05,
2062
+ "loss": 1.4244,
2063
+ "step": 290
2064
+ },
2065
+ {
2066
+ "epoch": 0.8060941828254847,
2067
+ "grad_norm": 0.520129919052124,
2068
+ "learning_rate": 1.0042057488937067e-05,
2069
+ "loss": 1.461,
2070
+ "step": 291
2071
+ },
2072
+ {
2073
+ "epoch": 0.8088642659279779,
2074
+ "grad_norm": 0.5262720584869385,
2075
+ "learning_rate": 9.766854827976617e-06,
2076
+ "loss": 1.4333,
2077
+ "step": 292
2078
+ },
2079
+ {
2080
+ "epoch": 0.8116343490304709,
2081
+ "grad_norm": 0.5492231249809265,
2082
+ "learning_rate": 9.495067019124792e-06,
2083
+ "loss": 1.5142,
2084
+ "step": 293
2085
+ },
2086
+ {
2087
+ "epoch": 0.814404432132964,
2088
+ "grad_norm": 0.5467971563339233,
2089
+ "learning_rate": 9.226717130803636e-06,
2090
+ "loss": 1.4606,
2091
+ "step": 294
2092
+ },
2093
+ {
2094
+ "epoch": 0.817174515235457,
2095
+ "grad_norm": 0.5795299410820007,
2096
+ "learning_rate": 8.961827939636196e-06,
2097
+ "loss": 1.4897,
2098
+ "step": 295
2099
+ },
2100
+ {
2101
+ "epoch": 0.8199445983379502,
2102
+ "grad_norm": 0.5784875154495239,
2103
+ "learning_rate": 8.700421928513352e-06,
2104
+ "loss": 1.4938,
2105
+ "step": 296
2106
+ },
2107
+ {
2108
+ "epoch": 0.8227146814404432,
2109
+ "grad_norm": 0.5875300765037537,
2110
+ "learning_rate": 8.442521284685573e-06,
2111
+ "loss": 1.4066,
2112
+ "step": 297
2113
+ },
2114
+ {
2115
+ "epoch": 0.8254847645429363,
2116
+ "grad_norm": 0.6010243892669678,
2117
+ "learning_rate": 8.188147897879667e-06,
2118
+ "loss": 1.482,
2119
+ "step": 298
2120
+ },
2121
+ {
2122
+ "epoch": 0.8282548476454293,
2123
+ "grad_norm": 0.6733506321907043,
2124
+ "learning_rate": 7.937323358440935e-06,
2125
+ "loss": 1.6978,
2126
+ "step": 299
2127
+ },
2128
+ {
2129
+ "epoch": 0.8310249307479224,
2130
+ "grad_norm": 0.987700879573822,
2131
+ "learning_rate": 7.690068955500624e-06,
2132
+ "loss": 2.2272,
2133
+ "step": 300
2134
+ },
2135
+ {
2136
+ "epoch": 0.8310249307479224,
2137
+ "eval_loss": 1.556799054145813,
2138
+ "eval_runtime": 95.6603,
2139
+ "eval_samples_per_second": 6.356,
2140
+ "eval_steps_per_second": 1.589,
2141
+ "step": 300
2142
  }
2143
  ],
2144
  "logging_steps": 1,
 
2167
  "attributes": {}
2168
  }
2169
  },
2170
+ "total_flos": 4.281474924675072e+17,
2171
  "train_batch_size": 8,
2172
  "trial_name": null,
2173
  "trial_params": null