mtzig commited on
Commit
7d64c8d
·
verified ·
1 Parent(s): eff59dc

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e00c75665fad68257b13112a896f0e5bfe09116a5aa7d74e1a11ae36dfaf5ff
3
  size 13648688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3089ee5752591a99a34512cd610e5fd1c9cbde6f2e5e96052322709db58f3c20
3
  size 13648688
last-checkpoint/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f71c9991f0a3e0cd1179cd2e3874f7b658401de9a5504be3d21b724ba049377d
3
+ size 20450800
last-checkpoint/global_step300/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca1c1e21cddf824e4855926726edc0d137e7bf67de7cf8919d8065cbb405d7c8
3
+ size 20450800
last-checkpoint/global_step300/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb97391dd585cf53fa22f1b33502d8c0078861881bc64091e47c722648ffa9b9
3
+ size 20450800
last-checkpoint/global_step300/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:413459590411e0e91e925d5765cb8fff050ffd31b47410bbdaf19c166851ed60
3
+ size 20450800
last-checkpoint/global_step300/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f10dcf1fc0a43f6046928990df268bbc1f546b31d3568428be3f358423cb0951
3
+ size 152238
last-checkpoint/global_step300/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca6a6f9616d9eb1bef7a71132384d000d3064294e87f3146629a6a6822eddcfa
3
+ size 152238
last-checkpoint/global_step300/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e68119fc8ce07b421ca4fab0993069c3e9775c9c9d11bd16ec80b5f4bf8c9848
3
+ size 152238
last-checkpoint/global_step300/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b76d5ab8fb6578710d191973f5159a7cef6a0564d586d1346cf7637a81320c38
3
+ size 152238
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step200
 
1
+ global_step300
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2c24a39de75f23f0d84f98b720b05e4f552cdd0306626c901205b2d9690be33
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:680c0f067459bb4efdac849ce093e2226bf3c2332330a52eb68acec721890eea
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75baba3fff49778fb1f9915f06b5fd052daf5b241c6df48d63ce4cc2fd74ad52
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8261bb0773aaaaaf837917ac2d74751a1b07817c980444e7109f977082d4d80
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:43726b0b6816cd7dc4fb0c6379613d398bab9cf9069c5ee8ac83eea24f4fa621
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ba4d6439beb986cf1f95fd682e03fa5844ac212a382301bdd1a868bcc67c311
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c26a88d8955baf0e31629f8efe3e6d01a1336c21cf4cdd20becb43acefdfdd69
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c17d861750a27e832ec9cf9a840f42cdc22319da36842441a78feca72092cef2
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:863b742f77d90cf639cf492ec2d91dab7a9ebd0f58799b06186327b2c961991e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c0f31e0dd8f0cd1067395334590c80ca29a3a9a42118ffbf479961406c7bb0b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.24592683676606208,
5
  "eval_steps": 40,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1479,6 +1479,730 @@
1479
  "eval_samples_per_second": 2.12,
1480
  "eval_steps_per_second": 0.17,
1481
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1482
  }
1483
  ],
1484
  "logging_steps": 1,
@@ -1498,7 +2222,7 @@
1498
  "attributes": {}
1499
  }
1500
  },
1501
- "total_flos": 197328810967040.0,
1502
  "train_batch_size": 4,
1503
  "trial_name": null,
1504
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.3688902551490931,
5
  "eval_steps": 40,
6
+ "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1479
  "eval_samples_per_second": 2.12,
1480
  "eval_steps_per_second": 0.17,
1481
  "step": 200
1482
+ },
1483
+ {
1484
+ "epoch": 0.2471564709498924,
1485
+ "grad_norm": 0.26939804480510793,
1486
+ "learning_rate": 1.872049408143808e-05,
1487
+ "loss": 0.1785,
1488
+ "step": 201
1489
+ },
1490
+ {
1491
+ "epoch": 0.24838610513372272,
1492
+ "grad_norm": 0.3016410055860768,
1493
+ "learning_rate": 1.8699380076440242e-05,
1494
+ "loss": 0.2045,
1495
+ "step": 202
1496
+ },
1497
+ {
1498
+ "epoch": 0.24961573931755301,
1499
+ "grad_norm": 0.3469258635915736,
1500
+ "learning_rate": 1.8678105394844114e-05,
1501
+ "loss": 0.2027,
1502
+ "step": 203
1503
+ },
1504
+ {
1505
+ "epoch": 0.25084537350138336,
1506
+ "grad_norm": 0.32607531852363275,
1507
+ "learning_rate": 1.8656670429590745e-05,
1508
+ "loss": 0.1564,
1509
+ "step": 204
1510
+ },
1511
+ {
1512
+ "epoch": 0.2520750076852136,
1513
+ "grad_norm": 0.2760811969255309,
1514
+ "learning_rate": 1.8635075576581587e-05,
1515
+ "loss": 0.1723,
1516
+ "step": 205
1517
+ },
1518
+ {
1519
+ "epoch": 0.25330464186904394,
1520
+ "grad_norm": 0.25212411563493814,
1521
+ "learning_rate": 1.861332123467122e-05,
1522
+ "loss": 0.2283,
1523
+ "step": 206
1524
+ },
1525
+ {
1526
+ "epoch": 0.25453427605287426,
1527
+ "grad_norm": 0.3448353672449361,
1528
+ "learning_rate": 1.859140780565996e-05,
1529
+ "loss": 0.2186,
1530
+ "step": 207
1531
+ },
1532
+ {
1533
+ "epoch": 0.2557639102367046,
1534
+ "grad_norm": 0.35723591175806685,
1535
+ "learning_rate": 1.856933569428644e-05,
1536
+ "loss": 0.2346,
1537
+ "step": 208
1538
+ },
1539
+ {
1540
+ "epoch": 0.2569935444205349,
1541
+ "grad_norm": 0.2721994057338676,
1542
+ "learning_rate": 1.8547105308220142e-05,
1543
+ "loss": 0.2408,
1544
+ "step": 209
1545
+ },
1546
+ {
1547
+ "epoch": 0.2582231786043652,
1548
+ "grad_norm": 0.2946335121583992,
1549
+ "learning_rate": 1.852471705805387e-05,
1550
+ "loss": 0.179,
1551
+ "step": 210
1552
+ },
1553
+ {
1554
+ "epoch": 0.2594528127881955,
1555
+ "grad_norm": 0.44468973852329186,
1556
+ "learning_rate": 1.8502171357296144e-05,
1557
+ "loss": 0.2007,
1558
+ "step": 211
1559
+ },
1560
+ {
1561
+ "epoch": 0.2606824469720258,
1562
+ "grad_norm": 0.4480447012946234,
1563
+ "learning_rate": 1.84794686223636e-05,
1564
+ "loss": 0.2752,
1565
+ "step": 212
1566
+ },
1567
+ {
1568
+ "epoch": 0.2619120811558561,
1569
+ "grad_norm": 0.28800115930201,
1570
+ "learning_rate": 1.8456609272573268e-05,
1571
+ "loss": 0.2106,
1572
+ "step": 213
1573
+ },
1574
+ {
1575
+ "epoch": 0.26314171533968644,
1576
+ "grad_norm": 0.4124378870162344,
1577
+ "learning_rate": 1.8433593730134835e-05,
1578
+ "loss": 0.2648,
1579
+ "step": 214
1580
+ },
1581
+ {
1582
+ "epoch": 0.26437134952351676,
1583
+ "grad_norm": 0.26984478766637626,
1584
+ "learning_rate": 1.841042242014285e-05,
1585
+ "loss": 0.1578,
1586
+ "step": 215
1587
+ },
1588
+ {
1589
+ "epoch": 0.2656009837073471,
1590
+ "grad_norm": 0.4271224156022197,
1591
+ "learning_rate": 1.838709577056888e-05,
1592
+ "loss": 0.241,
1593
+ "step": 216
1594
+ },
1595
+ {
1596
+ "epoch": 0.2668306178911774,
1597
+ "grad_norm": 0.32441622820215904,
1598
+ "learning_rate": 1.8363614212253585e-05,
1599
+ "loss": 0.1615,
1600
+ "step": 217
1601
+ },
1602
+ {
1603
+ "epoch": 0.26806025207500767,
1604
+ "grad_norm": 0.35733117717955937,
1605
+ "learning_rate": 1.833997817889878e-05,
1606
+ "loss": 0.234,
1607
+ "step": 218
1608
+ },
1609
+ {
1610
+ "epoch": 0.269289886258838,
1611
+ "grad_norm": 0.3419406717090276,
1612
+ "learning_rate": 1.8316188107059418e-05,
1613
+ "loss": 0.2538,
1614
+ "step": 219
1615
+ },
1616
+ {
1617
+ "epoch": 0.2705195204426683,
1618
+ "grad_norm": 0.31666227946792574,
1619
+ "learning_rate": 1.8292244436135517e-05,
1620
+ "loss": 0.2518,
1621
+ "step": 220
1622
+ },
1623
+ {
1624
+ "epoch": 0.2717491546264986,
1625
+ "grad_norm": 0.25628519133100447,
1626
+ "learning_rate": 1.8268147608364068e-05,
1627
+ "loss": 0.1488,
1628
+ "step": 221
1629
+ },
1630
+ {
1631
+ "epoch": 0.27297878881032894,
1632
+ "grad_norm": 0.40851754881119196,
1633
+ "learning_rate": 1.8243898068810833e-05,
1634
+ "loss": 0.2662,
1635
+ "step": 222
1636
+ },
1637
+ {
1638
+ "epoch": 0.27420842299415926,
1639
+ "grad_norm": 0.38439452767162985,
1640
+ "learning_rate": 1.8219496265362164e-05,
1641
+ "loss": 0.2033,
1642
+ "step": 223
1643
+ },
1644
+ {
1645
+ "epoch": 0.2754380571779895,
1646
+ "grad_norm": 0.39575941152172794,
1647
+ "learning_rate": 1.81949426487167e-05,
1648
+ "loss": 0.2481,
1649
+ "step": 224
1650
+ },
1651
+ {
1652
+ "epoch": 0.27666769136181985,
1653
+ "grad_norm": 0.39720852466696316,
1654
+ "learning_rate": 1.8170237672377046e-05,
1655
+ "loss": 0.2712,
1656
+ "step": 225
1657
+ },
1658
+ {
1659
+ "epoch": 0.27789732554565016,
1660
+ "grad_norm": 0.56556586568128,
1661
+ "learning_rate": 1.814538179264142e-05,
1662
+ "loss": 0.2856,
1663
+ "step": 226
1664
+ },
1665
+ {
1666
+ "epoch": 0.2791269597294805,
1667
+ "grad_norm": 0.4627206566124822,
1668
+ "learning_rate": 1.81203754685952e-05,
1669
+ "loss": 0.2353,
1670
+ "step": 227
1671
+ },
1672
+ {
1673
+ "epoch": 0.2803565939133108,
1674
+ "grad_norm": 0.2862726313485586,
1675
+ "learning_rate": 1.8095219162102453e-05,
1676
+ "loss": 0.1773,
1677
+ "step": 228
1678
+ },
1679
+ {
1680
+ "epoch": 0.2815862280971411,
1681
+ "grad_norm": 0.36128900613055054,
1682
+ "learning_rate": 1.8069913337797414e-05,
1683
+ "loss": 0.2265,
1684
+ "step": 229
1685
+ },
1686
+ {
1687
+ "epoch": 0.2828158622809714,
1688
+ "grad_norm": 0.3598391415884193,
1689
+ "learning_rate": 1.804445846307588e-05,
1690
+ "loss": 0.2167,
1691
+ "step": 230
1692
+ },
1693
+ {
1694
+ "epoch": 0.2840454964648017,
1695
+ "grad_norm": 0.3038583906156817,
1696
+ "learning_rate": 1.801885500808661e-05,
1697
+ "loss": 0.1929,
1698
+ "step": 231
1699
+ },
1700
+ {
1701
+ "epoch": 0.285275130648632,
1702
+ "grad_norm": 0.2684590976129711,
1703
+ "learning_rate": 1.7993103445722615e-05,
1704
+ "loss": 0.1955,
1705
+ "step": 232
1706
+ },
1707
+ {
1708
+ "epoch": 0.28650476483246234,
1709
+ "grad_norm": 0.3650334815395792,
1710
+ "learning_rate": 1.7967204251612432e-05,
1711
+ "loss": 0.2021,
1712
+ "step": 233
1713
+ },
1714
+ {
1715
+ "epoch": 0.28773439901629266,
1716
+ "grad_norm": 0.3529582774863101,
1717
+ "learning_rate": 1.7941157904111346e-05,
1718
+ "loss": 0.2396,
1719
+ "step": 234
1720
+ },
1721
+ {
1722
+ "epoch": 0.288964033200123,
1723
+ "grad_norm": 0.24658124615059435,
1724
+ "learning_rate": 1.7914964884292543e-05,
1725
+ "loss": 0.1841,
1726
+ "step": 235
1727
+ },
1728
+ {
1729
+ "epoch": 0.29019366738395325,
1730
+ "grad_norm": 0.3897045768921742,
1731
+ "learning_rate": 1.7888625675938237e-05,
1732
+ "loss": 0.2233,
1733
+ "step": 236
1734
+ },
1735
+ {
1736
+ "epoch": 0.29142330156778357,
1737
+ "grad_norm": 0.24436873250999808,
1738
+ "learning_rate": 1.7862140765530718e-05,
1739
+ "loss": 0.1587,
1740
+ "step": 237
1741
+ },
1742
+ {
1743
+ "epoch": 0.2926529357516139,
1744
+ "grad_norm": 0.4324084079497792,
1745
+ "learning_rate": 1.783551064224339e-05,
1746
+ "loss": 0.1914,
1747
+ "step": 238
1748
+ },
1749
+ {
1750
+ "epoch": 0.2938825699354442,
1751
+ "grad_norm": 0.31640333118588265,
1752
+ "learning_rate": 1.7808735797931715e-05,
1753
+ "loss": 0.1512,
1754
+ "step": 239
1755
+ },
1756
+ {
1757
+ "epoch": 0.2951122041192745,
1758
+ "grad_norm": 0.34244615386284427,
1759
+ "learning_rate": 1.7781816727124138e-05,
1760
+ "loss": 0.2004,
1761
+ "step": 240
1762
+ },
1763
+ {
1764
+ "epoch": 0.2951122041192745,
1765
+ "eval_accuracy": 0.7807486631016043,
1766
+ "eval_f1": 0.4225352112676056,
1767
+ "eval_loss": 0.4479687511920929,
1768
+ "eval_precision": 0.7142857142857143,
1769
+ "eval_recall": 0.3,
1770
+ "eval_runtime": 22.697,
1771
+ "eval_samples_per_second": 2.203,
1772
+ "eval_steps_per_second": 0.176,
1773
+ "step": 240
1774
+ },
1775
+ {
1776
+ "epoch": 0.29634183830310484,
1777
+ "grad_norm": 0.24395125168807116,
1778
+ "learning_rate": 1.7754753927012955e-05,
1779
+ "loss": 0.1768,
1780
+ "step": 241
1781
+ },
1782
+ {
1783
+ "epoch": 0.29757147248693516,
1784
+ "grad_norm": 0.2464638831816697,
1785
+ "learning_rate": 1.7727547897445117e-05,
1786
+ "loss": 0.1461,
1787
+ "step": 242
1788
+ },
1789
+ {
1790
+ "epoch": 0.2988011066707654,
1791
+ "grad_norm": 0.33968511169569393,
1792
+ "learning_rate": 1.770019914091302e-05,
1793
+ "loss": 0.161,
1794
+ "step": 243
1795
+ },
1796
+ {
1797
+ "epoch": 0.30003074085459575,
1798
+ "grad_norm": 0.6034876950604299,
1799
+ "learning_rate": 1.76727081625452e-05,
1800
+ "loss": 0.2023,
1801
+ "step": 244
1802
+ },
1803
+ {
1804
+ "epoch": 0.30126037503842606,
1805
+ "grad_norm": 0.4452183735534574,
1806
+ "learning_rate": 1.7645075470097024e-05,
1807
+ "loss": 0.2207,
1808
+ "step": 245
1809
+ },
1810
+ {
1811
+ "epoch": 0.3024900092222564,
1812
+ "grad_norm": 0.29139249592342986,
1813
+ "learning_rate": 1.7617301573941296e-05,
1814
+ "loss": 0.1763,
1815
+ "step": 246
1816
+ },
1817
+ {
1818
+ "epoch": 0.3037196434060867,
1819
+ "grad_norm": 0.33614977158297427,
1820
+ "learning_rate": 1.758938698705884e-05,
1821
+ "loss": 0.2381,
1822
+ "step": 247
1823
+ },
1824
+ {
1825
+ "epoch": 0.304949277589917,
1826
+ "grad_norm": 0.45585634062118247,
1827
+ "learning_rate": 1.7561332225029022e-05,
1828
+ "loss": 0.2215,
1829
+ "step": 248
1830
+ },
1831
+ {
1832
+ "epoch": 0.3061789117737473,
1833
+ "grad_norm": 0.3889096135056135,
1834
+ "learning_rate": 1.7533137806020226e-05,
1835
+ "loss": 0.2512,
1836
+ "step": 249
1837
+ },
1838
+ {
1839
+ "epoch": 0.3074085459575776,
1840
+ "grad_norm": 0.3698201068005034,
1841
+ "learning_rate": 1.7504804250780292e-05,
1842
+ "loss": 0.1962,
1843
+ "step": 250
1844
+ },
1845
+ {
1846
+ "epoch": 0.3086381801414079,
1847
+ "grad_norm": 0.23954623770426672,
1848
+ "learning_rate": 1.747633208262688e-05,
1849
+ "loss": 0.1669,
1850
+ "step": 251
1851
+ },
1852
+ {
1853
+ "epoch": 0.30986781432523824,
1854
+ "grad_norm": 0.3069850649077184,
1855
+ "learning_rate": 1.744772182743782e-05,
1856
+ "loss": 0.173,
1857
+ "step": 252
1858
+ },
1859
+ {
1860
+ "epoch": 0.31109744850906856,
1861
+ "grad_norm": 0.5346208398414731,
1862
+ "learning_rate": 1.74189740136414e-05,
1863
+ "loss": 0.2977,
1864
+ "step": 253
1865
+ },
1866
+ {
1867
+ "epoch": 0.3123270826928989,
1868
+ "grad_norm": 0.4383899410481399,
1869
+ "learning_rate": 1.7390089172206594e-05,
1870
+ "loss": 0.1867,
1871
+ "step": 254
1872
+ },
1873
+ {
1874
+ "epoch": 0.31355671687672915,
1875
+ "grad_norm": 0.34336119858301023,
1876
+ "learning_rate": 1.736106783663326e-05,
1877
+ "loss": 0.2143,
1878
+ "step": 255
1879
+ },
1880
+ {
1881
+ "epoch": 0.31478635106055947,
1882
+ "grad_norm": 0.2605528903698455,
1883
+ "learning_rate": 1.7331910542942298e-05,
1884
+ "loss": 0.2061,
1885
+ "step": 256
1886
+ },
1887
+ {
1888
+ "epoch": 0.3160159852443898,
1889
+ "grad_norm": 0.29064553728173864,
1890
+ "learning_rate": 1.7302617829665725e-05,
1891
+ "loss": 0.1888,
1892
+ "step": 257
1893
+ },
1894
+ {
1895
+ "epoch": 0.3172456194282201,
1896
+ "grad_norm": 0.42180400278336094,
1897
+ "learning_rate": 1.7273190237836757e-05,
1898
+ "loss": 0.1727,
1899
+ "step": 258
1900
+ },
1901
+ {
1902
+ "epoch": 0.3184752536120504,
1903
+ "grad_norm": 0.5419289309694173,
1904
+ "learning_rate": 1.7243628310979793e-05,
1905
+ "loss": 0.2215,
1906
+ "step": 259
1907
+ },
1908
+ {
1909
+ "epoch": 0.31970488779588074,
1910
+ "grad_norm": 0.5826869965037216,
1911
+ "learning_rate": 1.7213932595100384e-05,
1912
+ "loss": 0.2394,
1913
+ "step": 260
1914
+ },
1915
+ {
1916
+ "epoch": 0.32093452197971106,
1917
+ "grad_norm": 0.35549156755432715,
1918
+ "learning_rate": 1.7184103638675157e-05,
1919
+ "loss": 0.2212,
1920
+ "step": 261
1921
+ },
1922
+ {
1923
+ "epoch": 0.3221641561635413,
1924
+ "grad_norm": 0.403587329803515,
1925
+ "learning_rate": 1.715414199264168e-05,
1926
+ "loss": 0.1709,
1927
+ "step": 262
1928
+ },
1929
+ {
1930
+ "epoch": 0.32339379034737165,
1931
+ "grad_norm": 0.568711959322274,
1932
+ "learning_rate": 1.7124048210388268e-05,
1933
+ "loss": 0.1972,
1934
+ "step": 263
1935
+ },
1936
+ {
1937
+ "epoch": 0.32462342453120197,
1938
+ "grad_norm": 0.23810294888269123,
1939
+ "learning_rate": 1.709382284774379e-05,
1940
+ "loss": 0.1846,
1941
+ "step": 264
1942
+ },
1943
+ {
1944
+ "epoch": 0.3258530587150323,
1945
+ "grad_norm": 0.3902059118868234,
1946
+ "learning_rate": 1.706346646296739e-05,
1947
+ "loss": 0.2197,
1948
+ "step": 265
1949
+ },
1950
+ {
1951
+ "epoch": 0.3270826928988626,
1952
+ "grad_norm": 0.30966956204734636,
1953
+ "learning_rate": 1.7032979616738167e-05,
1954
+ "loss": 0.1728,
1955
+ "step": 266
1956
+ },
1957
+ {
1958
+ "epoch": 0.3283123270826929,
1959
+ "grad_norm": 0.3364767329741329,
1960
+ "learning_rate": 1.7002362872144843e-05,
1961
+ "loss": 0.236,
1962
+ "step": 267
1963
+ },
1964
+ {
1965
+ "epoch": 0.3295419612665232,
1966
+ "grad_norm": 0.2575556372712217,
1967
+ "learning_rate": 1.697161679467534e-05,
1968
+ "loss": 0.2166,
1969
+ "step": 268
1970
+ },
1971
+ {
1972
+ "epoch": 0.3307715954503535,
1973
+ "grad_norm": 0.6139887685526306,
1974
+ "learning_rate": 1.6940741952206342e-05,
1975
+ "loss": 0.217,
1976
+ "step": 269
1977
+ },
1978
+ {
1979
+ "epoch": 0.3320012296341838,
1980
+ "grad_norm": 0.3192026684409325,
1981
+ "learning_rate": 1.6909738914992812e-05,
1982
+ "loss": 0.228,
1983
+ "step": 270
1984
+ },
1985
+ {
1986
+ "epoch": 0.33323086381801414,
1987
+ "grad_norm": 0.3191098195045691,
1988
+ "learning_rate": 1.6878608255657457e-05,
1989
+ "loss": 0.2148,
1990
+ "step": 271
1991
+ },
1992
+ {
1993
+ "epoch": 0.33446049800184446,
1994
+ "grad_norm": 0.3876596561870598,
1995
+ "learning_rate": 1.6847350549180148e-05,
1996
+ "loss": 0.2191,
1997
+ "step": 272
1998
+ },
1999
+ {
2000
+ "epoch": 0.3356901321856748,
2001
+ "grad_norm": 0.38903679735549906,
2002
+ "learning_rate": 1.6815966372887305e-05,
2003
+ "loss": 0.2205,
2004
+ "step": 273
2005
+ },
2006
+ {
2007
+ "epoch": 0.33691976636950505,
2008
+ "grad_norm": 0.38626350258480885,
2009
+ "learning_rate": 1.6784456306441234e-05,
2010
+ "loss": 0.2672,
2011
+ "step": 274
2012
+ },
2013
+ {
2014
+ "epoch": 0.33814940055333537,
2015
+ "grad_norm": 0.3297053884257871,
2016
+ "learning_rate": 1.675282093182941e-05,
2017
+ "loss": 0.1686,
2018
+ "step": 275
2019
+ },
2020
+ {
2021
+ "epoch": 0.3393790347371657,
2022
+ "grad_norm": 0.4263354582166456,
2023
+ "learning_rate": 1.672106083335374e-05,
2024
+ "loss": 0.1873,
2025
+ "step": 276
2026
+ },
2027
+ {
2028
+ "epoch": 0.340608668920996,
2029
+ "grad_norm": 0.2796799138264646,
2030
+ "learning_rate": 1.6689176597619773e-05,
2031
+ "loss": 0.1861,
2032
+ "step": 277
2033
+ },
2034
+ {
2035
+ "epoch": 0.3418383031048263,
2036
+ "grad_norm": 0.2741188381316141,
2037
+ "learning_rate": 1.6657168813525855e-05,
2038
+ "loss": 0.2212,
2039
+ "step": 278
2040
+ },
2041
+ {
2042
+ "epoch": 0.34306793728865664,
2043
+ "grad_norm": 0.3392975873746286,
2044
+ "learning_rate": 1.662503807225225e-05,
2045
+ "loss": 0.2087,
2046
+ "step": 279
2047
+ },
2048
+ {
2049
+ "epoch": 0.3442975714724869,
2050
+ "grad_norm": 0.3420260746414455,
2051
+ "learning_rate": 1.659278496725024e-05,
2052
+ "loss": 0.2241,
2053
+ "step": 280
2054
+ },
2055
+ {
2056
+ "epoch": 0.3442975714724869,
2057
+ "eval_accuracy": 0.7807486631016043,
2058
+ "eval_f1": 0.4225352112676056,
2059
+ "eval_loss": 0.4449218809604645,
2060
+ "eval_precision": 0.7142857142857143,
2061
+ "eval_recall": 0.3,
2062
+ "eval_runtime": 22.916,
2063
+ "eval_samples_per_second": 2.182,
2064
+ "eval_steps_per_second": 0.175,
2065
+ "step": 280
2066
+ },
2067
+ {
2068
+ "epoch": 0.3455272056563172,
2069
+ "grad_norm": 0.34182478583361364,
2070
+ "learning_rate": 1.6560410094231144e-05,
2071
+ "loss": 0.2257,
2072
+ "step": 281
2073
+ },
2074
+ {
2075
+ "epoch": 0.34675683984014755,
2076
+ "grad_norm": 0.3981831055380401,
2077
+ "learning_rate": 1.6527914051155328e-05,
2078
+ "loss": 0.2593,
2079
+ "step": 282
2080
+ },
2081
+ {
2082
+ "epoch": 0.34798647402397787,
2083
+ "grad_norm": 0.3376266386557847,
2084
+ "learning_rate": 1.6495297438221145e-05,
2085
+ "loss": 0.1899,
2086
+ "step": 283
2087
+ },
2088
+ {
2089
+ "epoch": 0.3492161082078082,
2090
+ "grad_norm": 0.3924075494987157,
2091
+ "learning_rate": 1.6462560857853876e-05,
2092
+ "loss": 0.2032,
2093
+ "step": 284
2094
+ },
2095
+ {
2096
+ "epoch": 0.3504457423916385,
2097
+ "grad_norm": 0.5277347648864814,
2098
+ "learning_rate": 1.6429704914694573e-05,
2099
+ "loss": 0.1546,
2100
+ "step": 285
2101
+ },
2102
+ {
2103
+ "epoch": 0.3516753765754688,
2104
+ "grad_norm": 0.32345022677167856,
2105
+ "learning_rate": 1.6396730215588913e-05,
2106
+ "loss": 0.2559,
2107
+ "step": 286
2108
+ },
2109
+ {
2110
+ "epoch": 0.3529050107592991,
2111
+ "grad_norm": 0.44254976480188857,
2112
+ "learning_rate": 1.6363637369575984e-05,
2113
+ "loss": 0.2794,
2114
+ "step": 287
2115
+ },
2116
+ {
2117
+ "epoch": 0.3541346449431294,
2118
+ "grad_norm": 0.4548791128214442,
2119
+ "learning_rate": 1.633042698787703e-05,
2120
+ "loss": 0.2879,
2121
+ "step": 288
2122
+ },
2123
+ {
2124
+ "epoch": 0.3553642791269597,
2125
+ "grad_norm": 0.27873749207646975,
2126
+ "learning_rate": 1.6297099683884163e-05,
2127
+ "loss": 0.1636,
2128
+ "step": 289
2129
+ },
2130
+ {
2131
+ "epoch": 0.35659391331079004,
2132
+ "grad_norm": 0.3258495849943053,
2133
+ "learning_rate": 1.626365607314905e-05,
2134
+ "loss": 0.2007,
2135
+ "step": 290
2136
+ },
2137
+ {
2138
+ "epoch": 0.35782354749462036,
2139
+ "grad_norm": 0.4098308834550629,
2140
+ "learning_rate": 1.6230096773371514e-05,
2141
+ "loss": 0.2787,
2142
+ "step": 291
2143
+ },
2144
+ {
2145
+ "epoch": 0.3590531816784507,
2146
+ "grad_norm": 0.2722654753472478,
2147
+ "learning_rate": 1.619642240438816e-05,
2148
+ "loss": 0.19,
2149
+ "step": 292
2150
+ },
2151
+ {
2152
+ "epoch": 0.36028281586228095,
2153
+ "grad_norm": 0.28423296754063243,
2154
+ "learning_rate": 1.616263358816089e-05,
2155
+ "loss": 0.1652,
2156
+ "step": 293
2157
+ },
2158
+ {
2159
+ "epoch": 0.36151245004611127,
2160
+ "grad_norm": 0.29897633728182227,
2161
+ "learning_rate": 1.612873094876545e-05,
2162
+ "loss": 0.2339,
2163
+ "step": 294
2164
+ },
2165
+ {
2166
+ "epoch": 0.3627420842299416,
2167
+ "grad_norm": 0.31056546478547936,
2168
+ "learning_rate": 1.6094715112379874e-05,
2169
+ "loss": 0.1838,
2170
+ "step": 295
2171
+ },
2172
+ {
2173
+ "epoch": 0.3639717184137719,
2174
+ "grad_norm": 0.2931202881800801,
2175
+ "learning_rate": 1.6060586707272943e-05,
2176
+ "loss": 0.1744,
2177
+ "step": 296
2178
+ },
2179
+ {
2180
+ "epoch": 0.3652013525976022,
2181
+ "grad_norm": 0.37366320482050214,
2182
+ "learning_rate": 1.6026346363792565e-05,
2183
+ "loss": 0.1915,
2184
+ "step": 297
2185
+ },
2186
+ {
2187
+ "epoch": 0.36643098678143254,
2188
+ "grad_norm": 0.3528926787344543,
2189
+ "learning_rate": 1.599199471435414e-05,
2190
+ "loss": 0.1589,
2191
+ "step": 298
2192
+ },
2193
+ {
2194
+ "epoch": 0.3676606209652628,
2195
+ "grad_norm": 0.3024625428052122,
2196
+ "learning_rate": 1.5957532393428872e-05,
2197
+ "loss": 0.2285,
2198
+ "step": 299
2199
+ },
2200
+ {
2201
+ "epoch": 0.3688902551490931,
2202
+ "grad_norm": 0.3986663078423136,
2203
+ "learning_rate": 1.5922960037532057e-05,
2204
+ "loss": 0.281,
2205
+ "step": 300
2206
  }
2207
  ],
2208
  "logging_steps": 1,
 
2222
  "attributes": {}
2223
  }
2224
  },
2225
+ "total_flos": 295277918322688.0,
2226
  "train_batch_size": 4,
2227
  "trial_name": null,
2228
  "trial_params": null