diaenra committed
Commit 548f8eb · verified · 1 Parent(s): 4bcb654

Training in progress, step 298, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5adf84e3dd7beb3769a77eea1152e99bbdbac831184bc3663a32de3dc9625515
+ oid sha256:892b19e9b1c6ec8c931ec31eaaea64c1f54d229ad9301361d06c23010fe58615
  size 1154870440
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f0a36e6b4cd6f4ce7b967eb5ab7b4ea01d50b724f9dce1c3caa5286a9bab778f
+ oid sha256:8a6cbc8b8810e660d48486c9cac79327a7e9d0bfe0336dab2b0b2be950dafd55
  size 2309999768
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7cba91b1bc4ddb9a8958c2818c4a9e3864b75cb3e59dbbea15849602039e27f3
+ oid sha256:ec20f60ea398d86da91aa0036eb29c67b6647d2b936f7f70a003e8aecb18b498
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:70731b6c1655d0e3656aa4aa24acc1486de561f2947adf3a592e6bdf8c91a623
+ oid sha256:7299d6d674e2331f7fef6ad21b861c3a91e48af7b3ecf10af405659140b65185
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.8013411567476949,
+ "epoch": 0.9991617770326907,
  "eval_steps": 500,
- "global_step": 239,
+ "global_step": 298,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1680,6 +1680,419 @@
  "learning_rate": 2.0354603547267985e-05,
  "loss": 0.4991,
  "step": 239
+ },
+ {
+ "epoch": 0.8046940486169321,
+ "grad_norm": 15.609128952026367,
+ "learning_rate": 1.9719515643116674e-05,
+ "loss": 0.885,
+ "step": 240
+ },
+ {
+ "epoch": 0.8080469404861693,
+ "grad_norm": 14.140607833862305,
+ "learning_rate": 1.9092050688969738e-05,
+ "loss": 0.6477,
+ "step": 241
+ },
+ {
+ "epoch": 0.8113998323554066,
+ "grad_norm": 8.173004150390625,
+ "learning_rate": 1.847236664577389e-05,
+ "loss": 0.3195,
+ "step": 242
+ },
+ {
+ "epoch": 0.8147527242246437,
+ "grad_norm": 12.263628959655762,
+ "learning_rate": 1.7860619515673033e-05,
+ "loss": 0.3825,
+ "step": 243
+ },
+ {
+ "epoch": 0.818105616093881,
+ "grad_norm": 9.158538818359375,
+ "learning_rate": 1.725696330273575e-05,
+ "loss": 0.5394,
+ "step": 244
+ },
+ {
+ "epoch": 0.8214585079631181,
+ "grad_norm": 11.503657341003418,
+ "learning_rate": 1.6661549974185424e-05,
+ "loss": 0.5017,
+ "step": 245
+ },
+ {
+ "epoch": 0.8248113998323554,
+ "grad_norm": 11.45801830291748,
+ "learning_rate": 1.60745294221434e-05,
+ "loss": 0.5327,
+ "step": 246
+ },
+ {
+ "epoch": 0.8281642917015927,
+ "grad_norm": 14.59035873413086,
+ "learning_rate": 1.549604942589441e-05,
+ "loss": 0.6128,
+ "step": 247
+ },
+ {
+ "epoch": 0.8315171835708298,
+ "grad_norm": 11.337044715881348,
+ "learning_rate": 1.4926255614683932e-05,
+ "loss": 0.4564,
+ "step": 248
+ },
+ {
+ "epoch": 0.8348700754400671,
+ "grad_norm": 9.969059944152832,
+ "learning_rate": 1.4365291431056871e-05,
+ "loss": 0.7156,
+ "step": 249
+ },
+ {
+ "epoch": 0.8382229673093042,
+ "grad_norm": 10.448427200317383,
+ "learning_rate": 1.3813298094746491e-05,
+ "loss": 0.5685,
+ "step": 250
+ },
+ {
+ "epoch": 0.8415758591785415,
+ "grad_norm": 11.464609146118164,
+ "learning_rate": 1.327041456712334e-05,
+ "loss": 0.6357,
+ "step": 251
+ },
+ {
+ "epoch": 0.8449287510477788,
+ "grad_norm": 12.060856819152832,
+ "learning_rate": 1.2736777516212266e-05,
+ "loss": 0.7539,
+ "step": 252
+ },
+ {
+ "epoch": 0.8482816429170159,
+ "grad_norm": 12.243000984191895,
+ "learning_rate": 1.2212521282287092e-05,
+ "loss": 0.7645,
+ "step": 253
+ },
+ {
+ "epoch": 0.8516345347862532,
+ "grad_norm": 11.656434059143066,
+ "learning_rate": 1.1697777844051105e-05,
+ "loss": 0.708,
+ "step": 254
+ },
+ {
+ "epoch": 0.8549874266554903,
+ "grad_norm": 12.392927169799805,
+ "learning_rate": 1.1192676785412154e-05,
+ "loss": 0.7513,
+ "step": 255
+ },
+ {
+ "epoch": 0.8583403185247276,
+ "grad_norm": 9.757222175598145,
+ "learning_rate": 1.0697345262860636e-05,
+ "loss": 0.6079,
+ "step": 256
+ },
+ {
+ "epoch": 0.8616932103939648,
+ "grad_norm": 8.159704208374023,
+ "learning_rate": 1.021190797345839e-05,
+ "loss": 0.2301,
+ "step": 257
+ },
+ {
+ "epoch": 0.865046102263202,
+ "grad_norm": 7.381707668304443,
+ "learning_rate": 9.73648712344707e-06,
+ "loss": 0.337,
+ "step": 258
+ },
+ {
+ "epoch": 0.8683989941324393,
+ "grad_norm": 9.870096206665039,
+ "learning_rate": 9.271202397483215e-06,
+ "loss": 0.2997,
+ "step": 259
+ },
+ {
+ "epoch": 0.8717518860016764,
+ "grad_norm": 8.603653907775879,
+ "learning_rate": 8.816170928508365e-06,
+ "loss": 0.2499,
+ "step": 260
+ },
+ {
+ "epoch": 0.8751047778709137,
+ "grad_norm": 6.676860809326172,
+ "learning_rate": 8.371507268261437e-06,
+ "loss": 0.3058,
+ "step": 261
+ },
+ {
+ "epoch": 0.8784576697401508,
+ "grad_norm": 8.418916702270508,
+ "learning_rate": 7.937323358440935e-06,
+ "loss": 0.2895,
+ "step": 262
+ },
+ {
+ "epoch": 0.8818105616093881,
+ "grad_norm": 10.64456558227539,
+ "learning_rate": 7.513728502524286e-06,
+ "loss": 0.3863,
+ "step": 263
+ },
+ {
+ "epoch": 0.8851634534786254,
+ "grad_norm": 7.481652736663818,
+ "learning_rate": 7.100829338251147e-06,
+ "loss": 0.2319,
+ "step": 264
+ },
+ {
+ "epoch": 0.8885163453478625,
+ "grad_norm": 7.484499454498291,
+ "learning_rate": 6.698729810778065e-06,
+ "loss": 0.1861,
+ "step": 265
+ },
+ {
+ "epoch": 0.8918692372170998,
+ "grad_norm": 6.579418659210205,
+ "learning_rate": 6.3075311465107535e-06,
+ "loss": 0.2557,
+ "step": 266
+ },
+ {
+ "epoch": 0.8952221290863369,
+ "grad_norm": 6.663677215576172,
+ "learning_rate": 5.927331827620903e-06,
+ "loss": 0.3708,
+ "step": 267
+ },
+ {
+ "epoch": 0.8985750209555742,
+ "grad_norm": 9.156440734863281,
+ "learning_rate": 5.558227567253832e-06,
+ "loss": 0.3865,
+ "step": 268
+ },
+ {
+ "epoch": 0.9019279128248114,
+ "grad_norm": 5.36343240737915,
+ "learning_rate": 5.200311285433213e-06,
+ "loss": 0.2776,
+ "step": 269
+ },
+ {
+ "epoch": 0.9052808046940486,
+ "grad_norm": 7.757360935211182,
+ "learning_rate": 4.853673085668947e-06,
+ "loss": 0.335,
+ "step": 270
+ },
+ {
+ "epoch": 0.9086336965632859,
+ "grad_norm": 12.774531364440918,
+ "learning_rate": 4.5184002322740785e-06,
+ "loss": 0.3653,
+ "step": 271
+ },
+ {
+ "epoch": 0.911986588432523,
+ "grad_norm": 8.137080192565918,
+ "learning_rate": 4.19457712839652e-06,
+ "loss": 0.392,
+ "step": 272
+ },
+ {
+ "epoch": 0.9153394803017603,
+ "grad_norm": 9.441322326660156,
+ "learning_rate": 3.8822852947709375e-06,
+ "loss": 0.3059,
+ "step": 273
+ },
+ {
+ "epoch": 0.9186923721709975,
+ "grad_norm": 5.269786834716797,
+ "learning_rate": 3.581603349196372e-06,
+ "loss": 0.2275,
+ "step": 274
+ },
+ {
+ "epoch": 0.9220452640402347,
+ "grad_norm": 9.277483940124512,
+ "learning_rate": 3.2926069867446675e-06,
+ "loss": 0.335,
+ "step": 275
+ },
+ {
+ "epoch": 0.9253981559094719,
+ "grad_norm": 9.3627290725708,
+ "learning_rate": 3.0153689607045845e-06,
+ "loss": 0.4954,
+ "step": 276
+ },
+ {
+ "epoch": 0.9287510477787091,
+ "grad_norm": 7.629518508911133,
+ "learning_rate": 2.7499590642665774e-06,
+ "loss": 0.2234,
+ "step": 277
+ },
+ {
+ "epoch": 0.9321039396479464,
+ "grad_norm": 5.571048736572266,
+ "learning_rate": 2.496444112952734e-06,
+ "loss": 0.3485,
+ "step": 278
+ },
+ {
+ "epoch": 0.9354568315171836,
+ "grad_norm": 18.15276336669922,
+ "learning_rate": 2.2548879277963064e-06,
+ "loss": 0.3434,
+ "step": 279
+ },
+ {
+ "epoch": 0.9388097233864208,
+ "grad_norm": 14.836206436157227,
+ "learning_rate": 2.0253513192751373e-06,
+ "loss": 0.3263,
+ "step": 280
+ },
+ {
+ "epoch": 0.942162615255658,
+ "grad_norm": 13.923564910888672,
+ "learning_rate": 1.807892072002898e-06,
+ "loss": 0.4678,
+ "step": 281
+ },
+ {
+ "epoch": 0.9455155071248952,
+ "grad_norm": 15.387225151062012,
+ "learning_rate": 1.6025649301821876e-06,
+ "loss": 0.4785,
+ "step": 282
+ },
+ {
+ "epoch": 0.9488683989941324,
+ "grad_norm": 13.055351257324219,
+ "learning_rate": 1.4094215838229176e-06,
+ "loss": 0.563,
+ "step": 283
+ },
+ {
+ "epoch": 0.9522212908633697,
+ "grad_norm": 17.376829147338867,
+ "learning_rate": 1.2285106557296477e-06,
+ "loss": 0.3891,
+ "step": 284
+ },
+ {
+ "epoch": 0.9555741827326069,
+ "grad_norm": 17.277982711791992,
+ "learning_rate": 1.0598776892610685e-06,
+ "loss": 0.5198,
+ "step": 285
+ },
+ {
+ "epoch": 0.9589270746018441,
+ "grad_norm": 11.45787239074707,
+ "learning_rate": 9.035651368646648e-07,
+ "loss": 0.4912,
+ "step": 286
+ },
+ {
+ "epoch": 0.9622799664710813,
+ "grad_norm": 12.024084091186523,
+ "learning_rate": 7.596123493895991e-07,
+ "loss": 0.4365,
+ "step": 287
+ },
+ {
+ "epoch": 0.9656328583403185,
+ "grad_norm": 15.519173622131348,
+ "learning_rate": 6.280555661802856e-07,
+ "loss": 0.6457,
+ "step": 288
+ },
+ {
+ "epoch": 0.9689857502095558,
+ "grad_norm": 15.936901092529297,
+ "learning_rate": 5.089279059533658e-07,
+ "loss": 0.7402,
+ "step": 289
+ },
+ {
+ "epoch": 0.972338642078793,
+ "grad_norm": 14.855793952941895,
+ "learning_rate": 4.02259358460233e-07,
+ "loss": 0.5426,
+ "step": 290
+ },
+ {
+ "epoch": 0.9756915339480302,
+ "grad_norm": 17.52886199951172,
+ "learning_rate": 3.080767769372939e-07,
+ "loss": 0.5786,
+ "step": 291
+ },
+ {
+ "epoch": 0.9790444258172674,
+ "grad_norm": 14.614376068115234,
+ "learning_rate": 2.2640387134577058e-07,
+ "loss": 0.4134,
+ "step": 292
+ },
+ {
+ "epoch": 0.9823973176865046,
+ "grad_norm": 12.858384132385254,
+ "learning_rate": 1.5726120240288634e-07,
+ "loss": 0.9686,
+ "step": 293
+ },
+ {
+ "epoch": 0.9857502095557418,
+ "grad_norm": 14.443262100219727,
+ "learning_rate": 1.0066617640578368e-07,
+ "loss": 0.5455,
+ "step": 294
+ },
+ {
+ "epoch": 0.989103101424979,
+ "grad_norm": 14.196720123291016,
+ "learning_rate": 5.663304084960186e-08,
+ "loss": 0.4852,
+ "step": 295
+ },
+ {
+ "epoch": 0.9924559932942163,
+ "grad_norm": 13.345819473266602,
+ "learning_rate": 2.5172880840745873e-08,
+ "loss": 0.4879,
+ "step": 296
+ },
+ {
+ "epoch": 0.9958088851634534,
+ "grad_norm": 4.6811842918396,
+ "learning_rate": 6.293616306246586e-09,
+ "loss": 0.2391,
+ "step": 297
+ },
+ {
+ "epoch": 0.9991617770326907,
+ "grad_norm": 7.340142250061035,
+ "learning_rate": 0.0,
+ "loss": 0.4579,
+ "step": 298
  }
  ],
  "logging_steps": 1,
@@ -1694,12 +2107,12 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": false
+ "should_training_stop": true
  },
  "attributes": {}
  }
  },
- "total_flos": 1.6093164856777114e+17,
+ "total_flos": 1.998363176038564e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null