ProgramInNonsense commited on
Commit
8c5a79f
·
verified ·
1 Parent(s): b533f7f

Training in progress, step 2700, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0fc54bb7873cd7766315c895133e6f6b29d30c00b75c7d9fa8dd5ddbd9bdf246
3
  size 205573472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a0f7e66a5cf5e2fe250e8be921f24692b83ef6dc556fb1ad68f814e1c8e95fd
3
  size 205573472
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75e3bcc205bcf6e8e55e1f86e630f98a421ee469fc3ef6e9e0462dad7a513599
3
  size 411372650
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d74b30ee9a6aaf67c9615cf76ae290dd35040db63b258316a8bf7a766bc2ed2
3
  size 411372650
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41cf55dad76711a8b3bb0a84c3270c8fa5a9a1e5d84a6ea9b66d5562ec1761d6
3
  size 14308
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd2c8e97237e376105e407b05be1e33b22c026561b920b8fe9d134eb88ebbcfa
3
  size 14308
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:974af4605b4bf59d5494d851d4737f8534604f3b9bc0e001e663081ee9cce887
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0dfc2af941e2567517229a8f44267f5380076f9a37680e99372f149a8e0c635
3
  size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.7639342546463013,
3
- "best_model_checkpoint": "./output/checkpoint-2550",
4
- "epoch": 0.016104788490444492,
5
  "eval_steps": 150,
6
- "global_step": 2550,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1928,6 +1928,119 @@
1928
  "eval_samples_per_second": 10.974,
1929
  "eval_steps_per_second": 10.974,
1930
  "step": 2550
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1931
  }
1932
  ],
1933
  "logging_steps": 10,
@@ -1947,7 +2060,7 @@
1947
  "attributes": {}
1948
  }
1949
  },
1950
- "total_flos": 2.0413045503167693e+17,
1951
  "train_batch_size": 16,
1952
  "trial_name": null,
1953
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7622952461242676,
3
+ "best_model_checkpoint": "./output/checkpoint-2700",
4
+ "epoch": 0.017052128989882405,
5
  "eval_steps": 150,
6
+ "global_step": 2700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1928
  "eval_samples_per_second": 10.974,
1929
  "eval_steps_per_second": 10.974,
1930
  "step": 2550
1931
+ },
1932
+ {
1933
+ "epoch": 0.016167944523740353,
1934
+ "grad_norm": 11.584785461425781,
1935
+ "learning_rate": 2.7323687334514695e-05,
1936
+ "loss": 0.8422,
1937
+ "step": 2560
1938
+ },
1939
+ {
1940
+ "epoch": 0.016231100557036214,
1941
+ "grad_norm": 9.246831893920898,
1942
+ "learning_rate": 2.71473819165525e-05,
1943
+ "loss": 1.0227,
1944
+ "step": 2570
1945
+ },
1946
+ {
1947
+ "epoch": 0.016294256590332075,
1948
+ "grad_norm": 9.19963264465332,
1949
+ "learning_rate": 2.6971090993338606e-05,
1950
+ "loss": 0.9826,
1951
+ "step": 2580
1952
+ },
1953
+ {
1954
+ "epoch": 0.016357412623627936,
1955
+ "grad_norm": 7.812788009643555,
1956
+ "learning_rate": 2.679482181150238e-05,
1957
+ "loss": 0.8062,
1958
+ "step": 2590
1959
+ },
1960
+ {
1961
+ "epoch": 0.016420568656923797,
1962
+ "grad_norm": 6.75607967376709,
1963
+ "learning_rate": 2.6618581616779483e-05,
1964
+ "loss": 0.8495,
1965
+ "step": 2600
1966
+ },
1967
+ {
1968
+ "epoch": 0.016483724690219657,
1969
+ "grad_norm": 8.447277069091797,
1970
+ "learning_rate": 2.644237765371404e-05,
1971
+ "loss": 1.1002,
1972
+ "step": 2610
1973
+ },
1974
+ {
1975
+ "epoch": 0.016546880723515518,
1976
+ "grad_norm": 9.761106491088867,
1977
+ "learning_rate": 2.626621716536085e-05,
1978
+ "loss": 0.9549,
1979
+ "step": 2620
1980
+ },
1981
+ {
1982
+ "epoch": 0.01661003675681138,
1983
+ "grad_norm": 10.971216201782227,
1984
+ "learning_rate": 2.6090107392987575e-05,
1985
+ "loss": 0.9771,
1986
+ "step": 2630
1987
+ },
1988
+ {
1989
+ "epoch": 0.01667319279010724,
1990
+ "grad_norm": 11.389016151428223,
1991
+ "learning_rate": 2.591405557577721e-05,
1992
+ "loss": 0.9737,
1993
+ "step": 2640
1994
+ },
1995
+ {
1996
+ "epoch": 0.0167363488234031,
1997
+ "grad_norm": 9.189516067504883,
1998
+ "learning_rate": 2.5738068950530398e-05,
1999
+ "loss": 0.9855,
2000
+ "step": 2650
2001
+ },
2002
+ {
2003
+ "epoch": 0.01679950485669896,
2004
+ "grad_norm": 8.623804092407227,
2005
+ "learning_rate": 2.5562154751368014e-05,
2006
+ "loss": 1.0468,
2007
+ "step": 2660
2008
+ },
2009
+ {
2010
+ "epoch": 0.016862660889994822,
2011
+ "grad_norm": 10.484329223632812,
2012
+ "learning_rate": 2.5386320209433798e-05,
2013
+ "loss": 1.0479,
2014
+ "step": 2670
2015
+ },
2016
+ {
2017
+ "epoch": 0.016925816923290683,
2018
+ "grad_norm": 9.45596981048584,
2019
+ "learning_rate": 2.5210572552597046e-05,
2020
+ "loss": 0.9985,
2021
+ "step": 2680
2022
+ },
2023
+ {
2024
+ "epoch": 0.016988972956586544,
2025
+ "grad_norm": 11.783865928649902,
2026
+ "learning_rate": 2.5034919005155583e-05,
2027
+ "loss": 1.0066,
2028
+ "step": 2690
2029
+ },
2030
+ {
2031
+ "epoch": 0.017052128989882405,
2032
+ "grad_norm": 11.132217407226562,
2033
+ "learning_rate": 2.4859366787538754e-05,
2034
+ "loss": 0.8558,
2035
+ "step": 2700
2036
+ },
2037
+ {
2038
+ "epoch": 0.017052128989882405,
2039
+ "eval_loss": 0.7622952461242676,
2040
+ "eval_runtime": 45.0496,
2041
+ "eval_samples_per_second": 11.099,
2042
+ "eval_steps_per_second": 11.099,
2043
+ "step": 2700
2044
  }
2045
  ],
2046
  "logging_steps": 10,
 
2060
  "attributes": {}
2061
  }
2062
  },
2063
+ "total_flos": 2.1658889386082304e+17,
2064
  "train_batch_size": 16,
2065
  "trial_name": null,
2066
  "trial_params": null