neuralwonderland commited on
Commit
dc3bce9
·
verified ·
1 Parent(s): e7cb1ea

Training in progress, step 2700, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f287e10b3519c25fc83edd7acd8cc7cd1c222a41d94fdee3bad4cce6af776567
3
  size 524363632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f099ec01c680e66dee9f2fb8e1abb29ede61140d1e75177ddc7cf251990d3c9
3
  size 524363632
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2fb02b6a02ceadae09c2702b5172cbc44178e9b1056a25f944bb6996f32c4a8
3
- size 1049049378
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:304267e0a422299b1c99a12310feede4451b34db4869367967330e5221af5e8c
3
+ size 1049049442
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65ad7e6d1a2a4a73b6262b8f73328a902c07cad4e904b1d224f6efe38cd6b2de
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1747ed43b50bf4e0cbd7efe79f5150a5e5a84661860c5f1358011e2b61ea5cd0
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a0eec867f87d3a5ffc128abe2e98cc84850220a61f050e675a6109f6d217117
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80f93d246266aa5b09398ee9582ec4f222c7b96eed39032b13a6dea73b1ae8ef
3
  size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 1.2038679122924805,
3
- "best_model_checkpoint": "./output/checkpoint-2400",
4
- "epoch": 0.11422172452407615,
5
  "eval_steps": 150,
6
- "global_step": 2550,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1928,6 +1928,119 @@
1928
  "eval_samples_per_second": 9.668,
1929
  "eval_steps_per_second": 9.668,
1930
  "step": 2550
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1931
  }
1932
  ],
1933
  "logging_steps": 10,
@@ -1947,7 +2060,7 @@
1947
  "attributes": {}
1948
  }
1949
  },
1950
- "total_flos": 3.2997299945472e+17,
1951
  "train_batch_size": 4,
1952
  "trial_name": null,
1953
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.2033374309539795,
3
+ "best_model_checkpoint": "./output/checkpoint-2700",
4
+ "epoch": 0.12094064949608063,
5
  "eval_steps": 150,
6
+ "global_step": 2700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1928
  "eval_samples_per_second": 9.668,
1929
  "eval_steps_per_second": 9.668,
1930
  "step": 2550
1931
+ },
1932
+ {
1933
+ "epoch": 0.11466965285554312,
1934
+ "grad_norm": 6.124125003814697,
1935
+ "learning_rate": 3.7259573637974587e-06,
1936
+ "loss": 1.0568,
1937
+ "step": 2560
1938
+ },
1939
+ {
1940
+ "epoch": 0.11511758118701008,
1941
+ "grad_norm": 4.3748602867126465,
1942
+ "learning_rate": 3.701915715893523e-06,
1943
+ "loss": 1.4124,
1944
+ "step": 2570
1945
+ },
1946
+ {
1947
+ "epoch": 0.11556550951847705,
1948
+ "grad_norm": 7.382061004638672,
1949
+ "learning_rate": 3.677876044546174e-06,
1950
+ "loss": 1.1357,
1951
+ "step": 2580
1952
+ },
1953
+ {
1954
+ "epoch": 0.11601343784994401,
1955
+ "grad_norm": 4.097735404968262,
1956
+ "learning_rate": 3.6538393379321427e-06,
1957
+ "loss": 1.0885,
1958
+ "step": 2590
1959
+ },
1960
+ {
1961
+ "epoch": 0.11646136618141098,
1962
+ "grad_norm": 5.039736270904541,
1963
+ "learning_rate": 3.6298065841062934e-06,
1964
+ "loss": 1.107,
1965
+ "step": 2600
1966
+ },
1967
+ {
1968
+ "epoch": 0.11690929451287795,
1969
+ "grad_norm": 4.383152008056641,
1970
+ "learning_rate": 3.6057787709610064e-06,
1971
+ "loss": 1.1695,
1972
+ "step": 2610
1973
+ },
1974
+ {
1975
+ "epoch": 0.11735722284434491,
1976
+ "grad_norm": 4.900496482849121,
1977
+ "learning_rate": 3.5817568861855708e-06,
1978
+ "loss": 1.1107,
1979
+ "step": 2620
1980
+ },
1981
+ {
1982
+ "epoch": 0.11780515117581188,
1983
+ "grad_norm": 6.267992973327637,
1984
+ "learning_rate": 3.557741917225579e-06,
1985
+ "loss": 1.1896,
1986
+ "step": 2630
1987
+ },
1988
+ {
1989
+ "epoch": 0.11825307950727884,
1990
+ "grad_norm": 3.8060693740844727,
1991
+ "learning_rate": 3.5337348512423468e-06,
1992
+ "loss": 1.2245,
1993
+ "step": 2640
1994
+ },
1995
+ {
1996
+ "epoch": 0.1187010078387458,
1997
+ "grad_norm": 3.5068161487579346,
1998
+ "learning_rate": 3.5097366750723275e-06,
1999
+ "loss": 1.0629,
2000
+ "step": 2650
2001
+ },
2002
+ {
2003
+ "epoch": 0.11914893617021277,
2004
+ "grad_norm": 4.6765360832214355,
2005
+ "learning_rate": 3.4857483751865478e-06,
2006
+ "loss": 1.1783,
2007
+ "step": 2660
2008
+ },
2009
+ {
2010
+ "epoch": 0.11959686450167974,
2011
+ "grad_norm": 7.864380836486816,
2012
+ "learning_rate": 3.461770937650064e-06,
2013
+ "loss": 1.0683,
2014
+ "step": 2670
2015
+ },
2016
+ {
2017
+ "epoch": 0.1200447928331467,
2018
+ "grad_norm": 3.138843297958374,
2019
+ "learning_rate": 3.437805348081416e-06,
2020
+ "loss": 0.9814,
2021
+ "step": 2680
2022
+ },
2023
+ {
2024
+ "epoch": 0.12049272116461367,
2025
+ "grad_norm": 5.134324550628662,
2026
+ "learning_rate": 3.413852591612125e-06,
2027
+ "loss": 1.1631,
2028
+ "step": 2690
2029
+ },
2030
+ {
2031
+ "epoch": 0.12094064949608063,
2032
+ "grad_norm": 4.688596725463867,
2033
+ "learning_rate": 3.389913652846194e-06,
2034
+ "loss": 1.0644,
2035
+ "step": 2700
2036
+ },
2037
+ {
2038
+ "epoch": 0.12094064949608063,
2039
+ "eval_loss": 1.2033374309539795,
2040
+ "eval_runtime": 51.6099,
2041
+ "eval_samples_per_second": 9.688,
2042
+ "eval_steps_per_second": 9.688,
2043
+ "step": 2700
2044
  }
2045
  ],
2046
  "logging_steps": 10,
 
2060
  "attributes": {}
2061
  }
2062
  },
2063
+ "total_flos": 3.47676625787904e+17,
2064
  "train_batch_size": 4,
2065
  "trial_name": null,
2066
  "trial_params": null