VERSIL91 committed on
Commit 06ee4f0 · verified · 1 Parent(s): 506e327

End of training
README.md CHANGED
@@ -6,7 +6,7 @@ tags:
 - axolotl
 - generated_from_trainer
 model-index:
-- name: 3d9c9a86-41a1-463a-a26d-1a82887c0da8
+- name: 20b2db71-bebe-45cf-98f2-4bbd5debff43
   results: []
 ---
 
@@ -18,6 +18,12 @@ should probably proofread and complete it, then remove this comment. -->
 
 axolotl version: `0.4.1`
 ```yaml
+accelerate_config:
+  dynamo_backend: inductor
+  mixed_precision: bf16
+  num_machines: 1
+  num_processes: auto
+  use_cpu: false
 adapter: lora
 base_model: unsloth/SmolLM2-360M
 bf16: auto
@@ -40,40 +46,37 @@ datasets:
 debug: null
 deepspeed: null
 device_map: auto
-do_eval: true
 early_stopping_patience: null
-eval_batch_size: 2
 eval_max_new_tokens: 128
-eval_steps: null
 eval_table_size: null
 evals_per_epoch: 4
-flash_attention: true
-fp16: false
+flash_attention: false
+fp16: null
 fsdp: null
 fsdp_config: null
-gradient_accumulation_steps: 4
-gradient_checkpointing: false
-group_by_length: true
+gradient_accumulation_steps: 16
+gradient_checkpointing: true
+group_by_length: false
 hub_model_id: null
 hub_repo: null
 hub_strategy: checkpoint
 hub_token: null
 learning_rate: 0.0001
-load_in_4bit: false
-load_in_8bit: false
 local_rank: null
-logging_steps: 5
+logging_steps: 1
 lora_alpha: 16
 lora_dropout: 0.05
 lora_fan_in_fan_out: null
 lora_model_dir: null
 lora_r: 8
 lora_target_linear: true
+lora_target_modules:
+- q_proj
+- v_proj
 lr_scheduler: cosine
-max_grad_norm: 1.0
 max_memory:
-  0: 75GB
-max_steps: 200
+  0: 70GiB
+max_steps: 100
 micro_batch_size: 2
 mlflow_experiment_name: /tmp/a87830592f0aef9a_train_data.json
 model_type: AutoModelForCausalLM
@@ -81,24 +84,27 @@ num_epochs: 1
 optimizer: adamw_bnb_8bit
 output_dir: miner_id_24
 pad_to_sequence_len: true
+quantization_config:
+  llm_int8_enable_fp32_cpu_offload: true
+  load_in_8bit: true
 resume_from_checkpoint: null
 s2_attention: null
 sample_packing: false
-save_steps: null
-saves_per_epoch: null
-sequence_len: 1024
+saves_per_epoch: 4
+sequence_len: 512
 strict: false
 tf32: false
 tokenizer_type: AutoTokenizer
+torch_compile: true
 train_on_inputs: false
 trust_remote_code: true
 val_set_size: 0.05
-wandb_entity: sn56-miner
-wandb_mode: disabled
-wandb_name: sn56a5/d1f354f0
-wandb_project: god
-wandb_run: 8g06
-wandb_runid: sn56a5/d1f354f0
+wandb_entity: null
+wandb_mode: online
+wandb_name: 20b2db71-bebe-45cf-98f2-4bbd5debff43
+wandb_project: Gradients-On-Demand
+wandb_run: your_name
+wandb_runid: 20b2db71-bebe-45cf-98f2-4bbd5debff43
 warmup_steps: 10
 weight_decay: 0.0
 xformers_attention: null
@@ -107,11 +113,11 @@ xformers_attention: null
 
 </details><br>
 
-# 3d9c9a86-41a1-463a-a26d-1a82887c0da8
+# 20b2db71-bebe-45cf-98f2-4bbd5debff43
 
 This model is a fine-tuned version of [unsloth/SmolLM2-360M](https://huggingface.co/unsloth/SmolLM2-360M) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.2026
+- Loss: nan
 
 ## Model description
 
@@ -134,25 +140,22 @@ The following hyperparameters were used during training:
 - train_batch_size: 2
 - eval_batch_size: 2
 - seed: 42
-- distributed_type: multi-GPU
-- num_devices: 4
-- gradient_accumulation_steps: 4
+- gradient_accumulation_steps: 16
 - total_train_batch_size: 32
-- total_eval_batch_size: 8
 - optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_steps: 10
-- training_steps: 200
+- training_steps: 100
 
 ### Training results
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
-| No log | 0.0003 | 1 | 0.5289 |
-| 0.3662 | 0.0167 | 50 | 0.3411 |
-| 0.2267 | 0.0335 | 100 | 0.2339 |
-| 0.1912 | 0.0502 | 150 | 0.2059 |
-| 0.1909 | 0.0669 | 200 | 0.2026 |
+| 32.0444 | 0.0003 | 1 | nan |
+| 0.0 | 0.0084 | 25 | nan |
+| 0.0 | 0.0167 | 50 | nan |
+| 0.0 | 0.0251 | 75 | nan |
+| 0.0 | 0.0335 | 100 | nan |
 
 
 ### Framework versions
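The README above records a LoRA adapter trained on top of unsloth/SmolLM2-360M. As a minimal sketch of how such an adapter is typically attached with peft (assuming this repo holds a standard peft checkpoint; the repo id below is assembled from the committer name and the model name, which is an assumption):

```python
# Minimal sketch: attach this commit's LoRA adapter to the base model with peft.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained("unsloth/SmolLM2-360M")
tokenizer = AutoTokenizer.from_pretrained("unsloth/SmolLM2-360M")

# "VERSIL91/20b2db71-bebe-45cf-98f2-4bbd5debff43" is an assumed repo id built
# from the committer name and the README's model name.
model = PeftModel.from_pretrained(base, "VERSIL91/20b2db71-bebe-45cf-98f2-4bbd5debff43")
```

Note that the effective batch size in the README is consistent: micro_batch_size 2 × gradient_accumulation_steps 16 = total_train_batch_size 32.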
adapter_config.json CHANGED
@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "k_proj",
+    "gate_proj",
     "q_proj",
-    "v_proj",
+    "up_proj",
     "down_proj",
-    "gate_proj",
     "o_proj",
-    "up_proj"
+    "k_proj",
+    "v_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c8e0fb66e7a88caec0a08aeb69dbec5cc8d7ee535d28fe3f36e8b869cabd6384
+oid sha256:5f1a04cf82e6d853f68c54944a28d8e79cc282e58f6aed24deccf8d305b4f627
 size 17528138
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:33301e5b49f7775f53a83a53756b9192184d0948a701ff258a1b6201ff69e090
+oid sha256:3ef249b0f207339e330f5bb95b2f74751d4ce2da3dcb80ce68a26897590a0919
 size 17425352
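Both weight files changed only in their LFS oid; the byte sizes are unchanged. A small sketch for checking a downloaded file against the sha256 recorded in its pointer:

```python
# Sketch: verify a downloaded LFS object against the oid in its pointer file.
import hashlib

def sha256_of(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
            h.update(chunk)
    return h.hexdigest()

# oid from the adapter_model.safetensors pointer above
expected = "3ef249b0f207339e330f5bb95b2f74751d4ce2da3dcb80ce68a26897590a0919"
print(sha256_of("adapter_model.safetensors") == expected)
```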
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "k_proj",
+    "gate_proj",
     "q_proj",
-    "v_proj",
+    "up_proj",
     "down_proj",
-    "gate_proj",
     "o_proj",
-    "up_proj"
+    "k_proj",
+    "v_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:33301e5b49f7775f53a83a53756b9192184d0948a701ff258a1b6201ff69e090
+oid sha256:3ef249b0f207339e330f5bb95b2f74751d4ce2da3dcb80ce68a26897590a0919
 size 17425352
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9df1216f40cd7740e0fc5dd3b6cd38bcbe6759bb03764ede2f4d27879faf0598
+oid sha256:a2877331f933cd0d3eda6c6f20f3203f32ec583e75c93d59882ed85052b3507b
 size 10251668
last-checkpoint/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4c6893fc4a9ed236abb30b22cb769913941a10e57b959801e5127eb964077c8
+size 14244
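rng_state.pth is new in this checkpoint; it holds the RNG snapshots used for exact resumption. A sketch for inspecting it, assuming it is an ordinary PyTorch pickle as transformers saves it:

```python
# Sketch: peek at the newly added RNG checkpoint (assumed to be a pickled dict).
import torch

rng = torch.load("last-checkpoint/rng_state.pth", weights_only=False)
print(type(rng), list(rng) if isinstance(rng, dict) else rng)
```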
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d2d754412c61116546142914503e7369d0cc35d3c380a07e5218f595d76b6d96
+oid sha256:49d60a69e2379be2053e816cbaff31e6c931b5922dd86c71c9eaf473299cbf62
 size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,339 +1,759 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.06691201070592172,
-  "eval_steps": 50,
-  "global_step": 200,
+  "epoch": 0.03345600535296086,
+  "eval_steps": 25,
+  "global_step": 100,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
-    {"epoch": 0.00033456005352960856, "eval_loss": 0.528853178024292, "eval_runtime": 39.5233, "eval_samples_per_second": 127.393, "eval_steps_per_second": 15.94, "step": 1},
-    {"epoch": 0.0016728002676480427, "grad_norm": 0.0893344134092331, "learning_rate": 5e-05, "loss": 0.5191, "step": 5},
-    {"epoch": 0.0033456005352960855, "grad_norm": 0.12134993076324463, "learning_rate": 0.0001, "loss": 0.584, "step": 10},
-    {"epoch": 0.005018400802944129, "grad_norm": 0.08858965337276459, "learning_rate": 9.98292246503335e-05, "loss": 0.5242, "step": 15},
-    {"epoch": 0.006691201070592171, "grad_norm": 0.09778374433517456, "learning_rate": 9.931806517013612e-05, "loss": 0.5202, "step": 20},
-    {"epoch": 0.008364001338240215, "grad_norm": 0.10649651288986206, "learning_rate": 9.847001329696653e-05, "loss": 0.4911, "step": 25},
-    {"epoch": 0.010036801605888258, "grad_norm": 0.06970233470201492, "learning_rate": 9.729086208503174e-05, "loss": 0.3985, "step": 30},
-    {"epoch": 0.0117096018735363, "grad_norm": 0.0955878272652626, "learning_rate": 9.578866633275288e-05, "loss": 0.43, "step": 35},
-    {"epoch": 0.013382402141184342, "grad_norm": 0.060985565185546875, "learning_rate": 9.397368756032445e-05, "loss": 0.4198, "step": 40},
-    {"epoch": 0.015055202408832385, "grad_norm": 0.0769026130437851, "learning_rate": 9.185832391312644e-05, "loss": 0.4067, "step": 45},
-    {"epoch": 0.01672800267648043, "grad_norm": 0.1008349359035492, "learning_rate": 8.945702546981969e-05, "loss": 0.3662, "step": 50},
-    {"epoch": 0.01672800267648043, "eval_loss": 0.34106191992759705, "eval_runtime": 39.432, "eval_samples_per_second": 127.688, "eval_steps_per_second": 15.977, "step": 50},
-    {"epoch": 0.01840080294412847, "grad_norm": 0.07255814969539642, "learning_rate": 8.678619553365659e-05, "loss": 0.3496, "step": 55},
-    {"epoch": 0.020073603211776515, "grad_norm": 0.10015455633401871, "learning_rate": 8.386407858128706e-05, "loss": 0.328, "step": 60},
-    {"epoch": 0.021746403479424557, "grad_norm": 0.07114718109369278, "learning_rate": 8.07106356344834e-05, "loss": 0.3169, "step": 65},
-    {"epoch": 0.0234192037470726, "grad_norm": 0.07913073152303696, "learning_rate": 7.734740790612136e-05, "loss": 0.3134, "step": 70},
-    {"epoch": 0.025092004014720642, "grad_norm": 0.1750458925962448, "learning_rate": 7.379736965185368e-05, "loss": 0.2852, "step": 75},
-    {"epoch": 0.026764804282368684, "grad_norm": 0.061932601034641266, "learning_rate": 7.008477123264848e-05, "loss": 0.2737, "step": 80},
-    {"epoch": 0.02843760455001673, "grad_norm": 0.06881576776504517, "learning_rate": 6.623497346023418e-05, "loss": 0.2502, "step": 85},
-    {"epoch": 0.03011040481766477, "grad_norm": 0.062322817742824554, "learning_rate": 6.227427435703997e-05, "loss": 0.2659, "step": 90},
-    {"epoch": 0.031783205085312814, "grad_norm": 0.06230627000331879, "learning_rate": 5.8229729514036705e-05, "loss": 0.2645, "step": 95},
-    {"epoch": 0.03345600535296086, "grad_norm": 0.11283061653375626, "learning_rate": 5.4128967273616625e-05, "loss": 0.2267, "step": 100},
-    {"epoch": 0.03345600535296086, "eval_loss": 0.23386509716510773, "eval_runtime": 39.348, "eval_samples_per_second": 127.961, "eval_steps_per_second": 16.011, "step": 100},
-    {"epoch": 0.0351288056206089, "grad_norm": 0.0672779530286789, "learning_rate": 5e-05, "loss": 0.2497, "step": 105},
-    {"epoch": 0.03680160588825694, "grad_norm": 0.06097684055566788, "learning_rate": 4.5871032726383386e-05, "loss": 0.2516, "step": 110},
-    {"epoch": 0.038474406155904986, "grad_norm": 0.06827884912490845, "learning_rate": 4.17702704859633e-05, "loss": 0.2279, "step": 115},
-    {"epoch": 0.04014720642355303, "grad_norm": 0.0606272853910923, "learning_rate": 3.772572564296005e-05, "loss": 0.2541, "step": 120},
-    {"epoch": 0.04182000669120107, "grad_norm": 0.14288191497325897, "learning_rate": 3.3765026539765834e-05, "loss": 0.1806, "step": 125},
-    {"epoch": 0.04349280695884911, "grad_norm": 0.052643969655036926, "learning_rate": 2.991522876735154e-05, "loss": 0.246, "step": 130},
-    {"epoch": 0.04516560722649716, "grad_norm": 0.06944818049669266, "learning_rate": 2.6202630348146324e-05, "loss": 0.2419, "step": 135},
-    {"epoch": 0.0468384074941452, "grad_norm": 0.06182454898953438, "learning_rate": 2.2652592093878666e-05, "loss": 0.2159, "step": 140},
-    {"epoch": 0.04851120776179324, "grad_norm": 0.06247089058160782, "learning_rate": 1.928936436551661e-05, "loss": 0.2275, "step": 145},
-    {"epoch": 0.050184008029441285, "grad_norm": 0.1229231059551239, "learning_rate": 1.6135921418712956e-05, "loss": 0.1912, "step": 150},
-    {"epoch": 0.050184008029441285, "eval_loss": 0.2059282809495926, "eval_runtime": 39.4366, "eval_samples_per_second": 127.673, "eval_steps_per_second": 15.975, "step": 150},
-    {"epoch": 0.05185680829708933, "grad_norm": 0.06170298531651497, "learning_rate": 1.3213804466343421e-05, "loss": 0.2107, "step": 155},
-    {"epoch": 0.05352960856473737, "grad_norm": 0.06112481653690338, "learning_rate": 1.0542974530180327e-05, "loss": 0.2021, "step": 160},
-    {"epoch": 0.05520240883238541, "grad_norm": 0.060423221439123154, "learning_rate": 8.141676086873572e-06, "loss": 0.2191, "step": 165},
-    {"epoch": 0.05687520910003346, "grad_norm": 0.05647290125489235, "learning_rate": 6.026312439675552e-06, "loss": 0.2134, "step": 170},
-    {"epoch": 0.0585480093676815, "grad_norm": 0.10841598361730576, "learning_rate": 4.2113336672471245e-06, "loss": 0.212, "step": 175},
-    {"epoch": 0.06022080963532954, "grad_norm": 0.05532608553767204, "learning_rate": 2.7091379149682685e-06, "loss": 0.2154, "step": 180},
-    {"epoch": 0.061893609902977584, "grad_norm": 0.06235940009355545, "learning_rate": 1.5299867030334814e-06, "loss": 0.1993, "step": 185},
-    {"epoch": 0.06356641017062563, "grad_norm": 0.06464989483356476, "learning_rate": 6.819348298638839e-07, "loss": 0.1926, "step": 190},
-    {"epoch": 0.06523921043827367, "grad_norm": 0.06490304321050644, "learning_rate": 1.7077534966650766e-07, "loss": 0.2289, "step": 195},
-    {"epoch": 0.06691201070592172, "grad_norm": 0.10941293835639954, "learning_rate": 0.0, "loss": 0.1909, "step": 200},
-    {"epoch": 0.06691201070592172, "eval_loss": 0.20260320603847504, "eval_runtime": 39.2566, "eval_samples_per_second": 128.259, "eval_steps_per_second": 16.048, "step": 200}
+    {"epoch": 0.00033456005352960856, "grad_norm": NaN, "learning_rate": 1e-05, "loss": 32.0444, "step": 1},
+    {"epoch": 0.00033456005352960856, "eval_loss": NaN, "eval_runtime": 225.0524, "eval_samples_per_second": 22.373, "eval_steps_per_second": 11.189, "step": 1},
+    {"epoch": 0.0006691201070592171, "grad_norm": NaN, "learning_rate": 2e-05, "loss": 0.0, "step": 2},
+    {"epoch": 0.0010036801605888257, "grad_norm": NaN, "learning_rate": 3e-05, "loss": 0.0, "step": 3},
+    {"epoch": 0.0013382402141184342, "grad_norm": NaN, "learning_rate": 4e-05, "loss": 0.0, "step": 4},
+    {"epoch": 0.0016728002676480427, "grad_norm": NaN, "learning_rate": 5e-05, "loss": 0.0, "step": 5},
+    {"epoch": 0.0020073603211776514, "grad_norm": NaN, "learning_rate": 6e-05, "loss": 0.0, "step": 6},
+    {"epoch": 0.00234192037470726, "grad_norm": NaN, "learning_rate": 7e-05, "loss": 0.0, "step": 7},
+    {"epoch": 0.0026764804282368685, "grad_norm": NaN, "learning_rate": 8e-05, "loss": 0.0, "step": 8},
+    {"epoch": 0.003011040481766477, "grad_norm": NaN, "learning_rate": 9e-05, "loss": 0.0, "step": 9},
+    {"epoch": 0.0033456005352960855, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 0.0, "step": 10},
+    {"epoch": 0.0036801605888256944, "grad_norm": NaN, "learning_rate": 9.99695413509548e-05, "loss": 0.0, "step": 11},
+    {"epoch": 0.004014720642355303, "grad_norm": NaN, "learning_rate": 9.987820251299122e-05, "loss": 0.0, "step": 12},
+    {"epoch": 0.004349280695884911, "grad_norm": NaN, "learning_rate": 9.972609476841367e-05, "loss": 0.0, "step": 13},
+    {"epoch": 0.00468384074941452, "grad_norm": NaN, "learning_rate": 9.951340343707852e-05, "loss": 0.0, "step": 14},
+    {"epoch": 0.005018400802944129, "grad_norm": NaN, "learning_rate": 9.924038765061042e-05, "loss": 0.0, "step": 15},
+    {"epoch": 0.005352960856473737, "grad_norm": NaN, "learning_rate": 9.890738003669029e-05, "loss": 0.0, "step": 16},
+    {"epoch": 0.005687520910003346, "grad_norm": NaN, "learning_rate": 9.851478631379982e-05, "loss": 0.0, "step": 17},
+    {"epoch": 0.006022080963532954, "grad_norm": NaN, "learning_rate": 9.806308479691595e-05, "loss": 0.0, "step": 18},
+    {"epoch": 0.006356641017062563, "grad_norm": NaN, "learning_rate": 9.755282581475769e-05, "loss": 0.0, "step": 19},
+    {"epoch": 0.006691201070592171, "grad_norm": NaN, "learning_rate": 9.698463103929542e-05, "loss": 0.0, "step": 20},
+    {"epoch": 0.00702576112412178, "grad_norm": NaN, "learning_rate": 9.635919272833938e-05, "loss": 0.0, "step": 21},
+    {"epoch": 0.007360321177651389, "grad_norm": NaN, "learning_rate": 9.567727288213005e-05, "loss": 0.0, "step": 22},
+    {"epoch": 0.007694881231180997, "grad_norm": NaN, "learning_rate": 9.493970231495835e-05, "loss": 0.0, "step": 23},
+    {"epoch": 0.008029441284710606, "grad_norm": NaN, "learning_rate": 9.414737964294636e-05, "loss": 0.0, "step": 24},
+    {"epoch": 0.008364001338240215, "grad_norm": NaN, "learning_rate": 9.330127018922194e-05, "loss": 0.0, "step": 25},
+    {"epoch": 0.008364001338240215, "eval_loss": NaN, "eval_runtime": 127.579, "eval_samples_per_second": 39.466, "eval_steps_per_second": 19.737, "step": 25},
+    {"epoch": 0.008698561391769822, "grad_norm": NaN, "learning_rate": 9.24024048078213e-05, "loss": 0.0, "step": 26},
+    {"epoch": 0.009033121445299431, "grad_norm": NaN, "learning_rate": 9.145187862775209e-05, "loss": 0.0, "step": 27},
+    {"epoch": 0.00936768149882904, "grad_norm": NaN, "learning_rate": 9.045084971874738e-05, "loss": 0.0, "step": 28},
+    {"epoch": 0.009702241552358649, "grad_norm": NaN, "learning_rate": 8.940053768033609e-05, "loss": 0.0, "step": 29},
+    {"epoch": 0.010036801605888258, "grad_norm": NaN, "learning_rate": 8.83022221559489e-05, "loss": 0.0, "step": 30},
+    {"epoch": 0.010371361659417865, "grad_norm": NaN, "learning_rate": 8.715724127386972e-05, "loss": 0.0, "step": 31},
+    {"epoch": 0.010705921712947474, "grad_norm": NaN, "learning_rate": 8.596699001693255e-05, "loss": 0.0, "step": 32},
+    {"epoch": 0.011040481766477083, "grad_norm": NaN, "learning_rate": 8.473291852294987e-05, "loss": 0.0, "step": 33},
+    {"epoch": 0.011375041820006692, "grad_norm": NaN, "learning_rate": 8.345653031794292e-05, "loss": 0.0, "step": 34},
+    {"epoch": 0.0117096018735363, "grad_norm": NaN, "learning_rate": 8.213938048432697e-05, "loss": 0.0, "step": 35},
+    {"epoch": 0.012044161927065908, "grad_norm": NaN, "learning_rate": 8.07830737662829e-05, "loss": 0.0, "step": 36},
+    {"epoch": 0.012378721980595517, "grad_norm": NaN, "learning_rate": 7.938926261462366e-05, "loss": 0.0, "step": 37},
+    {"epoch": 0.012713282034125126, "grad_norm": NaN, "learning_rate": 7.795964517353735e-05, "loss": 0.0, "step": 38},
+    {"epoch": 0.013047842087654735, "grad_norm": NaN, "learning_rate": 7.649596321166024e-05, "loss": 0.0, "step": 39},
+    {"epoch": 0.013382402141184342, "grad_norm": NaN, "learning_rate": 7.500000000000001e-05, "loss": 0.0, "step": 40},
+    {"epoch": 0.01371696219471395, "grad_norm": NaN, "learning_rate": 7.347357813929454e-05, "loss": 0.0, "step": 41},
+    {"epoch": 0.01405152224824356, "grad_norm": NaN, "learning_rate": 7.191855733945387e-05, "loss": 0.0, "step": 42},
+    {"epoch": 0.014386082301773169, "grad_norm": NaN, "learning_rate": 7.033683215379002e-05, "loss": 0.0, "step": 43},
+    {"epoch": 0.014720642355302778, "grad_norm": NaN, "learning_rate": 6.873032967079561e-05, "loss": 0.0, "step": 44},
+    {"epoch": 0.015055202408832385, "grad_norm": NaN, "learning_rate": 6.710100716628344e-05, "loss": 0.0, "step": 45},
+    {"epoch": 0.015389762462361994, "grad_norm": NaN, "learning_rate": 6.545084971874738e-05, "loss": 0.0, "step": 46},
+    {"epoch": 0.015724322515891603, "grad_norm": NaN, "learning_rate": 6.378186779084995e-05, "loss": 0.0, "step": 47},
+    {"epoch": 0.01605888256942121, "grad_norm": NaN, "learning_rate": 6.209609477998338e-05, "loss": 0.0, "step": 48},
+    {"epoch": 0.01639344262295082, "grad_norm": NaN, "learning_rate": 6.0395584540887963e-05, "loss": 0.0, "step": 49},
+    {"epoch": 0.01672800267648043, "grad_norm": NaN, "learning_rate": 5.868240888334653e-05, "loss": 0.0, "step": 50},
+    {"epoch": 0.01672800267648043, "eval_loss": NaN, "eval_runtime": 65.6977, "eval_samples_per_second": 76.639, "eval_steps_per_second": 38.327, "step": 50},
+    {"epoch": 0.01706256273001004, "grad_norm": NaN, "learning_rate": 5.695865504800327e-05, "loss": 0.0, "step": 51},
+    {"epoch": 0.017397122783539644, "grad_norm": NaN, "learning_rate": 5.522642316338268e-05, "loss": 0.0, "step": 52},
+    {"epoch": 0.017731682837069253, "grad_norm": NaN, "learning_rate": 5.348782368720626e-05, "loss": 0.0, "step": 53},
+    {"epoch": 0.018066242890598862, "grad_norm": NaN, "learning_rate": 5.174497483512506e-05, "loss": 0.0, "step": 54},
+    {"epoch": 0.01840080294412847, "grad_norm": NaN, "learning_rate": 5e-05, "loss": 0.0, "step": 55},
+    {"epoch": 0.01873536299765808, "grad_norm": NaN, "learning_rate": 4.825502516487497e-05, "loss": 0.0, "step": 56},
+    {"epoch": 0.01906992305118769, "grad_norm": NaN, "learning_rate": 4.6512176312793736e-05, "loss": 0.0, "step": 57},
+    {"epoch": 0.019404483104717297, "grad_norm": NaN, "learning_rate": 4.477357683661734e-05, "loss": 0.0, "step": 58},
+    {"epoch": 0.019739043158246906, "grad_norm": NaN, "learning_rate": 4.3041344951996746e-05, "loss": 0.0, "step": 59},
+    {"epoch": 0.020073603211776515, "grad_norm": NaN, "learning_rate": 4.131759111665349e-05, "loss": 0.0, "step": 60},
+    {"epoch": 0.02040816326530612, "grad_norm": NaN, "learning_rate": 3.960441545911204e-05, "loss": 0.0, "step": 61},
+    {"epoch": 0.02074272331883573, "grad_norm": NaN, "learning_rate": 3.790390522001662e-05, "loss": 0.0, "step": 62},
+    {"epoch": 0.02107728337236534, "grad_norm": NaN, "learning_rate": 3.6218132209150045e-05, "loss": 0.0, "step": 63},
+    {"epoch": 0.021411843425894948, "grad_norm": NaN, "learning_rate": 3.4549150281252636e-05, "loss": 0.0, "step": 64},
+    {"epoch": 0.021746403479424557, "grad_norm": NaN, "learning_rate": 3.289899283371657e-05, "loss": 0.0, "step": 65},
+    {"epoch": 0.022080963532954166, "grad_norm": NaN, "learning_rate": 3.12696703292044e-05, "loss": 0.0, "step": 66},
+    {"epoch": 0.022415523586483774, "grad_norm": NaN, "learning_rate": 2.9663167846209998e-05, "loss": 0.0, "step": 67},
+    {"epoch": 0.022750083640013383, "grad_norm": NaN, "learning_rate": 2.8081442660546125e-05, "loss": 0.0, "step": 68},
+    {"epoch": 0.023084643693542992, "grad_norm": NaN, "learning_rate": 2.6526421860705473e-05, "loss": 0.0, "step": 69},
+    {"epoch": 0.0234192037470726, "grad_norm": NaN, "learning_rate": 2.500000000000001e-05, "loss": 0.0, "step": 70},
+    {"epoch": 0.023753763800602207, "grad_norm": NaN, "learning_rate": 2.350403678833976e-05, "loss": 0.0, "step": 71},
+    {"epoch": 0.024088323854131816, "grad_norm": NaN, "learning_rate": 2.2040354826462668e-05, "loss": 0.0, "step": 72},
+    {"epoch": 0.024422883907661425, "grad_norm": NaN, "learning_rate": 2.061073738537635e-05, "loss": 0.0, "step": 73},
+    {"epoch": 0.024757443961191034, "grad_norm": NaN, "learning_rate": 1.9216926233717085e-05, "loss": 0.0, "step": 74},
+    {"epoch": 0.025092004014720642, "grad_norm": NaN, "learning_rate": 1.7860619515673033e-05, "loss": 0.0, "step": 75},
+    {"epoch": 0.025092004014720642, "eval_loss": NaN, "eval_runtime": 74.8837, "eval_samples_per_second": 67.238, "eval_steps_per_second": 33.625, "step": 75},
+    {"epoch": 0.02542656406825025, "grad_norm": NaN, "learning_rate": 1.6543469682057106e-05, "loss": 0.0, "step": 76},
+    {"epoch": 0.02576112412177986, "grad_norm": NaN, "learning_rate": 1.526708147705013e-05, "loss": 0.0, "step": 77},
+    {"epoch": 0.02609568417530947, "grad_norm": NaN, "learning_rate": 1.4033009983067452e-05, "loss": 0.0, "step": 78},
+    {"epoch": 0.026430244228839078, "grad_norm": NaN, "learning_rate": 1.2842758726130283e-05, "loss": 0.0, "step": 79},
+    {"epoch": 0.026764804282368684, "grad_norm": NaN, "learning_rate": 1.1697777844051105e-05, "loss": 0.0, "step": 80},
+    {"epoch": 0.027099364335898293, "grad_norm": NaN, "learning_rate": 1.0599462319663905e-05, "loss": 0.0, "step": 81},
+    {"epoch": 0.0274339243894279, "grad_norm": NaN, "learning_rate": 9.549150281252633e-06, "loss": 0.0, "step": 82},
+    {"epoch": 0.02776848444295751, "grad_norm": NaN, "learning_rate": 8.548121372247918e-06, "loss": 0.0, "step": 83},
+    {"epoch": 0.02810304449648712, "grad_norm": NaN, "learning_rate": 7.597595192178702e-06, "loss": 0.0, "step": 84},
+    {"epoch": 0.02843760455001673, "grad_norm": NaN, "learning_rate": 6.698729810778065e-06, "loss": 0.0, "step": 85},
+    {"epoch": 0.028772164603546337, "grad_norm": NaN, "learning_rate": 5.852620357053651e-06, "loss": 0.0, "step": 86},
+    {"epoch": 0.029106724657075946, "grad_norm": NaN, "learning_rate": 5.060297685041659e-06, "loss": 0.0, "step": 87},
+    {"epoch": 0.029441284710605555, "grad_norm": NaN, "learning_rate": 4.322727117869951e-06, "loss": 0.0, "step": 88},
+    {"epoch": 0.02977584476413516, "grad_norm": NaN, "learning_rate": 3.6408072716606346e-06, "loss": 0.0, "step": 89},
+    {"epoch": 0.03011040481766477, "grad_norm": NaN, "learning_rate": 3.0153689607045845e-06, "loss": 0.0, "step": 90},
+    {"epoch": 0.03044496487119438, "grad_norm": NaN, "learning_rate": 2.4471741852423237e-06, "loss": 0.0, "step": 91},
+    {"epoch": 0.030779524924723987, "grad_norm": NaN, "learning_rate": 1.9369152030840556e-06, "loss": 0.0, "step": 92},
+    {"epoch": 0.031114084978253596, "grad_norm": NaN, "learning_rate": 1.4852136862001764e-06, "loss": 0.0, "step": 93},
+    {"epoch": 0.031448645031783205, "grad_norm": NaN, "learning_rate": 1.0926199633097157e-06, "loss": 0.0, "step": 94},
+    {"epoch": 0.031783205085312814, "grad_norm": NaN, "learning_rate": 7.596123493895991e-07, "loss": 0.0, "step": 95},
+    {"epoch": 0.03211776513884242, "grad_norm": NaN, "learning_rate": 4.865965629214819e-07, "loss": 0.0, "step": 96},
+    {"epoch": 0.03245232519237203, "grad_norm": NaN, "learning_rate": 2.7390523158633554e-07, "loss": 0.0, "step": 97},
+    {"epoch": 0.03278688524590164, "grad_norm": NaN, "learning_rate": 1.2179748700879012e-07, "loss": 0.0, "step": 98},
+    {"epoch": 0.03312144529943125, "grad_norm": NaN, "learning_rate": 3.04586490452119e-08, "loss": 0.0, "step": 99},
+    {"epoch": 0.03345600535296086, "grad_norm": NaN, "learning_rate": 0.0, "loss": 0.0, "step": 100},
+    {"epoch": 0.03345600535296086, "eval_loss": NaN, "eval_runtime": 83.2987, "eval_samples_per_second": 60.445, "eval_steps_per_second": 30.229, "step": 100}
   ],
-  "logging_steps": 5,
-  "max_steps": 200,
+  "logging_steps": 1,
+  "max_steps": 100,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
-  "save_steps": 500,
+  "save_steps": 25,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
@@ -346,7 +766,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.670528771293184e+16,
+  "total_flos": 6271342215168000.0,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0662807773092334a74bd9571af7ccfea1da3a5b81276092de9ac7b5492fa0e4
-size 6712
+oid sha256:239f06c62ee4317bc3f67ccabbea4f161d802c558557cbb5c5e70285a5b6026c
+size 6776
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0662807773092334a74bd9571af7ccfea1da3a5b81276092de9ac7b5492fa0e4
-size 6712
+oid sha256:239f06c62ee4317bc3f67ccabbea4f161d802c558557cbb5c5e70285a5b6026c
+size 6776