sharafeddeen committed on
Commit
6f4315c
·
verified ·
1 Parent(s): 103d739

latest training checkpoint

Browse files

Not all the files were pushed to Hugging Face after training.

adapter_config.json CHANGED
@@ -1,28 +1,28 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": null,
4
- "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
5
- "bias": "none",
6
- "fan_in_fan_out": false,
7
- "inference_mode": false,
8
- "init_lora_weights": true,
9
- "layers_pattern": null,
10
- "layers_to_transform": null,
11
- "loftq_config": {},
12
- "lora_alpha": 16,
13
- "lora_dropout": 0.1,
14
- "megatron_config": null,
15
- "megatron_core": "megatron.core",
16
- "modules_to_save": null,
17
- "peft_type": "LORA",
18
- "r": 64,
19
- "rank_pattern": {},
20
- "revision": null,
21
- "target_modules": [
22
- "v_proj",
23
- "k_proj",
24
- "o_proj",
25
- "q_proj"
26
- ],
27
- "task_type": "CAUSAL_LM"
28
  }
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.1,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 64,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "v_proj",
23
+ "k_proj",
24
+ "o_proj",
25
+ "q_proj"
26
+ ],
27
+ "task_type": "CAUSAL_LM"
28
  }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151646,
4
+ "do_sample": true,
5
+ "eos_token_id": 151643,
6
+ "temperature": 0.6,
7
+ "top_p": 0.95,
8
+ "transformers_version": "4.48.1"
9
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccbfc43bd01ba12dab30c58e7339048877e7ae60f576ce22eb9e47a3e4a95407
3
+ size 35674618
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfddc12c09ed68ce102a9ef57f981c5e4cedd262ad7aee169a63e2dbf351ac69
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31b716b23b3a1c16554a0f288cd041670104f4c187deb0150cab279be98f0445
3
+ size 1064
special_tokens_map.json CHANGED
@@ -1,23 +1,23 @@
1
- {
2
- "bos_token": {
3
- "content": "<|begin▁of▁sentence|>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "<|end▁of▁sentence|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": {
17
- "content": "<|end▁of▁sentence|>",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- }
23
- }
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|end▁of▁sentence|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|end▁of▁sentence|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893
3
- size 11422778
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:253accc92cf719c21724d425d5158ff4ee96a808ba33dbf033067f5df9633eff
3
+ size 12180269
tokenizer_config.json CHANGED
@@ -1,195 +1,195 @@
1
- {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "add_prefix_space": null,
5
- "added_tokens_decoder": {
6
- "151643": {
7
- "content": "<|end▁of▁sentence|>",
8
- "lstrip": false,
9
- "normalized": false,
10
- "rstrip": false,
11
- "single_word": false,
12
- "special": true
13
- },
14
- "151644": {
15
- "content": "<|User|>",
16
- "lstrip": false,
17
- "normalized": false,
18
- "rstrip": false,
19
- "single_word": false,
20
- "special": false
21
- },
22
- "151645": {
23
- "content": "<|Assistant|>",
24
- "lstrip": false,
25
- "normalized": false,
26
- "rstrip": false,
27
- "single_word": false,
28
- "special": false
29
- },
30
- "151646": {
31
- "content": "<|begin▁of▁sentence|>",
32
- "lstrip": false,
33
- "normalized": false,
34
- "rstrip": false,
35
- "single_word": false,
36
- "special": true
37
- },
38
- "151647": {
39
- "content": "<|EOT|>",
40
- "lstrip": false,
41
- "normalized": false,
42
- "rstrip": false,
43
- "single_word": false,
44
- "special": false
45
- },
46
- "151648": {
47
- "content": "<think>",
48
- "lstrip": false,
49
- "normalized": false,
50
- "rstrip": false,
51
- "single_word": false,
52
- "special": false
53
- },
54
- "151649": {
55
- "content": "</think>",
56
- "lstrip": false,
57
- "normalized": false,
58
- "rstrip": false,
59
- "single_word": false,
60
- "special": false
61
- },
62
- "151650": {
63
- "content": "<|quad_start|>",
64
- "lstrip": false,
65
- "normalized": false,
66
- "rstrip": false,
67
- "single_word": false,
68
- "special": true
69
- },
70
- "151651": {
71
- "content": "<|quad_end|>",
72
- "lstrip": false,
73
- "normalized": false,
74
- "rstrip": false,
75
- "single_word": false,
76
- "special": true
77
- },
78
- "151652": {
79
- "content": "<|vision_start|>",
80
- "lstrip": false,
81
- "normalized": false,
82
- "rstrip": false,
83
- "single_word": false,
84
- "special": true
85
- },
86
- "151653": {
87
- "content": "<|vision_end|>",
88
- "lstrip": false,
89
- "normalized": false,
90
- "rstrip": false,
91
- "single_word": false,
92
- "special": true
93
- },
94
- "151654": {
95
- "content": "<|vision_pad|>",
96
- "lstrip": false,
97
- "normalized": false,
98
- "rstrip": false,
99
- "single_word": false,
100
- "special": true
101
- },
102
- "151655": {
103
- "content": "<|image_pad|>",
104
- "lstrip": false,
105
- "normalized": false,
106
- "rstrip": false,
107
- "single_word": false,
108
- "special": true
109
- },
110
- "151656": {
111
- "content": "<|video_pad|>",
112
- "lstrip": false,
113
- "normalized": false,
114
- "rstrip": false,
115
- "single_word": false,
116
- "special": true
117
- },
118
- "151657": {
119
- "content": "<tool_call>",
120
- "lstrip": false,
121
- "normalized": false,
122
- "rstrip": false,
123
- "single_word": false,
124
- "special": false
125
- },
126
- "151658": {
127
- "content": "</tool_call>",
128
- "lstrip": false,
129
- "normalized": false,
130
- "rstrip": false,
131
- "single_word": false,
132
- "special": false
133
- },
134
- "151659": {
135
- "content": "<|fim_prefix|>",
136
- "lstrip": false,
137
- "normalized": false,
138
- "rstrip": false,
139
- "single_word": false,
140
- "special": false
141
- },
142
- "151660": {
143
- "content": "<|fim_middle|>",
144
- "lstrip": false,
145
- "normalized": false,
146
- "rstrip": false,
147
- "single_word": false,
148
- "special": false
149
- },
150
- "151661": {
151
- "content": "<|fim_suffix|>",
152
- "lstrip": false,
153
- "normalized": false,
154
- "rstrip": false,
155
- "single_word": false,
156
- "special": false
157
- },
158
- "151662": {
159
- "content": "<|fim_pad|>",
160
- "lstrip": false,
161
- "normalized": false,
162
- "rstrip": false,
163
- "single_word": false,
164
- "special": false
165
- },
166
- "151663": {
167
- "content": "<|repo_name|>",
168
- "lstrip": false,
169
- "normalized": false,
170
- "rstrip": false,
171
- "single_word": false,
172
- "special": false
173
- },
174
- "151664": {
175
- "content": "<|file_sep|>",
176
- "lstrip": false,
177
- "normalized": false,
178
- "rstrip": false,
179
- "single_word": false,
180
- "special": false
181
- }
182
- },
183
- "bos_token": "<|begin▁of▁sentence|>",
184
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
185
- "clean_up_tokenization_spaces": false,
186
- "eos_token": "<|end▁of▁sentence|>",
187
- "extra_special_tokens": {},
188
- "legacy": true,
189
- "model_max_length": 16384,
190
- "pad_token": "<|end▁of▁sentence|>",
191
- "sp_model_kwargs": {},
192
- "tokenizer_class": "LlamaTokenizer",
193
- "unk_token": null,
194
- "use_default_system_prompt": false
195
- }
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|end▁of▁sentence|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|User|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": false
21
+ },
22
+ "151645": {
23
+ "content": "<|Assistant|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "151646": {
31
+ "content": "<|begin▁of▁sentence|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|EOT|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": false
45
+ },
46
+ "151648": {
47
+ "content": "<think>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": false
53
+ },
54
+ "151649": {
55
+ "content": "</think>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": false
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ }
182
+ },
183
+ "bos_token": "<|begin▁of▁sentence|>",
184
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
185
+ "clean_up_tokenization_spaces": false,
186
+ "eos_token": "<|end▁of▁sentence|>",
187
+ "extra_special_tokens": {},
188
+ "legacy": true,
189
+ "model_max_length": 16384,
190
+ "pad_token": "<|end▁of▁sentence|>",
191
+ "sp_model_kwargs": {},
192
+ "tokenizer_class": "LlamaTokenizer",
193
+ "unk_token": null,
194
+ "use_default_system_prompt": false
195
+ }
trainer_state.json ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 4.695652173913043,
5
+ "eval_steps": 500,
6
+ "global_step": 70,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06956521739130435,
13
+ "grad_norm": 0.027621353045105934,
14
+ "learning_rate": 1.9989930665413148e-05,
15
+ "loss": 0.6357,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.1391304347826087,
20
+ "grad_norm": 0.027610378339886665,
21
+ "learning_rate": 1.9959742939952393e-05,
22
+ "loss": 0.6181,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.20869565217391303,
27
+ "grad_norm": 0.029305126518011093,
28
+ "learning_rate": 1.990949761767935e-05,
29
+ "loss": 0.642,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.2782608695652174,
34
+ "grad_norm": 0.028681788593530655,
35
+ "learning_rate": 1.98392958859863e-05,
36
+ "loss": 0.646,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.34782608695652173,
41
+ "grad_norm": 0.028845500200986862,
42
+ "learning_rate": 1.9749279121818235e-05,
43
+ "loss": 0.6416,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.41739130434782606,
48
+ "grad_norm": 0.03184295445680618,
49
+ "learning_rate": 1.9639628606958535e-05,
50
+ "loss": 0.6625,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.48695652173913045,
55
+ "grad_norm": 0.030502479523420334,
56
+ "learning_rate": 1.9510565162951538e-05,
57
+ "loss": 0.645,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.5565217391304348,
62
+ "grad_norm": 0.030076082795858383,
63
+ "learning_rate": 1.9362348706397374e-05,
64
+ "loss": 0.6285,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.6260869565217392,
69
+ "grad_norm": 0.032947439700365067,
70
+ "learning_rate": 1.919527772551451e-05,
71
+ "loss": 0.6641,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.6956521739130435,
76
+ "grad_norm": 0.03192123770713806,
77
+ "learning_rate": 1.900968867902419e-05,
78
+ "loss": 0.6508,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.7652173913043478,
83
+ "grad_norm": 0.0321270152926445,
84
+ "learning_rate": 1.880595531856738e-05,
85
+ "loss": 0.6436,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.8347826086956521,
90
+ "grad_norm": 0.03189520537853241,
91
+ "learning_rate": 1.8584487936018663e-05,
92
+ "loss": 0.6653,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.9043478260869565,
97
+ "grad_norm": 0.03357314690947533,
98
+ "learning_rate": 1.834573253721303e-05,
99
+ "loss": 0.636,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 0.9739130434782609,
104
+ "grad_norm": 0.031242311000823975,
105
+ "learning_rate": 1.8090169943749477e-05,
106
+ "loss": 0.6537,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 1.0,
111
+ "grad_norm": 0.03731789439916611,
112
+ "learning_rate": 1.78183148246803e-05,
113
+ "loss": 0.6894,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 1.0,
118
+ "eval_loss": 0.6298047304153442,
119
+ "eval_runtime": 2.6261,
120
+ "eval_samples_per_second": 1.523,
121
+ "eval_steps_per_second": 1.523,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 1.0695652173913044,
126
+ "grad_norm": 0.0358547680079937,
127
+ "learning_rate": 1.7530714660036112e-05,
128
+ "loss": 0.6516,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 1.1391304347826088,
133
+ "grad_norm": 0.0324733592569828,
134
+ "learning_rate": 1.7227948638273918e-05,
135
+ "loss": 0.6244,
136
+ "step": 17
137
+ },
138
+ {
139
+ "epoch": 1.208695652173913,
140
+ "grad_norm": 0.03361370787024498,
141
+ "learning_rate": 1.691062648986865e-05,
142
+ "loss": 0.6417,
143
+ "step": 18
144
+ },
145
+ {
146
+ "epoch": 1.2782608695652173,
147
+ "grad_norm": 0.033928435295820236,
148
+ "learning_rate": 1.657938725939713e-05,
149
+ "loss": 0.6601,
150
+ "step": 19
151
+ },
152
+ {
153
+ "epoch": 1.3478260869565217,
154
+ "grad_norm": 0.03608125448226929,
155
+ "learning_rate": 1.6234898018587336e-05,
156
+ "loss": 0.6532,
157
+ "step": 20
158
+ },
159
+ {
160
+ "epoch": 1.4173913043478261,
161
+ "grad_norm": 0.03397082909941673,
162
+ "learning_rate": 1.5877852522924733e-05,
163
+ "loss": 0.6205,
164
+ "step": 21
165
+ },
166
+ {
167
+ "epoch": 1.4869565217391305,
168
+ "grad_norm": 0.03406910225749016,
169
+ "learning_rate": 1.5508969814521026e-05,
170
+ "loss": 0.6256,
171
+ "step": 22
172
+ },
173
+ {
174
+ "epoch": 1.5565217391304347,
175
+ "grad_norm": 0.034135762602090836,
176
+ "learning_rate": 1.5128992774059063e-05,
177
+ "loss": 0.6332,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 1.626086956521739,
182
+ "grad_norm": 0.03473026305437088,
183
+ "learning_rate": 1.4738686624729987e-05,
184
+ "loss": 0.6561,
185
+ "step": 24
186
+ },
187
+ {
188
+ "epoch": 1.6956521739130435,
189
+ "grad_norm": 0.03459230437874794,
190
+ "learning_rate": 1.4338837391175582e-05,
191
+ "loss": 0.6391,
192
+ "step": 25
193
+ },
194
+ {
195
+ "epoch": 1.7652173913043478,
196
+ "grad_norm": 0.0341939777135849,
197
+ "learning_rate": 1.3930250316539237e-05,
198
+ "loss": 0.6393,
199
+ "step": 26
200
+ },
201
+ {
202
+ "epoch": 1.8347826086956522,
203
+ "grad_norm": 0.03445998951792717,
204
+ "learning_rate": 1.3513748240813429e-05,
205
+ "loss": 0.6429,
206
+ "step": 27
207
+ },
208
+ {
209
+ "epoch": 1.9043478260869566,
210
+ "grad_norm": 0.03501657024025917,
211
+ "learning_rate": 1.3090169943749475e-05,
212
+ "loss": 0.6483,
213
+ "step": 28
214
+ },
215
+ {
216
+ "epoch": 1.973913043478261,
217
+ "grad_norm": 0.03379024565219879,
218
+ "learning_rate": 1.2660368455666752e-05,
219
+ "loss": 0.6247,
220
+ "step": 29
221
+ },
222
+ {
223
+ "epoch": 2.0,
224
+ "grad_norm": 0.034777939319610596,
225
+ "learning_rate": 1.2225209339563144e-05,
226
+ "loss": 0.6324,
227
+ "step": 30
228
+ },
229
+ {
230
+ "epoch": 2.0,
231
+ "eval_loss": 0.620289146900177,
232
+ "eval_runtime": 2.6255,
233
+ "eval_samples_per_second": 1.524,
234
+ "eval_steps_per_second": 1.524,
235
+ "step": 30
236
+ },
237
+ {
238
+ "epoch": 2.0695652173913044,
239
+ "grad_norm": 0.035359956324100494,
240
+ "learning_rate": 1.1785568947986368e-05,
241
+ "loss": 0.6416,
242
+ "step": 31
243
+ },
244
+ {
245
+ "epoch": 2.139130434782609,
246
+ "grad_norm": 0.03541162610054016,
247
+ "learning_rate": 1.1342332658176556e-05,
248
+ "loss": 0.6432,
249
+ "step": 32
250
+ },
251
+ {
252
+ "epoch": 2.208695652173913,
253
+ "grad_norm": 0.03285665437579155,
254
+ "learning_rate": 1.0896393089034336e-05,
255
+ "loss": 0.6128,
256
+ "step": 33
257
+ },
258
+ {
259
+ "epoch": 2.2782608695652176,
260
+ "grad_norm": 0.03517802432179451,
261
+ "learning_rate": 1.044864830350515e-05,
262
+ "loss": 0.6557,
263
+ "step": 34
264
+ },
265
+ {
266
+ "epoch": 2.3478260869565215,
267
+ "grad_norm": 0.03373734652996063,
268
+ "learning_rate": 1e-05,
269
+ "loss": 0.6298,
270
+ "step": 35
271
+ },
272
+ {
273
+ "epoch": 2.417391304347826,
274
+ "grad_norm": 0.0340682752430439,
275
+ "learning_rate": 9.551351696494854e-06,
276
+ "loss": 0.6192,
277
+ "step": 36
278
+ },
279
+ {
280
+ "epoch": 2.4869565217391303,
281
+ "grad_norm": 0.03553012013435364,
282
+ "learning_rate": 9.103606910965666e-06,
283
+ "loss": 0.637,
284
+ "step": 37
285
+ },
286
+ {
287
+ "epoch": 2.5565217391304347,
288
+ "grad_norm": 0.036372989416122437,
289
+ "learning_rate": 8.657667341823449e-06,
290
+ "loss": 0.6286,
291
+ "step": 38
292
+ },
293
+ {
294
+ "epoch": 2.626086956521739,
295
+ "grad_norm": 0.03410479426383972,
296
+ "learning_rate": 8.214431052013636e-06,
297
+ "loss": 0.602,
298
+ "step": 39
299
+ },
300
+ {
301
+ "epoch": 2.6956521739130435,
302
+ "grad_norm": 0.035654351115226746,
303
+ "learning_rate": 7.774790660436857e-06,
304
+ "loss": 0.6282,
305
+ "step": 40
306
+ },
307
+ {
308
+ "epoch": 2.765217391304348,
309
+ "grad_norm": 0.03418293967843056,
310
+ "learning_rate": 7.33963154433325e-06,
311
+ "loss": 0.6384,
312
+ "step": 41
313
+ },
314
+ {
315
+ "epoch": 2.8347826086956522,
316
+ "grad_norm": 0.035010650753974915,
317
+ "learning_rate": 6.909830056250527e-06,
318
+ "loss": 0.6232,
319
+ "step": 42
320
+ },
321
+ {
322
+ "epoch": 2.9043478260869566,
323
+ "grad_norm": 0.03416571021080017,
324
+ "learning_rate": 6.486251759186573e-06,
325
+ "loss": 0.6533,
326
+ "step": 43
327
+ },
328
+ {
329
+ "epoch": 2.973913043478261,
330
+ "grad_norm": 0.03395598754286766,
331
+ "learning_rate": 6.069749683460765e-06,
332
+ "loss": 0.6205,
333
+ "step": 44
334
+ },
335
+ {
336
+ "epoch": 3.0,
337
+ "grad_norm": 0.0333721823990345,
338
+ "learning_rate": 5.66116260882442e-06,
339
+ "loss": 0.6163,
340
+ "step": 45
341
+ },
342
+ {
343
+ "epoch": 3.0,
344
+ "eval_loss": 0.6122885942459106,
345
+ "eval_runtime": 2.6295,
346
+ "eval_samples_per_second": 1.521,
347
+ "eval_steps_per_second": 1.521,
348
+ "step": 45
349
+ },
350
+ {
351
+ "epoch": 3.0695652173913044,
352
+ "grad_norm": 0.033700425177812576,
353
+ "learning_rate": 5.2613133752700145e-06,
354
+ "loss": 0.6408,
355
+ "step": 46
356
+ },
357
+ {
358
+ "epoch": 3.139130434782609,
359
+ "grad_norm": 0.03324361890554428,
360
+ "learning_rate": 4.87100722594094e-06,
361
+ "loss": 0.62,
362
+ "step": 47
363
+ },
364
+ {
365
+ "epoch": 3.208695652173913,
366
+ "grad_norm": 0.03405730798840523,
367
+ "learning_rate": 4.491030185478976e-06,
368
+ "loss": 0.6257,
369
+ "step": 48
370
+ },
371
+ {
372
+ "epoch": 3.2782608695652176,
373
+ "grad_norm": 0.03492354601621628,
374
+ "learning_rate": 4.12214747707527e-06,
375
+ "loss": 0.6349,
376
+ "step": 49
377
+ },
378
+ {
379
+ "epoch": 3.3478260869565215,
380
+ "grad_norm": 0.034349583089351654,
381
+ "learning_rate": 3.7651019814126656e-06,
382
+ "loss": 0.6282,
383
+ "step": 50
384
+ },
385
+ {
386
+ "epoch": 3.417391304347826,
387
+ "grad_norm": 0.03548488765954971,
388
+ "learning_rate": 3.4206127406028744e-06,
389
+ "loss": 0.6427,
390
+ "step": 51
391
+ },
392
+ {
393
+ "epoch": 3.4869565217391303,
394
+ "grad_norm": 0.03456812724471092,
395
+ "learning_rate": 3.089373510131354e-06,
396
+ "loss": 0.62,
397
+ "step": 52
398
+ },
399
+ {
400
+ "epoch": 3.5565217391304347,
401
+ "grad_norm": 0.03488166257739067,
402
+ "learning_rate": 2.7720513617260857e-06,
403
+ "loss": 0.6263,
404
+ "step": 53
405
+ },
406
+ {
407
+ "epoch": 3.626086956521739,
408
+ "grad_norm": 0.0336606539785862,
409
+ "learning_rate": 2.469285339963892e-06,
410
+ "loss": 0.63,
411
+ "step": 54
412
+ },
413
+ {
414
+ "epoch": 3.6956521739130435,
415
+ "grad_norm": 0.03267661854624748,
416
+ "learning_rate": 2.1816851753197023e-06,
417
+ "loss": 0.6297,
418
+ "step": 55
419
+ },
420
+ {
421
+ "epoch": 3.765217391304348,
422
+ "grad_norm": 0.0337102934718132,
423
+ "learning_rate": 1.9098300562505266e-06,
424
+ "loss": 0.5945,
425
+ "step": 56
426
+ },
427
+ {
428
+ "epoch": 3.8347826086956522,
429
+ "grad_norm": 0.035517044365406036,
430
+ "learning_rate": 1.6542674627869738e-06,
431
+ "loss": 0.6343,
432
+ "step": 57
433
+ },
434
+ {
435
+ "epoch": 3.9043478260869566,
436
+ "grad_norm": 0.03479280695319176,
437
+ "learning_rate": 1.4155120639813392e-06,
438
+ "loss": 0.6163,
439
+ "step": 58
440
+ },
441
+ {
442
+ "epoch": 3.973913043478261,
443
+ "grad_norm": 0.035013146698474884,
444
+ "learning_rate": 1.19404468143262e-06,
445
+ "loss": 0.6114,
446
+ "step": 59
447
+ },
448
+ {
449
+ "epoch": 4.0,
450
+ "grad_norm": 0.03396342694759369,
451
+ "learning_rate": 9.903113209758098e-07,
452
+ "loss": 0.6191,
453
+ "step": 60
454
+ },
455
+ {
456
+ "epoch": 4.0,
457
+ "eval_loss": 0.609683632850647,
458
+ "eval_runtime": 2.6229,
459
+ "eval_samples_per_second": 1.525,
460
+ "eval_steps_per_second": 1.525,
461
+ "step": 60
462
+ },
463
+ {
464
+ "epoch": 4.069565217391304,
465
+ "grad_norm": 0.03345245122909546,
466
+ "learning_rate": 8.047222744854943e-07,
467
+ "loss": 0.6416,
468
+ "step": 61
469
+ },
470
+ {
471
+ "epoch": 4.139130434782609,
472
+ "grad_norm": 0.03468826040625572,
473
+ "learning_rate": 6.37651293602628e-07,
474
+ "loss": 0.6269,
475
+ "step": 62
476
+ },
477
+ {
478
+ "epoch": 4.208695652173913,
479
+ "grad_norm": 0.031891003251075745,
480
+ "learning_rate": 4.894348370484648e-07,
481
+ "loss": 0.6147,
482
+ "step": 63
483
+ },
484
+ {
485
+ "epoch": 4.278260869565218,
486
+ "grad_norm": 0.033924877643585205,
487
+ "learning_rate": 3.603713930414676e-07,
488
+ "loss": 0.611,
489
+ "step": 64
490
+ },
491
+ {
492
+ "epoch": 4.3478260869565215,
493
+ "grad_norm": 0.033348944038152695,
494
+ "learning_rate": 2.507208781817638e-07,
495
+ "loss": 0.6372,
496
+ "step": 65
497
+ },
498
+ {
499
+ "epoch": 4.417391304347826,
500
+ "grad_norm": 0.03544296696782112,
501
+ "learning_rate": 1.6070411401370335e-07,
502
+ "loss": 0.6176,
503
+ "step": 66
504
+ },
505
+ {
506
+ "epoch": 4.48695652173913,
507
+ "grad_norm": 0.03346049040555954,
508
+ "learning_rate": 9.0502382320653e-08,
509
+ "loss": 0.6282,
510
+ "step": 67
511
+ },
512
+ {
513
+ "epoch": 4.556521739130435,
514
+ "grad_norm": 0.03482738509774208,
515
+ "learning_rate": 4.025706004760932e-08,
516
+ "loss": 0.6432,
517
+ "step": 68
518
+ },
519
+ {
520
+ "epoch": 4.626086956521739,
521
+ "grad_norm": 0.03497692570090294,
522
+ "learning_rate": 1.0069334586854106e-08,
523
+ "loss": 0.6121,
524
+ "step": 69
525
+ },
526
+ {
527
+ "epoch": 4.695652173913043,
528
+ "grad_norm": 0.034334901720285416,
529
+ "learning_rate": 0.0,
530
+ "loss": 0.6054,
531
+ "step": 70
532
+ },
533
+ {
534
+ "epoch": 4.695652173913043,
535
+ "eval_loss": 0.609480619430542,
536
+ "eval_runtime": 2.628,
537
+ "eval_samples_per_second": 1.522,
538
+ "eval_steps_per_second": 1.522,
539
+ "step": 70
540
+ }
541
+ ],
542
+ "logging_steps": 1,
543
+ "max_steps": 70,
544
+ "num_input_tokens_seen": 0,
545
+ "num_train_epochs": 5,
546
+ "save_steps": 500,
547
+ "stateful_callbacks": {
548
+ "TrainerControl": {
549
+ "args": {
550
+ "should_epoch_stop": false,
551
+ "should_evaluate": false,
552
+ "should_log": false,
553
+ "should_save": true,
554
+ "should_training_stop": true
555
+ },
556
+ "attributes": {}
557
+ }
558
+ },
559
+ "total_flos": 1.657443415228416e+17,
560
+ "train_batch_size": 1,
561
+ "trial_name": null,
562
+ "trial_params": null
563
+ }