zhangchuheng123 committed on
Commit
ac6cf52
1 Parent(s): ea306ed

Upload new model

Browse files
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[PAD]": 32000
3
+ }
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "outputs/checkpoint-808",
3
  "architectures": [
4
  "LlamaForCausalLM"
5
  ],
@@ -21,7 +21,7 @@
21
  "rope_theta": 10000.0,
22
  "tie_word_embeddings": false,
23
  "torch_dtype": "bfloat16",
24
- "transformers_version": "4.34.0",
25
  "use_cache": true,
26
  "vocab_size": 32001
27
  }
 
1
  {
2
+ "_name_or_path": "outputs/checkpoint-1960",
3
  "architectures": [
4
  "LlamaForCausalLM"
5
  ],
 
21
  "rope_theta": 10000.0,
22
  "tie_word_embeddings": false,
23
  "torch_dtype": "bfloat16",
24
+ "transformers_version": "4.34.1",
25
  "use_cache": true,
26
  "vocab_size": 32001
27
  }
generation_config.json CHANGED
@@ -6,5 +6,5 @@
6
  "pad_token_id": 0,
7
  "temperature": 0.6,
8
  "top_p": 0.9,
9
- "transformers_version": "4.34.0"
10
  }
 
6
  "pad_token_id": 0,
7
  "temperature": 0.6,
8
  "top_p": 0.9,
9
+ "transformers_version": "4.34.1"
10
  }
pytorch_model-00001-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de11e1eac25e346e42ff6cd66da9a9b234ce42d76b27076d3b1eca29adf06bab
3
  size 9976628314
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5d5d4f4733a155c795fe325b94aa9b5c44ea92d8d21af94b79a49e4df3b96d2
3
  size 9976628314
pytorch_model-00002-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1b13c92a83acd990ae75b07b7a8e19532c81d3e873a7ad842d4fd96463786eb
3
  size 3500318979
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6f83ffcea0dc63f67b272d10a6ce0be34f06bf9bf3ab08992a589b426ee3c79
3
  size 3500318979
special_tokens_map.json CHANGED
@@ -1,9 +1,30 @@
1
  {
2
- "additional_special_tokens": [
3
- "[PAD]"
4
- ],
5
- "bos_token": "<s>",
6
- "eos_token": "</s>",
7
- "pad_token": "[PAD]",
8
- "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  }
 
1
  {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
  }
tokenizer_config.json CHANGED
@@ -35,9 +35,6 @@
35
  "special": true
36
  }
37
  },
38
- "additional_special_tokens": [
39
- "[PAD]"
40
- ],
41
  "bos_token": "<s>",
42
  "clean_up_tokenization_spaces": false,
43
  "eos_token": "</s>",
@@ -48,7 +45,6 @@
48
  "sp_model_kwargs": {},
49
  "spaces_between_special_tokens": false,
50
  "tokenizer_class": "LlamaTokenizer",
51
- "tokenizer_file": null,
52
  "unk_token": "<unk>",
53
  "use_default_system_prompt": true
54
  }
 
35
  "special": true
36
  }
37
  },
 
 
 
38
  "bos_token": "<s>",
39
  "clean_up_tokenization_spaces": false,
40
  "eos_token": "</s>",
 
45
  "sp_model_kwargs": {},
46
  "spaces_between_special_tokens": false,
47
  "tokenizer_class": "LlamaTokenizer",
 
48
  "unk_token": "<unk>",
49
  "use_default_system_prompt": true
50
  }
trainer_state.json CHANGED
@@ -1,242 +1,150 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 10.292993630573248,
5
  "eval_steps": 100,
6
- "global_step": 1616,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.64,
13
  "learning_rate": 2e-05,
14
- "loss": 1.1355,
15
  "step": 100
16
  },
17
  {
18
- "epoch": 0.64,
19
- "eval_loss": 1.157366156578064,
20
- "eval_runtime": 30.4555,
21
- "eval_samples_per_second": 65.67,
22
- "eval_steps_per_second": 1.051,
23
- "step": 100
24
- },
25
- {
26
- "epoch": 1.27,
27
  "learning_rate": 2e-05,
28
- "loss": 0.881,
29
  "step": 200
30
  },
31
  {
32
- "epoch": 1.27,
33
- "eval_loss": 1.3262789249420166,
34
- "eval_runtime": 29.1383,
35
- "eval_samples_per_second": 68.638,
36
- "eval_steps_per_second": 1.098,
37
- "step": 200
38
- },
39
- {
40
- "epoch": 1.91,
41
  "learning_rate": 2e-05,
42
- "loss": 0.5496,
43
  "step": 300
44
  },
45
  {
46
- "epoch": 1.91,
47
- "eval_loss": 1.3176809549331665,
48
- "eval_runtime": 28.8909,
49
- "eval_samples_per_second": 69.226,
50
- "eval_steps_per_second": 1.108,
51
- "step": 300
52
- },
53
- {
54
- "epoch": 2.55,
55
  "learning_rate": 2e-05,
56
- "loss": 0.2597,
57
- "step": 400
58
- },
59
- {
60
- "epoch": 2.55,
61
- "eval_loss": 1.5806535482406616,
62
- "eval_runtime": 29.1193,
63
- "eval_samples_per_second": 68.683,
64
- "eval_steps_per_second": 1.099,
65
  "step": 400
66
  },
67
  {
68
- "epoch": 3.18,
69
  "learning_rate": 2e-05,
70
- "loss": 0.1877,
71
- "step": 500
72
- },
73
- {
74
- "epoch": 3.18,
75
- "eval_loss": 1.7526657581329346,
76
- "eval_runtime": 29.1196,
77
- "eval_samples_per_second": 68.682,
78
- "eval_steps_per_second": 1.099,
79
  "step": 500
80
  },
81
  {
82
- "epoch": 3.82,
83
  "learning_rate": 2e-05,
84
- "loss": 0.1158,
85
  "step": 600
86
  },
87
  {
88
- "epoch": 3.82,
89
- "eval_loss": 1.7486767768859863,
90
- "eval_runtime": 29.2706,
91
- "eval_samples_per_second": 68.328,
92
- "eval_steps_per_second": 1.093,
93
- "step": 600
94
- },
95
- {
96
- "epoch": 4.46,
97
  "learning_rate": 2e-05,
98
- "loss": 0.0855,
99
  "step": 700
100
  },
101
  {
102
- "epoch": 4.46,
103
- "eval_loss": 1.873838186264038,
104
- "eval_runtime": 29.2292,
105
- "eval_samples_per_second": 68.425,
106
- "eval_steps_per_second": 1.095,
107
- "step": 700
108
- },
109
- {
110
- "epoch": 5.1,
111
  "learning_rate": 2e-05,
112
- "loss": 0.0645,
113
- "step": 800
114
- },
115
- {
116
- "epoch": 5.1,
117
- "eval_loss": 1.9275007247924805,
118
- "eval_runtime": 29.013,
119
- "eval_samples_per_second": 68.935,
120
- "eval_steps_per_second": 1.103,
121
  "step": 800
122
  },
123
  {
124
- "epoch": 5.73,
125
  "learning_rate": 2e-05,
126
- "loss": 0.0518,
127
- "step": 900
128
- },
129
- {
130
- "epoch": 5.73,
131
- "eval_loss": 1.9070993661880493,
132
- "eval_runtime": 30.5536,
133
- "eval_samples_per_second": 65.459,
134
- "eval_steps_per_second": 1.047,
135
  "step": 900
136
  },
137
  {
138
- "epoch": 6.37,
139
  "learning_rate": 2e-05,
140
- "loss": 0.0464,
141
- "step": 1000
142
- },
143
- {
144
- "epoch": 6.37,
145
- "eval_loss": 1.9601927995681763,
146
- "eval_runtime": 28.9827,
147
- "eval_samples_per_second": 69.007,
148
- "eval_steps_per_second": 1.104,
149
  "step": 1000
150
  },
151
  {
152
- "epoch": 7.01,
153
  "learning_rate": 2e-05,
154
- "loss": 0.0367,
155
  "step": 1100
156
  },
157
  {
158
- "epoch": 7.01,
159
- "eval_loss": 1.973179817199707,
160
- "eval_runtime": 29.0672,
161
- "eval_samples_per_second": 68.806,
162
- "eval_steps_per_second": 1.101,
163
- "step": 1100
164
  },
165
  {
166
- "epoch": 7.64,
167
  "learning_rate": 2e-05,
168
- "loss": 0.0288,
169
- "step": 1200
170
  },
171
  {
172
- "epoch": 7.64,
173
- "eval_loss": 2.0399632453918457,
174
- "eval_runtime": 29.0049,
175
- "eval_samples_per_second": 68.954,
176
- "eval_steps_per_second": 1.103,
177
- "step": 1200
178
  },
179
  {
180
- "epoch": 8.28,
181
  "learning_rate": 2e-05,
182
- "loss": 0.0265,
183
- "step": 1300
184
  },
185
  {
186
- "epoch": 8.28,
187
- "eval_loss": 2.0276734828948975,
188
- "eval_runtime": 28.9115,
189
- "eval_samples_per_second": 69.177,
190
- "eval_steps_per_second": 1.107,
191
- "step": 1300
192
  },
193
  {
194
- "epoch": 8.92,
195
  "learning_rate": 2e-05,
196
- "loss": 0.0287,
197
- "step": 1400
198
  },
199
  {
200
- "epoch": 8.92,
201
- "eval_loss": 2.049071788787842,
202
- "eval_runtime": 29.0231,
203
- "eval_samples_per_second": 68.911,
204
- "eval_steps_per_second": 1.103,
205
- "step": 1400
206
  },
207
  {
208
- "epoch": 9.55,
209
  "learning_rate": 2e-05,
210
- "loss": 0.0195,
211
- "step": 1500
212
  },
213
  {
214
- "epoch": 9.55,
215
- "eval_loss": 2.043515205383301,
216
- "eval_runtime": 29.1891,
217
- "eval_samples_per_second": 68.519,
218
- "eval_steps_per_second": 1.096,
219
- "step": 1500
220
  },
221
  {
222
- "epoch": 10.19,
223
  "learning_rate": 2e-05,
224
- "loss": 0.0215,
225
- "step": 1600
226
  },
227
  {
228
- "epoch": 10.19,
229
- "eval_loss": 2.101804733276367,
230
- "eval_runtime": 28.9478,
231
- "eval_samples_per_second": 69.09,
232
- "eval_steps_per_second": 1.105,
233
- "step": 1600
234
  }
235
  ],
236
  "logging_steps": 100,
237
- "max_steps": 1616,
238
- "num_train_epochs": 11,
239
- "save_steps": 808,
240
  "total_flos": 0.0,
241
  "trial_name": null,
242
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 8.963414634146341,
5
  "eval_steps": 100,
6
+ "global_step": 2205,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.41,
13
  "learning_rate": 2e-05,
14
+ "loss": 1.3987,
15
  "step": 100
16
  },
17
  {
18
+ "epoch": 0.81,
 
 
 
 
 
 
 
 
19
  "learning_rate": 2e-05,
20
+ "loss": 1.1878,
21
  "step": 200
22
  },
23
  {
24
+ "epoch": 1.22,
 
 
 
 
 
 
 
 
25
  "learning_rate": 2e-05,
26
+ "loss": 0.7575,
27
  "step": 300
28
  },
29
  {
30
+ "epoch": 1.63,
 
 
 
 
 
 
 
 
31
  "learning_rate": 2e-05,
32
+ "loss": 0.7709,
 
 
 
 
 
 
 
 
33
  "step": 400
34
  },
35
  {
36
+ "epoch": 2.03,
37
  "learning_rate": 2e-05,
38
+ "loss": 0.3723,
 
 
 
 
 
 
 
 
39
  "step": 500
40
  },
41
  {
42
+ "epoch": 2.44,
43
  "learning_rate": 2e-05,
44
+ "loss": 0.3363,
45
  "step": 600
46
  },
47
  {
48
+ "epoch": 2.85,
 
 
 
 
 
 
 
 
49
  "learning_rate": 2e-05,
50
+ "loss": 0.3537,
51
  "step": 700
52
  },
53
  {
54
+ "epoch": 3.25,
 
 
 
 
 
 
 
 
55
  "learning_rate": 2e-05,
56
+ "loss": 0.1487,
 
 
 
 
 
 
 
 
57
  "step": 800
58
  },
59
  {
60
+ "epoch": 3.66,
61
  "learning_rate": 2e-05,
62
+ "loss": 0.1769,
 
 
 
 
 
 
 
 
63
  "step": 900
64
  },
65
  {
66
+ "epoch": 4.07,
67
  "learning_rate": 2e-05,
68
+ "loss": 0.112,
 
 
 
 
 
 
 
 
69
  "step": 1000
70
  },
71
  {
72
+ "epoch": 4.47,
73
  "learning_rate": 2e-05,
74
+ "loss": 0.1087,
75
  "step": 1100
76
  },
77
  {
78
+ "epoch": 4.88,
79
+ "learning_rate": 2e-05,
80
+ "loss": 0.134,
81
+ "step": 1200
 
 
82
  },
83
  {
84
+ "epoch": 5.28,
85
  "learning_rate": 2e-05,
86
+ "loss": 0.0765,
87
+ "step": 1300
88
  },
89
  {
90
+ "epoch": 5.69,
91
+ "learning_rate": 2e-05,
92
+ "loss": 0.1008,
93
+ "step": 1400
 
 
94
  },
95
  {
96
+ "epoch": 6.1,
97
  "learning_rate": 2e-05,
98
+ "loss": 0.1036,
99
+ "step": 1500
100
  },
101
  {
102
+ "epoch": 6.5,
103
+ "learning_rate": 2e-05,
104
+ "loss": 0.0787,
105
+ "step": 1600
 
 
106
  },
107
  {
108
+ "epoch": 6.91,
109
  "learning_rate": 2e-05,
110
+ "loss": 0.0881,
111
+ "step": 1700
112
  },
113
  {
114
+ "epoch": 7.32,
115
+ "learning_rate": 2e-05,
116
+ "loss": 0.0721,
117
+ "step": 1800
 
 
118
  },
119
  {
120
+ "epoch": 7.72,
121
  "learning_rate": 2e-05,
122
+ "loss": 0.0671,
123
+ "step": 1900
124
  },
125
  {
126
+ "epoch": 8.13,
127
+ "learning_rate": 2e-05,
128
+ "loss": 0.0551,
129
+ "step": 2000
 
 
130
  },
131
  {
132
+ "epoch": 8.54,
133
  "learning_rate": 2e-05,
134
+ "loss": 0.0654,
135
+ "step": 2100
136
  },
137
  {
138
+ "epoch": 8.94,
139
+ "learning_rate": 2e-05,
140
+ "loss": 0.0747,
141
+ "step": 2200
 
 
142
  }
143
  ],
144
  "logging_steps": 100,
145
+ "max_steps": 2205,
146
+ "num_train_epochs": 9,
147
+ "save_steps": 245,
148
  "total_flos": 0.0,
149
  "trial_name": null,
150
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c542b075d6cb9ea197a0a9c5bf12a21814b6b9c336def7a2a6e041ffceb5612
3
- size 6715
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc185baba748b6f040311f7d8ec8004bd4ae30f3802fbb54461bd7b4e96b9175
3
+ size 6779