shnl commited on
Commit
66c156a
1 Parent(s): 078e117

'instruction'

Browse files
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "VietAI/vit5-base",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 3072,
7
+ "d_kv": 64,
8
+ "d_model": 768,
9
+ "decoder_start_token_id": 0,
10
+ "dense_act_fn": "relu",
11
+ "dropout_rate": 0.1,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "relu",
14
+ "initializer_factor": 1.0,
15
+ "is_encoder_decoder": true,
16
+ "is_gated_act": false,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "model_type": "t5",
19
+ "n_positions": 512,
20
+ "num_decoder_layers": 12,
21
+ "num_heads": 12,
22
+ "num_layers": 12,
23
+ "output_past": true,
24
+ "pad_token_id": 0,
25
+ "relative_attention_max_distance": 128,
26
+ "relative_attention_num_buckets": 32,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.30.2",
29
+ "use_cache": true,
30
+ "vocab_size": 36096
31
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.30.2"
7
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45d931fba99c218d711fb10875cf1391e1d530b1473b6c8d5511e14e541dd449
3
+ size 1807760133
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:546c17891fd693b6a7806460deeb5dae57f3e4571d8673306243685c5d9f883d
3
+ size 903892625
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6aa402679ff7c808d1df7eb5944a4125d8cfd7b874b70b59ec9ed9c684ec28d7
3
+ size 14575
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf928355da597edad3b10c86cba6df7361b9e35e5e45cc85d1ee1a4d4fa92559
3
+ size 627
special_tokens_map.json ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>"
99
+ ],
100
+ "eos_token": "</s>",
101
+ "pad_token": "<pad>",
102
+ "unk_token": "<unk>"
103
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59986b62f9f0b90edafb9b073ea7b93d21114a5841219a1ea2399ade73f729c6
3
+ size 820370
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>"
99
+ ],
100
+ "clean_up_tokenization_spaces": true,
101
+ "eos_token": "</s>",
102
+ "extra_ids": 96,
103
+ "model_max_length": 1000000000000000019884624838656,
104
+ "pad_token": "<pad>",
105
+ "sp_model_kwargs": {},
106
+ "tokenizer_class": "T5Tokenizer",
107
+ "unk_token": "<unk>"
108
+ }
trainer_state.json ADDED
@@ -0,0 +1,618 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 9.974948758824869,
5
+ "global_step": 10950,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.23,
12
+ "learning_rate": 4.553734061930783e-05,
13
+ "loss": 4.5455,
14
+ "step": 250
15
+ },
16
+ {
17
+ "epoch": 0.23,
18
+ "eval_loss": 0.06012005731463432,
19
+ "eval_runtime": 125.7302,
20
+ "eval_samples_per_second": 19.86,
21
+ "eval_steps_per_second": 1.249,
22
+ "step": 250
23
+ },
24
+ {
25
+ "epoch": 0.46,
26
+ "learning_rate": 9.107468123861566e-05,
27
+ "loss": 0.0471,
28
+ "step": 500
29
+ },
30
+ {
31
+ "epoch": 0.46,
32
+ "eval_loss": 0.05808735638856888,
33
+ "eval_runtime": 125.3179,
34
+ "eval_samples_per_second": 19.925,
35
+ "eval_steps_per_second": 1.253,
36
+ "step": 500
37
+ },
38
+ {
39
+ "epoch": 0.68,
40
+ "learning_rate": 9.807120237981e-05,
41
+ "loss": 0.0443,
42
+ "step": 750
43
+ },
44
+ {
45
+ "epoch": 0.68,
46
+ "eval_loss": 0.05481741577386856,
47
+ "eval_runtime": 125.3239,
48
+ "eval_samples_per_second": 19.924,
49
+ "eval_steps_per_second": 1.253,
50
+ "step": 750
51
+ },
52
+ {
53
+ "epoch": 0.91,
54
+ "learning_rate": 9.567220036464831e-05,
55
+ "loss": 0.0428,
56
+ "step": 1000
57
+ },
58
+ {
59
+ "epoch": 0.91,
60
+ "eval_loss": 0.05039665102958679,
61
+ "eval_runtime": 125.3169,
62
+ "eval_samples_per_second": 19.925,
63
+ "eval_steps_per_second": 1.253,
64
+ "step": 1000
65
+ },
66
+ {
67
+ "epoch": 1.14,
68
+ "learning_rate": 9.327319834948663e-05,
69
+ "loss": 0.0329,
70
+ "step": 1250
71
+ },
72
+ {
73
+ "epoch": 1.14,
74
+ "eval_loss": 0.05058171600103378,
75
+ "eval_runtime": 125.3095,
76
+ "eval_samples_per_second": 19.927,
77
+ "eval_steps_per_second": 1.253,
78
+ "step": 1250
79
+ },
80
+ {
81
+ "epoch": 1.37,
82
+ "learning_rate": 9.087419633432492e-05,
83
+ "loss": 0.0299,
84
+ "step": 1500
85
+ },
86
+ {
87
+ "epoch": 1.37,
88
+ "eval_loss": 0.04842585325241089,
89
+ "eval_runtime": 125.3053,
90
+ "eval_samples_per_second": 19.927,
91
+ "eval_steps_per_second": 1.253,
92
+ "step": 1500
93
+ },
94
+ {
95
+ "epoch": 1.59,
96
+ "learning_rate": 8.847519431916324e-05,
97
+ "loss": 0.0295,
98
+ "step": 1750
99
+ },
100
+ {
101
+ "epoch": 1.59,
102
+ "eval_loss": 0.049905285239219666,
103
+ "eval_runtime": 125.3158,
104
+ "eval_samples_per_second": 19.926,
105
+ "eval_steps_per_second": 1.253,
106
+ "step": 1750
107
+ },
108
+ {
109
+ "epoch": 1.82,
110
+ "learning_rate": 8.607619230400153e-05,
111
+ "loss": 0.0304,
112
+ "step": 2000
113
+ },
114
+ {
115
+ "epoch": 1.82,
116
+ "eval_loss": 0.047520652413368225,
117
+ "eval_runtime": 125.3111,
118
+ "eval_samples_per_second": 19.926,
119
+ "eval_steps_per_second": 1.253,
120
+ "step": 2000
121
+ },
122
+ {
123
+ "epoch": 2.05,
124
+ "learning_rate": 8.367719028883985e-05,
125
+ "loss": 0.0277,
126
+ "step": 2250
127
+ },
128
+ {
129
+ "epoch": 2.05,
130
+ "eval_loss": 0.04981054365634918,
131
+ "eval_runtime": 125.3207,
132
+ "eval_samples_per_second": 19.925,
133
+ "eval_steps_per_second": 1.253,
134
+ "step": 2250
135
+ },
136
+ {
137
+ "epoch": 2.28,
138
+ "learning_rate": 8.127818827367816e-05,
139
+ "loss": 0.0186,
140
+ "step": 2500
141
+ },
142
+ {
143
+ "epoch": 2.28,
144
+ "eval_loss": 0.04637761414051056,
145
+ "eval_runtime": 125.3226,
146
+ "eval_samples_per_second": 19.925,
147
+ "eval_steps_per_second": 1.253,
148
+ "step": 2500
149
+ },
150
+ {
151
+ "epoch": 2.51,
152
+ "learning_rate": 7.887918625851645e-05,
153
+ "loss": 0.0199,
154
+ "step": 2750
155
+ },
156
+ {
157
+ "epoch": 2.51,
158
+ "eval_loss": 0.04944201186299324,
159
+ "eval_runtime": 125.3132,
160
+ "eval_samples_per_second": 19.926,
161
+ "eval_steps_per_second": 1.253,
162
+ "step": 2750
163
+ },
164
+ {
165
+ "epoch": 2.73,
166
+ "learning_rate": 7.648018424335477e-05,
167
+ "loss": 0.0205,
168
+ "step": 3000
169
+ },
170
+ {
171
+ "epoch": 2.73,
172
+ "eval_loss": 0.045388150960206985,
173
+ "eval_runtime": 125.3269,
174
+ "eval_samples_per_second": 19.924,
175
+ "eval_steps_per_second": 1.253,
176
+ "step": 3000
177
+ },
178
+ {
179
+ "epoch": 2.96,
180
+ "learning_rate": 7.408118222819308e-05,
181
+ "loss": 0.0202,
182
+ "step": 3250
183
+ },
184
+ {
185
+ "epoch": 2.96,
186
+ "eval_loss": 0.04644118994474411,
187
+ "eval_runtime": 125.317,
188
+ "eval_samples_per_second": 19.925,
189
+ "eval_steps_per_second": 1.253,
190
+ "step": 3250
191
+ },
192
+ {
193
+ "epoch": 3.19,
194
+ "learning_rate": 7.168218021303138e-05,
195
+ "loss": 0.015,
196
+ "step": 3500
197
+ },
198
+ {
199
+ "epoch": 3.19,
200
+ "eval_loss": 0.0492834635078907,
201
+ "eval_runtime": 125.3122,
202
+ "eval_samples_per_second": 19.926,
203
+ "eval_steps_per_second": 1.253,
204
+ "step": 3500
205
+ },
206
+ {
207
+ "epoch": 3.42,
208
+ "learning_rate": 6.928317819786969e-05,
209
+ "loss": 0.0136,
210
+ "step": 3750
211
+ },
212
+ {
213
+ "epoch": 3.42,
214
+ "eval_loss": 0.050950221717357635,
215
+ "eval_runtime": 125.3101,
216
+ "eval_samples_per_second": 19.927,
217
+ "eval_steps_per_second": 1.253,
218
+ "step": 3750
219
+ },
220
+ {
221
+ "epoch": 3.64,
222
+ "learning_rate": 6.6884176182708e-05,
223
+ "loss": 0.0139,
224
+ "step": 4000
225
+ },
226
+ {
227
+ "epoch": 3.64,
228
+ "eval_loss": 0.0521300733089447,
229
+ "eval_runtime": 125.3091,
230
+ "eval_samples_per_second": 19.927,
231
+ "eval_steps_per_second": 1.253,
232
+ "step": 4000
233
+ },
234
+ {
235
+ "epoch": 3.87,
236
+ "learning_rate": 6.44851741675463e-05,
237
+ "loss": 0.0149,
238
+ "step": 4250
239
+ },
240
+ {
241
+ "epoch": 3.87,
242
+ "eval_loss": 0.049005962908267975,
243
+ "eval_runtime": 125.2924,
244
+ "eval_samples_per_second": 19.929,
245
+ "eval_steps_per_second": 1.253,
246
+ "step": 4250
247
+ },
248
+ {
249
+ "epoch": 4.1,
250
+ "learning_rate": 6.208617215238462e-05,
251
+ "loss": 0.012,
252
+ "step": 4500
253
+ },
254
+ {
255
+ "epoch": 4.1,
256
+ "eval_loss": 0.05201614275574684,
257
+ "eval_runtime": 125.3236,
258
+ "eval_samples_per_second": 19.924,
259
+ "eval_steps_per_second": 1.253,
260
+ "step": 4500
261
+ },
262
+ {
263
+ "epoch": 4.33,
264
+ "learning_rate": 5.968717013722291e-05,
265
+ "loss": 0.0094,
266
+ "step": 4750
267
+ },
268
+ {
269
+ "epoch": 4.33,
270
+ "eval_loss": 0.052882954478263855,
271
+ "eval_runtime": 125.3182,
272
+ "eval_samples_per_second": 19.925,
273
+ "eval_steps_per_second": 1.253,
274
+ "step": 4750
275
+ },
276
+ {
277
+ "epoch": 4.55,
278
+ "learning_rate": 5.7288168122061226e-05,
279
+ "loss": 0.0104,
280
+ "step": 5000
281
+ },
282
+ {
283
+ "epoch": 4.55,
284
+ "eval_loss": 0.05443257838487625,
285
+ "eval_runtime": 125.308,
286
+ "eval_samples_per_second": 19.927,
287
+ "eval_steps_per_second": 1.253,
288
+ "step": 5000
289
+ },
290
+ {
291
+ "epoch": 4.78,
292
+ "learning_rate": 5.488916610689954e-05,
293
+ "loss": 0.0095,
294
+ "step": 5250
295
+ },
296
+ {
297
+ "epoch": 4.78,
298
+ "eval_loss": 0.0512896366417408,
299
+ "eval_runtime": 125.3186,
300
+ "eval_samples_per_second": 19.925,
301
+ "eval_steps_per_second": 1.253,
302
+ "step": 5250
303
+ },
304
+ {
305
+ "epoch": 5.01,
306
+ "learning_rate": 5.249016409173784e-05,
307
+ "loss": 0.0099,
308
+ "step": 5500
309
+ },
310
+ {
311
+ "epoch": 5.01,
312
+ "eval_loss": 0.05250149220228195,
313
+ "eval_runtime": 125.3202,
314
+ "eval_samples_per_second": 19.925,
315
+ "eval_steps_per_second": 1.253,
316
+ "step": 5500
317
+ },
318
+ {
319
+ "epoch": 5.24,
320
+ "learning_rate": 5.009116207657615e-05,
321
+ "loss": 0.0067,
322
+ "step": 5750
323
+ },
324
+ {
325
+ "epoch": 5.24,
326
+ "eval_loss": 0.05294517055153847,
327
+ "eval_runtime": 125.3036,
328
+ "eval_samples_per_second": 19.928,
329
+ "eval_steps_per_second": 1.253,
330
+ "step": 5750
331
+ },
332
+ {
333
+ "epoch": 5.47,
334
+ "learning_rate": 4.769216006141446e-05,
335
+ "loss": 0.0064,
336
+ "step": 6000
337
+ },
338
+ {
339
+ "epoch": 5.47,
340
+ "eval_loss": 0.05718787759542465,
341
+ "eval_runtime": 125.3174,
342
+ "eval_samples_per_second": 19.925,
343
+ "eval_steps_per_second": 1.253,
344
+ "step": 6000
345
+ },
346
+ {
347
+ "epoch": 5.69,
348
+ "learning_rate": 4.5293158046252756e-05,
349
+ "loss": 0.0062,
350
+ "step": 6250
351
+ },
352
+ {
353
+ "epoch": 5.69,
354
+ "eval_loss": 0.0588238462805748,
355
+ "eval_runtime": 125.32,
356
+ "eval_samples_per_second": 19.925,
357
+ "eval_steps_per_second": 1.253,
358
+ "step": 6250
359
+ },
360
+ {
361
+ "epoch": 5.92,
362
+ "learning_rate": 4.289415603109107e-05,
363
+ "loss": 0.0066,
364
+ "step": 6500
365
+ },
366
+ {
367
+ "epoch": 5.92,
368
+ "eval_loss": 0.055590804666280746,
369
+ "eval_runtime": 125.32,
370
+ "eval_samples_per_second": 19.925,
371
+ "eval_steps_per_second": 1.253,
372
+ "step": 6500
373
+ },
374
+ {
375
+ "epoch": 6.15,
376
+ "learning_rate": 4.0495154015929375e-05,
377
+ "loss": 0.0049,
378
+ "step": 6750
379
+ },
380
+ {
381
+ "epoch": 6.15,
382
+ "eval_loss": 0.060405001044273376,
383
+ "eval_runtime": 125.3175,
384
+ "eval_samples_per_second": 19.925,
385
+ "eval_steps_per_second": 1.253,
386
+ "step": 6750
387
+ },
388
+ {
389
+ "epoch": 6.38,
390
+ "learning_rate": 3.809615200076768e-05,
391
+ "loss": 0.0044,
392
+ "step": 7000
393
+ },
394
+ {
395
+ "epoch": 6.38,
396
+ "eval_loss": 0.0592646524310112,
397
+ "eval_runtime": 125.3136,
398
+ "eval_samples_per_second": 19.926,
399
+ "eval_steps_per_second": 1.253,
400
+ "step": 7000
401
+ },
402
+ {
403
+ "epoch": 6.6,
404
+ "learning_rate": 3.569714998560599e-05,
405
+ "loss": 0.0042,
406
+ "step": 7250
407
+ },
408
+ {
409
+ "epoch": 6.6,
410
+ "eval_loss": 0.059081513434648514,
411
+ "eval_runtime": 125.2994,
412
+ "eval_samples_per_second": 19.928,
413
+ "eval_steps_per_second": 1.253,
414
+ "step": 7250
415
+ },
416
+ {
417
+ "epoch": 6.83,
418
+ "learning_rate": 3.32981479704443e-05,
419
+ "loss": 0.0048,
420
+ "step": 7500
421
+ },
422
+ {
423
+ "epoch": 6.83,
424
+ "eval_loss": 0.06123210862278938,
425
+ "eval_runtime": 125.3056,
426
+ "eval_samples_per_second": 19.927,
427
+ "eval_steps_per_second": 1.253,
428
+ "step": 7500
429
+ },
430
+ {
431
+ "epoch": 7.06,
432
+ "learning_rate": 3.0899145955282606e-05,
433
+ "loss": 0.004,
434
+ "step": 7750
435
+ },
436
+ {
437
+ "epoch": 7.06,
438
+ "eval_loss": 0.060906291007995605,
439
+ "eval_runtime": 125.3113,
440
+ "eval_samples_per_second": 19.926,
441
+ "eval_steps_per_second": 1.253,
442
+ "step": 7750
443
+ },
444
+ {
445
+ "epoch": 7.29,
446
+ "learning_rate": 2.850014394012091e-05,
447
+ "loss": 0.003,
448
+ "step": 8000
449
+ },
450
+ {
451
+ "epoch": 7.29,
452
+ "eval_loss": 0.06742047518491745,
453
+ "eval_runtime": 125.3117,
454
+ "eval_samples_per_second": 19.926,
455
+ "eval_steps_per_second": 1.253,
456
+ "step": 8000
457
+ },
458
+ {
459
+ "epoch": 7.52,
460
+ "learning_rate": 2.6101141924959215e-05,
461
+ "loss": 0.003,
462
+ "step": 8250
463
+ },
464
+ {
465
+ "epoch": 7.52,
466
+ "eval_loss": 0.0640687569975853,
467
+ "eval_runtime": 125.31,
468
+ "eval_samples_per_second": 19.927,
469
+ "eval_steps_per_second": 1.253,
470
+ "step": 8250
471
+ },
472
+ {
473
+ "epoch": 7.74,
474
+ "learning_rate": 2.3702139909797524e-05,
475
+ "loss": 0.0027,
476
+ "step": 8500
477
+ },
478
+ {
479
+ "epoch": 7.74,
480
+ "eval_loss": 0.06774434447288513,
481
+ "eval_runtime": 125.316,
482
+ "eval_samples_per_second": 19.926,
483
+ "eval_steps_per_second": 1.253,
484
+ "step": 8500
485
+ },
486
+ {
487
+ "epoch": 7.97,
488
+ "learning_rate": 2.1303137894635834e-05,
489
+ "loss": 0.0028,
490
+ "step": 8750
491
+ },
492
+ {
493
+ "epoch": 7.97,
494
+ "eval_loss": 0.06737840920686722,
495
+ "eval_runtime": 125.323,
496
+ "eval_samples_per_second": 19.925,
497
+ "eval_steps_per_second": 1.253,
498
+ "step": 8750
499
+ },
500
+ {
501
+ "epoch": 8.2,
502
+ "learning_rate": 1.890413587947414e-05,
503
+ "loss": 0.0021,
504
+ "step": 9000
505
+ },
506
+ {
507
+ "epoch": 8.2,
508
+ "eval_loss": 0.06941425800323486,
509
+ "eval_runtime": 125.3147,
510
+ "eval_samples_per_second": 19.926,
511
+ "eval_steps_per_second": 1.253,
512
+ "step": 9000
513
+ },
514
+ {
515
+ "epoch": 8.43,
516
+ "learning_rate": 1.6505133864312446e-05,
517
+ "loss": 0.0018,
518
+ "step": 9250
519
+ },
520
+ {
521
+ "epoch": 8.43,
522
+ "eval_loss": 0.07149343937635422,
523
+ "eval_runtime": 125.3155,
524
+ "eval_samples_per_second": 19.926,
525
+ "eval_steps_per_second": 1.253,
526
+ "step": 9250
527
+ },
528
+ {
529
+ "epoch": 8.65,
530
+ "learning_rate": 1.4106131849150753e-05,
531
+ "loss": 0.0021,
532
+ "step": 9500
533
+ },
534
+ {
535
+ "epoch": 8.65,
536
+ "eval_loss": 0.06807977706193924,
537
+ "eval_runtime": 125.3164,
538
+ "eval_samples_per_second": 19.926,
539
+ "eval_steps_per_second": 1.253,
540
+ "step": 9500
541
+ },
542
+ {
543
+ "epoch": 8.88,
544
+ "learning_rate": 1.1707129833989061e-05,
545
+ "loss": 0.0017,
546
+ "step": 9750
547
+ },
548
+ {
549
+ "epoch": 8.88,
550
+ "eval_loss": 0.07044515013694763,
551
+ "eval_runtime": 125.3307,
552
+ "eval_samples_per_second": 19.923,
553
+ "eval_steps_per_second": 1.253,
554
+ "step": 9750
555
+ },
556
+ {
557
+ "epoch": 9.11,
558
+ "learning_rate": 9.308127818827369e-06,
559
+ "loss": 0.0014,
560
+ "step": 10000
561
+ },
562
+ {
563
+ "epoch": 9.11,
564
+ "eval_loss": 0.07252407819032669,
565
+ "eval_runtime": 125.329,
566
+ "eval_samples_per_second": 19.924,
567
+ "eval_steps_per_second": 1.253,
568
+ "step": 10000
569
+ },
570
+ {
571
+ "epoch": 9.34,
572
+ "learning_rate": 6.909125803665675e-06,
573
+ "loss": 0.0012,
574
+ "step": 10250
575
+ },
576
+ {
577
+ "epoch": 9.34,
578
+ "eval_loss": 0.07298342883586884,
579
+ "eval_runtime": 125.3163,
580
+ "eval_samples_per_second": 19.926,
581
+ "eval_steps_per_second": 1.253,
582
+ "step": 10250
583
+ },
584
+ {
585
+ "epoch": 9.57,
586
+ "learning_rate": 4.510123788503983e-06,
587
+ "loss": 0.0012,
588
+ "step": 10500
589
+ },
590
+ {
591
+ "epoch": 9.57,
592
+ "eval_loss": 0.07320970296859741,
593
+ "eval_runtime": 125.318,
594
+ "eval_samples_per_second": 19.925,
595
+ "eval_steps_per_second": 1.253,
596
+ "step": 10500
597
+ },
598
+ {
599
+ "epoch": 9.79,
600
+ "learning_rate": 2.11112177334229e-06,
601
+ "loss": 0.0013,
602
+ "step": 10750
603
+ },
604
+ {
605
+ "epoch": 9.79,
606
+ "eval_loss": 0.07287949323654175,
607
+ "eval_runtime": 125.3257,
608
+ "eval_samples_per_second": 19.924,
609
+ "eval_steps_per_second": 1.253,
610
+ "step": 10750
611
+ }
612
+ ],
613
+ "max_steps": 10970,
614
+ "num_train_epochs": 10,
615
+ "total_flos": 2.0454829414889472e+17,
616
+ "trial_name": null,
617
+ "trial_params": null
618
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4ab448b7200df8a6ea7c01858ce1acb157a82a9450e57880108f11850920715
3
+ size 4027