ypl committed on
Commit
9784007
·
verified ·
1 Parent(s): 166ab0f

End of training

Browse files
README.md CHANGED
@@ -1,6 +1,4 @@
1
  ---
2
- license: apache-2.0
3
- base_model: facebook/bart-base
4
  tags:
5
  - generated_from_trainer
6
  model-index:
@@ -13,9 +11,9 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # bart_test_p2
15
 
16
- This model is a fine-tuned version of [facebook/bart-base](https://huggingface.co/facebook/bart-base) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.0481
19
 
20
  ## Model description
21
 
@@ -34,32 +32,56 @@ More information needed
34
  ### Training hyperparameters
35
 
36
  The following hyperparameters were used during training:
37
- - learning_rate: 0.0001
38
  - train_batch_size: 8
39
  - eval_batch_size: 8
40
  - seed: 42
41
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
  - lr_scheduler_type: linear
43
- - num_epochs: 5
44
 
45
  ### Training results
46
 
47
- | Training Loss | Epoch | Step | Validation Loss |
48
- |:-------------:|:-----:|:----:|:---------------:|
49
- | 0.1378 | 0.34 | 500 | 0.1011 |
50
- | 0.0977 | 0.67 | 1000 | 0.0810 |
51
- | 0.0802 | 1.01 | 1500 | 0.0678 |
52
- | 0.0533 | 1.35 | 2000 | 0.0639 |
53
- | 0.0534 | 1.69 | 2500 | 0.0560 |
54
- | 0.0435 | 2.02 | 3000 | 0.0531 |
55
- | 0.0303 | 2.36 | 3500 | 0.0544 |
56
- | 0.0323 | 2.7 | 4000 | 0.0521 |
57
- | 0.0254 | 3.04 | 4500 | 0.0488 |
58
- | 0.022 | 3.37 | 5000 | 0.0490 |
59
- | 0.0199 | 3.71 | 5500 | 0.0480 |
60
- | 0.0142 | 4.05 | 6000 | 0.0477 |
61
- | 0.0134 | 4.39 | 6500 | 0.0481 |
62
- | 0.0113 | 4.72 | 7000 | 0.0481 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
 
65
  ### Framework versions
 
1
  ---
 
 
2
  tags:
3
  - generated_from_trainer
4
  model-index:
 
11
 
12
  # bart_test_p2
13
 
14
+ This model was trained from scratch on the None dataset.
15
  It achieves the following results on the evaluation set:
16
+ - Loss: 0.0194
17
 
18
  ## Model description
19
 
 
32
  ### Training hyperparameters
33
 
34
  The following hyperparameters were used during training:
35
+ - learning_rate: 1e-05
36
  - train_batch_size: 8
37
  - eval_batch_size: 8
38
  - seed: 42
39
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
40
  - lr_scheduler_type: linear
41
+ - num_epochs: 3
42
 
43
  ### Training results
44
 
45
+ | Training Loss | Epoch | Step | Validation Loss |
46
+ |:-------------:|:-----:|:-----:|:---------------:|
47
+ | 0.0273 | 0.08 | 500 | 0.0224 |
48
+ | 0.0255 | 0.16 | 1000 | 0.0215 |
49
+ | 0.0245 | 0.24 | 1500 | 0.0213 |
50
+ | 0.0234 | 0.32 | 2000 | 0.0211 |
51
+ | 0.025 | 0.39 | 2500 | 0.0207 |
52
+ | 0.0243 | 0.47 | 3000 | 0.0208 |
53
+ | 0.0236 | 0.55 | 3500 | 0.0206 |
54
+ | 0.0246 | 0.63 | 4000 | 0.0204 |
55
+ | 0.0235 | 0.71 | 4500 | 0.0202 |
56
+ | 0.0231 | 0.79 | 5000 | 0.0203 |
57
+ | 0.0221 | 0.87 | 5500 | 0.0201 |
58
+ | 0.0239 | 0.95 | 6000 | 0.0199 |
59
+ | 0.0209 | 1.03 | 6500 | 0.0200 |
60
+ | 0.0193 | 1.1 | 7000 | 0.0198 |
61
+ | 0.0207 | 1.18 | 7500 | 0.0199 |
62
+ | 0.0189 | 1.26 | 8000 | 0.0201 |
63
+ | 0.0193 | 1.34 | 8500 | 0.0200 |
64
+ | 0.0186 | 1.42 | 9000 | 0.0197 |
65
+ | 0.0199 | 1.5 | 9500 | 0.0197 |
66
+ | 0.0207 | 1.58 | 10000 | 0.0195 |
67
+ | 0.0199 | 1.66 | 10500 | 0.0196 |
68
+ | 0.0188 | 1.74 | 11000 | 0.0195 |
69
+ | 0.0194 | 1.81 | 11500 | 0.0194 |
70
+ | 0.0201 | 1.89 | 12000 | 0.0195 |
71
+ | 0.0181 | 1.97 | 12500 | 0.0194 |
72
+ | 0.0177 | 2.05 | 13000 | 0.0194 |
73
+ | 0.0161 | 2.13 | 13500 | 0.0196 |
74
+ | 0.0172 | 2.21 | 14000 | 0.0195 |
75
+ | 0.0184 | 2.29 | 14500 | 0.0195 |
76
+ | 0.0168 | 2.37 | 15000 | 0.0195 |
77
+ | 0.0176 | 2.44 | 15500 | 0.0194 |
78
+ | 0.0177 | 2.52 | 16000 | 0.0194 |
79
+ | 0.0158 | 2.6 | 16500 | 0.0194 |
80
+ | 0.0177 | 2.68 | 17000 | 0.0193 |
81
+ | 0.0179 | 2.76 | 17500 | 0.0193 |
82
+ | 0.0167 | 2.84 | 18000 | 0.0194 |
83
+ | 0.0177 | 2.92 | 18500 | 0.0193 |
84
+ | 0.0171 | 3.0 | 19000 | 0.0194 |
85
 
86
 
87
  ### Framework versions
backup_checkpoint-12000/config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/bart-base",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "gelu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.1,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_ffn_dim": 3072,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 12,
23
+ "encoder_ffn_dim": 3072,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 2,
27
+ "forced_bos_token_id": 0,
28
+ "forced_eos_token_id": 2,
29
+ "gradient_checkpointing": false,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1",
33
+ "2": "LABEL_2"
34
+ },
35
+ "init_std": 0.02,
36
+ "is_encoder_decoder": true,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1,
40
+ "LABEL_2": 2
41
+ },
42
+ "max_position_embeddings": 1024,
43
+ "model_type": "bart",
44
+ "no_repeat_ngram_size": 3,
45
+ "normalize_before": false,
46
+ "normalize_embedding": true,
47
+ "num_beams": 4,
48
+ "num_hidden_layers": 6,
49
+ "pad_token_id": 1,
50
+ "scale_embedding": false,
51
+ "task_specific_params": {
52
+ "summarization": {
53
+ "length_penalty": 1.0,
54
+ "max_length": 128,
55
+ "min_length": 12,
56
+ "num_beams": 4
57
+ },
58
+ "summarization_cnn": {
59
+ "length_penalty": 2.0,
60
+ "max_length": 142,
61
+ "min_length": 56,
62
+ "num_beams": 4
63
+ },
64
+ "summarization_xsum": {
65
+ "length_penalty": 1.0,
66
+ "max_length": 62,
67
+ "min_length": 11,
68
+ "num_beams": 6
69
+ }
70
+ },
71
+ "torch_dtype": "float32",
72
+ "transformers_version": "4.37.0.dev0",
73
+ "use_cache": true,
74
+ "vocab_size": 50265
75
+ }
backup_checkpoint-12000/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "early_stopping": true,
6
+ "eos_token_id": 2,
7
+ "forced_bos_token_id": 0,
8
+ "forced_eos_token_id": 2,
9
+ "no_repeat_ngram_size": 3,
10
+ "num_beams": 4,
11
+ "pad_token_id": 1,
12
+ "transformers_version": "4.37.0.dev0"
13
+ }
backup_checkpoint-12000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5990d57924b063382afa18e7f1e8a07e2208b81d4c47b82a69b18a27be4f3a0
3
+ size 557912620
backup_checkpoint-12000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01995ea2be2ccd0fa68c02532530506f317d4b6e7c0734555d85d1b276da1a11
3
+ size 1115579898
backup_checkpoint-12000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad3b281e73f1cb93b0cc5cffda813f0a4ed9d78ba586dc62adf279c06a2b7600
3
+ size 14244
backup_checkpoint-12000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:439eb315d94372a87edf0a17384904142733c05e44126c626d64f9d13dc8e1a9
3
+ size 1064
backup_checkpoint-12000/trainer_state.json ADDED
@@ -0,0 +1,939 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.8927444794952681,
5
+ "eval_steps": 500,
6
+ "global_step": 12000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 9.999684542586752e-05,
14
+ "loss": 12.4787,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.02,
19
+ "learning_rate": 9.96845425867508e-05,
20
+ "loss": 1.3146,
21
+ "step": 100
22
+ },
23
+ {
24
+ "epoch": 0.03,
25
+ "learning_rate": 9.936908517350158e-05,
26
+ "loss": 0.0928,
27
+ "step": 200
28
+ },
29
+ {
30
+ "epoch": 0.05,
31
+ "learning_rate": 9.905362776025237e-05,
32
+ "loss": 0.0903,
33
+ "step": 300
34
+ },
35
+ {
36
+ "epoch": 0.06,
37
+ "learning_rate": 9.873817034700316e-05,
38
+ "loss": 0.0675,
39
+ "step": 400
40
+ },
41
+ {
42
+ "epoch": 0.08,
43
+ "learning_rate": 9.842271293375394e-05,
44
+ "loss": 0.0744,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.08,
49
+ "eval_loss": 0.06298290193080902,
50
+ "eval_runtime": 191.3725,
51
+ "eval_samples_per_second": 113.574,
52
+ "eval_steps_per_second": 14.197,
53
+ "step": 500
54
+ },
55
+ {
56
+ "epoch": 0.09,
57
+ "learning_rate": 9.810725552050474e-05,
58
+ "loss": 0.0682,
59
+ "step": 600
60
+ },
61
+ {
62
+ "epoch": 0.11,
63
+ "learning_rate": 9.779179810725552e-05,
64
+ "loss": 0.072,
65
+ "step": 700
66
+ },
67
+ {
68
+ "epoch": 0.13,
69
+ "learning_rate": 9.747634069400632e-05,
70
+ "loss": 0.0731,
71
+ "step": 800
72
+ },
73
+ {
74
+ "epoch": 0.14,
75
+ "learning_rate": 9.71608832807571e-05,
76
+ "loss": 0.0638,
77
+ "step": 900
78
+ },
79
+ {
80
+ "epoch": 0.16,
81
+ "learning_rate": 9.684542586750788e-05,
82
+ "loss": 0.062,
83
+ "step": 1000
84
+ },
85
+ {
86
+ "epoch": 0.16,
87
+ "eval_loss": 0.056409742683172226,
88
+ "eval_runtime": 191.8919,
89
+ "eval_samples_per_second": 113.267,
90
+ "eval_steps_per_second": 14.159,
91
+ "step": 1000
92
+ },
93
+ {
94
+ "epoch": 0.17,
95
+ "learning_rate": 9.652996845425868e-05,
96
+ "loss": 0.0663,
97
+ "step": 1100
98
+ },
99
+ {
100
+ "epoch": 0.19,
101
+ "learning_rate": 9.621451104100947e-05,
102
+ "loss": 0.0598,
103
+ "step": 1200
104
+ },
105
+ {
106
+ "epoch": 0.21,
107
+ "learning_rate": 9.589905362776026e-05,
108
+ "loss": 0.0625,
109
+ "step": 1300
110
+ },
111
+ {
112
+ "epoch": 0.22,
113
+ "learning_rate": 9.558359621451105e-05,
114
+ "loss": 0.0673,
115
+ "step": 1400
116
+ },
117
+ {
118
+ "epoch": 0.24,
119
+ "learning_rate": 9.526813880126184e-05,
120
+ "loss": 0.0625,
121
+ "step": 1500
122
+ },
123
+ {
124
+ "epoch": 0.24,
125
+ "eval_loss": 0.05498537793755531,
126
+ "eval_runtime": 191.898,
127
+ "eval_samples_per_second": 113.263,
128
+ "eval_steps_per_second": 14.159,
129
+ "step": 1500
130
+ },
131
+ {
132
+ "epoch": 0.25,
133
+ "learning_rate": 9.495268138801262e-05,
134
+ "loss": 0.0606,
135
+ "step": 1600
136
+ },
137
+ {
138
+ "epoch": 0.27,
139
+ "learning_rate": 9.463722397476341e-05,
140
+ "loss": 0.0684,
141
+ "step": 1700
142
+ },
143
+ {
144
+ "epoch": 0.28,
145
+ "learning_rate": 9.43217665615142e-05,
146
+ "loss": 0.0607,
147
+ "step": 1800
148
+ },
149
+ {
150
+ "epoch": 0.3,
151
+ "learning_rate": 9.400630914826499e-05,
152
+ "loss": 0.0679,
153
+ "step": 1900
154
+ },
155
+ {
156
+ "epoch": 0.32,
157
+ "learning_rate": 9.369085173501577e-05,
158
+ "loss": 0.0633,
159
+ "step": 2000
160
+ },
161
+ {
162
+ "epoch": 0.32,
163
+ "eval_loss": 0.052233804017305374,
164
+ "eval_runtime": 191.8762,
165
+ "eval_samples_per_second": 113.276,
166
+ "eval_steps_per_second": 14.16,
167
+ "step": 2000
168
+ },
169
+ {
170
+ "epoch": 0.33,
171
+ "learning_rate": 9.337539432176656e-05,
172
+ "loss": 0.061,
173
+ "step": 2100
174
+ },
175
+ {
176
+ "epoch": 0.35,
177
+ "learning_rate": 9.305993690851735e-05,
178
+ "loss": 0.0715,
179
+ "step": 2200
180
+ },
181
+ {
182
+ "epoch": 0.36,
183
+ "learning_rate": 9.274447949526815e-05,
184
+ "loss": 0.0586,
185
+ "step": 2300
186
+ },
187
+ {
188
+ "epoch": 0.38,
189
+ "learning_rate": 9.242902208201893e-05,
190
+ "loss": 0.0581,
191
+ "step": 2400
192
+ },
193
+ {
194
+ "epoch": 0.39,
195
+ "learning_rate": 9.211356466876973e-05,
196
+ "loss": 0.0588,
197
+ "step": 2500
198
+ },
199
+ {
200
+ "epoch": 0.39,
201
+ "eval_loss": 0.05040860176086426,
202
+ "eval_runtime": 191.6049,
203
+ "eval_samples_per_second": 113.437,
204
+ "eval_steps_per_second": 14.18,
205
+ "step": 2500
206
+ },
207
+ {
208
+ "epoch": 0.41,
209
+ "learning_rate": 9.179810725552051e-05,
210
+ "loss": 0.0572,
211
+ "step": 2600
212
+ },
213
+ {
214
+ "epoch": 0.43,
215
+ "learning_rate": 9.148264984227129e-05,
216
+ "loss": 0.0555,
217
+ "step": 2700
218
+ },
219
+ {
220
+ "epoch": 0.44,
221
+ "learning_rate": 9.116719242902209e-05,
222
+ "loss": 0.0547,
223
+ "step": 2800
224
+ },
225
+ {
226
+ "epoch": 0.46,
227
+ "learning_rate": 9.085173501577287e-05,
228
+ "loss": 0.0596,
229
+ "step": 2900
230
+ },
231
+ {
232
+ "epoch": 0.47,
233
+ "learning_rate": 9.053627760252367e-05,
234
+ "loss": 0.0593,
235
+ "step": 3000
236
+ },
237
+ {
238
+ "epoch": 0.47,
239
+ "eval_loss": 0.04637610912322998,
240
+ "eval_runtime": 192.1829,
241
+ "eval_samples_per_second": 113.095,
242
+ "eval_steps_per_second": 14.138,
243
+ "step": 3000
244
+ },
245
+ {
246
+ "epoch": 0.49,
247
+ "learning_rate": 9.022082018927446e-05,
248
+ "loss": 0.0536,
249
+ "step": 3100
250
+ },
251
+ {
252
+ "epoch": 0.5,
253
+ "learning_rate": 8.990536277602523e-05,
254
+ "loss": 0.0506,
255
+ "step": 3200
256
+ },
257
+ {
258
+ "epoch": 0.52,
259
+ "learning_rate": 8.958990536277603e-05,
260
+ "loss": 0.0601,
261
+ "step": 3300
262
+ },
263
+ {
264
+ "epoch": 0.54,
265
+ "learning_rate": 8.927444794952682e-05,
266
+ "loss": 0.05,
267
+ "step": 3400
268
+ },
269
+ {
270
+ "epoch": 0.55,
271
+ "learning_rate": 8.89589905362776e-05,
272
+ "loss": 0.0492,
273
+ "step": 3500
274
+ },
275
+ {
276
+ "epoch": 0.55,
277
+ "eval_loss": 0.0455920547246933,
278
+ "eval_runtime": 192.0796,
279
+ "eval_samples_per_second": 113.156,
280
+ "eval_steps_per_second": 14.145,
281
+ "step": 3500
282
+ },
283
+ {
284
+ "epoch": 0.57,
285
+ "learning_rate": 8.86435331230284e-05,
286
+ "loss": 0.0547,
287
+ "step": 3600
288
+ },
289
+ {
290
+ "epoch": 0.58,
291
+ "learning_rate": 8.832807570977918e-05,
292
+ "loss": 0.0582,
293
+ "step": 3700
294
+ },
295
+ {
296
+ "epoch": 0.6,
297
+ "learning_rate": 8.801261829652997e-05,
298
+ "loss": 0.0535,
299
+ "step": 3800
300
+ },
301
+ {
302
+ "epoch": 0.62,
303
+ "learning_rate": 8.769716088328076e-05,
304
+ "loss": 0.0554,
305
+ "step": 3900
306
+ },
307
+ {
308
+ "epoch": 0.63,
309
+ "learning_rate": 8.738170347003154e-05,
310
+ "loss": 0.0506,
311
+ "step": 4000
312
+ },
313
+ {
314
+ "epoch": 0.63,
315
+ "eval_loss": 0.04419832304120064,
316
+ "eval_runtime": 192.193,
317
+ "eval_samples_per_second": 113.089,
318
+ "eval_steps_per_second": 14.137,
319
+ "step": 4000
320
+ },
321
+ {
322
+ "epoch": 0.65,
323
+ "learning_rate": 8.706624605678234e-05,
324
+ "loss": 0.0508,
325
+ "step": 4100
326
+ },
327
+ {
328
+ "epoch": 0.66,
329
+ "learning_rate": 8.675078864353314e-05,
330
+ "loss": 0.0566,
331
+ "step": 4200
332
+ },
333
+ {
334
+ "epoch": 0.68,
335
+ "learning_rate": 8.64353312302839e-05,
336
+ "loss": 0.0556,
337
+ "step": 4300
338
+ },
339
+ {
340
+ "epoch": 0.69,
341
+ "learning_rate": 8.61198738170347e-05,
342
+ "loss": 0.0595,
343
+ "step": 4400
344
+ },
345
+ {
346
+ "epoch": 0.71,
347
+ "learning_rate": 8.58044164037855e-05,
348
+ "loss": 0.0584,
349
+ "step": 4500
350
+ },
351
+ {
352
+ "epoch": 0.71,
353
+ "eval_loss": 0.043363627046346664,
354
+ "eval_runtime": 191.6803,
355
+ "eval_samples_per_second": 113.392,
356
+ "eval_steps_per_second": 14.175,
357
+ "step": 4500
358
+ },
359
+ {
360
+ "epoch": 0.73,
361
+ "learning_rate": 8.548895899053628e-05,
362
+ "loss": 0.0546,
363
+ "step": 4600
364
+ },
365
+ {
366
+ "epoch": 0.74,
367
+ "learning_rate": 8.517350157728708e-05,
368
+ "loss": 0.0457,
369
+ "step": 4700
370
+ },
371
+ {
372
+ "epoch": 0.76,
373
+ "learning_rate": 8.485804416403787e-05,
374
+ "loss": 0.0531,
375
+ "step": 4800
376
+ },
377
+ {
378
+ "epoch": 0.77,
379
+ "learning_rate": 8.454258675078864e-05,
380
+ "loss": 0.0506,
381
+ "step": 4900
382
+ },
383
+ {
384
+ "epoch": 0.79,
385
+ "learning_rate": 8.422712933753944e-05,
386
+ "loss": 0.0455,
387
+ "step": 5000
388
+ },
389
+ {
390
+ "epoch": 0.79,
391
+ "eval_loss": 0.04175921157002449,
392
+ "eval_runtime": 192.0012,
393
+ "eval_samples_per_second": 113.202,
394
+ "eval_steps_per_second": 14.151,
395
+ "step": 5000
396
+ },
397
+ {
398
+ "epoch": 0.8,
399
+ "learning_rate": 8.391167192429022e-05,
400
+ "loss": 0.053,
401
+ "step": 5100
402
+ },
403
+ {
404
+ "epoch": 0.82,
405
+ "learning_rate": 8.359621451104101e-05,
406
+ "loss": 0.0499,
407
+ "step": 5200
408
+ },
409
+ {
410
+ "epoch": 0.84,
411
+ "learning_rate": 8.328075709779181e-05,
412
+ "loss": 0.0407,
413
+ "step": 5300
414
+ },
415
+ {
416
+ "epoch": 0.85,
417
+ "learning_rate": 8.296529968454258e-05,
418
+ "loss": 0.0559,
419
+ "step": 5400
420
+ },
421
+ {
422
+ "epoch": 0.87,
423
+ "learning_rate": 8.264984227129337e-05,
424
+ "loss": 0.0477,
425
+ "step": 5500
426
+ },
427
+ {
428
+ "epoch": 0.87,
429
+ "eval_loss": 0.042047169059515,
430
+ "eval_runtime": 192.1,
431
+ "eval_samples_per_second": 113.144,
432
+ "eval_steps_per_second": 14.144,
433
+ "step": 5500
434
+ },
435
+ {
436
+ "epoch": 0.88,
437
+ "learning_rate": 8.233438485804417e-05,
438
+ "loss": 0.0488,
439
+ "step": 5600
440
+ },
441
+ {
442
+ "epoch": 0.9,
443
+ "learning_rate": 8.201892744479495e-05,
444
+ "loss": 0.0537,
445
+ "step": 5700
446
+ },
447
+ {
448
+ "epoch": 0.91,
449
+ "learning_rate": 8.170347003154575e-05,
450
+ "loss": 0.0515,
451
+ "step": 5800
452
+ },
453
+ {
454
+ "epoch": 0.93,
455
+ "learning_rate": 8.138801261829655e-05,
456
+ "loss": 0.0511,
457
+ "step": 5900
458
+ },
459
+ {
460
+ "epoch": 0.95,
461
+ "learning_rate": 8.107255520504731e-05,
462
+ "loss": 0.0548,
463
+ "step": 6000
464
+ },
465
+ {
466
+ "epoch": 0.95,
467
+ "eval_loss": 0.039724551141262054,
468
+ "eval_runtime": 191.9769,
469
+ "eval_samples_per_second": 113.217,
470
+ "eval_steps_per_second": 14.153,
471
+ "step": 6000
472
+ },
473
+ {
474
+ "epoch": 0.96,
475
+ "learning_rate": 8.075709779179811e-05,
476
+ "loss": 0.0482,
477
+ "step": 6100
478
+ },
479
+ {
480
+ "epoch": 0.98,
481
+ "learning_rate": 8.04416403785489e-05,
482
+ "loss": 0.0474,
483
+ "step": 6200
484
+ },
485
+ {
486
+ "epoch": 0.99,
487
+ "learning_rate": 8.012618296529969e-05,
488
+ "loss": 0.0488,
489
+ "step": 6300
490
+ },
491
+ {
492
+ "epoch": 1.01,
493
+ "learning_rate": 7.981072555205048e-05,
494
+ "loss": 0.0346,
495
+ "step": 6400
496
+ },
497
+ {
498
+ "epoch": 1.03,
499
+ "learning_rate": 7.949526813880127e-05,
500
+ "loss": 0.035,
501
+ "step": 6500
502
+ },
503
+ {
504
+ "epoch": 1.03,
505
+ "eval_loss": 0.039325978606939316,
506
+ "eval_runtime": 191.6608,
507
+ "eval_samples_per_second": 113.403,
508
+ "eval_steps_per_second": 14.176,
509
+ "step": 6500
510
+ },
511
+ {
512
+ "epoch": 1.04,
513
+ "learning_rate": 7.917981072555205e-05,
514
+ "loss": 0.0355,
515
+ "step": 6600
516
+ },
517
+ {
518
+ "epoch": 1.06,
519
+ "learning_rate": 7.886435331230284e-05,
520
+ "loss": 0.0363,
521
+ "step": 6700
522
+ },
523
+ {
524
+ "epoch": 1.07,
525
+ "learning_rate": 7.854889589905363e-05,
526
+ "loss": 0.0407,
527
+ "step": 6800
528
+ },
529
+ {
530
+ "epoch": 1.09,
531
+ "learning_rate": 7.823343848580442e-05,
532
+ "loss": 0.0385,
533
+ "step": 6900
534
+ },
535
+ {
536
+ "epoch": 1.1,
537
+ "learning_rate": 7.791798107255522e-05,
538
+ "loss": 0.0347,
539
+ "step": 7000
540
+ },
541
+ {
542
+ "epoch": 1.1,
543
+ "eval_loss": 0.03924637660384178,
544
+ "eval_runtime": 192.1827,
545
+ "eval_samples_per_second": 113.096,
546
+ "eval_steps_per_second": 14.138,
547
+ "step": 7000
548
+ },
549
+ {
550
+ "epoch": 1.12,
551
+ "learning_rate": 7.760252365930599e-05,
552
+ "loss": 0.0357,
553
+ "step": 7100
554
+ },
555
+ {
556
+ "epoch": 1.14,
557
+ "learning_rate": 7.728706624605678e-05,
558
+ "loss": 0.041,
559
+ "step": 7200
560
+ },
561
+ {
562
+ "epoch": 1.15,
563
+ "learning_rate": 7.697160883280758e-05,
564
+ "loss": 0.0313,
565
+ "step": 7300
566
+ },
567
+ {
568
+ "epoch": 1.17,
569
+ "learning_rate": 7.665615141955836e-05,
570
+ "loss": 0.0336,
571
+ "step": 7400
572
+ },
573
+ {
574
+ "epoch": 1.18,
575
+ "learning_rate": 7.634069400630916e-05,
576
+ "loss": 0.0303,
577
+ "step": 7500
578
+ },
579
+ {
580
+ "epoch": 1.18,
581
+ "eval_loss": 0.03979608044028282,
582
+ "eval_runtime": 192.103,
583
+ "eval_samples_per_second": 113.142,
584
+ "eval_steps_per_second": 14.143,
585
+ "step": 7500
586
+ },
587
+ {
588
+ "epoch": 1.2,
589
+ "learning_rate": 7.602523659305994e-05,
590
+ "loss": 0.0356,
591
+ "step": 7600
592
+ },
593
+ {
594
+ "epoch": 1.21,
595
+ "learning_rate": 7.570977917981072e-05,
596
+ "loss": 0.0379,
597
+ "step": 7700
598
+ },
599
+ {
600
+ "epoch": 1.23,
601
+ "learning_rate": 7.539432176656152e-05,
602
+ "loss": 0.0365,
603
+ "step": 7800
604
+ },
605
+ {
606
+ "epoch": 1.25,
607
+ "learning_rate": 7.50788643533123e-05,
608
+ "loss": 0.0393,
609
+ "step": 7900
610
+ },
611
+ {
612
+ "epoch": 1.26,
613
+ "learning_rate": 7.47634069400631e-05,
614
+ "loss": 0.0374,
615
+ "step": 8000
616
+ },
617
+ {
618
+ "epoch": 1.26,
619
+ "eval_loss": 0.038463614881038666,
620
+ "eval_runtime": 192.1601,
621
+ "eval_samples_per_second": 113.109,
622
+ "eval_steps_per_second": 14.139,
623
+ "step": 8000
624
+ },
625
+ {
626
+ "epoch": 1.28,
627
+ "learning_rate": 7.444794952681389e-05,
628
+ "loss": 0.0303,
629
+ "step": 8100
630
+ },
631
+ {
632
+ "epoch": 1.29,
633
+ "learning_rate": 7.413249211356468e-05,
634
+ "loss": 0.0384,
635
+ "step": 8200
636
+ },
637
+ {
638
+ "epoch": 1.31,
639
+ "learning_rate": 7.381703470031546e-05,
640
+ "loss": 0.0383,
641
+ "step": 8300
642
+ },
643
+ {
644
+ "epoch": 1.32,
645
+ "learning_rate": 7.350157728706625e-05,
646
+ "loss": 0.0378,
647
+ "step": 8400
648
+ },
649
+ {
650
+ "epoch": 1.34,
651
+ "learning_rate": 7.318611987381704e-05,
652
+ "loss": 0.0343,
653
+ "step": 8500
654
+ },
655
+ {
656
+ "epoch": 1.34,
657
+ "eval_loss": 0.03961439058184624,
658
+ "eval_runtime": 192.1114,
659
+ "eval_samples_per_second": 113.137,
660
+ "eval_steps_per_second": 14.143,
661
+ "step": 8500
662
+ },
663
+ {
664
+ "epoch": 1.36,
665
+ "learning_rate": 7.287066246056783e-05,
666
+ "loss": 0.0362,
667
+ "step": 8600
668
+ },
669
+ {
670
+ "epoch": 1.37,
671
+ "learning_rate": 7.255520504731861e-05,
672
+ "loss": 0.0435,
673
+ "step": 8700
674
+ },
675
+ {
676
+ "epoch": 1.39,
677
+ "learning_rate": 7.22397476340694e-05,
678
+ "loss": 0.0382,
679
+ "step": 8800
680
+ },
681
+ {
682
+ "epoch": 1.4,
683
+ "learning_rate": 7.192429022082019e-05,
684
+ "loss": 0.0439,
685
+ "step": 8900
686
+ },
687
+ {
688
+ "epoch": 1.42,
689
+ "learning_rate": 7.160883280757098e-05,
690
+ "loss": 0.0374,
691
+ "step": 9000
692
+ },
693
+ {
694
+ "epoch": 1.42,
695
+ "eval_loss": 0.037410151213407516,
696
+ "eval_runtime": 192.1572,
697
+ "eval_samples_per_second": 113.111,
698
+ "eval_steps_per_second": 14.139,
699
+ "step": 9000
700
+ },
701
+ {
702
+ "epoch": 1.44,
703
+ "learning_rate": 7.129337539432177e-05,
704
+ "loss": 0.0342,
705
+ "step": 9100
706
+ },
707
+ {
708
+ "epoch": 1.45,
709
+ "learning_rate": 7.097791798107257e-05,
710
+ "loss": 0.0337,
711
+ "step": 9200
712
+ },
713
+ {
714
+ "epoch": 1.47,
715
+ "learning_rate": 7.066246056782335e-05,
716
+ "loss": 0.0358,
717
+ "step": 9300
718
+ },
719
+ {
720
+ "epoch": 1.48,
721
+ "learning_rate": 7.034700315457413e-05,
722
+ "loss": 0.0301,
723
+ "step": 9400
724
+ },
725
+ {
726
+ "epoch": 1.5,
727
+ "learning_rate": 7.003154574132493e-05,
728
+ "loss": 0.0362,
729
+ "step": 9500
730
+ },
731
+ {
732
+ "epoch": 1.5,
733
+ "eval_loss": 0.03637044504284859,
734
+ "eval_runtime": 192.1621,
735
+ "eval_samples_per_second": 113.108,
736
+ "eval_steps_per_second": 14.139,
737
+ "step": 9500
738
+ },
739
+ {
740
+ "epoch": 1.51,
741
+ "learning_rate": 6.971608832807571e-05,
742
+ "loss": 0.037,
743
+ "step": 9600
744
+ },
745
+ {
746
+ "epoch": 1.53,
747
+ "learning_rate": 6.94006309148265e-05,
748
+ "loss": 0.0374,
749
+ "step": 9700
750
+ },
751
+ {
752
+ "epoch": 1.55,
753
+ "learning_rate": 6.908517350157729e-05,
754
+ "loss": 0.0379,
755
+ "step": 9800
756
+ },
757
+ {
758
+ "epoch": 1.56,
759
+ "learning_rate": 6.876971608832808e-05,
760
+ "loss": 0.0328,
761
+ "step": 9900
762
+ },
763
+ {
764
+ "epoch": 1.58,
765
+ "learning_rate": 6.845425867507887e-05,
766
+ "loss": 0.0315,
767
+ "step": 10000
768
+ },
769
+ {
770
+ "epoch": 1.58,
771
+ "eval_loss": 0.03636159375309944,
772
+ "eval_runtime": 192.0641,
773
+ "eval_samples_per_second": 113.165,
774
+ "eval_steps_per_second": 14.146,
775
+ "step": 10000
776
+ },
777
+ {
778
+ "epoch": 1.59,
779
+ "learning_rate": 6.813880126182965e-05,
780
+ "loss": 0.0328,
781
+ "step": 10100
782
+ },
783
+ {
784
+ "epoch": 1.61,
785
+ "learning_rate": 6.782334384858045e-05,
786
+ "loss": 0.0355,
787
+ "step": 10200
788
+ },
789
+ {
790
+ "epoch": 1.62,
791
+ "learning_rate": 6.750788643533124e-05,
792
+ "loss": 0.0398,
793
+ "step": 10300
794
+ },
795
+ {
796
+ "epoch": 1.64,
797
+ "learning_rate": 6.719242902208202e-05,
798
+ "loss": 0.0372,
799
+ "step": 10400
800
+ },
801
+ {
802
+ "epoch": 1.66,
803
+ "learning_rate": 6.68769716088328e-05,
804
+ "loss": 0.0399,
805
+ "step": 10500
806
+ },
807
+ {
808
+ "epoch": 1.66,
809
+ "eval_loss": 0.03677404299378395,
810
+ "eval_runtime": 191.6576,
811
+ "eval_samples_per_second": 113.405,
812
+ "eval_steps_per_second": 14.176,
813
+ "step": 10500
814
+ },
815
+ {
816
+ "epoch": 1.67,
817
+ "learning_rate": 6.65615141955836e-05,
818
+ "loss": 0.0394,
819
+ "step": 10600
820
+ },
821
+ {
822
+ "epoch": 1.69,
823
+ "learning_rate": 6.624605678233438e-05,
824
+ "loss": 0.0355,
825
+ "step": 10700
826
+ },
827
+ {
828
+ "epoch": 1.7,
829
+ "learning_rate": 6.593059936908518e-05,
830
+ "loss": 0.0326,
831
+ "step": 10800
832
+ },
833
+ {
834
+ "epoch": 1.72,
835
+ "learning_rate": 6.561514195583596e-05,
836
+ "loss": 0.0307,
837
+ "step": 10900
838
+ },
839
+ {
840
+ "epoch": 1.74,
841
+ "learning_rate": 6.529968454258676e-05,
842
+ "loss": 0.0372,
843
+ "step": 11000
844
+ },
845
+ {
846
+ "epoch": 1.74,
847
+ "eval_loss": 0.03506915271282196,
848
+ "eval_runtime": 192.4202,
849
+ "eval_samples_per_second": 112.956,
850
+ "eval_steps_per_second": 14.12,
851
+ "step": 11000
852
+ },
853
+ {
854
+ "epoch": 1.75,
855
+ "learning_rate": 6.498422712933754e-05,
856
+ "loss": 0.0321,
857
+ "step": 11100
858
+ },
859
+ {
860
+ "epoch": 1.77,
861
+ "learning_rate": 6.466876971608832e-05,
862
+ "loss": 0.0272,
863
+ "step": 11200
864
+ },
865
+ {
866
+ "epoch": 1.78,
867
+ "learning_rate": 6.435331230283912e-05,
868
+ "loss": 0.0322,
869
+ "step": 11300
870
+ },
871
+ {
872
+ "epoch": 1.8,
873
+ "learning_rate": 6.403785488958992e-05,
874
+ "loss": 0.0355,
875
+ "step": 11400
876
+ },
877
+ {
878
+ "epoch": 1.81,
879
+ "learning_rate": 6.37223974763407e-05,
880
+ "loss": 0.0324,
881
+ "step": 11500
882
+ },
883
+ {
884
+ "epoch": 1.81,
885
+ "eval_loss": 0.034638404846191406,
886
+ "eval_runtime": 192.5205,
887
+ "eval_samples_per_second": 112.897,
888
+ "eval_steps_per_second": 14.113,
889
+ "step": 11500
890
+ },
891
+ {
892
+ "epoch": 1.83,
893
+ "learning_rate": 6.34069400630915e-05,
894
+ "loss": 0.0362,
895
+ "step": 11600
896
+ },
897
+ {
898
+ "epoch": 1.85,
899
+ "learning_rate": 6.309148264984228e-05,
900
+ "loss": 0.03,
901
+ "step": 11700
902
+ },
903
+ {
904
+ "epoch": 1.86,
905
+ "learning_rate": 6.277602523659306e-05,
906
+ "loss": 0.0313,
907
+ "step": 11800
908
+ },
909
+ {
910
+ "epoch": 1.88,
911
+ "learning_rate": 6.246056782334385e-05,
912
+ "loss": 0.0348,
913
+ "step": 11900
914
+ },
915
+ {
916
+ "epoch": 1.89,
917
+ "learning_rate": 6.214511041009464e-05,
918
+ "loss": 0.0351,
919
+ "step": 12000
920
+ },
921
+ {
922
+ "epoch": 1.89,
923
+ "eval_loss": 0.03646688908338547,
924
+ "eval_runtime": 192.2872,
925
+ "eval_samples_per_second": 113.034,
926
+ "eval_steps_per_second": 14.13,
927
+ "step": 12000
928
+ }
929
+ ],
930
+ "logging_steps": 100,
931
+ "max_steps": 31700,
932
+ "num_input_tokens_seen": 0,
933
+ "num_train_epochs": 5,
934
+ "save_steps": 500,
935
+ "total_flos": 4226972988026880.0,
936
+ "train_batch_size": 8,
937
+ "trial_name": null,
938
+ "trial_params": null
939
+ }
backup_checkpoint-12000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:266031b962f594171a699a315d204ad9c02913dc636256066f9f3f3625b85451
3
+ size 4664
backup_checkpoint/config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/bart-base",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "gelu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.1,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_ffn_dim": 3072,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 12,
23
+ "encoder_ffn_dim": 3072,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 2,
27
+ "forced_bos_token_id": 0,
28
+ "forced_eos_token_id": 2,
29
+ "gradient_checkpointing": false,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1",
33
+ "2": "LABEL_2"
34
+ },
35
+ "init_std": 0.02,
36
+ "is_encoder_decoder": true,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1,
40
+ "LABEL_2": 2
41
+ },
42
+ "max_position_embeddings": 1024,
43
+ "model_type": "bart",
44
+ "no_repeat_ngram_size": 3,
45
+ "normalize_before": false,
46
+ "normalize_embedding": true,
47
+ "num_beams": 4,
48
+ "num_hidden_layers": 6,
49
+ "pad_token_id": 1,
50
+ "scale_embedding": false,
51
+ "task_specific_params": {
52
+ "summarization": {
53
+ "length_penalty": 1.0,
54
+ "max_length": 128,
55
+ "min_length": 12,
56
+ "num_beams": 4
57
+ },
58
+ "summarization_cnn": {
59
+ "length_penalty": 2.0,
60
+ "max_length": 142,
61
+ "min_length": 56,
62
+ "num_beams": 4
63
+ },
64
+ "summarization_xsum": {
65
+ "length_penalty": 1.0,
66
+ "max_length": 62,
67
+ "min_length": 11,
68
+ "num_beams": 6
69
+ }
70
+ },
71
+ "torch_dtype": "float32",
72
+ "transformers_version": "4.37.0.dev0",
73
+ "use_cache": true,
74
+ "vocab_size": 50265
75
+ }
backup_checkpoint/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "early_stopping": true,
6
+ "eos_token_id": 2,
7
+ "forced_bos_token_id": 0,
8
+ "forced_eos_token_id": 2,
9
+ "no_repeat_ngram_size": 3,
10
+ "num_beams": 4,
11
+ "pad_token_id": 1,
12
+ "transformers_version": "4.37.0.dev0"
13
+ }
backup_checkpoint/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acc561d99f17d9573b5d2c97086c82ccd1e57e20c82f54bfb6c055c5dae1a3cb
3
+ size 557912620
backup_checkpoint/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0d53e4853b78131aa2fd3dba9e24dcfc369ab8bb95af928a52af89426571012
3
+ size 1115579898
backup_checkpoint/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a73c6f1e1eb77013e5ab142cf4e8d03715ff991905e41d762946a7eb2154ef3f
3
+ size 14244
backup_checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51c6bfad6b9ed7f7b64c27c27fdb69168a5257375021c87f904bde03904fc246
3
+ size 1064
backup_checkpoint/trainer_state.json ADDED
@@ -0,0 +1,749 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.498422712933754,
5
+ "eval_steps": 500,
6
+ "global_step": 9500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 9.999684542586752e-05,
14
+ "loss": 12.4787,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.02,
19
+ "learning_rate": 9.96845425867508e-05,
20
+ "loss": 1.3146,
21
+ "step": 100
22
+ },
23
+ {
24
+ "epoch": 0.03,
25
+ "learning_rate": 9.936908517350158e-05,
26
+ "loss": 0.0928,
27
+ "step": 200
28
+ },
29
+ {
30
+ "epoch": 0.05,
31
+ "learning_rate": 9.905362776025237e-05,
32
+ "loss": 0.0903,
33
+ "step": 300
34
+ },
35
+ {
36
+ "epoch": 0.06,
37
+ "learning_rate": 9.873817034700316e-05,
38
+ "loss": 0.0675,
39
+ "step": 400
40
+ },
41
+ {
42
+ "epoch": 0.08,
43
+ "learning_rate": 9.842271293375394e-05,
44
+ "loss": 0.0744,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.08,
49
+ "eval_loss": 0.06298290193080902,
50
+ "eval_runtime": 191.3725,
51
+ "eval_samples_per_second": 113.574,
52
+ "eval_steps_per_second": 14.197,
53
+ "step": 500
54
+ },
55
+ {
56
+ "epoch": 0.09,
57
+ "learning_rate": 9.810725552050474e-05,
58
+ "loss": 0.0682,
59
+ "step": 600
60
+ },
61
+ {
62
+ "epoch": 0.11,
63
+ "learning_rate": 9.779179810725552e-05,
64
+ "loss": 0.072,
65
+ "step": 700
66
+ },
67
+ {
68
+ "epoch": 0.13,
69
+ "learning_rate": 9.747634069400632e-05,
70
+ "loss": 0.0731,
71
+ "step": 800
72
+ },
73
+ {
74
+ "epoch": 0.14,
75
+ "learning_rate": 9.71608832807571e-05,
76
+ "loss": 0.0638,
77
+ "step": 900
78
+ },
79
+ {
80
+ "epoch": 0.16,
81
+ "learning_rate": 9.684542586750788e-05,
82
+ "loss": 0.062,
83
+ "step": 1000
84
+ },
85
+ {
86
+ "epoch": 0.16,
87
+ "eval_loss": 0.056409742683172226,
88
+ "eval_runtime": 191.8919,
89
+ "eval_samples_per_second": 113.267,
90
+ "eval_steps_per_second": 14.159,
91
+ "step": 1000
92
+ },
93
+ {
94
+ "epoch": 0.17,
95
+ "learning_rate": 9.652996845425868e-05,
96
+ "loss": 0.0663,
97
+ "step": 1100
98
+ },
99
+ {
100
+ "epoch": 0.19,
101
+ "learning_rate": 9.621451104100947e-05,
102
+ "loss": 0.0598,
103
+ "step": 1200
104
+ },
105
+ {
106
+ "epoch": 0.21,
107
+ "learning_rate": 9.589905362776026e-05,
108
+ "loss": 0.0625,
109
+ "step": 1300
110
+ },
111
+ {
112
+ "epoch": 0.22,
113
+ "learning_rate": 9.558359621451105e-05,
114
+ "loss": 0.0673,
115
+ "step": 1400
116
+ },
117
+ {
118
+ "epoch": 0.24,
119
+ "learning_rate": 9.526813880126184e-05,
120
+ "loss": 0.0625,
121
+ "step": 1500
122
+ },
123
+ {
124
+ "epoch": 0.24,
125
+ "eval_loss": 0.05498537793755531,
126
+ "eval_runtime": 191.898,
127
+ "eval_samples_per_second": 113.263,
128
+ "eval_steps_per_second": 14.159,
129
+ "step": 1500
130
+ },
131
+ {
132
+ "epoch": 0.25,
133
+ "learning_rate": 9.495268138801262e-05,
134
+ "loss": 0.0606,
135
+ "step": 1600
136
+ },
137
+ {
138
+ "epoch": 0.27,
139
+ "learning_rate": 9.463722397476341e-05,
140
+ "loss": 0.0684,
141
+ "step": 1700
142
+ },
143
+ {
144
+ "epoch": 0.28,
145
+ "learning_rate": 9.43217665615142e-05,
146
+ "loss": 0.0607,
147
+ "step": 1800
148
+ },
149
+ {
150
+ "epoch": 0.3,
151
+ "learning_rate": 9.400630914826499e-05,
152
+ "loss": 0.0679,
153
+ "step": 1900
154
+ },
155
+ {
156
+ "epoch": 0.32,
157
+ "learning_rate": 9.369085173501577e-05,
158
+ "loss": 0.0633,
159
+ "step": 2000
160
+ },
161
+ {
162
+ "epoch": 0.32,
163
+ "eval_loss": 0.052233804017305374,
164
+ "eval_runtime": 191.8762,
165
+ "eval_samples_per_second": 113.276,
166
+ "eval_steps_per_second": 14.16,
167
+ "step": 2000
168
+ },
169
+ {
170
+ "epoch": 0.33,
171
+ "learning_rate": 9.337539432176656e-05,
172
+ "loss": 0.061,
173
+ "step": 2100
174
+ },
175
+ {
176
+ "epoch": 0.35,
177
+ "learning_rate": 9.305993690851735e-05,
178
+ "loss": 0.0715,
179
+ "step": 2200
180
+ },
181
+ {
182
+ "epoch": 0.36,
183
+ "learning_rate": 9.274447949526815e-05,
184
+ "loss": 0.0586,
185
+ "step": 2300
186
+ },
187
+ {
188
+ "epoch": 0.38,
189
+ "learning_rate": 9.242902208201893e-05,
190
+ "loss": 0.0581,
191
+ "step": 2400
192
+ },
193
+ {
194
+ "epoch": 0.39,
195
+ "learning_rate": 9.211356466876973e-05,
196
+ "loss": 0.0588,
197
+ "step": 2500
198
+ },
199
+ {
200
+ "epoch": 0.39,
201
+ "eval_loss": 0.05040860176086426,
202
+ "eval_runtime": 191.6049,
203
+ "eval_samples_per_second": 113.437,
204
+ "eval_steps_per_second": 14.18,
205
+ "step": 2500
206
+ },
207
+ {
208
+ "epoch": 0.41,
209
+ "learning_rate": 9.179810725552051e-05,
210
+ "loss": 0.0572,
211
+ "step": 2600
212
+ },
213
+ {
214
+ "epoch": 0.43,
215
+ "learning_rate": 9.148264984227129e-05,
216
+ "loss": 0.0555,
217
+ "step": 2700
218
+ },
219
+ {
220
+ "epoch": 0.44,
221
+ "learning_rate": 9.116719242902209e-05,
222
+ "loss": 0.0547,
223
+ "step": 2800
224
+ },
225
+ {
226
+ "epoch": 0.46,
227
+ "learning_rate": 9.085173501577287e-05,
228
+ "loss": 0.0596,
229
+ "step": 2900
230
+ },
231
+ {
232
+ "epoch": 0.47,
233
+ "learning_rate": 9.053627760252367e-05,
234
+ "loss": 0.0593,
235
+ "step": 3000
236
+ },
237
+ {
238
+ "epoch": 0.47,
239
+ "eval_loss": 0.04637610912322998,
240
+ "eval_runtime": 192.1829,
241
+ "eval_samples_per_second": 113.095,
242
+ "eval_steps_per_second": 14.138,
243
+ "step": 3000
244
+ },
245
+ {
246
+ "epoch": 0.49,
247
+ "learning_rate": 9.022082018927446e-05,
248
+ "loss": 0.0536,
249
+ "step": 3100
250
+ },
251
+ {
252
+ "epoch": 0.5,
253
+ "learning_rate": 8.990536277602523e-05,
254
+ "loss": 0.0506,
255
+ "step": 3200
256
+ },
257
+ {
258
+ "epoch": 0.52,
259
+ "learning_rate": 8.958990536277603e-05,
260
+ "loss": 0.0601,
261
+ "step": 3300
262
+ },
263
+ {
264
+ "epoch": 0.54,
265
+ "learning_rate": 8.927444794952682e-05,
266
+ "loss": 0.05,
267
+ "step": 3400
268
+ },
269
+ {
270
+ "epoch": 0.55,
271
+ "learning_rate": 8.89589905362776e-05,
272
+ "loss": 0.0492,
273
+ "step": 3500
274
+ },
275
+ {
276
+ "epoch": 0.55,
277
+ "eval_loss": 0.0455920547246933,
278
+ "eval_runtime": 192.0796,
279
+ "eval_samples_per_second": 113.156,
280
+ "eval_steps_per_second": 14.145,
281
+ "step": 3500
282
+ },
283
+ {
284
+ "epoch": 0.57,
285
+ "learning_rate": 8.86435331230284e-05,
286
+ "loss": 0.0547,
287
+ "step": 3600
288
+ },
289
+ {
290
+ "epoch": 0.58,
291
+ "learning_rate": 8.832807570977918e-05,
292
+ "loss": 0.0582,
293
+ "step": 3700
294
+ },
295
+ {
296
+ "epoch": 0.6,
297
+ "learning_rate": 8.801261829652997e-05,
298
+ "loss": 0.0535,
299
+ "step": 3800
300
+ },
301
+ {
302
+ "epoch": 0.62,
303
+ "learning_rate": 8.769716088328076e-05,
304
+ "loss": 0.0554,
305
+ "step": 3900
306
+ },
307
+ {
308
+ "epoch": 0.63,
309
+ "learning_rate": 8.738170347003154e-05,
310
+ "loss": 0.0506,
311
+ "step": 4000
312
+ },
313
+ {
314
+ "epoch": 0.63,
315
+ "eval_loss": 0.04419832304120064,
316
+ "eval_runtime": 192.193,
317
+ "eval_samples_per_second": 113.089,
318
+ "eval_steps_per_second": 14.137,
319
+ "step": 4000
320
+ },
321
+ {
322
+ "epoch": 0.65,
323
+ "learning_rate": 8.706624605678234e-05,
324
+ "loss": 0.0508,
325
+ "step": 4100
326
+ },
327
+ {
328
+ "epoch": 0.66,
329
+ "learning_rate": 8.675078864353314e-05,
330
+ "loss": 0.0566,
331
+ "step": 4200
332
+ },
333
+ {
334
+ "epoch": 0.68,
335
+ "learning_rate": 8.64353312302839e-05,
336
+ "loss": 0.0556,
337
+ "step": 4300
338
+ },
339
+ {
340
+ "epoch": 0.69,
341
+ "learning_rate": 8.61198738170347e-05,
342
+ "loss": 0.0595,
343
+ "step": 4400
344
+ },
345
+ {
346
+ "epoch": 0.71,
347
+ "learning_rate": 8.58044164037855e-05,
348
+ "loss": 0.0584,
349
+ "step": 4500
350
+ },
351
+ {
352
+ "epoch": 0.71,
353
+ "eval_loss": 0.043363627046346664,
354
+ "eval_runtime": 191.6803,
355
+ "eval_samples_per_second": 113.392,
356
+ "eval_steps_per_second": 14.175,
357
+ "step": 4500
358
+ },
359
+ {
360
+ "epoch": 0.73,
361
+ "learning_rate": 8.548895899053628e-05,
362
+ "loss": 0.0546,
363
+ "step": 4600
364
+ },
365
+ {
366
+ "epoch": 0.74,
367
+ "learning_rate": 8.517350157728708e-05,
368
+ "loss": 0.0457,
369
+ "step": 4700
370
+ },
371
+ {
372
+ "epoch": 0.76,
373
+ "learning_rate": 8.485804416403787e-05,
374
+ "loss": 0.0531,
375
+ "step": 4800
376
+ },
377
+ {
378
+ "epoch": 0.77,
379
+ "learning_rate": 8.454258675078864e-05,
380
+ "loss": 0.0506,
381
+ "step": 4900
382
+ },
383
+ {
384
+ "epoch": 0.79,
385
+ "learning_rate": 8.422712933753944e-05,
386
+ "loss": 0.0455,
387
+ "step": 5000
388
+ },
389
+ {
390
+ "epoch": 0.79,
391
+ "eval_loss": 0.04175921157002449,
392
+ "eval_runtime": 192.0012,
393
+ "eval_samples_per_second": 113.202,
394
+ "eval_steps_per_second": 14.151,
395
+ "step": 5000
396
+ },
397
+ {
398
+ "epoch": 0.8,
399
+ "learning_rate": 8.391167192429022e-05,
400
+ "loss": 0.053,
401
+ "step": 5100
402
+ },
403
+ {
404
+ "epoch": 0.82,
405
+ "learning_rate": 8.359621451104101e-05,
406
+ "loss": 0.0499,
407
+ "step": 5200
408
+ },
409
+ {
410
+ "epoch": 0.84,
411
+ "learning_rate": 8.328075709779181e-05,
412
+ "loss": 0.0407,
413
+ "step": 5300
414
+ },
415
+ {
416
+ "epoch": 0.85,
417
+ "learning_rate": 8.296529968454258e-05,
418
+ "loss": 0.0559,
419
+ "step": 5400
420
+ },
421
+ {
422
+ "epoch": 0.87,
423
+ "learning_rate": 8.264984227129337e-05,
424
+ "loss": 0.0477,
425
+ "step": 5500
426
+ },
427
+ {
428
+ "epoch": 0.87,
429
+ "eval_loss": 0.042047169059515,
430
+ "eval_runtime": 192.1,
431
+ "eval_samples_per_second": 113.144,
432
+ "eval_steps_per_second": 14.144,
433
+ "step": 5500
434
+ },
435
+ {
436
+ "epoch": 0.88,
437
+ "learning_rate": 8.233438485804417e-05,
438
+ "loss": 0.0488,
439
+ "step": 5600
440
+ },
441
+ {
442
+ "epoch": 0.9,
443
+ "learning_rate": 8.201892744479495e-05,
444
+ "loss": 0.0537,
445
+ "step": 5700
446
+ },
447
+ {
448
+ "epoch": 0.91,
449
+ "learning_rate": 8.170347003154575e-05,
450
+ "loss": 0.0515,
451
+ "step": 5800
452
+ },
453
+ {
454
+ "epoch": 0.93,
455
+ "learning_rate": 8.138801261829655e-05,
456
+ "loss": 0.0511,
457
+ "step": 5900
458
+ },
459
+ {
460
+ "epoch": 0.95,
461
+ "learning_rate": 8.107255520504731e-05,
462
+ "loss": 0.0548,
463
+ "step": 6000
464
+ },
465
+ {
466
+ "epoch": 0.95,
467
+ "eval_loss": 0.039724551141262054,
468
+ "eval_runtime": 191.9769,
469
+ "eval_samples_per_second": 113.217,
470
+ "eval_steps_per_second": 14.153,
471
+ "step": 6000
472
+ },
473
+ {
474
+ "epoch": 0.96,
475
+ "learning_rate": 8.075709779179811e-05,
476
+ "loss": 0.0482,
477
+ "step": 6100
478
+ },
479
+ {
480
+ "epoch": 0.98,
481
+ "learning_rate": 8.04416403785489e-05,
482
+ "loss": 0.0474,
483
+ "step": 6200
484
+ },
485
+ {
486
+ "epoch": 0.99,
487
+ "learning_rate": 8.012618296529969e-05,
488
+ "loss": 0.0488,
489
+ "step": 6300
490
+ },
491
+ {
492
+ "epoch": 1.01,
493
+ "learning_rate": 7.981072555205048e-05,
494
+ "loss": 0.0346,
495
+ "step": 6400
496
+ },
497
+ {
498
+ "epoch": 1.03,
499
+ "learning_rate": 7.949526813880127e-05,
500
+ "loss": 0.035,
501
+ "step": 6500
502
+ },
503
+ {
504
+ "epoch": 1.03,
505
+ "eval_loss": 0.039325978606939316,
506
+ "eval_runtime": 191.6608,
507
+ "eval_samples_per_second": 113.403,
508
+ "eval_steps_per_second": 14.176,
509
+ "step": 6500
510
+ },
511
+ {
512
+ "epoch": 1.04,
513
+ "learning_rate": 7.917981072555205e-05,
514
+ "loss": 0.0355,
515
+ "step": 6600
516
+ },
517
+ {
518
+ "epoch": 1.06,
519
+ "learning_rate": 7.886435331230284e-05,
520
+ "loss": 0.0363,
521
+ "step": 6700
522
+ },
523
+ {
524
+ "epoch": 1.07,
525
+ "learning_rate": 7.854889589905363e-05,
526
+ "loss": 0.0407,
527
+ "step": 6800
528
+ },
529
+ {
530
+ "epoch": 1.09,
531
+ "learning_rate": 7.823343848580442e-05,
532
+ "loss": 0.0385,
533
+ "step": 6900
534
+ },
535
+ {
536
+ "epoch": 1.1,
537
+ "learning_rate": 7.791798107255522e-05,
538
+ "loss": 0.0347,
539
+ "step": 7000
540
+ },
541
+ {
542
+ "epoch": 1.1,
543
+ "eval_loss": 0.03924637660384178,
544
+ "eval_runtime": 192.1827,
545
+ "eval_samples_per_second": 113.096,
546
+ "eval_steps_per_second": 14.138,
547
+ "step": 7000
548
+ },
549
+ {
550
+ "epoch": 1.12,
551
+ "learning_rate": 7.760252365930599e-05,
552
+ "loss": 0.0357,
553
+ "step": 7100
554
+ },
555
+ {
556
+ "epoch": 1.14,
557
+ "learning_rate": 7.728706624605678e-05,
558
+ "loss": 0.041,
559
+ "step": 7200
560
+ },
561
+ {
562
+ "epoch": 1.15,
563
+ "learning_rate": 7.697160883280758e-05,
564
+ "loss": 0.0313,
565
+ "step": 7300
566
+ },
567
+ {
568
+ "epoch": 1.17,
569
+ "learning_rate": 7.665615141955836e-05,
570
+ "loss": 0.0336,
571
+ "step": 7400
572
+ },
573
+ {
574
+ "epoch": 1.18,
575
+ "learning_rate": 7.634069400630916e-05,
576
+ "loss": 0.0303,
577
+ "step": 7500
578
+ },
579
+ {
580
+ "epoch": 1.18,
581
+ "eval_loss": 0.03979608044028282,
582
+ "eval_runtime": 192.103,
583
+ "eval_samples_per_second": 113.142,
584
+ "eval_steps_per_second": 14.143,
585
+ "step": 7500
586
+ },
587
+ {
588
+ "epoch": 1.2,
589
+ "learning_rate": 7.602523659305994e-05,
590
+ "loss": 0.0356,
591
+ "step": 7600
592
+ },
593
+ {
594
+ "epoch": 1.21,
595
+ "learning_rate": 7.570977917981072e-05,
596
+ "loss": 0.0379,
597
+ "step": 7700
598
+ },
599
+ {
600
+ "epoch": 1.23,
601
+ "learning_rate": 7.539432176656152e-05,
602
+ "loss": 0.0365,
603
+ "step": 7800
604
+ },
605
+ {
606
+ "epoch": 1.25,
607
+ "learning_rate": 7.50788643533123e-05,
608
+ "loss": 0.0393,
609
+ "step": 7900
610
+ },
611
+ {
612
+ "epoch": 1.26,
613
+ "learning_rate": 7.47634069400631e-05,
614
+ "loss": 0.0374,
615
+ "step": 8000
616
+ },
617
+ {
618
+ "epoch": 1.26,
619
+ "eval_loss": 0.038463614881038666,
620
+ "eval_runtime": 192.1601,
621
+ "eval_samples_per_second": 113.109,
622
+ "eval_steps_per_second": 14.139,
623
+ "step": 8000
624
+ },
625
+ {
626
+ "epoch": 1.28,
627
+ "learning_rate": 7.444794952681389e-05,
628
+ "loss": 0.0303,
629
+ "step": 8100
630
+ },
631
+ {
632
+ "epoch": 1.29,
633
+ "learning_rate": 7.413249211356468e-05,
634
+ "loss": 0.0384,
635
+ "step": 8200
636
+ },
637
+ {
638
+ "epoch": 1.31,
639
+ "learning_rate": 7.381703470031546e-05,
640
+ "loss": 0.0383,
641
+ "step": 8300
642
+ },
643
+ {
644
+ "epoch": 1.32,
645
+ "learning_rate": 7.350157728706625e-05,
646
+ "loss": 0.0378,
647
+ "step": 8400
648
+ },
649
+ {
650
+ "epoch": 1.34,
651
+ "learning_rate": 7.318611987381704e-05,
652
+ "loss": 0.0343,
653
+ "step": 8500
654
+ },
655
+ {
656
+ "epoch": 1.34,
657
+ "eval_loss": 0.03961439058184624,
658
+ "eval_runtime": 192.1114,
659
+ "eval_samples_per_second": 113.137,
660
+ "eval_steps_per_second": 14.143,
661
+ "step": 8500
662
+ },
663
+ {
664
+ "epoch": 1.36,
665
+ "learning_rate": 7.287066246056783e-05,
666
+ "loss": 0.0362,
667
+ "step": 8600
668
+ },
669
+ {
670
+ "epoch": 1.37,
671
+ "learning_rate": 7.255520504731861e-05,
672
+ "loss": 0.0435,
673
+ "step": 8700
674
+ },
675
+ {
676
+ "epoch": 1.39,
677
+ "learning_rate": 7.22397476340694e-05,
678
+ "loss": 0.0382,
679
+ "step": 8800
680
+ },
681
+ {
682
+ "epoch": 1.4,
683
+ "learning_rate": 7.192429022082019e-05,
684
+ "loss": 0.0439,
685
+ "step": 8900
686
+ },
687
+ {
688
+ "epoch": 1.42,
689
+ "learning_rate": 7.160883280757098e-05,
690
+ "loss": 0.0374,
691
+ "step": 9000
692
+ },
693
+ {
694
+ "epoch": 1.42,
695
+ "eval_loss": 0.037410151213407516,
696
+ "eval_runtime": 192.1572,
697
+ "eval_samples_per_second": 113.111,
698
+ "eval_steps_per_second": 14.139,
699
+ "step": 9000
700
+ },
701
+ {
702
+ "epoch": 1.44,
703
+ "learning_rate": 7.129337539432177e-05,
704
+ "loss": 0.0342,
705
+ "step": 9100
706
+ },
707
+ {
708
+ "epoch": 1.45,
709
+ "learning_rate": 7.097791798107257e-05,
710
+ "loss": 0.0337,
711
+ "step": 9200
712
+ },
713
+ {
714
+ "epoch": 1.47,
715
+ "learning_rate": 7.066246056782335e-05,
716
+ "loss": 0.0358,
717
+ "step": 9300
718
+ },
719
+ {
720
+ "epoch": 1.48,
721
+ "learning_rate": 7.034700315457413e-05,
722
+ "loss": 0.0301,
723
+ "step": 9400
724
+ },
725
+ {
726
+ "epoch": 1.5,
727
+ "learning_rate": 7.003154574132493e-05,
728
+ "loss": 0.0362,
729
+ "step": 9500
730
+ },
731
+ {
732
+ "epoch": 1.5,
733
+ "eval_loss": 0.03637044504284859,
734
+ "eval_runtime": 192.1621,
735
+ "eval_samples_per_second": 113.108,
736
+ "eval_steps_per_second": 14.139,
737
+ "step": 9500
738
+ }
739
+ ],
740
+ "logging_steps": 100,
741
+ "max_steps": 31700,
742
+ "num_input_tokens_seen": 0,
743
+ "num_train_epochs": 5,
744
+ "save_steps": 500,
745
+ "total_flos": 3348871507537920.0,
746
+ "train_batch_size": 8,
747
+ "trial_name": null,
748
+ "trial_params": null
749
+ }
backup_checkpoint/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:266031b962f594171a699a315d204ad9c02913dc636256066f9f3f3625b85451
3
+ size 4664
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "facebook/bart-base",
3
  "activation_dropout": 0.1,
4
  "activation_function": "gelu",
5
  "add_bias_logits": false,
 
1
  {
2
+ "_name_or_path": "./bart_test_p2/backup_checkpoint-12000",
3
  "activation_dropout": 0.1,
4
  "activation_function": "gelu",
5
  "add_bias_logits": false,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63a52d6d98a3375c580bc5ffac8d45a114f4a730d1feeecf10b875a6bc899bcf
3
  size 557912620
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd2d4cc6d9060405892bbe3e531d4520c68bd9a3a2fbf019f9e89cb820504c2f
3
  size 557912620
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d64fe19098eb8c4aceccd9b112bb8997186b2065d41f670050189a9a7aeb2b2b
3
  size 4664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da19c72832637aa9b262dfd4f56dcc1ff2faa4b4d65254eee1a7a257ba33f327
3
  size 4664