vishalkatheriya18 committed on
Commit
69f744d
1 Parent(s): eddb68a

End of training

Browse files
README.md ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: facebook/convnextv2-tiny-1k-224
4
+ tags:
5
+ - generated_from_trainer
6
+ datasets:
7
+ - imagefolder
8
+ metrics:
9
+ - accuracy
10
+ - precision
11
+ - recall
12
+ model-index:
13
+ - name: convnextv2-tiny-1k-224-finetuned-neck-style
14
+ results:
15
+ - task:
16
+ name: Image Classification
17
+ type: image-classification
18
+ dataset:
19
+ name: imagefolder
20
+ type: imagefolder
21
+ config: default
22
+ split: train
23
+ args: default
24
+ metrics:
25
+ - name: Accuracy
26
+ type: accuracy
27
+ value: 0.8492753623188406
28
+ - name: Precision
29
+ type: precision
30
+ value: 0.8507158478342087
31
+ - name: Recall
32
+ type: recall
33
+ value: 0.8492753623188406
34
+ ---
35
+
36
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
37
+ should probably proofread and complete it, then remove this comment. -->
38
+
39
+ # convnextv2-tiny-1k-224-finetuned-neck-style
40
+
41
+ This model is a fine-tuned version of [facebook/convnextv2-tiny-1k-224](https://huggingface.co/facebook/convnextv2-tiny-1k-224) on the imagefolder dataset.
42
+ It achieves the following results on the evaluation set:
43
+ - Loss: 0.6084
44
+ - Accuracy: 0.8493
45
+ - Precision: 0.8507
46
+ - Recall: 0.8493
47
+
48
+ ## Model description
49
+
50
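+ A ConvNeXt V2 (tiny) image classifier (`ConvNextV2ForImageClassification`, 224×224 input) fine-tuned from `facebook/convnextv2-tiny-1k-224` to recognise five garment neck styles: mandarin neck, notch neck, round neck, shirt collar, and v neck. More information needed.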
51
+
52
+ ## Intended uses & limitations
53
+
54
+ More information needed
55
+
56
+ ## Training and evaluation data
57
+
58
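+ The data was loaded with the `imagefolder` dataset loader and labelled with the five neck-style classes listed in `config.json`. The evaluation set appears to contain about 345 images (evaluation throughput × runtime reported in `eval_results.json`). More information needed on the data source and how the train/evaluation split was made.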
59
+
60
+ ## Training procedure
61
+
62
+ ### Training hyperparameters
63
+
64
+ The following hyperparameters were used during training:
65
+ - learning_rate: 5e-05
66
+ - train_batch_size: 32
67
+ - eval_batch_size: 32
68
+ - seed: 42
69
+ - gradient_accumulation_steps: 4
70
+ - total_train_batch_size: 128
71
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
72
+ - lr_scheduler_type: linear
73
+ - lr_scheduler_warmup_ratio: 0.1
74
+ - num_epochs: 100
75
+
76
+ ### Training results
77
+
78
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy | Precision | Recall |
79
+ |:-------------:|:-------:|:----:|:---------------:|:--------:|:---------:|:------:|
80
+ | 1.613 | 0.9897 | 24 | 1.5833 | 0.2928 | 0.3248 | 0.2928 |
81
+ | 1.5494 | 1.9794 | 48 | 1.4944 | 0.3681 | 0.4410 | 0.3681 |
82
+ | 1.3989 | 2.9691 | 72 | 1.3424 | 0.5159 | 0.5262 | 0.5159 |
83
+ | 1.2238 | 4.0 | 97 | 1.1162 | 0.6261 | 0.6666 | 0.6261 |
84
+ | 0.9585 | 4.9897 | 121 | 0.8966 | 0.6986 | 0.7014 | 0.6986 |
85
+ | 0.8934 | 5.9794 | 145 | 0.7638 | 0.7507 | 0.7490 | 0.7507 |
86
+ | 0.7589 | 6.9691 | 169 | 0.6776 | 0.7652 | 0.7719 | 0.7652 |
87
+ | 0.6746 | 8.0 | 194 | 0.6127 | 0.7623 | 0.7628 | 0.7623 |
88
+ | 0.6048 | 8.9897 | 218 | 0.5221 | 0.8203 | 0.8217 | 0.8203 |
89
+ | 0.531 | 9.9794 | 242 | 0.4931 | 0.8116 | 0.8204 | 0.8116 |
90
+ | 0.57 | 10.9691 | 266 | 0.4480 | 0.8319 | 0.8345 | 0.8319 |
91
+ | 0.4624 | 12.0 | 291 | 0.4214 | 0.8464 | 0.8460 | 0.8464 |
92
+ | 0.417 | 12.9897 | 315 | 0.4439 | 0.8493 | 0.8486 | 0.8493 |
93
+ | 0.3814 | 13.9794 | 339 | 0.4138 | 0.8464 | 0.8478 | 0.8464 |
94
+ | 0.3737 | 14.9691 | 363 | 0.4139 | 0.8464 | 0.8466 | 0.8464 |
95
+ | 0.3971 | 16.0 | 388 | 0.4119 | 0.8638 | 0.8665 | 0.8638 |
96
+ | 0.343 | 16.9897 | 412 | 0.4421 | 0.8609 | 0.8659 | 0.8609 |
97
+ | 0.3311 | 17.9794 | 436 | 0.4581 | 0.8493 | 0.8504 | 0.8493 |
98
+ | 0.2652 | 18.9691 | 460 | 0.4563 | 0.8406 | 0.8441 | 0.8406 |
99
+ | 0.3026 | 20.0 | 485 | 0.4536 | 0.8522 | 0.8549 | 0.8522 |
100
+ | 0.2562 | 20.9897 | 509 | 0.4409 | 0.8464 | 0.8493 | 0.8464 |
101
+ | 0.2282 | 21.9794 | 533 | 0.4389 | 0.8435 | 0.8451 | 0.8435 |
102
+ | 0.2374 | 22.9691 | 557 | 0.4452 | 0.8580 | 0.8589 | 0.8580 |
103
+ | 0.216 | 24.0 | 582 | 0.4375 | 0.8580 | 0.8581 | 0.8580 |
104
+ | 0.2127 | 24.9897 | 606 | 0.4422 | 0.8580 | 0.8588 | 0.8580 |
105
+ | 0.2004 | 25.9794 | 630 | 0.4635 | 0.8522 | 0.8519 | 0.8522 |
106
+ | 0.2029 | 26.9691 | 654 | 0.5215 | 0.8493 | 0.8546 | 0.8493 |
107
+ | 0.1794 | 28.0 | 679 | 0.4756 | 0.8638 | 0.8669 | 0.8638 |
108
+ | 0.1835 | 28.9897 | 703 | 0.4728 | 0.8609 | 0.8650 | 0.8609 |
109
+ | 0.1781 | 29.9794 | 727 | 0.4637 | 0.8551 | 0.8568 | 0.8551 |
110
+ | 0.1671 | 30.9691 | 751 | 0.4856 | 0.8580 | 0.8599 | 0.8580 |
111
+ | 0.1762 | 32.0 | 776 | 0.5008 | 0.8667 | 0.8684 | 0.8667 |
112
+ | 0.1867 | 32.9897 | 800 | 0.5058 | 0.8580 | 0.8585 | 0.8580 |
113
+ | 0.1409 | 33.9794 | 824 | 0.5490 | 0.8406 | 0.8409 | 0.8406 |
114
+ | 0.1315 | 34.9691 | 848 | 0.5284 | 0.8348 | 0.8356 | 0.8348 |
115
+ | 0.1315 | 36.0 | 873 | 0.5415 | 0.8464 | 0.8488 | 0.8464 |
116
+ | 0.1974 | 36.9897 | 897 | 0.5194 | 0.8493 | 0.8536 | 0.8493 |
117
+ | 0.1337 | 37.9794 | 921 | 0.5088 | 0.8609 | 0.8603 | 0.8609 |
118
+ | 0.173 | 38.9691 | 945 | 0.4912 | 0.8667 | 0.8680 | 0.8667 |
119
+ | 0.1409 | 40.0 | 970 | 0.5223 | 0.8493 | 0.8502 | 0.8493 |
120
+ | 0.1379 | 40.9897 | 994 | 0.5204 | 0.8493 | 0.8487 | 0.8493 |
121
+ | 0.1437 | 41.9794 | 1018 | 0.5860 | 0.8522 | 0.8551 | 0.8522 |
122
+ | 0.1022 | 42.9691 | 1042 | 0.5461 | 0.8464 | 0.8492 | 0.8464 |
123
+ | 0.1181 | 44.0 | 1067 | 0.5411 | 0.8551 | 0.8566 | 0.8551 |
124
+ | 0.1212 | 44.9897 | 1091 | 0.5294 | 0.8580 | 0.8580 | 0.8580 |
125
+ | 0.1049 | 45.9794 | 1115 | 0.5667 | 0.8493 | 0.8492 | 0.8493 |
126
+ | 0.1132 | 46.9691 | 1139 | 0.5908 | 0.8464 | 0.8491 | 0.8464 |
127
+ | 0.1313 | 48.0 | 1164 | 0.5996 | 0.8522 | 0.8582 | 0.8522 |
128
+ | 0.1312 | 48.9897 | 1188 | 0.5430 | 0.8580 | 0.8607 | 0.8580 |
129
+ | 0.0996 | 49.9794 | 1212 | 0.5777 | 0.8522 | 0.8561 | 0.8522 |
130
+ | 0.1389 | 50.9691 | 1236 | 0.5758 | 0.8435 | 0.8486 | 0.8435 |
131
+ | 0.1079 | 52.0 | 1261 | 0.5540 | 0.8580 | 0.8611 | 0.8580 |
132
+ | 0.0972 | 52.9897 | 1285 | 0.5600 | 0.8551 | 0.8559 | 0.8551 |
133
+ | 0.0985 | 53.9794 | 1309 | 0.5392 | 0.8638 | 0.8656 | 0.8638 |
134
+ | 0.1112 | 54.9691 | 1333 | 0.5411 | 0.8638 | 0.8656 | 0.8638 |
135
+ | 0.1308 | 56.0 | 1358 | 0.5445 | 0.8638 | 0.8654 | 0.8638 |
136
+ | 0.1005 | 56.9897 | 1382 | 0.5554 | 0.8551 | 0.8551 | 0.8551 |
137
+ | 0.0871 | 57.9794 | 1406 | 0.5966 | 0.8406 | 0.8441 | 0.8406 |
138
+ | 0.1102 | 58.9691 | 1430 | 0.5807 | 0.8522 | 0.8543 | 0.8522 |
139
+ | 0.1028 | 60.0 | 1455 | 0.5654 | 0.8435 | 0.8491 | 0.8435 |
140
+ | 0.107 | 60.9897 | 1479 | 0.5779 | 0.8435 | 0.8461 | 0.8435 |
141
+ | 0.0848 | 61.9794 | 1503 | 0.5843 | 0.8551 | 0.8569 | 0.8551 |
142
+ | 0.0976 | 62.9691 | 1527 | 0.6162 | 0.8435 | 0.8454 | 0.8435 |
143
+ | 0.0977 | 64.0 | 1552 | 0.5822 | 0.8464 | 0.8469 | 0.8464 |
144
+ | 0.1256 | 64.9897 | 1576 | 0.5757 | 0.8493 | 0.8514 | 0.8493 |
145
+ | 0.0883 | 65.9794 | 1600 | 0.5716 | 0.8464 | 0.8467 | 0.8464 |
146
+ | 0.0808 | 66.9691 | 1624 | 0.5726 | 0.8551 | 0.8562 | 0.8551 |
147
+ | 0.1034 | 68.0 | 1649 | 0.5413 | 0.8551 | 0.8549 | 0.8551 |
148
+ | 0.0845 | 68.9897 | 1673 | 0.5826 | 0.8435 | 0.8477 | 0.8435 |
149
+ | 0.0916 | 69.9794 | 1697 | 0.5661 | 0.8522 | 0.8522 | 0.8522 |
150
+ | 0.0912 | 70.9691 | 1721 | 0.5771 | 0.8493 | 0.8498 | 0.8493 |
151
+ | 0.0863 | 72.0 | 1746 | 0.5769 | 0.8551 | 0.8550 | 0.8551 |
152
+ | 0.083 | 72.9897 | 1770 | 0.5860 | 0.8493 | 0.8486 | 0.8493 |
153
+ | 0.0839 | 73.9794 | 1794 | 0.5647 | 0.8551 | 0.8551 | 0.8551 |
154
+ | 0.0903 | 74.9691 | 1818 | 0.6012 | 0.8551 | 0.8535 | 0.8551 |
155
+ | 0.074 | 76.0 | 1843 | 0.6048 | 0.8464 | 0.8461 | 0.8464 |
156
+ | 0.0907 | 76.9897 | 1867 | 0.5807 | 0.8493 | 0.8495 | 0.8493 |
157
+ | 0.0613 | 77.9794 | 1891 | 0.5775 | 0.8377 | 0.8382 | 0.8377 |
158
+ | 0.0964 | 78.9691 | 1915 | 0.5759 | 0.8667 | 0.8676 | 0.8667 |
159
+ | 0.0735 | 80.0 | 1940 | 0.5962 | 0.8551 | 0.8566 | 0.8551 |
160
+ | 0.0663 | 80.9897 | 1964 | 0.5769 | 0.8435 | 0.8441 | 0.8435 |
161
+ | 0.0719 | 81.9794 | 1988 | 0.5826 | 0.8493 | 0.8507 | 0.8493 |
162
+ | 0.0718 | 82.9691 | 2012 | 0.5880 | 0.8580 | 0.8590 | 0.8580 |
163
+ | 0.0925 | 84.0 | 2037 | 0.5986 | 0.8493 | 0.8513 | 0.8493 |
164
+ | 0.0621 | 84.9897 | 2061 | 0.5915 | 0.8493 | 0.8497 | 0.8493 |
165
+ | 0.059 | 85.9794 | 2085 | 0.5779 | 0.8580 | 0.8577 | 0.8580 |
166
+ | 0.0806 | 86.9691 | 2109 | 0.5928 | 0.8493 | 0.8501 | 0.8493 |
167
+ | 0.0617 | 88.0 | 2134 | 0.6062 | 0.8522 | 0.8520 | 0.8522 |
168
+ | 0.0651 | 88.9897 | 2158 | 0.6067 | 0.8522 | 0.8519 | 0.8522 |
169
+ | 0.0754 | 89.9794 | 2182 | 0.6108 | 0.8551 | 0.8553 | 0.8551 |
170
+ | 0.0682 | 90.9691 | 2206 | 0.6185 | 0.8493 | 0.8489 | 0.8493 |
171
+ | 0.0763 | 92.0 | 2231 | 0.6168 | 0.8580 | 0.8575 | 0.8580 |
172
+ | 0.0703 | 92.9897 | 2255 | 0.6259 | 0.8522 | 0.8521 | 0.8522 |
173
+ | 0.0861 | 93.9794 | 2279 | 0.6128 | 0.8551 | 0.8553 | 0.8551 |
174
+ | 0.0807 | 94.9691 | 2303 | 0.6140 | 0.8551 | 0.8547 | 0.8551 |
175
+ | 0.0621 | 96.0 | 2328 | 0.6133 | 0.8522 | 0.8532 | 0.8522 |
176
+ | 0.0831 | 96.9897 | 2352 | 0.6101 | 0.8493 | 0.8507 | 0.8493 |
177
+ | 0.0625 | 97.9794 | 2376 | 0.6097 | 0.8493 | 0.8507 | 0.8493 |
178
+ | 0.0571 | 98.9691 | 2400 | 0.6084 | 0.8493 | 0.8507 | 0.8493 |
179
+
180
+
181
+ ### Framework versions
182
+
183
+ - Transformers 4.44.0
184
+ - Pytorch 2.4.0
185
+ - Datasets 2.21.0
186
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 98.96907216494846,
3
+ "eval_accuracy": 0.8492753623188406,
4
+ "eval_loss": 0.6083797812461853,
5
+ "eval_precision": 0.8507158478342087,
6
+ "eval_recall": 0.8492753623188406,
7
+ "eval_runtime": 1.7897,
8
+ "eval_samples_per_second": 192.765,
9
+ "eval_steps_per_second": 6.146,
10
+ "total_flos": 7.732715563096474e+18,
11
+ "train_loss": 0.2344164727628231,
12
+ "train_runtime": 4723.8268,
13
+ "train_samples_per_second": 65.709,
14
+ "train_steps_per_second": 0.508
15
+ }
config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/convnextv2-tiny-1k-224",
3
+ "architectures": [
4
+ "ConvNextV2ForImageClassification"
5
+ ],
6
+ "depths": [
7
+ 3,
8
+ 3,
9
+ 9,
10
+ 3
11
+ ],
12
+ "drop_path_rate": 0.0,
13
+ "hidden_act": "gelu",
14
+ "hidden_sizes": [
15
+ 96,
16
+ 192,
17
+ 384,
18
+ 768
19
+ ],
20
+ "id2label": {
21
+ "0": "mandarin neck",
22
+ "1": "notch neck",
23
+ "2": "round neck",
24
+ "3": "shirt collar",
25
+ "4": "v neck"
26
+ },
27
+ "image_size": 224,
28
+ "initializer_range": 0.02,
29
+ "label2id": {
30
+ "mandarin neck": 0,
31
+ "notch neck": 1,
32
+ "round neck": 2,
33
+ "shirt collar": 3,
34
+ "v neck": 4
35
+ },
36
+ "layer_norm_eps": 1e-12,
37
+ "model_type": "convnextv2",
38
+ "num_channels": 3,
39
+ "num_stages": 4,
40
+ "out_features": [
41
+ "stage4"
42
+ ],
43
+ "out_indices": [
44
+ 4
45
+ ],
46
+ "patch_size": 4,
47
+ "problem_type": "single_label_classification",
48
+ "stage_names": [
49
+ "stem",
50
+ "stage1",
51
+ "stage2",
52
+ "stage3",
53
+ "stage4"
54
+ ],
55
+ "torch_dtype": "float32",
56
+ "transformers_version": "4.44.0"
57
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 98.96907216494846,
3
+ "eval_accuracy": 0.8492753623188406,
4
+ "eval_loss": 0.6083797812461853,
5
+ "eval_precision": 0.8507158478342087,
6
+ "eval_recall": 0.8492753623188406,
7
+ "eval_runtime": 1.7897,
8
+ "eval_samples_per_second": 192.765,
9
+ "eval_steps_per_second": 6.146
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd531331fc29d04c342346e07e220f05b4ea254206e16c40e28e5a1524c30d57
3
+ size 111505052
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_pct": 0.875,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.485,
8
+ 0.456,
9
+ 0.406
10
+ ],
11
+ "image_processor_type": "ConvNextImageProcessor",
12
+ "image_std": [
13
+ 0.229,
14
+ 0.224,
15
+ 0.225
16
+ ],
17
+ "resample": 3,
18
+ "rescale_factor": 0.00392156862745098,
19
+ "size": {
20
+ "shortest_edge": 224
21
+ }
22
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 98.96907216494846,
3
+ "total_flos": 7.732715563096474e+18,
4
+ "train_loss": 0.2344164727628231,
5
+ "train_runtime": 4723.8268,
6
+ "train_samples_per_second": 65.709,
7
+ "train_steps_per_second": 0.508
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,2811 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 98.96907216494846,
5
+ "eval_steps": 500,
6
+ "global_step": 2400,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.41237113402061853,
13
+ "grad_norm": 5.916716575622559,
14
+ "learning_rate": 2.0833333333333334e-06,
15
+ "loss": 1.6297,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.8247422680412371,
20
+ "grad_norm": 5.051618576049805,
21
+ "learning_rate": 4.166666666666667e-06,
22
+ "loss": 1.613,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.9896907216494846,
27
+ "eval_accuracy": 0.2927536231884058,
28
+ "eval_loss": 1.5833344459533691,
29
+ "eval_precision": 0.3247879943590829,
30
+ "eval_recall": 0.2927536231884058,
31
+ "eval_runtime": 2.9495,
32
+ "eval_samples_per_second": 116.97,
33
+ "eval_steps_per_second": 3.729,
34
+ "step": 24
35
+ },
36
+ {
37
+ "epoch": 1.2371134020618557,
38
+ "grad_norm": 4.8794169425964355,
39
+ "learning_rate": 6.25e-06,
40
+ "loss": 1.5792,
41
+ "step": 30
42
+ },
43
+ {
44
+ "epoch": 1.6494845360824741,
45
+ "grad_norm": 6.336801052093506,
46
+ "learning_rate": 8.333333333333334e-06,
47
+ "loss": 1.5494,
48
+ "step": 40
49
+ },
50
+ {
51
+ "epoch": 1.9793814432989691,
52
+ "eval_accuracy": 0.3681159420289855,
53
+ "eval_loss": 1.4944071769714355,
54
+ "eval_precision": 0.440954469667821,
55
+ "eval_recall": 0.3681159420289855,
56
+ "eval_runtime": 1.7863,
57
+ "eval_samples_per_second": 193.135,
58
+ "eval_steps_per_second": 6.158,
59
+ "step": 48
60
+ },
61
+ {
62
+ "epoch": 2.0618556701030926,
63
+ "grad_norm": 8.574434280395508,
64
+ "learning_rate": 1.0416666666666668e-05,
65
+ "loss": 1.5014,
66
+ "step": 50
67
+ },
68
+ {
69
+ "epoch": 2.4742268041237114,
70
+ "grad_norm": 6.564225673675537,
71
+ "learning_rate": 1.25e-05,
72
+ "loss": 1.4422,
73
+ "step": 60
74
+ },
75
+ {
76
+ "epoch": 2.88659793814433,
77
+ "grad_norm": 5.804593086242676,
78
+ "learning_rate": 1.4583333333333335e-05,
79
+ "loss": 1.3989,
80
+ "step": 70
81
+ },
82
+ {
83
+ "epoch": 2.9690721649484537,
84
+ "eval_accuracy": 0.5159420289855072,
85
+ "eval_loss": 1.3423842191696167,
86
+ "eval_precision": 0.52619860815513,
87
+ "eval_recall": 0.5159420289855072,
88
+ "eval_runtime": 1.8303,
89
+ "eval_samples_per_second": 188.493,
90
+ "eval_steps_per_second": 6.01,
91
+ "step": 72
92
+ },
93
+ {
94
+ "epoch": 3.2989690721649483,
95
+ "grad_norm": 6.893215656280518,
96
+ "learning_rate": 1.6666666666666667e-05,
97
+ "loss": 1.2968,
98
+ "step": 80
99
+ },
100
+ {
101
+ "epoch": 3.711340206185567,
102
+ "grad_norm": 12.37126350402832,
103
+ "learning_rate": 1.8750000000000002e-05,
104
+ "loss": 1.2238,
105
+ "step": 90
106
+ },
107
+ {
108
+ "epoch": 4.0,
109
+ "eval_accuracy": 0.6260869565217392,
110
+ "eval_loss": 1.1162269115447998,
111
+ "eval_precision": 0.6665610702002287,
112
+ "eval_recall": 0.6260869565217392,
113
+ "eval_runtime": 1.8634,
114
+ "eval_samples_per_second": 185.144,
115
+ "eval_steps_per_second": 5.903,
116
+ "step": 97
117
+ },
118
+ {
119
+ "epoch": 4.123711340206185,
120
+ "grad_norm": 6.501392841339111,
121
+ "learning_rate": 2.0833333333333336e-05,
122
+ "loss": 1.1194,
123
+ "step": 100
124
+ },
125
+ {
126
+ "epoch": 4.536082474226804,
127
+ "grad_norm": 14.653229713439941,
128
+ "learning_rate": 2.2916666666666667e-05,
129
+ "loss": 1.0499,
130
+ "step": 110
131
+ },
132
+ {
133
+ "epoch": 4.948453608247423,
134
+ "grad_norm": 15.2618408203125,
135
+ "learning_rate": 2.5e-05,
136
+ "loss": 0.9585,
137
+ "step": 120
138
+ },
139
+ {
140
+ "epoch": 4.989690721649485,
141
+ "eval_accuracy": 0.6985507246376812,
142
+ "eval_loss": 0.8966168761253357,
143
+ "eval_precision": 0.7013922738306568,
144
+ "eval_recall": 0.6985507246376812,
145
+ "eval_runtime": 1.8339,
146
+ "eval_samples_per_second": 188.12,
147
+ "eval_steps_per_second": 5.998,
148
+ "step": 121
149
+ },
150
+ {
151
+ "epoch": 5.360824742268041,
152
+ "grad_norm": 12.275806427001953,
153
+ "learning_rate": 2.7083333333333332e-05,
154
+ "loss": 0.8986,
155
+ "step": 130
156
+ },
157
+ {
158
+ "epoch": 5.77319587628866,
159
+ "grad_norm": 15.373220443725586,
160
+ "learning_rate": 2.916666666666667e-05,
161
+ "loss": 0.8934,
162
+ "step": 140
163
+ },
164
+ {
165
+ "epoch": 5.979381443298969,
166
+ "eval_accuracy": 0.7507246376811594,
167
+ "eval_loss": 0.763816773891449,
168
+ "eval_precision": 0.7489666881245252,
169
+ "eval_recall": 0.7507246376811594,
170
+ "eval_runtime": 1.9332,
171
+ "eval_samples_per_second": 178.459,
172
+ "eval_steps_per_second": 5.69,
173
+ "step": 145
174
+ },
175
+ {
176
+ "epoch": 6.185567010309279,
177
+ "grad_norm": 15.394486427307129,
178
+ "learning_rate": 3.125e-05,
179
+ "loss": 0.8326,
180
+ "step": 150
181
+ },
182
+ {
183
+ "epoch": 6.597938144329897,
184
+ "grad_norm": 14.27376937866211,
185
+ "learning_rate": 3.3333333333333335e-05,
186
+ "loss": 0.7589,
187
+ "step": 160
188
+ },
189
+ {
190
+ "epoch": 6.969072164948454,
191
+ "eval_accuracy": 0.7652173913043478,
192
+ "eval_loss": 0.6776081919670105,
193
+ "eval_precision": 0.771906259033061,
194
+ "eval_recall": 0.7652173913043478,
195
+ "eval_runtime": 1.836,
196
+ "eval_samples_per_second": 187.91,
197
+ "eval_steps_per_second": 5.991,
198
+ "step": 169
199
+ },
200
+ {
201
+ "epoch": 7.010309278350515,
202
+ "grad_norm": 21.43760871887207,
203
+ "learning_rate": 3.541666666666667e-05,
204
+ "loss": 0.7404,
205
+ "step": 170
206
+ },
207
+ {
208
+ "epoch": 7.422680412371134,
209
+ "grad_norm": 15.207581520080566,
210
+ "learning_rate": 3.7500000000000003e-05,
211
+ "loss": 0.653,
212
+ "step": 180
213
+ },
214
+ {
215
+ "epoch": 7.835051546391752,
216
+ "grad_norm": 25.153663635253906,
217
+ "learning_rate": 3.958333333333333e-05,
218
+ "loss": 0.6746,
219
+ "step": 190
220
+ },
221
+ {
222
+ "epoch": 8.0,
223
+ "eval_accuracy": 0.7623188405797101,
224
+ "eval_loss": 0.6126735210418701,
225
+ "eval_precision": 0.7628428431334807,
226
+ "eval_recall": 0.7623188405797101,
227
+ "eval_runtime": 1.8501,
228
+ "eval_samples_per_second": 186.474,
229
+ "eval_steps_per_second": 5.946,
230
+ "step": 194
231
+ },
232
+ {
233
+ "epoch": 8.24742268041237,
234
+ "grad_norm": 23.2750301361084,
235
+ "learning_rate": 4.166666666666667e-05,
236
+ "loss": 0.6516,
237
+ "step": 200
238
+ },
239
+ {
240
+ "epoch": 8.65979381443299,
241
+ "grad_norm": 21.777841567993164,
242
+ "learning_rate": 4.375e-05,
243
+ "loss": 0.6048,
244
+ "step": 210
245
+ },
246
+ {
247
+ "epoch": 8.989690721649485,
248
+ "eval_accuracy": 0.8202898550724638,
249
+ "eval_loss": 0.5220813751220703,
250
+ "eval_precision": 0.8216835971752063,
251
+ "eval_recall": 0.8202898550724638,
252
+ "eval_runtime": 1.8243,
253
+ "eval_samples_per_second": 189.114,
254
+ "eval_steps_per_second": 6.03,
255
+ "step": 218
256
+ },
257
+ {
258
+ "epoch": 9.072164948453608,
259
+ "grad_norm": 15.630614280700684,
260
+ "learning_rate": 4.5833333333333334e-05,
261
+ "loss": 0.5723,
262
+ "step": 220
263
+ },
264
+ {
265
+ "epoch": 9.484536082474227,
266
+ "grad_norm": 13.571239471435547,
267
+ "learning_rate": 4.791666666666667e-05,
268
+ "loss": 0.5436,
269
+ "step": 230
270
+ },
271
+ {
272
+ "epoch": 9.896907216494846,
273
+ "grad_norm": 24.206087112426758,
274
+ "learning_rate": 5e-05,
275
+ "loss": 0.531,
276
+ "step": 240
277
+ },
278
+ {
279
+ "epoch": 9.97938144329897,
280
+ "eval_accuracy": 0.8115942028985508,
281
+ "eval_loss": 0.4930874705314636,
282
+ "eval_precision": 0.8203605371226137,
283
+ "eval_recall": 0.8115942028985508,
284
+ "eval_runtime": 1.788,
285
+ "eval_samples_per_second": 192.958,
286
+ "eval_steps_per_second": 6.152,
287
+ "step": 242
288
+ },
289
+ {
290
+ "epoch": 10.309278350515465,
291
+ "grad_norm": 17.16573715209961,
292
+ "learning_rate": 4.976851851851852e-05,
293
+ "loss": 0.5034,
294
+ "step": 250
295
+ },
296
+ {
297
+ "epoch": 10.721649484536082,
298
+ "grad_norm": 19.933942794799805,
299
+ "learning_rate": 4.9537037037037035e-05,
300
+ "loss": 0.57,
301
+ "step": 260
302
+ },
303
+ {
304
+ "epoch": 10.969072164948454,
305
+ "eval_accuracy": 0.8318840579710145,
306
+ "eval_loss": 0.44795188307762146,
307
+ "eval_precision": 0.8344579895060443,
308
+ "eval_recall": 0.8318840579710145,
309
+ "eval_runtime": 1.8183,
310
+ "eval_samples_per_second": 189.733,
311
+ "eval_steps_per_second": 6.049,
312
+ "step": 266
313
+ },
314
+ {
315
+ "epoch": 11.1340206185567,
316
+ "grad_norm": 25.91600799560547,
317
+ "learning_rate": 4.930555555555556e-05,
318
+ "loss": 0.4791,
319
+ "step": 270
320
+ },
321
+ {
322
+ "epoch": 11.54639175257732,
323
+ "grad_norm": 23.493484497070312,
324
+ "learning_rate": 4.9074074074074075e-05,
325
+ "loss": 0.4372,
326
+ "step": 280
327
+ },
328
+ {
329
+ "epoch": 11.958762886597938,
330
+ "grad_norm": 14.273780822753906,
331
+ "learning_rate": 4.8842592592592595e-05,
332
+ "loss": 0.4624,
333
+ "step": 290
334
+ },
335
+ {
336
+ "epoch": 12.0,
337
+ "eval_accuracy": 0.8463768115942029,
338
+ "eval_loss": 0.42139920592308044,
339
+ "eval_precision": 0.846014277166443,
340
+ "eval_recall": 0.8463768115942029,
341
+ "eval_runtime": 1.7884,
342
+ "eval_samples_per_second": 192.914,
343
+ "eval_steps_per_second": 6.151,
344
+ "step": 291
345
+ },
346
+ {
347
+ "epoch": 12.371134020618557,
348
+ "grad_norm": 26.43771743774414,
349
+ "learning_rate": 4.8611111111111115e-05,
350
+ "loss": 0.4509,
351
+ "step": 300
352
+ },
353
+ {
354
+ "epoch": 12.783505154639176,
355
+ "grad_norm": 29.501718521118164,
356
+ "learning_rate": 4.837962962962963e-05,
357
+ "loss": 0.417,
358
+ "step": 310
359
+ },
360
+ {
361
+ "epoch": 12.989690721649485,
362
+ "eval_accuracy": 0.8492753623188406,
363
+ "eval_loss": 0.44392213225364685,
364
+ "eval_precision": 0.8485676738054103,
365
+ "eval_recall": 0.8492753623188406,
366
+ "eval_runtime": 1.762,
367
+ "eval_samples_per_second": 195.797,
368
+ "eval_steps_per_second": 6.243,
369
+ "step": 315
370
+ },
371
+ {
372
+ "epoch": 13.195876288659793,
373
+ "grad_norm": 16.380001068115234,
374
+ "learning_rate": 4.814814814814815e-05,
375
+ "loss": 0.4042,
376
+ "step": 320
377
+ },
378
+ {
379
+ "epoch": 13.608247422680412,
380
+ "grad_norm": 26.098731994628906,
381
+ "learning_rate": 4.791666666666667e-05,
382
+ "loss": 0.3814,
383
+ "step": 330
384
+ },
385
+ {
386
+ "epoch": 13.97938144329897,
387
+ "eval_accuracy": 0.8463768115942029,
388
+ "eval_loss": 0.41379421949386597,
389
+ "eval_precision": 0.8477774513274812,
390
+ "eval_recall": 0.8463768115942029,
391
+ "eval_runtime": 1.7998,
392
+ "eval_samples_per_second": 191.689,
393
+ "eval_steps_per_second": 6.112,
394
+ "step": 339
395
+ },
396
+ {
397
+ "epoch": 14.02061855670103,
398
+ "grad_norm": 13.136883735656738,
399
+ "learning_rate": 4.768518518518519e-05,
400
+ "loss": 0.4209,
401
+ "step": 340
402
+ },
403
+ {
404
+ "epoch": 14.43298969072165,
405
+ "grad_norm": 18.104930877685547,
406
+ "learning_rate": 4.745370370370371e-05,
407
+ "loss": 0.3817,
408
+ "step": 350
409
+ },
410
+ {
411
+ "epoch": 14.845360824742269,
412
+ "grad_norm": 27.79136848449707,
413
+ "learning_rate": 4.722222222222222e-05,
414
+ "loss": 0.3737,
415
+ "step": 360
416
+ },
417
+ {
418
+ "epoch": 14.969072164948454,
419
+ "eval_accuracy": 0.8463768115942029,
420
+ "eval_loss": 0.41388532519340515,
421
+ "eval_precision": 0.8466409143288909,
422
+ "eval_recall": 0.8463768115942029,
423
+ "eval_runtime": 1.8854,
424
+ "eval_samples_per_second": 182.983,
425
+ "eval_steps_per_second": 5.834,
426
+ "step": 363
427
+ },
428
+ {
429
+ "epoch": 15.257731958762886,
430
+ "grad_norm": 33.14027786254883,
431
+ "learning_rate": 4.699074074074074e-05,
432
+ "loss": 0.3782,
433
+ "step": 370
434
+ },
435
+ {
436
+ "epoch": 15.670103092783505,
437
+ "grad_norm": 10.574623107910156,
438
+ "learning_rate": 4.675925925925926e-05,
439
+ "loss": 0.3971,
440
+ "step": 380
441
+ },
442
+ {
443
+ "epoch": 16.0,
444
+ "eval_accuracy": 0.863768115942029,
445
+ "eval_loss": 0.4119352400302887,
446
+ "eval_precision": 0.8664915871553495,
447
+ "eval_recall": 0.863768115942029,
448
+ "eval_runtime": 1.8638,
449
+ "eval_samples_per_second": 185.11,
450
+ "eval_steps_per_second": 5.902,
451
+ "step": 388
452
+ },
453
+ {
454
+ "epoch": 16.082474226804123,
455
+ "grad_norm": 14.796497344970703,
456
+ "learning_rate": 4.652777777777778e-05,
457
+ "loss": 0.3227,
458
+ "step": 390
459
+ },
460
+ {
461
+ "epoch": 16.49484536082474,
462
+ "grad_norm": 13.750545501708984,
463
+ "learning_rate": 4.62962962962963e-05,
464
+ "loss": 0.306,
465
+ "step": 400
466
+ },
467
+ {
468
+ "epoch": 16.90721649484536,
469
+ "grad_norm": 15.056818962097168,
470
+ "learning_rate": 4.6064814814814814e-05,
471
+ "loss": 0.343,
472
+ "step": 410
473
+ },
474
+ {
475
+ "epoch": 16.989690721649485,
476
+ "eval_accuracy": 0.8608695652173913,
477
+ "eval_loss": 0.4421471655368805,
478
+ "eval_precision": 0.8659298079116737,
479
+ "eval_recall": 0.8608695652173913,
480
+ "eval_runtime": 1.7876,
481
+ "eval_samples_per_second": 192.996,
482
+ "eval_steps_per_second": 6.154,
483
+ "step": 412
484
+ },
485
+ {
486
+ "epoch": 17.31958762886598,
487
+ "grad_norm": 19.41351318359375,
488
+ "learning_rate": 4.5833333333333334e-05,
489
+ "loss": 0.3383,
490
+ "step": 420
491
+ },
492
+ {
493
+ "epoch": 17.7319587628866,
494
+ "grad_norm": 22.833810806274414,
495
+ "learning_rate": 4.5601851851851854e-05,
496
+ "loss": 0.3311,
497
+ "step": 430
498
+ },
499
+ {
500
+ "epoch": 17.97938144329897,
501
+ "eval_accuracy": 0.8492753623188406,
502
+ "eval_loss": 0.45808833837509155,
503
+ "eval_precision": 0.8503668982654489,
504
+ "eval_recall": 0.8492753623188406,
505
+ "eval_runtime": 1.8173,
506
+ "eval_samples_per_second": 189.846,
507
+ "eval_steps_per_second": 6.053,
508
+ "step": 436
509
+ },
510
+ {
511
+ "epoch": 18.144329896907216,
512
+ "grad_norm": 9.80312442779541,
513
+ "learning_rate": 4.5370370370370374e-05,
514
+ "loss": 0.301,
515
+ "step": 440
516
+ },
517
+ {
518
+ "epoch": 18.556701030927837,
519
+ "grad_norm": 17.442903518676758,
520
+ "learning_rate": 4.5138888888888894e-05,
521
+ "loss": 0.2594,
522
+ "step": 450
523
+ },
524
+ {
525
+ "epoch": 18.969072164948454,
526
+ "grad_norm": 25.01900863647461,
527
+ "learning_rate": 4.490740740740741e-05,
528
+ "loss": 0.2652,
529
+ "step": 460
530
+ },
531
+ {
532
+ "epoch": 18.969072164948454,
533
+ "eval_accuracy": 0.8405797101449275,
534
+ "eval_loss": 0.4563068747520447,
535
+ "eval_precision": 0.8441116322796441,
536
+ "eval_recall": 0.8405797101449275,
537
+ "eval_runtime": 1.8121,
538
+ "eval_samples_per_second": 190.387,
539
+ "eval_steps_per_second": 6.07,
540
+ "step": 460
541
+ },
542
+ {
543
+ "epoch": 19.38144329896907,
544
+ "grad_norm": 22.951929092407227,
545
+ "learning_rate": 4.467592592592593e-05,
546
+ "loss": 0.2726,
547
+ "step": 470
548
+ },
549
+ {
550
+ "epoch": 19.79381443298969,
551
+ "grad_norm": 17.189971923828125,
552
+ "learning_rate": 4.4444444444444447e-05,
553
+ "loss": 0.3026,
554
+ "step": 480
555
+ },
556
+ {
557
+ "epoch": 20.0,
558
+ "eval_accuracy": 0.8521739130434782,
559
+ "eval_loss": 0.4535578489303589,
560
+ "eval_precision": 0.8549145070160367,
561
+ "eval_recall": 0.8521739130434782,
562
+ "eval_runtime": 1.8156,
563
+ "eval_samples_per_second": 190.019,
564
+ "eval_steps_per_second": 6.059,
565
+ "step": 485
566
+ },
567
+ {
568
+ "epoch": 20.20618556701031,
569
+ "grad_norm": 19.29929542541504,
570
+ "learning_rate": 4.4212962962962966e-05,
571
+ "loss": 0.2808,
572
+ "step": 490
573
+ },
574
+ {
575
+ "epoch": 20.61855670103093,
576
+ "grad_norm": 23.201435089111328,
577
+ "learning_rate": 4.3981481481481486e-05,
578
+ "loss": 0.2562,
579
+ "step": 500
580
+ },
581
+ {
582
+ "epoch": 20.989690721649485,
583
+ "eval_accuracy": 0.8463768115942029,
584
+ "eval_loss": 0.44093257188796997,
585
+ "eval_precision": 0.8493084398986088,
586
+ "eval_recall": 0.8463768115942029,
587
+ "eval_runtime": 1.9468,
588
+ "eval_samples_per_second": 177.217,
589
+ "eval_steps_per_second": 5.65,
590
+ "step": 509
591
+ },
592
+ {
593
+ "epoch": 21.030927835051546,
594
+ "grad_norm": 12.947028160095215,
595
+ "learning_rate": 4.375e-05,
596
+ "loss": 0.2739,
597
+ "step": 510
598
+ },
599
+ {
600
+ "epoch": 21.443298969072163,
601
+ "grad_norm": 21.544536590576172,
602
+ "learning_rate": 4.351851851851852e-05,
603
+ "loss": 0.2383,
604
+ "step": 520
605
+ },
606
+ {
607
+ "epoch": 21.855670103092784,
608
+ "grad_norm": 12.224617958068848,
609
+ "learning_rate": 4.328703703703704e-05,
610
+ "loss": 0.2282,
611
+ "step": 530
612
+ },
613
+ {
614
+ "epoch": 21.97938144329897,
615
+ "eval_accuracy": 0.8434782608695652,
616
+ "eval_loss": 0.4388555884361267,
617
+ "eval_precision": 0.8451190974708183,
618
+ "eval_recall": 0.8434782608695652,
619
+ "eval_runtime": 1.7718,
620
+ "eval_samples_per_second": 194.721,
621
+ "eval_steps_per_second": 6.208,
622
+ "step": 533
623
+ },
624
+ {
625
+ "epoch": 22.2680412371134,
626
+ "grad_norm": 17.55919647216797,
627
+ "learning_rate": 4.305555555555556e-05,
628
+ "loss": 0.2505,
629
+ "step": 540
630
+ },
631
+ {
632
+ "epoch": 22.68041237113402,
633
+ "grad_norm": 10.570196151733398,
634
+ "learning_rate": 4.282407407407408e-05,
635
+ "loss": 0.2374,
636
+ "step": 550
637
+ },
638
+ {
639
+ "epoch": 22.969072164948454,
640
+ "eval_accuracy": 0.8579710144927536,
641
+ "eval_loss": 0.4452122747898102,
642
+ "eval_precision": 0.8589461524849866,
643
+ "eval_recall": 0.8579710144927536,
644
+ "eval_runtime": 1.8751,
645
+ "eval_samples_per_second": 183.989,
646
+ "eval_steps_per_second": 5.866,
647
+ "step": 557
648
+ },
649
+ {
650
+ "epoch": 23.09278350515464,
651
+ "grad_norm": 25.781587600708008,
652
+ "learning_rate": 4.259259259259259e-05,
653
+ "loss": 0.2355,
654
+ "step": 560
655
+ },
656
+ {
657
+ "epoch": 23.50515463917526,
658
+ "grad_norm": 22.854766845703125,
659
+ "learning_rate": 4.236111111111111e-05,
660
+ "loss": 0.2553,
661
+ "step": 570
662
+ },
663
+ {
664
+ "epoch": 23.917525773195877,
665
+ "grad_norm": 15.405595779418945,
666
+ "learning_rate": 4.212962962962963e-05,
667
+ "loss": 0.216,
668
+ "step": 580
669
+ },
670
+ {
671
+ "epoch": 24.0,
672
+ "eval_accuracy": 0.8579710144927536,
673
+ "eval_loss": 0.4375264048576355,
674
+ "eval_precision": 0.858123097800969,
675
+ "eval_recall": 0.8579710144927536,
676
+ "eval_runtime": 1.8051,
677
+ "eval_samples_per_second": 191.128,
678
+ "eval_steps_per_second": 6.094,
679
+ "step": 582
680
+ },
681
+ {
682
+ "epoch": 24.329896907216494,
683
+ "grad_norm": 15.453635215759277,
684
+ "learning_rate": 4.1898148148148145e-05,
685
+ "loss": 0.2019,
686
+ "step": 590
687
+ },
688
+ {
689
+ "epoch": 24.742268041237114,
690
+ "grad_norm": 12.363275527954102,
691
+ "learning_rate": 4.166666666666667e-05,
692
+ "loss": 0.2127,
693
+ "step": 600
694
+ },
695
+ {
696
+ "epoch": 24.989690721649485,
697
+ "eval_accuracy": 0.8579710144927536,
698
+ "eval_loss": 0.44218453764915466,
699
+ "eval_precision": 0.8587798835624924,
700
+ "eval_recall": 0.8579710144927536,
701
+ "eval_runtime": 1.9062,
702
+ "eval_samples_per_second": 180.991,
703
+ "eval_steps_per_second": 5.771,
704
+ "step": 606
705
+ },
706
+ {
707
+ "epoch": 25.15463917525773,
708
+ "grad_norm": 15.13847827911377,
709
+ "learning_rate": 4.1435185185185185e-05,
710
+ "loss": 0.2301,
711
+ "step": 610
712
+ },
713
+ {
714
+ "epoch": 25.567010309278352,
715
+ "grad_norm": 20.761062622070312,
716
+ "learning_rate": 4.1203703703703705e-05,
717
+ "loss": 0.1807,
718
+ "step": 620
719
+ },
720
+ {
721
+ "epoch": 25.97938144329897,
722
+ "grad_norm": 17.889150619506836,
723
+ "learning_rate": 4.0972222222222225e-05,
724
+ "loss": 0.2004,
725
+ "step": 630
726
+ },
727
+ {
728
+ "epoch": 25.97938144329897,
729
+ "eval_accuracy": 0.8521739130434782,
730
+ "eval_loss": 0.46348363161087036,
731
+ "eval_precision": 0.8519325944084339,
732
+ "eval_recall": 0.8521739130434782,
733
+ "eval_runtime": 1.7728,
734
+ "eval_samples_per_second": 194.609,
735
+ "eval_steps_per_second": 6.205,
736
+ "step": 630
737
+ },
738
+ {
739
+ "epoch": 26.391752577319586,
740
+ "grad_norm": 23.56374168395996,
741
+ "learning_rate": 4.074074074074074e-05,
742
+ "loss": 0.2427,
743
+ "step": 640
744
+ },
745
+ {
746
+ "epoch": 26.804123711340207,
747
+ "grad_norm": 9.772664070129395,
748
+ "learning_rate": 4.0509259259259265e-05,
749
+ "loss": 0.2029,
750
+ "step": 650
751
+ },
752
+ {
753
+ "epoch": 26.969072164948454,
754
+ "eval_accuracy": 0.8492753623188406,
755
+ "eval_loss": 0.5214529037475586,
756
+ "eval_precision": 0.8545500895204992,
757
+ "eval_recall": 0.8492753623188406,
758
+ "eval_runtime": 1.9291,
759
+ "eval_samples_per_second": 178.841,
760
+ "eval_steps_per_second": 5.702,
761
+ "step": 654
762
+ },
763
+ {
764
+ "epoch": 27.216494845360824,
765
+ "grad_norm": 14.480449676513672,
766
+ "learning_rate": 4.027777777777778e-05,
767
+ "loss": 0.1903,
768
+ "step": 660
769
+ },
770
+ {
771
+ "epoch": 27.628865979381445,
772
+ "grad_norm": 16.415973663330078,
773
+ "learning_rate": 4.00462962962963e-05,
774
+ "loss": 0.1794,
775
+ "step": 670
776
+ },
777
+ {
778
+ "epoch": 28.0,
779
+ "eval_accuracy": 0.863768115942029,
780
+ "eval_loss": 0.47563326358795166,
781
+ "eval_precision": 0.8669166767891824,
782
+ "eval_recall": 0.863768115942029,
783
+ "eval_runtime": 1.7555,
784
+ "eval_samples_per_second": 196.529,
785
+ "eval_steps_per_second": 6.266,
786
+ "step": 679
787
+ },
788
+ {
789
+ "epoch": 28.04123711340206,
790
+ "grad_norm": 8.689855575561523,
791
+ "learning_rate": 3.981481481481482e-05,
792
+ "loss": 0.1822,
793
+ "step": 680
794
+ },
795
+ {
796
+ "epoch": 28.45360824742268,
797
+ "grad_norm": 12.505402565002441,
798
+ "learning_rate": 3.958333333333333e-05,
799
+ "loss": 0.1828,
800
+ "step": 690
801
+ },
802
+ {
803
+ "epoch": 28.8659793814433,
804
+ "grad_norm": 15.491950988769531,
805
+ "learning_rate": 3.935185185185186e-05,
806
+ "loss": 0.1835,
807
+ "step": 700
808
+ },
809
+ {
810
+ "epoch": 28.989690721649485,
811
+ "eval_accuracy": 0.8608695652173913,
812
+ "eval_loss": 0.4727528393268585,
813
+ "eval_precision": 0.8649801117780185,
814
+ "eval_recall": 0.8608695652173913,
815
+ "eval_runtime": 1.8858,
816
+ "eval_samples_per_second": 182.95,
817
+ "eval_steps_per_second": 5.833,
818
+ "step": 703
819
+ },
820
+ {
821
+ "epoch": 29.278350515463917,
822
+ "grad_norm": 16.289226531982422,
823
+ "learning_rate": 3.912037037037037e-05,
824
+ "loss": 0.1907,
825
+ "step": 710
826
+ },
827
+ {
828
+ "epoch": 29.690721649484537,
829
+ "grad_norm": 13.304434776306152,
830
+ "learning_rate": 3.888888888888889e-05,
831
+ "loss": 0.1781,
832
+ "step": 720
833
+ },
834
+ {
835
+ "epoch": 29.97938144329897,
836
+ "eval_accuracy": 0.855072463768116,
837
+ "eval_loss": 0.4636934697628021,
838
+ "eval_precision": 0.8568131435327558,
839
+ "eval_recall": 0.855072463768116,
840
+ "eval_runtime": 1.8681,
841
+ "eval_samples_per_second": 184.683,
842
+ "eval_steps_per_second": 5.888,
843
+ "step": 727
844
+ },
845
+ {
846
+ "epoch": 30.103092783505154,
847
+ "grad_norm": 6.991786003112793,
848
+ "learning_rate": 3.865740740740741e-05,
849
+ "loss": 0.1829,
850
+ "step": 730
851
+ },
852
+ {
853
+ "epoch": 30.51546391752577,
854
+ "grad_norm": 10.514315605163574,
855
+ "learning_rate": 3.8425925925925924e-05,
856
+ "loss": 0.1627,
857
+ "step": 740
858
+ },
859
+ {
860
+ "epoch": 30.927835051546392,
861
+ "grad_norm": 9.121224403381348,
862
+ "learning_rate": 3.8194444444444444e-05,
863
+ "loss": 0.1671,
864
+ "step": 750
865
+ },
866
+ {
867
+ "epoch": 30.969072164948454,
868
+ "eval_accuracy": 0.8579710144927536,
869
+ "eval_loss": 0.485573947429657,
870
+ "eval_precision": 0.8599276434444294,
871
+ "eval_recall": 0.8579710144927536,
872
+ "eval_runtime": 1.9437,
873
+ "eval_samples_per_second": 177.497,
874
+ "eval_steps_per_second": 5.659,
875
+ "step": 751
876
+ },
877
+ {
878
+ "epoch": 31.34020618556701,
879
+ "grad_norm": 13.762226104736328,
880
+ "learning_rate": 3.7962962962962964e-05,
881
+ "loss": 0.1721,
882
+ "step": 760
883
+ },
884
+ {
885
+ "epoch": 31.75257731958763,
886
+ "grad_norm": 10.415836334228516,
887
+ "learning_rate": 3.7731481481481484e-05,
888
+ "loss": 0.1762,
889
+ "step": 770
890
+ },
891
+ {
892
+ "epoch": 32.0,
893
+ "eval_accuracy": 0.8666666666666667,
894
+ "eval_loss": 0.5007998943328857,
895
+ "eval_precision": 0.8684023473901008,
896
+ "eval_recall": 0.8666666666666667,
897
+ "eval_runtime": 1.769,
898
+ "eval_samples_per_second": 195.026,
899
+ "eval_steps_per_second": 6.218,
900
+ "step": 776
901
+ },
902
+ {
903
+ "epoch": 32.16494845360825,
904
+ "grad_norm": 10.8311767578125,
905
+ "learning_rate": 3.7500000000000003e-05,
906
+ "loss": 0.1707,
907
+ "step": 780
908
+ },
909
+ {
910
+ "epoch": 32.577319587628864,
911
+ "grad_norm": 12.070932388305664,
912
+ "learning_rate": 3.726851851851852e-05,
913
+ "loss": 0.1673,
914
+ "step": 790
915
+ },
916
+ {
917
+ "epoch": 32.98969072164948,
918
+ "grad_norm": 8.654770851135254,
919
+ "learning_rate": 3.7037037037037037e-05,
920
+ "loss": 0.1867,
921
+ "step": 800
922
+ },
923
+ {
924
+ "epoch": 32.98969072164948,
925
+ "eval_accuracy": 0.8579710144927536,
926
+ "eval_loss": 0.5058211088180542,
927
+ "eval_precision": 0.8584843785997619,
928
+ "eval_recall": 0.8579710144927536,
929
+ "eval_runtime": 1.8394,
930
+ "eval_samples_per_second": 187.561,
931
+ "eval_steps_per_second": 5.98,
932
+ "step": 800
933
+ },
934
+ {
935
+ "epoch": 33.402061855670105,
936
+ "grad_norm": 8.323944091796875,
937
+ "learning_rate": 3.6805555555555556e-05,
938
+ "loss": 0.1553,
939
+ "step": 810
940
+ },
941
+ {
942
+ "epoch": 33.81443298969072,
943
+ "grad_norm": 14.134881973266602,
944
+ "learning_rate": 3.6574074074074076e-05,
945
+ "loss": 0.1409,
946
+ "step": 820
947
+ },
948
+ {
949
+ "epoch": 33.97938144329897,
950
+ "eval_accuracy": 0.8405797101449275,
951
+ "eval_loss": 0.5489646792411804,
952
+ "eval_precision": 0.8408524440704116,
953
+ "eval_recall": 0.8405797101449275,
954
+ "eval_runtime": 1.7738,
955
+ "eval_samples_per_second": 194.496,
956
+ "eval_steps_per_second": 6.201,
957
+ "step": 824
958
+ },
959
+ {
960
+ "epoch": 34.22680412371134,
961
+ "grad_norm": 17.74443244934082,
962
+ "learning_rate": 3.6342592592592596e-05,
963
+ "loss": 0.1498,
964
+ "step": 830
965
+ },
966
+ {
967
+ "epoch": 34.63917525773196,
968
+ "grad_norm": 14.35798454284668,
969
+ "learning_rate": 3.611111111111111e-05,
970
+ "loss": 0.1315,
971
+ "step": 840
972
+ },
973
+ {
974
+ "epoch": 34.96907216494845,
975
+ "eval_accuracy": 0.8347826086956521,
976
+ "eval_loss": 0.528394877910614,
977
+ "eval_precision": 0.8356368409524089,
978
+ "eval_recall": 0.8347826086956521,
979
+ "eval_runtime": 1.8034,
980
+ "eval_samples_per_second": 191.304,
981
+ "eval_steps_per_second": 6.1,
982
+ "step": 848
983
+ },
984
+ {
985
+ "epoch": 35.05154639175258,
986
+ "grad_norm": 15.67455005645752,
987
+ "learning_rate": 3.587962962962963e-05,
988
+ "loss": 0.163,
989
+ "step": 850
990
+ },
991
+ {
992
+ "epoch": 35.4639175257732,
993
+ "grad_norm": 6.1969828605651855,
994
+ "learning_rate": 3.564814814814815e-05,
995
+ "loss": 0.1406,
996
+ "step": 860
997
+ },
998
+ {
999
+ "epoch": 35.876288659793815,
1000
+ "grad_norm": 14.651385307312012,
1001
+ "learning_rate": 3.541666666666667e-05,
1002
+ "loss": 0.1315,
1003
+ "step": 870
1004
+ },
1005
+ {
1006
+ "epoch": 36.0,
1007
+ "eval_accuracy": 0.8463768115942029,
1008
+ "eval_loss": 0.5415348410606384,
1009
+ "eval_precision": 0.8487979974677805,
1010
+ "eval_recall": 0.8463768115942029,
1011
+ "eval_runtime": 1.7509,
1012
+ "eval_samples_per_second": 197.042,
1013
+ "eval_steps_per_second": 6.282,
1014
+ "step": 873
1015
+ },
1016
+ {
1017
+ "epoch": 36.28865979381443,
1018
+ "grad_norm": 15.739358901977539,
1019
+ "learning_rate": 3.518518518518519e-05,
1020
+ "loss": 0.1944,
1021
+ "step": 880
1022
+ },
1023
+ {
1024
+ "epoch": 36.70103092783505,
1025
+ "grad_norm": 16.889202117919922,
1026
+ "learning_rate": 3.49537037037037e-05,
1027
+ "loss": 0.1974,
1028
+ "step": 890
1029
+ },
1030
+ {
1031
+ "epoch": 36.98969072164948,
1032
+ "eval_accuracy": 0.8492753623188406,
1033
+ "eval_loss": 0.519416332244873,
1034
+ "eval_precision": 0.8536148561469765,
1035
+ "eval_recall": 0.8492753623188406,
1036
+ "eval_runtime": 1.7833,
1037
+ "eval_samples_per_second": 193.461,
1038
+ "eval_steps_per_second": 6.168,
1039
+ "step": 897
1040
+ },
1041
+ {
1042
+ "epoch": 37.11340206185567,
1043
+ "grad_norm": 10.011604309082031,
1044
+ "learning_rate": 3.472222222222222e-05,
1045
+ "loss": 0.1605,
1046
+ "step": 900
1047
+ },
1048
+ {
1049
+ "epoch": 37.52577319587629,
1050
+ "grad_norm": 18.694128036499023,
1051
+ "learning_rate": 3.449074074074074e-05,
1052
+ "loss": 0.1515,
1053
+ "step": 910
1054
+ },
1055
+ {
1056
+ "epoch": 37.93814432989691,
1057
+ "grad_norm": 9.140711784362793,
1058
+ "learning_rate": 3.425925925925926e-05,
1059
+ "loss": 0.1337,
1060
+ "step": 920
1061
+ },
1062
+ {
1063
+ "epoch": 37.97938144329897,
1064
+ "eval_accuracy": 0.8608695652173913,
1065
+ "eval_loss": 0.5088416337966919,
1066
+ "eval_precision": 0.8602982452483552,
1067
+ "eval_recall": 0.8608695652173913,
1068
+ "eval_runtime": 1.7456,
1069
+ "eval_samples_per_second": 197.634,
1070
+ "eval_steps_per_second": 6.301,
1071
+ "step": 921
1072
+ },
1073
+ {
1074
+ "epoch": 38.350515463917525,
1075
+ "grad_norm": 12.548330307006836,
1076
+ "learning_rate": 3.402777777777778e-05,
1077
+ "loss": 0.1439,
1078
+ "step": 930
1079
+ },
1080
+ {
1081
+ "epoch": 38.76288659793814,
1082
+ "grad_norm": 12.762455940246582,
1083
+ "learning_rate": 3.3796296296296295e-05,
1084
+ "loss": 0.173,
1085
+ "step": 940
1086
+ },
1087
+ {
1088
+ "epoch": 38.96907216494845,
1089
+ "eval_accuracy": 0.8666666666666667,
1090
+ "eval_loss": 0.4912014305591583,
1091
+ "eval_precision": 0.867978256170476,
1092
+ "eval_recall": 0.8666666666666667,
1093
+ "eval_runtime": 1.8067,
1094
+ "eval_samples_per_second": 190.96,
1095
+ "eval_steps_per_second": 6.089,
1096
+ "step": 945
1097
+ },
1098
+ {
1099
+ "epoch": 39.175257731958766,
1100
+ "grad_norm": 12.083857536315918,
1101
+ "learning_rate": 3.3564814814814815e-05,
1102
+ "loss": 0.1477,
1103
+ "step": 950
1104
+ },
1105
+ {
1106
+ "epoch": 39.58762886597938,
1107
+ "grad_norm": 17.14080238342285,
1108
+ "learning_rate": 3.3333333333333335e-05,
1109
+ "loss": 0.1285,
1110
+ "step": 960
1111
+ },
1112
+ {
1113
+ "epoch": 40.0,
1114
+ "grad_norm": 13.190485000610352,
1115
+ "learning_rate": 3.3101851851851855e-05,
1116
+ "loss": 0.1409,
1117
+ "step": 970
1118
+ },
1119
+ {
1120
+ "epoch": 40.0,
1121
+ "eval_accuracy": 0.8492753623188406,
1122
+ "eval_loss": 0.5222660899162292,
1123
+ "eval_precision": 0.8501727809182621,
1124
+ "eval_recall": 0.8492753623188406,
1125
+ "eval_runtime": 1.8482,
1126
+ "eval_samples_per_second": 186.669,
1127
+ "eval_steps_per_second": 5.952,
1128
+ "step": 970
1129
+ },
1130
+ {
1131
+ "epoch": 40.41237113402062,
1132
+ "grad_norm": 8.88687801361084,
1133
+ "learning_rate": 3.2870370370370375e-05,
1134
+ "loss": 0.151,
1135
+ "step": 980
1136
+ },
1137
+ {
1138
+ "epoch": 40.824742268041234,
1139
+ "grad_norm": 7.21800422668457,
1140
+ "learning_rate": 3.263888888888889e-05,
1141
+ "loss": 0.1379,
1142
+ "step": 990
1143
+ },
1144
+ {
1145
+ "epoch": 40.98969072164948,
1146
+ "eval_accuracy": 0.8492753623188406,
1147
+ "eval_loss": 0.5204349160194397,
1148
+ "eval_precision": 0.8486749182344644,
1149
+ "eval_recall": 0.8492753623188406,
1150
+ "eval_runtime": 1.8062,
1151
+ "eval_samples_per_second": 191.006,
1152
+ "eval_steps_per_second": 6.09,
1153
+ "step": 994
1154
+ },
1155
+ {
1156
+ "epoch": 41.23711340206186,
1157
+ "grad_norm": 10.057676315307617,
1158
+ "learning_rate": 3.240740740740741e-05,
1159
+ "loss": 0.1079,
1160
+ "step": 1000
1161
+ },
1162
+ {
1163
+ "epoch": 41.649484536082475,
1164
+ "grad_norm": 13.667500495910645,
1165
+ "learning_rate": 3.217592592592593e-05,
1166
+ "loss": 0.1437,
1167
+ "step": 1010
1168
+ },
1169
+ {
1170
+ "epoch": 41.97938144329897,
1171
+ "eval_accuracy": 0.8521739130434782,
1172
+ "eval_loss": 0.5860036611557007,
1173
+ "eval_precision": 0.8550665818648641,
1174
+ "eval_recall": 0.8521739130434782,
1175
+ "eval_runtime": 1.7468,
1176
+ "eval_samples_per_second": 197.503,
1177
+ "eval_steps_per_second": 6.297,
1178
+ "step": 1018
1179
+ },
1180
+ {
1181
+ "epoch": 42.06185567010309,
1182
+ "grad_norm": 6.985457420349121,
1183
+ "learning_rate": 3.194444444444444e-05,
1184
+ "loss": 0.1521,
1185
+ "step": 1020
1186
+ },
1187
+ {
1188
+ "epoch": 42.47422680412371,
1189
+ "grad_norm": 16.70668601989746,
1190
+ "learning_rate": 3.171296296296297e-05,
1191
+ "loss": 0.1393,
1192
+ "step": 1030
1193
+ },
1194
+ {
1195
+ "epoch": 42.88659793814433,
1196
+ "grad_norm": 6.907033920288086,
1197
+ "learning_rate": 3.148148148148148e-05,
1198
+ "loss": 0.1022,
1199
+ "step": 1040
1200
+ },
1201
+ {
1202
+ "epoch": 42.96907216494845,
1203
+ "eval_accuracy": 0.8463768115942029,
1204
+ "eval_loss": 0.5460776686668396,
1205
+ "eval_precision": 0.8491763964495722,
1206
+ "eval_recall": 0.8463768115942029,
1207
+ "eval_runtime": 1.7961,
1208
+ "eval_samples_per_second": 192.078,
1209
+ "eval_steps_per_second": 6.124,
1210
+ "step": 1042
1211
+ },
1212
+ {
1213
+ "epoch": 43.29896907216495,
1214
+ "grad_norm": 9.046392440795898,
1215
+ "learning_rate": 3.125e-05,
1216
+ "loss": 0.1385,
1217
+ "step": 1050
1218
+ },
1219
+ {
1220
+ "epoch": 43.71134020618557,
1221
+ "grad_norm": 10.188021659851074,
1222
+ "learning_rate": 3.101851851851852e-05,
1223
+ "loss": 0.1181,
1224
+ "step": 1060
1225
+ },
1226
+ {
1227
+ "epoch": 44.0,
1228
+ "eval_accuracy": 0.855072463768116,
1229
+ "eval_loss": 0.541079044342041,
1230
+ "eval_precision": 0.856643419178803,
1231
+ "eval_recall": 0.855072463768116,
1232
+ "eval_runtime": 1.7664,
1233
+ "eval_samples_per_second": 195.31,
1234
+ "eval_steps_per_second": 6.227,
1235
+ "step": 1067
1236
+ },
1237
+ {
1238
+ "epoch": 44.123711340206185,
1239
+ "grad_norm": 8.506319046020508,
1240
+ "learning_rate": 3.0787037037037034e-05,
1241
+ "loss": 0.1411,
1242
+ "step": 1070
1243
+ },
1244
+ {
1245
+ "epoch": 44.5360824742268,
1246
+ "grad_norm": 15.423176765441895,
1247
+ "learning_rate": 3.055555555555556e-05,
1248
+ "loss": 0.1346,
1249
+ "step": 1080
1250
+ },
1251
+ {
1252
+ "epoch": 44.94845360824742,
1253
+ "grad_norm": 6.524370193481445,
1254
+ "learning_rate": 3.0324074074074077e-05,
1255
+ "loss": 0.1212,
1256
+ "step": 1090
1257
+ },
1258
+ {
1259
+ "epoch": 44.98969072164948,
1260
+ "eval_accuracy": 0.8579710144927536,
1261
+ "eval_loss": 0.5293735861778259,
1262
+ "eval_precision": 0.8580282602145957,
1263
+ "eval_recall": 0.8579710144927536,
1264
+ "eval_runtime": 1.8173,
1265
+ "eval_samples_per_second": 189.843,
1266
+ "eval_steps_per_second": 6.053,
1267
+ "step": 1091
1268
+ },
1269
+ {
1270
+ "epoch": 45.36082474226804,
1271
+ "grad_norm": 12.142955780029297,
1272
+ "learning_rate": 3.0092592592592593e-05,
1273
+ "loss": 0.105,
1274
+ "step": 1100
1275
+ },
1276
+ {
1277
+ "epoch": 45.77319587628866,
1278
+ "grad_norm": 11.581314086914062,
1279
+ "learning_rate": 2.9861111111111113e-05,
1280
+ "loss": 0.1049,
1281
+ "step": 1110
1282
+ },
1283
+ {
1284
+ "epoch": 45.97938144329897,
1285
+ "eval_accuracy": 0.8492753623188406,
1286
+ "eval_loss": 0.566691517829895,
1287
+ "eval_precision": 0.8491712997027965,
1288
+ "eval_recall": 0.8492753623188406,
1289
+ "eval_runtime": 1.799,
1290
+ "eval_samples_per_second": 191.772,
1291
+ "eval_steps_per_second": 6.114,
1292
+ "step": 1115
1293
+ },
1294
+ {
1295
+ "epoch": 46.18556701030928,
1296
+ "grad_norm": 15.353252410888672,
1297
+ "learning_rate": 2.962962962962963e-05,
1298
+ "loss": 0.1335,
1299
+ "step": 1120
1300
+ },
1301
+ {
1302
+ "epoch": 46.597938144329895,
1303
+ "grad_norm": 11.990909576416016,
1304
+ "learning_rate": 2.9398148148148146e-05,
1305
+ "loss": 0.1132,
1306
+ "step": 1130
1307
+ },
1308
+ {
1309
+ "epoch": 46.96907216494845,
1310
+ "eval_accuracy": 0.8463768115942029,
1311
+ "eval_loss": 0.5908281207084656,
1312
+ "eval_precision": 0.8491182494977805,
1313
+ "eval_recall": 0.8463768115942029,
1314
+ "eval_runtime": 1.8291,
1315
+ "eval_samples_per_second": 188.615,
1316
+ "eval_steps_per_second": 6.014,
1317
+ "step": 1139
1318
+ },
1319
+ {
1320
+ "epoch": 47.01030927835052,
1321
+ "grad_norm": 7.466699600219727,
1322
+ "learning_rate": 2.916666666666667e-05,
1323
+ "loss": 0.1229,
1324
+ "step": 1140
1325
+ },
1326
+ {
1327
+ "epoch": 47.422680412371136,
1328
+ "grad_norm": 4.299150466918945,
1329
+ "learning_rate": 2.8935185185185186e-05,
1330
+ "loss": 0.1181,
1331
+ "step": 1150
1332
+ },
1333
+ {
1334
+ "epoch": 47.83505154639175,
1335
+ "grad_norm": 8.699248313903809,
1336
+ "learning_rate": 2.8703703703703706e-05,
1337
+ "loss": 0.1313,
1338
+ "step": 1160
1339
+ },
1340
+ {
1341
+ "epoch": 48.0,
1342
+ "eval_accuracy": 0.8521739130434782,
1343
+ "eval_loss": 0.5995594263076782,
1344
+ "eval_precision": 0.8581686976058893,
1345
+ "eval_recall": 0.8521739130434782,
1346
+ "eval_runtime": 1.7851,
1347
+ "eval_samples_per_second": 193.27,
1348
+ "eval_steps_per_second": 6.162,
1349
+ "step": 1164
1350
+ },
1351
+ {
1352
+ "epoch": 48.24742268041237,
1353
+ "grad_norm": 7.394286632537842,
1354
+ "learning_rate": 2.8472222222222223e-05,
1355
+ "loss": 0.1287,
1356
+ "step": 1170
1357
+ },
1358
+ {
1359
+ "epoch": 48.65979381443299,
1360
+ "grad_norm": 10.575745582580566,
1361
+ "learning_rate": 2.824074074074074e-05,
1362
+ "loss": 0.1312,
1363
+ "step": 1180
1364
+ },
1365
+ {
1366
+ "epoch": 48.98969072164948,
1367
+ "eval_accuracy": 0.8579710144927536,
1368
+ "eval_loss": 0.542959451675415,
1369
+ "eval_precision": 0.8607254186783246,
1370
+ "eval_recall": 0.8579710144927536,
1371
+ "eval_runtime": 1.7426,
1372
+ "eval_samples_per_second": 197.985,
1373
+ "eval_steps_per_second": 6.313,
1374
+ "step": 1188
1375
+ },
1376
+ {
1377
+ "epoch": 49.07216494845361,
1378
+ "grad_norm": 14.257989883422852,
1379
+ "learning_rate": 2.8009259259259263e-05,
1380
+ "loss": 0.1341,
1381
+ "step": 1190
1382
+ },
1383
+ {
1384
+ "epoch": 49.48453608247423,
1385
+ "grad_norm": 9.95071029663086,
1386
+ "learning_rate": 2.777777777777778e-05,
1387
+ "loss": 0.138,
1388
+ "step": 1200
1389
+ },
1390
+ {
1391
+ "epoch": 49.896907216494846,
1392
+ "grad_norm": 10.54672622680664,
1393
+ "learning_rate": 2.75462962962963e-05,
1394
+ "loss": 0.0996,
1395
+ "step": 1210
1396
+ },
1397
+ {
1398
+ "epoch": 49.97938144329897,
1399
+ "eval_accuracy": 0.8521739130434782,
1400
+ "eval_loss": 0.5776570439338684,
1401
+ "eval_precision": 0.8561151948364225,
1402
+ "eval_recall": 0.8521739130434782,
1403
+ "eval_runtime": 1.8283,
1404
+ "eval_samples_per_second": 188.7,
1405
+ "eval_steps_per_second": 6.017,
1406
+ "step": 1212
1407
+ },
1408
+ {
1409
+ "epoch": 50.30927835051546,
1410
+ "grad_norm": 9.269867897033691,
1411
+ "learning_rate": 2.7314814814814816e-05,
1412
+ "loss": 0.1183,
1413
+ "step": 1220
1414
+ },
1415
+ {
1416
+ "epoch": 50.72164948453608,
1417
+ "grad_norm": 3.963714361190796,
1418
+ "learning_rate": 2.7083333333333332e-05,
1419
+ "loss": 0.1389,
1420
+ "step": 1230
1421
+ },
1422
+ {
1423
+ "epoch": 50.96907216494845,
1424
+ "eval_accuracy": 0.8434782608695652,
1425
+ "eval_loss": 0.5757654905319214,
1426
+ "eval_precision": 0.8486477905744771,
1427
+ "eval_recall": 0.8434782608695652,
1428
+ "eval_runtime": 1.8064,
1429
+ "eval_samples_per_second": 190.984,
1430
+ "eval_steps_per_second": 6.089,
1431
+ "step": 1236
1432
+ },
1433
+ {
1434
+ "epoch": 51.134020618556704,
1435
+ "grad_norm": 24.62941551208496,
1436
+ "learning_rate": 2.6851851851851855e-05,
1437
+ "loss": 0.1188,
1438
+ "step": 1240
1439
+ },
1440
+ {
1441
+ "epoch": 51.54639175257732,
1442
+ "grad_norm": 14.212287902832031,
1443
+ "learning_rate": 2.6620370370370372e-05,
1444
+ "loss": 0.1257,
1445
+ "step": 1250
1446
+ },
1447
+ {
1448
+ "epoch": 51.95876288659794,
1449
+ "grad_norm": 10.230920791625977,
1450
+ "learning_rate": 2.6388888888888892e-05,
1451
+ "loss": 0.1079,
1452
+ "step": 1260
1453
+ },
1454
+ {
1455
+ "epoch": 52.0,
1456
+ "eval_accuracy": 0.8579710144927536,
1457
+ "eval_loss": 0.5540273785591125,
1458
+ "eval_precision": 0.8611434608590304,
1459
+ "eval_recall": 0.8579710144927536,
1460
+ "eval_runtime": 1.7965,
1461
+ "eval_samples_per_second": 192.043,
1462
+ "eval_steps_per_second": 6.123,
1463
+ "step": 1261
1464
+ },
1465
+ {
1466
+ "epoch": 52.371134020618555,
1467
+ "grad_norm": 12.681902885437012,
1468
+ "learning_rate": 2.615740740740741e-05,
1469
+ "loss": 0.0964,
1470
+ "step": 1270
1471
+ },
1472
+ {
1473
+ "epoch": 52.78350515463917,
1474
+ "grad_norm": 14.907917022705078,
1475
+ "learning_rate": 2.5925925925925925e-05,
1476
+ "loss": 0.0972,
1477
+ "step": 1280
1478
+ },
1479
+ {
1480
+ "epoch": 52.98969072164948,
1481
+ "eval_accuracy": 0.855072463768116,
1482
+ "eval_loss": 0.5599762797355652,
1483
+ "eval_precision": 0.8559313253403165,
1484
+ "eval_recall": 0.855072463768116,
1485
+ "eval_runtime": 1.8665,
1486
+ "eval_samples_per_second": 184.836,
1487
+ "eval_steps_per_second": 5.893,
1488
+ "step": 1285
1489
+ },
1490
+ {
1491
+ "epoch": 53.1958762886598,
1492
+ "grad_norm": 13.571532249450684,
1493
+ "learning_rate": 2.5694444444444445e-05,
1494
+ "loss": 0.1164,
1495
+ "step": 1290
1496
+ },
1497
+ {
1498
+ "epoch": 53.608247422680414,
1499
+ "grad_norm": 14.119112014770508,
1500
+ "learning_rate": 2.5462962962962965e-05,
1501
+ "loss": 0.0985,
1502
+ "step": 1300
1503
+ },
1504
+ {
1505
+ "epoch": 53.97938144329897,
1506
+ "eval_accuracy": 0.863768115942029,
1507
+ "eval_loss": 0.5391947627067566,
1508
+ "eval_precision": 0.865555829019492,
1509
+ "eval_recall": 0.863768115942029,
1510
+ "eval_runtime": 1.8914,
1511
+ "eval_samples_per_second": 182.408,
1512
+ "eval_steps_per_second": 5.816,
1513
+ "step": 1309
1514
+ },
1515
+ {
1516
+ "epoch": 54.02061855670103,
1517
+ "grad_norm": 11.18630599975586,
1518
+ "learning_rate": 2.5231481481481485e-05,
1519
+ "loss": 0.1139,
1520
+ "step": 1310
1521
+ },
1522
+ {
1523
+ "epoch": 54.43298969072165,
1524
+ "grad_norm": 14.511212348937988,
1525
+ "learning_rate": 2.5e-05,
1526
+ "loss": 0.1117,
1527
+ "step": 1320
1528
+ },
1529
+ {
1530
+ "epoch": 54.845360824742265,
1531
+ "grad_norm": 4.760071277618408,
1532
+ "learning_rate": 2.4768518518518518e-05,
1533
+ "loss": 0.1112,
1534
+ "step": 1330
1535
+ },
1536
+ {
1537
+ "epoch": 54.96907216494845,
1538
+ "eval_accuracy": 0.863768115942029,
1539
+ "eval_loss": 0.5410789847373962,
1540
+ "eval_precision": 0.8655836794521399,
1541
+ "eval_recall": 0.863768115942029,
1542
+ "eval_runtime": 1.8766,
1543
+ "eval_samples_per_second": 183.845,
1544
+ "eval_steps_per_second": 5.862,
1545
+ "step": 1333
1546
+ },
1547
+ {
1548
+ "epoch": 55.25773195876289,
1549
+ "grad_norm": 8.37569808959961,
1550
+ "learning_rate": 2.4537037037037038e-05,
1551
+ "loss": 0.1062,
1552
+ "step": 1340
1553
+ },
1554
+ {
1555
+ "epoch": 55.670103092783506,
1556
+ "grad_norm": 10.700220108032227,
1557
+ "learning_rate": 2.4305555555555558e-05,
1558
+ "loss": 0.1308,
1559
+ "step": 1350
1560
+ },
1561
+ {
1562
+ "epoch": 56.0,
1563
+ "eval_accuracy": 0.863768115942029,
1564
+ "eval_loss": 0.5445396900177002,
1565
+ "eval_precision": 0.8653666576853845,
1566
+ "eval_recall": 0.863768115942029,
1567
+ "eval_runtime": 1.8208,
1568
+ "eval_samples_per_second": 189.479,
1569
+ "eval_steps_per_second": 6.041,
1570
+ "step": 1358
1571
+ },
1572
+ {
1573
+ "epoch": 56.08247422680412,
1574
+ "grad_norm": 19.0463924407959,
1575
+ "learning_rate": 2.4074074074074074e-05,
1576
+ "loss": 0.1081,
1577
+ "step": 1360
1578
+ },
1579
+ {
1580
+ "epoch": 56.49484536082474,
1581
+ "grad_norm": 6.819794654846191,
1582
+ "learning_rate": 2.3842592592592594e-05,
1583
+ "loss": 0.1072,
1584
+ "step": 1370
1585
+ },
1586
+ {
1587
+ "epoch": 56.90721649484536,
1588
+ "grad_norm": 6.308873176574707,
1589
+ "learning_rate": 2.361111111111111e-05,
1590
+ "loss": 0.1005,
1591
+ "step": 1380
1592
+ },
1593
+ {
1594
+ "epoch": 56.98969072164948,
1595
+ "eval_accuracy": 0.855072463768116,
1596
+ "eval_loss": 0.5554308891296387,
1597
+ "eval_precision": 0.8551462662985753,
1598
+ "eval_recall": 0.855072463768116,
1599
+ "eval_runtime": 1.868,
1600
+ "eval_samples_per_second": 184.69,
1601
+ "eval_steps_per_second": 5.889,
1602
+ "step": 1382
1603
+ },
1604
+ {
1605
+ "epoch": 57.31958762886598,
1606
+ "grad_norm": 5.025654315948486,
1607
+ "learning_rate": 2.337962962962963e-05,
1608
+ "loss": 0.088,
1609
+ "step": 1390
1610
+ },
1611
+ {
1612
+ "epoch": 57.7319587628866,
1613
+ "grad_norm": 10.021939277648926,
1614
+ "learning_rate": 2.314814814814815e-05,
1615
+ "loss": 0.0871,
1616
+ "step": 1400
1617
+ },
1618
+ {
1619
+ "epoch": 57.97938144329897,
1620
+ "eval_accuracy": 0.8405797101449275,
1621
+ "eval_loss": 0.5966009497642517,
1622
+ "eval_precision": 0.8440749450064067,
1623
+ "eval_recall": 0.8405797101449275,
1624
+ "eval_runtime": 1.7974,
1625
+ "eval_samples_per_second": 191.939,
1626
+ "eval_steps_per_second": 6.12,
1627
+ "step": 1406
1628
+ },
1629
+ {
1630
+ "epoch": 58.144329896907216,
1631
+ "grad_norm": 16.077518463134766,
1632
+ "learning_rate": 2.2916666666666667e-05,
1633
+ "loss": 0.089,
1634
+ "step": 1410
1635
+ },
1636
+ {
1637
+ "epoch": 58.55670103092783,
1638
+ "grad_norm": 14.556241035461426,
1639
+ "learning_rate": 2.2685185185185187e-05,
1640
+ "loss": 0.1072,
1641
+ "step": 1420
1642
+ },
1643
+ {
1644
+ "epoch": 58.96907216494845,
1645
+ "grad_norm": 9.045204162597656,
1646
+ "learning_rate": 2.2453703703703703e-05,
1647
+ "loss": 0.1102,
1648
+ "step": 1430
1649
+ },
1650
+ {
1651
+ "epoch": 58.96907216494845,
1652
+ "eval_accuracy": 0.8521739130434782,
1653
+ "eval_loss": 0.5807223916053772,
1654
+ "eval_precision": 0.8543040805400182,
1655
+ "eval_recall": 0.8521739130434782,
1656
+ "eval_runtime": 1.8412,
1657
+ "eval_samples_per_second": 187.376,
1658
+ "eval_steps_per_second": 5.974,
1659
+ "step": 1430
1660
+ },
1661
+ {
1662
+ "epoch": 59.381443298969074,
1663
+ "grad_norm": 12.29312515258789,
1664
+ "learning_rate": 2.2222222222222223e-05,
1665
+ "loss": 0.1021,
1666
+ "step": 1440
1667
+ },
1668
+ {
1669
+ "epoch": 59.79381443298969,
1670
+ "grad_norm": 13.808602333068848,
1671
+ "learning_rate": 2.1990740740740743e-05,
1672
+ "loss": 0.1028,
1673
+ "step": 1450
1674
+ },
1675
+ {
1676
+ "epoch": 60.0,
1677
+ "eval_accuracy": 0.8434782608695652,
1678
+ "eval_loss": 0.5653913021087646,
1679
+ "eval_precision": 0.8490636359945823,
1680
+ "eval_recall": 0.8434782608695652,
1681
+ "eval_runtime": 1.8195,
1682
+ "eval_samples_per_second": 189.615,
1683
+ "eval_steps_per_second": 6.046,
1684
+ "step": 1455
1685
+ },
1686
+ {
1687
+ "epoch": 60.20618556701031,
1688
+ "grad_norm": 8.929511070251465,
1689
+ "learning_rate": 2.175925925925926e-05,
1690
+ "loss": 0.1103,
1691
+ "step": 1460
1692
+ },
1693
+ {
1694
+ "epoch": 60.618556701030926,
1695
+ "grad_norm": 14.425239562988281,
1696
+ "learning_rate": 2.152777777777778e-05,
1697
+ "loss": 0.107,
1698
+ "step": 1470
1699
+ },
1700
+ {
1701
+ "epoch": 60.98969072164948,
1702
+ "eval_accuracy": 0.8434782608695652,
1703
+ "eval_loss": 0.577854573726654,
1704
+ "eval_precision": 0.8460752319344831,
1705
+ "eval_recall": 0.8434782608695652,
1706
+ "eval_runtime": 1.8265,
1707
+ "eval_samples_per_second": 188.883,
1708
+ "eval_steps_per_second": 6.022,
1709
+ "step": 1479
1710
+ },
1711
+ {
1712
+ "epoch": 61.03092783505155,
1713
+ "grad_norm": 10.870781898498535,
1714
+ "learning_rate": 2.1296296296296296e-05,
1715
+ "loss": 0.0954,
1716
+ "step": 1480
1717
+ },
1718
+ {
1719
+ "epoch": 61.44329896907217,
1720
+ "grad_norm": 10.188617706298828,
1721
+ "learning_rate": 2.1064814814814816e-05,
1722
+ "loss": 0.0942,
1723
+ "step": 1490
1724
+ },
1725
+ {
1726
+ "epoch": 61.855670103092784,
1727
+ "grad_norm": 6.4580302238464355,
1728
+ "learning_rate": 2.0833333333333336e-05,
1729
+ "loss": 0.0848,
1730
+ "step": 1500
1731
+ },
1732
+ {
1733
+ "epoch": 61.97938144329897,
1734
+ "eval_accuracy": 0.855072463768116,
1735
+ "eval_loss": 0.5842954516410828,
1736
+ "eval_precision": 0.8569219850916401,
1737
+ "eval_recall": 0.855072463768116,
1738
+ "eval_runtime": 1.8368,
1739
+ "eval_samples_per_second": 187.828,
1740
+ "eval_steps_per_second": 5.989,
1741
+ "step": 1503
1742
+ },
1743
+ {
1744
+ "epoch": 62.2680412371134,
1745
+ "grad_norm": 13.236536979675293,
1746
+ "learning_rate": 2.0601851851851853e-05,
1747
+ "loss": 0.0993,
1748
+ "step": 1510
1749
+ },
1750
+ {
1751
+ "epoch": 62.68041237113402,
1752
+ "grad_norm": 11.377030372619629,
1753
+ "learning_rate": 2.037037037037037e-05,
1754
+ "loss": 0.0976,
1755
+ "step": 1520
1756
+ },
1757
+ {
1758
+ "epoch": 62.96907216494845,
1759
+ "eval_accuracy": 0.8434782608695652,
1760
+ "eval_loss": 0.6161760687828064,
1761
+ "eval_precision": 0.8454310204706964,
1762
+ "eval_recall": 0.8434782608695652,
1763
+ "eval_runtime": 1.7609,
1764
+ "eval_samples_per_second": 195.923,
1765
+ "eval_steps_per_second": 6.247,
1766
+ "step": 1527
1767
+ },
1768
+ {
1769
+ "epoch": 63.09278350515464,
1770
+ "grad_norm": 9.68355655670166,
1771
+ "learning_rate": 2.013888888888889e-05,
1772
+ "loss": 0.0788,
1773
+ "step": 1530
1774
+ },
1775
+ {
1776
+ "epoch": 63.50515463917526,
1777
+ "grad_norm": 6.282276153564453,
1778
+ "learning_rate": 1.990740740740741e-05,
1779
+ "loss": 0.103,
1780
+ "step": 1540
1781
+ },
1782
+ {
1783
+ "epoch": 63.91752577319588,
1784
+ "grad_norm": 4.893520832061768,
1785
+ "learning_rate": 1.967592592592593e-05,
1786
+ "loss": 0.0977,
1787
+ "step": 1550
1788
+ },
1789
+ {
1790
+ "epoch": 64.0,
1791
+ "eval_accuracy": 0.8463768115942029,
1792
+ "eval_loss": 0.5822046995162964,
1793
+ "eval_precision": 0.8468574730482583,
1794
+ "eval_recall": 0.8463768115942029,
1795
+ "eval_runtime": 1.8068,
1796
+ "eval_samples_per_second": 190.942,
1797
+ "eval_steps_per_second": 6.088,
1798
+ "step": 1552
1799
+ },
1800
+ {
1801
+ "epoch": 64.3298969072165,
1802
+ "grad_norm": 10.216239929199219,
1803
+ "learning_rate": 1.9444444444444445e-05,
1804
+ "loss": 0.1112,
1805
+ "step": 1560
1806
+ },
1807
+ {
1808
+ "epoch": 64.74226804123711,
1809
+ "grad_norm": 22.551631927490234,
1810
+ "learning_rate": 1.9212962962962962e-05,
1811
+ "loss": 0.1256,
1812
+ "step": 1570
1813
+ },
1814
+ {
1815
+ "epoch": 64.98969072164948,
1816
+ "eval_accuracy": 0.8492753623188406,
1817
+ "eval_loss": 0.575657308101654,
1818
+ "eval_precision": 0.851359361697526,
1819
+ "eval_recall": 0.8492753623188406,
1820
+ "eval_runtime": 1.8317,
1821
+ "eval_samples_per_second": 188.346,
1822
+ "eval_steps_per_second": 6.005,
1823
+ "step": 1576
1824
+ },
1825
+ {
1826
+ "epoch": 65.15463917525773,
1827
+ "grad_norm": 6.853829383850098,
1828
+ "learning_rate": 1.8981481481481482e-05,
1829
+ "loss": 0.096,
1830
+ "step": 1580
1831
+ },
1832
+ {
1833
+ "epoch": 65.56701030927834,
1834
+ "grad_norm": 14.361750602722168,
1835
+ "learning_rate": 1.8750000000000002e-05,
1836
+ "loss": 0.0942,
1837
+ "step": 1590
1838
+ },
1839
+ {
1840
+ "epoch": 65.97938144329896,
1841
+ "grad_norm": 9.966873168945312,
1842
+ "learning_rate": 1.8518518518518518e-05,
1843
+ "loss": 0.0883,
1844
+ "step": 1600
1845
+ },
1846
+ {
1847
+ "epoch": 65.97938144329896,
1848
+ "eval_accuracy": 0.8463768115942029,
1849
+ "eval_loss": 0.5716322660446167,
1850
+ "eval_precision": 0.8466640969128532,
1851
+ "eval_recall": 0.8463768115942029,
1852
+ "eval_runtime": 1.7836,
1853
+ "eval_samples_per_second": 193.433,
1854
+ "eval_steps_per_second": 6.167,
1855
+ "step": 1600
1856
+ },
1857
+ {
1858
+ "epoch": 66.3917525773196,
1859
+ "grad_norm": 9.780498504638672,
1860
+ "learning_rate": 1.8287037037037038e-05,
1861
+ "loss": 0.0791,
1862
+ "step": 1610
1863
+ },
1864
+ {
1865
+ "epoch": 66.80412371134021,
1866
+ "grad_norm": 10.076851844787598,
1867
+ "learning_rate": 1.8055555555555555e-05,
1868
+ "loss": 0.0808,
1869
+ "step": 1620
1870
+ },
1871
+ {
1872
+ "epoch": 66.96907216494846,
1873
+ "eval_accuracy": 0.855072463768116,
1874
+ "eval_loss": 0.5726441144943237,
1875
+ "eval_precision": 0.8562372477793413,
1876
+ "eval_recall": 0.855072463768116,
1877
+ "eval_runtime": 1.782,
1878
+ "eval_samples_per_second": 193.608,
1879
+ "eval_steps_per_second": 6.173,
1880
+ "step": 1624
1881
+ },
1882
+ {
1883
+ "epoch": 67.21649484536083,
1884
+ "grad_norm": 10.814988136291504,
1885
+ "learning_rate": 1.7824074074074075e-05,
1886
+ "loss": 0.0604,
1887
+ "step": 1630
1888
+ },
1889
+ {
1890
+ "epoch": 67.62886597938144,
1891
+ "grad_norm": 14.779629707336426,
1892
+ "learning_rate": 1.7592592592592595e-05,
1893
+ "loss": 0.1034,
1894
+ "step": 1640
1895
+ },
1896
+ {
1897
+ "epoch": 68.0,
1898
+ "eval_accuracy": 0.855072463768116,
1899
+ "eval_loss": 0.5412786602973938,
1900
+ "eval_precision": 0.8548742107305042,
1901
+ "eval_recall": 0.855072463768116,
1902
+ "eval_runtime": 1.8607,
1903
+ "eval_samples_per_second": 185.418,
1904
+ "eval_steps_per_second": 5.912,
1905
+ "step": 1649
1906
+ },
1907
+ {
1908
+ "epoch": 68.04123711340206,
1909
+ "grad_norm": 7.925902843475342,
1910
+ "learning_rate": 1.736111111111111e-05,
1911
+ "loss": 0.098,
1912
+ "step": 1650
1913
+ },
1914
+ {
1915
+ "epoch": 68.45360824742268,
1916
+ "grad_norm": 8.179915428161621,
1917
+ "learning_rate": 1.712962962962963e-05,
1918
+ "loss": 0.0871,
1919
+ "step": 1660
1920
+ },
1921
+ {
1922
+ "epoch": 68.8659793814433,
1923
+ "grad_norm": 8.375000953674316,
1924
+ "learning_rate": 1.6898148148148148e-05,
1925
+ "loss": 0.0845,
1926
+ "step": 1670
1927
+ },
1928
+ {
1929
+ "epoch": 68.98969072164948,
1930
+ "eval_accuracy": 0.8434782608695652,
1931
+ "eval_loss": 0.5826108455657959,
1932
+ "eval_precision": 0.8476663926581475,
1933
+ "eval_recall": 0.8434782608695652,
1934
+ "eval_runtime": 1.8967,
1935
+ "eval_samples_per_second": 181.896,
1936
+ "eval_steps_per_second": 5.8,
1937
+ "step": 1673
1938
+ },
1939
+ {
1940
+ "epoch": 69.27835051546391,
1941
+ "grad_norm": 8.613913536071777,
1942
+ "learning_rate": 1.6666666666666667e-05,
1943
+ "loss": 0.0911,
1944
+ "step": 1680
1945
+ },
1946
+ {
1947
+ "epoch": 69.69072164948453,
1948
+ "grad_norm": 9.535558700561523,
1949
+ "learning_rate": 1.6435185185185187e-05,
1950
+ "loss": 0.0916,
1951
+ "step": 1690
1952
+ },
1953
+ {
1954
+ "epoch": 69.97938144329896,
1955
+ "eval_accuracy": 0.8521739130434782,
1956
+ "eval_loss": 0.566058337688446,
1957
+ "eval_precision": 0.8522049189345976,
1958
+ "eval_recall": 0.8521739130434782,
1959
+ "eval_runtime": 1.7731,
1960
+ "eval_samples_per_second": 194.574,
1961
+ "eval_steps_per_second": 6.204,
1962
+ "step": 1697
1963
+ },
1964
+ {
1965
+ "epoch": 70.10309278350516,
1966
+ "grad_norm": 7.769627571105957,
1967
+ "learning_rate": 1.6203703703703704e-05,
1968
+ "loss": 0.1011,
1969
+ "step": 1700
1970
+ },
1971
+ {
1972
+ "epoch": 70.51546391752578,
1973
+ "grad_norm": 9.350245475769043,
1974
+ "learning_rate": 1.597222222222222e-05,
1975
+ "loss": 0.0896,
1976
+ "step": 1710
1977
+ },
1978
+ {
1979
+ "epoch": 70.9278350515464,
1980
+ "grad_norm": 11.536579132080078,
1981
+ "learning_rate": 1.574074074074074e-05,
1982
+ "loss": 0.0912,
1983
+ "step": 1720
1984
+ },
1985
+ {
1986
+ "epoch": 70.96907216494846,
1987
+ "eval_accuracy": 0.8492753623188406,
1988
+ "eval_loss": 0.5770707130432129,
1989
+ "eval_precision": 0.84979303172866,
1990
+ "eval_recall": 0.8492753623188406,
1991
+ "eval_runtime": 1.817,
1992
+ "eval_samples_per_second": 189.875,
1993
+ "eval_steps_per_second": 6.054,
1994
+ "step": 1721
1995
+ },
1996
+ {
1997
+ "epoch": 71.34020618556701,
1998
+ "grad_norm": 15.122323989868164,
1999
+ "learning_rate": 1.550925925925926e-05,
2000
+ "loss": 0.0995,
2001
+ "step": 1730
2002
+ },
2003
+ {
2004
+ "epoch": 71.75257731958763,
2005
+ "grad_norm": 12.938358306884766,
2006
+ "learning_rate": 1.527777777777778e-05,
2007
+ "loss": 0.0863,
2008
+ "step": 1740
2009
+ },
2010
+ {
2011
+ "epoch": 72.0,
2012
+ "eval_accuracy": 0.855072463768116,
2013
+ "eval_loss": 0.5769326686859131,
2014
+ "eval_precision": 0.8550354692908756,
2015
+ "eval_recall": 0.855072463768116,
2016
+ "eval_runtime": 1.8313,
2017
+ "eval_samples_per_second": 188.386,
2018
+ "eval_steps_per_second": 6.007,
2019
+ "step": 1746
2020
+ },
2021
+ {
2022
+ "epoch": 72.16494845360825,
2023
+ "grad_norm": 6.935812950134277,
2024
+ "learning_rate": 1.5046296296296297e-05,
2025
+ "loss": 0.0731,
2026
+ "step": 1750
2027
+ },
2028
+ {
2029
+ "epoch": 72.57731958762886,
2030
+ "grad_norm": 10.120232582092285,
2031
+ "learning_rate": 1.4814814814814815e-05,
2032
+ "loss": 0.1101,
2033
+ "step": 1760
2034
+ },
2035
+ {
2036
+ "epoch": 72.98969072164948,
2037
+ "grad_norm": 5.746927738189697,
2038
+ "learning_rate": 1.4583333333333335e-05,
2039
+ "loss": 0.083,
2040
+ "step": 1770
2041
+ },
2042
+ {
2043
+ "epoch": 72.98969072164948,
2044
+ "eval_accuracy": 0.8492753623188406,
2045
+ "eval_loss": 0.5860167145729065,
2046
+ "eval_precision": 0.8486187988428825,
2047
+ "eval_recall": 0.8492753623188406,
2048
+ "eval_runtime": 1.8602,
2049
+ "eval_samples_per_second": 185.466,
2050
+ "eval_steps_per_second": 5.913,
2051
+ "step": 1770
2052
+ },
2053
+ {
2054
+ "epoch": 73.4020618556701,
2055
+ "grad_norm": 14.205853462219238,
2056
+ "learning_rate": 1.4351851851851853e-05,
2057
+ "loss": 0.1003,
2058
+ "step": 1780
2059
+ },
2060
+ {
2061
+ "epoch": 73.81443298969072,
2062
+ "grad_norm": 6.671767711639404,
2063
+ "learning_rate": 1.412037037037037e-05,
2064
+ "loss": 0.0839,
2065
+ "step": 1790
2066
+ },
2067
+ {
2068
+ "epoch": 73.97938144329896,
2069
+ "eval_accuracy": 0.855072463768116,
2070
+ "eval_loss": 0.5647125244140625,
2071
+ "eval_precision": 0.8550673486786019,
2072
+ "eval_recall": 0.855072463768116,
2073
+ "eval_runtime": 1.843,
2074
+ "eval_samples_per_second": 187.195,
2075
+ "eval_steps_per_second": 5.969,
2076
+ "step": 1794
2077
+ },
2078
+ {
2079
+ "epoch": 74.22680412371135,
2080
+ "grad_norm": 6.19529914855957,
2081
+ "learning_rate": 1.388888888888889e-05,
2082
+ "loss": 0.0798,
2083
+ "step": 1800
2084
+ },
2085
+ {
2086
+ "epoch": 74.63917525773196,
2087
+ "grad_norm": 13.039739608764648,
2088
+ "learning_rate": 1.3657407407407408e-05,
2089
+ "loss": 0.0903,
2090
+ "step": 1810
2091
+ },
2092
+ {
2093
+ "epoch": 74.96907216494846,
2094
+ "eval_accuracy": 0.855072463768116,
2095
+ "eval_loss": 0.601210355758667,
2096
+ "eval_precision": 0.8534831427546733,
2097
+ "eval_recall": 0.855072463768116,
2098
+ "eval_runtime": 1.7476,
2099
+ "eval_samples_per_second": 197.417,
2100
+ "eval_steps_per_second": 6.294,
2101
+ "step": 1818
2102
+ },
2103
+ {
2104
+ "epoch": 75.05154639175258,
2105
+ "grad_norm": 6.386416435241699,
2106
+ "learning_rate": 1.3425925925925928e-05,
2107
+ "loss": 0.0872,
2108
+ "step": 1820
2109
+ },
2110
+ {
2111
+ "epoch": 75.4639175257732,
2112
+ "grad_norm": 7.484694957733154,
2113
+ "learning_rate": 1.3194444444444446e-05,
2114
+ "loss": 0.0751,
2115
+ "step": 1830
2116
+ },
2117
+ {
2118
+ "epoch": 75.87628865979381,
2119
+ "grad_norm": 10.781839370727539,
2120
+ "learning_rate": 1.2962962962962962e-05,
2121
+ "loss": 0.074,
2122
+ "step": 1840
2123
+ },
2124
+ {
2125
+ "epoch": 76.0,
2126
+ "eval_accuracy": 0.8463768115942029,
2127
+ "eval_loss": 0.6048101186752319,
2128
+ "eval_precision": 0.8461499789126601,
2129
+ "eval_recall": 0.8463768115942029,
2130
+ "eval_runtime": 1.7696,
2131
+ "eval_samples_per_second": 194.962,
2132
+ "eval_steps_per_second": 6.216,
2133
+ "step": 1843
2134
+ },
2135
+ {
2136
+ "epoch": 76.28865979381443,
2137
+ "grad_norm": 17.32390022277832,
2138
+ "learning_rate": 1.2731481481481482e-05,
2139
+ "loss": 0.0943,
2140
+ "step": 1850
2141
+ },
2142
+ {
2143
+ "epoch": 76.70103092783505,
2144
+ "grad_norm": 12.162288665771484,
2145
+ "learning_rate": 1.25e-05,
2146
+ "loss": 0.0907,
2147
+ "step": 1860
2148
+ },
2149
+ {
2150
+ "epoch": 76.98969072164948,
2151
+ "eval_accuracy": 0.8492753623188406,
2152
+ "eval_loss": 0.5806660056114197,
2153
+ "eval_precision": 0.8495330403324792,
2154
+ "eval_recall": 0.8492753623188406,
2155
+ "eval_runtime": 1.7482,
2156
+ "eval_samples_per_second": 197.35,
2157
+ "eval_steps_per_second": 6.292,
2158
+ "step": 1867
2159
+ },
2160
+ {
2161
+ "epoch": 77.11340206185567,
2162
+ "grad_norm": 6.960859298706055,
2163
+ "learning_rate": 1.2268518518518519e-05,
2164
+ "loss": 0.0748,
2165
+ "step": 1870
2166
+ },
2167
+ {
2168
+ "epoch": 77.52577319587628,
2169
+ "grad_norm": 14.269356727600098,
2170
+ "learning_rate": 1.2037037037037037e-05,
2171
+ "loss": 0.0781,
2172
+ "step": 1880
2173
+ },
2174
+ {
2175
+ "epoch": 77.9381443298969,
2176
+ "grad_norm": 6.466542720794678,
2177
+ "learning_rate": 1.1805555555555555e-05,
2178
+ "loss": 0.0613,
2179
+ "step": 1890
2180
+ },
2181
+ {
2182
+ "epoch": 77.97938144329896,
2183
+ "eval_accuracy": 0.8376811594202899,
2184
+ "eval_loss": 0.5774852633476257,
2185
+ "eval_precision": 0.8381818122940702,
2186
+ "eval_recall": 0.8376811594202899,
2187
+ "eval_runtime": 1.7656,
2188
+ "eval_samples_per_second": 195.404,
2189
+ "eval_steps_per_second": 6.23,
2190
+ "step": 1891
2191
+ },
2192
+ {
2193
+ "epoch": 78.35051546391753,
2194
+ "grad_norm": 16.949039459228516,
2195
+ "learning_rate": 1.1574074074074075e-05,
2196
+ "loss": 0.0783,
2197
+ "step": 1900
2198
+ },
2199
+ {
2200
+ "epoch": 78.76288659793815,
2201
+ "grad_norm": 5.50955057144165,
2202
+ "learning_rate": 1.1342592592592593e-05,
2203
+ "loss": 0.0964,
2204
+ "step": 1910
2205
+ },
2206
+ {
2207
+ "epoch": 78.96907216494846,
2208
+ "eval_accuracy": 0.8666666666666667,
2209
+ "eval_loss": 0.5758916735649109,
2210
+ "eval_precision": 0.8675733846947259,
2211
+ "eval_recall": 0.8666666666666667,
2212
+ "eval_runtime": 1.7818,
2213
+ "eval_samples_per_second": 193.62,
2214
+ "eval_steps_per_second": 6.173,
2215
+ "step": 1915
2216
+ },
2217
+ {
2218
+ "epoch": 79.17525773195877,
2219
+ "grad_norm": 7.778840065002441,
2220
+ "learning_rate": 1.1111111111111112e-05,
2221
+ "loss": 0.0775,
2222
+ "step": 1920
2223
+ },
2224
+ {
2225
+ "epoch": 79.58762886597938,
2226
+ "grad_norm": 10.63167667388916,
2227
+ "learning_rate": 1.087962962962963e-05,
2228
+ "loss": 0.0849,
2229
+ "step": 1930
2230
+ },
2231
+ {
2232
+ "epoch": 80.0,
2233
+ "grad_norm": 10.529654502868652,
2234
+ "learning_rate": 1.0648148148148148e-05,
2235
+ "loss": 0.0735,
2236
+ "step": 1940
2237
+ },
2238
+ {
2239
+ "epoch": 80.0,
2240
+ "eval_accuracy": 0.855072463768116,
2241
+ "eval_loss": 0.5961835384368896,
2242
+ "eval_precision": 0.8565539653910103,
2243
+ "eval_recall": 0.855072463768116,
2244
+ "eval_runtime": 1.7657,
2245
+ "eval_samples_per_second": 195.391,
2246
+ "eval_steps_per_second": 6.23,
2247
+ "step": 1940
2248
+ },
2249
+ {
2250
+ "epoch": 80.41237113402062,
2251
+ "grad_norm": 10.91960334777832,
2252
+ "learning_rate": 1.0416666666666668e-05,
2253
+ "loss": 0.0803,
2254
+ "step": 1950
2255
+ },
2256
+ {
2257
+ "epoch": 80.82474226804123,
2258
+ "grad_norm": 6.953213691711426,
2259
+ "learning_rate": 1.0185185185185185e-05,
2260
+ "loss": 0.0663,
2261
+ "step": 1960
2262
+ },
2263
+ {
2264
+ "epoch": 80.98969072164948,
2265
+ "eval_accuracy": 0.8434782608695652,
2266
+ "eval_loss": 0.5768997669219971,
2267
+ "eval_precision": 0.8441240738989768,
2268
+ "eval_recall": 0.8434782608695652,
2269
+ "eval_runtime": 1.8615,
2270
+ "eval_samples_per_second": 185.334,
2271
+ "eval_steps_per_second": 5.909,
2272
+ "step": 1964
2273
+ },
2274
+ {
2275
+ "epoch": 81.23711340206185,
2276
+ "grad_norm": 14.6912841796875,
2277
+ "learning_rate": 9.953703703703704e-06,
2278
+ "loss": 0.0756,
2279
+ "step": 1970
2280
+ },
2281
+ {
2282
+ "epoch": 81.64948453608247,
2283
+ "grad_norm": 11.421167373657227,
2284
+ "learning_rate": 9.722222222222223e-06,
2285
+ "loss": 0.0719,
2286
+ "step": 1980
2287
+ },
2288
+ {
2289
+ "epoch": 81.97938144329896,
2290
+ "eval_accuracy": 0.8492753623188406,
2291
+ "eval_loss": 0.5826414823532104,
2292
+ "eval_precision": 0.8506964547245877,
2293
+ "eval_recall": 0.8492753623188406,
2294
+ "eval_runtime": 1.8427,
2295
+ "eval_samples_per_second": 187.221,
2296
+ "eval_steps_per_second": 5.969,
2297
+ "step": 1988
2298
+ },
2299
+ {
2300
+ "epoch": 82.0618556701031,
2301
+ "grad_norm": 16.955421447753906,
2302
+ "learning_rate": 9.490740740740741e-06,
2303
+ "loss": 0.0756,
2304
+ "step": 1990
2305
+ },
2306
+ {
2307
+ "epoch": 82.47422680412372,
2308
+ "grad_norm": 13.900518417358398,
2309
+ "learning_rate": 9.259259259259259e-06,
2310
+ "loss": 0.0683,
2311
+ "step": 2000
2312
+ },
2313
+ {
2314
+ "epoch": 82.88659793814433,
2315
+ "grad_norm": 9.04283618927002,
2316
+ "learning_rate": 9.027777777777777e-06,
2317
+ "loss": 0.0718,
2318
+ "step": 2010
2319
+ },
2320
+ {
2321
+ "epoch": 82.96907216494846,
2322
+ "eval_accuracy": 0.8579710144927536,
2323
+ "eval_loss": 0.5879714488983154,
2324
+ "eval_precision": 0.8590052571684228,
2325
+ "eval_recall": 0.8579710144927536,
2326
+ "eval_runtime": 1.7802,
2327
+ "eval_samples_per_second": 193.802,
2328
+ "eval_steps_per_second": 6.179,
2329
+ "step": 2012
2330
+ },
2331
+ {
2332
+ "epoch": 83.29896907216495,
2333
+ "grad_norm": 8.817221641540527,
2334
+ "learning_rate": 8.796296296296297e-06,
2335
+ "loss": 0.0699,
2336
+ "step": 2020
2337
+ },
2338
+ {
2339
+ "epoch": 83.71134020618557,
2340
+ "grad_norm": 9.379308700561523,
2341
+ "learning_rate": 8.564814814814816e-06,
2342
+ "loss": 0.0925,
2343
+ "step": 2030
2344
+ },
2345
+ {
2346
+ "epoch": 84.0,
2347
+ "eval_accuracy": 0.8492753623188406,
2348
+ "eval_loss": 0.5986330509185791,
2349
+ "eval_precision": 0.8512692229678578,
2350
+ "eval_recall": 0.8492753623188406,
2351
+ "eval_runtime": 1.7681,
2352
+ "eval_samples_per_second": 195.129,
2353
+ "eval_steps_per_second": 6.221,
2354
+ "step": 2037
2355
+ },
2356
+ {
2357
+ "epoch": 84.12371134020619,
2358
+ "grad_norm": 8.215590476989746,
2359
+ "learning_rate": 8.333333333333334e-06,
2360
+ "loss": 0.0617,
2361
+ "step": 2040
2362
+ },
2363
+ {
2364
+ "epoch": 84.5360824742268,
2365
+ "grad_norm": 5.024844169616699,
2366
+ "learning_rate": 8.101851851851852e-06,
2367
+ "loss": 0.0729,
2368
+ "step": 2050
2369
+ },
2370
+ {
2371
+ "epoch": 84.94845360824742,
2372
+ "grad_norm": 9.782211303710938,
2373
+ "learning_rate": 7.87037037037037e-06,
2374
+ "loss": 0.0621,
2375
+ "step": 2060
2376
+ },
2377
+ {
2378
+ "epoch": 84.98969072164948,
2379
+ "eval_accuracy": 0.8492753623188406,
2380
+ "eval_loss": 0.5914923548698425,
2381
+ "eval_precision": 0.8496762597563219,
2382
+ "eval_recall": 0.8492753623188406,
2383
+ "eval_runtime": 1.7614,
2384
+ "eval_samples_per_second": 195.868,
2385
+ "eval_steps_per_second": 6.245,
2386
+ "step": 2061
2387
+ },
2388
+ {
2389
+ "epoch": 85.36082474226804,
2390
+ "grad_norm": 7.3921942710876465,
2391
+ "learning_rate": 7.63888888888889e-06,
2392
+ "loss": 0.0621,
2393
+ "step": 2070
2394
+ },
2395
+ {
2396
+ "epoch": 85.77319587628865,
2397
+ "grad_norm": 10.206525802612305,
2398
+ "learning_rate": 7.4074074074074075e-06,
2399
+ "loss": 0.059,
2400
+ "step": 2080
2401
+ },
2402
+ {
2403
+ "epoch": 85.97938144329896,
2404
+ "eval_accuracy": 0.8579710144927536,
2405
+ "eval_loss": 0.577899694442749,
2406
+ "eval_precision": 0.8577329472646936,
2407
+ "eval_recall": 0.8579710144927536,
2408
+ "eval_runtime": 1.8903,
2409
+ "eval_samples_per_second": 182.511,
2410
+ "eval_steps_per_second": 5.819,
2411
+ "step": 2085
2412
+ },
2413
+ {
2414
+ "epoch": 86.18556701030928,
2415
+ "grad_norm": 18.180044174194336,
2416
+ "learning_rate": 7.1759259259259266e-06,
2417
+ "loss": 0.0663,
2418
+ "step": 2090
2419
+ },
2420
+ {
2421
+ "epoch": 86.5979381443299,
2422
+ "grad_norm": 10.320213317871094,
2423
+ "learning_rate": 6.944444444444445e-06,
2424
+ "loss": 0.0806,
2425
+ "step": 2100
2426
+ },
2427
+ {
2428
+ "epoch": 86.96907216494846,
2429
+ "eval_accuracy": 0.8492753623188406,
2430
+ "eval_loss": 0.5928123593330383,
2431
+ "eval_precision": 0.850145540799145,
2432
+ "eval_recall": 0.8492753623188406,
2433
+ "eval_runtime": 1.8068,
2434
+ "eval_samples_per_second": 190.946,
2435
+ "eval_steps_per_second": 6.088,
2436
+ "step": 2109
2437
+ },
2438
+ {
2439
+ "epoch": 87.01030927835052,
2440
+ "grad_norm": 13.640397071838379,
2441
+ "learning_rate": 6.712962962962964e-06,
2442
+ "loss": 0.0581,
2443
+ "step": 2110
2444
+ },
2445
+ {
2446
+ "epoch": 87.42268041237114,
2447
+ "grad_norm": 9.787714004516602,
2448
+ "learning_rate": 6.481481481481481e-06,
2449
+ "loss": 0.0641,
2450
+ "step": 2120
2451
+ },
2452
+ {
2453
+ "epoch": 87.83505154639175,
2454
+ "grad_norm": 7.827996730804443,
2455
+ "learning_rate": 6.25e-06,
2456
+ "loss": 0.0617,
2457
+ "step": 2130
2458
+ },
2459
+ {
2460
+ "epoch": 88.0,
2461
+ "eval_accuracy": 0.8521739130434782,
2462
+ "eval_loss": 0.606200098991394,
2463
+ "eval_precision": 0.8519519771693684,
2464
+ "eval_recall": 0.8521739130434782,
2465
+ "eval_runtime": 1.7968,
2466
+ "eval_samples_per_second": 192.013,
2467
+ "eval_steps_per_second": 6.122,
2468
+ "step": 2134
2469
+ },
2470
+ {
2471
+ "epoch": 88.24742268041237,
2472
+ "grad_norm": 10.409219741821289,
2473
+ "learning_rate": 6.0185185185185185e-06,
2474
+ "loss": 0.0677,
2475
+ "step": 2140
2476
+ },
2477
+ {
2478
+ "epoch": 88.65979381443299,
2479
+ "grad_norm": 13.120059967041016,
2480
+ "learning_rate": 5.787037037037038e-06,
2481
+ "loss": 0.0651,
2482
+ "step": 2150
2483
+ },
2484
+ {
2485
+ "epoch": 88.98969072164948,
2486
+ "eval_accuracy": 0.8521739130434782,
2487
+ "eval_loss": 0.6067116260528564,
2488
+ "eval_precision": 0.8518690976003952,
2489
+ "eval_recall": 0.8521739130434782,
2490
+ "eval_runtime": 1.8144,
2491
+ "eval_samples_per_second": 190.144,
2492
+ "eval_steps_per_second": 6.063,
2493
+ "step": 2158
2494
+ },
2495
+ {
2496
+ "epoch": 89.0721649484536,
2497
+ "grad_norm": 8.974705696105957,
2498
+ "learning_rate": 5.555555555555556e-06,
2499
+ "loss": 0.0672,
2500
+ "step": 2160
2501
+ },
2502
+ {
2503
+ "epoch": 89.48453608247422,
2504
+ "grad_norm": 13.397907257080078,
2505
+ "learning_rate": 5.324074074074074e-06,
2506
+ "loss": 0.0727,
2507
+ "step": 2170
2508
+ },
2509
+ {
2510
+ "epoch": 89.89690721649484,
2511
+ "grad_norm": 4.159496784210205,
2512
+ "learning_rate": 5.092592592592592e-06,
2513
+ "loss": 0.0754,
2514
+ "step": 2180
2515
+ },
2516
+ {
2517
+ "epoch": 89.97938144329896,
2518
+ "eval_accuracy": 0.855072463768116,
2519
+ "eval_loss": 0.6107772588729858,
2520
+ "eval_precision": 0.8553431503660337,
2521
+ "eval_recall": 0.855072463768116,
2522
+ "eval_runtime": 1.7776,
2523
+ "eval_samples_per_second": 194.084,
2524
+ "eval_steps_per_second": 6.188,
2525
+ "step": 2182
2526
+ },
2527
+ {
2528
+ "epoch": 90.30927835051547,
2529
+ "grad_norm": 11.130279541015625,
2530
+ "learning_rate": 4.861111111111111e-06,
2531
+ "loss": 0.079,
2532
+ "step": 2190
2533
+ },
2534
+ {
2535
+ "epoch": 90.72164948453609,
2536
+ "grad_norm": 13.203577995300293,
2537
+ "learning_rate": 4.6296296296296296e-06,
2538
+ "loss": 0.0682,
2539
+ "step": 2200
2540
+ },
2541
+ {
2542
+ "epoch": 90.96907216494846,
2543
+ "eval_accuracy": 0.8492753623188406,
2544
+ "eval_loss": 0.618496298789978,
2545
+ "eval_precision": 0.8488872700953353,
2546
+ "eval_recall": 0.8492753623188406,
2547
+ "eval_runtime": 1.7798,
2548
+ "eval_samples_per_second": 193.847,
2549
+ "eval_steps_per_second": 6.181,
2550
+ "step": 2206
2551
+ },
2552
+ {
2553
+ "epoch": 91.1340206185567,
2554
+ "grad_norm": 10.04045581817627,
2555
+ "learning_rate": 4.398148148148149e-06,
2556
+ "loss": 0.0699,
2557
+ "step": 2210
2558
+ },
2559
+ {
2560
+ "epoch": 91.54639175257732,
2561
+ "grad_norm": 2.500128984451294,
2562
+ "learning_rate": 4.166666666666667e-06,
2563
+ "loss": 0.0664,
2564
+ "step": 2220
2565
+ },
2566
+ {
2567
+ "epoch": 91.95876288659794,
2568
+ "grad_norm": 9.432464599609375,
2569
+ "learning_rate": 3.935185185185185e-06,
2570
+ "loss": 0.0763,
2571
+ "step": 2230
2572
+ },
2573
+ {
2574
+ "epoch": 92.0,
2575
+ "eval_accuracy": 0.8579710144927536,
2576
+ "eval_loss": 0.6168191432952881,
2577
+ "eval_precision": 0.8575139456543875,
2578
+ "eval_recall": 0.8579710144927536,
2579
+ "eval_runtime": 1.8002,
2580
+ "eval_samples_per_second": 191.65,
2581
+ "eval_steps_per_second": 6.111,
2582
+ "step": 2231
2583
+ },
2584
+ {
2585
+ "epoch": 92.37113402061856,
2586
+ "grad_norm": 9.279271125793457,
2587
+ "learning_rate": 3.7037037037037037e-06,
2588
+ "loss": 0.0742,
2589
+ "step": 2240
2590
+ },
2591
+ {
2592
+ "epoch": 92.78350515463917,
2593
+ "grad_norm": 19.246337890625,
2594
+ "learning_rate": 3.4722222222222224e-06,
2595
+ "loss": 0.0703,
2596
+ "step": 2250
2597
+ },
2598
+ {
2599
+ "epoch": 92.98969072164948,
2600
+ "eval_accuracy": 0.8521739130434782,
2601
+ "eval_loss": 0.6258795261383057,
2602
+ "eval_precision": 0.8520768323971984,
2603
+ "eval_recall": 0.8521739130434782,
2604
+ "eval_runtime": 1.8416,
2605
+ "eval_samples_per_second": 187.341,
2606
+ "eval_steps_per_second": 5.973,
2607
+ "step": 2255
2608
+ },
2609
+ {
2610
+ "epoch": 93.19587628865979,
2611
+ "grad_norm": 5.38301420211792,
2612
+ "learning_rate": 3.2407407407407406e-06,
2613
+ "loss": 0.0559,
2614
+ "step": 2260
2615
+ },
2616
+ {
2617
+ "epoch": 93.6082474226804,
2618
+ "grad_norm": 7.105731964111328,
2619
+ "learning_rate": 3.0092592592592593e-06,
2620
+ "loss": 0.0861,
2621
+ "step": 2270
2622
+ },
2623
+ {
2624
+ "epoch": 93.97938144329896,
2625
+ "eval_accuracy": 0.855072463768116,
2626
+ "eval_loss": 0.6128158569335938,
2627
+ "eval_precision": 0.8553431503660337,
2628
+ "eval_recall": 0.855072463768116,
2629
+ "eval_runtime": 1.776,
2630
+ "eval_samples_per_second": 194.252,
2631
+ "eval_steps_per_second": 6.194,
2632
+ "step": 2279
2633
+ },
2634
+ {
2635
+ "epoch": 94.02061855670104,
2636
+ "grad_norm": 14.296255111694336,
2637
+ "learning_rate": 2.777777777777778e-06,
2638
+ "loss": 0.089,
2639
+ "step": 2280
2640
+ },
2641
+ {
2642
+ "epoch": 94.43298969072166,
2643
+ "grad_norm": 11.694154739379883,
2644
+ "learning_rate": 2.546296296296296e-06,
2645
+ "loss": 0.07,
2646
+ "step": 2290
2647
+ },
2648
+ {
2649
+ "epoch": 94.84536082474227,
2650
+ "grad_norm": 8.240065574645996,
2651
+ "learning_rate": 2.3148148148148148e-06,
2652
+ "loss": 0.0807,
2653
+ "step": 2300
2654
+ },
2655
+ {
2656
+ "epoch": 94.96907216494846,
2657
+ "eval_accuracy": 0.855072463768116,
2658
+ "eval_loss": 0.6139995455741882,
2659
+ "eval_precision": 0.8546533219302098,
2660
+ "eval_recall": 0.855072463768116,
2661
+ "eval_runtime": 1.763,
2662
+ "eval_samples_per_second": 195.691,
2663
+ "eval_steps_per_second": 6.239,
2664
+ "step": 2303
2665
+ },
2666
+ {
2667
+ "epoch": 95.25773195876289,
2668
+ "grad_norm": 6.740184307098389,
2669
+ "learning_rate": 2.0833333333333334e-06,
2670
+ "loss": 0.0814,
2671
+ "step": 2310
2672
+ },
2673
+ {
2674
+ "epoch": 95.6701030927835,
2675
+ "grad_norm": 9.714829444885254,
2676
+ "learning_rate": 1.8518518518518519e-06,
2677
+ "loss": 0.0621,
2678
+ "step": 2320
2679
+ },
2680
+ {
2681
+ "epoch": 96.0,
2682
+ "eval_accuracy": 0.8521739130434782,
2683
+ "eval_loss": 0.6132925748825073,
2684
+ "eval_precision": 0.8531657869027159,
2685
+ "eval_recall": 0.8521739130434782,
2686
+ "eval_runtime": 1.8081,
2687
+ "eval_samples_per_second": 190.808,
2688
+ "eval_steps_per_second": 6.084,
2689
+ "step": 2328
2690
+ },
2691
+ {
2692
+ "epoch": 96.08247422680412,
2693
+ "grad_norm": 11.212587356567383,
2694
+ "learning_rate": 1.6203703703703703e-06,
2695
+ "loss": 0.065,
2696
+ "step": 2330
2697
+ },
2698
+ {
2699
+ "epoch": 96.49484536082474,
2700
+ "grad_norm": 5.428162097930908,
2701
+ "learning_rate": 1.388888888888889e-06,
2702
+ "loss": 0.0621,
2703
+ "step": 2340
2704
+ },
2705
+ {
2706
+ "epoch": 96.90721649484536,
2707
+ "grad_norm": 15.444799423217773,
2708
+ "learning_rate": 1.1574074074074074e-06,
2709
+ "loss": 0.0831,
2710
+ "step": 2350
2711
+ },
2712
+ {
2713
+ "epoch": 96.98969072164948,
2714
+ "eval_accuracy": 0.8492753623188406,
2715
+ "eval_loss": 0.6100958585739136,
2716
+ "eval_precision": 0.8507158478342087,
2717
+ "eval_recall": 0.8492753623188406,
2718
+ "eval_runtime": 1.7991,
2719
+ "eval_samples_per_second": 191.765,
2720
+ "eval_steps_per_second": 6.114,
2721
+ "step": 2352
2722
+ },
2723
+ {
2724
+ "epoch": 97.31958762886597,
2725
+ "grad_norm": 12.789685249328613,
2726
+ "learning_rate": 9.259259259259259e-07,
2727
+ "loss": 0.0584,
2728
+ "step": 2360
2729
+ },
2730
+ {
2731
+ "epoch": 97.73195876288659,
2732
+ "grad_norm": 9.271283149719238,
2733
+ "learning_rate": 6.944444444444445e-07,
2734
+ "loss": 0.0625,
2735
+ "step": 2370
2736
+ },
2737
+ {
2738
+ "epoch": 97.97938144329896,
2739
+ "eval_accuracy": 0.8492753623188406,
2740
+ "eval_loss": 0.6096817851066589,
2741
+ "eval_precision": 0.8507158478342087,
2742
+ "eval_recall": 0.8492753623188406,
2743
+ "eval_runtime": 1.8191,
2744
+ "eval_samples_per_second": 189.651,
2745
+ "eval_steps_per_second": 6.047,
2746
+ "step": 2376
2747
+ },
2748
+ {
2749
+ "epoch": 98.14432989690722,
2750
+ "grad_norm": 10.486361503601074,
2751
+ "learning_rate": 4.6296296296296297e-07,
2752
+ "loss": 0.0563,
2753
+ "step": 2380
2754
+ },
2755
+ {
2756
+ "epoch": 98.55670103092784,
2757
+ "grad_norm": 4.260477066040039,
2758
+ "learning_rate": 2.3148148148148148e-07,
2759
+ "loss": 0.0648,
2760
+ "step": 2390
2761
+ },
2762
+ {
2763
+ "epoch": 98.96907216494846,
2764
+ "grad_norm": 8.932230949401855,
2765
+ "learning_rate": 0.0,
2766
+ "loss": 0.0571,
2767
+ "step": 2400
2768
+ },
2769
+ {
2770
+ "epoch": 98.96907216494846,
2771
+ "eval_accuracy": 0.8492753623188406,
2772
+ "eval_loss": 0.6083797812461853,
2773
+ "eval_precision": 0.8507158478342087,
2774
+ "eval_recall": 0.8492753623188406,
2775
+ "eval_runtime": 1.7521,
2776
+ "eval_samples_per_second": 196.912,
2777
+ "eval_steps_per_second": 6.278,
2778
+ "step": 2400
2779
+ },
2780
+ {
2781
+ "epoch": 98.96907216494846,
2782
+ "step": 2400,
2783
+ "total_flos": 7.732715563096474e+18,
2784
+ "train_loss": 0.2344164727628231,
2785
+ "train_runtime": 4723.8268,
2786
+ "train_samples_per_second": 65.709,
2787
+ "train_steps_per_second": 0.508
2788
+ }
2789
+ ],
2790
+ "logging_steps": 10,
2791
+ "max_steps": 2400,
2792
+ "num_input_tokens_seen": 0,
2793
+ "num_train_epochs": 100,
2794
+ "save_steps": 500,
2795
+ "stateful_callbacks": {
2796
+ "TrainerControl": {
2797
+ "args": {
2798
+ "should_epoch_stop": false,
2799
+ "should_evaluate": false,
2800
+ "should_log": false,
2801
+ "should_save": false,
2802
+ "should_training_stop": false
2803
+ },
2804
+ "attributes": {}
2805
+ }
2806
+ },
2807
+ "total_flos": 7.732715563096474e+18,
2808
+ "train_batch_size": 32,
2809
+ "trial_name": null,
2810
+ "trial_params": null
2811
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:affda0a3859645c81fa7be0fd211a7d38fc953f748e483c1c67402ec6e11b968
3
+ size 5240