2023-10-24 15:13:44,762 ----------------------------------------------------------------------------------------------------
2023-10-24 15:13:44,763 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=13, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-24 15:13:44,763 ----------------------------------------------------------------------------------------------------
2023-10-24 15:13:44,763 MultiCorpus: 7936 train + 992 dev + 992 test sentences
 - NER_ICDAR_EUROPEANA Corpus: 7936 train + 992 dev + 992 test sentences - /home/ubuntu/.flair/datasets/ner_icdar_europeana/fr
2023-10-24 15:13:44,763 ----------------------------------------------------------------------------------------------------
2023-10-24 15:13:44,763 Train: 7936 sentences
2023-10-24 15:13:44,763 (train_with_dev=False, train_with_test=False)
2023-10-24 15:13:44,764 ----------------------------------------------------------------------------------------------------
2023-10-24 15:13:44,764 Training Params:
2023-10-24 15:13:44,764  - learning_rate: "3e-05"
2023-10-24 15:13:44,764  - mini_batch_size: "8"
2023-10-24 15:13:44,764  - max_epochs: "10"
2023-10-24 15:13:44,764  - shuffle: "True"
2023-10-24 15:13:44,764 ----------------------------------------------------------------------------------------------------
2023-10-24 15:13:44,764 Plugins:
2023-10-24 15:13:44,764  - TensorboardLogger
2023-10-24 15:13:44,764  - LinearScheduler | warmup_fraction: '0.1'
2023-10-24 15:13:44,764 ----------------------------------------------------------------------------------------------------
2023-10-24 15:13:44,764 Final evaluation on model from best epoch (best-model.pt)
2023-10-24 15:13:44,764  - metric: "('micro avg', 'f1-score')"
2023-10-24 15:13:44,764 ----------------------------------------------------------------------------------------------------
2023-10-24 15:13:44,764 Computation:
2023-10-24 15:13:44,764  - compute on device: cuda:0
2023-10-24 15:13:44,764  - embedding storage: none
2023-10-24 15:13:44,764 ----------------------------------------------------------------------------------------------------
2023-10-24 15:13:44,764 Model training base path: "hmbench-icdar/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-1"
2023-10-24 15:13:44,764 ----------------------------------------------------------------------------------------------------
2023-10-24 15:13:44,764 ----------------------------------------------------------------------------------------------------
2023-10-24 15:13:44,764 Logging anything other than scalars to TensorBoard is currently not supported.
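The configuration above maps directly onto Flair's fine-tuning API. The snippet below is a minimal sketch of how this run could be reproduced, reconstructed from the logged parameters and the run name; it is not the original training script, and details the log does not state (exact plugin wiring, dataset constructor arguments) are assumptions based on current Flair releases.

```python
# Sketch of the logged configuration (assumed, not the original script).
# Hyperparameters are copied from the "Training Params" block above;
# everything else is a best-effort reading of the run name.
from flair.datasets import NER_ICDAR_EUROPEANA
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# 7936 train / 992 dev / 992 test sentences, French portion.
corpus = NER_ICDAR_EUROPEANA(language="fr")
label_dict = corpus.make_label_dictionary(label_type="ner")

embeddings = TransformerWordEmbeddings(
    model="dbmdz/bert-base-historic-multilingual-64k-td-cased",
    layers="-1",               # "layers-1" in the run name: last layer only
    subtoken_pooling="first",  # "poolingfirst" in the run name
    fine_tune=True,
)

tagger = SequenceTagger(
    hidden_size=256,             # unused here: no RNN/CRF on top
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type="ner",
    use_crf=False,               # "crfFalse": plain linear + cross-entropy
    use_rnn=False,
    reproject_embeddings=False,  # matches the 768 -> 13 linear layer above
)

trainer = ModelTrainer(tagger, corpus)
trainer.fine_tune(
    "hmbench-icdar/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased"
    "-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-1",
    learning_rate=3e-5,
    mini_batch_size=8,
    max_epochs=10,
    warmup_fraction=0.1,  # LinearScheduler plugin listed under "Plugins"
)
# The original run also attached a TensorboardLogger plugin (omitted here).
```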
2023-10-24 15:13:52,870 epoch 1 - iter 99/992 - loss 2.00138499 - time (sec): 8.11 - samples/sec: 1940.03 - lr: 0.000003 - momentum: 0.000000
2023-10-24 15:14:00,756 epoch 1 - iter 198/992 - loss 1.19486555 - time (sec): 15.99 - samples/sec: 1939.90 - lr: 0.000006 - momentum: 0.000000
2023-10-24 15:14:08,909 epoch 1 - iter 297/992 - loss 0.87346403 - time (sec): 24.14 - samples/sec: 1963.76 - lr: 0.000009 - momentum: 0.000000
2023-10-24 15:14:17,064 epoch 1 - iter 396/992 - loss 0.69769156 - time (sec): 32.30 - samples/sec: 1961.48 - lr: 0.000012 - momentum: 0.000000
2023-10-24 15:14:25,604 epoch 1 - iter 495/992 - loss 0.58166806 - time (sec): 40.84 - samples/sec: 1973.97 - lr: 0.000015 - momentum: 0.000000
2023-10-24 15:14:33,994 epoch 1 - iter 594/992 - loss 0.50752231 - time (sec): 49.23 - samples/sec: 1971.55 - lr: 0.000018 - momentum: 0.000000
2023-10-24 15:14:42,580 epoch 1 - iter 693/992 - loss 0.45336096 - time (sec): 57.82 - samples/sec: 1963.72 - lr: 0.000021 - momentum: 0.000000
2023-10-24 15:14:51,243 epoch 1 - iter 792/992 - loss 0.41245068 - time (sec): 66.48 - samples/sec: 1965.17 - lr: 0.000024 - momentum: 0.000000
2023-10-24 15:14:59,630 epoch 1 - iter 891/992 - loss 0.38042237 - time (sec): 74.87 - samples/sec: 1966.67 - lr: 0.000027 - momentum: 0.000000
2023-10-24 15:15:07,967 epoch 1 - iter 990/992 - loss 0.35462009 - time (sec): 83.20 - samples/sec: 1967.35 - lr: 0.000030 - momentum: 0.000000
2023-10-24 15:15:08,132 ----------------------------------------------------------------------------------------------------
2023-10-24 15:15:08,132 EPOCH 1 done: loss 0.3542 - lr: 0.000030
2023-10-24 15:15:11,185 DEV : loss 0.0961548462510109 - f1-score (micro avg) 0.7105
2023-10-24 15:15:11,200 saving best model
2023-10-24 15:15:11,755 ----------------------------------------------------------------------------------------------------
2023-10-24 15:15:20,111 epoch 2 - iter 99/992 - loss 0.11640822 - time (sec): 8.35 - samples/sec: 2042.78 - lr: 0.000030 - momentum: 0.000000
2023-10-24 15:15:28,447 epoch 2 - iter 198/992 - loss 0.11616878 - time (sec): 16.69 - samples/sec: 2002.25 - lr: 0.000029 - momentum: 0.000000
2023-10-24 15:15:37,122 epoch 2 - iter 297/992 - loss 0.11115161 - time (sec): 25.37 - samples/sec: 2003.37 - lr: 0.000029 - momentum: 0.000000
2023-10-24 15:15:45,186 epoch 2 - iter 396/992 - loss 0.10710185 - time (sec): 33.43 - samples/sec: 2000.29 - lr: 0.000029 - momentum: 0.000000
2023-10-24 15:15:53,527 epoch 2 - iter 495/992 - loss 0.10512998 - time (sec): 41.77 - samples/sec: 1994.14 - lr: 0.000028 - momentum: 0.000000
2023-10-24 15:16:01,850 epoch 2 - iter 594/992 - loss 0.10552659 - time (sec): 50.09 - samples/sec: 1965.95 - lr: 0.000028 - momentum: 0.000000
2023-10-24 15:16:09,917 epoch 2 - iter 693/992 - loss 0.10493309 - time (sec): 58.16 - samples/sec: 1962.81 - lr: 0.000028 - momentum: 0.000000
2023-10-24 15:16:18,183 epoch 2 - iter 792/992 - loss 0.10434383 - time (sec): 66.43 - samples/sec: 1962.00 - lr: 0.000027 - momentum: 0.000000
2023-10-24 15:16:26,821 epoch 2 - iter 891/992 - loss 0.10270095 - time (sec): 75.07 - samples/sec: 1961.30 - lr: 0.000027 - momentum: 0.000000
2023-10-24 15:16:35,036 epoch 2 - iter 990/992 - loss 0.10121394 - time (sec): 83.28 - samples/sec: 1965.27 - lr: 0.000027 - momentum: 0.000000
2023-10-24 15:16:35,186 ----------------------------------------------------------------------------------------------------
2023-10-24 15:16:35,186 EPOCH 2 done: loss 0.1011 - lr: 0.000027
2023-10-24 15:16:38,599 DEV : loss 0.09430491924285889 - f1-score (micro avg) 0.7529
2023-10-24 15:16:38,614 saving best model
2023-10-24 15:16:39,338 ----------------------------------------------------------------------------------------------------
2023-10-24 15:16:47,529 epoch 3 - iter 99/992 - loss 0.06719236 - time (sec): 8.19 - samples/sec: 1940.21 - lr: 0.000026 - momentum: 0.000000
2023-10-24 15:16:55,855 epoch 3 - iter 198/992 - loss 0.06613133 - time (sec): 16.52 - samples/sec: 1993.65 - lr: 0.000026 - momentum: 0.000000
2023-10-24 15:17:04,381 epoch 3 - iter 297/992 - loss 0.06330398 - time (sec): 25.04 - samples/sec: 1985.65 - lr: 0.000026 - momentum: 0.000000
2023-10-24 15:17:12,671 epoch 3 - iter 396/992 - loss 0.06632455 - time (sec): 33.33 - samples/sec: 1960.17 - lr: 0.000025 - momentum: 0.000000
2023-10-24 15:17:20,731 epoch 3 - iter 495/992 - loss 0.07066014 - time (sec): 41.39 - samples/sec: 1959.36 - lr: 0.000025 - momentum: 0.000000
2023-10-24 15:17:29,327 epoch 3 - iter 594/992 - loss 0.07098366 - time (sec): 49.99 - samples/sec: 1951.79 - lr: 0.000025 - momentum: 0.000000
2023-10-24 15:17:37,244 epoch 3 - iter 693/992 - loss 0.07039656 - time (sec): 57.91 - samples/sec: 1953.94 - lr: 0.000024 - momentum: 0.000000
2023-10-24 15:17:45,665 epoch 3 - iter 792/992 - loss 0.06956794 - time (sec): 66.33 - samples/sec: 1960.93 - lr: 0.000024 - momentum: 0.000000
2023-10-24 15:17:53,989 epoch 3 - iter 891/992 - loss 0.06922567 - time (sec): 74.65 - samples/sec: 1958.25 - lr: 0.000024 - momentum: 0.000000
2023-10-24 15:18:02,727 epoch 3 - iter 990/992 - loss 0.06916264 - time (sec): 83.39 - samples/sec: 1961.66 - lr: 0.000023 - momentum: 0.000000
2023-10-24 15:18:02,917 ----------------------------------------------------------------------------------------------------
2023-10-24 15:18:02,917 EPOCH 3 done: loss 0.0693 - lr: 0.000023
2023-10-24 15:18:06,033 DEV : loss 0.0983629822731018 - f1-score (micro avg) 0.7488
2023-10-24 15:18:06,049 ----------------------------------------------------------------------------------------------------
2023-10-24 15:18:14,336 epoch 4 - iter 99/992 - loss 0.04434730 - time (sec): 8.29 - samples/sec: 2012.20 - lr: 0.000023 - momentum: 0.000000
2023-10-24 15:18:22,782 epoch 4 - iter 198/992 - loss 0.04085860 - time (sec): 16.73 - samples/sec: 1966.74 - lr: 0.000023 - momentum: 0.000000
2023-10-24 15:18:30,640 epoch 4 - iter 297/992 - loss 0.04207446 - time (sec): 24.59 - samples/sec: 1954.17 - lr: 0.000022 - momentum: 0.000000
2023-10-24 15:18:38,789 epoch 4 - iter 396/992 - loss 0.04681766 - time (sec): 32.74 - samples/sec: 1963.82 - lr: 0.000022 - momentum: 0.000000
2023-10-24 15:18:47,334 epoch 4 - iter 495/992 - loss 0.04857690 - time (sec): 41.29 - samples/sec: 1962.60 - lr: 0.000022 - momentum: 0.000000
2023-10-24 15:18:56,652 epoch 4 - iter 594/992 - loss 0.04806280 - time (sec): 50.60 - samples/sec: 1963.48 - lr: 0.000021 - momentum: 0.000000
2023-10-24 15:19:04,615 epoch 4 - iter 693/992 - loss 0.04657590 - time (sec): 58.57 - samples/sec: 1966.76 - lr: 0.000021 - momentum: 0.000000
2023-10-24 15:19:13,077 epoch 4 - iter 792/992 - loss 0.04774712 - time (sec): 67.03 - samples/sec: 1966.19 - lr: 0.000021 - momentum: 0.000000
2023-10-24 15:19:21,518 epoch 4 - iter 891/992 - loss 0.04815205 - time (sec): 75.47 - samples/sec: 1961.15 - lr: 0.000020 - momentum: 0.000000
2023-10-24 15:19:30,005 epoch 4 - iter 990/992 - loss 0.04916499 - time (sec): 83.96 - samples/sec: 1950.19 - lr: 0.000020 - momentum: 0.000000
2023-10-24 15:19:30,145 ----------------------------------------------------------------------------------------------------
2023-10-24 15:19:30,145 EPOCH 4 done: loss 0.0492 - lr: 0.000020
2023-10-24 15:19:33,266 DEV : loss 0.1186017319560051 - f1-score (micro avg) 0.766
2023-10-24 15:19:33,282 saving best model
2023-10-24 15:19:34,070 ----------------------------------------------------------------------------------------------------
2023-10-24 15:19:42,709 epoch 5 - iter 99/992 - loss 0.03958303 - time (sec): 8.64 - samples/sec: 1954.98 - lr: 0.000020 - momentum: 0.000000
2023-10-24 15:19:50,877 epoch 5 - iter 198/992 - loss 0.03820388 - time (sec): 16.81 - samples/sec: 1956.19 - lr: 0.000019 - momentum: 0.000000
2023-10-24 15:19:58,957 epoch 5 - iter 297/992 - loss 0.03759095 - time (sec): 24.89 - samples/sec: 1960.58 - lr: 0.000019 - momentum: 0.000000
2023-10-24 15:20:07,566 epoch 5 - iter 396/992 - loss 0.03671709 - time (sec): 33.49 - samples/sec: 1974.70 - lr: 0.000019 - momentum: 0.000000
2023-10-24 15:20:15,557 epoch 5 - iter 495/992 - loss 0.03795644 - time (sec): 41.49 - samples/sec: 1963.89 - lr: 0.000018 - momentum: 0.000000
2023-10-24 15:20:23,714 epoch 5 - iter 594/992 - loss 0.03754771 - time (sec): 49.64 - samples/sec: 1968.60 - lr: 0.000018 - momentum: 0.000000
2023-10-24 15:20:31,990 epoch 5 - iter 693/992 - loss 0.03701700 - time (sec): 57.92 - samples/sec: 1972.23 - lr: 0.000018 - momentum: 0.000000
2023-10-24 15:20:40,475 epoch 5 - iter 792/992 - loss 0.03754511 - time (sec): 66.40 - samples/sec: 1975.22 - lr: 0.000017 - momentum: 0.000000
2023-10-24 15:20:49,092 epoch 5 - iter 891/992 - loss 0.03754814 - time (sec): 75.02 - samples/sec: 1974.28 - lr: 0.000017 - momentum: 0.000000
2023-10-24 15:20:57,344 epoch 5 - iter 990/992 - loss 0.03819911 - time (sec): 83.27 - samples/sec: 1966.82 - lr: 0.000017 - momentum: 0.000000
2023-10-24 15:20:57,489 ----------------------------------------------------------------------------------------------------
2023-10-24 15:20:57,489 EPOCH 5 done: loss 0.0382 - lr: 0.000017
2023-10-24 15:21:00,612 DEV : loss 0.1500038057565689 - f1-score (micro avg) 0.7716
2023-10-24 15:21:00,627 saving best model
2023-10-24 15:21:01,415 ----------------------------------------------------------------------------------------------------
2023-10-24 15:21:09,674 epoch 6 - iter 99/992 - loss 0.02489554 - time (sec): 8.26 - samples/sec: 1944.65 - lr: 0.000016 - momentum: 0.000000
2023-10-24 15:21:17,831 epoch 6 - iter 198/992 - loss 0.02197168 - time (sec): 16.41 - samples/sec: 1969.28 - lr: 0.000016 - momentum: 0.000000
2023-10-24 15:21:26,440 epoch 6 - iter 297/992 - loss 0.02602124 - time (sec): 25.02 - samples/sec: 1970.79 - lr: 0.000016 - momentum: 0.000000
2023-10-24 15:21:34,700 epoch 6 - iter 396/992 - loss 0.02465442 - time (sec): 33.28 - samples/sec: 1958.95 - lr: 0.000015 - momentum: 0.000000
2023-10-24 15:21:43,130 epoch 6 - iter 495/992 - loss 0.02716845 - time (sec): 41.71 - samples/sec: 1975.40 - lr: 0.000015 - momentum: 0.000000
2023-10-24 15:21:51,299 epoch 6 - iter 594/992 - loss 0.02699725 - time (sec): 49.88 - samples/sec: 1977.15 - lr: 0.000015 - momentum: 0.000000
2023-10-24 15:21:59,606 epoch 6 - iter 693/992 - loss 0.02655613 - time (sec): 58.19 - samples/sec: 1976.09 - lr: 0.000014 - momentum: 0.000000
2023-10-24 15:22:07,766 epoch 6 - iter 792/992 - loss 0.02626525 - time (sec): 66.35 - samples/sec: 1973.44 - lr: 0.000014 - momentum: 0.000000
2023-10-24 15:22:16,581 epoch 6 - iter 891/992 - loss 0.02664702 - time (sec): 75.16 - samples/sec: 1956.18 - lr: 0.000014 - momentum: 0.000000
2023-10-24 15:22:25,007 epoch 6 - iter 990/992 - loss 0.02687875 - time (sec): 83.59 - samples/sec: 1959.53 - lr: 0.000013 - momentum: 0.000000
2023-10-24 15:22:25,140 ----------------------------------------------------------------------------------------------------
2023-10-24 15:22:25,140 EPOCH 6 done: loss 0.0269 - lr: 0.000013
2023-10-24 15:22:28,260 DEV : loss 0.18365593254566193 - f1-score (micro avg) 0.7675
2023-10-24 15:22:28,275 ----------------------------------------------------------------------------------------------------
2023-10-24 15:22:36,688 epoch 7 - iter 99/992 - loss 0.01924589 - time (sec): 8.41 - samples/sec: 2023.57 - lr: 0.000013 - momentum: 0.000000
2023-10-24 15:22:44,992 epoch 7 - iter 198/992 - loss 0.02187669 - time (sec): 16.72 - samples/sec: 1959.39 - lr: 0.000013 - momentum: 0.000000
2023-10-24 15:22:53,153 epoch 7 - iter 297/992 - loss 0.02145695 - time (sec): 24.88 - samples/sec: 1936.89 - lr: 0.000012 - momentum: 0.000000
2023-10-24 15:23:01,187 epoch 7 - iter 396/992 - loss 0.02078135 - time (sec): 32.91 - samples/sec: 1940.16 - lr: 0.000012 - momentum: 0.000000
2023-10-24 15:23:09,352 epoch 7 - iter 495/992 - loss 0.02110910 - time (sec): 41.08 - samples/sec: 1929.42 - lr: 0.000012 - momentum: 0.000000
2023-10-24 15:23:17,649 epoch 7 - iter 594/992 - loss 0.02248657 - time (sec): 49.37 - samples/sec: 1948.38 - lr: 0.000011 - momentum: 0.000000
2023-10-24 15:23:25,983 epoch 7 - iter 693/992 - loss 0.02223240 - time (sec): 57.71 - samples/sec: 1951.52 - lr: 0.000011 - momentum: 0.000000
2023-10-24 15:23:34,185 epoch 7 - iter 792/992 - loss 0.02177442 - time (sec): 65.91 - samples/sec: 1949.07 - lr: 0.000011 - momentum: 0.000000
2023-10-24 15:23:43,231 epoch 7 - iter 891/992 - loss 0.02152618 - time (sec): 74.95 - samples/sec: 1953.49 - lr: 0.000010 - momentum: 0.000000
2023-10-24 15:23:51,601 epoch 7 - iter 990/992 - loss 0.02140999 - time (sec): 83.33 - samples/sec: 1965.40 - lr: 0.000010 - momentum: 0.000000
2023-10-24 15:23:51,748 ----------------------------------------------------------------------------------------------------
2023-10-24 15:23:51,748 EPOCH 7 done: loss 0.0214 - lr: 0.000010
2023-10-24 15:23:54,863 DEV : loss 0.21153658628463745 - f1-score (micro avg) 0.7618
2023-10-24 15:23:54,878 ----------------------------------------------------------------------------------------------------
2023-10-24 15:24:03,108 epoch 8 - iter 99/992 - loss 0.01093374 - time (sec): 8.23 - samples/sec: 2004.96 - lr: 0.000010 - momentum: 0.000000
2023-10-24 15:24:11,138 epoch 8 - iter 198/992 - loss 0.01310924 - time (sec): 16.26 - samples/sec: 1936.95 - lr: 0.000009 - momentum: 0.000000
2023-10-24 15:24:19,358 epoch 8 - iter 297/992 - loss 0.01441408 - time (sec): 24.48 - samples/sec: 1940.68 - lr: 0.000009 - momentum: 0.000000
2023-10-24 15:24:28,043 epoch 8 - iter 396/992 - loss 0.01461590 - time (sec): 33.16 - samples/sec: 1935.65 - lr: 0.000009 - momentum: 0.000000
2023-10-24 15:24:36,832 epoch 8 - iter 495/992 - loss 0.01509641 - time (sec): 41.95 - samples/sec: 1945.22 - lr: 0.000008 - momentum: 0.000000
2023-10-24 15:24:45,127 epoch 8 - iter 594/992 - loss 0.01506859 - time (sec): 50.25 - samples/sec: 1947.58 - lr: 0.000008 - momentum: 0.000000
2023-10-24 15:24:54,112 epoch 8 - iter 693/992 - loss 0.01504122 - time (sec): 59.23 - samples/sec: 1956.47 - lr: 0.000008 - momentum: 0.000000
2023-10-24 15:25:02,153 epoch 8 - iter 792/992 - loss 0.01598623 - time (sec): 67.27 - samples/sec: 1948.08 - lr: 0.000007 - momentum: 0.000000
2023-10-24 15:25:10,479 epoch 8 - iter 891/992 - loss 0.01625220 - time (sec): 75.60 - samples/sec: 1952.55 - lr: 0.000007 - momentum: 0.000000
2023-10-24 15:25:18,505 epoch 8 - iter 990/992 - loss 0.01634153 - time (sec): 83.63 - samples/sec: 1955.26 - lr: 0.000007 - momentum: 0.000000
2023-10-24 15:25:18,705 ----------------------------------------------------------------------------------------------------
2023-10-24 15:25:18,706 EPOCH 8 done: loss 0.0163 - lr: 0.000007
2023-10-24 15:25:21,824 DEV : loss 0.214664027094841 - f1-score (micro avg) 0.7641
2023-10-24 15:25:21,839 ----------------------------------------------------------------------------------------------------
2023-10-24 15:25:30,320 epoch 9 - iter 99/992 - loss 0.00554736 - time (sec): 8.48 - samples/sec: 2033.00 - lr: 0.000006 - momentum: 0.000000
2023-10-24 15:25:38,987 epoch 9 - iter 198/992 - loss 0.00828502 - time (sec): 17.15 - samples/sec: 1982.71 - lr: 0.000006 - momentum: 0.000000
2023-10-24 15:25:47,155 epoch 9 - iter 297/992 - loss 0.00991386 - time (sec): 25.32 - samples/sec: 1992.59 - lr: 0.000006 - momentum: 0.000000
2023-10-24 15:25:55,662 epoch 9 - iter 396/992 - loss 0.00972216 - time (sec): 33.82 - samples/sec: 1976.94 - lr: 0.000005 - momentum: 0.000000
2023-10-24 15:26:03,891 epoch 9 - iter 495/992 - loss 0.00993347 - time (sec): 42.05 - samples/sec: 1975.37 - lr: 0.000005 - momentum: 0.000000
2023-10-24 15:26:12,678 epoch 9 - iter 594/992 - loss 0.01064234 - time (sec): 50.84 - samples/sec: 1976.16 - lr: 0.000005 - momentum: 0.000000
2023-10-24 15:26:20,744 epoch 9 - iter 693/992 - loss 0.01059840 - time (sec): 58.90 - samples/sec: 1972.26 - lr: 0.000004 - momentum: 0.000000
2023-10-24 15:26:28,796 epoch 9 - iter 792/992 - loss 0.01094734 - time (sec): 66.96 - samples/sec: 1967.66 - lr: 0.000004 - momentum: 0.000000
2023-10-24 15:26:36,898 epoch 9 - iter 891/992 - loss 0.01069492 - time (sec): 75.06 - samples/sec: 1964.87 - lr: 0.000004 - momentum: 0.000000
2023-10-24 15:26:45,130 epoch 9 - iter 990/992 - loss 0.01088973 - time (sec): 83.29 - samples/sec: 1965.18 - lr: 0.000003 - momentum: 0.000000
2023-10-24 15:26:45,309 ----------------------------------------------------------------------------------------------------
2023-10-24 15:26:45,309 EPOCH 9 done: loss 0.0109 - lr: 0.000003
2023-10-24 15:26:48,428 DEV : loss 0.22701114416122437 - f1-score (micro avg) 0.7697
2023-10-24 15:26:48,443 ----------------------------------------------------------------------------------------------------
2023-10-24 15:26:56,446 epoch 10 - iter 99/992 - loss 0.00704755 - time (sec): 8.00 - samples/sec: 1986.71 - lr: 0.000003 - momentum: 0.000000
2023-10-24 15:27:04,484 epoch 10 - iter 198/992 - loss 0.00802891 - time (sec): 16.04 - samples/sec: 1989.28 - lr: 0.000003 - momentum: 0.000000
2023-10-24 15:27:13,149 epoch 10 - iter 297/992 - loss 0.00713159 - time (sec): 24.71 - samples/sec: 2001.01 - lr: 0.000002 - momentum: 0.000000
2023-10-24 15:27:21,521 epoch 10 - iter 396/992 - loss 0.00689762 - time (sec): 33.08 - samples/sec: 1985.90 - lr: 0.000002 - momentum: 0.000000
2023-10-24 15:27:29,824 epoch 10 - iter 495/992 - loss 0.00714780 - time (sec): 41.38 - samples/sec: 1986.04 - lr: 0.000002 - momentum: 0.000000
2023-10-24 15:27:38,100 epoch 10 - iter 594/992 - loss 0.00737894 - time (sec): 49.66 - samples/sec: 1992.26 - lr: 0.000001 - momentum: 0.000000
2023-10-24 15:27:46,622 epoch 10 - iter 693/992 - loss 0.00692848 - time (sec): 58.18 - samples/sec: 1969.53 - lr: 0.000001 - momentum: 0.000000
2023-10-24 15:27:54,981 epoch 10 - iter 792/992 - loss 0.00716748 - time (sec): 66.54 - samples/sec: 1965.85 - lr: 0.000001 - momentum: 0.000000
2023-10-24 15:28:03,371 epoch 10 - iter 891/992 - loss 0.00737072 - time (sec): 74.93 - samples/sec: 1964.65 - lr: 0.000000 - momentum: 0.000000
2023-10-24 15:28:11,754 epoch 10 - iter 990/992 - loss 0.00752842 - time (sec): 83.31 - samples/sec: 1962.95 - lr: 0.000000 - momentum: 0.000000
2023-10-24 15:28:11,961 ----------------------------------------------------------------------------------------------------
2023-10-24 15:28:11,961 EPOCH 10 done: loss 0.0076 - lr: 0.000000
2023-10-24 15:28:15,064 DEV : loss 0.23377303779125214 - f1-score (micro avg) 0.7682
2023-10-24 15:28:15,642 ----------------------------------------------------------------------------------------------------
2023-10-24 15:28:15,642 Loading model from best epoch ...
2023-10-24 15:28:17,450 SequenceTagger predicts: Dictionary with 13 tags: O, S-PER, B-PER, E-PER, I-PER, S-LOC, B-LOC, E-LOC, I-LOC, S-ORG, B-ORG, E-ORG, I-ORG
2023-10-24 15:28:20,523 Results:
- F-score (micro) 0.7766
- F-score (macro) 0.7019
- Accuracy 0.6524

By class:
              precision    recall  f1-score   support

         LOC     0.8214    0.8214    0.8214       655
         PER     0.7339    0.8161    0.7728       223
         ORG     0.6250    0.4331    0.5116       127

   micro avg     0.7820    0.7711    0.7766      1005
   macro avg     0.7267    0.6902    0.7019      1005
weighted avg     0.7771    0.7711    0.7715      1005

2023-10-24 15:28:20,523 ----------------------------------------------------------------------------------------------------
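As a sanity check on the table above: the macro F1 is the unweighted mean of the per-class F1 scores, (0.8214 + 0.7728 + 0.5116) / 3 ≈ 0.7019, and the micro F1 follows from the pooled precision and recall, 2 · 0.7820 · 0.7711 / (0.7820 + 0.7711) ≈ 0.7765, matching the logged 0.7766 up to rounding. The snippet below is a minimal sketch of loading the saved best checkpoint (written at epoch 5, dev micro F1 0.7716) for inference; the path follows the base path logged above, and the French sentence is an invented example.

```python
from flair.data import Sentence
from flair.models import SequenceTagger

# Load the checkpoint written at the best dev epoch.
tagger = SequenceTagger.load(
    "hmbench-icdar/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased"
    "-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-1/best-model.pt"
)

# Invented example sentence; any tokenized French text works.
sentence = Sentence("M. Dupont est arrivé à Paris hier soir .")
tagger.predict(sentence)

# The 13 BIOES tags (S-/B-/E-/I- variants of PER, LOC, ORG, plus O)
# are decoded into entity spans by Flair.
for span in sentence.get_spans("ner"):
    label = span.get_label("ner")
    print(span.text, label.value, label.score)
```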