philipp-zettl committed on
Commit
0a4024d
1 Parent(s): a13aa19

Update spaCy pipeline

Browse files
Files changed (8) hide show
  1. README.md +32 -6
  2. config.cfg +33 -25
  3. meta.json +30 -6
  4. ner/model +2 -2
  5. ner/moves +1 -1
  6. tok2vec/model +2 -2
  7. vocab/strings.json +0 -0
  8. xx_eb_ner-any-py3-none-any.whl +2 -2
README.md CHANGED
@@ -4,19 +4,35 @@ tags:
4
  - token-classification
5
  language:
6
  - multilingual
7
- license: cc-by-nc-sa-4.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  ---
9
  | Feature | Description |
10
  | --- | --- |
11
  | **Name** | `xx_eb_ner` |
12
- | **Version** | `0.2.1` |
13
- | **spaCy** | `>=3.7.4,<3.8.0` |
14
  | **Default Pipeline** | `tok2vec`, `ner` |
15
  | **Components** | `tok2vec`, `ner` |
16
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
17
  | **Sources** | n/a |
18
- | **License** | n/a |
19
- | **Author** | [n/a]() |
20
 
21
  ### Label Scheme
22
 
@@ -28,4 +44,14 @@ license: cc-by-nc-sa-4.0
28
  | --- | --- |
29
  | **`ner`** | `COURSE_NAME`, `JOB_TITLE`, `LOCATION` |
30
 
31
- </details>
 
 
 
 
 
 
 
 
 
 
 
4
  - token-classification
5
  language:
6
  - multilingual
7
+ license: cc-by-nc-sa-4.0
8
+ model-index:
9
+ - name: xx_eb_ner
10
+ results:
11
+ - task:
12
+ name: NER
13
+ type: token-classification
14
+ metrics:
15
+ - name: NER Precision
16
+ type: precision
17
+ value: 0.9976688647
18
+ - name: NER Recall
19
+ type: recall
20
+ value: 0.9975230852
21
+ - name: NER F Score
22
+ type: f_score
23
+ value: 0.9975959696
24
  ---
25
  | Feature | Description |
26
  | --- | --- |
27
  | **Name** | `xx_eb_ner` |
28
+ | **Version** | `0.3.0` |
29
+ | **spaCy** | `>=3.8.2,<3.9.0` |
30
  | **Default Pipeline** | `tok2vec`, `ner` |
31
  | **Components** | `tok2vec`, `ner` |
32
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
33
  | **Sources** | n/a |
34
+ | **License** | `cc-by-nc-sa-4.0` |
35
+ | **Author** | [philipp-zettl]() |
36
 
37
  ### Label Scheme
38
 
 
44
  | --- | --- |
45
  | **`ner`** | `COURSE_NAME`, `JOB_TITLE`, `LOCATION` |
46
 
47
+ </details>
48
+
49
+ ### Accuracy
50
+
51
+ | Type | Score |
52
+ | --- | --- |
53
+ | `ENTS_F` | 99.76 |
54
+ | `ENTS_P` | 99.77 |
55
+ | `ENTS_R` | 99.75 |
56
+ | `TOK2VEC_LOSS` | 256.59 |
57
+ | `NER_LOSS` | 91.40 |
config.cfg CHANGED
@@ -1,21 +1,21 @@
1
  [paths]
2
- train = "./training_data/multilang_train.spacy"
3
- dev = "./training_data/multilang_valid.spacy"
4
  vectors = null
5
  init_tok2vec = null
6
 
7
  [system]
8
- gpu_allocator = "\"pytorch\" # Use GPU memory management, if available"
9
  seed = 0
10
 
11
  [nlp]
12
  lang = "xx"
13
  pipeline = ["tok2vec","ner"]
 
14
  disabled = []
15
  before_creation = null
16
  after_creation = null
17
  after_pipeline_creation = null
18
- batch_size = 1000
19
  tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
20
  vectors = {"@vectors":"spacy.Vectors.v1"}
21
 
@@ -38,47 +38,51 @@ use_upper = true
38
  nO = null
39
 
40
  [components.ner.model.tok2vec]
41
- @architectures = "spacy.HashEmbedCNN.v2"
42
- pretrained_vectors = null
43
- width = 96
44
- depth = 4
45
- embed_size = 2000
46
- window_size = 1
47
- maxout_pieces = 3
48
- subword_features = true
49
 
50
  [components.tok2vec]
51
  factory = "tok2vec"
52
 
53
  [components.tok2vec.model]
54
- @architectures = "spacy.HashEmbedCNN.v2"
55
- pretrained_vectors = null
56
- width = 96
57
- depth = 4
58
- embed_size = 2000
 
 
 
 
 
 
 
 
59
  window_size = 1
60
  maxout_pieces = 3
61
- subword_features = true
62
 
63
  [corpora]
64
 
65
  [corpora.dev]
66
  @readers = "spacy.Corpus.v1"
67
  path = ${paths.dev}
68
- gold_preproc = false
69
  max_length = 0
 
70
  limit = 0
71
  augmenter = null
72
 
73
  [corpora.train]
74
  @readers = "spacy.Corpus.v1"
75
  path = ${paths.train}
76
- gold_preproc = false
77
  max_length = 0
 
78
  limit = 0
79
  augmenter = null
80
 
81
  [training]
 
 
82
  seed = ${system.seed}
83
  gpu_allocator = ${system.gpu_allocator}
84
  dropout = 0.1
@@ -89,25 +93,28 @@ max_steps = 20000
89
  eval_frequency = 200
90
  frozen_components = []
91
  annotating_components = []
92
- dev_corpus = "corpora.dev"
93
- train_corpus = "corpora.train"
94
  before_to_disk = null
95
  before_update = null
96
 
97
  [training.batcher]
98
  @batchers = "spacy.batch_by_words.v1"
99
- size = 1000
100
  discard_oversize = false
101
  tolerance = 0.2
102
  get_length = null
103
 
 
 
 
 
 
 
 
104
  [training.logger]
105
  @loggers = "spacy.ConsoleLogger.v1"
106
  progress_bar = false
107
 
108
  [training.optimizer]
109
  @optimizers = "Adam.v1"
110
- learn_rate = 0.001
111
  beta1 = 0.9
112
  beta2 = 0.999
113
  L2_is_weight_decay = true
@@ -115,6 +122,7 @@ L2 = 0.01
115
  grad_clip = 1.0
116
  use_averages = false
117
  eps = 0.00000001
 
118
 
119
  [training.score_weights]
120
  ents_f = 1.0
@@ -125,7 +133,7 @@ ents_per_type = null
125
  [pretraining]
126
 
127
  [initialize]
128
- vectors = null
129
  init_tok2vec = ${paths.init_tok2vec}
130
  vocab_data = null
131
  lookups = null
 
1
  [paths]
2
+ train = "training_data/multilang_train.spacy"
3
+ dev = "training_data/multilang_valid.spacy"
4
  vectors = null
5
  init_tok2vec = null
6
 
7
  [system]
8
+ gpu_allocator = null
9
  seed = 0
10
 
11
  [nlp]
12
  lang = "xx"
13
  pipeline = ["tok2vec","ner"]
14
+ batch_size = 1000
15
  disabled = []
16
  before_creation = null
17
  after_creation = null
18
  after_pipeline_creation = null
 
19
  tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
20
  vectors = {"@vectors":"spacy.Vectors.v1"}
21
 
 
38
  nO = null
39
 
40
  [components.ner.model.tok2vec]
41
+ @architectures = "spacy.Tok2VecListener.v1"
42
+ width = ${components.tok2vec.model.encode.width}
43
+ upstream = "*"
 
 
 
 
 
44
 
45
  [components.tok2vec]
46
  factory = "tok2vec"
47
 
48
  [components.tok2vec.model]
49
+ @architectures = "spacy.Tok2Vec.v2"
50
+
51
+ [components.tok2vec.model.embed]
52
+ @architectures = "spacy.MultiHashEmbed.v2"
53
+ width = ${components.tok2vec.model.encode.width}
54
+ attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
55
+ rows = [5000,1000,2500,2500]
56
+ include_static_vectors = true
57
+
58
+ [components.tok2vec.model.encode]
59
+ @architectures = "spacy.MaxoutWindowEncoder.v2"
60
+ width = 256
61
+ depth = 8
62
  window_size = 1
63
  maxout_pieces = 3
 
64
 
65
  [corpora]
66
 
67
  [corpora.dev]
68
  @readers = "spacy.Corpus.v1"
69
  path = ${paths.dev}
 
70
  max_length = 0
71
+ gold_preproc = false
72
  limit = 0
73
  augmenter = null
74
 
75
  [corpora.train]
76
  @readers = "spacy.Corpus.v1"
77
  path = ${paths.train}
 
78
  max_length = 0
79
+ gold_preproc = false
80
  limit = 0
81
  augmenter = null
82
 
83
  [training]
84
+ dev_corpus = "corpora.dev"
85
+ train_corpus = "corpora.train"
86
  seed = ${system.seed}
87
  gpu_allocator = ${system.gpu_allocator}
88
  dropout = 0.1
 
93
  eval_frequency = 200
94
  frozen_components = []
95
  annotating_components = []
 
 
96
  before_to_disk = null
97
  before_update = null
98
 
99
  [training.batcher]
100
  @batchers = "spacy.batch_by_words.v1"
 
101
  discard_oversize = false
102
  tolerance = 0.2
103
  get_length = null
104
 
105
+ [training.batcher.size]
106
+ @schedules = "compounding.v1"
107
+ start = 100
108
+ stop = 1000
109
+ compound = 1.001
110
+ t = 0.0
111
+
112
  [training.logger]
113
  @loggers = "spacy.ConsoleLogger.v1"
114
  progress_bar = false
115
 
116
  [training.optimizer]
117
  @optimizers = "Adam.v1"
 
118
  beta1 = 0.9
119
  beta2 = 0.999
120
  L2_is_weight_decay = true
 
122
  grad_clip = 1.0
123
  use_averages = false
124
  eps = 0.00000001
125
+ learn_rate = 0.001
126
 
127
  [training.score_weights]
128
  ents_f = 1.0
 
133
  [pretraining]
134
 
135
  [initialize]
136
+ vectors = ${paths.vectors}
137
  init_tok2vec = ${paths.init_tok2vec}
138
  vocab_data = null
139
  lookups = null
meta.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "lang":"xx",
3
  "name":"eb_ner",
4
- "version":"0.2.1",
5
  "description":"",
6
- "author":"",
7
  "email":"",
8
  "url":"",
9
- "license":"",
10
- "spacy_version":">=3.7.4,<3.8.0",
11
- "spacy_git_version":"bff8725f4",
12
  "vectors":{
13
  "width":0,
14
  "vectors":0,
@@ -36,7 +36,31 @@
36
  "disabled":[
37
 
38
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  "requirements":[
40
-
41
  ]
42
  }
 
1
  {
2
  "lang":"xx",
3
  "name":"eb_ner",
4
+ "version":"0.3.0",
5
  "description":"",
6
+ "author":"philipp-zettl",
7
  "email":"",
8
  "url":"",
9
+ "license":"cc-by-nc-sa-4.0",
10
+ "spacy_version":">=3.8.2,<3.9.0",
11
+ "spacy_git_version":"63f1b53",
12
  "vectors":{
13
  "width":0,
14
  "vectors":0,
 
36
  "disabled":[
37
 
38
  ],
39
+ "performance":{
40
+ "ents_f":0.9975959696,
41
+ "ents_p":0.9976688647,
42
+ "ents_r":0.9975230852,
43
+ "ents_per_type":{
44
+ "COURSE_NAME":{
45
+ "p":0.9998858916,
46
+ "r":0.9998174391,
47
+ "f":0.9998516642
48
+ },
49
+ "LOCATION":{
50
+ "p":0.9965569206,
51
+ "r":0.9994995296,
52
+ "f":0.9980260561
53
+ },
54
+ "JOB_TITLE":{
55
+ "p":0.9939871152,
56
+ "r":0.9873836902,
57
+ "f":0.9906743989
58
+ }
59
+ },
60
+ "tok2vec_loss":256.5916271971,
61
+ "ner_loss":91.4045944214
62
+ },
63
  "requirements":[
64
+ "spacy>=3.8.2,<3.9.0"
65
  ]
66
  }
ner/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5293ff2b0fe84b2f88062cce2c03daf9e424386a143e5fdc98f87f6bba6c7e39
3
- size 3835000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df892bf6343b803a827590e5b21d78a2f9ed69c0395505d47ac1197a35df0667
3
+ size 170556
ner/moves CHANGED
@@ -1 +1 @@
1
- ��moves�${"0":{},"1":{"COURSE_NAME":1955153,"JOB_TITLE":1206960,"LOCATION":1154534},"2":{"COURSE_NAME":1955153,"JOB_TITLE":1206960,"LOCATION":1154534},"3":{"COURSE_NAME":1955153,"JOB_TITLE":1206960,"LOCATION":1154534},"4":{"COURSE_NAME":1955153,"JOB_TITLE":1206960,"LOCATION":1154534,"":1},"5":{"":1}}�cfg��neg_key�
 
1
+ ��moves�{"0":{},"1":{"COURSE_NAME":357554,"LOCATION":244355,"JOB_TITLE":196831},"2":{"COURSE_NAME":357554,"LOCATION":244355,"JOB_TITLE":196831},"3":{"COURSE_NAME":357554,"LOCATION":244355,"JOB_TITLE":196831},"4":{"COURSE_NAME":357554,"LOCATION":244355,"JOB_TITLE":196831,"":1},"5":{"":1}}�cfg��neg_key�
tok2vec/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c887bb91d8fb38fbfdf3dbf994d61cfb54b7ce900b89bec0e6c92b8f023f7ee7
3
- size 3705091
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e05d0d04bc51148a3f0b10159072174c6354293ed84d0250e58f9937ca0f82b
3
+ size 34126801
vocab/strings.json CHANGED
The diff for this file is too large to render. See raw diff
 
xx_eb_ner-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38309fa0b6607c4d8b130659210e2cd400241ce588039e6fede3efbbd4ab1912
3
- size 7796938
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:768b333089d8b2628a05c9219aa20597ae2a0defbc516826c892c846be7436ec
3
+ size 31813476