KoichiYasuoka committed on
Commit
e2708f6
1 Parent(s): 324dfb8

initial release

README.md ADDED
@@ -0,0 +1,93 @@
+ ---
+ language:
+ - "lzh"
+ tags:
+ - "classical chinese"
+ - "literary chinese"
+ - "ancient chinese"
+ - "question-answering"
+ - "dependency-parsing"
+ datasets:
+ - "universal_dependencies"
+ license: "apache-2.0"
+ pipeline_tag: "question-answering"
+ widget:
+ - text: "穴"
+   context: "不入虎穴不得虎子"
+ - text: "子"
+   context: "不入虎穴不得虎子"
+ - text: "不"
+   context: "[MASK]入虎穴不得虎子"
+ ---
+
+ # bert-ancient-chinese-base-ud-head
+
+ ## Model Description
+
+ This is a BERT model pre-trained on Classical Chinese texts for dependency parsing (head detection on long-unit-words), cast as question-answering. It is derived from [bert-ancient-chinese](https://huggingface.co/Jihuai/bert-ancient-chinese) and [UD_Classical_Chinese-Kyoto](https://github.com/UniversalDependencies/UD_Classical_Chinese-Kyoto). When the word given as `question` occurs more than once in `context`, replace the occurrence you are asking about with [MASK] to avoid ambiguity.
+
+ ## How to Use
+
+ ```py
+ from transformers import AutoTokenizer,AutoModelForQuestionAnswering,QuestionAnsweringPipeline
+ tokenizer=AutoTokenizer.from_pretrained("KoichiYasuoka/bert-ancient-chinese-base-ud-head")
+ model=AutoModelForQuestionAnswering.from_pretrained("KoichiYasuoka/bert-ancient-chinese-base-ud-head")
+ qap=QuestionAnsweringPipeline(tokenizer=tokenizer,model=model)
+ print(qap(question="穴",context="不入虎穴不得虎子"))
+ ```
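+
+ For a word that occurs more than once, the head can be queried unambiguously by masking the occurrence in question, as in the widget examples above; a minimal sketch reusing the `qap` pipeline defined above:
+
+ ```py
+ # "不" occurs twice: [MASK] marks the occurrence whose head is asked for
+ print(qap(question="不",context="[MASK]入虎穴不得虎子"))
+ ```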
+
+ or (with [ufal.chu-liu-edmonds](https://pypi.org/project/ufal.chu-liu-edmonds/))
+
+ ```py
+ class TransformersUD(object):
+   def __init__(self,bert):
+     import os
+     from transformers import (AutoTokenizer,AutoModelForQuestionAnswering,
+       AutoModelForTokenClassification,AutoConfig,TokenClassificationPipeline)
+     self.tokenizer=AutoTokenizer.from_pretrained(bert)
+     self.model=AutoModelForQuestionAnswering.from_pretrained(bert)
+     x=AutoModelForTokenClassification.from_pretrained
+     if os.path.isdir(bert):
+       d,t=x(os.path.join(bert,"deprel")),x(os.path.join(bert,"tagger"))
+     else:
+       from transformers.file_utils import hf_bucket_url
+       c=AutoConfig.from_pretrained(hf_bucket_url(bert,"deprel/config.json"))
+       d=x(hf_bucket_url(bert,"deprel/pytorch_model.bin"),config=c)
+       s=AutoConfig.from_pretrained(hf_bucket_url(bert,"tagger/config.json"))
+       t=x(hf_bucket_url(bert,"tagger/pytorch_model.bin"),config=s)
+     self.deprel=TokenClassificationPipeline(model=d,tokenizer=self.tokenizer,
+       aggregation_strategy="simple")
+     self.tagger=TokenClassificationPipeline(model=t,tokenizer=self.tokenizer)
+   def __call__(self,text):
+     import numpy,torch,ufal.chu_liu_edmonds
+     # long-unit-words (start,end,deprel) from the deprel pipeline
+     w=[(t["start"],t["end"],t["entity_group"]) for t in self.deprel(text)]
+     # UPOS|FEATS label for each word start, and the number of words
+     z,n={t["start"]:t["entity"].split("|") for t in self.tagger(text)},len(w)
+     r,m=[text[s:e] for s,e,p in w],numpy.full((n+1,n+1),numpy.nan)
+     v,c=self.tokenizer(r,add_special_tokens=False)["input_ids"],[]
+     # one question-answering input per word, masking the word whose head is sought
+     for i,t in enumerate(v):
+       q=[self.tokenizer.cls_token_id]+t+[self.tokenizer.sep_token_id]
+       c.append([q]+v[0:i]+[[self.tokenizer.mask_token_id]]+v[i+1:]+[[q[-1]]])
+     b=[[len(sum(x[0:j+1],[])) for j in range(len(x))] for x in c]
+     d=self.model(input_ids=torch.tensor([sum(x,[]) for x in c]),
+       token_type_ids=torch.tensor([[0]*x[0]+[1]*(x[-1]-x[0]) for x in b]))
+     s,e=d.start_logits.tolist(),d.end_logits.tolist()
+     # score every (dependent,head) pair; column 0 is the root
+     for i in range(n):
+       for j in range(n):
+         m[i+1,0 if i==j else j+1]=s[i][b[i][j]]+e[i][b[i][j+1]-1]
+     h=ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
+     # force a single root when the spanning tree does not have exactly one
+     if [0 for i in h if i==0]!=[0]:
+       i=([p for s,e,p in w]+["root"]).index("root")
+       j=i+1 if i<n else numpy.nanargmax(m[:,0])
+       m[0:j,0]=m[j+1:,0]=numpy.nan
+       h=ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
+     # emit CoNLL-U
+     u="# text = "+text.replace("\n"," ")+"\n"
+     for i,(s,e,p) in enumerate(w,1):
+       p="root" if h[i]==0 else "dep" if p=="root" else p
+       u+="\t".join([str(i),r[i-1],"_",z[s][0][2:],"_","|".join(z[s][1:]),
+         str(h[i]),p,"_","_" if i<n and w[i][0]<e else "SpaceAfter=No"])+"\n"
+     return u+"\n"
+
+ nlp=TransformersUD("KoichiYasuoka/bert-ancient-chinese-base-ud-head")
+ print(nlp("不入虎穴不得虎子"))
+ ```
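+
+ The parser returns plain CoNLL-U text. As an optional sketch (assuming the third-party [conllu](https://pypi.org/project/conllu/) package, which the model itself does not require), the output can be read back into token lists:
+
+ ```py
+ import conllu
+ for sentence in conllu.parse(nlp("不入虎穴不得虎子")):
+   for token in sentence:
+     # one long-unit-word per token: index, surface form, head index, dependency relation
+     print(token["id"],token["form"],token["head"],token["deprel"])
+ ```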
config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "architectures": [
+     "BertForQuestionAnswering"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "directionality": "bidi",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "lstm_dropout_prob": 0.5,
+   "lstm_embedding_size": 768,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "position_embedding_type": "absolute",
+   "tokenizer_class": "BertTokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.19.4",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 38208
+ }
deprel/config.json ADDED
@@ -0,0 +1,170 @@
+ {
+   "architectures": [
+     "BertForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "directionality": "bidi",
+   "finetuning_task": "pos",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "B-acl",
+     "1": "B-advcl",
+     "2": "B-advmod",
+     "3": "B-amod",
+     "4": "B-aux",
+     "5": "B-case",
+     "6": "B-cc",
+     "7": "B-ccomp",
+     "8": "B-clf",
+     "9": "B-compound",
+     "10": "B-compound:redup",
+     "11": "B-conj",
+     "12": "B-cop",
+     "13": "B-csubj",
+     "14": "B-csubj:pass",
+     "15": "B-det",
+     "16": "B-discourse",
+     "17": "B-discourse:sp",
+     "18": "B-dislocated",
+     "19": "B-expl",
+     "20": "B-fixed",
+     "21": "B-flat",
+     "22": "B-flat:foreign",
+     "23": "B-flat:vv",
+     "24": "B-iobj",
+     "25": "B-list",
+     "26": "B-mark",
+     "27": "B-nmod",
+     "28": "B-nsubj",
+     "29": "B-nsubj:pass",
+     "30": "B-nummod",
+     "31": "B-obj",
+     "32": "B-obl",
+     "33": "B-obl:lmod",
+     "34": "B-obl:tmod",
+     "35": "B-orphan",
+     "36": "B-parataxis",
+     "37": "B-root",
+     "38": "B-vocative",
+     "39": "B-xcomp",
+     "40": "I-acl",
+     "41": "I-advcl",
+     "42": "I-advmod",
+     "43": "I-amod",
+     "44": "I-ccomp",
+     "45": "I-clf",
+     "46": "I-compound",
+     "47": "I-conj",
+     "48": "I-csubj",
+     "49": "I-dislocated",
+     "50": "I-flat",
+     "51": "I-flat:foreign",
+     "52": "I-iobj",
+     "53": "I-list",
+     "54": "I-nmod",
+     "55": "I-nsubj",
+     "56": "I-nsubj:pass",
+     "57": "I-nummod",
+     "58": "I-obj",
+     "59": "I-obl",
+     "60": "I-obl:lmod",
+     "61": "I-obl:tmod",
+     "62": "I-parataxis",
+     "63": "I-root",
+     "64": "I-vocative",
+     "65": "I-xcomp"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "B-acl": 0,
+     "B-advcl": 1,
+     "B-advmod": 2,
+     "B-amod": 3,
+     "B-aux": 4,
+     "B-case": 5,
+     "B-cc": 6,
+     "B-ccomp": 7,
+     "B-clf": 8,
+     "B-compound": 9,
+     "B-compound:redup": 10,
+     "B-conj": 11,
+     "B-cop": 12,
+     "B-csubj": 13,
+     "B-csubj:pass": 14,
+     "B-det": 15,
+     "B-discourse": 16,
+     "B-discourse:sp": 17,
+     "B-dislocated": 18,
+     "B-expl": 19,
+     "B-fixed": 20,
+     "B-flat": 21,
+     "B-flat:foreign": 22,
+     "B-flat:vv": 23,
+     "B-iobj": 24,
+     "B-list": 25,
+     "B-mark": 26,
+     "B-nmod": 27,
+     "B-nsubj": 28,
+     "B-nsubj:pass": 29,
+     "B-nummod": 30,
+     "B-obj": 31,
+     "B-obl": 32,
+     "B-obl:lmod": 33,
+     "B-obl:tmod": 34,
+     "B-orphan": 35,
+     "B-parataxis": 36,
+     "B-root": 37,
+     "B-vocative": 38,
+     "B-xcomp": 39,
+     "I-acl": 40,
+     "I-advcl": 41,
+     "I-advmod": 42,
+     "I-amod": 43,
+     "I-ccomp": 44,
+     "I-clf": 45,
+     "I-compound": 46,
+     "I-conj": 47,
+     "I-csubj": 48,
+     "I-dislocated": 49,
+     "I-flat": 50,
+     "I-flat:foreign": 51,
+     "I-iobj": 52,
+     "I-list": 53,
+     "I-nmod": 54,
+     "I-nsubj": 55,
+     "I-nsubj:pass": 56,
+     "I-nummod": 57,
+     "I-obj": 58,
+     "I-obl": 59,
+     "I-obl:lmod": 60,
+     "I-obl:tmod": 61,
+     "I-parataxis": 62,
+     "I-root": 63,
+     "I-vocative": 64,
+     "I-xcomp": 65
+   },
+   "layer_norm_eps": 1e-12,
+   "lstm_dropout_prob": 0.5,
+   "lstm_embedding_size": 768,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "position_embedding_type": "absolute",
+   "tokenizer_class": "BertTokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.19.4",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 38208
+ }
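
The `deprel` head above tags every character with a B-/I- prefixed Universal Dependencies relation, which the `aggregation_strategy="simple"` pipeline in the README groups into long-unit-words. As an illustrative sketch (assuming the `huggingface_hub` package, which `transformers` installs as a dependency), the relation inventory behind these labels can be listed like this:

```py
import json
from huggingface_hub import hf_hub_download
# fetch deprel/config.json from this repository and strip the B-/I- chunk prefixes
path=hf_hub_download("KoichiYasuoka/bert-ancient-chinese-base-ud-head",filename="config.json",subfolder="deprel")
with open(path,encoding="utf-8") as f:
  labels=json.load(f)["label2id"]
print(sorted({label.split("-",1)[1] for label in labels}))
```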
deprel/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:69e2b53116cbea8d40e7d0d33c23e21e7fa7954d8abc0eef70174acbf8afefee
+ size 459454608
deprel/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
deprel/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "do_basic_tokenize": true, "never_split": null, "special_tokens_map_file": null, "tokenizer_class": "BertTokenizer"}
deprel/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:467e6868c713cca9f8de470c77e86349ee08a6c58eca584d1ccea735ee15fb07
+ size 459254577
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tagger/config.json ADDED
@@ -0,0 +1,174 @@
+ {
+   "architectures": [
+     "BertForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "directionality": "bidi",
+   "finetuning_task": "pos",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "B-ADP|Degree=Equ",
+     "1": "B-ADP|_",
+     "2": "B-ADV|AdvType=Cau",
+     "3": "B-ADV|AdvType=Deg|Degree=Cmp",
+     "4": "B-ADV|AdvType=Deg|Degree=Pos",
+     "5": "B-ADV|AdvType=Deg|Degree=Sup",
+     "6": "B-ADV|AdvType=Tim",
+     "7": "B-ADV|AdvType=Tim|Aspect=Perf",
+     "8": "B-ADV|AdvType=Tim|Tense=Fut",
+     "9": "B-ADV|AdvType=Tim|Tense=Past",
+     "10": "B-ADV|AdvType=Tim|Tense=Pres",
+     "11": "B-ADV|Degree=Equ|VerbForm=Conv",
+     "12": "B-ADV|Degree=Pos|VerbForm=Conv",
+     "13": "B-ADV|Polarity=Neg",
+     "14": "B-ADV|Polarity=Neg|VerbForm=Conv",
+     "15": "B-ADV|VerbForm=Conv",
+     "16": "B-ADV|_",
+     "17": "B-AUX|Mood=Des",
+     "18": "B-AUX|Mood=Nec",
+     "19": "B-AUX|Mood=Pot",
+     "20": "B-AUX|VerbType=Cop",
+     "21": "B-AUX|Voice=Pass",
+     "22": "B-CCONJ|_",
+     "23": "B-INTJ|_",
+     "24": "B-NOUN|Case=Loc",
+     "25": "B-NOUN|Case=Tem",
+     "26": "B-NOUN|NounType=Clf",
+     "27": "B-NOUN|_",
+     "28": "B-NUM|NumType=Ord",
+     "29": "B-NUM|_",
+     "30": "B-PART|_",
+     "31": "B-PRON|Person=1|PronType=Prs",
+     "32": "B-PRON|Person=2|PronType=Prs",
+     "33": "B-PRON|Person=3|PronType=Prs",
+     "34": "B-PRON|PronType=Dem",
+     "35": "B-PRON|PronType=Int",
+     "36": "B-PRON|PronType=Prs",
+     "37": "B-PRON|PronType=Prs|Reflex=Yes",
+     "38": "B-PROPN|Case=Loc|NameType=Geo",
+     "39": "B-PROPN|Case=Loc|NameType=Nat",
+     "40": "B-PROPN|NameType=Giv",
+     "41": "B-PROPN|NameType=Prs",
+     "42": "B-PROPN|NameType=Sur",
+     "43": "B-SCONJ|_",
+     "44": "B-SYM|_",
+     "45": "B-VERB|Degree=Equ",
+     "46": "B-VERB|Degree=Equ|VerbForm=Part",
+     "47": "B-VERB|Degree=Pos",
+     "48": "B-VERB|Degree=Pos|VerbForm=Part",
+     "49": "B-VERB|Polarity=Neg",
+     "50": "B-VERB|Polarity=Neg|VerbForm=Part",
+     "51": "B-VERB|VerbForm=Part",
+     "52": "B-VERB|_",
+     "53": "I-ADV|VerbForm=Conv",
+     "54": "I-NOUN|Case=Loc",
+     "55": "I-NOUN|Case=Tem",
+     "56": "I-NOUN|_",
+     "57": "I-NUM|NumType=Ord",
+     "58": "I-NUM|_",
+     "59": "I-PROPN|Case=Loc|NameType=Geo",
+     "60": "I-PROPN|Case=Loc|NameType=Nat",
+     "61": "I-PROPN|NameType=Giv",
+     "62": "I-PROPN|NameType=Prs",
+     "63": "I-PROPN|NameType=Sur",
+     "64": "I-VERB|Degree=Equ",
+     "65": "I-VERB|Degree=Pos",
+     "66": "I-VERB|VerbForm=Part",
+     "67": "I-VERB|_"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "B-ADP|Degree=Equ": 0,
+     "B-ADP|_": 1,
+     "B-ADV|AdvType=Cau": 2,
+     "B-ADV|AdvType=Deg|Degree=Cmp": 3,
+     "B-ADV|AdvType=Deg|Degree=Pos": 4,
+     "B-ADV|AdvType=Deg|Degree=Sup": 5,
+     "B-ADV|AdvType=Tim": 6,
+     "B-ADV|AdvType=Tim|Aspect=Perf": 7,
+     "B-ADV|AdvType=Tim|Tense=Fut": 8,
+     "B-ADV|AdvType=Tim|Tense=Past": 9,
+     "B-ADV|AdvType=Tim|Tense=Pres": 10,
+     "B-ADV|Degree=Equ|VerbForm=Conv": 11,
+     "B-ADV|Degree=Pos|VerbForm=Conv": 12,
+     "B-ADV|Polarity=Neg": 13,
+     "B-ADV|Polarity=Neg|VerbForm=Conv": 14,
+     "B-ADV|VerbForm=Conv": 15,
+     "B-ADV|_": 16,
+     "B-AUX|Mood=Des": 17,
+     "B-AUX|Mood=Nec": 18,
+     "B-AUX|Mood=Pot": 19,
+     "B-AUX|VerbType=Cop": 20,
+     "B-AUX|Voice=Pass": 21,
+     "B-CCONJ|_": 22,
+     "B-INTJ|_": 23,
+     "B-NOUN|Case=Loc": 24,
+     "B-NOUN|Case=Tem": 25,
+     "B-NOUN|NounType=Clf": 26,
+     "B-NOUN|_": 27,
+     "B-NUM|NumType=Ord": 28,
+     "B-NUM|_": 29,
+     "B-PART|_": 30,
+     "B-PRON|Person=1|PronType=Prs": 31,
+     "B-PRON|Person=2|PronType=Prs": 32,
+     "B-PRON|Person=3|PronType=Prs": 33,
+     "B-PRON|PronType=Dem": 34,
+     "B-PRON|PronType=Int": 35,
+     "B-PRON|PronType=Prs": 36,
+     "B-PRON|PronType=Prs|Reflex=Yes": 37,
+     "B-PROPN|Case=Loc|NameType=Geo": 38,
+     "B-PROPN|Case=Loc|NameType=Nat": 39,
+     "B-PROPN|NameType=Giv": 40,
+     "B-PROPN|NameType=Prs": 41,
+     "B-PROPN|NameType=Sur": 42,
+     "B-SCONJ|_": 43,
+     "B-SYM|_": 44,
+     "B-VERB|Degree=Equ": 45,
+     "B-VERB|Degree=Equ|VerbForm=Part": 46,
+     "B-VERB|Degree=Pos": 47,
+     "B-VERB|Degree=Pos|VerbForm=Part": 48,
+     "B-VERB|Polarity=Neg": 49,
+     "B-VERB|Polarity=Neg|VerbForm=Part": 50,
+     "B-VERB|VerbForm=Part": 51,
+     "B-VERB|_": 52,
+     "I-ADV|VerbForm=Conv": 53,
+     "I-NOUN|Case=Loc": 54,
+     "I-NOUN|Case=Tem": 55,
+     "I-NOUN|_": 56,
+     "I-NUM|NumType=Ord": 57,
+     "I-NUM|_": 58,
+     "I-PROPN|Case=Loc|NameType=Geo": 59,
+     "I-PROPN|Case=Loc|NameType=Nat": 60,
+     "I-PROPN|NameType=Giv": 61,
+     "I-PROPN|NameType=Prs": 62,
+     "I-PROPN|NameType=Sur": 63,
+     "I-VERB|Degree=Equ": 64,
+     "I-VERB|Degree=Pos": 65,
+     "I-VERB|VerbForm=Part": 66,
+     "I-VERB|_": 67
+   },
+   "layer_norm_eps": 1e-12,
+   "lstm_dropout_prob": 0.5,
+   "lstm_embedding_size": 768,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_fc_size": 768,
+   "pooler_num_attention_heads": 12,
+   "pooler_num_fc_layers": 3,
+   "pooler_size_per_head": 128,
+   "pooler_type": "first_token_transform",
+   "position_embedding_type": "absolute",
+   "tokenizer_class": "BertTokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.19.4",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 38208
+ }
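
Each `tagger` label above packs a B-/I- chunk prefix, a UPOS tag, and the UD features into a single "|"-separated string; the `TransformersUD` code in the README recovers them with `split("|")`. A minimal decoding sketch (plain string handling, no model call):

```py
# decode one tagger label the way TransformersUD does
label="B-VERB|Degree=Pos|VerbForm=Part"
parts=label.split("|")
upos=parts[0][2:]            # drop the "B-"/"I-" prefix -> "VERB"
feats="|".join(parts[1:])    # the remaining items are UD features -> "Degree=Pos|VerbForm=Part"
print(upos,feats)
```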
tagger/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e054845e4212ceb70ee2fc8ae172efa45b284aea5abd445b6f4fd07ba8f62a00
+ size 459460755
tagger/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tagger/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "do_basic_tokenize": true, "never_split": null, "special_tokens_map_file": null, "tokenizer_class": "BertTokenizer"}
tagger/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer"}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff