KoichiYasuoka committed
Commit 2d8da30 (1 parent: 32857c2)

model improved

config.json CHANGED
@@ -20,12 +20,15 @@
   "pooler_dropout": 0,
   "pooler_hidden_act": "gelu",
   "pooler_hidden_size": 768,
- "pos_att_type": null,
- "position_biased_input": true,
- "relative_attention": false,
+ "pos_att_type": [
+   "p2c",
+   "c2p"
+ ],
+ "position_biased_input": false,
+ "relative_attention": true,
   "tokenizer_class": "DebertaV2TokenizerFast",
   "torch_dtype": "float32",
- "transformers_version": "4.19.4",
+ "transformers_version": "4.22.1",
   "type_vocab_size": 0,
   "vocab_size": 32000
  }
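The hunk above switches the base config from absolute position embeddings to DeBERTa's disentangled relative attention: "relative_attention" is turned on with "pos_att_type" ["p2c", "c2p"], and "position_biased_input" is set to false, so word positions enter through the attention mechanism rather than the input embeddings. A minimal sketch to confirm the new settings after pulling the repo (the repo id is taken from maker.py's tgt further below; a transformers version of 4.22 or newer is assumed):

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("KoichiYasuoka/deberta-base-japanese-aozora-ud-head")
print(cfg.relative_attention)     # True after this commit
print(cfg.pos_att_type)           # ["p2c", "c2p"]
print(cfg.position_biased_input)  # False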
deprel/config.json CHANGED
@@ -5,7 +5,6 @@
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
   "eos_token_id": 2,
- "finetuning_task": "pos",
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
@@ -21,43 +20,44 @@
   "8": "B-compound",
   "9": "B-cop",
   "10": "B-csubj",
- "11": "B-dep",
- "12": "B-det",
- "13": "B-discourse",
- "14": "B-dislocated",
+ "11": "B-csubj:outer",
+ "12": "B-dep",
+ "13": "B-det",
+ "14": "B-discourse",
   "15": "B-fixed",
   "16": "B-mark",
   "17": "B-nmod",
   "18": "B-nsubj",
- "19": "B-nummod",
- "20": "B-obj",
- "21": "B-obl",
- "22": "B-punct",
- "23": "B-root",
- "24": "I-acl",
- "25": "I-advcl",
- "26": "I-advmod",
- "27": "I-amod",
- "28": "I-aux",
- "29": "I-case",
- "30": "I-cc",
- "31": "I-ccomp",
- "32": "I-compound",
- "33": "I-cop",
- "34": "I-csubj",
- "35": "I-dep",
- "36": "I-det",
- "37": "I-discourse",
- "38": "I-dislocated",
- "39": "I-fixed",
- "40": "I-mark",
- "41": "I-nmod",
- "42": "I-nsubj",
- "43": "I-nummod",
- "44": "I-obj",
- "45": "I-obl",
- "46": "I-punct",
- "47": "I-root"
+ "19": "B-nsubj:outer",
+ "20": "B-nummod",
+ "21": "B-obj",
+ "22": "B-obl",
+ "23": "B-punct",
+ "24": "B-root",
+ "25": "I-acl",
+ "26": "I-advcl",
+ "27": "I-advmod",
+ "28": "I-amod",
+ "29": "I-aux",
+ "30": "I-case",
+ "31": "I-cc",
+ "32": "I-ccomp",
+ "33": "I-compound",
+ "34": "I-cop",
+ "35": "I-csubj",
+ "36": "I-csubj:outer",
+ "37": "I-dep",
+ "38": "I-det",
+ "39": "I-discourse",
+ "40": "I-fixed",
+ "41": "I-mark",
+ "42": "I-nmod",
+ "43": "I-nsubj",
+ "44": "I-nsubj:outer",
+ "45": "I-nummod",
+ "46": "I-obj",
+ "47": "I-obl",
+ "48": "I-root"
   },
   "initializer_range": 0.02,
   "intermediate_size": 3072,
@@ -73,43 +73,44 @@
   "B-compound": 8,
   "B-cop": 9,
   "B-csubj": 10,
- "B-dep": 11,
- "B-det": 12,
- "B-discourse": 13,
- "B-dislocated": 14,
+ "B-csubj:outer": 11,
+ "B-dep": 12,
+ "B-det": 13,
+ "B-discourse": 14,
   "B-fixed": 15,
   "B-mark": 16,
   "B-nmod": 17,
   "B-nsubj": 18,
- "B-nummod": 19,
- "B-obj": 20,
- "B-obl": 21,
- "B-punct": 22,
- "B-root": 23,
- "I-acl": 24,
- "I-advcl": 25,
- "I-advmod": 26,
- "I-amod": 27,
- "I-aux": 28,
- "I-case": 29,
- "I-cc": 30,
- "I-ccomp": 31,
- "I-compound": 32,
- "I-cop": 33,
- "I-csubj": 34,
- "I-dep": 35,
- "I-det": 36,
- "I-discourse": 37,
- "I-dislocated": 38,
- "I-fixed": 39,
- "I-mark": 40,
- "I-nmod": 41,
- "I-nsubj": 42,
- "I-nummod": 43,
- "I-obj": 44,
- "I-obl": 45,
- "I-punct": 46,
- "I-root": 47
+ "B-nsubj:outer": 19,
+ "B-nummod": 20,
+ "B-obj": 21,
+ "B-obl": 22,
+ "B-punct": 23,
+ "B-root": 24,
+ "I-acl": 25,
+ "I-advcl": 26,
+ "I-advmod": 27,
+ "I-amod": 28,
+ "I-aux": 29,
+ "I-case": 30,
+ "I-cc": 31,
+ "I-ccomp": 32,
+ "I-compound": 33,
+ "I-cop": 34,
+ "I-csubj": 35,
+ "I-csubj:outer": 36,
+ "I-dep": 37,
+ "I-det": 38,
+ "I-discourse": 39,
+ "I-fixed": 40,
+ "I-mark": 41,
+ "I-nmod": 42,
+ "I-nsubj": 43,
+ "I-nsubj:outer": 44,
+ "I-nummod": 45,
+ "I-obj": 46,
+ "I-obl": 47,
+ "I-root": 48
   },
   "layer_norm_eps": 1e-07,
   "max_position_embeddings": 512,
@@ -121,12 +122,15 @@
   "pooler_dropout": 0,
   "pooler_hidden_act": "gelu",
   "pooler_hidden_size": 768,
- "pos_att_type": null,
- "position_biased_input": true,
- "relative_attention": false,
+ "pos_att_type": [
+   "p2c",
+   "c2p"
+ ],
+ "position_biased_input": false,
+ "relative_attention": true,
   "tokenizer_class": "DebertaV2TokenizerFast",
   "torch_dtype": "float32",
- "transformers_version": "4.19.4",
+ "transformers_version": "4.22.1",
   "type_vocab_size": 0,
   "vocab_size": 32000
  }
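Besides the same attention-config change, the deprel label inventory grows from 48 to 49 labels: B-/I-csubj:outer and B-/I-nsubj:outer are added (while B-/I-dislocated and I-punct no longer appear), so every id above 10 shifts. A minimal sketch for loading the retrained classifier from the deprel/ subfolder and inspecting the updated label set; the subfolder loading call and the example sentence are illustrative assumptions, not part of this commit:

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

repo = "KoichiYasuoka/deberta-base-japanese-aozora-ud-head"
tkz = AutoTokenizer.from_pretrained(repo, subfolder="deprel")
mdl = AutoModelForTokenClassification.from_pretrained(repo, subfolder="deprel")
print(sorted(mdl.config.label2id))  # now includes B-csubj:outer and B-nsubj:outer
nlp = pipeline("token-classification", model=mdl, tokenizer=tkz)
print(nlp("国境の長いトンネルを抜けると雪国であった"))  # one B-/I-deprel tag per subword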
deprel/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1dbb753611452b4eb17d790bf19edbb77c903a7f93f9cca09d9faa266b5aca49
- size 440319475
+ oid sha256:458a4b080dd20785d2f7b9501923d34458b62a8e3684833a5fd0e4b3c547e2b4
+ size 498609043
deprel/special_tokens_map.json CHANGED
@@ -1 +1,9 @@
- {"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
+ {
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
deprel/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
deprel/tokenizer_config.json CHANGED
@@ -1 +1,14 @@
- {"do_lower_case": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "split_by_punct": true, "keep_accents": true, "model_max_length": 512, "tokenizer_class": "DebertaV2TokenizerFast"}
+ {
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "eos_token": "[SEP]",
+   "keep_accents": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "split_by_punct": true,
+   "tokenizer_class": "DebertaV2TokenizerFast",
+   "unk_token": "[UNK]"
+ }
maker.py ADDED
@@ -0,0 +1,110 @@
+ #! /usr/bin/python3
+ import os
+ src="KoichiYasuoka/deberta-base-japanese-aozora"
+ tgt="KoichiYasuoka/deberta-base-japanese-aozora-ud-head"
+ url="https://github.com/UniversalDependencies/UD_Japanese-GSDLUW"
+ d=os.path.basename(url)
+ os.system("test -d {} || git clone --depth=1 {}".format(d,url))
+ os.system("for F in train dev test ; do cp "+d+"/*-$F*.conllu $F.conllu ; done")
+ from transformers import (AutoTokenizer,AutoModelForQuestionAnswering,
+   AutoModelForTokenClassification,AutoConfig,DefaultDataCollator,
+   DataCollatorForTokenClassification,TrainingArguments,Trainer)
+ class HEADDataset(object):
+   def __init__(self,conllu,tokenizer,augment=False,length=384):
+     self.qa,self.pad,self.length=[],tokenizer.pad_token_id,length
+     with open(conllu,"r",encoding="utf-8") as r:
+       form,head=[],[]
+       for t in r:
+         w=t.split("\t")
+         if len(w)==10 and w[0].isdecimal():
+           form.append(w[1])
+           head.append(len(head) if w[6]=="0" else int(w[6])-1)
+         elif t.strip()=="" and form!=[]:
+           v=tokenizer(form,add_special_tokens=False)["input_ids"]
+           for i,t in enumerate(v):
+             q=[tokenizer.cls_token_id]+t+[tokenizer.sep_token_id]
+             c=[q]+v[0:i]+[[tokenizer.mask_token_id]]+v[i+1:]+[[q[-1]]]
+             b=[len(sum(c[0:j+1],[])) for j in range(len(c))]
+             if b[-1]<length:
+               self.qa.append((sum(c,[]),head[i],b))
+             if augment and [1 for x in v if t==x]==[1]:
+               c[i+1]=t
+               b=[len(sum(c[0:j+1],[])) for j in range(len(c))]
+               if b[-1]<length:
+                 self.qa.append((sum(c,[]),head[i],b))
+           form,head=[],[]
+   __len__=lambda self:len(self.qa)
+   def __getitem__(self,i):
+     (v,h,b),k=self.qa[i],self.length-self.qa[i][2][-1]
+     return {"input_ids":v+[self.pad]*k,"attention_mask":[1]*b[-1]+[0]*k,
+       "token_type_ids":[0]*b[0]+[1]*(b[-1]-b[0])+[0]*k,
+       "start_positions":b[h],"end_positions":b[h+1]-1}
+ class UPOSDataset(object):
+   def __init__(self,conllu,tokenizer,fields=[3]):
+     self.ids,self.upos=[],[]
+     label,cls,sep=set(),tokenizer.cls_token_id,tokenizer.sep_token_id
+     with open(conllu,"r",encoding="utf-8") as r:
+       form,upos=[],[]
+       for t in r:
+         w=t.split("\t")
+         if len(w)==10 and w[0].isdecimal():
+           form.append(w[1])
+           upos.append("|".join(w[i] for i in fields))
+         elif t.strip()=="" and form!=[]:
+           v,u=tokenizer(form,add_special_tokens=False)["input_ids"],[]
+           for x,y in zip(v,upos):
+             u.extend(["B-"+y]*min(len(x),1)+["I-"+y]*(len(x)-1))
+           if len(u)>tokenizer.model_max_length-4:
+             self.ids.append(sum(v,[])[0:tokenizer.model_max_length-2])
+             self.upos.append(u[0:tokenizer.model_max_length-2])
+           elif len(u)>0:
+             self.ids.append([cls]+sum(v,[])+[sep])
+             self.upos.append([u[0]]+u+[u[0]])
+           label=set(sum([self.upos[-1],list(label)],[]))
+           form,upos=[],[]
+     self.label2id={l:i for i,l in enumerate(sorted(label))}
+   def __call__(*args):
+     label=set(sum([list(t.label2id) for t in args],[]))
+     lid={l:i for i,l in enumerate(sorted(label))}
+     for t in args:
+       t.label2id=lid
+     return lid
+   __len__=lambda self:len(self.ids)
+   __getitem__=lambda self,i:{"input_ids":self.ids[i],
+     "labels":[self.label2id[t] for t in self.upos[i]]}
+ tkz=AutoTokenizer.from_pretrained(src)
+ trainDS=HEADDataset("train.conllu",tkz,True)
+ devDS=HEADDataset("dev.conllu",tkz)
+ arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=8,
+   output_dir="/tmp",overwrite_output_dir=True,save_total_limit=2,
+   evaluation_strategy="epoch",learning_rate=5e-05,warmup_ratio=0.1)
+ trn=Trainer(args=arg,data_collator=DefaultDataCollator(),
+   model=AutoModelForQuestionAnswering.from_pretrained(src),
+   train_dataset=trainDS,eval_dataset=devDS)
+ trn.train()
+ trn.save_model(tgt)
+ tkz.save_pretrained(tgt)
+ trainDS=UPOSDataset("train.conllu",tkz,[7])
+ devDS=UPOSDataset("dev.conllu",tkz,[7])
+ testDS=UPOSDataset("test.conllu",tkz,[7])
+ lid=trainDS(devDS,testDS)
+ cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,
+   id2label={i:l for l,i in lid.items()})
+ trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),
+   model=AutoModelForTokenClassification.from_pretrained(src,config=cfg),
+   train_dataset=trainDS,eval_dataset=devDS)
+ trn.train()
+ trn.save_model(tgt+"/deprel")
+ tkz.save_pretrained(tgt+"/deprel")
+ trainDS=UPOSDataset("train.conllu",tkz,[3,5])
+ devDS=UPOSDataset("dev.conllu",tkz,[3,5])
+ testDS=UPOSDataset("test.conllu",tkz,[3,5])
+ lid=trainDS(devDS,testDS)
+ cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,
+   id2label={i:l for l,i in lid.items()})
+ trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),
+   model=AutoModelForTokenClassification.from_pretrained(src,config=cfg),
+   train_dataset=trainDS,eval_dataset=devDS)
+ trn.train()
+ trn.save_model(tgt+"/tagger")
+ tkz.save_pretrained(tgt+"/tagger")
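maker.py is the training script behind this update: it clones UD_Japanese-GSDLUW, then fine-tunes KoichiYasuoka/deberta-base-japanese-aozora three times. First a question-answering model, where the "question" is one word of the sentence and the answer span is that word's syntactic head (the word itself is replaced by [MASK] in the context), saved at the repo root; then a token classifier over DEPREL labels (CoNLL-U field 7), saved to deprel/; then a token classifier over UPOS|FEATS (fields 3 and 5), saved to tagger/. A minimal inference sketch for the head-detection part, mirroring HEADDataset's input layout; the example sentence and the argmax decoding are illustrative assumptions, not taken from this commit:

import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

repo = "KoichiYasuoka/deberta-base-japanese-aozora-ud-head"
tkz = AutoTokenizer.from_pretrained(repo)
mdl = AutoModelForQuestionAnswering.from_pretrained(repo)

word = "トンネル"                                # the word whose head we ask for
masked = "国境の長い[MASK]を抜けると雪国であった"  # the sentence with that word masked
inputs = tkz(word, masked, return_tensors="pt")  # [CLS] word [SEP] context [SEP]
with torch.no_grad():
  out = mdl(**inputs)
start = out.start_logits.argmax(-1).item()
end = out.end_logits.argmax(-1).item()
print(tkz.decode(inputs["input_ids"][0, start:end + 1]))  # predicted head word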
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7509e3bc46e732e2b4cada2976207b7f735ace7e6be5262a7424c0cc010e2415
- size 440178035
+ oid sha256:d17984f10f5d9d7fbc506fb1d19e0127e3eacb50d29474f692e7f6a2667decb7
+ size 498464467
special_tokens_map.json CHANGED
@@ -1 +1,9 @@
- {"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
+ {
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tagger/config.json CHANGED
@@ -5,7 +5,6 @@
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
   "eos_token_id": 2,
- "finetuning_task": "pos",
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
@@ -43,11 +42,10 @@
   "30": "I-PART|_",
   "31": "I-PRON|_",
   "32": "I-PROPN|_",
- "33": "I-PUNCT|_",
- "34": "I-SCONJ|_",
- "35": "I-SYM|_",
- "36": "I-VERB|_",
- "37": "I-X|_"
+ "33": "I-SCONJ|_",
+ "34": "I-SYM|_",
+ "35": "I-VERB|_",
+ "36": "I-X|_"
   },
   "initializer_range": 0.02,
   "intermediate_size": 3072,
@@ -85,11 +83,10 @@
   "I-PART|_": 30,
   "I-PRON|_": 31,
   "I-PROPN|_": 32,
- "I-PUNCT|_": 33,
- "I-SCONJ|_": 34,
- "I-SYM|_": 35,
- "I-VERB|_": 36,
- "I-X|_": 37
+ "I-SCONJ|_": 33,
+ "I-SYM|_": 34,
+ "I-VERB|_": 35,
+ "I-X|_": 36
   },
   "layer_norm_eps": 1e-07,
   "max_position_embeddings": 512,
@@ -101,12 +98,15 @@
   "pooler_dropout": 0,
   "pooler_hidden_act": "gelu",
   "pooler_hidden_size": 768,
- "pos_att_type": null,
- "position_biased_input": true,
- "relative_attention": false,
+ "pos_att_type": [
+   "p2c",
+   "c2p"
+ ],
+ "position_biased_input": false,
+ "relative_attention": true,
   "tokenizer_class": "DebertaV2TokenizerFast",
   "torch_dtype": "float32",
- "transformers_version": "4.19.4",
+ "transformers_version": "4.22.1",
   "type_vocab_size": 0,
   "vocab_size": 32000
  }
tagger/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e837d28f6b426a84a65c0dd3881b30ffabfb28f3f5f5235472bffc074457b4d5
- size 440288755
+ oid sha256:94a79cb057d18e2aa592ce9a20f4a91aee89b03a71104a6a6afb6cd0291b267c
+ size 498572115
tagger/special_tokens_map.json CHANGED
@@ -1 +1,9 @@
- {"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
+ {
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tagger/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tagger/tokenizer_config.json CHANGED
@@ -1 +1,14 @@
- {"do_lower_case": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "split_by_punct": true, "keep_accents": true, "model_max_length": 512, "tokenizer_class": "DebertaV2TokenizerFast"}
+ {
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "eos_token": "[SEP]",
+   "keep_accents": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "split_by_punct": true,
+   "tokenizer_class": "DebertaV2TokenizerFast",
+   "unk_token": "[UNK]"
+ }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1 +1,14 @@
- {"do_lower_case": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "split_by_punct": true, "keep_accents": true, "model_max_length": 512, "tokenizer_class": "DebertaV2TokenizerFast"}
+ {
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "eos_token": "[SEP]",
+   "keep_accents": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "split_by_punct": true,
+   "tokenizer_class": "DebertaV2TokenizerFast",
+   "unk_token": "[UNK]"
+ }