Omartificial-Intelligence-Space committed on
Commit
e302082
·
verified ·
1 Parent(s): d002af9

update tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +16 -5
tokenizer_config.json CHANGED
@@ -27,12 +27,20 @@
27
  "3": {
28
  "content": "<unk>",
29
  "lstrip": false,
 
 
 
 
 
 
 
 
30
  "normalized": false,
31
  "rstrip": false,
32
  "single_word": false,
33
  "special": true
34
  },
35
- "250001": {
36
  "content": "<mask>",
37
  "lstrip": true,
38
  "normalized": false,
@@ -44,18 +52,21 @@
44
  "bos_token": "<s>",
45
  "clean_up_tokenization_spaces": true,
46
  "cls_token": "<s>",
 
47
  "eos_token": "</s>",
48
  "mask_token": "<mask>",
49
- "max_length": 128,
50
- "model_max_length": 128,
51
  "pad_to_multiple_of": null,
52
  "pad_token": "<pad>",
53
  "pad_token_type_id": 0,
54
  "padding_side": "right",
55
  "sep_token": "</s>",
56
  "stride": 0,
57
- "tokenizer_class": "XLMRobertaTokenizer",
 
 
58
  "truncation_side": "right",
59
  "truncation_strategy": "longest_first",
60
- "unk_token": "<unk>"
61
  }
 
27
  "3": {
28
  "content": "<unk>",
29
  "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "104": {
36
+ "content": "[UNK]",
37
+ "lstrip": false,
38
  "normalized": false,
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
42
  },
43
+ "30526": {
44
  "content": "<mask>",
45
  "lstrip": true,
46
  "normalized": false,
 
52
  "bos_token": "<s>",
53
  "clean_up_tokenization_spaces": true,
54
  "cls_token": "<s>",
55
+ "do_lower_case": true,
56
  "eos_token": "</s>",
57
  "mask_token": "<mask>",
58
+ "max_length": 384,
59
+ "model_max_length": 384,
60
  "pad_to_multiple_of": null,
61
  "pad_token": "<pad>",
62
  "pad_token_type_id": 0,
63
  "padding_side": "right",
64
  "sep_token": "</s>",
65
  "stride": 0,
66
+ "strip_accents": null,
67
+ "tokenize_chinese_chars": true,
68
+ "tokenizer_class": "MPNetTokenizer",
69
  "truncation_side": "right",
70
  "truncation_strategy": "longest_first",
71
+ "unk_token": "[UNK]"
72
  }