maykcaldas committed on
Commit
c8310f2
·
1 Parent(s): 430203e

add tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.json +135 -84
  2. tokenizer_config.json +5 -0
tokenizer.json CHANGED
@@ -2,7 +2,53 @@
2
  "version": "1.0",
3
  "truncation": null,
4
  "padding": null,
5
- "added_tokens": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  "normalizer": null,
7
  "pre_tokenizer": {
8
  "type": "WhitespaceSplit"
@@ -15,90 +61,95 @@
15
  "continuing_subword_prefix": "##",
16
  "max_input_chars_per_word": 100,
17
  "vocab": {
18
- "[As+1]": 0,
19
- "[=SH0]": 1,
20
- "[=SH1]": 2,
21
- "[=Ring2]": 3,
22
- "[=Ring1]": 4,
23
- "[CH1]": 5,
24
- "[S]": 6,
25
- "[NH2+1]": 7,
26
- "[B]": 8,
27
- "[C-1]": 9,
28
- "[#C]": 10,
29
- "[=P]": 11,
30
- "[As]": 12,
31
- "[B-1]": 13,
32
- "[bos]": 14,
33
- "[O]": 15,
34
- "[OH0]": 16,
35
- "[I]": 17,
36
- "[nop]": 18,
37
- "[Cl]": 19,
38
- "[SiH2]": 20,
39
- "[Ring1]": 21,
40
- "[Fe-4]": 22,
41
- "[CH0]": 23,
42
- "[Fe]": 24,
43
- "[Fe+2]": 25,
44
- "[CH1-1]": 26,
45
- "[=Branch3]": 27,
46
- "[#Branch1]": 28,
47
- "[=Branch2]": 29,
48
- "[NH0]": 30,
49
- "[N-1]": 31,
50
- "[C]": 32,
51
- "[=NH2+1]": 33,
52
- "[NH1-1]": 34,
53
- "[#N+1]": 35,
54
- "[SeH1]": 36,
55
- "[Branch3]": 37,
56
- "[SH1]": 38,
57
- "[CH2-1]": 39,
58
- "[SH0]": 40,
59
- "[=Se]": 41,
60
- "[NH1+1]": 42,
61
- "[K]": 43,
62
- "[Ring2]": 44,
63
- "[#N]": 45,
64
- "[O-1]": 46,
65
- "[OH1+1]": 47,
66
- "[#Branch2]": 48,
67
- "[=C]": 49,
68
- "[I+1]": 50,
69
- "[Si]": 51,
70
- "[F]": 52,
71
- "[=N+1]": 53,
72
- "[=OH1+1]": 54,
73
- "[Branch2]": 55,
74
- "[=O+1]": 56,
75
- "[#S]": 57,
76
- "[Na]": 58,
77
- "[C+1]": 59,
78
- "[=B]": 60,
79
- "[S+1]": 61,
80
- "[unk]": 62,
81
- "[=Fe]": 63,
82
- "[P]": 64,
83
- "[=N]": 65,
84
- "[SiH1]": 66,
85
  "[NH3+1]": 67,
86
- "[Fe-3]": 68,
87
- "[CH1+1]": 69,
88
- "[Branch1]": 70,
89
- "[Fe+1]": 71,
90
- "[=Branch1]": 72,
91
- "[=S]": 73,
92
- "[Se]": 74,
93
- "[N]": 75,
94
- "[=As]": 76,
95
- "[#Ring2]": 77,
96
- "[Br]": 78,
97
- "[=O]": 79,
98
- "[P+1]": 80,
99
- "[N+1]": 81,
100
- "[eos]": 82,
101
- "[Se+1]": 83
 
 
 
 
 
102
  }
103
  }
104
  }
 
2
  "version": "1.0",
3
  "truncation": null,
4
  "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 10,
8
+ "content": "[bos]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 32,
17
+ "content": "[eos]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 57,
26
+ "content": "[unk]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 65,
35
+ "content": "[nop]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 88,
44
+ "content": "[mask]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
  "normalizer": null,
53
  "pre_tokenizer": {
54
  "type": "WhitespaceSplit"
 
61
  "continuing_subword_prefix": "##",
62
  "max_input_chars_per_word": 100,
63
  "vocab": {
64
+ "[Branch1]": 0,
65
+ "[=NH1+1]": 1,
66
+ "[Se]": 2,
67
+ "[#Branch2]": 3,
68
+ "[O-1]": 4,
69
+ "[SiH1]": 5,
70
+ "[SeH1]": 6,
71
+ "[CH2-1]": 7,
72
+ "[SH0]": 8,
73
+ "[PH1]": 9,
74
+ "[bos]": 10,
75
+ "[Si]": 11,
76
+ "[OH1+1]": 12,
77
+ "[Fe]": 13,
78
+ "[NH1]": 14,
79
+ "[Ring2]": 15,
80
+ "[=N]": 16,
81
+ "[=NH2+1]": 17,
82
+ "[B]": 18,
83
+ "[=SH1]": 19,
84
+ "[C]": 20,
85
+ "[=C]": 21,
86
+ "[NH1-1]": 22,
87
+ "[=O+1]": 23,
88
+ "[As]": 24,
89
+ "[#Branch1]": 25,
90
+ "[I]": 26,
91
+ "[=O]": 27,
92
+ "[B-1]": 28,
93
+ "[Fe-4]": 29,
94
+ "[=Ring1]": 30,
95
+ "[=S]": 31,
96
+ "[eos]": 32,
97
+ "[Cl]": 33,
98
+ "[=P]": 34,
99
+ "[=Fe]": 35,
100
+ "[NH1+1]": 36,
101
+ "[CH1]": 37,
102
+ "[#Ring1]": 38,
103
+ "[As+1]": 39,
104
+ "[Branch3]": 40,
105
+ "[O]": 41,
106
+ "[=OH1+1]": 42,
107
+ "[Branch2]": 43,
108
+ "[=As]": 44,
109
+ "[F]": 45,
110
+ "[P+1]": 46,
111
+ "[S]": 47,
112
+ "[#Ring2]": 48,
113
+ "[#N]": 49,
114
+ "[CH1+1]": 50,
115
+ "[OH0]": 51,
116
+ "[N]": 52,
117
+ "[I+1]": 53,
118
+ "[=Ring2]": 54,
119
+ "[C+1]": 55,
120
+ "[=B]": 56,
121
+ "[unk]": 57,
122
+ "[SiH2]": 58,
123
+ "[C-1]": 59,
124
+ "[=PH1]": 60,
125
+ "[#C]": 61,
126
+ "[SH1]": 62,
127
+ "[Fe-3]": 63,
128
+ "[Br]": 64,
129
+ "[nop]": 65,
130
+ "[CH1-1]": 66,
131
  "[NH3+1]": 67,
132
+ "[=Branch1]": 68,
133
+ "[NH2+1]": 69,
134
+ "[P]": 70,
135
+ "[K]": 71,
136
+ "[N+1]": 72,
137
+ "[CH0]": 73,
138
+ "[=Se]": 74,
139
+ "[Fe+1]": 75,
140
+ "[Ring1]": 76,
141
+ "[S+1]": 77,
142
+ "[=Branch3]": 78,
143
+ "[Fe+2]": 79,
144
+ "[=S+1]": 80,
145
+ "[=N+1]": 81,
146
+ "[Na]": 82,
147
+ "[Se+1]": 83,
148
+ "[N-1]": 84,
149
+ "[NH0]": 85,
150
+ "[#S]": 86,
151
+ "[=Branch2]": 87,
152
+ "[mask]": 88
153
  }
154
  }
155
  }
tokenizer_config.json CHANGED
@@ -1,8 +1,13 @@
1
  {
2
  "cls_token": "[bos]",
3
  "mask_token": "[mask]",
 
 
4
  "pad_token": "[nop]",
 
5
  "sep_token": "[eos]",
 
6
  "tokenizer_class": "PreTrainedTokenizerFast",
 
7
  "unk_token": "[unk]"
8
  }
 
1
  {
2
  "cls_token": "[bos]",
3
  "mask_token": "[mask]",
4
+ "model_max_length": 427,
5
+ "name_or_path": "tokenizer",
6
  "pad_token": "[nop]",
7
+ "padding_side": "right",
8
  "sep_token": "[eos]",
9
+ "special_tokens_map_file": "tokenizer/special_tokens_map.json",
10
  "tokenizer_class": "PreTrainedTokenizerFast",
11
+ "truncation_side": "right",
12
  "unk_token": "[unk]"
13
  }