maykcaldas committed on
Commit
5b69007
·
1 Parent(s): c8310f2

add tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.json +278 -136
  2. tokenizer_config.json +1 -3
tokenizer.json CHANGED
@@ -2,53 +2,7 @@
2
  "version": "1.0",
3
  "truncation": null,
4
  "padding": null,
5
- "added_tokens": [
6
- {
7
- "id": 10,
8
- "content": "[bos]",
9
- "single_word": false,
10
- "lstrip": false,
11
- "rstrip": false,
12
- "normalized": false,
13
- "special": true
14
- },
15
- {
16
- "id": 32,
17
- "content": "[eos]",
18
- "single_word": false,
19
- "lstrip": false,
20
- "rstrip": false,
21
- "normalized": false,
22
- "special": true
23
- },
24
- {
25
- "id": 57,
26
- "content": "[unk]",
27
- "single_word": false,
28
- "lstrip": false,
29
- "rstrip": false,
30
- "normalized": false,
31
- "special": true
32
- },
33
- {
34
- "id": 65,
35
- "content": "[nop]",
36
- "single_word": false,
37
- "lstrip": false,
38
- "rstrip": false,
39
- "normalized": false,
40
- "special": true
41
- },
42
- {
43
- "id": 88,
44
- "content": "[mask]",
45
- "single_word": false,
46
- "lstrip": false,
47
- "rstrip": false,
48
- "normalized": false,
49
- "special": true
50
- }
51
- ],
52
  "normalizer": null,
53
  "pre_tokenizer": {
54
  "type": "WhitespaceSplit"
@@ -61,95 +15,283 @@
61
  "continuing_subword_prefix": "##",
62
  "max_input_chars_per_word": 100,
63
  "vocab": {
64
- "[Branch1]": 0,
65
- "[=NH1+1]": 1,
66
- "[Se]": 2,
67
- "[#Branch2]": 3,
68
- "[O-1]": 4,
69
- "[SiH1]": 5,
70
- "[SeH1]": 6,
71
- "[CH2-1]": 7,
72
- "[SH0]": 8,
73
- "[PH1]": 9,
74
- "[bos]": 10,
75
- "[Si]": 11,
76
- "[OH1+1]": 12,
77
- "[Fe]": 13,
78
- "[NH1]": 14,
79
- "[Ring2]": 15,
80
- "[=N]": 16,
81
- "[=NH2+1]": 17,
82
- "[B]": 18,
83
- "[=SH1]": 19,
84
- "[C]": 20,
85
- "[=C]": 21,
86
- "[NH1-1]": 22,
87
- "[=O+1]": 23,
88
- "[As]": 24,
89
- "[#Branch1]": 25,
90
- "[I]": 26,
91
- "[=O]": 27,
92
- "[B-1]": 28,
93
- "[Fe-4]": 29,
94
- "[=Ring1]": 30,
95
- "[=S]": 31,
96
- "[eos]": 32,
97
- "[Cl]": 33,
98
- "[=P]": 34,
99
- "[=Fe]": 35,
100
- "[NH1+1]": 36,
101
- "[CH1]": 37,
102
- "[#Ring1]": 38,
103
- "[As+1]": 39,
104
- "[Branch3]": 40,
105
- "[O]": 41,
106
- "[=OH1+1]": 42,
107
- "[Branch2]": 43,
108
- "[=As]": 44,
109
- "[F]": 45,
110
- "[P+1]": 46,
111
- "[S]": 47,
112
- "[#Ring2]": 48,
113
- "[#N]": 49,
114
- "[CH1+1]": 50,
115
- "[OH0]": 51,
116
- "[N]": 52,
117
- "[I+1]": 53,
118
- "[=Ring2]": 54,
119
- "[C+1]": 55,
120
- "[=B]": 56,
121
- "[unk]": 57,
122
- "[SiH2]": 58,
123
- "[C-1]": 59,
124
- "[=PH1]": 60,
125
- "[#C]": 61,
126
- "[SH1]": 62,
127
- "[Fe-3]": 63,
128
- "[Br]": 64,
129
- "[nop]": 65,
130
- "[CH1-1]": 66,
131
- "[NH3+1]": 67,
132
- "[=Branch1]": 68,
133
- "[NH2+1]": 69,
134
- "[P]": 70,
135
- "[K]": 71,
136
- "[N+1]": 72,
137
- "[CH0]": 73,
138
- "[=Se]": 74,
139
- "[Fe+1]": 75,
140
- "[Ring1]": 76,
141
- "[S+1]": 77,
142
- "[=Branch3]": 78,
143
- "[Fe+2]": 79,
144
- "[=S+1]": 80,
145
- "[=N+1]": 81,
146
- "[Na]": 82,
147
- "[Se+1]": 83,
148
- "[N-1]": 84,
149
- "[NH0]": 85,
150
- "[#S]": 86,
151
- "[=Branch2]": 87,
152
- "[mask]": 88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  }
154
  }
155
  }
 
2
  "version": "1.0",
3
  "truncation": null,
4
  "padding": null,
5
+ "added_tokens": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  "normalizer": null,
7
  "pre_tokenizer": {
8
  "type": "WhitespaceSplit"
 
15
  "continuing_subword_prefix": "##",
16
  "max_input_chars_per_word": 100,
17
  "vocab": {
18
+ "[mask]": 0,
19
+ "[bos]": 1,
20
+ "[eos]": 2,
21
+ "[unk]": 3,
22
+ "[nop]": 4,
23
+ ".": 5,
24
+ "[N@]": 6,
25
+ "[Se]": 7,
26
+ "[Li+1]": 8,
27
+ "[=Ge]": 9,
28
+ "[\\N]": 10,
29
+ "[Mg+2]": 11,
30
+ "[ClH0]": 12,
31
+ "[Ba+2]": 13,
32
+ "[N]": 14,
33
+ "[=Mo]": 15,
34
+ "[SH1-2]": 16,
35
+ "[=O]": 17,
36
+ "[Zr+4]": 18,
37
+ "[\\N+1]": 19,
38
+ "[/Br]": 20,
39
+ "[Mo+2]": 21,
40
+ "[/-Ring1]": 22,
41
+ "[=SH0]": 23,
42
+ "[Sn]": 24,
43
+ "[=Si]": 25,
44
+ "[N-1]": 26,
45
+ "[Fe+2]": 27,
46
+ "[FH0]": 28,
47
+ "[B-1]": 29,
48
+ "[2H]": 30,
49
+ "[/N+1]": 31,
50
+ "[Cl-1]": 32,
51
+ "[=V+2]": 33,
52
+ "[\\NH1+1]": 34,
53
+ "[=N]": 35,
54
+ "[CH0]": 36,
55
+ "[P@@]": 37,
56
+ "[=Fe]": 38,
57
+ "[\\S]": 39,
58
+ "[Cl]": 40,
59
+ "[Al+3]": 41,
60
+ "[\\-Ring2]": 42,
61
+ "[As+3]": 43,
62
+ "[Ring1]": 44,
63
+ "[Sm+3]": 45,
64
+ "[=Ni]": 46,
65
+ "[Co+2]": 47,
66
+ "[NH1+2]": 48,
67
+ "[\\C@H1]": 49,
68
+ "[Mo]": 50,
69
+ "[S-2]": 51,
70
+ "[=PH0]": 52,
71
+ "[Zn]": 53,
72
+ "[Cr]": 54,
73
+ "[Br-1]": 55,
74
+ "[=P]": 56,
75
+ "[S]": 57,
76
+ "[#Branch2]": 58,
77
+ "[-/Ring1]": 59,
78
+ "[\\C@@H1]": 60,
79
+ "[NH2+1]": 61,
80
+ "[Sr]": 62,
81
+ "[=Cr]": 63,
82
+ "[=N-1]": 64,
83
+ "[=SnH2]": 65,
84
+ "[\\NH1]": 66,
85
+ "[PH0]": 67,
86
+ "[NH0]": 68,
87
+ "[Gd]": 69,
88
+ "[Na]": 70,
89
+ "[/C@@H1]": 71,
90
+ "[Au+1]": 72,
91
+ "[Ce]": 73,
92
+ "[Dy]": 74,
93
+ "[=Ti]": 75,
94
+ "[CH2]": 76,
95
+ "[Hg]": 77,
96
+ "[=Ring2]": 78,
97
+ "[=Zr]": 79,
98
+ "[Zr+2]": 80,
99
+ "[I]": 81,
100
+ "[Si]": 82,
101
+ "[=Branch1]": 83,
102
+ "[K]": 84,
103
+ "[BH0]": 85,
104
+ "[Pr+3]": 86,
105
+ "[Nd+3]": 87,
106
+ "[Ag+1]": 88,
107
+ "[In+3]": 89,
108
+ "[Cl+3]": 90,
109
+ "[Cu]": 91,
110
+ "[O-2]": 92,
111
+ "[W]": 93,
112
+ "[N+1]": 94,
113
+ "[=Ca]": 95,
114
+ "[=S]": 96,
115
+ "[=P+1]": 97,
116
+ "[/N]": 98,
117
+ "[#O+1]": 99,
118
+ "[H+1]": 100,
119
+ "[Nb+5]": 101,
120
+ "[NH3+1]": 102,
121
+ "[Branch3]": 103,
122
+ "[Gd+3]": 104,
123
+ "[Zn+2]": 105,
124
+ "[/C@]": 106,
125
+ "[Cd+2]": 107,
126
+ "[Ce+3]": 108,
127
+ "[Pb+2]": 109,
128
+ "[Na+1]": 110,
129
+ "[#Branch1]": 111,
130
+ "[Lu+3]": 112,
131
+ "[MgH2]": 113,
132
+ "[\\Br]": 114,
133
+ "[-/Ring2]": 115,
134
+ "[NH1+1]": 116,
135
+ "[S-1]": 117,
136
+ "[I-1]": 118,
137
+ "[O]": 119,
138
+ "[=V]": 120,
139
+ "[Y]": 121,
140
+ "[Rh]": 122,
141
+ "[#S]": 123,
142
+ "[Cs+1]": 124,
143
+ "[Ni]": 125,
144
+ "[H]": 126,
145
+ "[#N]": 127,
146
+ "[Cu+1]": 128,
147
+ "[=PH1]": 129,
148
+ "[SH1-1]": 130,
149
+ "[=Mn]": 131,
150
+ "[Mn+3]": 132,
151
+ "[#C]": 133,
152
+ "[Branch2]": 134,
153
+ "[Sb+3]": 135,
154
+ "[=B]": 136,
155
+ "[\\As]": 137,
156
+ "[=Sn]": 138,
157
+ "[H-1]": 139,
158
+ "[=Ce]": 140,
159
+ "[B+3]": 141,
160
+ "[=O+1]": 142,
161
+ "[=Branch2]": 143,
162
+ "[K+1]": 144,
163
+ "[=Pb]": 145,
164
+ "[P@]": 146,
165
+ "[Al]": 147,
166
+ "[=Ring1]": 148,
167
+ "[Co]": 149,
168
+ "[Be]": 150,
169
+ "[=Se]": 151,
170
+ "[/I]": 152,
171
+ "[C@@H1]": 153,
172
+ "[/S]": 154,
173
+ "[Ring2]": 155,
174
+ "[Ni+2]": 156,
175
+ "[/As]": 157,
176
+ "[C]": 158,
177
+ "[Ta]": 159,
178
+ "[F-1]": 160,
179
+ "[Sb]": 161,
180
+ "[Au]": 162,
181
+ "[/NH1+1]": 163,
182
+ "[Zr]": 164,
183
+ "[CH1-1]": 165,
184
+ "[Rh+3]": 166,
185
+ "[Si@]": 167,
186
+ "[\\O]": 168,
187
+ "[-\\Ring1]": 169,
188
+ "[SnH2]": 170,
189
+ "[Bi+3]": 171,
190
+ "[Fe]": 172,
191
+ "[Hf]": 173,
192
+ "[=NH1+1]": 174,
193
+ "[Pb]": 175,
194
+ "[\\Cl]": 176,
195
+ "[Ca+2]": 177,
196
+ "[Ir]": 178,
197
+ "[Pt+2]": 179,
198
+ "[Hg+2]": 180,
199
+ "[#B]": 181,
200
+ "[Ge]": 182,
201
+ "[=Te]": 183,
202
+ "[Nd]": 184,
203
+ "[OH1-1]": 185,
204
+ "[NH1]": 186,
205
+ "[C+1]": 187,
206
+ "[Co+3]": 188,
207
+ "[=Cd]": 189,
208
+ "[Y+3]": 190,
209
+ "[Ti+2]": 191,
210
+ "[Au+3]": 192,
211
+ "[OH0]": 193,
212
+ "[Pd+2]": 194,
213
+ "[O+1]": 195,
214
+ "[=As]": 196,
215
+ "[/C]": 197,
216
+ "[Hg+1]": 198,
217
+ "[=N+1]": 199,
218
+ "[Sn+2]": 200,
219
+ "[O-1]": 201,
220
+ "[F]": 202,
221
+ "[NH4+1]": 203,
222
+ "[Re]": 204,
223
+ "[/O]": 205,
224
+ "[Sr+2]": 206,
225
+ "[/-Ring2]": 207,
226
+ "[Dy+3]": 208,
227
+ "[S+1]": 209,
228
+ "[Sn+4]": 210,
229
+ "[\\I]": 211,
230
+ "[Ti+4]": 212,
231
+ "[Mn+2]": 213,
232
+ "[Mg]": 214,
233
+ "[B]": 215,
234
+ "[La]": 216,
235
+ "[=W]": 217,
236
+ "[Cs]": 218,
237
+ "[#C-1]": 219,
238
+ "[=Sr]": 220,
239
+ "[=S+1]": 221,
240
+ "[=Be]": 222,
241
+ "[C@H1]": 223,
242
+ "[\\C]": 224,
243
+ "[Ca]": 225,
244
+ "[Si+4]": 226,
245
+ "[PH1]": 227,
246
+ "[Nb]": 228,
247
+ "[Cr+3]": 229,
248
+ "[Branch1]": 230,
249
+ "[Hf+4]": 231,
250
+ "[=Nb]": 232,
251
+ "[P]": 233,
252
+ "[CH1]": 234,
253
+ "[Ag]": 235,
254
+ "[=C-1]": 236,
255
+ "[\\-Ring1]": 237,
256
+ "[C-1]": 238,
257
+ "[C@@]": 239,
258
+ "[/Cl]": 240,
259
+ "[Cu+2]": 241,
260
+ "[Cd]": 242,
261
+ "[CaH2]": 243,
262
+ "[=Co]": 244,
263
+ "[/C@@]": 245,
264
+ "[Li]": 246,
265
+ "[Ti]": 247,
266
+ "[=C]": 248,
267
+ "[P+1]": 249,
268
+ "[=CH0]": 250,
269
+ "[=Y]": 251,
270
+ "[V+2]": 252,
271
+ "[SiH1]": 253,
272
+ "[V]": 254,
273
+ "[La+3]": 255,
274
+ "[Ru+3]": 256,
275
+ "[=Cu]": 257,
276
+ "[C@]": 258,
277
+ "[Fe+3]": 259,
278
+ "[Te]": 260,
279
+ "[=Zn]": 261,
280
+ "[Bi]": 262,
281
+ "[Br]": 263,
282
+ "[/C@H1]": 264,
283
+ "[Mn]": 265,
284
+ "[SH0]": 266,
285
+ "[Pt+4]": 267,
286
+ "[As]": 268,
287
+ "[/NH1]": 269,
288
+ "[Nop]": 270,
289
+ "[Ring3]": 271,
290
+ "[#Branch3]": 272,
291
+ "[=Branch3]": 273,
292
+ "[\\C@]": 274,
293
+ "[#N+1]": 275,
294
+ "[=Ring3]": 276
295
  }
296
  }
297
  }
tokenizer_config.json CHANGED
@@ -1,12 +1,10 @@
1
  {
2
  "cls_token": "[bos]",
3
  "mask_token": "[mask]",
4
- "model_max_length": 427,
5
- "name_or_path": "tokenizer",
6
  "pad_token": "[nop]",
7
  "padding_side": "right",
8
  "sep_token": "[eos]",
9
- "special_tokens_map_file": "tokenizer/special_tokens_map.json",
10
  "tokenizer_class": "PreTrainedTokenizerFast",
11
  "truncation_side": "right",
12
  "unk_token": "[unk]"
 
1
  {
2
  "cls_token": "[bos]",
3
  "mask_token": "[mask]",
4
+ "model_max_length": 135,
 
5
  "pad_token": "[nop]",
6
  "padding_side": "right",
7
  "sep_token": "[eos]",
 
8
  "tokenizer_class": "PreTrainedTokenizerFast",
9
  "truncation_side": "right",
10
  "unk_token": "[unk]"