Upload tokenizer

Files changed:
- tokenization_phylogpn.py (+2 −1)
- tokenizer_config.json (+1 −0)
tokenization_phylogpn.py
CHANGED

@@ -4,7 +4,7 @@ from transformers import PreTrainedTokenizer
 class PhyloGPNTokenizer(PreTrainedTokenizer):
     model_input_names = ["input_ids"]
 
-    def __init__(self, model_max_length: int = None, unk_token="N", pad_token="-", bos_token=None, eos_token=None, sep_token=None, cls_token=None, mask_token=None, **kwargs):
+    def __init__(self, model_max_length: int = None, unk_token="N", pad_token="-", bos_token=None, eos_token=None, sep_token=None, cls_token=None, mask_token=None, split_special_tokens=True, **kwargs):
         self.model_max_length = model_max_length
         self._vocab = {k: v for v, k in enumerate("ACGTN-")}
 
@@ -19,6 +19,7 @@ class PhyloGPNTokenizer(PreTrainedTokenizer):
             sep_token=sep_token,
             cls_token=cls_token,
             mask_token=mask_token,
+            split_special_tokens=split_special_tokens,
             add_prefix_space=add_prefix_space,
             padding_side=padding_side,
             **kwargs,
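For context: `split_special_tokens=True` tells the base `PreTrainedTokenizer` not to split the input text around special tokens before tokenizing (by default they are protected and kept atomic), which matters here because the special tokens "N" and "-" are ordinary single characters in the DNA alphabet. The vocabulary built in `__init__` maps each character of "ACGTN-" to its index; the comprehension from the diff can be checked standalone:

# The vocabulary comprehension from the diff, evaluated on its own:
# enumerate("ACGTN-") yields (index, char) pairs, so {k: v for v, k in ...}
# maps each character to its position in the string.
vocab = {k: v for v, k in enumerate("ACGTN-")}
print(vocab)  # {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4, '-': 5}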
tokenizer_config.json
CHANGED

@@ -34,6 +34,7 @@
   "pad_token": "-",
   "padding_side": "right",
   "sep_token": null,
+  "split_special_tokens": true,
   "tokenizer_class": "PhyloGPNTokenizer",
   "unk_token": "N"
 }
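A minimal usage sketch, assuming the repo wires the tokenizer up through `auto_map` so `AutoTokenizer` can load it with `trust_remote_code=True`, and that the class implements the usual `_tokenize`/`convert_tokens_to_ids` hooks (not shown in this diff). The repo id below is a placeholder:

from transformers import AutoTokenizer

# Placeholder repo id; substitute the actual PhyloGPN repository.
tok = AutoTokenizer.from_pretrained("user/PhyloGPN", trust_remote_code=True)

# Character-level encoding over the "ACGTN-" vocabulary; bos/eos are None,
# so no special tokens are prepended or appended.
ids = tok("ACGTNA")["input_ids"]
print(ids)  # expected: [0, 1, 2, 3, 4, 0]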