calbors committed on
Commit
8e0a2a4
·
verified ·
1 Parent(s): c3ae6b4

Upload tokenizer

Browse files
tokenization_phylogpn.py CHANGED
@@ -4,7 +4,7 @@ from transformers import PreTrainedTokenizer
4
  class PhyloGPNTokenizer(PreTrainedTokenizer):
5
  model_input_names = ["input_ids"]
6
 
7
- def __init__(self, model_max_length: int = None, unk_token="N", pad_token="-", bos_token=None, eos_token=None, sep_token=None, cls_token=None, mask_token=None, **kwargs):
8
  self.model_max_length = model_max_length
9
  self._vocab = {k: v for v, k in enumerate("ACGTN-")}
10
 
@@ -19,6 +19,7 @@ class PhyloGPNTokenizer(PreTrainedTokenizer):
19
  sep_token=sep_token,
20
  cls_token=cls_token,
21
  mask_token=mask_token,
 
22
  add_prefix_space=add_prefix_space,
23
  padding_side=padding_side,
24
  **kwargs,
 
4
  class PhyloGPNTokenizer(PreTrainedTokenizer):
5
  model_input_names = ["input_ids"]
6
 
7
+ def __init__(self, model_max_length: int = None, unk_token="N", pad_token="-", bos_token=None, eos_token=None, sep_token=None, cls_token=None, mask_token=None, split_special_tokens=True, **kwargs):
8
  self.model_max_length = model_max_length
9
  self._vocab = {k: v for v, k in enumerate("ACGTN-")}
10
 
 
19
  sep_token=sep_token,
20
  cls_token=cls_token,
21
  mask_token=mask_token,
22
+ split_special_tokens=split_special_tokens,
23
  add_prefix_space=add_prefix_space,
24
  padding_side=padding_side,
25
  **kwargs,
tokenizer_config.json CHANGED
@@ -34,6 +34,7 @@
34
  "pad_token": "-",
35
  "padding_side": "right",
36
  "sep_token": null,
 
37
  "tokenizer_class": "PhyloGPNTokenizer",
38
  "unk_token": "N"
39
  }
 
34
  "pad_token": "-",
35
  "padding_side": "right",
36
  "sep_token": null,
37
+ "split_special_tokens": true,
38
  "tokenizer_class": "PhyloGPNTokenizer",
39
  "unk_token": "N"
40
  }