sunzeyeah committed
Commit
50e5a6c
1 Parent(s): c4c2f24

add tokenization files

Files changed (3)
  1. tokenization_gptpangu.py +103 -0
  2. tokenizer_config.json +16 -0
  3. vocab.model +3 -0
tokenization_gptpangu.py ADDED
@@ -0,0 +1,103 @@
+
+ import torch
+ import sentencepiece
+ import jieba
+ import numpy as np
+
+ from transformers.tokenization_utils import PreTrainedTokenizer
+
+
+ class GPTPanguTokenizer(PreTrainedTokenizer):
+     # Ref: https://git.openi.org.cn/PCL-Platform.Intelligence/PanGu-Alpha/src/branch/master/tokenization_jieba.py
+     vocab_files_names = {
+         "model_file": "vocab.model"
+     }
+
+     def __init__(
+         self,
+         model_file,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+
+         self.sp = sentencepiece.SentencePieceProcessor()
+         self.sp.Load(model_file=model_file)
+         self.translator = str.maketrans(" \n", "\u2582\u2583")
+
+         # special token ids
+         # self.eos_token_id = self.sp.piece_to_id("<eot>")
+
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         """
+         Build model inputs from a sequence or a pair of sequences by concatenating and adding
+         special tokens. A GPT-Pangu sequence has the following format:
+
+         - single sequence: `[BOS] X [EOS]` (or `X [EOS]` if no BOS token is set)
+         - pair of sequences: `[BOS] A [SEP] B [EOS]`
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+         """
+         if self.bos_token_id is not None:
+             if token_ids_1 is None:
+                 return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
+             bos = [self.bos_token_id]
+             sep = [self.sep_token_id]
+             eos = [self.eos_token_id]
+             return bos + token_ids_0 + sep + token_ids_1 + eos
+         else:
+             if token_ids_1 is None:
+                 return token_ids_0 + [self.eos_token_id]
+             sep = [self.sep_token_id]
+             eos = [self.eos_token_id]
+             return token_ids_0 + sep + token_ids_1 + eos
+
+     def tokenize(self, text, **kwargs):
+         """ Tokenize a string: segment with jieba, then map spaces/newlines to placeholder symbols. """
+         seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]
+         return seg_list
+
+     def convert_tokens_to_ids(self, tokens):
+         if tokens is None:
+             return None
+
+         if isinstance(tokens, str):
+             return self._convert_token_to_id_with_added_voc(tokens)
+
+         new_seg = " ".join(tokens)
+         return self.sp.encode(new_seg)
+         # return tokens
+
+     def _convert_token_to_id(self, token):
+         return self.sp.piece_to_id(token)
+
+     def _convert_id_to_token(self, index):
+         return self.sp.id_to_piece(index)
+
+     def convert_ids_to_tokens(self, ids):
+         return self.decode(ids)
+
+     def decode(self, tokens, **kwargs):
+         if isinstance(tokens, torch.Tensor) or isinstance(tokens, np.ndarray):
+             tokens = tokens.tolist()
+
+         if kwargs.get('skip_special_tokens', None) is True:
+             tokens = [token for token in tokens if token not in self.all_special_ids]
+         text = self.sp.decode(tokens)
+         if isinstance(text, list):
+             text = text[0]
+         text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n')
+         return text
+
+     @property
+     def vocab_size(self) -> int:
+         """
+         `int`: Size of the base vocabulary (without the added tokens).
+         """
+         return len(self.sp)
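For reference, a minimal usage sketch (not part of this commit): because tokenizer_config.json below maps AutoTokenizer to tokenization_gptpangu.GPTPanguTokenizer, the tokenizer can be loaded from the Hub with trust_remote_code=True. The repository id in the sketch is a placeholder, jieba, sentencepiece, and torch must be installed, and behavior can vary with the transformers version (very recent releases may expect the self.sp processor to be set up before super().__init__ runs).

    # Usage sketch; "sunzeyeah/pangu" is a placeholder repo id, not taken from this commit.
    from transformers import AutoTokenizer

    # trust_remote_code=True lets AutoTokenizer import GPTPanguTokenizer via the auto_map entry.
    tokenizer = AutoTokenizer.from_pretrained("sunzeyeah/pangu", trust_remote_code=True)

    text = "今天天气不错"
    ids = tokenizer(text, add_special_tokens=False)["input_ids"]  # jieba segmentation -> sentencepiece ids
    print(ids)
    print(tokenizer.decode(ids))  # placeholder symbols are mapped back to spaces/newlines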
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "eos_token": "<eot>",
+   "pad_token": "<pad>",
+   "unk_token": "<unk>",
+   "sep_token": "<sep>",
+   "bos_token": "<s>",
+   "add_prefix_space": false,
+   "tokenizer_class": "GPTPanguTokenizer",
+   "use_fast": false,
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_gptpangu.GPTPanguTokenizer",
+       null
+     ]
+   }
+ }
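The config above registers the custom class via auto_map and defines the special tokens the Python code relies on (eos_token_id, sep_token_id, bos_token_id, pad_token). A minimal sketch of instantiating the class directly from a local checkout, mirroring these settings (assumes the repo is cloned, vocab.model fetched via git-lfs, and a transformers version compatible with this tokenizer):

    # Local instantiation sketch; special tokens mirror tokenizer_config.json.
    from tokenization_gptpangu import GPTPanguTokenizer

    tok = GPTPanguTokenizer(
        model_file="vocab.model",
        eos_token="<eot>",
        pad_token="<pad>",
        unk_token="<unk>",
        sep_token="<sep>",
        bos_token="<s>",
    )
    print(tok.vocab_size)               # size of the underlying sentencepiece vocabulary
    print(tok.tokenize("今天天气不错"))   # jieba segments, with spaces/newlines mapped to placeholders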
vocab.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:18857e86783e50cfcaa0bc3c043fb4e9b5f240b885d2870ea593ee69b44f7a3a
+ size 879697
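vocab.model is stored with Git LFS, so the three lines above are only the pointer file; the actual ~880 KB sentencepiece model is fetched by git lfs pull. A small sketch for inspecting it directly (the sample string is arbitrary; the tokenizer class normally pre-segments with jieba before encoding):

    # Inspect the sentencepiece model behind the LFS pointer (run `git lfs pull` first).
    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.Load("vocab.model")

    print(sp.GetPieceSize())                # base vocabulary size, matching tokenizer.vocab_size
    print(sp.EncodeAsPieces("今天天气不错"))  # raw sentencepiece pieces, without jieba pre-segmentation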