Tokenizer suggestions (#2) - opened by psinger
- README.md +4 -10
- config.json +2 -2
- generation_config.json +3 -2
- tokenization_xgen.py +11 -40
- tokenizer_config.json +0 -1
README.md
CHANGED
@@ -6,13 +6,7 @@ license: apache-2.0
|
|
6 |
|
7 |
Official research release for the family of **XGen** models (`7B`) by Salesforce AI Research:
|
8 |
|
9 |
-
*Title*: [Long Sequence Modeling with XGen: A 7B LLM Trained on 8K Input Sequence Length](https://
|
10 |
-
|
11 |
-
*Authors*: [Erik Nijkamp](https://eriknijkamp.com)\*, Tian Xie\*, [Hiroaki Hayashi](https://hiroakih.me/)\*, [Bo Pang](https://scholar.google.com/citations?user=s9fNEVEAAAAJ&hl=en)\*, Congying Xia\*, Chen Xing, Jesse Vig, Semih Yavuz, Philippe Laban, Ben Krause, Senthil Purushwalkam, Tong Niu, Wojciech Kryscinski, Lidiya Murakhovs'ka, Prafulla Kumar Choubey, Alex Fabbri, Ye Liu, Rui Meng, Lifu Tu, Meghana Bhat, [Chien-Sheng Wu](https://jasonwu0731.github.io/), Silvio Savarese, [Yingbo Zhou](https://scholar.google.com/citations?user=H_6RQ7oAAAAJ&hl=en), [Shafiq Rayhan Joty](https://raihanjoty.github.io/), [Caiming Xiong](http://cmxiong.com/).
|
12 |
-
|
13 |
-
(* indicates equal contribution)
|
14 |
-
|
15 |
-
Correspondence to: [Shafiq Rayhan Joty](mailto:[email protected]), [Caiming Xiong](mailto:[email protected])
|
16 |
|
17 |
## Models
|
18 |
|
@@ -55,9 +49,9 @@ print(tokenizer.decode(sample[0]))
|
|
55 |
```bibtex
|
56 |
@misc{XGen,
|
57 |
title={Long Sequence Modeling with XGen: A 7B LLM Trained on 8K Input Sequence Length},
|
58 |
-
author={
|
59 |
-
howpublished={
|
60 |
year={2023},
|
61 |
-
url={https://
|
62 |
}
|
63 |
```
|
|
|
6 |
|
7 |
Official research release for the family of **XGen** models (`7B`) by Salesforce AI Research:
|
8 |
|
9 |
+
*Title*: [Long Sequence Modeling with XGen: A 7B LLM Trained on 8K Input Sequence Length](https://blog.salesforceairesearch.com/xgen/)
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
## Models
|
12 |
|
|
|
49 |
```bibtex
|
50 |
@misc{XGen,
|
51 |
title={Long Sequence Modeling with XGen: A 7B LLM Trained on 8K Input Sequence Length},
|
52 |
+
author={Salesforce AI Research},
|
53 |
+
howpublished={Salesforce AI Research Blog},
|
54 |
year={2023},
|
55 |
+
url={https://blog.salesforceairesearch.com/xgen-7b/}
|
56 |
}
|
57 |
```
|
config.json
CHANGED
@@ -2,8 +2,8 @@
|
|
2 |
"architectures": [
|
3 |
"LlamaForCausalLM"
|
4 |
],
|
5 |
-
"bos_token_id":
|
6 |
-
"eos_token_id":
|
7 |
"hidden_act": "silu",
|
8 |
"hidden_size": 4096,
|
9 |
"initializer_range": 0.02,
|
|
|
2 |
"architectures": [
|
3 |
"LlamaForCausalLM"
|
4 |
],
|
5 |
+
"bos_token_id": 1,
|
6 |
+
"eos_token_id": 2,
|
7 |
"hidden_act": "silu",
|
8 |
"hidden_size": 4096,
|
9 |
"initializer_range": 0.02,
|
generation_config.json
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
{
|
2 |
"_from_model_config": true,
|
3 |
-
"bos_token_id":
|
4 |
-
"eos_token_id":
|
|
|
5 |
"transformers_version": "4.29.2"
|
6 |
}
|
|
|
1 |
{
|
2 |
"_from_model_config": true,
|
3 |
+
"bos_token_id": 1,
|
4 |
+
"eos_token_id": 2,
|
5 |
+
"pad_token_id": 0,
|
6 |
"transformers_version": "4.29.2"
|
7 |
}
|
tokenization_xgen.py
CHANGED
@@ -25,7 +25,7 @@ MAX_MODEL_INPUT_SIZES = {
|
|
25 |
}
|
26 |
|
27 |
|
28 |
-
def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
|
29 |
if not add_special:
|
30 |
return tiktoken.get_encoding(base)
|
31 |
|
@@ -60,18 +60,9 @@ def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
|
|
60 |
]
|
61 |
return fim_tokens
|
62 |
|
63 |
-
def include_additional_tokens():
|
64 |
-
tokens = []
|
65 |
-
tokens += [f"<dummy_{i}>" for i in range(4)]
|
66 |
-
tokens.append("<sep>") # 50317
|
67 |
-
tokens.append("<eom>") # 50318
|
68 |
-
tokens += [f"<mask_{i}>" for i in reversed(range(1, 51199-50318+1))]
|
69 |
-
return tokens
|
70 |
-
|
71 |
add_whitespaces = include_whitespace(n_min=2, n_max=32)
|
72 |
add_tabs = include_tabs(n_min=2, n_max=10)
|
73 |
fim_tokens = include_fim_tokens()
|
74 |
-
additional_tokens = include_additional_tokens()
|
75 |
|
76 |
tokenizer = tiktoken.get_encoding(base)
|
77 |
|
@@ -91,13 +82,7 @@ def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
|
|
91 |
for sp in fim_tokens:
|
92 |
special_tokens[sp] = idx
|
93 |
idx += 1
|
94 |
-
for sp in additional_tokens:
|
95 |
-
special_tokens[sp] = idx
|
96 |
-
idx += 1
|
97 |
|
98 |
-
if pad_token and pad_token not in tokenizer._special_tokens and pad_token not in special_tokens:
|
99 |
-
special_tokens[pad_token] = idx
|
100 |
-
idx += 1
|
101 |
# In production, load the arguments directly instead of accessing private attributes
|
102 |
# See openai_public.py for examples of arguments for specific encodings
|
103 |
enc = tiktoken.Encoding(
|
@@ -127,22 +112,19 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
127 |
def __init__(
|
128 |
self,
|
129 |
pad_token=None,
|
130 |
-
eos_token="<|endoftext|>",
|
131 |
add_eos_token=False,
|
132 |
add_special_tokens=True,
|
133 |
**kwargs,
|
134 |
):
|
135 |
-
|
136 |
-
eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
137 |
-
self.add_eos_token = add_eos_token
|
138 |
-
self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
|
139 |
super().__init__(
|
140 |
-
pad_token=
|
141 |
-
eos_token=eos_token_added,
|
142 |
add_eos_token=add_eos_token,
|
143 |
add_special_tokens=add_special_tokens,
|
144 |
**kwargs,
|
145 |
)
|
|
|
|
|
146 |
|
147 |
@property
|
148 |
def vocab_size(self):
|
@@ -151,7 +133,7 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
151 |
|
152 |
def get_vocab(self):
|
153 |
"""Returns vocab as a dict"""
|
154 |
-
vocab = {self.
|
155 |
return vocab
|
156 |
|
157 |
def _tokenize(self, text, **kwargs):
|
@@ -160,25 +142,18 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
160 |
|
161 |
def _convert_token_to_id(self, token):
|
162 |
"""Converts a token (str) in an id using the vocab."""
|
163 |
-
|
164 |
-
return self.encoder.encode_single_token(token)
|
165 |
-
else:
|
166 |
-
return token
|
167 |
|
168 |
def _convert_id_to_token(self, index):
|
169 |
"""Converts an index (integer) in a token (str) using the vocab."""
|
170 |
-
return self.encoder.decode_single_token_bytes(index)
|
171 |
|
172 |
-
def _decode(self, token_ids, skip_special_tokens: bool = False, **kwargs):
|
173 |
-
if not isinstance(token_ids, list):
|
174 |
-
token_ids = [token_ids]
|
175 |
-
if skip_special_tokens:
|
176 |
-
token_ids = [t for t in token_ids if t not in self.all_special_ids]
|
177 |
return self.encoder.decode(token_ids)
|
178 |
|
179 |
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
|
180 |
"""Build model inputs from a sequence by appending eos_token_id."""
|
181 |
-
eos_token_id = [
|
182 |
|
183 |
output = token_ids_0 + eos_token_id
|
184 |
|
@@ -234,7 +209,7 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
234 |
Returns:
|
235 |
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
|
236 |
"""
|
237 |
-
eos_token_id = [
|
238 |
|
239 |
output = [0] * len(token_ids_0 + eos_token_id)
|
240 |
|
@@ -242,7 +217,3 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
242 |
output += [1] * len(token_ids_1 + eos_token_id)
|
243 |
|
244 |
return output
|
245 |
-
|
246 |
-
# has no vocab file
|
247 |
-
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
|
248 |
-
return ()
|
|
|
25 |
}
|
26 |
|
27 |
|
28 |
+
def tiktoken_tokenizer(base="gpt2", add_special=True):
|
29 |
if not add_special:
|
30 |
return tiktoken.get_encoding(base)
|
31 |
|
|
|
60 |
]
|
61 |
return fim_tokens
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
add_whitespaces = include_whitespace(n_min=2, n_max=32)
|
64 |
add_tabs = include_tabs(n_min=2, n_max=10)
|
65 |
fim_tokens = include_fim_tokens()
|
|
|
66 |
|
67 |
tokenizer = tiktoken.get_encoding(base)
|
68 |
|
|
|
82 |
for sp in fim_tokens:
|
83 |
special_tokens[sp] = idx
|
84 |
idx += 1
|
|
|
|
|
|
|
85 |
|
|
|
|
|
|
|
86 |
# In production, load the arguments directly instead of accessing private attributes
|
87 |
# See openai_public.py for examples of arguments for specific encodings
|
88 |
enc = tiktoken.Encoding(
|
|
|
112 |
def __init__(
|
113 |
self,
|
114 |
pad_token=None,
|
|
|
115 |
add_eos_token=False,
|
116 |
add_special_tokens=True,
|
117 |
**kwargs,
|
118 |
):
|
119 |
+
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
|
|
|
|
|
|
120 |
super().__init__(
|
121 |
+
pad_token=pad_token,
|
|
|
122 |
add_eos_token=add_eos_token,
|
123 |
add_special_tokens=add_special_tokens,
|
124 |
**kwargs,
|
125 |
)
|
126 |
+
self.add_eos_token = add_eos_token
|
127 |
+
self.encoder = tiktoken_tokenizer(base="gpt2", add_special=add_special_tokens)
|
128 |
|
129 |
@property
|
130 |
def vocab_size(self):
|
|
|
133 |
|
134 |
def get_vocab(self):
|
135 |
"""Returns vocab as a dict"""
|
136 |
+
vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
|
137 |
return vocab
|
138 |
|
139 |
def _tokenize(self, text, **kwargs):
|
|
|
142 |
|
143 |
def _convert_token_to_id(self, token):
|
144 |
"""Converts a token (str) in an id using the vocab."""
|
145 |
+
return token
|
|
|
|
|
|
|
146 |
|
147 |
def _convert_id_to_token(self, index):
|
148 |
"""Converts an index (integer) in a token (str) using the vocab."""
|
149 |
+
return self.encoder.decode_single_token_bytes(index)
|
150 |
|
151 |
+
def _decode(self, token_ids: List[int], skip_special_tokens: bool = False, **kwargs):
|
|
|
|
|
|
|
|
|
152 |
return self.encoder.decode(token_ids)
|
153 |
|
154 |
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
|
155 |
"""Build model inputs from a sequence by appending eos_token_id."""
|
156 |
+
eos_token_id = [50256] if self.add_eos_token else []
|
157 |
|
158 |
output = token_ids_0 + eos_token_id
|
159 |
|
|
|
209 |
Returns:
|
210 |
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
|
211 |
"""
|
212 |
+
eos_token_id = [50256] if self.add_eos_token else []
|
213 |
|
214 |
output = [0] * len(token_ids_0 + eos_token_id)
|
215 |
|
|
|
217 |
output += [1] * len(token_ids_1 + eos_token_id)
|
218 |
|
219 |
return output
|
|
|
|
|
|
|
|
tokenizer_config.json
CHANGED
@@ -2,7 +2,6 @@
|
|
2 |
"add_eos_token": false,
|
3 |
"add_special_tokens": true,
|
4 |
"clean_up_tokenization_spaces": true,
|
5 |
-
"eos_token": "<|endoftext|>",
|
6 |
"model_max_length": 1000000000000000019884624838656,
|
7 |
"pad_token": null,
|
8 |
"tokenizer_class": "XgenTokenizer",
|
|
|
2 |
"add_eos_token": false,
|
3 |
"add_special_tokens": true,
|
4 |
"clean_up_tokenization_spaces": true,
|
|
|
5 |
"model_max_length": 1000000000000000019884624838656,
|
6 |
"pad_token": null,
|
7 |
"tokenizer_class": "XgenTokenizer",
|