Tokenizer suggestions

#2
by psinger - opened
README.md CHANGED
@@ -6,13 +6,7 @@ license: apache-2.0
6
 
7
  Official research release for the family of **XGen** models (`7B`) by Salesforce AI Research:
8
 
9
- *Title*: [Long Sequence Modeling with XGen: A 7B LLM Trained on 8K Input Sequence Length](https://arxiv.org/abs/2309.03450)
10
-
11
- *Authors*: [Erik Nijkamp](https://eriknijkamp.com)\*, Tian Xie\*, [Hiroaki Hayashi](https://hiroakih.me/)\*, [Bo Pang](https://scholar.google.com/citations?user=s9fNEVEAAAAJ&hl=en)\*, Congying Xia\*, Chen Xing, Jesse Vig, Semih Yavuz, Philippe Laban, Ben Krause, Senthil Purushwalkam, Tong Niu, Wojciech Kryscinski, Lidiya Murakhovs'ka, Prafulla Kumar Choubey, Alex Fabbri, Ye Liu, Rui Meng, Lifu Tu, Meghana Bhat, [Chien-Sheng Wu](https://jasonwu0731.github.io/), Silvio Savarese, [Yingbo Zhou](https://scholar.google.com/citations?user=H_6RQ7oAAAAJ&hl=en), [Shafiq Rayhan Joty](https://raihanjoty.github.io/), [Caiming Xiong](http://cmxiong.com/).
12
-
13
- (* indicates equal contribution)
14
-
15
- Correspondence to: [Shafiq Rayhan Joty](mailto:[email protected]), [Caiming Xiong](mailto:[email protected])
16
 
17
  ## Models
18
 
@@ -55,9 +49,9 @@ print(tokenizer.decode(sample[0]))
55
  ```bibtex
56
  @misc{XGen,
57
  title={Long Sequence Modeling with XGen: A 7B LLM Trained on 8K Input Sequence Length},
58
- author={Erik Nijkamp, Tian Xie, Hiroaki Hayashi, Bo Pang, Congying Xia, Chen Xing, Jesse Vig, Semih Yavuz, Philippe Laban, Ben Krause, Senthil Purushwalkam, Tong Niu, Wojciech Kryscinski, Lidiya Murakhovs'ka, Prafulla Kumar Choubey, Alex Fabbri, Ye Liu, Rui Meng, Lifu Tu, Meghana Bhat, Chien-Sheng Wu, Silvio Savarese, Yingbo Zhou, Shafiq Rayhan Joty, Caiming Xiong},
59
- howpublished={ArXiv},
60
  year={2023},
61
- url={https://arxiv.org/abs/2309.03450}
62
  }
63
  ```
 
6
 
7
  Official research release for the family of **XGen** models (`7B`) by Salesforce AI Research:
8
 
9
+ *Title*: [Long Sequence Modeling with XGen: A 7B LLM Trained on 8K Input Sequence Length](https://blog.salesforceairesearch.com/xgen/)
 
 
 
 
 
 
10
 
11
  ## Models
12
 
 
49
  ```bibtex
50
  @misc{XGen,
51
  title={Long Sequence Modeling with XGen: A 7B LLM Trained on 8K Input Sequence Length},
52
+ author={Salesforce AI Research},
53
+ howpublished={Salesforce AI Research Blog},
54
  year={2023},
55
+ url={https://blog.salesforceairesearch.com/xgen-7b/}
56
  }
57
  ```
config.json CHANGED
@@ -2,8 +2,8 @@
2
  "architectures": [
3
  "LlamaForCausalLM"
4
  ],
5
- "bos_token_id": 50256,
6
- "eos_token_id": 50256,
7
  "hidden_act": "silu",
8
  "hidden_size": 4096,
9
  "initializer_range": 0.02,
 
2
  "architectures": [
3
  "LlamaForCausalLM"
4
  ],
5
+ "bos_token_id": 1,
6
+ "eos_token_id": 2,
7
  "hidden_act": "silu",
8
  "hidden_size": 4096,
9
  "initializer_range": 0.02,
generation_config.json CHANGED
@@ -1,6 +1,7 @@
1
  {
2
  "_from_model_config": true,
3
- "bos_token_id": 50256,
4
- "eos_token_id": 50256,
 
5
  "transformers_version": "4.29.2"
6
  }
 
1
  {
2
  "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
  "transformers_version": "4.29.2"
7
  }
tokenization_xgen.py CHANGED
@@ -25,7 +25,7 @@ MAX_MODEL_INPUT_SIZES = {
25
  }
26
 
27
 
28
- def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
29
  if not add_special:
30
  return tiktoken.get_encoding(base)
31
 
@@ -60,18 +60,9 @@ def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
60
  ]
61
  return fim_tokens
62
 
63
- def include_additional_tokens():
64
- tokens = []
65
- tokens += [f"<dummy_{i}>" for i in range(4)]
66
- tokens.append("<sep>") # 50317
67
- tokens.append("<eom>") # 50318
68
- tokens += [f"<mask_{i}>" for i in reversed(range(1, 51199-50318+1))]
69
- return tokens
70
-
71
  add_whitespaces = include_whitespace(n_min=2, n_max=32)
72
  add_tabs = include_tabs(n_min=2, n_max=10)
73
  fim_tokens = include_fim_tokens()
74
- additional_tokens = include_additional_tokens()
75
 
76
  tokenizer = tiktoken.get_encoding(base)
77
 
@@ -91,13 +82,7 @@ def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
91
  for sp in fim_tokens:
92
  special_tokens[sp] = idx
93
  idx += 1
94
- for sp in additional_tokens:
95
- special_tokens[sp] = idx
96
- idx += 1
97
 
98
- if pad_token and pad_token not in tokenizer._special_tokens and pad_token not in special_tokens:
99
- special_tokens[pad_token] = idx
100
- idx += 1
101
  # In production, load the arguments directly instead of accessing private attributes
102
  # See openai_public.py for examples of arguments for specific encodings
103
  enc = tiktoken.Encoding(
@@ -127,22 +112,19 @@ class XgenTokenizer(PreTrainedTokenizer):
127
  def __init__(
128
  self,
129
  pad_token=None,
130
- eos_token="<|endoftext|>",
131
  add_eos_token=False,
132
  add_special_tokens=True,
133
  **kwargs,
134
  ):
135
- pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
136
- eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
137
- self.add_eos_token = add_eos_token
138
- self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
139
  super().__init__(
140
- pad_token=pad_token_added,
141
- eos_token=eos_token_added,
142
  add_eos_token=add_eos_token,
143
  add_special_tokens=add_special_tokens,
144
  **kwargs,
145
  )
 
 
146
 
147
  @property
148
  def vocab_size(self):
@@ -151,7 +133,7 @@ class XgenTokenizer(PreTrainedTokenizer):
151
 
152
  def get_vocab(self):
153
  """Returns vocab as a dict"""
154
- vocab = {self.encoder.decode_single_token_bytes(i): i for i in range(self.vocab_size)}
155
  return vocab
156
 
157
  def _tokenize(self, text, **kwargs):
@@ -160,25 +142,18 @@ class XgenTokenizer(PreTrainedTokenizer):
160
 
161
  def _convert_token_to_id(self, token):
162
  """Converts a token (str) in an id using the vocab."""
163
- if isinstance(token, str):
164
- return self.encoder.encode_single_token(token)
165
- else:
166
- return token
167
 
168
  def _convert_id_to_token(self, index):
169
  """Converts an index (integer) in a token (str) using the vocab."""
170
- return self.encoder.decode_single_token_bytes(index).decode("utf-8")
171
 
172
- def _decode(self, token_ids, skip_special_tokens: bool = False, **kwargs):
173
- if not isinstance(token_ids, list):
174
- token_ids = [token_ids]
175
- if skip_special_tokens:
176
- token_ids = [t for t in token_ids if t not in self.all_special_ids]
177
  return self.encoder.decode(token_ids)
178
 
179
  def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
180
  """Build model inputs from a sequence by appending eos_token_id."""
181
- eos_token_id = [self.eos_token_id] if self.add_eos_token else []
182
 
183
  output = token_ids_0 + eos_token_id
184
 
@@ -234,7 +209,7 @@ class XgenTokenizer(PreTrainedTokenizer):
234
  Returns:
235
  `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
236
  """
237
- eos_token_id = [self.eos_token_id] if self.add_eos_token else []
238
 
239
  output = [0] * len(token_ids_0 + eos_token_id)
240
 
@@ -242,7 +217,3 @@ class XgenTokenizer(PreTrainedTokenizer):
242
  output += [1] * len(token_ids_1 + eos_token_id)
243
 
244
  return output
245
-
246
- # has no vocab file
247
- def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
248
- return ()
 
25
  }
26
 
27
 
28
+ def tiktoken_tokenizer(base="gpt2", add_special=True):
29
  if not add_special:
30
  return tiktoken.get_encoding(base)
31
 
 
60
  ]
61
  return fim_tokens
62
 
 
 
 
 
 
 
 
 
63
  add_whitespaces = include_whitespace(n_min=2, n_max=32)
64
  add_tabs = include_tabs(n_min=2, n_max=10)
65
  fim_tokens = include_fim_tokens()
 
66
 
67
  tokenizer = tiktoken.get_encoding(base)
68
 
 
82
  for sp in fim_tokens:
83
  special_tokens[sp] = idx
84
  idx += 1
 
 
 
85
 
 
 
 
86
  # In production, load the arguments directly instead of accessing private attributes
87
  # See openai_public.py for examples of arguments for specific encodings
88
  enc = tiktoken.Encoding(
 
112
  def __init__(
113
  self,
114
  pad_token=None,
 
115
  add_eos_token=False,
116
  add_special_tokens=True,
117
  **kwargs,
118
  ):
119
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
 
 
 
120
  super().__init__(
121
+ pad_token=pad_token,
 
122
  add_eos_token=add_eos_token,
123
  add_special_tokens=add_special_tokens,
124
  **kwargs,
125
  )
126
+ self.add_eos_token = add_eos_token
127
+ self.encoder = tiktoken_tokenizer(base="gpt2", add_special=add_special_tokens)
128
 
129
  @property
130
  def vocab_size(self):
 
133
 
134
  def get_vocab(self):
135
  """Returns vocab as a dict"""
136
+ vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
137
  return vocab
138
 
139
  def _tokenize(self, text, **kwargs):
 
142
 
143
  def _convert_token_to_id(self, token):
144
  """Converts a token (str) in an id using the vocab."""
145
+ return token
 
 
 
146
 
147
  def _convert_id_to_token(self, index):
148
  """Converts an index (integer) in a token (str) using the vocab."""
149
+ return self.encoder.decode_single_token_bytes(index)
150
 
151
+ def _decode(self, token_ids: List[int], skip_special_tokens: bool = False, **kwargs):
 
 
 
 
152
  return self.encoder.decode(token_ids)
153
 
154
  def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
155
  """Build model inputs from a sequence by appending eos_token_id."""
156
+ eos_token_id = [50256] if self.add_eos_token else []
157
 
158
  output = token_ids_0 + eos_token_id
159
 
 
209
  Returns:
210
  `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
211
  """
212
+ eos_token_id = [50256] if self.add_eos_token else []
213
 
214
  output = [0] * len(token_ids_0 + eos_token_id)
215
 
 
217
  output += [1] * len(token_ids_1 + eos_token_id)
218
 
219
  return output
 
 
 
 
tokenizer_config.json CHANGED
@@ -2,7 +2,6 @@
2
  "add_eos_token": false,
3
  "add_special_tokens": true,
4
  "clean_up_tokenization_spaces": true,
5
- "eos_token": "<|endoftext|>",
6
  "model_max_length": 1000000000000000019884624838656,
7
  "pad_token": null,
8
  "tokenizer_class": "XgenTokenizer",
 
2
  "add_eos_token": false,
3
  "add_special_tokens": true,
4
  "clean_up_tokenization_spaces": true,
 
5
  "model_max_length": 1000000000000000019884624838656,
6
  "pad_token": null,
7
  "tokenizer_class": "XgenTokenizer",