Text Generation
Transformers
Safetensors
llama
text-generation-inference
Inference Endpoints
mfromm committed on
Commit
691aceb
·
verified ·
1 Parent(s): b506f0b

Update gptx_tokenizer.py

Browse files
Files changed (1) hide show
  1. gptx_tokenizer.py +16 -23
gptx_tokenizer.py CHANGED
@@ -11,12 +11,7 @@ from huggingface_hub import hf_hub_download, list_repo_files, try_to_load_from_c
11
  from transformers.tokenization_utils import PreTrainedTokenizer
12
  from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
13
 
14
- # Define special tokens used in the tokenizer
15
- EOD_TOKEN = "<eod>"
16
- PAD_TOKEN = "<pad>"
17
- BOS_TOKEN = "<s>"
18
- EOS_TOKEN = "</s>"
19
- UNK_TOKEN = "<unk>"
20
  REPO_ID = "openGPT-X/Teuken-7B-instruct-commercial-v0.4"
21
 
22
  class HFGPTXTokenizer(PreTrainedTokenizer):
@@ -171,22 +166,16 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
171
  # Since there is no corresponding mapping for EOS from `tok` in
172
  # HuggingFace, it is treated as an additional special token.
173
  # Same for all other special tokens.
174
- self.eos_token = EOD_TOKEN
175
- self.bos_token = BOS_TOKEN
176
- self.pad_token = PAD_TOKEN
177
-
178
- if not self.additional_special_tokens:
179
- self.additional_special_tokens = [
180
- token
181
- for token in self.create_list_of_special_tokens()
182
- # Filter out the special tokens we added manually.
183
- if token
184
- not in [
185
- self.eos_token,
186
- self.bos_token,
187
- self.pad_token,
188
- ]
189
- ]
190
  if config_path is None:
191
  config_path = str(Path(cp_path) / TOKENIZER_CONFIG_FILE)
192
 
@@ -243,6 +232,7 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
243
  self,
244
  token_ids: Union[List[int], List[List[int]]],
245
  num_threads: Optional[int] = None,
 
246
  ) -> str:
247
  """
248
  Decode a list of token IDs into a string.
@@ -252,7 +242,10 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
252
  Returns:
253
  str: Decoded string.
254
  """
255
- return self.tok.decode(input=token_ids, num_threads=num_threads)
 
 
 
256
 
257
  def _convert_id_to_token(self, index: int) -> str:
258
  """
 
11
  from transformers.tokenization_utils import PreTrainedTokenizer
12
  from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
13
 
14
+
 
 
 
 
 
15
  REPO_ID = "openGPT-X/Teuken-7B-instruct-commercial-v0.4"
16
 
17
  class HFGPTXTokenizer(PreTrainedTokenizer):
 
166
  # Since there is no corresponding mapping for EOS from `tok` in
167
  # HuggingFace, it is treated as an additional special token.
168
  # Same for all other special tokens.
169
+
170
+
171
+ self.unk_token = "<unk>"
172
+ self.eos_token = "</s>"
173
+ self.bos_token = "<s>"
174
+ self.pad_token = "<pad>"
175
+ self.eod_token = "<eod>"
176
+
177
+ self.additional_special_tokens = self.create_list_of_special_tokens()
178
+
 
 
 
 
 
 
179
  if config_path is None:
180
  config_path = str(Path(cp_path) / TOKENIZER_CONFIG_FILE)
181
 
 
232
  self,
233
  token_ids: Union[List[int], List[List[int]]],
234
  num_threads: Optional[int] = None,
235
+ skip_special_tokens: bool = False,
236
  ) -> str:
237
  """
238
  Decode a list of token IDs into a string.
 
242
  Returns:
243
  str: Decoded string.
244
  """
245
+ output = self.tok.decode(input=token_ids, num_threads=num_threads)
246
+ if skip_special_tokens:
247
+ token_ids = [token for token in output if token not in self.additional_special_tokens]
248
+ return output
249
 
250
  def _convert_id_to_token(self, index: int) -> str:
251
  """