openGPT-X
/

Teuken-7B-instruct-commercial-v0.4

@@ -11,12 +11,7 @@ from huggingface_hub import hf_hub_download, list_repo_files, try_to_load_from_c
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
-# Define special tokens used in the tokenizer
-EOD_TOKEN = "<eod>"
-PAD_TOKEN = "<pad>"
-BOS_TOKEN = "<s>"
-EOS_TOKEN = "</s>"
-UNK_TOKEN = "<unk>"
 REPO_ID = "openGPT-X/Teuken-7B-instruct-commercial-v0.4"
 class HFGPTXTokenizer(PreTrainedTokenizer):
@@ -171,22 +166,16 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
         # Since there is no corresponding mapping for EOS from `tok` in
         # HuggingFace, it is treated as an additional special token.
         # Same for all other special tokens.
-        self.eos_token = EOD_TOKEN
-        self.bos_token = BOS_TOKEN
-        self.pad_token = PAD_TOKEN
-        if not self.additional_special_tokens:
-            self.additional_special_tokens = [
-                token
-                for token in self.create_list_of_special_tokens()
-                # Filter out the special tokens we added manually.
-                if token
-                not in [
-                    self.eos_token,
-                    self.bos_token,
-                    self.pad_token,
-                ]
-            ]
         if config_path is None:
             config_path = str(Path(cp_path) / TOKENIZER_CONFIG_FILE)
@@ -243,6 +232,7 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
         self,
         token_ids: Union[List[int], List[List[int]]],
         num_threads: Optional[int] = None,
     ) -> str:
         """
         Decode a list of token IDs into a string.
@@ -252,7 +242,10 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
         Returns:
             str: Decoded string.
         """
-        return self.tok.decode(input=token_ids, num_threads=num_threads)
     def _convert_id_to_token(self, index: int) -> str:
         """

 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
 REPO_ID = "openGPT-X/Teuken-7B-instruct-commercial-v0.4"
 class HFGPTXTokenizer(PreTrainedTokenizer):
         # Since there is no corresponding mapping for EOS from `tok` in
         # HuggingFace, it is treated as an additional special token.
         # Same for all other special tokens.
+        self.unk_token = "<unk>"
+        self.eos_token = "</s>"
+        self.bos_token = "<s>"
+        self.pad_token = "<pad>"
+        self.eod_token = "<eod>"
+        self.additional_special_tokens = self.create_list_of_special_tokens()
         if config_path is None:
             config_path = str(Path(cp_path) / TOKENIZER_CONFIG_FILE)
         self,
         token_ids: Union[List[int], List[List[int]]],
         num_threads: Optional[int] = None,
+        skip_special_tokens: bool = False,
     ) -> str:
         """
         Decode a list of token IDs into a string.
         Returns:
             str: Decoded string.
         """
+        output = self.tok.decode(input=token_ids, num_threads=num_threads)
+        if skip_special_tokens:
+            token_ids = [token for token in output if token not in self.additional_special_tokens]
+        return output
     def _convert_id_to_token(self, index: int) -> str:
         """