Update gptx_tokenizer.py
Browse files- gptx_tokenizer.py +53 -23
gptx_tokenizer.py
CHANGED
@@ -62,7 +62,7 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
|
|
62 |
f"<placeholder_tok_{i}>" for i in range(256)
|
63 |
]
|
64 |
|
65 |
-
|
66 |
if not os.path.isfile(config_path):
|
67 |
config_path = try_to_load_from_cache(repo_id=repo_id, filename=Path(config_path).name)
|
68 |
if not config_path:
|
@@ -74,43 +74,73 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
|
|
74 |
def instantiate_from_file_or_name(self, model_file_or_name: str, repo_id: str = None):
    """
    Load the tokenizer model from a local file or download it from a repository.

    Args:
        model_file_or_name (str): Path to the model file or the model name.
        repo_id (str, optional): Repository ID from which to download the model file.

    Returns:
        spm.SentencePieceProcessor: Loaded SentencePieceProcessor instance.

    Raises:
        OSError: If the model file cannot be downloaded or loaded.
    """
    if not os.path.isfile(model_file_or_name):
        try:
            # List all files in the repo and keep only SentencePiece model files.
            repo_files = list_repo_files(repo_id)
            tokenizer_files = [f for f in repo_files if f.endswith('.model')]
            if not tokenizer_files:
                raise FileNotFoundError(f"No .model file found in repository {repo_id}")

            # Use the first .model file found.
            model_file = tokenizer_files[0]
            print(f"Found tokenizer model file: {model_file}")

            # Download the file into the local cache and use its local path.
            model_file_or_name = hf_hub_download(repo_id=repo_id, filename=model_file)
            print(f"Downloaded tokenizer model to: {model_file_or_name}")
        except Exception as e:
            # Chain the original exception so the root cause stays in the traceback.
            raise OSError(f"Failed to download tokenizer model: {str(e)}") from e

    try:
        return spm.SentencePieceProcessor(model_file=model_file_or_name)
    except Exception as e:
        # Chain the original exception so the root cause stays in the traceback.
        raise OSError(f"Failed to load tokenizer model: {str(e)}") from e
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
def __init__(
|
115 |
self,
|
116 |
model_path: Optional[str] = None,
|
|
|
62 |
f"<placeholder_tok_{i}>" for i in range(256)
|
63 |
]
|
64 |
|
65 |
+
def find_tokenizer_config(self, config_path: Path, repo_id: str = None) -> Optional[Path]:
|
66 |
if not os.path.isfile(config_path):
|
67 |
config_path = try_to_load_from_cache(repo_id=repo_id, filename=Path(config_path).name)
|
68 |
if not config_path:
|
|
|
74 |
def instantiate_from_file_or_name(self, model_file_or_name: str, repo_id: str = None):
    """
    Load the tokenizer model from a file or download it from a repository.

    Args:
        model_file_or_name (str): Path to the model file or the model name.
        repo_id (str, optional): Repository ID from which to download the model file.

    Returns:
        spm.SentencePieceProcessor: Loaded SentencePieceProcessor instance.

    Raises:
        OSError: If the model file cannot be loaded or downloaded.
    """
    if not os.path.isfile(model_file_or_name):
        # Check the local Hugging Face cache before going to the network.
        model_file_or_name = try_to_load_from_cache(repo_id=repo_id, filename=Path(model_file_or_name).name)
        if not model_file_or_name:
            # Not cached: resolve and download the .model file from the hub.
            model_file_or_name = self._download_model_from_hub(repo_id=repo_id)

    try:
        return spm.SentencePieceProcessor(model_file=model_file_or_name)
    except Exception as e:
        # Chain the original exception so the root cause stays in the traceback.
        raise OSError(f"Failed to load tokenizer model: {str(e)}") from e
|
98 |
+
|
99 |
+
def _download_model_from_hub(self, repo_id: str) -> Optional[str]:
|
100 |
+
try:
|
101 |
+
# List all files in the repo
|
102 |
+
repo_files = list_repo_files(repo_id)
|
103 |
+
|
104 |
+
# Find the tokenizer model file
|
105 |
+
tokenizer_files = [f for f in repo_files if f.endswith('.model')]
|
106 |
+
if not tokenizer_files:
|
107 |
+
raise FileNotFoundError(f"No .model file found in repository {repo_id}")
|
108 |
+
|
109 |
+
# Use the first .model file found
|
110 |
+
model_file = tokenizer_files[0]
|
111 |
+
print(f"Found tokenizer model file: {model_file}")
|
112 |
+
|
113 |
+
# Download the file
|
114 |
+
model_file_or_name = hf_hub_download(repo_id=repo_id, filename=model_file)
|
115 |
+
print(f"Downloaded tokenizer model to: {model_file_or_name}")
|
116 |
+
except Exception as e:
|
117 |
+
raise OSError(f"Failed to download tokenizer model: {str(e)}")
|
118 |
+
|
119 |
+
return model_file_or_name
|
120 |
+
|
121 |
+
def _download_config_from_hub(self, repo_id: str):
|
122 |
+
if repo_id is None:
|
123 |
+
raise ValueError("repo_id must be provided if config_path is not a local file")
|
124 |
+
|
125 |
+
try:
|
126 |
+
# List all files in the repo
|
127 |
+
repo_files = list_repo_files(repo_id)
|
128 |
+
|
129 |
+
# Find the tokenizer config file
|
130 |
+
tokenizer_files = [f for f in repo_files if f.endswith('tokenizer_config.json')]
|
131 |
+
if not tokenizer_files:
|
132 |
+
raise FileNotFoundError(f"No tokenizer_config.json file found in repository {repo_id}")
|
133 |
+
|
134 |
+
# Use the first tokenizer_config.json file found
|
135 |
+
tokenizer_config_file = tokenizer_files[0]
|
136 |
+
print(f"Found tokenizer config file: {tokenizer_config_file}")
|
137 |
+
|
138 |
+
# Download the file
|
139 |
+
tokenizer_config_file_or_name = hf_hub_download(repo_id=repo_id, filename=tokenizer_config_file)
|
140 |
+
print(f"Downloaded tokenizer config file to: {tokenizer_config_file_or_name}")
|
141 |
+
return tokenizer_config_file_or_name
|
142 |
+
except Exception as e:
|
143 |
+
raise OSError(f"Failed to download tokenizer model: {str(e)}")
|
144 |
def __init__(
|
145 |
self,
|
146 |
model_path: Optional[str] = None,
|