feat: update base_parser
Browse files- .pre-commit-config.yaml +8 -1
- llmdataparser/__init__.py +33 -0
- llmdataparser/base_parser.py +100 -0
- pyproject.toml +5 -3
.pre-commit-config.yaml
CHANGED
@@ -12,6 +12,7 @@ repos:
|
|
12 |
hooks:
|
13 |
- id: flake8
|
14 |
additional_dependencies: ["typing-extensions>=4.8.0"]
|
|
|
15 |
- repo: https://github.com/PyCQA/isort
|
16 |
rev: 5.12.0
|
17 |
hooks:
|
@@ -21,7 +22,13 @@ repos:
|
|
21 |
rev: v1.5.1
|
22 |
hooks:
|
23 |
- id: mypy
|
24 |
-
args:
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
additional_dependencies:
|
26 |
- "typing-extensions>=4.8.0"
|
27 |
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
|
12 |
hooks:
|
13 |
- id: flake8
|
14 |
additional_dependencies: ["typing-extensions>=4.8.0"]
|
15 |
+
args: ["--ignore=E203,E501,W503"]
|
16 |
- repo: https://github.com/PyCQA/isort
|
17 |
rev: 5.12.0
|
18 |
hooks:
|
|
|
22 |
rev: v1.5.1
|
23 |
hooks:
|
24 |
- id: mypy
|
25 |
+
args:
|
26 |
+
[
|
27 |
+
"--python-version=3.11",
|
28 |
+
"--install-types",
|
29 |
+
"--non-interactive",
|
30 |
+
"--ignore-missing-imports",
|
31 |
+
]
|
32 |
additional_dependencies:
|
33 |
- "typing-extensions>=4.8.0"
|
34 |
- repo: https://github.com/pre-commit/pre-commit-hooks
|
llmdataparser/__init__.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# llmdataparser/__init__.py
|
2 |
+
from typing import Type
|
3 |
+
|
4 |
+
from .base_parser import DatasetParser
|
5 |
+
from .mmlu_parser import MMLUDatasetParser
|
6 |
+
|
7 |
+
|
8 |
+
class ParserRegistry:
    """
    Registry to keep track of available parsers and provide them on request.
    """

    # Maps lowercase parser name -> parser class.
    _registry: dict = {}

    @classmethod
    def register_parser(cls, name: str, parser_class: Type[DatasetParser]) -> None:
        """Register *parser_class* under *name* (stored case-insensitively)."""
        cls._registry[name.lower()] = parser_class

    @classmethod
    def get_parser(cls, name: str, **kwargs) -> DatasetParser:
        """Instantiate and return the parser registered under *name*.

        Note: returns an *instance* (``parser_class(**kwargs)``), so the
        return annotation is ``DatasetParser``, not ``Type[DatasetParser]``.

        Raises:
            ValueError: if no parser is registered under *name*.
        """
        parser_class = cls._registry.get(name.lower())
        if parser_class is None:
            raise ValueError(f"Parser '{name}' is not registered.")
        return parser_class(**kwargs)

    @classmethod
    def list_parsers(cls) -> list[str]:
        """Returns a list of available parser names."""
        return list(cls._registry.keys())
|
30 |
+
|
31 |
+
|
32 |
+
# Register parsers
|
33 |
+
ParserRegistry.register_parser("mmlu", MMLUDatasetParser)
|
llmdataparser/base_parser.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
from dataclasses import dataclass
|
3 |
+
from functools import lru_cache
|
4 |
+
from typing import Any, Generic, TypeVar
|
5 |
+
|
6 |
+
import datasets
|
7 |
+
|
8 |
+
# Define the generic type variable
|
9 |
+
T = TypeVar("T", bound="ParseEntry")
|
10 |
+
|
11 |
+
|
12 |
+
@dataclass(frozen=True)
class ParseEntry:
    """Immutable base record; each dataset parser subclasses this with its own fields."""
|
15 |
+
|
16 |
+
|
17 |
+
class DatasetParser(ABC, Generic[T]):
    """
    Abstract base class defining the interface for all dataset parsers.
    """

    def __init__(self):
        # Filled in by parse(); starts out empty.
        self._parsed_data: list[T] = []

    @abstractmethod
    def load(self, **kwargs: Any) -> None:
        """Load the raw dataset; concrete parsers define the source."""

    @abstractmethod
    def parse(self, split_names: str | list[str] | None = None, **kwargs: Any) -> None:
        """
        Parse the loaded dataset into self._parsed_data.
        """

    @property
    def get_parsed_data(self) -> list[T]:
        """The parsed entries; raises if parse() has not populated them yet."""
        data = getattr(self, "_parsed_data", None)
        if not data:
            raise ValueError("Parsed data has not been initialized.")
        return data

    @abstractmethod
    def process_entry(self, row: dict[str, Any]) -> T:
        """Convert a single raw dataset row into a typed entry."""
|
44 |
+
|
45 |
+
|
46 |
+
# Base class for Hugging Face datasets
class HuggingFaceDatasetParser(DatasetParser[T]):
    """
    Base class for parsers that use datasets from Hugging Face.
    """

    _data_source: str  # Class variable for the dataset name

    def __init__(self):
        # Raw dataset as returned by datasets.load_dataset(): a DatasetDict
        # when no split is requested, a single Dataset when one is.
        self.raw_data = None
        self.task_names = []
        super().__init__()

    def get_task_names(self) -> list[str]:
        """Return the task/split names discovered by load()."""
        return self.task_names

    @staticmethod
    @lru_cache(maxsize=3)
    def load_dataset_cached(
        data_source: str, config_name: str = "default", **kwargs: Any
    ):
        """
        Cached static method to load a dataset from Hugging Face.

        NOTE: every value in **kwargs must be hashable, or lru_cache raises.
        """
        return datasets.load_dataset(data_source, config_name, **kwargs)

    def load(
        self,
        data_source: str | None = None,
        config_name: str = "all",
        trust_remote_code: bool = True,
        split: str | None = None,
        **kwargs: Any,
    ) -> None:
        """
        Load the dataset using the Hugging Face datasets library.

        Args:
            data_source: Hugging Face dataset name; falls back to the
                class-level ``_data_source`` when omitted.
            config_name: Dataset configuration to load.
            trust_remote_code: Passed through to ``datasets.load_dataset``.
                SECURITY: defaults to True, which lets the hub dataset run
                arbitrary code locally — only load datasets you trust.
            split: Optional single split name. When given, load_dataset
                returns a single ``Dataset`` instead of a ``DatasetDict``.

        Raises:
            ValueError: if no data source is available.
        """
        # Use class-level data_source if not provided. getattr (not direct
        # attribute access) so a missing class variable raises the intended
        # ValueError below instead of AttributeError.
        data_source = data_source or getattr(self, "_data_source", None)
        if not data_source:
            raise ValueError("The 'data_source' class variable must be defined.")

        # Call the cached static method
        self.raw_data = self.load_dataset_cached(
            data_source,
            config_name=config_name,
            trust_remote_code=trust_remote_code,
            split=split,
            **kwargs,
        )
        # BUG FIX: with a specific split, raw_data is a Dataset and has no
        # .keys(); the only "task" is the requested split itself.
        if split is not None:
            self.task_names = [split]
        else:
            self.task_names = list(self.raw_data.keys())
        print(
            f"Loaded dataset with {len(self.task_names)} tasks: {', '.join(self.task_names)}."
        )
        # Additional common initialization can be added here
|
pyproject.toml
CHANGED
@@ -49,11 +49,13 @@ profile = "black"
|
|
49 |
line_length = 88
|
50 |
known_first_party = ["llmdataparser"]
|
51 |
|
|
|
52 |
[tool.flake8]
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
]
|
|
|
57 |
|
58 |
[tool.ruff]
|
59 |
line-length = 88
|
|
|
49 |
line_length = 88
|
50 |
known_first_party = ["llmdataparser"]
|
51 |
|
52 |
+
# .flake8
|
53 |
[tool.flake8]
|
54 |
+
ignore = ['E231', 'E241', "E501"]
|
55 |
+
per-file-ignores = [
|
56 |
+
'__init__.py:F401',
|
57 |
]
|
58 |
+
count = true
|
59 |
|
60 |
[tool.ruff]
|
61 |
line-length = 88
|