JeffYang52415 committed on
Commit
9682764
·
unverified ·
1 Parent(s): 9bc0c66

refactor: base parser interface

Browse files
Files changed (1) hide show
  1. llmdataparser/base_parser.py +189 -26
llmdataparser/base_parser.py CHANGED
@@ -1,7 +1,7 @@
1
  from abc import ABC, abstractmethod
2
  from dataclasses import dataclass
3
  from functools import lru_cache
4
- from typing import Any, Generic, TypeVar
5
 
6
  import datasets
7
 
@@ -9,12 +9,17 @@ import datasets
9
  T = TypeVar("T", bound="ParseEntry")
10
 
11
 
12
- @dataclass(frozen=True)
13
  class ParseEntry:
14
  """A simple base class for entries, customizable by each dataset parser."""
15
 
 
 
 
 
16
 
17
- class DatasetParser(ABC, Generic[T]):
 
18
  """
19
  Abstract base class defining the interface for all dataset parsers.
20
  """
@@ -39,40 +44,178 @@ class DatasetParser(ABC, Generic[T]):
39
  return self._parsed_data
40
 
41
  @abstractmethod
42
- def process_entry(self, row: dict[str, Any]) -> T:
43
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
 
46
- # Base class for Hugging Face datasets
47
  class HuggingFaceDatasetParser(DatasetParser[T]):
48
  """
49
  Base class for parsers that use datasets from Hugging Face.
50
  """
51
 
52
- _data_source: str # Class variable for the dataset name
 
 
 
 
 
 
 
53
 
54
- def __init__(self):
55
- self.raw_data = None
56
- self.task_names = []
 
 
 
 
 
 
57
  super().__init__()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- def get_task_names(self) -> list[str]:
60
- return self.task_names
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  @staticmethod
63
  @lru_cache(maxsize=3)
64
  def load_dataset_cached(
65
- data_source: str, config_name: str = "default", **kwargs: Any
66
  ):
67
  """
68
  Cached static method to load a dataset from Hugging Face.
69
  """
70
- return datasets.load_dataset(data_source, config_name, **kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  def load(
73
  self,
74
- data_source: str | None = None,
75
- config_name: str = "all",
76
  trust_remote_code: bool = True,
77
  split: str | None = None,
78
  **kwargs: Any,
@@ -80,21 +223,41 @@ class HuggingFaceDatasetParser(DatasetParser[T]):
80
  """
81
  Load the dataset using the Hugging Face datasets library.
82
  """
83
- # Use class-level data_source if not provided
84
- data_source = data_source or self._data_source
85
- if not data_source:
86
- raise ValueError("The 'data_source' class variable must be defined.")
87
 
88
  # Call the cached static method
89
- self.raw_data = self.load_dataset_cached(
90
- data_source,
91
- config_name=config_name,
92
  trust_remote_code=trust_remote_code,
93
  split=split,
94
  **kwargs,
95
  )
96
- self.task_names = list(self.raw_data.keys())
 
 
 
 
 
 
 
 
97
  print(
98
- f"Loaded dataset with {len(self.task_names)} tasks: {', '.join(self.task_names)}."
99
  )
100
- # Additional common initialization can be added here
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from abc import ABC, abstractmethod
2
  from dataclasses import dataclass
3
  from functools import lru_cache
4
+ from typing import Any, ClassVar, Generic, TypeVar
5
 
6
  import datasets
7
 
 
9
  T = TypeVar("T", bound="ParseEntry")
10
 
11
 
12
@dataclass(frozen=True, kw_only=True, slots=True)
class ParseEntry:
    """A simple base class for entries, customizable by each dataset parser."""

    # Final prompt text for this entry — presumably includes the system prompt
    # and question formatting done by the subclass parser; confirm there.
    prompt: str
    # Normalized answer string produced by the subclass parser.
    answer: str
    # Question text as it appears in the source dataset, unmodified.
    raw_question: str
    # Answer text as it appears in the source dataset, unmodified.
    raw_answer: str
20
 
21
+
22
+ class DatasetParser(Generic[T], ABC):
23
  """
24
  Abstract base class defining the interface for all dataset parsers.
25
  """
 
44
  return self._parsed_data
45
 
46
    @abstractmethod
    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> T:
        """
        Process a single entry from the dataset.

        Subclasses implement this to convert one raw dataset row into a
        structured entry object.

        Args:
            row: A dictionary representing a single entry from the dataset.
            task_name: Optional task name for the entry; when None the
                implementation is expected to fall back to its own default.
            **kwargs: Additional keyword arguments forwarded by the caller.

        Returns:
            T: The processed entry, typically an instance of a subclass of ParseEntry.
        """
61
+
62
+
63
@dataclass(frozen=True, kw_only=True, slots=True)
class HuggingFaceParseEntry(ParseEntry):
    """ParseEntry with an additional task_name field."""

    # Name of the dataset task/config this entry came from (e.g. "algebra").
    task_name: str
68
 
69
 
 
70
class HuggingFaceDatasetParser(DatasetParser[T]):
    """
    Base class for parsers that use datasets from Hugging Face.

    Subclasses must define the ClassVar fields below and implement
    ``process_entry`` to turn a raw dataset row into a parsed entry.
    """

    # _data_source is the name of the dataset, e.g. "lighteval/MATH"
    _data_source: ClassVar[str]
    # _task_names is the list of tasks in the dataset, e.g. ["algebra", "geometry", "statistics"]
    _task_names: ClassVar[list[str]]
    # _default_task is the default task to use if no task is specified, e.g. "algebra"
    _default_task: ClassVar[str]
    # _default_system_prompt is the default system prompt to use if no system prompt is specified
    _default_system_prompt: ClassVar[str]

    def __init__(self, system_prompt: str | None = None, **kwargs):
        """
        Initialize a HuggingFaceDatasetParser.

        Args:
            system_prompt: Optional custom system prompt to use instead of the default.
                If not provided, will use the class's _default_system_prompt.
            **kwargs: Additional keyword arguments passed to the parent class.
        """
        super().__init__()
        # raw_data is the dataset loaded from HuggingFace (split name -> split data)
        self.raw_data: dict[str, Any] | None = None
        # split_names is the list of splits in the dataset, e.g. ["train", "test", "validation"]
        self.split_names: list[str] = []
        # _current_task is the task currently being processed, e.g. "algebra"
        self._current_task: str = ""
        # _system_prompt is the system prompt currently being used
        self._system_prompt: str = system_prompt or self._default_system_prompt

    def _get_current_task(self, data_entry: dict[str, Any] | None = None) -> str:
        """
        Get the currently loaded task name.

        Args:
            data_entry: Optional dictionary containing entry data that might include
                task information.

        Returns:
            str: The task name from either the data entry (if available) or the
                currently set task, falling back to the class default.
        """
        # If the subclass knows how to derive the task from an entry, prefer that.
        if data_entry is not None and hasattr(self, "_get_task_from_entry"):
            try:
                return self._get_task_from_entry(data_entry)
            except (KeyError, AttributeError):
                # Derivation failed; fall through to the task set during load().
                pass

        return self._current_task or self._default_task

    @property
    def task_names(self) -> list[str]:
        """Get all available task names."""
        return self._task_names

    @property
    def total_tasks(self) -> int:
        """Get total number of available tasks."""
        return len(self._task_names)

    @property
    def get_huggingface_link(self) -> str:
        """URL of the dataset's page on the Hugging Face Hub."""
        return "https://huggingface.co/datasets/" + self._data_source

    @staticmethod
    @lru_cache(maxsize=3)
    def load_dataset_cached(
        data_source: str, task_name: str = "default", **kwargs: Any
    ):
        """
        Cached static method to load a dataset from Hugging Face.

        NOTE: all keyword arguments must be hashable for lru_cache to work.
        """
        return datasets.load_dataset(data_source, task_name, **kwargs)

    def parse(
        self,
        split_names: str | list[str] | None = None,
        force: bool = False,
        **kwargs: Any,
    ) -> None:
        """
        Parse the loaded dataset splits into structured entries.

        Args:
            split_names: Dataset splits to parse. Can be:
                - None: Parse all available splits
                - str: Parse a single split (e.g., "train")
                - list[str]: Parse multiple splits (e.g., ["train", "test"])
            force: If True, overwrites existing parsed data without confirmation.
                If False and parsed data exists, prompts for confirmation.
            **kwargs: Additional keyword arguments passed to process_entry

        Raises:
            ValueError: If no data is loaded or if a specified split name doesn't exist
        """
        if self.raw_data is None:
            raise ValueError("No data loaded. Please load the dataset first.")

        if self._parsed_data and not force:
            response = input(
                f"Found {len(self._parsed_data)} existing parsed entries. "
                "Do you want to overwrite them? [y/N]: "
            ).lower()
            if response not in ("y", "yes"):
                print("Parsing cancelled. Existing data preserved.")
                return

        self._parsed_data.clear()

        # Normalize split_names to a list of split names.
        if split_names is None:
            split_names = self.split_names
        elif isinstance(split_names, str):
            split_names = [split_names]

        for split_name in split_names:
            if split_name not in self.split_names:
                raise ValueError(f"Split '{split_name}' not found in the dataset.")

            dataset_split = self.raw_data[split_name]
            total_entries = len(dataset_split)
            print(f"Processing {split_name} split with {total_entries} entries...")

            # Bind index up front so the summary below is well-defined even for
            # an empty split (previously `index` could be unbound -> NameError).
            index = 0
            for index, entry in enumerate(dataset_split, start=1):
                try:
                    task_name = self._get_current_task(data_entry=entry)
                    parsed_entry = self.process_entry(entry, task_name, **kwargs)
                    self._parsed_data.append(parsed_entry)

                    # Print progress every 100 entries
                    if index % 100 == 0:
                        print(
                            f"Processed {index}/{total_entries} entries from '{split_name}'"
                        )

                except Exception as e:
                    # Best-effort parsing: report the failure and skip the entry.
                    print(f"Error processing entry {index} in {split_name}: {str(e)}")
                    continue

            print(f"Completed parsing {index} entries from '{split_name}'")

        print(f"Total parsed entries: {len(self._parsed_data)}")

    def load(
        self,
        task_name: str | None = None,
        trust_remote_code: bool = True,
        split: str | None = None,
        **kwargs: Any,
    ) -> None:
        """
        Load the dataset using the Hugging Face datasets library.

        Args:
            task_name: Dataset config/task to load; defaults to _default_task.
            trust_remote_code: Passed through to datasets.load_dataset.
            split: Optional single split to load; when given, raw_data is keyed
                by that split name only.
            **kwargs: Additional keyword arguments for datasets.load_dataset
                (must be hashable — see load_dataset_cached).
        """
        # Set the task name
        self._current_task = task_name or self._default_task

        # Call the cached static method
        raw_data = self.load_dataset_cached(
            self._data_source,
            task_name=self._current_task,
            trust_remote_code=trust_remote_code,
            split=split,
            **kwargs,
        )

        # Handle split-specific loading: requesting a concrete split returns the
        # split data directly, otherwise a mapping keyed by split name.
        if split:
            self.raw_data = {split: raw_data}
            self.split_names = [split]
        else:
            self.raw_data = raw_data
            self.split_names = list(raw_data.keys())

        print(
            f"Loaded dataset with {len(self.split_names)} groups: {', '.join(self.split_names)}."
        )

    def __repr__(self) -> str:
        status = "loaded" if self.raw_data is not None else "not loaded"
        parsed_count = len(self._parsed_data) if self._parsed_data else 0
        return (
            f"{self.__class__.__name__}("
            f"data_source='{self._data_source}', "
            f"task='{self._current_task}', "
            f"status='{status}', "
            f"parsed_entries={parsed_count}"
            ")"
        )

    def __str__(self) -> str:
        return self.__repr__()