Spaces:

JeffYang52415
/

LLMEval-Dataset-Parser

Sleeping

App Files Community

JeffYang52415 committed on Dec 29, 2024

Commit

299e68a

unverified ·

1 Parent(s): fb32f8e

refactor: description&metrics interface

Browse files

Files changed (3) hide show

llmdataparser/base_parser.py +76 -1
llmdataparser/bbh_parser.py +54 -47
llmdataparser/tmlu_parser.py +67 -52

llmdataparser/base_parser.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from functools import lru_cache
-from typing import Any, ClassVar, Generic, TypeVar
 import datasets
@@ -19,6 +19,66 @@ class ParseEntry:
     raw_answer: str
 class DatasetParser(Generic[T], ABC):
     """
     Abstract base class defining the interface for all dataset parsers.
@@ -59,6 +119,21 @@ class DatasetParser(Generic[T], ABC):
             T: The processed entry, typically an instance of a subclass of ParseEntry.
         """
 @dataclass(frozen=True, kw_only=True, slots=True)
 class HuggingFaceParseEntry(ParseEntry):

 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from functools import lru_cache
+from typing import Any, ClassVar, Generic, List, TypeVar
 import datasets
     raw_answer: str
+@dataclass(frozen=True, kw_only=True, slots=True)  # immutable, keyword-only __init__, slot-based storage
+class DatasetDescription:
+    """Standardized description of a dataset."""
+    name: str  # display name, e.g. "Big Bench Hard (BBH)"
+    purpose: str  # short free-text summary of the dataset's aim
+    source: str  # URL or free-text origin (the BBH parser passes a GitHub URL, the TMLU parser a sentence)
+    language: str  # e.g. "English", "Traditional Chinese"
+    format: str  # question/answer format, e.g. "Multiple choice questions (A/B/C/D)"
+    characteristics: str  # longer free-text notes (multi-sentence in the BBH parser)
+    citation: str | None = None  # optional; the BBH and TMLU parsers pass a BibTeX entry
+    additional_info: dict[str, Any] | None = None  # optional extras without a dedicated field, e.g. "size", "model_performance"
+    @classmethod
+    def create(  # alternate constructor; unlike the kw_only __init__, these arguments may also be passed positionally
+        cls,
+        name: str,
+        purpose: str,
+        source: str,
+        language: str,
+        format: str,  # shadows the builtin `format` inside this method only
+        characteristics: str,
+        citation: str | None = None,
+        additional_info: dict[str, Any] | None = None,
+    ) -> "DatasetDescription":
+        return cls(  # forwards every argument unchanged, by keyword
+            name=name,
+            purpose=purpose,
+            source=source,
+            language=language,
+            format=format,
+            characteristics=characteristics,
+            citation=citation,
+            additional_info=additional_info,
+        )
+@dataclass(frozen=True, kw_only=True, slots=True)  # immutable, keyword-only __init__, slot-based storage
+class EvaluationMetric:
+    """Description of an evaluation metric for a dataset."""
+    name: str  # metric identifier, e.g. "accuracy", "exact_match"
+    type: str  # category label, e.g. "classification", "comparison", "string_match", "text"
+    description: str  # human-readable explanation of what is measured
+    implementation: str  # plain-text pointer to how to compute it, e.g. "evaluate.load('accuracy')"; stored as text, not executed by this class
+    primary: bool  # True/False flag; presumably marks the headline metrics — confirm with callers
+    @classmethod
+    def create(  # alternate constructor; unlike the kw_only __init__, these arguments may also be passed positionally
+        cls, name: str, type: str, description: str, implementation: str, primary: bool
+    ) -> "EvaluationMetric":
+        return cls(  # forwards every argument unchanged, by keyword
+            name=name,
+            type=type,
+            description=description,
+            implementation=implementation,
+            primary=primary,
+        )
 class DatasetParser(Generic[T], ABC):
     """
     Abstract base class defining the interface for all dataset parsers.
             T: The processed entry, typically an instance of a subclass of ParseEntry.
         """
+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns a standardized description of the dataset."""
+        return DatasetDescription(  # placeholder values only; the BBH and TMLU parsers define their own version of this method
+            name="Unknown",
+            purpose="Not specified",
+            source="Not specified",
+            language="Not specified",
+            format="Not specified",
+            characteristics="Not specified",
+        )  # citation and additional_info are left at their None defaults
+    def get_evaluation_metrics(self) -> List[EvaluationMetric]:  # NOTE(review): typing.List here, builtin list[...] in tmlu_parser.py — pick one spelling
+        """Returns the recommended evaluation metrics for the dataset."""
+        return []  # default: no metrics recommended
 @dataclass(frozen=True, kw_only=True, slots=True)
 class HuggingFaceParseEntry(ParseEntry):

llmdataparser/bbh_parser.py CHANGED Viewed

@@ -1,7 +1,12 @@
 from dataclasses import dataclass
-from typing import Any, ClassVar, Dict, List
-from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
 from llmdataparser.prompts import BBH_SYSTEM_PROMPT  # You'll need to create this
@@ -87,26 +92,21 @@ class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
             task_name=task,
         )
-    def get_dataset_description(self) -> Dict[str, str]:
         """Returns a description of the Big Bench Hard dataset."""
-        return {
-            "name": "Big Bench Hard (BBH)",
-            "purpose": "A curated subset of 23 challenging BIG-Bench tasks where language models initially performed below average human-rater performance",
-            "source": "https://github.com/suzgunmirac/BIG-Bench-Hard",
-            "language": "English",
-            "size": "6.5k examples across 27 tasks (23 core + 4 related)",
-            "format": "Multiple choice questions with single correct answers",
-            "characteristics": (
                 "Tasks require complex multi-step reasoning and were selected based on "
                 "initial model performance below human baseline. Performance can be "
                 "significantly improved through chain-of-thought prompting. The dataset "
                 "includes 23 core tasks plus additional related tasks."
             ),
-            "model_performance": (
-                "With chain-of-thought prompting, PaLM surpassed human performance on "
-                "10/23 tasks, while Codex surpassed human performance on 17/23 tasks"
-            ),
-            "citation": (
                 "@article{suzgun2022challenging,\n"
                 "  title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n"
                 '  author={Suzgun, Mirac and Scales, Nathan and Sch{"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},\n'
@@ -114,39 +114,46 @@ class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
                 "  year={2022}\n"
                 "}"
             ),
-        }
-    def get_evaluation_metrics(self) -> List[Dict[str, Any]]:
         """Returns the recommended evaluation metrics for BBH dataset."""
         return [
-            {
-                "name": "accuracy",
-                "type": "classification",
-                "description": "Proportion of exactly correct answers (after stripping parentheses)",
-                "implementation": "evaluate.load('accuracy')",
-                "primary": True,
-            },
-            {
-                "name": "human_eval_delta",
-                "type": "comparison",
-                "description": "Difference between model accuracy and average human-rater performance baseline",
-                "implementation": "custom_human_baseline_comparison",
-                "primary": True,
-            },
-            {
-                "name": "per_task_accuracy",
-                "type": "classification",
-                "description": "Accuracy broken down by individual reasoning tasks",
-                "implementation": "custom_task_accuracy",
-                "primary": False,
-            },
-            {
-                "name": "exact_match",
-                "type": "string_match",
-                "description": "Strict exact match between predicted and target answers",
-                "implementation": "evaluate.load('exact_match')",
-                "primary": False,
-            },
         ]

 from dataclasses import dataclass
+from typing import Any, ClassVar, List
+from llmdataparser.base_parser import (
+    DatasetDescription,
+    EvaluationMetric,
+    HuggingFaceDatasetParser,
+    HuggingFaceParseEntry,
+)
 from llmdataparser.prompts import BBH_SYSTEM_PROMPT  # You'll need to create this
             task_name=task,
         )
+    def get_dataset_description(self) -> DatasetDescription:
         """Returns a description of the Big Bench Hard dataset."""
+        return DatasetDescription.create(
+            name="Big Bench Hard (BBH)",
+            purpose="A curated subset of 23 challenging BIG-Bench tasks where language models initially performed below average human-rater performance",
+            source="https://github.com/suzgunmirac/BIG-Bench-Hard",
+            language="English",
+            format="Multiple choice questions with single correct answers",
+            characteristics=(
                 "Tasks require complex multi-step reasoning and were selected based on "
                 "initial model performance below human baseline. Performance can be "
                 "significantly improved through chain-of-thought prompting. The dataset "
                 "includes 23 core tasks plus additional related tasks."
             ),
+            citation=(
                 "@article{suzgun2022challenging,\n"
                 "  title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n"
                 '  author={Suzgun, Mirac and Scales, Nathan and Sch{"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},\n'
                 "  year={2022}\n"
                 "}"
             ),
+            additional_info={
+                "model_performance": (
+                    "With chain-of-thought prompting, PaLM surpassed human performance on "
+                    "10/23 tasks, while Codex surpassed human performance on 17/23 tasks"
+                ),
+                "size": "6.5k examples across 27 tasks (23 core + 4 related)",
+            },
+        )
+    def get_evaluation_metrics(self) -> List[EvaluationMetric]:  # NOTE(review): typing.List here, builtin list[...] in tmlu_parser.py — pick one spelling
         """Returns the recommended evaluation metrics for BBH dataset."""
         return [
+            EvaluationMetric.create(  # same five values the previous dict literal carried, now as a typed object
+                name="accuracy",
+                type="classification",
+                description="Proportion of exactly correct answers (after stripping parentheses)",
+                implementation="evaluate.load('accuracy')",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="human_eval_delta",
+                type="comparison",
+                description="Difference between model accuracy and average human-rater performance baseline",
+                implementation="custom_human_baseline_comparison",  # NOTE(review): no definition of this name is visible here — presumably a label for user-supplied code; confirm
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="per_task_accuracy",
+                type="classification",
+                description="Accuracy broken down by individual reasoning tasks",
+                implementation="custom_task_accuracy",  # NOTE(review): likewise a label only, as far as this view shows
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="exact_match",
+                type="string_match",
+                description="Strict exact match between predicted and target answers",
+                implementation="evaluate.load('exact_match')",
+                primary=False,
+            ),
         ]

llmdataparser/tmlu_parser.py CHANGED Viewed

@@ -1,7 +1,12 @@
 from dataclasses import dataclass
-from typing import Any, Dict, Final, List
-from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
 from llmdataparser.prompts import TMLU_SYSTEM_PROMPT
 TMLU_VALID_ANSWERS: Final[set[str]] = {"A", "B", "C", "D"}
@@ -118,63 +123,73 @@ class TMLUDatasetParser(HuggingFaceDatasetParser[TMLUParseEntry]):
             metadata=metadata,
         )
-    def get_dataset_description(self) -> Dict[str, str]:
         """Returns description of the TMLU dataset."""
-        return {
-            "name": "Taiwan Multiple-choice Language Understanding (TMLU)",
-            "version": "1.0",
-            "language": "Traditional Chinese",
-            "purpose": "Evaluate models on Taiwan-specific educational and professional knowledge",
-            "source": "Various Taiwan standardized tests and professional certifications",
-            "format": "Multiple choice questions (A/B/C/D)",
-            "size": "Multiple subjects across different test types",
-            "domain": "Education and Professional Certification",
-            "characteristics": (
                 "Covers various subjects including Advanced Subjects Test (AST), "
                 "General Scholastic Ability Test (GSAT), College Admission Practice (CAP), "
                 "and professional certifications"
             ),
-            "reference": "https://huggingface.co/datasets/miulab/tmlu",
-        }
-    def get_evaluation_metrics(self) -> List[Dict[str, Any]]:
         """Returns recommended evaluation metrics for TMLU."""
         return [
-            {
-                "name": "accuracy",
-                "type": "classification",
-                "description": "Overall percentage of correctly answered questions",
-                "implementation": "datasets.load_metric('accuracy')",
-                "primary": True,
-            },
-            {
-                "name": "per_subject_accuracy",
-                "type": "classification",
-                "description": "Accuracy broken down by subject areas (AST, GSAT, CAP, etc.)",
-                "implementation": "custom_subject_accuracy",
-                "primary": True,
-            },
-            {
-                "name": "per_difficulty_accuracy",
-                "type": "classification",
-                "description": "Accuracy broken down by test difficulty levels",
-                "implementation": "custom_difficulty_accuracy",
-                "primary": False,
-            },
-            {
-                "name": "confusion_matrix",
-                "type": "classification",
-                "description": "Distribution of predicted vs actual answers",
-                "implementation": "datasets.load_metric('confusion_matrix')",
-                "primary": False,
-            },
-            {
-                "name": "explanation_quality",
-                "type": "text",
-                "description": "Quality assessment of model explanations when available",
-                "implementation": "custom_explanation_metric",
-                "primary": False,
-            },
         ]

 from dataclasses import dataclass
+from typing import Any, Final
+from llmdataparser.base_parser import (
+    DatasetDescription,
+    EvaluationMetric,
+    HuggingFaceDatasetParser,
+    HuggingFaceParseEntry,
+)
 from llmdataparser.prompts import TMLU_SYSTEM_PROMPT
 TMLU_VALID_ANSWERS: Final[set[str]] = {"A", "B", "C", "D"}
             metadata=metadata,
         )
+    def get_dataset_description(self) -> DatasetDescription:
         """Returns description of the TMLU dataset."""
+        return DatasetDescription.create(  # NOTE(review): the previous dict's "version", "size", "domain" and "reference" entries are not carried over; additional_info could hold them, as the BBH parser does
+            name="Taiwan Multiple-choice Language Understanding (TMLU)",
+            language="Traditional Chinese",
+            purpose="Evaluate models on Taiwan-specific educational and professional knowledge",
+            source="Various Taiwan standardized tests and professional certifications",
+            format="Multiple choice questions (A/B/C/D)",
+            characteristics=(
                 "Covers various subjects including Advanced Subjects Test (AST), "
                 "General Scholastic Ability Test (GSAT), College Admission Practice (CAP), "
                 "and professional certifications"
             ),
+            citation="""@article{DBLP:journals/corr/abs-2403-20180,
+                author       = {Po-Heng Chen and Sijia Cheng and Wei-Lin Chen and Yen-Ting Lin and Yun-Nung Chen},
+                title        = {Measuring Taiwanese Mandarin Language Understanding},
+                journal      = {CoRR},
+                volume       = {abs/2403.20180},
+                year         = {2024},
+                url          = {https://doi.org/10.48550/arXiv.2403.20180},
+                doi          = {10.48550/ARXIV.2403.20180},
+                eprinttype   = {arXiv},
+                eprint       = {2403.20180},
+                timestamp    = {Wed, 10 Apr 2024 17:37:45 +0200},
+                biburl       = {https://dblp.org/rec/journals/corr/abs-2403-20180.bib},
+                bibsource    = {dblp computer science bibliography, https://dblp.org}
+            }""",  # NOTE(review): a triple-quoted literal keeps each line's source indentation inside the string; textwrap.dedent or adjacent literals (BBH style) would avoid that
+        )
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:  # builtin list[...] here; base_parser.py and bbh_parser.py annotate with typing.List
         """Returns recommended evaluation metrics for TMLU."""
         return [
+            EvaluationMetric.create(  # same five values the previous dict literal carried, now as a typed object
+                name="accuracy",
+                type="classification",
+                description="Overall percentage of correctly answered questions",
+                implementation="datasets.load_metric('accuracy')",  # NOTE(review): the BBH parser spells this "evaluate.load('accuracy')"; load_metric is reportedly deprecated in `datasets` — confirm and align
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="per_subject_accuracy",
+                type="classification",
+                description="Accuracy broken down by subject areas (AST, GSAT, CAP, etc.)",
+                implementation="custom_subject_accuracy",  # NOTE(review): no definition of this name is visible here — presumably a label for user-supplied code; confirm
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="per_difficulty_accuracy",
+                type="classification",
+                description="Accuracy broken down by test difficulty levels",
+                implementation="custom_difficulty_accuracy",
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="confusion_matrix",
+                type="classification",
+                description="Distribution of predicted vs actual answers",
+                implementation="datasets.load_metric('confusion_matrix')",  # NOTE(review): same load_metric vs evaluate.load question as for "accuracy"
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="explanation_quality",
+                type="text",
+                description="Quality assessment of model explanations when available",
+                implementation="custom_explanation_metric",
+                primary=False,
+            ),
         ]