refactor: gsm8k parser
Files changed:
- llmdataparser/base_parser.py   +2 -2
- llmdataparser/gsm8k_parser.py  +60 -1
- tests/test_gsm8k_parser.py     +44 -0
llmdataparser/base_parser.py
CHANGED
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from functools import lru_cache
-from typing import Any, ClassVar, Generic,
+from typing import Any, ClassVar, Generic, TypeVar

 import datasets

@@ -130,7 +130,7 @@ class DatasetParser(Generic[T], ABC):
             characteristics="Not specified",
         )

-    def get_evaluation_metrics(self) ->
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
         """Returns the recommended evaluation metrics for the dataset."""
         return []

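The typed default also gives downstream code something to rely on. A small hypothetical helper (not part of this commit), assuming only what the diff shows, namely EvaluationMetric values carrying a primary flag:

from llmdataparser.base_parser import DatasetParser, EvaluationMetric


def primary_metrics(parser: DatasetParser) -> list[EvaluationMetric]:
    """Return only the metrics a parser marks as primary."""
    # get_evaluation_metrics() now has a typed default of [], so this
    # works on any parser, whether or not it overrides the hook.
    return [m for m in parser.get_evaluation_metrics() if m.primary]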
llmdataparser/gsm8k_parser.py
CHANGED
@@ -1,7 +1,12 @@
 from dataclasses import dataclass
 from typing import Any, ClassVar

-from llmdataparser.base_parser import
+from llmdataparser.base_parser import (
+    DatasetDescription,
+    EvaluationMetric,
+    HuggingFaceDatasetParser,
+    HuggingFaceParseEntry,
+)
 from llmdataparser.prompts import GSM8K_SYSTEM_PROMPT


@@ -76,6 +81,60 @@ class GSM8KDatasetParser(HuggingFaceDatasetParser[GSM8KParseEntry]):
             task_name=task,  # Guarantee non-None
         )

+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns description of the GSM8K dataset."""
+        return DatasetDescription.create(
+            name="Grade School Math 8K (GSM8K)",
+            purpose="Evaluate mathematical reasoning capabilities through word problems",
+            source="OpenAI",
+            language="English",
+            format="Word problems with step-by-step solutions and numerical answers",
+            characteristics=(
+                "Collection of 8.5K grade school math word problems that require "
+                "multi-step reasoning. Problems gradually increase in difficulty "
+                "and cover basic arithmetic, word problems, and elementary algebra"
+            ),
+            citation="""@article{cobbe2021gsm8k,
+    title={Training Verifiers to Solve Math Word Problems},
+    author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
+    journal={arXiv preprint arXiv:2110.14168},
+    year={2021}
+}""",
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns recommended evaluation metrics for GSM8K."""
+        return [
+            EvaluationMetric.create(
+                name="exact_match",
+                type="string",
+                description="Exact match comparison between predicted and correct numerical answers",
+                implementation="custom_exact_match",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="solution_validity",
+                type="text",
+                description="Assessment of whether the solution steps are mathematically valid and complete",
+                implementation="custom_solution_validator",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="step_accuracy",
+                type="numerical",
+                description="Accuracy of intermediate calculation steps (e.g., <<48/2=24>>)",
+                implementation="custom_step_accuracy",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="step_count",
+                type="numerical",
+                description="Analysis of the number of reasoning steps in solutions",
+                implementation="custom_step_counter",
+                primary=False,
+            ),
+        ]
+

 if __name__ == "__main__":
     from pprint import pprint

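A quick smoke test for the two new methods; this is a sketch that assumes GSM8KDatasetParser takes no constructor arguments, which this diff does not show:

from llmdataparser.gsm8k_parser import GSM8KDatasetParser

parser = GSM8KDatasetParser()  # assumed no-arg constructor

description = parser.get_dataset_description()
print(description.name)    # Grade School Math 8K (GSM8K)
print(description.source)  # OpenAI

for metric in parser.get_evaluation_metrics():
    kind = "primary" if metric.primary else "secondary"
    print(f"{metric.name} [{metric.type}] ({kind}) -> {metric.implementation}")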
tests/test_gsm8k_parser.py
CHANGED
@@ -181,3 +181,47 @@ def test_different_splits_parsing(gsm8k_parser):
     assert test_count > 0
     assert train_count > 0
     assert train_count != test_count
+
+
+def test_get_dataset_description(gsm8k_parser):
+    """Test dataset description generation."""
+    description = gsm8k_parser.get_dataset_description()
+
+    assert description.name == "Grade School Math 8K (GSM8K)"
+    assert description.source == "OpenAI"
+    assert description.language == "English"
+    assert "8.5K grade school math word problems" in description.characteristics
+    assert "Training Verifiers to Solve Math Word Problems" in description.citation
+    assert "Cobbe" in description.citation
+    assert "arXiv" in description.citation
+
+
+def test_get_evaluation_metrics(gsm8k_parser):
+    """Test evaluation metrics specification."""
+    metrics = gsm8k_parser.get_evaluation_metrics()
+
+    # Check we have all expected metrics
+    metric_names = {metric.name for metric in metrics}
+    expected_names = {"exact_match", "solution_validity", "step_accuracy", "step_count"}
+    assert metric_names == expected_names
+
+    # Check exact_match metric details
+    exact_match = next(m for m in metrics if m.name == "exact_match")
+    assert exact_match.type == "string"
+    assert exact_match.primary is True
+    assert "exact match" in exact_match.description.lower()
+
+    # Check solution_validity metric details
+    solution_validity = next(m for m in metrics if m.name == "solution_validity")
+    assert solution_validity.type == "text"
+    assert solution_validity.primary is True
+    assert "valid" in solution_validity.description.lower()
+
+    # Check step metrics
+    step_accuracy = next(m for m in metrics if m.name == "step_accuracy")
+    assert step_accuracy.type == "numerical"
+    assert step_accuracy.primary is True
+
+    step_count = next(m for m in metrics if m.name == "step_count")
+    assert step_count.type == "numerical"
+    assert step_count.primary is False
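The metric entries name custom implementations (custom_exact_match, custom_step_accuracy, and so on) that are not part of this diff. As a rough illustration only: GSM8K reference solutions end in "#### <answer>" and annotate intermediate arithmetic as <<48/2=24>>, so the two numeric checks could be sketched like this (hypothetical code, not the repo's implementations):

import re

ANSWER_RE = re.compile(r"####\s*([-+]?[\d,.]+)")
STEP_RE = re.compile(r"<<([^<>=]+)=([^<>]+)>>")


def extract_final_answer(solution: str) -> str | None:
    """Pull the final numeric answer out of a GSM8K-style solution."""
    match = ANSWER_RE.search(solution)
    return match.group(1).replace(",", "") if match else None


def exact_match(predicted: str, reference: str) -> bool:
    """Compare final answers only, ignoring the reasoning text."""
    pred = extract_final_answer(predicted)
    ref = extract_final_answer(reference)
    return pred is not None and pred == ref


def step_accuracy(solution: str) -> float:
    """Fraction of <<expr=result>> annotations whose arithmetic checks out."""
    steps = STEP_RE.findall(solution)
    if not steps:
        return 0.0
    correct = 0
    for expr, result in steps:
        try:
            # eval is used here only as a sketch on trusted dataset text;
            # a real implementation should parse the arithmetic properly.
            if abs(eval(expr) - float(result)) < 1e-6:
                correct += 1
        except Exception:
            continue
    return correct / len(steps)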