from dataclasses import dataclass
from typing import Any, ClassVar

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class MATHParseEntry(HuggingFaceParseEntry):
    """Custom entry class for the MATH dataset, with fields specific to this dataset parser."""

    level: str
    task_name: str
    solution: str

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        raw_answer: str,
        level: str,
        task_name: str,
        solution: str,
    ) -> "MATHParseEntry":
        """Construct a MATHParseEntry from the given field values."""
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            level=level,
            task_name=task_name,
            solution=solution,
        )


class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
    """Parser for the MATH dataset."""

    _data_source: ClassVar[str] = "lighteval/MATH"
    _task_names: ClassVar[list[str]] = [
        "algebra",
        "geometry",
        "calculus",
        "prealgebra",
        "intermediate_algebra",
        "number_theory",
        "precalculus",
        "all",
    ]
    _default_task: ClassVar[str] = "all"
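
    # Accepted difficulty labels: "Level 1" through "Level 5"; anything else is
    # treated as unknown in process_entry.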
    _valid_levels: ClassVar[set[str]] = {f"Level {i}" for i in range(1, 6)}

    def _get_task_from_entry(self, data_entry: dict[str, Any]) -> str:
        """Get the task name from the data entry or fall back to current task."""
        entry_type: str = data_entry.get("type", "")
        if entry_type and (entry_type in self._task_names):
            return entry_type
        return self._current_task or self._default_task

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> MATHParseEntry:
        """Process a single MATH dataset entry."""
        task = task_name or self._get_task_from_entry(row)
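
        # Difficulty levels outside the valid set (or missing) are normalized to "Unknown".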
        level = row.get("level")
        if level not in self._valid_levels:
            level = "Unknown"

        return MATHParseEntry.create(
            question=str(row["problem"]),
            answer=row["solution"],
            raw_question=row["problem"],
            raw_answer=row["solution"],
            level=level,
            task_name=task,
            solution=row["solution"],
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns a description of the MATH dataset."""
        return DatasetDescription.create(
            name="MATH",
            purpose="Measure mathematical problem-solving capabilities in machine learning models",
            source="Hendrycks et al., UC Berkeley (NeurIPS 2021)",
            language="English",
            format="Competition mathematics problems with step-by-step solutions",
            category=["Math"],
            characteristics=(
                "Collection of 12,500 challenging competition mathematics problems designed to "
                "evaluate mathematical reasoning. Problems include step-by-step solutions that "
                "can be used to teach models to generate answer derivations and explanations. "
                "Problems are categorized by subject area and difficulty level (1-5)."
            ),
            citation="""@article{hendrycksmath2021,
    title={Measuring Mathematical Problem Solving With the MATH Dataset},
    author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
    journal={NeurIPS},
    year={2021}
}""",
            additional_info={
                "difficulty_levels": "1-5",
                "topics": [
                    "algebra",
                    "geometry",
                    "calculus",
                    "prealgebra",
                    "intermediate_algebra",
                    "number_theory",
                    "precalculus",
                ],
                "size": "12,500 problems",
                "evaluation_note": "Exact match equivalence calculated using sympy library",
                "homepage": "https://github.com/hendrycks/math",
            },
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns recommended evaluation metrics for the MATH dataset."""
        return [
            EvaluationMetric.create(
                name="symbolic_equivalence",
                type="exact_match",
                description="Verifies answer correctness using symbolic mathematics (e.g., sympy) to check mathematical equivalence.",
                implementation="sympy_equivalence_checker",
                primary=True,
            ),
            EvaluationMetric.create(
                name="solution_presence",
                type="text",
                description="Ensures that a complete step-by-step solution is provided, demonstrating how the answer is derived.",
                implementation="solution_completeness_checker",
                primary=True,
            ),
            EvaluationMetric.create(
                name="reasoning_validity",
                type="text",
                description="Evaluates the logical correctness and mathematical reasoning in the solution's derivation steps.",
                implementation="reasoning_validator",
                primary=True,
            ),
            EvaluationMetric.create(
                name="mathematical_notation",
                type="text",
                description="Checks for the correct use of mathematical notation and symbolic representation to ensure clarity.",
                implementation="notation_validator",
                primary=False,
            ),
            EvaluationMetric.create(
                name="solution_clarity",
                type="text",
                description="Assesses the clarity, readability, and coherence of the solution steps to enhance interpretability.",
                implementation="clarity_scorer",
                primary=False,
            ),
        ]
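

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the parser): one way the
# "sympy_equivalence_checker" referenced in get_evaluation_metrics could be
# implemented. The name and behavior here are assumptions; it expects plain
# sympy-parsable expressions (not LaTeX) and falls back to string comparison.
# ---------------------------------------------------------------------------
def _sympy_equivalent(predicted: str, reference: str) -> bool:
    # Imported locally so this sketch stays self-contained and sympy remains optional.
    import sympy
    from sympy.parsing.sympy_parser import parse_expr

    try:
        # Two answers are equivalent if their difference simplifies to zero.
        return sympy.simplify(parse_expr(predicted) - parse_expr(reference)) == 0
    except (sympy.SympifyError, SyntaxError, TypeError):
        return predicted.strip() == reference.strip()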


if __name__ == "__main__":
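    # Demo: load the MATH data (default task "all"), parse it, and print the first entry.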
    parser = MATHDatasetParser()
    parser.load()
    parser.parse()

    parsed_data = parser.get_parsed_data
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Task: {example.task_name}")
        print(f"Level: {example.level}")
        print(f"Question: {example.question}")
        print(f"Solution: {example.solution}")
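
    # The recommended metrics can be inspected as well (assumes EvaluationMetric
    # exposes a `name` attribute, as suggested by the create(...) calls above).
    for metric in parser.get_evaluation_metrics():
        print(f"Metric: {metric.name}")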