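"""Parser for the MATH dataset (lighteval/MATH on Hugging Face).

Defines MATHParseEntry and MATHDatasetParser on top of llmdataparser's
HuggingFaceDatasetParser base classes.
"""
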
from dataclasses import dataclass
from typing import Any, ClassVar
from llmdataparser.base_parser import (
DatasetDescription,
EvaluationMetric,
HuggingFaceDatasetParser,
HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class MATHParseEntry(HuggingFaceParseEntry):
"""Custom entry class for MATH dataset, with fields specific to this dataset parser."""
level: str
task_name: str
solution: str

    @classmethod
def create(
cls,
prompt: str,
answer: str,
raw_question: str,
raw_answer: str,
level: str,
task_name: str,
solution: str,
) -> "MATHParseEntry":
return cls(
prompt=prompt,
answer=answer,
raw_question=raw_question,
raw_answer=raw_answer,
level=level,
task_name=task_name,
solution=solution,
)


class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
"""Parser for the MATH dataset."""
_data_source: ClassVar[str] = "lighteval/MATH"
_task_names: ClassVar[list[str]] = [
"algebra",
"geometry",
"calculus",
"prealgebra",
"intermediate_algebra",
"number_theory",
"precalculus",
"all",
]
_default_task: ClassVar[str] = "all"
_default_system_prompt: ClassVar[str] = (
"Solve the following mathematics problem step by step:"
)
_valid_levels: ClassVar[set[str]] = {
f"Level {i}" for i in range(1, 6)
} # Levels 1-5 are valid

    def _get_task_from_entry(self, data_entry: dict[str, Any]) -> str:
"""Get the task name from the data entry or fall back to current task."""
entry_type: str = data_entry.get("type", "")
if entry_type and (entry_type in self._task_names):
return entry_type
return self._current_task or self._default_task

    def process_entry(
self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
) -> MATHParseEntry:
"""Process a single MATH dataset entry."""
        task = task_name or self._get_task_from_entry(row)
# Validate and normalize level
level = row.get("level")
if level not in self._valid_levels:
level = "Unknown"
return MATHParseEntry.create(
prompt=f"{self._system_prompt}\n{row['problem']}",
answer=row["solution"],
raw_question=row["problem"],
raw_answer=row["solution"],
level=level,
task_name=task,
solution=row["solution"],
)

    def get_dataset_description(self) -> DatasetDescription:
        """Returns a description of the MATH dataset."""
return DatasetDescription.create(
name="MATH",
purpose="Measure mathematical problem-solving capabilities in machine learning models",
source="Hendrycks et al., UC Berkeley (NeurIPS 2021)",
language="English",
format="Competition mathematics problems with step-by-step solutions",
category=["Math"],
characteristics=(
"Collection of 12,500 challenging competition mathematics problems designed to "
"evaluate mathematical reasoning. Problems include step-by-step solutions that "
"can be used to teach models to generate answer derivations and explanations. "
"Problems are categorized by subject area and difficulty level (1-5)."
),
citation="""@article{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the MATH Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}""",
additional_info={
"difficulty_levels": "1-5",
"topics": [
"algebra",
"geometry",
"calculus",
"prealgebra",
"intermediate_algebra",
"number_theory",
"precalculus",
],
"size": "12,500 problems",
"evaluation_note": "Exact match equivalence calculated using sympy library",
"homepage": "https://github.com/hendrycks/math",
},
)

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns recommended evaluation metrics for the MATH dataset."""
return [
EvaluationMetric.create(
name="symbolic_equivalence",
type="exact_match",
description="Verifies answer correctness using symbolic mathematics (e.g., sympy) to check mathematical equivalence.",
implementation="sympy_equivalence_checker",
primary=True,
),
EvaluationMetric.create(
name="solution_presence",
type="text",
description="Ensures that a complete step-by-step solution is provided, demonstrating how the answer is derived.",
implementation="solution_completeness_checker",
primary=True,
),
EvaluationMetric.create(
name="reasoning_validity",
type="text",
description="Evaluates the logical correctness and mathematical reasoning in the solution's derivation steps.",
implementation="reasoning_validator",
primary=True,
),
EvaluationMetric.create(
name="mathematical_notation",
type="text",
description="Checks for the correct use of mathematical notation and symbolic representation to ensure clarity.",
implementation="notation_validator",
primary=False,
),
EvaluationMetric.create(
name="solution_clarity",
type="text",
description="Assesses the clarity, readability, and coherence of the solution steps to enhance interpretability.",
implementation="clarity_scorer",
primary=False,
),
]


if __name__ == "__main__":
    # Example usage of the MATH parser
    parser = MATHDatasetParser()

    # Load the dataset
    parser.load()

    # Parse all splits
    parser.parse()

    # Get the parsed data
    parsed_data = parser.get_parsed_data

    # Print an example entry
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Task: {example.task_name}")
        print(f"Level: {example.level}")
        print(f"Question: {example.raw_question}")
        print(f"Solution: {example.solution}")