from dataclasses import dataclass
from typing import Any, ClassVar, List

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class BBHParseEntry(HuggingFaceParseEntry):
    """Custom entry class for BBH (Big Bench Hard), with fields specific to this dataset."""

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        raw_answer: str,
        task_name: str,
    ) -> "BBHParseEntry":
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            task_name=task_name,
        )


class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
    """Parser for the Big Bench Hard dataset."""

    _data_source: ClassVar[str] = "lukaemon/bbh"
    _task_names: ClassVar[list[str]] = [
        "boolean_expressions",
        "causal_judgement",
        "date_understanding",
        "disambiguation_qa",
        "dyck_languages",
        "formal_fallacies",
        "geometric_shapes",
        "hyperbaton",
        "logical_deduction_five_objects",
        "logical_deduction_seven_objects",
        "logical_deduction_three_objects",
        "movie_recommendation",
        "multistep_arithmetic_two",
        "navigate",
        "object_counting",
        "penguins_in_a_table",
        "reasoning_about_colored_objects",
        "ruin_names",
        "salient_translation_error_detection",
        "snarks",
        "sports_understanding",
        "temporal_sequences",
        "tracking_shuffled_objects_five_objects",
        "tracking_shuffled_objects_seven_objects",
        "tracking_shuffled_objects_three_objects",
        "web_of_lies",
        "word_sorting",
    ]
    _default_task: ClassVar[str] = "reasoning_about_colored_objects"

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> BBHParseEntry:
        """Process a single BBH entry."""
        raw_question = row["input"]
        raw_answer = row["target"]

        # Strip the surrounding parentheses from the raw target (e.g. "(A)" -> "A")
        clean_answer = raw_answer.strip("()")

        question = str(raw_question)

        # Use the provided task_name, otherwise fall back to the parser's current task
        task = task_name or self._get_current_task(row)

        return BBHParseEntry.create(
            question=question,
            answer=clean_answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            task_name=task,
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns a description of the Big Bench Hard dataset."""
        return DatasetDescription.create(
            name="Big Bench Hard (BBH)",
            purpose="A curated subset of 23 challenging BIG-Bench tasks where language models initially performed below average human-rater performance",
            source="https://github.com/suzgunmirac/BIG-Bench-Hard",
            language="English",
            format="Multiple choice questions with single correct answers",
            characteristics=(
                "Tasks require complex multi-step reasoning and were selected based on "
                "initial model performance below human baseline. Performance can be "
                "significantly improved through chain-of-thought prompting. The dataset "
                "includes 23 core tasks plus additional related tasks."
            ),
            category=["Advanced Reasoning"],
            citation=(
                "@article{suzgun2022challenging,\n"
                "  title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n"
                '  author={Suzgun, Mirac and Scales, Nathan and Sch{\\"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},\n'
                "  journal={arXiv preprint arXiv:2210.09261},\n"
                "  year={2022}\n"
                "}"
            ),
            additional_info={
                "model_performance": (
                    "With chain-of-thought prompting, PaLM surpassed human performance on "
                    "10/23 tasks, while Codex surpassed human performance on 17/23 tasks"
                ),
                "size": "6.5k examples across 27 tasks (23 core + 4 related)",
            },
        )

    def get_evaluation_metrics(self) -> List[EvaluationMetric]:
        """Returns the recommended evaluation metrics for BBH dataset."""
        return [
            EvaluationMetric.create(
                name="accuracy",
                type="classification",
                description="Proportion of exactly correct answers (after stripping parentheses)",
                implementation="evaluate.load('accuracy')",
                primary=True,
            ),
            EvaluationMetric.create(
                name="human_eval_delta",
                type="comparison",
                description="Difference between model accuracy and average human-rater performance baseline",
                implementation="custom_human_baseline_comparison",
                primary=True,
            ),
            EvaluationMetric.create(
                name="per_task_accuracy",
                type="classification",
                description="Accuracy broken down by individual reasoning tasks",
                implementation="custom_task_accuracy",
                primary=False,
            ),
            EvaluationMetric.create(
                name="exact_match",
                type="string_match",
                description="Strict exact match between predicted and target answers",
                implementation="evaluate.load('exact_match')",
                primary=False,
            ),
        ]


if __name__ == "__main__":
    # Example usage
    parser = BBHDatasetParser()

    # Load the dataset with a specific task
    parser.load(task_name="reasoning_about_colored_objects")

    # Parse all splits
    parser.parse()

    # Get parsed data
    parsed_data = parser.get_parsed_data

    # Print example entry
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Task: {example.task_name}")
        print(f"Question: {example.question}")
        print(f"Answer: {example.answer}")