from dataclasses import dataclass
from typing import Any, ClassVar, List

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class BBHParseEntry(HuggingFaceParseEntry):
    """Custom entry class for BBH (Big Bench Hard), with fields specific to this dataset."""

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        raw_answer: str,
        task_name: str,
    ) -> "BBHParseEntry":
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            task_name=task_name,
        )


class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
    """Parser for the Big Bench Hard dataset."""

    _data_source: ClassVar[str] = "lukaemon/bbh"
    _task_names: ClassVar[list[str]] = [
        "boolean_expressions",
        "causal_judgement",
        "date_understanding",
        "disambiguation_qa",
        "dyck_languages",
        "formal_fallacies",
        "geometric_shapes",
        "hyperbaton",
        "logical_deduction_five_objects",
        "logical_deduction_seven_objects",
        "logical_deduction_three_objects",
        "movie_recommendation",
        "multistep_arithmetic_two",
        "navigate",
        "object_counting",
        "penguins_in_a_table",
        "reasoning_about_colored_objects",
        "ruin_names",
        "salient_translation_error_detection",
        "snarks",
        "sports_understanding",
        "temporal_sequences",
        "tracking_shuffled_objects_five_objects",
        "tracking_shuffled_objects_seven_objects",
        "tracking_shuffled_objects_three_objects",
        "web_of_lies",
        "word_sorting",
    ]
    _default_task: ClassVar[str] = "reasoning_about_colored_objects"

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> BBHParseEntry:
        """Process a single BBH entry."""
        raw_question = row["input"]
        raw_answer = row["target"]

        # Strip the surrounding parentheses from the target, e.g. "(A)" -> "A"
        clean_answer = raw_answer.strip("()")
        question = str(raw_question)

        # Use task_name if provided, otherwise fall back to the task the
        # parser currently has loaded
        task = task_name or self._get_current_task(row)

        return BBHParseEntry.create(
            question=question,
            answer=clean_answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            task_name=task,
        )
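
    # Illustrative example of process_entry (hypothetical row, for documentation only):
    #     row = {"input": "Is the following sentence plausible? ...", "target": "(B)"}
    #     entry = parser.process_entry(row, task_name="sports_understanding")
    #     entry.answer      -> "B"    (surrounding parentheses stripped)
    #     entry.raw_answer  -> "(B)"  (original target kept for reference)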

    def get_dataset_description(self) -> DatasetDescription:
        """Returns a description of the Big Bench Hard dataset."""
        return DatasetDescription.create(
            name="Big Bench Hard (BBH)",
            purpose="A curated subset of 23 challenging BIG-Bench tasks where language models initially performed below average human-rater performance",
            source="https://github.com/suzgunmirac/BIG-Bench-Hard",
            language="English",
            format="Multiple choice questions with single correct answers",
            characteristics=(
                "Tasks require complex multi-step reasoning and were selected based on "
                "initial model performance below human baseline. Performance can be "
                "significantly improved through chain-of-thought prompting. The dataset "
                "includes 23 core tasks plus additional related tasks."
            ),
            category=["Advanced Reasoning"],
            citation=(
                "@article{suzgun2022challenging,\n"
                " title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n"
                ' author={Suzgun, Mirac and Scales, Nathan and Sch{\\"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},\n'
                " journal={arXiv preprint arXiv:2210.09261},\n"
                " year={2022}\n"
                "}"
            ),
            additional_info={
                "model_performance": (
                    "With chain-of-thought prompting, PaLM surpassed human performance on "
                    "10/23 tasks, while Codex surpassed human performance on 17/23 tasks"
                ),
                "size": "6.5k examples across 27 tasks (23 core + 4 related)",
            },
        )

    def get_evaluation_metrics(self) -> List[EvaluationMetric]:
        """Returns the recommended evaluation metrics for the BBH dataset."""
        return [
            EvaluationMetric.create(
                name="accuracy",
                type="classification",
                description="Proportion of exactly correct answers (after stripping parentheses)",
                implementation="evaluate.load('accuracy')",
                primary=True,
            ),
            EvaluationMetric.create(
                name="human_eval_delta",
                type="comparison",
                description="Difference between model accuracy and the average human-rater performance baseline",
                implementation="custom_human_baseline_comparison",
                primary=True,
            ),
            EvaluationMetric.create(
                name="per_task_accuracy",
                type="classification",
                description="Accuracy broken down by individual reasoning task",
                implementation="custom_task_accuracy",
                primary=False,
            ),
            EvaluationMetric.create(
                name="exact_match",
                type="string_match",
                description="Strict exact match between predicted and target answers",
                implementation="evaluate.load('exact_match')",
                primary=False,
            ),
        ]
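

# Illustrative sketch only: a minimal per-task accuracy helper, assuming
# `predictions` is a parallel list of model answer strings, one per entry.
# This is NOT the `custom_task_accuracy` implementation referenced above.
def _example_per_task_accuracy(
    entries: List[BBHParseEntry], predictions: List[str]
) -> dict[str, float]:
    correct: dict[str, int] = {}
    total: dict[str, int] = {}
    for entry, prediction in zip(entries, predictions):
        # Count every entry for its task, and count it as correct when the
        # model answer matches the cleaned gold answer exactly
        total[entry.task_name] = total.get(entry.task_name, 0) + 1
        if prediction.strip() == entry.answer:
            correct[entry.task_name] = correct.get(entry.task_name, 0) + 1
    return {task: correct.get(task, 0) / total[task] for task in total}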


if __name__ == "__main__":
    # Example usage
    parser = BBHDatasetParser()

    # Load the dataset with a specific task
    parser.load(task_name="reasoning_about_colored_objects")

    # Parse all splits
    parser.parse()

    # Get parsed data
    parsed_data = parser.get_parsed_data

    # Print example entry
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Task: {example.task_name}")
        print(f"Question: {example.question}")
        print(f"Answer: {example.answer}")
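
    # Illustrative sketch only: scoring parsed entries with the `evaluate`
    # library, as suggested by get_evaluation_metrics. `model_answers` would
    # normally come from a model; here the gold answers stand in as dummy
    # predictions purely to show the metric call.
    if parsed_data:
        import evaluate  # optional dependency, used only for this demo

        model_answers = [entry.answer for entry in parsed_data]  # placeholder predictions
        exact_match = evaluate.load("exact_match")
        scores = exact_match.compute(
            predictions=model_answers,
            references=[entry.answer for entry in parsed_data],
        )
        print(f"\nExact match (dummy predictions): {scores['exact_match']:.3f}")

        # Per-task breakdown using the helper sketched above
        per_task = _example_per_task_accuracy(parsed_data, model_answers)
        print(f"Per-task accuracy (dummy predictions): {per_task}")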