from dataclasses import dataclass
from typing import Any, ClassVar
from llmdataparser.base_parser import (
DatasetDescription,
EvaluationMetric,
HuggingFaceDatasetParser,
HuggingFaceParseEntry,
)
@dataclass(frozen=True, kw_only=True, slots=True)
class BBHParseEntry(HuggingFaceParseEntry):
"""Custom entry class for BBH (Big Bench Hard), with fields specific to this dataset."""
@classmethod
def create(
cls,
question: str,
answer: str,
raw_question: str,
raw_answer: str,
task_name: str,
) -> "BBHParseEntry":
return cls(
question=question,
answer=answer,
raw_question=raw_question,
raw_answer=raw_answer,
task_name=task_name,
)
class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
"""Parser for the Big Bench Hard dataset."""
_data_source: ClassVar[str] = "lukaemon/bbh"
_task_names: ClassVar[list[str]] = [
"boolean_expressions",
"causal_judgement",
"date_understanding",
"disambiguation_qa",
"dyck_languages",
"formal_fallacies",
"geometric_shapes",
"hyperbaton",
"logical_deduction_five_objects",
"logical_deduction_seven_objects",
"logical_deduction_three_objects",
"movie_recommendation",
"multistep_arithmetic_two",
"navigate",
"object_counting",
"penguins_in_a_table",
"reasoning_about_colored_objects",
"ruin_names",
"salient_translation_error_detection",
"snarks",
"sports_understanding",
"temporal_sequences",
"tracking_shuffled_objects_five_objects",
"tracking_shuffled_objects_seven_objects",
"tracking_shuffled_objects_three_objects",
"web_of_lies",
"word_sorting",
]
_default_task: ClassVar[str] = "reasoning_about_colored_objects"
def process_entry(
self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
) -> BBHParseEntry:
"""Process a single BBH entry."""
raw_question = row["input"]
raw_answer = row["target"]
        # Strip the surrounding parentheses from the target, e.g. "(A)" -> "A"
        clean_answer = raw_answer.strip("()")
question = str(raw_question)
        # Use the explicit task_name if given, otherwise resolve it from the current row
task = task_name or self._get_current_task(row)
return BBHParseEntry.create(
question=question,
answer=clean_answer,
raw_question=raw_question,
raw_answer=raw_answer,
task_name=task,
)
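    # Illustrative sketch (hypothetical row, using the same "input"/"target"
    # fields read above): a row such as {"input": "Which option ...?", "target": "(A)"}
    # parsed for task "date_understanding" yields answer "A", while raw_answer
    # keeps the original "(A)".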
def get_dataset_description(self) -> DatasetDescription:
"""Returns a description of the Big Bench Hard dataset."""
return DatasetDescription.create(
name="Big Bench Hard (BBH)",
purpose="A curated subset of 23 challenging BIG-Bench tasks where language models initially performed below average human-rater performance",
source="https://github.com/suzgunmirac/BIG-Bench-Hard",
language="English",
format="Multiple choice questions with single correct answers",
characteristics=(
"Tasks require complex multi-step reasoning and were selected based on "
"initial model performance below human baseline. Performance can be "
"significantly improved through chain-of-thought prompting. The dataset "
"includes 23 core tasks plus additional related tasks."
),
category=["Advanced Reasoning"],
citation=(
"@article{suzgun2022challenging,\n"
" title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n"
                ' author={Suzgun, Mirac and Scales, Nathan and Sch{\\"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},\n'
" journal={arXiv preprint arXiv:2210.09261},\n"
" year={2022}\n"
"}"
),
additional_info={
"model_performance": (
"With chain-of-thought prompting, PaLM surpassed human performance on "
"10/23 tasks, while Codex surpassed human performance on 17/23 tasks"
),
"size": "6.5k examples across 27 tasks (23 core + 4 related)",
},
)
    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns the recommended evaluation metrics for the BBH dataset."""
return [
EvaluationMetric.create(
name="accuracy",
type="classification",
description="Proportion of exactly correct answers (after stripping parentheses)",
implementation="evaluate.load('accuracy')",
primary=True,
),
EvaluationMetric.create(
name="human_eval_delta",
type="comparison",
description="Difference between model accuracy and average human-rater performance baseline",
implementation="custom_human_baseline_comparison",
primary=True,
),
EvaluationMetric.create(
name="per_task_accuracy",
type="classification",
description="Accuracy broken down by individual reasoning tasks",
implementation="custom_task_accuracy",
primary=False,
),
EvaluationMetric.create(
name="exact_match",
type="string_match",
description="Strict exact match between predicted and target answers",
implementation="evaluate.load('exact_match')",
primary=False,
),
]
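
# A minimal sketch (assumes the Hugging Face `evaluate` package is installed) of
# how the exact_match metric referenced above could be computed on parsed answers:
#
#   import evaluate
#   exact_match = evaluate.load("exact_match")
#   result = exact_match.compute(predictions=["A", "B"], references=["A", "C"])
#   print(result["exact_match"])  # 0.5 for this toy pair of lists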
if __name__ == "__main__":
# Example usage
parser = BBHDatasetParser()
# Load the dataset with a specific task
parser.load(task_name="reasoning_about_colored_objects")
# Parse all splits
parser.parse()
    # Retrieve the parsed entries (get_parsed_data is exposed as a property on the base parser)
parsed_data = parser.get_parsed_data
# Print example entry
if parsed_data:
example = parsed_data[0]
print("\nExample parsed entry:")
print(f"Task: {example.task_name}")
print(f"Question: {example.question}")
print(f"Answer: {example.answer}")