from dataclasses import dataclass
from typing import Any, ClassVar, List

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class BBHParseEntry(HuggingFaceParseEntry):
    """Custom entry class for BBH (Big Bench Hard), with fields specific to this dataset."""

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        raw_answer: str,
        task_name: str,
    ) -> "BBHParseEntry":
        """Create a BBHParseEntry from already-processed fields."""
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            task_name=task_name,
        )


class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
    """Parser for the Big Bench Hard dataset."""

    _data_source: ClassVar[str] = "lukaemon/bbh"
    _task_names: ClassVar[list[str]] = [
        "boolean_expressions",
        "causal_judgement",
        "date_understanding",
        "disambiguation_qa",
        "dyck_languages",
        "formal_fallacies",
        "geometric_shapes",
        "hyperbaton",
        "logical_deduction_five_objects",
        "logical_deduction_seven_objects",
        "logical_deduction_three_objects",
        "movie_recommendation",
        "multistep_arithmetic_two",
        "navigate",
        "object_counting",
        "penguins_in_a_table",
        "reasoning_about_colored_objects",
        "ruin_names",
        "salient_translation_error_detection",
        "snarks",
        "sports_understanding",
        "temporal_sequences",
        "tracking_shuffled_objects_five_objects",
        "tracking_shuffled_objects_seven_objects",
        "tracking_shuffled_objects_three_objects",
        "web_of_lies",
        "word_sorting",
    ]
    _default_task: ClassVar[str] = "reasoning_about_colored_objects"

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> BBHParseEntry:
        """Process a single BBH entry."""
        raw_question = row["input"]
        raw_answer = row["target"]

        # Remove surrounding parentheses from the answer, e.g. "(A)" -> "A"
        clean_answer = raw_answer.strip("()")

        question = str(raw_question)

        # Use task_name if provided, otherwise fall back to the currently loaded task
        task = task_name or self._get_current_task(row)

        return BBHParseEntry.create(
            question=question,
            answer=clean_answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            task_name=task,
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns a description of the Big Bench Hard dataset."""
        return DatasetDescription.create(
            name="Big Bench Hard (BBH)",
            purpose="A curated subset of 23 challenging BIG-Bench tasks where language models initially performed below average human-rater performance",
            source="https://github.com/suzgunmirac/BIG-Bench-Hard",
            language="English",
            format="Multiple choice questions with single correct answers",
            characteristics=(
                "Tasks require complex multi-step reasoning and were selected based on "
                "initial model performance below the human baseline. Performance can be "
                "significantly improved through chain-of-thought prompting. The dataset "
                "includes 23 core tasks plus additional related tasks."
            ),
            category=["Advanced Reasoning"],
            citation=(
                "@article{suzgun2022challenging,\n"
                " title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n"
                ' author={Suzgun, Mirac and Scales, Nathan and Sch{\\"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},\n'
                " journal={arXiv preprint arXiv:2210.09261},\n"
                " year={2022}\n"
                "}"
            ),
            additional_info={
                "model_performance": (
                    "With chain-of-thought prompting, PaLM surpassed human performance on "
                    "10/23 tasks, while Codex surpassed human performance on 17/23 tasks"
                ),
                "size": "6.5k examples across 27 tasks (23 core + 4 related)",
            },
        )

    def get_evaluation_metrics(self) -> List[EvaluationMetric]:
        """Returns the recommended evaluation metrics for the BBH dataset."""
        return [
            EvaluationMetric.create(
                name="accuracy",
                type="classification",
                description="Proportion of exactly correct answers (after stripping parentheses)",
                implementation="evaluate.load('accuracy')",
                primary=True,
            ),
            EvaluationMetric.create(
                name="human_eval_delta",
                type="comparison",
                description="Difference between model accuracy and the average human-rater performance baseline",
                implementation="custom_human_baseline_comparison",
                primary=True,
            ),
            EvaluationMetric.create(
                name="per_task_accuracy",
                type="classification",
                description="Accuracy broken down by individual reasoning tasks",
                implementation="custom_task_accuracy",
                primary=False,
            ),
            EvaluationMetric.create(
                name="exact_match",
                type="string_match",
                description="Strict exact match between predicted and target answers",
                implementation="evaluate.load('exact_match')",
                primary=False,
            ),
        ]


if __name__ == "__main__":
    # Example usage
    parser = BBHDatasetParser()

    # Load the dataset with a specific task
    parser.load(task_name="reasoning_about_colored_objects")

    # Parse all splits
    parser.parse()

    # Get parsed data
    parsed_data = parser.get_parsed_data

    # Print example entry
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Task: {example.task_name}")
        print(f"Question: {example.question}")
        print(f"Answer: {example.answer}")
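
        # --- Optional scoring sketch (not part of the parser API) ---
        # A minimal sketch of how the `exact_match` metric suggested in
        # get_evaluation_metrics() could be applied to parsed entries, assuming
        # the Hugging Face `evaluate` package is installed. `model_predictions`
        # is a hypothetical stand-in; the gold answers are reused here so the
        # snippet runs end to end.
        try:
            import evaluate

            exact_match = evaluate.load("exact_match")
            references = [entry.answer for entry in parsed_data[:10]]
            model_predictions = list(references)  # placeholder for real model output
            score = exact_match.compute(
                predictions=model_predictions, references=references
            )
            print(f"Exact match (placeholder predictions): {score['exact_match']:.2f}")
        except ImportError:
            print("Install the `evaluate` package to run the scoring sketch.")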