JeffYang52415 committed
Commit 299e68a · unverified · 1 Parent(s): fb32f8e

refactor: description&metrics interface

llmdataparser/base_parser.py CHANGED
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from functools import lru_cache
-from typing import Any, ClassVar, Generic, TypeVar
+from typing import Any, ClassVar, Generic, List, TypeVar

 import datasets

@@ -19,6 +19,66 @@ class ParseEntry:
     raw_answer: str


+@dataclass(frozen=True, kw_only=True, slots=True)
+class DatasetDescription:
+    """Standardized description of a dataset."""
+
+    name: str
+    purpose: str
+    source: str
+    language: str
+    format: str
+    characteristics: str
+    citation: str | None = None
+    additional_info: dict[str, Any] | None = None
+
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        purpose: str,
+        source: str,
+        language: str,
+        format: str,
+        characteristics: str,
+        citation: str | None = None,
+        additional_info: dict[str, Any] | None = None,
+    ) -> "DatasetDescription":
+        return cls(
+            name=name,
+            purpose=purpose,
+            source=source,
+            language=language,
+            format=format,
+            characteristics=characteristics,
+            citation=citation,
+            additional_info=additional_info,
+        )
+
+
+@dataclass(frozen=True, kw_only=True, slots=True)
+class EvaluationMetric:
+    """Description of an evaluation metric for a dataset."""
+
+    name: str
+    type: str
+    description: str
+    implementation: str
+    primary: bool
+
+    @classmethod
+    def create(
+        cls, name: str, type: str, description: str, implementation: str, primary: bool
+    ) -> "EvaluationMetric":
+        return cls(
+            name=name,
+            type=type,
+            description=description,
+            implementation=implementation,
+            primary=primary,
+        )
+
+
 class DatasetParser(Generic[T], ABC):
     """
     Abstract base class defining the interface for all dataset parsers.
@@ -59,6 +119,21 @@ class DatasetParser(Generic[T], ABC):
         T: The processed entry, typically an instance of a subclass of ParseEntry.
     """

+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns a standardized description of the dataset."""
+        return DatasetDescription(
+            name="Unknown",
+            purpose="Not specified",
+            source="Not specified",
+            language="Not specified",
+            format="Not specified",
+            characteristics="Not specified",
+        )
+
+    def get_evaluation_metrics(self) -> List[EvaluationMetric]:
+        """Returns the recommended evaluation metrics for the dataset."""
+        return []
+

 @dataclass(frozen=True, kw_only=True, slots=True)
 class HuggingFaceParseEntry(ParseEntry):
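
Reviewer note: a minimal sketch, not part of the commit, of how downstream code could consume the refactored interface. Only DatasetParser, DatasetDescription, and EvaluationMetric come from base_parser.py above; the summarize helper name is an assumption for illustration.

from llmdataparser.base_parser import DatasetParser, EvaluationMetric


def summarize(parser: DatasetParser) -> str:
    # Both calls rely only on the interface added in this commit; concrete
    # parsers such as BBHDatasetParser override them with dataset-specific values.
    description = parser.get_dataset_description()
    metrics: list[EvaluationMetric] = parser.get_evaluation_metrics()
    primary = [m.name for m in metrics if m.primary]
    return (
        f"{description.name} ({description.language}): {description.purpose}\n"
        f"Primary metrics: {', '.join(primary) or 'none specified'}"
    )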
llmdataparser/bbh_parser.py CHANGED
@@ -1,7 +1,12 @@
 from dataclasses import dataclass
-from typing import Any, ClassVar, Dict, List
-
-from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
+from typing import Any, ClassVar, List
+
+from llmdataparser.base_parser import (
+    DatasetDescription,
+    EvaluationMetric,
+    HuggingFaceDatasetParser,
+    HuggingFaceParseEntry,
+)
 from llmdataparser.prompts import BBH_SYSTEM_PROMPT  # You'll need to create this


@@ -87,26 +92,21 @@ class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
             task_name=task,
         )

-    def get_dataset_description(self) -> Dict[str, str]:
+    def get_dataset_description(self) -> DatasetDescription:
         """Returns a description of the Big Bench Hard dataset."""
-        return {
-            "name": "Big Bench Hard (BBH)",
-            "purpose": "A curated subset of 23 challenging BIG-Bench tasks where language models initially performed below average human-rater performance",
-            "source": "https://github.com/suzgunmirac/BIG-Bench-Hard",
-            "language": "English",
-            "size": "6.5k examples across 27 tasks (23 core + 4 related)",
-            "format": "Multiple choice questions with single correct answers",
-            "characteristics": (
+        return DatasetDescription.create(
+            name="Big Bench Hard (BBH)",
+            purpose="A curated subset of 23 challenging BIG-Bench tasks where language models initially performed below average human-rater performance",
+            source="https://github.com/suzgunmirac/BIG-Bench-Hard",
+            language="English",
+            format="Multiple choice questions with single correct answers",
+            characteristics=(
                 "Tasks require complex multi-step reasoning and were selected based on "
                 "initial model performance below human baseline. Performance can be "
                 "significantly improved through chain-of-thought prompting. The dataset "
                 "includes 23 core tasks plus additional related tasks."
             ),
-            "model_performance": (
-                "With chain-of-thought prompting, PaLM surpassed human performance on "
-                "10/23 tasks, while Codex surpassed human performance on 17/23 tasks"
-            ),
-            "citation": (
+            citation=(
                 "@article{suzgun2022challenging,\n"
                 " title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n"
                 ' author={Suzgun, Mirac and Scales, Nathan and Sch{"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},\n'
@@ -114,39 +114,46 @@ class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
                 " year={2022}\n"
                 "}"
             ),
-        }
+            additional_info={
+                "model_performance": (
+                    "With chain-of-thought prompting, PaLM surpassed human performance on "
+                    "10/23 tasks, while Codex surpassed human performance on 17/23 tasks"
+                ),
+                "size": "6.5k examples across 27 tasks (23 core + 4 related)",
+            },
+        )

-    def get_evaluation_metrics(self) -> List[Dict[str, Any]]:
+    def get_evaluation_metrics(self) -> List[EvaluationMetric]:
         """Returns the recommended evaluation metrics for BBH dataset."""
         return [
-            {
-                "name": "accuracy",
-                "type": "classification",
-                "description": "Proportion of exactly correct answers (after stripping parentheses)",
-                "implementation": "evaluate.load('accuracy')",
-                "primary": True,
-            },
-            {
-                "name": "human_eval_delta",
-                "type": "comparison",
-                "description": "Difference between model accuracy and average human-rater performance baseline",
-                "implementation": "custom_human_baseline_comparison",
-                "primary": True,
-            },
-            {
-                "name": "per_task_accuracy",
-                "type": "classification",
-                "description": "Accuracy broken down by individual reasoning tasks",
-                "implementation": "custom_task_accuracy",
-                "primary": False,
-            },
-            {
-                "name": "exact_match",
-                "type": "string_match",
-                "description": "Strict exact match between predicted and target answers",
-                "implementation": "evaluate.load('exact_match')",
-                "primary": False,
-            },
+            EvaluationMetric.create(
+                name="accuracy",
+                type="classification",
+                description="Proportion of exactly correct answers (after stripping parentheses)",
+                implementation="evaluate.load('accuracy')",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="human_eval_delta",
+                type="comparison",
+                description="Difference between model accuracy and average human-rater performance baseline",
+                implementation="custom_human_baseline_comparison",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="per_task_accuracy",
+                type="classification",
+                description="Accuracy broken down by individual reasoning tasks",
+                implementation="custom_task_accuracy",
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="exact_match",
+                type="string_match",
+                description="Strict exact match between predicted and target answers",
+                implementation="evaluate.load('exact_match')",
+                primary=False,
+            ),
         ]

llmdataparser/tmlu_parser.py CHANGED
@@ -1,7 +1,12 @@
 from dataclasses import dataclass
-from typing import Any, Dict, Final, List
-
-from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
+from typing import Any, Final
+
+from llmdataparser.base_parser import (
+    DatasetDescription,
+    EvaluationMetric,
+    HuggingFaceDatasetParser,
+    HuggingFaceParseEntry,
+)
 from llmdataparser.prompts import TMLU_SYSTEM_PROMPT

 TMLU_VALID_ANSWERS: Final[set[str]] = {"A", "B", "C", "D"}
@@ -118,63 +123,73 @@ class TMLUDatasetParser(HuggingFaceDatasetParser[TMLUParseEntry]):
             metadata=metadata,
         )

-    def get_dataset_description(self) -> Dict[str, str]:
+    def get_dataset_description(self) -> DatasetDescription:
         """Returns description of the TMLU dataset."""
-        return {
-            "name": "Taiwan Multiple-choice Language Understanding (TMLU)",
-            "version": "1.0",
-            "language": "Traditional Chinese",
-            "purpose": "Evaluate models on Taiwan-specific educational and professional knowledge",
-            "source": "Various Taiwan standardized tests and professional certifications",
-            "format": "Multiple choice questions (A/B/C/D)",
-            "size": "Multiple subjects across different test types",
-            "domain": "Education and Professional Certification",
-            "characteristics": (
+        return DatasetDescription.create(
+            name="Taiwan Multiple-choice Language Understanding (TMLU)",
+            language="Traditional Chinese",
+            purpose="Evaluate models on Taiwan-specific educational and professional knowledge",
+            source="Various Taiwan standardized tests and professional certifications",
+            format="Multiple choice questions (A/B/C/D)",
+            characteristics=(
                 "Covers various subjects including Advanced Subjects Test (AST), "
                 "General Scholastic Ability Test (GSAT), College Admission Practice (CAP), "
                 "and professional certifications"
             ),
-            "reference": "https://huggingface.co/datasets/miulab/tmlu",
-        }
+            citation="""@article{DBLP:journals/corr/abs-2403-20180,
+  author = {Po-Heng Chen and Sijia Cheng and Wei-Lin Chen and Yen-Ting Lin and Yun-Nung Chen},
+  title = {Measuring Taiwanese Mandarin Language Understanding},
+  journal = {CoRR},
+  volume = {abs/2403.20180},
+  year = {2024},
+  url = {https://doi.org/10.48550/arXiv.2403.20180},
+  doi = {10.48550/ARXIV.2403.20180},
+  eprinttype = {arXiv},
+  eprint = {2403.20180},
+  timestamp = {Wed, 10 Apr 2024 17:37:45 +0200},
+  biburl = {https://dblp.org/rec/journals/corr/abs-2403-20180.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}""",
+        )

-    def get_evaluation_metrics(self) -> List[Dict[str, Any]]:
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
         """Returns recommended evaluation metrics for TMLU."""
         return [
-            {
-                "name": "accuracy",
-                "type": "classification",
-                "description": "Overall percentage of correctly answered questions",
-                "implementation": "datasets.load_metric('accuracy')",
-                "primary": True,
-            },
-            {
-                "name": "per_subject_accuracy",
-                "type": "classification",
-                "description": "Accuracy broken down by subject areas (AST, GSAT, CAP, etc.)",
-                "implementation": "custom_subject_accuracy",
-                "primary": True,
-            },
-            {
-                "name": "per_difficulty_accuracy",
-                "type": "classification",
-                "description": "Accuracy broken down by test difficulty levels",
-                "implementation": "custom_difficulty_accuracy",
-                "primary": False,
-            },
-            {
-                "name": "confusion_matrix",
-                "type": "classification",
-                "description": "Distribution of predicted vs actual answers",
-                "implementation": "datasets.load_metric('confusion_matrix')",
-                "primary": False,
-            },
-            {
-                "name": "explanation_quality",
-                "type": "text",
-                "description": "Quality assessment of model explanations when available",
-                "implementation": "custom_explanation_metric",
-                "primary": False,
-            },
+            EvaluationMetric.create(
+                name="accuracy",
+                type="classification",
+                description="Overall percentage of correctly answered questions",
+                implementation="datasets.load_metric('accuracy')",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="per_subject_accuracy",
+                type="classification",
+                description="Accuracy broken down by subject areas (AST, GSAT, CAP, etc.)",
+                implementation="custom_subject_accuracy",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="per_difficulty_accuracy",
+                type="classification",
+                description="Accuracy broken down by test difficulty levels",
+                implementation="custom_difficulty_accuracy",
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="confusion_matrix",
+                type="classification",
+                description="Distribution of predicted vs actual answers",
+                implementation="datasets.load_metric('confusion_matrix')",
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="explanation_quality",
+                type="text",
+                description="Quality assessment of model explanations when available",
+                implementation="custom_explanation_metric",
+                primary=False,
+            ),
         ]

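Reviewer note: a sketch of an interface-level smoke test for this refactor. The test name is hypothetical, and it assumes BBHDatasetParser can be constructed without arguments, which this diff does not show.

from llmdataparser.base_parser import DatasetDescription, EvaluationMetric
from llmdataparser.bbh_parser import BBHDatasetParser


def test_bbh_metadata_interface() -> None:
    parser = BBHDatasetParser()  # assumption: no-argument construction
    description = parser.get_dataset_description()
    metrics = parser.get_evaluation_metrics()

    assert isinstance(description, DatasetDescription)
    assert all(isinstance(m, EvaluationMetric) for m in metrics)
    assert any(m.primary for m in metrics)  # BBH declares two primary metrics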