refactor: test cases

- tests/test_bbh_parser.py +0 -5
- tests/test_gsm8k_parser.py +0 -18
- tests/test_humaneval_parser.py +0 -2
- tests/test_ifeval_parser.py +0 -22
- tests/test_math_parser.py +0 -9
- tests/test_mbpp_parser.py +0 -33
- tests/test_mgsm_parser.py +1 -27
tests/test_bbh_parser.py
@@ -165,14 +165,9 @@ def test_get_dataset_description(bbh_parser):
     description = bbh_parser.get_dataset_description()
 
     assert description.name == "Big Bench Hard (BBH)"
-    assert "challenging BIG-Bench tasks" in description.purpose
     assert description.language == "English"
     assert description.format == "Multiple choice questions with single correct answers"
-    assert "Tasks require complex multi-step reasoning" in description.characteristics
     assert "suzgun2022challenging" in description.citation
-    assert description.additional_info is not None
-    assert "model_performance" in description.additional_info
-    assert "size" in description.additional_info
 
 
 def test_get_evaluation_metrics(bbh_parser):
tests/test_gsm8k_parser.py
@@ -190,10 +190,7 @@ def test_get_dataset_description(gsm8k_parser):
     assert description.name == "Grade School Math 8K (GSM8K)"
     assert description.source == "OpenAI"
     assert description.language == "English"
-    assert "8.5K grade school math word problems" in description.characteristics
-    assert "Training Verifiers to Solve Math Word Problems" in description.citation
     assert "Cobbe" in description.citation
-    assert "arXiv" in description.citation
 
 
 def test_get_evaluation_metrics(gsm8k_parser):
@@ -210,18 +207,3 @@ def test_get_evaluation_metrics(gsm8k_parser):
     assert exact_match.type == "string"
     assert exact_match.primary is True
     assert "exact match" in exact_match.description.lower()
-
-    # Check solution_validity metric details
-    solution_validity = next(m for m in metrics if m.name == "solution_validity")
-    assert solution_validity.type == "text"
-    assert solution_validity.primary is True
-    assert "valid" in solution_validity.description.lower()
-
-    # Check step metrics
-    step_accuracy = next(m for m in metrics if m.name == "step_accuracy")
-    assert step_accuracy.type == "numerical"
-    assert step_accuracy.primary is True
-
-    step_count = next(m for m in metrics if m.name == "step_count")
-    assert step_count.type == "numerical"
-    assert step_count.primary is False
tests/test_humaneval_parser.py
@@ -180,8 +180,6 @@ def test_get_dataset_description(parser, plus_parser):
     assert description.name == "HumanEval"
     assert "code generation" in description.purpose
     assert description.language == "Python"
-    assert "Function signatures with docstrings" in description.format
-    assert "164 hand-written Python programming problems" in description.characteristics
     assert "chen2021codex" in description.citation
 
     # Test HumanEval Plus description
tests/test_ifeval_parser.py
@@ -96,14 +96,8 @@ def test_get_dataset_description(ifeval_parser):
     description = ifeval_parser.get_dataset_description()
 
     assert description.name == "IFEval"
-    assert "verifiable instructions" in description.purpose.lower()
     assert description.source == "Google Research"
     assert description.language == "English (BCP-47 en)"
-    assert "verifiable instruction prompts" in description.format.lower()
-    assert "500" in description.characteristics
-    assert "automated heuristics" in description.characteristics.lower()
-    assert "open llm leaderboard" in description.characteristics.lower()
-    assert "zhou2023instructionfollowingevaluation" in description.citation
 
 
 def test_get_evaluation_metrics(ifeval_parser):
@@ -124,19 +118,3 @@ def test_get_evaluation_metrics(ifeval_parser):
     assert "punctuation_rules" in metric_names
     assert "keyword_usage" in metric_names
     assert "structural_requirements" in metric_names
-
-    # Check specific metric properties
-    format_metric = next(m for m in metrics if m.name == "format_compliance")
-    assert format_metric.primary is True
-    assert "formatting rules" in format_metric.description.lower()
-    assert format_metric.type == "text"
-
-    length_metric = next(m for m in metrics if m.name == "length_constraints")
-    assert length_metric.primary is True
-    assert "word" in length_metric.description.lower()
-    assert length_metric.type == "text"
-
-    punctuation_metric = next(m for m in metrics if m.name == "punctuation_rules")
-    assert punctuation_metric.primary is True
-    assert "punctuation" in punctuation_metric.description.lower()
-    assert punctuation_metric.type == "text"
tests/test_math_parser.py
@@ -205,12 +205,9 @@ def test_get_dataset_description(math_parser):
     description = math_parser.get_dataset_description()
 
     assert description.name == "MATH"
-    assert "mathematical problem-solving" in description.purpose.lower()
     assert "Hendrycks" in description.source
     assert description.language == "English"
-    assert "competition mathematics problems" in description.format.lower()
     assert "12,500" in description.characteristics
-    assert "step-by-step solutions" in description.characteristics.lower()
     assert "hendrycksmath2021" in description.citation
     assert "NeurIPS" in description.citation
 
@@ -220,8 +217,6 @@ def test_get_dataset_description(math_parser):
     assert "algebra" in description.additional_info["topics"]
     assert "geometry" in description.additional_info["topics"]
     assert description.additional_info["size"] == "12,500 problems"
-    assert "sympy" in description.additional_info["evaluation_note"].lower()
-    assert "github.com/hendrycks/math" in description.additional_info["homepage"]
 
 
 def test_get_evaluation_metrics(math_parser):
@@ -259,7 +254,3 @@ def test_get_evaluation_metrics(math_parser):
     assert reasoning_metric.type == "text"
     assert reasoning_metric.primary is True
     assert "mathematical reasoning" in reasoning_metric.description.lower()
-
-    # Check non-primary metrics
-    non_primary_metrics = {m.name for m in metrics if not m.primary}
-    assert non_primary_metrics == {"mathematical_notation", "solution_clarity"}
tests/test_mbpp_parser.py
@@ -162,31 +162,10 @@ def test_get_dataset_description(parser):
     assert "code generation" in description.purpose.lower()
     assert "google-research" in description.source
     assert description.language == "English and Python"
-    assert "task descriptions" in description.format.lower()
-    assert "python solutions" in description.format.lower()
     assert "1,000" in description.characteristics
-    assert "entry-level programmers" in description.characteristics.lower()
-    assert "3 automated test cases" in description.characteristics
-    assert "hand-verified" in description.characteristics
     assert "austin2021program" in description.citation
     assert "Program Synthesis" in description.citation
 
-    # Check additional info
-    assert description.additional_info is not None
-    assert description.additional_info["size"] == "~1,000 programming problems"
-    assert (
-        description.additional_info["splits"]
-        == "Available in full or sanitized versions"
-    )
-    assert (
-        description.additional_info["test_coverage"]
-        == "Each problem includes 3 automated test cases"
-    )
-    assert (
-        description.additional_info["verification"]
-        == "Subset of data has been hand-verified by authors"
-    )
-
 
 def test_get_evaluation_metrics(parser):
     """Test evaluation metrics generation."""
@@ -211,15 +190,3 @@ def test_get_evaluation_metrics(parser):
     assert pass_k_metric.primary is True
     assert "k generations" in pass_k_metric.description.lower()
     assert "custom_pass_at_k" in pass_k_metric.implementation
-
-    test_case_metric = next(m for m in metrics if m.name == "test_case_success_rate")
-    assert test_case_metric.type == "code_evaluation"
-    assert test_case_metric.primary is False
-    assert "test cases" in test_case_metric.description.lower()
-    assert "custom_test_success_rate" in test_case_metric.implementation
-
-    syntax_metric = next(m for m in metrics if m.name == "syntax_validity")
-    assert syntax_metric.type == "code_evaluation"
-    assert syntax_metric.primary is False
-    assert "syntactically valid" in syntax_metric.description.lower()
-    assert "custom_syntax_check" in syntax_metric.implementation
tests/test_mgsm_parser.py
@@ -192,35 +192,18 @@ def test_get_dataset_description(mgsm_parser):
     assert "multilingual chain-of-thought reasoning" in description.purpose.lower()
     assert "juletxara/mgsm" in description.source
     assert description.language == "Multilingual (11 languages)"
-
-    assert "numerical answers" in description.format.lower()
-    assert "solution steps" in description.format.lower()
-
-    # Check characteristics
-    assert "250" in description.characteristics
-    assert "gsm8k" in description.characteristics.lower()
-    assert "translations" in description.characteristics.lower()
+
     assert "mathematical reasoning" in description.characteristics.lower()
 
     # Check citations
     assert "shi2022language" in description.citation
     assert "cobbe2021gsm8k" in description.citation
-    assert (
-        "Language Models are Multilingual Chain-of-Thought Reasoners"
-        in description.citation
-    )
-    assert "Training Verifiers to Solve Math Word Problems" in description.citation
 
     # Check additional info
     assert description.additional_info is not None
     assert len(description.additional_info["languages"]) == 11
     assert "English" in description.additional_info["languages"]
     assert "Chinese" in description.additional_info["languages"]
-    assert (
-        description.additional_info["size"]
-        == "250 problems translated into each language"
-    )
-    assert description.additional_info["base_dataset"] == "GSM8K (Grade School Math 8K)"
 
 
 def test_get_evaluation_metrics(mgsm_parser):
@@ -259,12 +242,3 @@ def test_get_evaluation_metrics(mgsm_parser):
     assert step_metric.primary is True
     assert "calculation steps" in step_metric.description.lower()
     assert "custom_step_accuracy" in step_metric.implementation
-
-    # Check cross-lingual metric specifically
-    cross_lingual_metric = next(
-        m for m in metrics if m.name == "cross_lingual_consistency"
-    )
-    assert cross_lingual_metric.type == "comparison"
-    assert cross_lingual_metric.primary is False
-    assert "different language versions" in cross_lingual_metric.description.lower()
-    assert "custom_language_comparator" in cross_lingual_metric.implementation