JeffYang52415 committed
Commit aa46ecd · unverified · 1 Parent(s): 2e6d41b

refactor: test cases

tests/test_bbh_parser.py CHANGED
@@ -165,14 +165,9 @@ def test_get_dataset_description(bbh_parser):
     description = bbh_parser.get_dataset_description()
 
     assert description.name == "Big Bench Hard (BBH)"
-    assert "challenging BIG-Bench tasks" in description.purpose
     assert description.language == "English"
     assert description.format == "Multiple choice questions with single correct answers"
-    assert "Tasks require complex multi-step reasoning" in description.characteristics
     assert "suzgun2022challenging" in description.citation
-    assert description.additional_info is not None
-    assert "model_performance" in description.additional_info
-    assert "size" in description.additional_info
 
 
 def test_get_evaluation_metrics(bbh_parser):
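For orientation, a minimal sketch of how the trimmed BBH description test reads after this commit, keeping only the stable assertions left in the hunk above (the bbh_parser fixture is assumed to be defined elsewhere in the test module):

def test_get_dataset_description(bbh_parser):
    # Post-refactor version: brittle substring checks on purpose,
    # characteristics and additional_info have been dropped.
    description = bbh_parser.get_dataset_description()

    assert description.name == "Big Bench Hard (BBH)"
    assert description.language == "English"
    assert description.format == "Multiple choice questions with single correct answers"
    assert "suzgun2022challenging" in description.citation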
tests/test_gsm8k_parser.py CHANGED
@@ -190,10 +190,7 @@ def test_get_dataset_description(gsm8k_parser):
     assert description.name == "Grade School Math 8K (GSM8K)"
     assert description.source == "OpenAI"
     assert description.language == "English"
-    assert "8.5K grade school math word problems" in description.characteristics
-    assert "Training Verifiers to Solve Math Word Problems" in description.citation
     assert "Cobbe" in description.citation
-    assert "arXiv" in description.citation
 
 
 def test_get_evaluation_metrics(gsm8k_parser):
@@ -210,18 +207,3 @@ def test_get_evaluation_metrics(gsm8k_parser):
     assert exact_match.type == "string"
     assert exact_match.primary is True
     assert "exact match" in exact_match.description.lower()
-
-    # Check solution_validity metric details
-    solution_validity = next(m for m in metrics if m.name == "solution_validity")
-    assert solution_validity.type == "text"
-    assert solution_validity.primary is True
-    assert "valid" in solution_validity.description.lower()
-
-    # Check step metrics
-    step_accuracy = next(m for m in metrics if m.name == "step_accuracy")
-    assert step_accuracy.type == "numerical"
-    assert step_accuracy.primary is True
-
-    step_count = next(m for m in metrics if m.name == "step_count")
-    assert step_count.type == "numerical"
-    assert step_count.primary is False
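As a point of reference, a sketch of the surviving tail of the GSM8K metrics test after this change; the opening metrics call and the next(...) lookup are assumptions modeled on the removed assertions, since the hunk does not show the top of the function:

def test_get_evaluation_metrics(gsm8k_parser):
    # Assumed setup; only the exact_match assertions below appear in the diff.
    metrics = gsm8k_parser.get_evaluation_metrics()

    exact_match = next(m for m in metrics if m.name == "exact_match")
    assert exact_match.type == "string"
    assert exact_match.primary is True
    assert "exact match" in exact_match.description.lower()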
tests/test_humaneval_parser.py CHANGED
@@ -180,8 +180,6 @@ def test_get_dataset_description(parser, plus_parser):
     assert description.name == "HumanEval"
     assert "code generation" in description.purpose
     assert description.language == "Python"
-    assert "Function signatures with docstrings" in description.format
-    assert "164 hand-written Python programming problems" in description.characteristics
     assert "chen2021codex" in description.citation
 
     # Test HumanEval Plus description
tests/test_ifeval_parser.py CHANGED
@@ -96,14 +96,8 @@ def test_get_dataset_description(ifeval_parser):
     description = ifeval_parser.get_dataset_description()
 
     assert description.name == "IFEval"
-    assert "verifiable instructions" in description.purpose.lower()
     assert description.source == "Google Research"
     assert description.language == "English (BCP-47 en)"
-    assert "verifiable instruction prompts" in description.format.lower()
-    assert "500" in description.characteristics
-    assert "automated heuristics" in description.characteristics.lower()
-    assert "open llm leaderboard" in description.characteristics.lower()
-    assert "zhou2023instructionfollowingevaluation" in description.citation
 
 
 def test_get_evaluation_metrics(ifeval_parser):
@@ -124,19 +118,3 @@ def test_get_evaluation_metrics(ifeval_parser):
     assert "punctuation_rules" in metric_names
     assert "keyword_usage" in metric_names
     assert "structural_requirements" in metric_names
-
-    # Check specific metric properties
-    format_metric = next(m for m in metrics if m.name == "format_compliance")
-    assert format_metric.primary is True
-    assert "formatting rules" in format_metric.description.lower()
-    assert format_metric.type == "text"
-
-    length_metric = next(m for m in metrics if m.name == "length_constraints")
-    assert length_metric.primary is True
-    assert "word" in length_metric.description.lower()
-    assert length_metric.type == "text"
-
-    punctuation_metric = next(m for m in metrics if m.name == "punctuation_rules")
-    assert punctuation_metric.primary is True
-    assert "punctuation" in punctuation_metric.description.lower()
-    assert punctuation_metric.type == "text"
tests/test_math_parser.py CHANGED
@@ -205,12 +205,9 @@ def test_get_dataset_description(math_parser):
     description = math_parser.get_dataset_description()
 
     assert description.name == "MATH"
-    assert "mathematical problem-solving" in description.purpose.lower()
     assert "Hendrycks" in description.source
     assert description.language == "English"
-    assert "competition mathematics problems" in description.format.lower()
     assert "12,500" in description.characteristics
-    assert "step-by-step solutions" in description.characteristics.lower()
     assert "hendrycksmath2021" in description.citation
     assert "NeurIPS" in description.citation
 
@@ -220,8 +217,6 @@ def test_get_dataset_description(math_parser):
     assert "algebra" in description.additional_info["topics"]
     assert "geometry" in description.additional_info["topics"]
     assert description.additional_info["size"] == "12,500 problems"
-    assert "sympy" in description.additional_info["evaluation_note"].lower()
-    assert "github.com/hendrycks/math" in description.additional_info["homepage"]
 
 
 def test_get_evaluation_metrics(math_parser):
@@ -259,7 +254,3 @@ def test_get_evaluation_metrics(math_parser):
     assert reasoning_metric.type == "text"
     assert reasoning_metric.primary is True
     assert "mathematical reasoning" in reasoning_metric.description.lower()
-
-    # Check non-primary metrics
-    non_primary_metrics = {m.name for m in metrics if not m.primary}
-    assert non_primary_metrics == {"mathematical_notation", "solution_clarity"}
tests/test_mbpp_parser.py CHANGED
@@ -162,31 +162,10 @@ def test_get_dataset_description(parser):
     assert "code generation" in description.purpose.lower()
     assert "google-research" in description.source
     assert description.language == "English and Python"
-    assert "task descriptions" in description.format.lower()
-    assert "python solutions" in description.format.lower()
     assert "1,000" in description.characteristics
-    assert "entry-level programmers" in description.characteristics.lower()
-    assert "3 automated test cases" in description.characteristics
-    assert "hand-verified" in description.characteristics
     assert "austin2021program" in description.citation
     assert "Program Synthesis" in description.citation
 
-    # Check additional info
-    assert description.additional_info is not None
-    assert description.additional_info["size"] == "~1,000 programming problems"
-    assert (
-        description.additional_info["splits"]
-        == "Available in full or sanitized versions"
-    )
-    assert (
-        description.additional_info["test_coverage"]
-        == "Each problem includes 3 automated test cases"
-    )
-    assert (
-        description.additional_info["verification"]
-        == "Subset of data has been hand-verified by authors"
-    )
-
 
 def test_get_evaluation_metrics(parser):
     """Test evaluation metrics generation."""
@@ -211,15 +190,3 @@ def test_get_evaluation_metrics(parser):
     assert pass_k_metric.primary is True
     assert "k generations" in pass_k_metric.description.lower()
     assert "custom_pass_at_k" in pass_k_metric.implementation
-
-    test_case_metric = next(m for m in metrics if m.name == "test_case_success_rate")
-    assert test_case_metric.type == "code_evaluation"
-    assert test_case_metric.primary is False
-    assert "test cases" in test_case_metric.description.lower()
-    assert "custom_test_success_rate" in test_case_metric.implementation
-
-    syntax_metric = next(m for m in metrics if m.name == "syntax_validity")
-    assert syntax_metric.type == "code_evaluation"
-    assert syntax_metric.primary is False
-    assert "syntactically valid" in syntax_metric.description.lower()
-    assert "custom_syntax_check" in syntax_metric.implementation
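Similarly, a sketch of the surviving tail of the MBPP metrics test after this commit; the metrics call and the metric name "pass_at_k" in the next(...) lookup are assumptions, as the hunk shows only the pass_k_metric assertions that remain:

def test_get_evaluation_metrics(parser):
    """Test evaluation metrics generation."""
    metrics = parser.get_evaluation_metrics()

    # Assumed lookup name; only the assertions below are visible in the diff.
    pass_k_metric = next(m for m in metrics if m.name == "pass_at_k")
    assert pass_k_metric.primary is True
    assert "k generations" in pass_k_metric.description.lower()
    assert "custom_pass_at_k" in pass_k_metric.implementation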
tests/test_mgsm_parser.py CHANGED
@@ -192,35 +192,18 @@ def test_get_dataset_description(mgsm_parser):
     assert "multilingual chain-of-thought reasoning" in description.purpose.lower()
     assert "juletxara/mgsm" in description.source
     assert description.language == "Multilingual (11 languages)"
-    assert "word problems" in description.format.lower()
-    assert "numerical answers" in description.format.lower()
-    assert "solution steps" in description.format.lower()
-
-    # Check characteristics
-    assert "250" in description.characteristics
-    assert "gsm8k" in description.characteristics.lower()
-    assert "translations" in description.characteristics.lower()
+
     assert "mathematical reasoning" in description.characteristics.lower()
 
     # Check citations
     assert "shi2022language" in description.citation
     assert "cobbe2021gsm8k" in description.citation
-    assert (
-        "Language Models are Multilingual Chain-of-Thought Reasoners"
-        in description.citation
-    )
-    assert "Training Verifiers to Solve Math Word Problems" in description.citation
 
     # Check additional info
     assert description.additional_info is not None
     assert len(description.additional_info["languages"]) == 11
     assert "English" in description.additional_info["languages"]
     assert "Chinese" in description.additional_info["languages"]
-    assert (
-        description.additional_info["size"]
-        == "250 problems translated into each language"
-    )
-    assert description.additional_info["base_dataset"] == "GSM8K (Grade School Math 8K)"
 
 
 def test_get_evaluation_metrics(mgsm_parser):
@@ -259,12 +242,3 @@ def test_get_evaluation_metrics(mgsm_parser):
     assert step_metric.primary is True
     assert "calculation steps" in step_metric.description.lower()
     assert "custom_step_accuracy" in step_metric.implementation
-
-    # Check cross-lingual metric specifically
-    cross_lingual_metric = next(
-        m for m in metrics if m.name == "cross_lingual_consistency"
-    )
-    assert cross_lingual_metric.type == "comparison"
-    assert cross_lingual_metric.primary is False
-    assert "different language versions" in cross_lingual_metric.description.lower()
-    assert "custom_language_comparator" in cross_lingual_metric.implementation