refactor: ifeval parser
Files changed:
- llmdataparser/ifeval_parser.py (+73, -1)
- tests/test_ifeval_parser.py (+51, -0)
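Adds dataset-description and evaluation-metric metadata to the IFEval parser, and covers both new methods with unit tests.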
llmdataparser/ifeval_parser.py (CHANGED)
@@ -1,7 +1,12 @@
 from dataclasses import dataclass
 from typing import Any, ClassVar, List
 
-from llmdataparser.base_parser import
+from llmdataparser.base_parser import (
+    DatasetDescription,
+    EvaluationMetric,
+    HuggingFaceDatasetParser,
+    HuggingFaceParseEntry,
+)
 from llmdataparser.prompts import IFEVAL_SYSTEM_PROMPT  # You'll need to create this
 
 
@@ -77,6 +82,73 @@ class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
             task_name=task,
         )
 
+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns description of the IFEval dataset."""
+        return DatasetDescription.create(
+            name="IFEval",
+            purpose="Evaluate instruction following capabilities through verifiable instructions",
+            source="Google Research",
+            language="English (BCP-47 en)",
+            format="Verifiable instruction prompts with automated evaluation criteria",
+            characteristics=(
+                "Collection of approximately 500 verifiable instructions designed to evaluate "
+                "language models' instruction-following capabilities. Instructions include "
+                "specific, measurable criteria like 'write in more than 400 words' or "
+                "'mention the keyword AI at least 3 times' that can be verified through "
+                "automated heuristics. Used as a core benchmark in the Open LLM Leaderboard "
+                "for evaluating chat or instruction fine-tuned language models."
+            ),
+            citation="""@misc{zhou2023instructionfollowingevaluationlargelanguage,
+                title={Instruction-Following Evaluation for Large Language Models},
+                author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
+                year={2023},
+                eprint={2311.07911},
+                archivePrefix={arXiv},
+                primaryClass={cs.CL},
+                url={https://arxiv.org/abs/2311.07911}
+            }""",
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns recommended evaluation metrics for IFEval."""
+        return [
+            EvaluationMetric.create(
+                name="format_compliance",
+                type="text",
+                description="Verifies if the output follows specified formatting rules (e.g., highlighting, bullet points, sections)",
+                implementation="custom_format_checker",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="length_constraints",
+                type="text",
+                description="Checks if the response meets word, sentence, or paragraph count requirements",
+                implementation="custom_length_validator",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="punctuation_rules",
+                type="text",
+                description="Validates adherence to punctuation constraints (e.g., no commas, specific endings)",
+                implementation="custom_punctuation_checker",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="keyword_usage",
+                type="text",
+                description="Verifies correct usage of required keywords or avoidance of forbidden words",
+                implementation="custom_keyword_validator",
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="structural_requirements",
+                type="text",
+                description="Checks for specific structural elements like sections, paragraphs, or formatting patterns",
+                implementation="custom_structure_validator",
+                primary=False,
+            ),
+        ]
+
 
 if __name__ == "__main__":
     # Example usage
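The two methods added above are pure metadata accessors: they construct a DatasetDescription and a list of EvaluationMetric entries without loading any data. A minimal usage sketch, assuming IFEvalDatasetParser can be constructed with no arguments (its constructor is not part of this diff):

# Usage sketch only; the no-argument constructor is an assumption,
# since IFEvalDatasetParser.__init__ is not shown in this diff.
from llmdataparser.ifeval_parser import IFEvalDatasetParser

parser = IFEvalDatasetParser()

description = parser.get_dataset_description()
print(description.name)    # "IFEval"
print(description.source)  # "Google Research"

for metric in parser.get_evaluation_metrics():
    # Three of the five metrics are marked primary; keyword_usage and
    # structural_requirements are secondary checks.
    label = "primary" if metric.primary else "secondary"
    print(f"{label}: {metric.name} -> {metric.implementation}")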
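For orientation, the keyword arguments passed to the two create(...) calls imply roughly the following shapes for the helper types imported from base_parser. This is a hedged sketch inferred from the call sites, not the actual definitions in llmdataparser/base_parser.py:

# Hypothetical shapes inferred from the call sites above; the real
# definitions live in llmdataparser/base_parser.py and may differ.
from dataclasses import dataclass
from typing import Any


@dataclass(frozen=True)
class DatasetDescription:
    name: str
    purpose: str
    source: str
    language: str
    format: str
    characteristics: str
    citation: str

    @classmethod
    def create(cls, **kwargs: Any) -> "DatasetDescription":
        return cls(**kwargs)


@dataclass(frozen=True)
class EvaluationMetric:
    name: str
    type: str
    description: str
    implementation: str
    primary: bool

    @classmethod
    def create(cls, **kwargs: Any) -> "EvaluationMetric":
        return cls(**kwargs)

Modeling create as a thin classmethod keeps the call sites stable even if the real constructor later adds validation or defaults.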
tests/test_ifeval_parser.py (CHANGED)
@@ -89,3 +89,54 @@ def test_parser_string_representation(ifeval_parser):
     assert "IFEvalDatasetParser" in repr_str
     assert "google/IFEval" in repr_str
     assert "not loaded" in repr_str
+
+
+def test_get_dataset_description(ifeval_parser):
+    """Test dataset description generation for IFEval."""
+    description = ifeval_parser.get_dataset_description()
+
+    assert description.name == "IFEval"
+    assert "verifiable instructions" in description.purpose.lower()
+    assert description.source == "Google Research"
+    assert description.language == "English (BCP-47 en)"
+    assert "verifiable instruction prompts" in description.format.lower()
+    assert "500" in description.characteristics
+    assert "automated heuristics" in description.characteristics.lower()
+    assert "open llm leaderboard" in description.characteristics.lower()
+    assert "zhou2023instructionfollowingevaluation" in description.citation
+
+
+def test_get_evaluation_metrics(ifeval_parser):
+    """Test evaluation metrics generation for IFEval."""
+    metrics = ifeval_parser.get_evaluation_metrics()
+
+    # Should have 5 metrics total
+    assert len(metrics) == 5
+
+    # Check primary metrics
+    primary_metrics = [m for m in metrics if m.primary]
+    assert len(primary_metrics) == 3
+
+    # Verify specific metrics exist and have correct properties
+    metric_names = {m.name for m in metrics}
+    assert "format_compliance" in metric_names
+    assert "length_constraints" in metric_names
+    assert "punctuation_rules" in metric_names
+    assert "keyword_usage" in metric_names
+    assert "structural_requirements" in metric_names
+
+    # Check specific metric properties
+    format_metric = next(m for m in metrics if m.name == "format_compliance")
+    assert format_metric.primary is True
+    assert "formatting rules" in format_metric.description.lower()
+    assert format_metric.type == "text"
+
+    length_metric = next(m for m in metrics if m.name == "length_constraints")
+    assert length_metric.primary is True
+    assert "word" in length_metric.description.lower()
+    assert length_metric.type == "text"
+
+    punctuation_metric = next(m for m in metrics if m.name == "punctuation_rules")
+    assert punctuation_metric.primary is True
+    assert "punctuation" in punctuation_metric.description.lower()
+    assert punctuation_metric.type == "text"
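Both new tests reuse the ifeval_parser fixture already exercised by test_parser_string_representation. The fixture itself is defined outside this diff; a plausible shape, assuming it simply constructs the parser:

# Hypothetical fixture sketch; the real fixture lives earlier in
# tests/test_ifeval_parser.py or in a shared conftest.py and may differ.
import pytest

from llmdataparser.ifeval_parser import IFEvalDatasetParser


@pytest.fixture
def ifeval_parser() -> IFEvalDatasetParser:
    return IFEvalDatasetParser()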