JeffYang52415 committed on
Commit
b73a2c7
·
unverified ·
1 Parent(s): 70da483

refactor: ifeval parser

Browse files
llmdataparser/ifeval_parser.py CHANGED
@@ -1,7 +1,12 @@
1
  from dataclasses import dataclass
2
  from typing import Any, ClassVar, List
3
 
4
- from llmdataparser.base_parser import HuggingFaceDatasetParser, HuggingFaceParseEntry
 
 
 
 
 
5
  from llmdataparser.prompts import IFEVAL_SYSTEM_PROMPT # You'll need to create this
6
 
7
 
@@ -77,6 +82,73 @@ class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
77
  task_name=task,
78
  )
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  if __name__ == "__main__":
82
  # Example usage
 
1
  from dataclasses import dataclass
2
  from typing import Any, ClassVar, List
3
 
4
+ from llmdataparser.base_parser import (
5
+ DatasetDescription,
6
+ EvaluationMetric,
7
+ HuggingFaceDatasetParser,
8
+ HuggingFaceParseEntry,
9
+ )
10
  from llmdataparser.prompts import IFEVAL_SYSTEM_PROMPT # You'll need to create this
11
 
12
 
 
82
  task_name=task,
83
  )
84
 
85
+ def get_dataset_description(self) -> DatasetDescription:
86
+ """Returns description of the IFEval dataset."""
87
+ return DatasetDescription.create(
88
+ name="IFEval",
89
+ purpose="Evaluate instruction following capabilities through verifiable instructions",
90
+ source="Google Research",
91
+ language="English (BCP-47 en)",
92
+ format="Verifiable instruction prompts with automated evaluation criteria",
93
+ characteristics=(
94
+ "Collection of approximately 500 verifiable instructions designed to evaluate "
95
+ "language models' instruction-following capabilities. Instructions include "
96
+ "specific, measurable criteria like 'write in more than 400 words' or "
97
+ "'mention the keyword AI at least 3 times' that can be verified through "
98
+ "automated heuristics. Used as a core benchmark in the Open LLM Leaderboard "
99
+ "for evaluating chat or instruction fine-tuned language models."
100
+ ),
101
+ citation="""@misc{zhou2023instructionfollowingevaluationlargelanguage,
102
+ title={Instruction-Following Evaluation for Large Language Models},
103
+ author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
104
+ year={2023},
105
+ eprint={2311.07911},
106
+ archivePrefix={arXiv},
107
+ primaryClass={cs.CL},
108
+ url={https://arxiv.org/abs/2311.07911}
109
+ }""",
110
+ )
111
+
112
+ def get_evaluation_metrics(self) -> list[EvaluationMetric]:
113
+ """Returns recommended evaluation metrics for IFEval."""
114
+ return [
115
+ EvaluationMetric.create(
116
+ name="format_compliance",
117
+ type="text",
118
+ description="Verifies if the output follows specified formatting rules (e.g., highlighting, bullet points, sections)",
119
+ implementation="custom_format_checker",
120
+ primary=True,
121
+ ),
122
+ EvaluationMetric.create(
123
+ name="length_constraints",
124
+ type="text",
125
+ description="Checks if the response meets word, sentence, or paragraph count requirements",
126
+ implementation="custom_length_validator",
127
+ primary=True,
128
+ ),
129
+ EvaluationMetric.create(
130
+ name="punctuation_rules",
131
+ type="text",
132
+ description="Validates adherence to punctuation constraints (e.g., no commas, specific endings)",
133
+ implementation="custom_punctuation_checker",
134
+ primary=True,
135
+ ),
136
+ EvaluationMetric.create(
137
+ name="keyword_usage",
138
+ type="text",
139
+ description="Verifies correct usage of required keywords or avoidance of forbidden words",
140
+ implementation="custom_keyword_validator",
141
+ primary=False,
142
+ ),
143
+ EvaluationMetric.create(
144
+ name="structural_requirements",
145
+ type="text",
146
+ description="Checks for specific structural elements like sections, paragraphs, or formatting patterns",
147
+ implementation="custom_structure_validator",
148
+ primary=False,
149
+ ),
150
+ ]
151
+
152
 
153
  if __name__ == "__main__":
154
  # Example usage
tests/test_ifeval_parser.py CHANGED
@@ -89,3 +89,54 @@ def test_parser_string_representation(ifeval_parser):
89
  assert "IFEvalDatasetParser" in repr_str
90
  assert "google/IFEval" in repr_str
91
  assert "not loaded" in repr_str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  assert "IFEvalDatasetParser" in repr_str
90
  assert "google/IFEval" in repr_str
91
  assert "not loaded" in repr_str
92
+
93
+
94
+ def test_get_dataset_description(ifeval_parser):
95
+ """Test dataset description generation for IFEval."""
96
+ description = ifeval_parser.get_dataset_description()
97
+
98
+ assert description.name == "IFEval"
99
+ assert "verifiable instructions" in description.purpose.lower()
100
+ assert description.source == "Google Research"
101
+ assert description.language == "English (BCP-47 en)"
102
+ assert "verifiable instruction prompts" in description.format.lower()
103
+ assert "500" in description.characteristics
104
+ assert "automated heuristics" in description.characteristics.lower()
105
+ assert "open llm leaderboard" in description.characteristics.lower()
106
+ assert "zhou2023instructionfollowingevaluation" in description.citation
107
+
108
+
109
+ def test_get_evaluation_metrics(ifeval_parser):
110
+ """Test evaluation metrics generation for IFEval."""
111
+ metrics = ifeval_parser.get_evaluation_metrics()
112
+
113
+ # Should have 5 metrics total
114
+ assert len(metrics) == 5
115
+
116
+ # Check primary metrics
117
+ primary_metrics = [m for m in metrics if m.primary]
118
+ assert len(primary_metrics) == 3
119
+
120
+ # Verify specific metrics exist and have correct properties
121
+ metric_names = {m.name for m in metrics}
122
+ assert "format_compliance" in metric_names
123
+ assert "length_constraints" in metric_names
124
+ assert "punctuation_rules" in metric_names
125
+ assert "keyword_usage" in metric_names
126
+ assert "structural_requirements" in metric_names
127
+
128
+ # Check specific metric properties
129
+ format_metric = next(m for m in metrics if m.name == "format_compliance")
130
+ assert format_metric.primary is True
131
+ assert "formatting rules" in format_metric.description.lower()
132
+ assert format_metric.type == "text"
133
+
134
+ length_metric = next(m for m in metrics if m.name == "length_constraints")
135
+ assert length_metric.primary is True
136
+ assert "word" in length_metric.description.lower()
137
+ assert length_metric.type == "text"
138
+
139
+ punctuation_metric = next(m for m in metrics if m.name == "punctuation_rules")
140
+ assert punctuation_metric.primary is True
141
+ assert "punctuation" in punctuation_metric.description.lower()
142
+ assert punctuation_metric.type == "text"