refactor: ifeval parser
Files changed:
- llmdataparser/ifeval_parser.py (+73, -1)
- tests/test_ifeval_parser.py (+51, -0)
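Adds dataset-description and evaluation-metric metadata to the IFEval parser, and covers both new methods with unit tests.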
llmdataparser/ifeval_parser.py (CHANGED)
@@ -1,7 +1,12 @@
 from dataclasses import dataclass
 from typing import Any, ClassVar, List
 
-from llmdataparser.base_parser import
+from llmdataparser.base_parser import (
+    DatasetDescription,
+    EvaluationMetric,
+    HuggingFaceDatasetParser,
+    HuggingFaceParseEntry,
+)
 from llmdataparser.prompts import IFEVAL_SYSTEM_PROMPT  # You'll need to create this
 
 
@@ -77,6 +82,73 @@ class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
             task_name=task,
         )
 
+    def get_dataset_description(self) -> DatasetDescription:
+        """Returns description of the IFEval dataset."""
+        return DatasetDescription.create(
+            name="IFEval",
+            purpose="Evaluate instruction following capabilities through verifiable instructions",
+            source="Google Research",
+            language="English (BCP-47 en)",
+            format="Verifiable instruction prompts with automated evaluation criteria",
+            characteristics=(
+                "Collection of approximately 500 verifiable instructions designed to evaluate "
+                "language models' instruction-following capabilities. Instructions include "
+                "specific, measurable criteria like 'write in more than 400 words' or "
+                "'mention the keyword AI at least 3 times' that can be verified through "
+                "automated heuristics. Used as a core benchmark in the Open LLM Leaderboard "
+                "for evaluating chat or instruction fine-tuned language models."
+            ),
+            citation="""@misc{zhou2023instructionfollowingevaluationlargelanguage,
+                title={Instruction-Following Evaluation for Large Language Models},
+                author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
+                year={2023},
+                eprint={2311.07911},
+                archivePrefix={arXiv},
+                primaryClass={cs.CL},
+                url={https://arxiv.org/abs/2311.07911}
+            }""",
+        )
+
+    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
+        """Returns recommended evaluation metrics for IFEval."""
+        return [
+            EvaluationMetric.create(
+                name="format_compliance",
+                type="text",
+                description="Verifies if the output follows specified formatting rules (e.g., highlighting, bullet points, sections)",
+                implementation="custom_format_checker",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="length_constraints",
+                type="text",
+                description="Checks if the response meets word, sentence, or paragraph count requirements",
+                implementation="custom_length_validator",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="punctuation_rules",
+                type="text",
+                description="Validates adherence to punctuation constraints (e.g., no commas, specific endings)",
+                implementation="custom_punctuation_checker",
+                primary=True,
+            ),
+            EvaluationMetric.create(
+                name="keyword_usage",
+                type="text",
+                description="Verifies correct usage of required keywords or avoidance of forbidden words",
+                implementation="custom_keyword_validator",
+                primary=False,
+            ),
+            EvaluationMetric.create(
+                name="structural_requirements",
+                type="text",
+                description="Checks for specific structural elements like sections, paragraphs, or formatting patterns",
+                implementation="custom_structure_validator",
+                primary=False,
+            ),
+        ]
+
 
 if __name__ == "__main__":
     # Example usage
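The two methods added above are pure metadata accessors: they construct a DatasetDescription and a list of EvaluationMetric entries without loading any data. A minimal usage sketch, assuming IFEvalDatasetParser can be constructed with no arguments (its constructor is not part of this diff):

# Usage sketch only; the no-argument constructor is an assumption,
# since IFEvalDatasetParser.__init__ is not shown in this diff.
from llmdataparser.ifeval_parser import IFEvalDatasetParser

parser = IFEvalDatasetParser()

description = parser.get_dataset_description()
print(description.name)    # "IFEval"
print(description.source)  # "Google Research"

for metric in parser.get_evaluation_metrics():
    # Three of the five metrics are marked primary; keyword_usage and
    # structural_requirements are secondary checks.
    label = "primary" if metric.primary else "secondary"
    print(f"{label}: {metric.name} -> {metric.implementation}")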
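For orientation, the keyword arguments passed to the two create(...) calls imply roughly the following shapes for the helper types imported from base_parser. This is a hedged sketch inferred from the call sites, not the actual definitions in llmdataparser/base_parser.py:

# Hypothetical shapes inferred from the call sites above; the real
# definitions live in llmdataparser/base_parser.py and may differ.
from dataclasses import dataclass
from typing import Any


@dataclass(frozen=True)
class DatasetDescription:
    name: str
    purpose: str
    source: str
    language: str
    format: str
    characteristics: str
    citation: str

    @classmethod
    def create(cls, **kwargs: Any) -> "DatasetDescription":
        return cls(**kwargs)


@dataclass(frozen=True)
class EvaluationMetric:
    name: str
    type: str
    description: str
    implementation: str
    primary: bool

    @classmethod
    def create(cls, **kwargs: Any) -> "EvaluationMetric":
        return cls(**kwargs)

Modeling create as a thin classmethod keeps the call sites stable even if the real constructor later adds validation or defaults.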
tests/test_ifeval_parser.py (CHANGED)
@@ -89,3 +89,54 @@ def test_parser_string_representation(ifeval_parser):
     assert "IFEvalDatasetParser" in repr_str
     assert "google/IFEval" in repr_str
     assert "not loaded" in repr_str
+
+
+def test_get_dataset_description(ifeval_parser):
+    """Test dataset description generation for IFEval."""
+    description = ifeval_parser.get_dataset_description()
+
+    assert description.name == "IFEval"
+    assert "verifiable instructions" in description.purpose.lower()
+    assert description.source == "Google Research"
+    assert description.language == "English (BCP-47 en)"
+    assert "verifiable instruction prompts" in description.format.lower()
+    assert "500" in description.characteristics
+    assert "automated heuristics" in description.characteristics.lower()
+    assert "open llm leaderboard" in description.characteristics.lower()
+    assert "zhou2023instructionfollowingevaluation" in description.citation
+
+
+def test_get_evaluation_metrics(ifeval_parser):
+    """Test evaluation metrics generation for IFEval."""
+    metrics = ifeval_parser.get_evaluation_metrics()
+
+    # Should have 5 metrics total
+    assert len(metrics) == 5
+
+    # Check primary metrics
+    primary_metrics = [m for m in metrics if m.primary]
+    assert len(primary_metrics) == 3
+
+    # Verify specific metrics exist and have correct properties
+    metric_names = {m.name for m in metrics}
+    assert "format_compliance" in metric_names
+    assert "length_constraints" in metric_names
+    assert "punctuation_rules" in metric_names
+    assert "keyword_usage" in metric_names
+    assert "structural_requirements" in metric_names
+
+    # Check specific metric properties
+    format_metric = next(m for m in metrics if m.name == "format_compliance")
+    assert format_metric.primary is True
+    assert "formatting rules" in format_metric.description.lower()
+    assert format_metric.type == "text"
+
+    length_metric = next(m for m in metrics if m.name == "length_constraints")
+    assert length_metric.primary is True
+    assert "word" in length_metric.description.lower()
+    assert length_metric.type == "text"
+
+    punctuation_metric = next(m for m in metrics if m.name == "punctuation_rules")
+    assert punctuation_metric.primary is True
+    assert "punctuation" in punctuation_metric.description.lower()
+    assert punctuation_metric.type == "text"
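Both new tests reuse the ifeval_parser fixture already exercised by test_parser_string_representation. The fixture itself is defined outside this diff; a plausible shape, assuming it simply constructs the parser:

# Hypothetical fixture sketch; the real fixture lives earlier in
# tests/test_ifeval_parser.py or in a shared conftest.py and may differ.
import pytest

from llmdataparser.ifeval_parser import IFEvalDatasetParser


@pytest.fixture
def ifeval_parser() -> IFEvalDatasetParser:
    return IFEvalDatasetParser()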