from dataclasses import dataclass
from typing import Any, ClassVar

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)


@dataclass(frozen=True, kw_only=True, slots=True)
class HumanEvalParseEntry(HuggingFaceParseEntry):
    """Custom entry class for HumanEval, with fields specific to this dataset parser."""

    task_id: str
    task_name: str
    entry_point: str
    test: str

    @classmethod
    def create(
        cls,
        question: str,
        answer: str,
        raw_question: str,
        task_id: str,
        entry_point: str,
        test: str,
        task_name: str,
    ) -> "HumanEvalParseEntry":
        if not task_id:
            raise ValueError("Task ID cannot be empty")
        if not entry_point:
            raise ValueError("Entry point cannot be empty")
        return cls(
            question=question,
            answer=answer,
            raw_question=raw_question,
            raw_answer=answer,  # In HumanEval, the canonical solution is the raw answer
            task_id=task_id,
            entry_point=entry_point,
            test=test,
            task_name=task_name,
        )
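

# Illustrative sketch (an assumption, not part of the parser API): a HumanEval
# candidate is conventionally scored by concatenating the prompt, the model
# completion, the dataset's `test` field, and a final `check(entry_point)`
# call, then executing the resulting program in a sandbox. The hypothetical
# helper below shows that assembly using the fields parsed into
# HumanEvalParseEntry.
def build_check_program(entry: HumanEvalParseEntry, completion: str) -> str:
    """Assemble the program whose successful execution marks a sample as passing."""
    return (
        entry.raw_question  # function signature and docstring
        + completion  # model-generated function body
        + "\n"
        + entry.test  # unit tests defining `check`
        + f"\ncheck({entry.entry_point})\n"
    )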


class HumanEvalDatasetParser(HuggingFaceDatasetParser[HumanEvalParseEntry]):
    """Parser for the HumanEval dataset."""

    _data_source: ClassVar[str] = "openai/openai_humaneval"
    _default_task: ClassVar[str] = "openai_humaneval"
    _task_names: ClassVar[list[str]] = ["openai_humaneval"]

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> HumanEvalParseEntry:
        """Process a single HumanEval entry."""
        raw_question = row["prompt"]
        answer = row["canonical_solution"]
        task_id = row["task_id"]
        entry_point = row["entry_point"]
        test = row["test"]

        question = str(raw_question)

        # Use task_name if provided, otherwise use default
        task = task_name or self._get_current_task(row)

        return HumanEvalParseEntry.create(
            question=question,
            answer=answer,
            raw_question=raw_question,
            task_id=task_id,
            entry_point=entry_point,
            test=test,
            task_name=task,  # task is guaranteed non-None by _get_current_task
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns description of the HumanEval dataset."""
        return DatasetDescription.create(
            name="HumanEval",
            purpose="Evaluate code generation capabilities through Python programming tasks",
            source="OpenAI",
            language="Python",
            format="Function signatures with docstrings and unit tests",
            category=["Programming"],
            characteristics=(
                "Collection of 164 hand-written Python programming problems. Each problem "
                "includes a function signature, docstring, example test cases, and hidden unit "
                "tests. Problems test basic programming, algorithms, and data structure skills"
            ),
            citation="""@article{chen2021codex,
    title={Evaluating Large Language Models Trained on Code},
    author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
    year={2021},
    eprint={2107.03374},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
    }""",
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns recommended evaluation metrics for HumanEval."""
        return [
            EvaluationMetric.create(
                name="pass@k",
                type="code",
                description="Probability that correct solution appears at least once in k samples",
                implementation="custom_pass_at_k",
                primary=True,
            ),
            EvaluationMetric.create(
                name="test_success_rate",
                type="code",
                description="Percentage of test cases passed by the generated solution",
                implementation="custom_test_executor",
                primary=False,
            ),
            EvaluationMetric.create(
                name="type_correctness",
                type="code",
                description="Verification of type hints and type safety in generated code",
                implementation="custom_type_checker",
                primary=False,
            ),
            EvaluationMetric.create(
                name="code_style",
                type="code",
                description="Compliance with Python best practices and PEP 8 guidelines",
                implementation="custom_style_checker",
                primary=False,
            ),
            EvaluationMetric.create(
                name="runtime_efficiency",
                type="code",
                description="Analysis of time and space complexity of the solution",
                implementation="custom_complexity_analyzer",
                primary=False,
            ),
        ]
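

# A minimal sketch of the unbiased pass@k estimator from Chen et al. (2021),
# included to illustrate the primary metric recommended above; the
# `custom_pass_at_k` implementation named in the metric is assumed to live
# elsewhere in the project.
def estimate_pass_at_k(n: int, c: int, k: int) -> float:
    """pass@k = 1 - C(n-c, k) / C(n, k), for n samples of which c are correct."""
    if n - c < k:
        return 1.0  # every size-k subset must contain a correct sample
    result = 1.0
    for i in range(n - c + 1, n + 1):
        result *= 1.0 - k / i  # telescoping product form of 1 - C(n-c,k)/C(n,k)
    return 1.0 - result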


class HumanEvalDatasetPlusParser(HumanEvalDatasetParser):
    """Parser for the enhanced HumanEval Plus dataset with 80x more comprehensive test coverage."""

    _data_source: ClassVar[str] = "evalplus/humanevalplus"
    _default_task: ClassVar[str] = "default"
    _task_names: ClassVar[list[str]] = ["default"]

    def process_entry(
        self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
    ) -> HumanEvalParseEntry:
        """Process a single HumanEval entry."""
        raw_question = row["prompt"]
        answer = row["canonical_solution"]
        task_id = row["task_id"]
        entry_point = row["entry_point"]
        test = row["test"]

        question = str(raw_question)
        # Use task_name if provided, otherwise use default
        task = task_name or self._get_current_task(row)

        return HumanEvalParseEntry.create(
            question=question,
            answer=answer,
            raw_question=raw_question,
            task_id=task_id,
            entry_point=entry_point,
            test=test,
            task_name=task,  # task is guaranteed to be str from _get_current_task
        )

    def get_dataset_description(self) -> DatasetDescription:
        """Returns description of the HumanEval Plus dataset."""
        return DatasetDescription.create(
            name="HumanEval Plus",
            purpose="Enhanced evaluation of code generation with 80x more test coverage",
            source="EvalPlus",
            language="Python",
            format="Function signatures with docstrings and comprehensive test suites",
            category=["Programming"],
            characteristics=(
                "Significantly enhanced version of HumanEval with 80x more test cases. "
                "Includes extensive edge cases, boundary conditions, stress tests, and "
                "error handling scenarios to rigorously evaluate code correctness and robustness. "
                "Each problem has been augmented with comprehensive testing to catch subtle bugs "
                "and ensure production-quality code generation."
            ),
            citation="""@inproceedings{evalplus,
    title = {Is Your Code Generated by Chat{GPT} Really Correct? Rigorous Evaluation of Large Language Models for Code Generation},
    author = {Liu, Jiawei and Xia, Chunqiu Steven and Wang, Yuyao and Zhang, Lingming},
    booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
    year = {2023},
    url = {https://openreview.net/forum?id=1qvx610Cu7},
    }""",
        )

    def get_evaluation_metrics(self) -> list[EvaluationMetric]:
        """Returns recommended evaluation metrics for HumanEval Plus."""
        return [
            EvaluationMetric.create(
                name="pass@k",
                type="code",
                description="Probability that correct solution appears at least once in k samples",
                implementation="custom_pass_at_k",
                primary=True,
            ),
            EvaluationMetric.create(
                name="test_coverage",
                type="code",
                description="Percentage of edge cases and stress tests passed by the solution",
                implementation="custom_coverage_checker",
                primary=False,
            ),
            EvaluationMetric.create(
                name="error_handling",
                type="code",
                description="Assessment of solution's robustness in handling invalid inputs and edge cases",
                implementation="custom_error_handler",
                primary=False,
            ),
            EvaluationMetric.create(
                name="performance_stress",
                type="code",
                description="Evaluation of solution performance under high load and stress conditions",
                implementation="custom_stress_tester",
                primary=False,
            ),
            EvaluationMetric.create(
                name="code_quality",
                type="code",
                description="Analysis of code readability, maintainability and adherence to Python best practices",
                implementation="custom_quality_checker",
                primary=False,
            ),
        ]


if __name__ == "__main__":
    # Example usage
    parser = HumanEvalDatasetParser()

    # Load the dataset
    parser.load()

    # Parse all splits
    parser.parse()

    # Get parsed data
    parsed_data = parser.get_parsed_data

    # Print example entry
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Task ID: {example.task_id}")
        print(f"Entry Point: {example.entry_point}")
        print(f"Question:\n{example.question}")
        print(f"Solution:\n{example.answer}")

    parser = HumanEvalDatasetPlusParser()
    parser.load()
    parser.parse()
    parsed_data = parser.get_parsed_data
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Task: {example.task_name}")
        print(f"Question: {example.raw_question}")
        print(f"Correct Answer: {example.answer}")