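"""Tests for HumanEvalDatasetParser and HumanEvalDatasetPlusParser from llmdataparser.humaneval_parser."""
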
import pytest

from llmdataparser.humaneval_parser import (
    HumanEvalDatasetParser,
    HumanEvalDatasetPlusParser,
    HumanEvalParseEntry,
)


@pytest.fixture
def sample_entry():
    return {
        "prompt": 'def add(a, b):\n    """Add two numbers."""\n',
        "canonical_solution": "def add(a, b):\n    return a + b\n",
        "task_id": "HumanEval/0",
        "entry_point": "add",
        "test": "def test_add(): assert add(2, 3) == 5",
    }


@pytest.fixture
def parser():
    return HumanEvalDatasetParser()


@pytest.fixture
def plus_parser():
    return HumanEvalDatasetPlusParser()


@pytest.fixture
def plus_sample_entry():
    return {
        "prompt": 'def add(a, b):\n    """Add two numbers."""\n',
        "canonical_solution": "def add(a, b):\n    return a + b\n",
        "task_id": "HumanEval/0",
        "entry_point": "add",
        "test": "def test_add(): assert add(2, 3) == 5",
    }


def test_humaneval_parse_entry_creation():
    """Test creation of HumanEvalParseEntry"""
    entry = HumanEvalParseEntry.create(
        question="test question",
        answer="test answer",
        raw_question="raw question",
        task_id="HumanEval/1",
        entry_point="test_func",
        test="test case",
        task_name="openai_humaneval",
    )

    assert entry.question == "test question"
    assert entry.answer == "test answer"
    assert entry.raw_question == "raw question"
    assert entry.raw_answer == "test answer"  # Should match answer
    assert entry.task_id == "HumanEval/1"
    assert entry.entry_point == "test_func"
    assert entry.test == "test case"
    assert entry.task_name == "openai_humaneval"


def test_humaneval_parse_entry_validation():
    """Test validation of required fields"""
    with pytest.raises(ValueError, match="Task ID cannot be empty"):
        HumanEvalParseEntry.create(
            question="test",
            answer="test",
            raw_question="test",
            task_id="",  # Empty task_id should raise error
            entry_point="test",
            test="test",
            task_name="test",
        )

    with pytest.raises(ValueError, match="Entry point cannot be empty"):
        HumanEvalParseEntry.create(
            question="test",
            answer="test",
            raw_question="test",
            task_id="test",
            entry_point="",  # Empty entry_point should raise error
            test="test",
            task_name="test",
        )


def test_process_entry(parser, sample_entry):
    """Test processing of a single entry"""
    result = parser.process_entry(sample_entry, task_name="openai_humaneval")

    assert isinstance(result, HumanEvalParseEntry)
    assert result.task_id == "HumanEval/0"
    assert result.entry_point == "add"

    assert result.answer == sample_entry["canonical_solution"]
    assert result.test == sample_entry["test"]
    assert result.task_name == "openai_humaneval"


def test_parser_initialization(parser):
    """Test parser initialization and properties"""
    assert parser._data_source == "openai/openai_humaneval"
    assert parser._default_task == "openai_humaneval"
    assert parser._task_names == ["openai_humaneval"]
    assert (
        parser.get_huggingface_link
        == "https://huggingface.co/datasets/openai/openai_humaneval"
    )


@pytest.mark.integration
def test_parser_load_and_parse(parser):
    """Integration test for loading and parsing data"""
    parser.load()
    parser.parse()
    parsed_data = parser.get_parsed_data

    assert len(parsed_data) > 0
    assert all(isinstance(entry, HumanEvalParseEntry) for entry in parsed_data)


def test_get_current_task(parser, sample_entry):
    """Test _get_current_task method"""
    task = parser._get_current_task(sample_entry)
    assert task == parser._default_task


def test_plus_parser_initialization(plus_parser):
    """Test HumanEvalDatasetPlusParser initialization and properties"""
    assert plus_parser._data_source == "evalplus/humanevalplus"
    assert plus_parser._default_task == "default"
    assert plus_parser._task_names == ["default"]
    assert (
        plus_parser.get_huggingface_link
        == "https://huggingface.co/datasets/evalplus/humanevalplus"
    )


def test_plus_process_entry(plus_parser, plus_sample_entry):
    """Test processing of a single entry in HumanEvalDatasetPlusParser"""
    result = plus_parser.process_entry(plus_sample_entry, task_name="default")

    assert isinstance(result, HumanEvalParseEntry)
    assert result.task_id == "HumanEval/0"
    assert result.entry_point == "add"

    assert result.answer == plus_sample_entry["canonical_solution"]
    assert result.test == plus_sample_entry["test"]
    assert result.task_name == "default"


@pytest.mark.integration
def test_plus_parser_load_and_parse(plus_parser):
    """Integration test for loading and parsing data with HumanEvalDatasetPlusParser"""
    plus_parser.load()
    plus_parser.parse()
    parsed_data = plus_parser.get_parsed_data

    assert len(parsed_data) > 0
    assert all(isinstance(entry, HumanEvalParseEntry) for entry in parsed_data)


def test_plus_get_current_task(plus_parser, plus_sample_entry):
    """Test _get_current_task method for HumanEvalDatasetPlusParser"""
    task = plus_parser._get_current_task(plus_sample_entry)
    assert task == plus_parser._default_task


def test_get_dataset_description(parser, plus_parser):
    """Test dataset description generation for both parsers."""
    # Test original HumanEval description
    description = parser.get_dataset_description()
    assert description.name == "HumanEval"
    assert "code generation" in description.purpose
    assert description.language == "Python"
    assert "chen2021codex" in description.citation

    # Test HumanEval Plus description
    plus_description = plus_parser.get_dataset_description()
    assert plus_description.name == "HumanEval Plus"
    assert "80x more test coverage" in plus_description.purpose
    assert "comprehensive test suites" in plus_description.format
    assert "edge cases" in plus_description.characteristics
    assert "evalplus" in plus_description.citation


def test_get_evaluation_metrics(parser):
    """Test evaluation metrics generation for both parsers."""
    # Test original HumanEval metrics
    metrics = parser.get_evaluation_metrics()
    assert len(metrics) == 5  # 3 base metrics + 2 HumanEval-specific metrics

    # Check primary metrics: only pass@k should be marked as primary
    primary_metrics = [m for m in metrics if m.primary]
    assert len(primary_metrics) == 1  # pass@k
    assert any(m.name == "pass@k" for m in primary_metrics)