jjkim
committed on
Commit
·
cb0919a
1
Parent(s):
fe7364e
add early termination
Browse files - code_eval.py +27 -10
code_eval.py
CHANGED
@@ -19,7 +19,7 @@ described in the paper "Evaluating Large Language Models Trained on Code"
|
|
19 |
import itertools
|
20 |
import os
|
21 |
from collections import Counter, defaultdict
|
22 |
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
23 |
|
24 |
import datasets
|
25 |
import evaluate
|
@@ -171,6 +171,7 @@ class CodeEval(evaluate.Metric):
|
|
171 |
|
172 |
with ThreadPoolExecutor(max_workers=num_workers) as executor:
|
173 |
futures = []
|
|
|
174 |
completion_id = Counter()
|
175 |
results = defaultdict(list)
|
176 |
|
@@ -189,31 +190,47 @@ class CodeEval(evaluate.Metric):
|
|
189 |
)
|
190 |
future = executor.submit(check_correctness, *args)
|
191 |
futures.append(future)
|
|
|
192 |
completion_id[task_id] += 1
|
193 |
|
194 |
pbar = tqdm(total=len(futures))
|
195 |
for future in as_completed(futures):
|
196 |
-
|
|
|
|
|
|
|
|
|
|
|
197 |
results[result["task_id"]].append((result["completion_id"], result))
|
198 |
pbar.update(1)
|
199 |
|
200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
new_result = []
|
202 |
for completion_id, group in itertools.groupby(result, key=lambda x: x[0]):
|
203 |
group = list(group)
|
204 |
new_result.append(
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
|
|
|
|
|
|
210 |
)
|
211 |
)
|
212 |
-
|
|
|
213 |
|
214 |
total, correct = [], []
|
215 |
for result in results.values():
|
216 |
-
result.sort()
|
217 |
passed = [r[1]["passed"] for r in result]
|
218 |
total.append(len(passed))
|
219 |
correct.append(sum(passed))
|
|
|
19 |
import itertools
|
20 |
import os
|
21 |
from collections import Counter, defaultdict
|
22 |
+
from concurrent.futures import CancelledError, ThreadPoolExecutor, as_completed
|
23 |
|
24 |
import datasets
|
25 |
import evaluate
|
|
|
171 |
|
172 |
with ThreadPoolExecutor(max_workers=num_workers) as executor:
|
173 |
futures = []
|
174 |
+
future_dict = defaultdict(lambda: defaultdict(list))
|
175 |
completion_id = Counter()
|
176 |
results = defaultdict(list)
|
177 |
|
|
|
190 |
)
|
191 |
future = executor.submit(check_correctness, *args)
|
192 |
futures.append(future)
|
193 |
+
future_dict[task_id][completion_id[task_id]].append(future)
|
194 |
completion_id[task_id] += 1
|
195 |
|
196 |
pbar = tqdm(total=len(futures))
|
197 |
for future in as_completed(futures):
|
198 |
+
try:
|
199 |
+
result = future.result()
|
200 |
+
except CancelledError:
|
201 |
+
pbar.update(1)
|
202 |
+
continue
|
203 |
+
|
204 |
results[result["task_id"]].append((result["completion_id"], result))
|
205 |
pbar.update(1)
|
206 |
|
207 |
+
if not result["passed"]:
|
208 |
+
future_list = future_dict[result["task_id"]][result["completion_id"]]
|
209 |
+
for future in future_list:
|
210 |
+
future.cancel()
|
211 |
+
|
212 |
+
new_results = {}
|
213 |
+
for key, result in results.items():
|
214 |
new_result = []
|
215 |
for completion_id, group in itertools.groupby(result, key=lambda x: x[0]):
|
216 |
group = list(group)
|
217 |
new_result.append(
|
218 |
+
(
|
219 |
+
group[0][0],
|
220 |
+
dict(
|
221 |
+
task_id=group[0][0],
|
222 |
+
passed=all(r[1]["passed"] for r in group),
|
223 |
+
result=[r[1]["result"] for r in group],
|
224 |
+
completion_id=completion_id,
|
225 |
+
),
|
226 |
)
|
227 |
)
|
228 |
+
new_results[key] = new_result
|
229 |
+
results = new_results
|
230 |
|
231 |
total, correct = [], []
|
232 |
for result in results.values():
|
233 |
+
result.sort(key=lambda x: x[0])
|
234 |
passed = [r[1]["passed"] for r in result]
|
235 |
total.append(len(passed))
|
236 |
correct.append(sum(passed))
|