jjkim committed on
Commit b446f13 · 1 Parent(s): 886eb21

separate timelimits

Files changed (1)
  1. code_eval.py +52 -15
code_eval.py CHANGED
@@ -22,13 +22,12 @@ from collections import Counter, defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed

 import datasets
-import numpy as np
-
 import evaluate
+import numpy as np
+from tqdm import tqdm

 from .execute import check_correctness

-
 _CITATION = """\
 @misc{chen2021evaluating,
       title={Evaluating Large Language Models Trained on Code},
@@ -143,7 +142,7 @@ class CodeEval(evaluate.Metric):
             features=datasets.Features(
                 {
                     "predictions": datasets.Sequence(datasets.Value("string")),
-                    "references": datasets.Value("string"),
+                    "references": datasets.Sequence(datasets.Value("string")),
                 }
             ),
             homepage="https://github.com/openai/human-eval",
@@ -152,33 +151,65 @@ class CodeEval(evaluate.Metric):
             license=_LICENSE,
         )

-    def _compute(self, predictions, references, k=[1, 10, 100], num_workers=4, timeout=3.0):
+    def _compute(
+        self,
+        predictions,
+        references,
+        k=[1, 10, 100],
+        num_workers=4,
+        timeout=3.0,
+    ):
         """Returns the scores"""

         if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
             raise ValueError(_WARNING)

         if os.name == "nt":
-            raise NotImplementedError("This metric is currently not supported on Windows.")
+            raise NotImplementedError(
+                "This metric is currently not supported on Windows."
+            )

         with ThreadPoolExecutor(max_workers=num_workers) as executor:
             futures = []
             completion_id = Counter()
-            n_samples = 0
             results = defaultdict(list)

-            for task_id, (candidates, test_case) in enumerate(zip(predictions, references)):
+            for task_id, (candidates, test_case) in enumerate(
+                zip(predictions, references)
+            ):
                 for candidate in candidates:
-                    test_program = candidate + "\n" + test_case
-                    args = (test_program, timeout, task_id, completion_id[task_id])
-                    future = executor.submit(check_correctness, *args)
-                    futures.append(future)
+                    for _test_case in test_case:
+                        assert isinstance(_test_case, str)
+                        test_program = candidate + "\n" + _test_case
+                        args = (
+                            test_program,
+                            timeout,
+                            task_id,
+                            completion_id[task_id],
+                        )
+                        future = executor.submit(check_correctness, *args)
+                        futures.append(future)
                     completion_id[task_id] += 1
-                    n_samples += 1

+            pbar = tqdm(total=len(futures))
             for future in as_completed(futures):
                 result = future.result()
                 results[result["task_id"]].append((result["completion_id"], result))
+                pbar.update(1)
+
+            for result in results.values():
+                new_result = []
+                for completion_id, group in itertools.groupby(results, key=lambda x: x[0]):
+                    group = list(group)
+                    new_result.append(
+                        dict(
+                            task_id=group[0]["task_id"],
+                            passed=all(r["passed"] for r in group),
+                            result=[r["result"] for r in group],
+                            completion_id=completion_id,
+                        )
+                    )
+                result = new_result

         total, correct = [], []
         for result in results.values():
@@ -190,7 +221,11 @@ class CodeEval(evaluate.Metric):
         correct = np.array(correct)

         ks = k
-        pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() for k in ks if (total >= k).all()}
+        pass_at_k = {
+            f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
+            for k in ks
+            if (total >= k).all()
+        }

         return pass_at_k, results

@@ -210,4 +245,6 @@ def estimate_pass_at_k(num_samples, num_correct, k):
     assert len(num_samples) == len(num_correct)
     num_samples_it = iter(num_samples)

-    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
+    return np.array(
+        [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
+    )
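
A minimal usage sketch of the call pattern this commit targets, assuming the fork is loaded from the Hub (the repo id "jjkim/code_eval" below is an assumption, as are the sample task and test cases): after this change each entry in references is a list of test-case strings rather than a single string, so every test case is submitted to check_correctness on its own and the timeout applies per test case instead of to one concatenated test suite.

import os

import evaluate

# check_correctness executes untrusted model code, so the metric refuses to run
# unless this flag is set explicitly (see the HF_ALLOW_CODE_EVAL guard in _compute).
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

# Assumed Hub path for this fork; the stock "code_eval" metric still expects a
# single reference string per task.
code_eval = evaluate.load("jjkim/code_eval")

predictions = [["def add(a, b):\n    return a + b"]]  # one candidate for one task
references = [
    # one list of test cases per task; each string runs with its own time limit
    ["assert add(1, 2) == 3", "assert add(-1, 1) == 0"]
]

pass_at_k, results = code_eval.compute(
    predictions=predictions,
    references=references,
    k=[1],
    num_workers=2,
    timeout=3.0,
)
print(pass_at_k)  # a dict such as {"pass@1": ...}

Submitting one future per (candidate, test case) pair is what separates the time limits; the per-task grouping added after the as_completed loop appears intended to fold the per-test-case results back into a single pass/fail verdict per completion (passed=all(...)).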