pminervini committed
Commit e1b962a
1 Parent(s): 85b25b4
Files changed (2):
  1. halueval-cli.py +5 -3
  2. src/backend/run_eval_suite.py +8 -1
halueval-cli.py CHANGED
@@ -28,13 +28,15 @@ def main():
     eval_requests: list[EvalRequest] = get_eval_requests(job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
     eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
 
-    # task_names = ['halueval_qa']
+    TASKS_HARNESS = [t.value for t in Tasks if 'halueval_qa' in t.value.benchmark]
     # task_names = ['triviaqa']
-    TASKS_HARNESS = [task.value for task in Tasks]
+    # TASKS_HARNESS = [task.value for task in Tasks]
 
     include_task_folder("src/backend/tasks/")
     initialize_tasks('INFO')
 
+    # breakpoint()
+
     print(tasks.ALL_TASKS)
 
     for task in TASKS_HARNESS:
@@ -43,7 +45,7 @@ def main():
                                  batch_size=1, device=DEVICE, use_cache=None, limit=8, write_out=True)
         print('AAA', results["results"])
 
-        # breakpoint()
+        breakpoint()
 
 
 if __name__ == "__main__":
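
The new TASKS_HARNESS line restricts the CLI run to tasks whose benchmark name contains 'halueval_qa'. Below is a minimal, self-contained sketch of that filtering pattern; the Task dataclass and Tasks enum here are hypothetical stand-ins for the leaderboard's own task registry, not its actual definitions.

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # harness task name, e.g. 'halueval_qa'
    metric: str
    col_name: str

class Tasks(Enum):
    # hypothetical entries standing in for the real task registry
    halueval_qa = Task("halueval_qa", "em", "HaluEval QA")
    triviaqa = Task("triviaqa", "em", "TriviaQA")
    truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2")

# keep only the HaluEval-QA task, mirroring the commit's filter
TASKS_HARNESS = [t.value for t in Tasks if 'halueval_qa' in t.value.benchmark]
print([t.benchmark for t in TASKS_HARNESS])  # ['halueval_qa']
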
src/backend/run_eval_suite.py CHANGED
@@ -8,7 +8,7 @@ import logging
 logging.getLogger("openai").setLevel(logging.WARNING)
 
 
-def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, use_cache=None, limit=None) -> dict:
+def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, use_cache=None, limit=None, max_nb_samples=100) -> dict:
     if limit:
         print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
 
@@ -29,6 +29,13 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
     results["config"]["model_name"] = eval_request.model
     results["config"]["model_sha"] = eval_request.revision
 
+    if max_nb_samples is not None:
+        if 'samples' in results:
+            samples = results['samples']
+            for task_name in samples.keys():
+                if len(samples[task_name]) > max_nb_samples:
+                    results['samples'][task_name] = results['samples'][task_name][:max_nb_samples]
+
     print(evaluator.make_table(results))
 
     return results
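
The run_eval_suite.py change caps how many per-task samples are kept in the results dict before it is returned. Below is a minimal sketch of that truncation step pulled out into a standalone helper and applied to a toy results dict; the 'samples' layout used here is an assumption about what the harness emits when sample logging is enabled, and truncate_samples is a hypothetical name, not a function in the repository.

def truncate_samples(results: dict, max_nb_samples: int = 100) -> dict:
    # Keep at most max_nb_samples entries per task in results['samples'],
    # mirroring the logic added to run_evaluation().
    if max_nb_samples is not None and 'samples' in results:
        samples = results['samples']
        for task_name in samples.keys():
            if len(samples[task_name]) > max_nb_samples:
                results['samples'][task_name] = samples[task_name][:max_nb_samples]
    return results

# toy example: 150 fake sample records for one task
results = {'results': {'halueval_qa': {'em': 0.5}},
           'samples': {'halueval_qa': [{'doc_id': i} for i in range(150)]}}
truncate_samples(results, max_nb_samples=100)
print(len(results['samples']['halueval_qa']))  # 100
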