pminervini committed
Commit: e1b962a
1 Parent(s): 85b25b4
update

Files changed:
- halueval-cli.py (+5 -3)
- src/backend/run_eval_suite.py (+8 -1)
halueval-cli.py CHANGED

@@ -28,13 +28,15 @@ def main():
     eval_requests: list[EvalRequest] = get_eval_requests(job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
     eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
 
-
+    TASKS_HARNESS = [t.value for t in Tasks if 'halueval_qa' in t.value.benchmark]
     # task_names = ['triviaqa']
-    TASKS_HARNESS = [task.value for task in Tasks]
+    # TASKS_HARNESS = [task.value for task in Tasks]
 
     include_task_folder("src/backend/tasks/")
     initialize_tasks('INFO')
 
+    # breakpoint()
+
     print(tasks.ALL_TASKS)
 
     for task in TASKS_HARNESS:
@@ -43,7 +45,7 @@ def main():
                                  batch_size=1, device=DEVICE, use_cache=None, limit=8, write_out=True)
         print('AAA', results["results"])
 
-
+        breakpoint()
 
 
 if __name__ == "__main__":
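The substantive change in halueval-cli.py is that TASKS_HARNESS is now built only from the Tasks entries whose benchmark name contains 'halueval_qa', rather than from every registered task. A minimal sketch of that filter follows, assuming a Tasks enum whose values carry a benchmark field; the field names and entries below are illustrative, not taken from this commit:

# Illustrative sketch only (not the repository's actual Tasks definition):
# an enum of task configs filtered the same way as the new line in halueval-cli.py.
from dataclasses import dataclass
from enum import Enum


@dataclass
class TaskConfig:
    benchmark: str   # harness task name, e.g. "halueval_qa"
    metric: str      # metric key reported by the harness
    col_name: str    # display name used by the leaderboard


class Tasks(Enum):
    # Hypothetical entries for illustration.
    halueval_qa = TaskConfig("halueval_qa", "em", "HaluEval QA")
    triviaqa = TaskConfig("triviaqa", "em", "TriviaQA")


# Same comprehension as the added line in halueval-cli.py:
TASKS_HARNESS = [t.value for t in Tasks if 'halueval_qa' in t.value.benchmark]
assert [t.benchmark for t in TASKS_HARNESS] == ["halueval_qa"]

Filtering on t.value.benchmark rather than on the enum member name keeps the comprehension independent of how the members themselves are named.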
src/backend/run_eval_suite.py CHANGED

@@ -8,7 +8,7 @@ import logging
 logging.getLogger("openai").setLevel(logging.WARNING)
 
 
-def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, use_cache=None, limit=None) -> dict:
+def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, use_cache=None, limit=None, max_nb_samples=100) -> dict:
     if limit:
         print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
 
@@ -29,6 +29,13 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
     results["config"]["model_name"] = eval_request.model
     results["config"]["model_sha"] = eval_request.revision
 
+    if max_nb_samples is not None:
+        if 'samples' in results:
+            samples = results['samples']
+            for task_name in samples.keys():
+                if len(samples['task_name']) > max_nb_samples:
+                    results['samples'][task_name] = results['samples'][task_name][:max_nb_samples]
+
     print(evaluator.make_table(results))
 
     return results
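In src/backend/run_eval_suite.py, run_evaluation gains a max_nb_samples keyword (default 100) and truncates each task's per-sample records in the results dict before the summary table is printed, which bounds the size of the stored results. Note that the committed length check indexes samples['task_name'] with a literal string rather than the loop variable task_name. Below is a hedged, standalone sketch of the truncation step, with the indexing written the way the loop variable suggests it was intended; truncate_samples is a hypothetical helper for illustration, not a function from this repository:

from typing import Optional


def truncate_samples(results: dict, max_nb_samples: Optional[int] = 100) -> dict:
    """Keep at most max_nb_samples per-task sample records in a results dict."""
    # The commit checks len(samples['task_name']) with a literal key; the loop
    # variable is used here instead, assumed to be the intended behaviour.
    if max_nb_samples is not None and 'samples' in results:
        for task_name, task_samples in results['samples'].items():
            if len(task_samples) > max_nb_samples:
                results['samples'][task_name] = task_samples[:max_nb_samples]
    return results


# Usage with a toy results payload:
results = {'samples': {'halueval_qa': [{'doc_id': i} for i in range(250)]}}
truncate_samples(results, max_nb_samples=100)
assert len(results['samples']['halueval_qa']) == 100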