pminervini committed on
Commit
ca9ece0
·
1 Parent(s): a88d51c
backend-cli.py CHANGED
@@ -103,7 +103,7 @@ def process_finished_requests() -> bool:
103
  random.shuffle(eval_requests)
104
 
105
  from src.leaderboard.read_evals import get_raw_eval_results
106
- eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
107
 
108
  result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
109
  result_name_to_result = {r.eval_name: r for r in eval_results}
 
103
  random.shuffle(eval_requests)
104
 
105
  from src.leaderboard.read_evals import get_raw_eval_results
106
+ eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND, True)
107
 
108
  result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
109
  result_name_to_result = {r.eval_name: r for r in eval_results}
completed-cli.py CHANGED
@@ -45,16 +45,59 @@ def request_to_result_name(request: EvalRequest) -> str:
45
  def process_finished_requests() -> bool:
46
  current_finished_status = [FINISHED_STATUS]
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  # Get all eval requests that are FINISHED; if you want to run other evals, change this parameter
49
  eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
50
  # Sort the evals by priority (first submitted first run)
51
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
52
 
 
 
 
53
  import random
54
  random.shuffle(eval_requests)
55
 
56
  from src.leaderboard.read_evals import get_raw_eval_results
57
- eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
58
 
59
  result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
60
  result_name_to_result = {r.eval_name: r for r in eval_results}
@@ -73,6 +116,7 @@ def process_finished_requests() -> bool:
73
  if eval_result is None or task_name not in eval_result.results:
74
  eval_request: EvalRequest = result_name_to_request[result_name]
75
 
 
76
  print(result_name, 'is incomplete -- missing task:', task_name)
77
 
78
 
 
45
  def process_finished_requests() -> bool:
46
  current_finished_status = [FINISHED_STATUS]
47
 
48
+ if False:
49
+ import os
50
+ import dateutil
51
+ model_result_filepaths = []
52
+ results_path = f'{EVAL_RESULTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B'
53
+ requests_path = f'{EVAL_REQUESTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B_eval_request_False_False_False.json'
54
+
55
+ for root, _, files in os.walk(results_path):
56
+ # We should only have json files in model results
57
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
58
+ continue
59
+
60
+ # Sort the files by date
61
+ try:
62
+ files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
63
+ except dateutil.parser._parser.ParserError:
64
+ files = [files[-1]]
65
+
66
+ for file in files:
67
+ model_result_filepaths.append(os.path.join(root, file))
68
+
69
+ eval_results = {}
70
+ for model_result_filepath in model_result_filepaths:
71
+ # Creation of result
72
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
73
+ eval_result.update_with_request_file(requests_path)
74
+
75
+ print('XXX', eval_result)
76
+
77
+ # Store results of same eval together
78
+ eval_name = eval_result.eval_name
79
+ if eval_name in eval_results.keys():
80
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
81
+ else:
82
+ eval_results[eval_name] = eval_result
83
+
84
+ print(eval_results)
85
+
86
+ return True
87
+
88
  # Get all eval requests that are FINISHED; if you want to run other evals, change this parameter
89
  eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
90
  # Sort the evals by priority (first submitted first run)
91
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
92
 
93
+ # XXX
94
+ # eval_requests = [r for r in eval_requests if 'neo-1.3B' in r.model]
95
+
96
  import random
97
  random.shuffle(eval_requests)
98
 
99
  from src.leaderboard.read_evals import get_raw_eval_results
100
+ eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND, True)
101
 
102
  result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
103
  result_name_to_result = {r.eval_name: r for r in eval_results}
 
116
  if eval_result is None or task_name not in eval_result.results:
117
  eval_request: EvalRequest = result_name_to_request[result_name]
118
 
119
+ # print(eval_result)
120
  print(result_name, 'is incomplete -- missing task:', task_name)
121
 
122
 
src/display/utils.py CHANGED
@@ -25,6 +25,8 @@ class Tasks(Enum):
25
  # drop = Task("drop", "f1", "DROP")
26
  nqopen = Task("nq_open", "em", "NQ Open")
27
  triviaqa = Task("triviaqa", "em", "TriviaQA")
 
 
28
 
29
  # These classes are for user facing column names,
30
  # to avoid having to change them all around the code
 
25
  # drop = Task("drop", "f1", "DROP")
26
  nqopen = Task("nq_open", "em", "NQ Open")
27
  triviaqa = Task("triviaqa", "em", "TriviaQA")
28
+ #truthfulqa_mc1 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1")
29
+ #truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2")
30
 
31
  # These classes are for user facing column names,
32
  # to avoid having to change them all around the code
src/leaderboard/read_evals.py CHANGED
@@ -69,23 +69,78 @@ class EvalResult:
69
  results = {}
70
  for task in Tasks:
71
  task = task.value
72
- # We skip old mmlu entries
73
- wrong_mmlu_version = False
74
- if task.benchmark == "hendrycksTest":
75
- for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
76
- if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
77
- wrong_mmlu_version = True
78
-
79
- if wrong_mmlu_version:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  continue
81
 
82
- # Some truthfulQA values are NaNs
83
- if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
84
- if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
85
- results[task.benchmark] = 0.0
86
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
- # We average all scores of a given metric (mostly for mmlu)
 
 
 
 
89
 
90
  def post_process_results(results: dict) -> dict:
91
  # {'nq_open': {'em': 0.018005540166204988, 'em_stderr': 0.0022134216580395583}}
@@ -191,7 +246,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
191
  return request_file
192
 
193
 
194
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
195
  """From the path of the results folder root, extract all needed info for results"""
196
  model_result_filepaths = []
197
 
@@ -212,7 +267,10 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
212
  eval_results = {}
213
  for model_result_filepath in model_result_filepaths:
214
  # Creation of result
215
- eval_result = EvalResult.init_from_json_file(model_result_filepath)
 
 
 
216
  eval_result.update_with_request_file(requests_path)
217
 
218
  # Store results of same eval together
@@ -222,8 +280,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
222
  else:
223
  eval_results[eval_name] = eval_result
224
 
225
- # breakpoint()
226
-
227
  results = []
228
  for v in eval_results.values():
229
  results.append(v)
 
69
  results = {}
70
  for task in Tasks:
71
  task = task.value
72
+
73
+ def post_process_results(results: dict) -> dict:
74
+ # {'nq_open': {'em': 0.018005540166204988, 'em_stderr': 0.0022134216580395583}}
75
+ res_copy = results.copy()
76
+
77
+ for task_name in res_copy.keys():
78
+ entry_copy = results[task_name].copy()
79
+
80
+ for k, v in entry_copy.items():
81
+ if "exact_match" in k:
82
+ results[task_name][k.replace("exact_match", "em")] = v
83
+
84
+ entry_copy = results[task_name].copy()
85
+
86
+ for k, v in entry_copy.items():
87
+ if "," in k:
88
+ tokens = k.split(",")
89
+ results[task_name][tokens[0]] = v
90
+
91
+ return results
92
+
93
+ accs = np.array([v.get(task.metric, None) for k, v in post_process_results(data["results"]).items() if task.benchmark in k])
94
+
95
+ if accs.size == 0 or any([acc is None for acc in accs]):
96
  continue
97
 
98
+ mean_acc = np.mean(accs) * 100.0
99
+ results[task.benchmark] = mean_acc
100
+
101
+ return EvalResult(eval_name=result_key, full_model=full_model, org=org, model=model, results=results,
102
+ precision=precision, revision=config.get("model_sha", ""), still_on_hub=still_on_hub,
103
+ architecture=architecture)
104
+
105
+ @staticmethod
106
+ def init_from_json_file_backend(json_filepath):
107
+ """Inits the result from the specific model result file"""
108
+ with open(json_filepath) as fp:
109
+ data = json.load(fp)
110
+
111
+ # We manage the legacy config format
112
+ config = data.get("config", data.get("config_general", None))
113
+
114
+ # Precision
115
+ precision = Precision.from_str(config.get("model_dtype"))
116
+
117
+ # Get model and org
118
+ org_and_model = config.get("model_name", config.get("model_args", None))
119
+ org_and_model = org_and_model.split("/", 1)
120
+
121
+ if len(org_and_model) == 1:
122
+ org = None
123
+ model = org_and_model[0]
124
+ result_key = f"{model}_{precision.value.name}"
125
+ else:
126
+ org = org_and_model[0]
127
+ model = org_and_model[1]
128
+ result_key = f"{org}_{model}_{precision.value.name}"
129
+ full_model = "/".join(org_and_model)
130
+
131
+ still_on_hub, error, model_config = \
132
+ is_model_on_hub(full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False)
133
+ architecture = "?"
134
+ if model_config is not None:
135
+ architectures = getattr(model_config, "architectures", None)
136
+ if architectures:
137
+ architecture = ";".join(architectures)
138
 
139
+ # Extract results available in this file (some results are split in several files)
140
+ results = {}
141
+ from src.backend.envs import Tasks as BackendTasks
142
+ for task in BackendTasks:
143
+ task = task.value
144
 
145
  def post_process_results(results: dict) -> dict:
146
  # {'nq_open': {'em': 0.018005540166204988, 'em_stderr': 0.0022134216580395583}}
 
246
  return request_file
247
 
248
 
249
+ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool = False) -> list[EvalResult]:
250
  """From the path of the results folder root, extract all needed info for results"""
251
  model_result_filepaths = []
252
 
 
267
  eval_results = {}
268
  for model_result_filepath in model_result_filepaths:
269
  # Creation of result
270
+ if is_backend:
271
+ eval_result = EvalResult.init_from_json_file_backend(model_result_filepath)
272
+ else:
273
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
274
  eval_result.update_with_request_file(requests_path)
275
 
276
  # Store results of same eval together
 
280
  else:
281
  eval_results[eval_name] = eval_result
282
 
 
 
283
  results = []
284
  for v in eval_results.values():
285
  results.append(v)