Alina Lozovskaia committed
Commit 705a80c
1 Parent(s): b7d036c

read_evals initial change

Files changed (3)
  1. pyproject.toml +11 -5
  2. src/envs.py +1 -1
  3. src/leaderboard/read_evals.py +101 -89
pyproject.toml CHANGED
@@ -1,9 +1,15 @@
 [tool.ruff]
-# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-lint.select = ["E", "F"]
-lint.ignore = ["E501"] # line too long (black is taking care of this)
-line-length = 119
-lint.fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
+line-length = 120
+target-version = "py312"
+include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"]
+ignore=["I","EM","FBT","TRY003","S101","D101","D102","D103","D104","D105","G004","D107","FA102"]
+fixable=["ALL"]
+select=["ALL"]
+
+[tool.ruff.lint]
+select = ["E", "F"]
+fixable = ["ALL"]
+ignore = ["E501"] # line too long (black is taking care of this)
 
 [tool.isort]
 profile = "black"
src/envs.py CHANGED
@@ -26,7 +26,7 @@ if not os.access(HF_HOME, os.W_OK):
     HF_HOME = "."
     os.environ["HF_HOME"] = HF_HOME
 else:
-    print(f"Write access confirmed for HF_HOME")
+    print("Write access confirmed for HF_HOME")
 
 EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(HF_HOME, "eval-results")
src/leaderboard/read_evals.py CHANGED
@@ -1,8 +1,11 @@
-import glob
 import json
+from pathlib import Path
+from json import JSONDecodeError
+import logging
 import math
 import os
-from dataclasses import dataclass
+from dataclasses import dataclass, field
+from typing import Optional, Dict, List
 
 import dateutil
 import numpy as np
@@ -10,117 +13,124 @@ import numpy as np
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
 
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 @dataclass
 class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: Optional[str]
     model: str
-    revision: str  # commit hash, "" if main
-    results: dict
+    revision: str  # commit hash, "" if main
+    results: Dict[str, float]
    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"  # From config file
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original
+    architecture: str = "Unknown"  # From config file
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""  # submission date of request file
+    date: str = ""  # submission date of request file
     still_on_hub: bool = True
     is_merge: bool = False
     flagged: bool = False
     status: str = "FINISHED"
-    tags: list = None
-
+    # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
+    tags: List[str] = field(default_factory=list)
+
+
     @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
+    def init_from_json_file(cls, json_filepath: str) -> 'EvalResult':
+        with open(json_filepath, 'r') as fp:
             data = json.load(fp)
 
-        # We manage the legacy config format
-        config = data.get("config_general")
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name")
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
+        config = data.get("config_general", {})
+        precision = Precision.from_str(config.get("model_dtype", "unknown"))
+        org_and_model = config.get("model_name", "").split("/", 1)
+        org = org_and_model[0] if len(org_and_model) > 1 else None
+        model = org_and_model[-1]
+        result_key = "_".join(filter(None, [*org_and_model, precision.value.name]))
         full_model = "/".join(org_and_model)
 
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-            # We skip old mmlu entries
-            wrong_mmlu_version = False
-            if task.benchmark == "hendrycksTest":
-                for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
-                    if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
-                        wrong_mmlu_version = True
-
-            if wrong_mmlu_version:
-                continue
-
-            # Some truthfulQA values are NaNs
-            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
-                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
-                    results[task.benchmark] = 0.0
-                    continue
-
-            # We average all scores of a given metric (mostly for mmlu)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
+        results = cls.extract_results(data)  # Properly call the method to extract results
 
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return self(
+        return cls(
             eval_name=result_key,
             full_model=full_model,
             org=org,
             model=model,
             results=results,
             precision=precision,
-            revision=config.get("model_sha", ""),
+            revision=config.get("model_sha", "")
         )
 
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+    @staticmethod
+    def extract_results(data: Dict) -> Dict[str, float]:
+        results = {}
+        for task in Tasks:
+            task_value = task.value
 
+            if task_value.benchmark == "hendrycksTest":
+                if any(data.get("versions", {}).get(mmlu_k, 1) == 0 for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]):
+                    continue
+
+            if task_value.benchmark == "truthfulqa:mc":
+                task_key = "harness|truthfulqa:mc|0"
+                if task_key in data["results"]:
+                    task_metric_value = data["results"][task_key][task_value.metric]
+                    if math.isnan(float(task_metric_value)):
+                        results[task_value.benchmark] = 0.0
+                        continue
+
+            accs = [float(v.get(task_value.metric, 0)) for k, v in data["results"].items() if task_value.benchmark in k and v.get(task_value.metric, None) is not None]
+            if accs:
+                mean_acc = np.mean(accs) * 100.0
+                results[task_value.benchmark] = mean_acc
+
+        return results
+
+
+    def update_with_request_file(self, requests_path):
+        """Finds the relevant request file for the current model and updates info with it."""
         try:
+            request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
             with open(request_file, "r") as f:
                 request = json.load(f)
+
             self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
             self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.num_params = request.get("params", 0)
+            self.num_params = int(request.get("params", 0))  # Ensuring type safety
             self.date = request.get("submitted_time", "")
             self.architecture = request.get("architectures", "Unknown")
             self.status = request.get("status", "FAILED")
-        except Exception:
+
+        except FileNotFoundError:
+            self.status = "FAILED"
+            logging.error(f"Request file not found for {self.org}/{self.model}")
+        except JSONDecodeError:
             self.status = "FAILED"
-            print(f"Could not find request file for {self.org}/{self.model}")
+            logging.error(f"Error decoding JSON from the request file for {self.org}/{self.model}")
+        except KeyError as e:
+            self.status = "FAILED"
+            logging.error(f"Key error {e} in processing request file for {self.org}/{self.model}")
+        except Exception as e:  # Catch-all for any other unexpected exceptions
+            self.status = "FAILED"
+            logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
+
 
     def update_with_dynamic_file_dict(self, file_dict):
+        """Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
+        # Default values set for optional or potentially missing keys.
         self.license = file_dict.get("license", "?")
-        self.likes = file_dict.get("likes", 0)
-        self.still_on_hub = file_dict["still_on_hub"]
+        self.likes = int(file_dict.get("likes", 0))  # Ensure likes is treated as an integer
+        self.still_on_hub = file_dict.get("still_on_hub", False)  # Default to False if key is missing
         self.tags = file_dict.get("tags", [])
-        self.flagged = any("flagged" in tag for tag in self.tags)
+
+        # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
+        self.flagged = "flagged" in self.tags
+
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -149,26 +159,28 @@ class EvalResult:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
         return data_dict
-
+
 
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
+    requests_path = Path(requests_path)
+    pattern = f"{model_name}_eval_request_*.json"
+
+    # Using pathlib to find files matching the pattern
+    request_files = list(requests_path.glob(pattern))
+
+    # Sort the files by name in descending order to mimic 'reverse=True'
+    request_files.sort(reverse=True)
+
+    # Select the correct request file based on 'status' and 'precision'
+    for request_file in request_files:
+        with request_file.open("r") as f:
             req_content = json.load(f)
-            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
-                request_file = tmp_request_file
-    return request_file
-
+            if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
+                return str(request_file)
+
+    # Return empty string if no file found that matches criteria
+    return ""
 
 def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
@@ -220,4 +232,4 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
         except KeyError:  # not all eval values present
            continue
 
-    return results
+    return results
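For orientation, a minimal sketch of how the refactored EvalResult API above might be exercised; it is not part of the commit, and the file and directory paths are hypothetical placeholders:

    from src.leaderboard.read_evals import EvalResult

    # Hypothetical paths for illustration only
    result = EvalResult.init_from_json_file("eval-results/some-org/some-model/results.json")
    result.update_with_request_file("eval-queue")  # fills model_type, num_params, date, status
    result.update_with_dynamic_file_dict({"license": "apache-2.0", "likes": 0, "tags": []})
    row = result.to_dict()  # dict keyed by the leaderboard's display columns
    print(row)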