Spaces:

open-llm-leaderboard
/

open_llm_leaderboard

Running on CPU Upgrade

App Files Community

1012

Alina Lozovskaia committed on Apr 24

Commit

705a80c

•

1 Parent(s): b7d036c

read_evals initial change

Browse files

Files changed (3) hide show

pyproject.toml +11 -5
src/envs.py +1 -1
src/leaderboard/read_evals.py +101 -89

pyproject.toml CHANGED Viewed

@@ -1,9 +1,15 @@
 [tool.ruff]
-# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-lint.select = ["E", "F"]
-lint.ignore = ["E501"] # line too long (black is taking care of this)
-line-length = 119
-lint.fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
 [tool.isort]
 profile = "black"

 [tool.ruff]
+line-length = 120
+target-version = "py312"
+include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"]
+ignore=["I","EM","FBT","TRY003","S101","D101","D102","D103","D104","D105","G004","D107","FA102"]
+fixable=["ALL"]
+select=["ALL"]
+[tool.ruff.lint]
+select = ["E", "F"]
+fixable = ["ALL"]
+ignore = ["E501"] # line too long (black is taking care of this)
 [tool.isort]
 profile = "black"

src/envs.py CHANGED Viewed

@@ -26,7 +26,7 @@ if not os.access(HF_HOME, os.W_OK):
     HF_HOME = "."
     os.environ["HF_HOME"] = HF_HOME
 else:
-    print(f"Write access confirmed for HF_HOME")
 EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(HF_HOME, "eval-results")

     HF_HOME = "."
     os.environ["HF_HOME"] = HF_HOME
 else:
+    print("Write access confirmed for HF_HOME")
 EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(HF_HOME, "eval-results")

src/leaderboard/read_evals.py CHANGED Viewed

@@ -1,8 +1,11 @@
-import glob
 import json
 import math
 import os
-from dataclasses import dataclass
 import dateutil
 import numpy as np
@@ -10,117 +13,124 @@ import numpy as np
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
 @dataclass
 class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
     model: str
-    revision: str  # commit hash, "" if main
-    results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"  # From config file
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""  # submission date of request file
     still_on_hub: bool = True
     is_merge: bool = False
     flagged: bool = False
     status: str = "FINISHED"
-    tags: list = None
     @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
             data = json.load(fp)
-        # We manage the legacy config format
-        config = data.get("config_general")
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-        # Get model and org
-        org_and_model = config.get("model_name")
-        org_and_model = org_and_model.split("/", 1)
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-            # We skip old mmlu entries
-            wrong_mmlu_version = False
-            if task.benchmark == "hendrycksTest":
-                for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
-                    if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
-                        wrong_mmlu_version = True
-            if wrong_mmlu_version:
-                continue
-            # Some truthfulQA values are NaNs
-            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
-                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
-                    results[task.benchmark] = 0.0
-                    continue
-            # We average all scores of a given metric (mostly for mmlu)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-        return self(
             eval_name=result_key,
             full_model=full_model,
             org=org,
             model=model,
             results=results,
             precision=precision,
-            revision=config.get("model_sha", ""),
         )
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
             self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
             self.architecture = request.get("architectures", "Unknown")
             self.status = request.get("status", "FAILED")
-        except Exception:
             self.status = "FAILED"
-            print(f"Could not find request file for {self.org}/{self.model}")
     def update_with_dynamic_file_dict(self, file_dict):
         self.license = file_dict.get("license", "?")
-        self.likes = file_dict.get("likes", 0)
-        self.still_on_hub = file_dict["still_on_hub"]
         self.tags = file_dict.get("tags", [])
-        self.flagged = any("flagged" in tag for tag in self.tags)
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -149,26 +159,28 @@ class EvalResult:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
         return data_dict
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
-            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
-                request_file = tmp_request_file
-    return request_file
 def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
@@ -220,4 +232,4 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
         except KeyError:  # not all eval values present
             continue
-    return results

 import json
+from pathlib import Path
+from json import JSONDecodeError
+import logging
 import math
 import os
+from dataclasses import dataclass, field
+from typing import Optional, Dict, List
 import dateutil
 import numpy as np
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 @dataclass
 class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
+    eval_name: str # org_model_precision (uid)
+    full_model: str # org/model (path on hub)
+    org: Optional[str]
     model: str
+    revision: str # commit hash, "" if main
+    results: Dict[str, float]
     precision: Precision = Precision.Unknown
+    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original
+    architecture: str = "Unknown" # From config file
     license: str = "?"
     likes: int = 0
     num_params: int = 0
+    date: str = "" # submission date of request file
     still_on_hub: bool = True
     is_merge: bool = False
     flagged: bool = False
     status: str = "FINISHED"
+    # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
+    tags: List[str] = field(default_factory=list)
     @classmethod
+    def init_from_json_file(cls, json_filepath: str) -> 'EvalResult':
+        with open(json_filepath, 'r') as fp:
             data = json.load(fp)
+        config = data.get("config_general", {})
+        precision = Precision.from_str(config.get("model_dtype", "unknown"))
+        org_and_model = config.get("model_name", "").split("/", 1)
+        org = org_and_model[0] if len(org_and_model) > 1 else None
+        model = org_and_model[-1]
+        result_key = "_".join(filter(None, [*org_and_model, precision.value.name]))
         full_model = "/".join(org_and_model)
+        results = cls.extract_results(data)  # Properly call the method to extract results
+        return cls(
             eval_name=result_key,
             full_model=full_model,
             org=org,
             model=model,
             results=results,
             precision=precision,
+            revision=config.get("model_sha", "")
         )
+    @staticmethod
+    def extract_results(data: Dict) -> Dict[str, float]:
+        results = {}
+        for task in Tasks:
+            task_value = task.value
+            if task_value.benchmark == "hendrycksTest":
+                if any(data.get("versions", {}).get(mmlu_k, 1) == 0 for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]):
+                    continue
+            if task_value.benchmark == "truthfulqa:mc":
+                task_key = "harness|truthfulqa:mc|0"
+                if task_key in data["results"]:
+                    task_metric_value = data["results"][task_key][task_value.metric]
+                    if math.isnan(float(task_metric_value)):
+                        results[task_value.benchmark] = 0.0
+                        continue
+            accs = [float(v.get(task_value.metric, 0)) for k, v in data["results"].items() if task_value.benchmark in k and v.get(task_value.metric, None) is not None]
+            if accs:
+                mean_acc = np.mean(accs) * 100.0
+                results[task_value.benchmark] = mean_acc
+        return results
+    def update_with_request_file(self, requests_path):
+        """Finds the relevant request file for the current model and updates info with it."""
         try:
+            request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
             self.weight_type = WeightType[request.get("weight_type", "Original")]
+            self.num_params = int(request.get("params", 0))  # Ensuring type safety
             self.date = request.get("submitted_time", "")
             self.architecture = request.get("architectures", "Unknown")
             self.status = request.get("status", "FAILED")
+        except FileNotFoundError:
+            self.status = "FAILED"
+            logging.error(f"Request file not found for {self.org}/{self.model}")
+        except JSONDecodeError:
             self.status = "FAILED"
+            logging.error(f"Error decoding JSON from the request file for {self.org}/{self.model}")
+        except KeyError as e:
+            self.status = "FAILED"
+            logging.error(f"Key error {e} in processing request file for {self.org}/{self.model}")
+        except Exception as e:  # Catch-all for any other unexpected exceptions
+            self.status = "FAILED"
+            logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
     def update_with_dynamic_file_dict(self, file_dict):
+        """Update license, likes, hub availability, tags and flagged status from the provided dictionary, falling back to defaults for missing keys."""
+        # Default values set for optional or potentially missing keys.
         self.license = file_dict.get("license", "?")
+        self.likes = int(file_dict.get("likes", 0))  # Ensure likes is treated as an integer
+        self.still_on_hub = file_dict.get("still_on_hub", False)  # Default to False if key is missing
         self.tags = file_dict.get("tags", [])
+        # Mark the model as flagged when "flagged" is present in its tags
+        self.flagged = "flagged" in self.tags
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
         return data_dict
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
+    requests_path = Path(requests_path)
+    pattern = f"{model_name}_eval_request_*.json"
+    # Using pathlib to find files matching the pattern
+    request_files = list(requests_path.glob(pattern))
+    # Sort the files in descending order, matching the previous implementation's 'reverse=True' ordering
+    request_files.sort(reverse=True)
+    # Select the correct request file based on 'status' and 'precision'
+    for request_file in request_files:
+        with request_file.open("r") as f:
             req_content = json.load(f)
+            if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
+                return str(request_file)
+    # Return empty string if no file found that matches criteria
+    return ""
 def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
         except KeyError:  # not all eval values present
             continue
+    return results