Commit c74b7d7
Alina Lozovskaia committed
Parent(s): f86eaae

Changes as per comments

Files changed:
- app.py +16 -0
- src/leaderboard/read_evals.py +23 -15
app.py
CHANGED
@@ -50,6 +50,9 @@ from src.tools.collections import update_collections
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 
 
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
 # Start ephemeral Spaces on PRs (see config in README.md)
 enable_space_ci()
 
@@ -57,6 +60,19 @@ enable_space_ci()
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
+
+def time_diff_wrapper(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        diff = end_time - start_time
+        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
+        return result
+    return wrapper
+
+
+@time_diff_wrapper
 def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
     """Download dataset with exponential backoff retries."""
     attempt = 0
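The new download_dataset above takes max_attempts and backoff_factor arguments, but the hunk ends at attempt = 0, so the retry loop itself is not part of the diff shown here. The following self-contained sketch shows how the added time_diff_wrapper and an exponential-backoff retry typically combine; flaky_download, its simulated failures, and the wait calculation are illustrative assumptions, not the Space's actual implementation.

import logging
import time

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


def time_diff_wrapper(func):
    """Log the wall-clock time of the wrapped call (same pattern as the hunk above)."""
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        diff = time.time() - start_time
        logging.info("Time taken for %s: %.3f seconds", func.__name__, diff)
        return result
    return wrapper


@time_diff_wrapper
def flaky_download(max_attempts=3, backoff_factor=1.5):
    """Hypothetical stand-in for download_dataset: retry with exponential backoff."""
    attempt = 0
    while attempt < max_attempts:
        try:
            if attempt < 2:  # simulate two transient failures
                raise ConnectionError("transient network error")
            return "downloaded"
        except ConnectionError as err:
            wait = backoff_factor ** attempt  # 1.0s, then 1.5s, then 2.25s, ...
            logging.warning("Attempt %d failed (%s); retrying in %.2fs", attempt + 1, err, wait)
            time.sleep(wait)
            attempt += 1
    raise RuntimeError(f"Failed after {max_attempts} attempts")


if __name__ == "__main__":
    print(flaky_download())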
src/leaderboard/read_evals.py
CHANGED
@@ -78,39 +78,47 @@ class EvalResult:
     @staticmethod
     def extract_results(data: Dict) -> Dict[str, float]:
         """
-
-        Skips entries based on specific conditions and handles NaN values appropriately.
-        Returns a dictionary with benchmarks as keys and their averaged scores as values in percentage.
+        Extract and process benchmark results from a given dict.
 
         Parameters:
-        - data (Dict):
+        - data (Dict): A dictionary containing benchmark data. This dictionary must
+          include 'versions' and 'results' keys with respective sub-data.
 
         Returns:
-        - Dict[str, float]: A dictionary
+        - Dict[str, float]: A dictionary where keys are benchmark names and values
+          are the processed average scores as percentages.
+
+        Notes:
+        - The method specifically checks for certain benchmark names to skip outdated entries.
+        - Handles NaN values by setting the corresponding benchmark result to 0.0.
+        - Averages scores across metrics for benchmarks found in the data, in a percentage format.
         """
         results = {}
         for task in Tasks:
             task = task.value
-
             # We skip old mmlu entries
             if task.benchmark == "hendrycksTest":
                 for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
                     if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
                         continue
 
-            # Some
-            if
-
-
-
+            # Some benchamrk values are NaNs, mostly truthfulQA
+            # Would be more optimal (without the whole dict itertion) if benchmark name was same as key in results
+            # e.g. not harness|truthfulqa:mc|0 but truthfulqa:mc
+            for k, v in data["results"].items():
+                if task.benchmark in k:
+                    if math.isnan(float(v[task.metric])):
+                        results[task.benchmark] = 0.0
+                        continue
 
             # We average all scores of a given metric (mostly for mmlu)
-            accs = [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]
-            if accs or any([acc is None for acc in accs]):
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-
-
+            mean_acc = np.mean(accs) * 100.0
+            results[task.benchmark] = mean_acc
+
         return results
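The rewritten extract_results first zeroes out benchmarks whose reported metric is NaN (mostly truthfulQA, per the added comment), then averages every matching sub-task score and converts it to a percentage. The short sketch below replays that scoring logic outside the EvalResult class; the data dict, the score helper, and the benchmark/metric names are invented for illustration and are not taken from the leaderboard code.

import math

import numpy as np

# Toy stand-in for the harness output: keys follow the "harness|<benchmark>|<shots>" pattern
# referenced in the hunk's comments.
data = {
    "versions": {"harness|truthfulqa:mc|0": 1},
    "results": {
        "harness|truthfulqa:mc|0": {"mc2": float("nan")},        # NaN -> scored as 0.0
        "harness|hendrycksTest-astronomy|5": {"acc_norm": 0.50},
        "harness|hendrycksTest-anatomy|5": {"acc_norm": 0.75},
    },
}


def score(benchmark: str, metric: str) -> float:
    """Mirror the averaging logic added to extract_results (illustrative only)."""
    # NaN guard: a NaN value for this benchmark pins its score to 0.0
    for key, values in data["results"].items():
        if benchmark in key and math.isnan(float(values[metric])):
            return 0.0
    # Average all sub-task scores for the benchmark and report it as a percentage
    accs = np.array([v.get(metric) for k, v in data["results"].items() if benchmark in k])
    if accs.size == 0 or any(acc is None for acc in accs):
        raise ValueError(f"no usable scores for {benchmark}")
    return float(np.mean(accs) * 100.0)


print(score("truthfulqa:mc", "mc2"))       # 0.0  (NaN handled)
print(score("hendrycksTest", "acc_norm"))  # 62.5 (mean of 0.50 and 0.75, in percent)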