pminervini committed
Commit 018441b
1 Parent(s): b2cd23e
update
- app.py +22 -11
- beta-cli.py +16 -0
- requirements.txt +1 -2
- src/display/formatting.py +0 -52
- src/display/utils.py +75 -57
- src/leaderboard/read_evals.py +14 -16
- src/populate.py +1 -7
- src/submission/check_validity.py +19 -4
- src/submission/submit.py +13 -13
app.py
CHANGED
@@ -22,40 +22,51 @@ from src.display.utils import (
     AutoEvalColumn,
     ModelType,
     fields,
+    WeightType,
+    Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-
-
+from src.submission.check_validity import already_submitted_models
+from src.tools.collections import update_collections
 from src.tools.plots import (
     create_metric_plot_obj,
     create_plot_df,
     create_scores_df,
 )
 
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
 try:
     print(EVAL_REQUESTS_PATH)
-    snapshot_download(
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+    )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
-    snapshot_download(
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+    )
 except Exception:
     restart_space()
 
 
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
+update_collections(original_df.copy())
 leaderboard_df = original_df.copy()
 
 plot_df = create_plot_df(create_scores_df(raw_data))
 
-(
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
 # Searching and filtering
@@ -177,8 +188,8 @@ with demo:
             )
             filter_columns_precision = gr.CheckboxGroup(
                 label="Precision",
-                choices=[
-                value=[
+                choices=[i.value.name for i in Precision],
+                value=[i.value.name for i in Precision],
                 interactive=True,
                 elem_id="filter-columns-precision",
             )
@@ -308,7 +319,7 @@ with demo:
             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
             private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
             model_type = gr.Dropdown(
-                choices=[t.to_str(" : ") for t in ModelType],
+                choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                 label="Model type",
                 multiselect=False,
                 value=None,
@@ -317,14 +328,14 @@ with demo:
 
             with gr.Column():
                 precision = gr.Dropdown(
-                    choices=[
+                    choices=[i.value.name for i in Precision if i != Precision.Unknown],
                     label="Precision",
                     multiselect=False,
                     value="float16",
                     interactive=True,
                 )
                 weight_type = gr.Dropdown(
-                    choices=[
+                    choices=[i.value.name for i in WeightType],
                     label="Weights type",
                     multiselect=False,
                     value="Original",
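For reference, the new dropdown and checkbox choices are derived from the enums added in src/display/utils.py in this same commit. A minimal sketch of what the comprehensions evaluate to, given those definitions:

from src.display.utils import Precision, WeightType, ModelType

# Each enum member wraps a ModelDetails value, so `.value.name` is the label string.
[i.value.name for i in Precision if i != Precision.Unknown]     # ['float16', 'bfloat16', '8bit', '4bit', 'GPTQ']
[i.value.name for i in WeightType]                              # ['Adapter', 'Original', 'Delta']
[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown]  # e.g. '🟢 : pretrained', '🔶 : fine-tuned', ...
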
beta-cli.py
ADDED
@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+
+from huggingface_hub import snapshot_download
+from src.leaderboard.read_evals import get_raw_eval_results
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
+
+snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
+snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
+
+raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
+
+for entry in raw_data:
+    if '125' in entry.eval_name:
+        print(entry)
+
+# print(raw_data)
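beta-cli.py is a small ad-hoc debugging helper rather than part of the Space UI: it re-downloads the requests and results datasets and prints every raw eval entry whose eval_name contains "125". A hedged usage sketch (the matching name below is made up):

# From the repository root, with the same src.envs configuration the Space uses:
#   python beta-cli.py
# Each printed entry is an EvalResult from src/leaderboard/read_evals.py; result keys
# look like "{org}_{model}_{precision}", so a hypothetical "my-org_my-model-125m_float16"
# entry would be caught by the '125' filter.
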
requirements.txt
CHANGED
@@ -17,6 +17,5 @@ python-dateutil==2.8.2
 requests==2.28.2
 semantic-version==2.10.0
 tqdm==4.65.0
-
-#transformers==4.35.1
+transformers==4.35.2
 tokenizers>=0.15.0
src/display/formatting.py
CHANGED
@@ -7,23 +7,6 @@ from huggingface_hub.hf_api import ModelInfo
 
 API = HfApi()
 
-LLAMAS = [
-    "huggingface/llama-7b",
-    "huggingface/llama-13b",
-    "huggingface/llama-30b",
-    "huggingface/llama-65b",
-]
-
-KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
-VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
-OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
-DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
-MODEL_PAGE = "https://huggingface.co/models"
-LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
-VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
-ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
-
-
 def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
@@ -31,44 +14,9 @@ def model_hyperlink(link, model_name):
 def make_clickable_model(model_name):
     link = f"https://huggingface.co/{model_name}"
 
-    if model_name in LLAMAS:
-        link = LLAMA_LINK
-        model_name = model_name.split("/")[1]
-    elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
-        link = VICUNA_LINK
-        model_name = "stable-vicuna-13b"
-    elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
-        link = ALPACA_LINK
-        model_name = "alpaca-13b"
-    if model_name == "dolly-12b":
-        link = DOLLY_LINK
-    elif model_name == "vicuna-13b":
-        link = VICUNA_LINK
-    elif model_name == "koala-13b":
-        link = KOALA_LINK
-    elif model_name == "oasst-12b":
-        link = OASST_LINK
-
     details_model_name = model_name.replace("/", "__")
     details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
 
-    if not bool(os.getenv("DEBUG", "False")):
-        # We only add these checks when not debugging, as they are extremely slow
-        print(f"details_link: {details_link}")
-        try:
-            check_path = list(
-                API.list_files_info(
-                    repo_id=f"open-llm-leaderboard/details_{details_model_name}",
-                    paths="README.md",
-                    repo_type="dataset",
-                )
-            )
-            print(f"check_path: {check_path}")
-        except Exception as err:
-            # No details repo for this model
-            print(f"No details repo for this model: {err}")
-            return model_hyperlink(link, model_name)
-
     return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
 
 
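make_clickable_model no longer special-cases the early LLaMA/Alpaca/Vicuna-style entries and no longer probes the Hub for a details repository: every model now gets a plain Hub link plus a 📑 link to its details dataset. A small sketch of the resulting output (the model id is made up):

from src.display.formatting import make_clickable_model

html = make_clickable_model("my-org/my-model")
# -> an <a> link to https://huggingface.co/my-org/my-model, followed by a 📑 link to
#    https://huggingface.co/datasets/open-llm-leaderboard/details_my-org__my-model
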
src/display/utils.py
CHANGED
@@ -1,8 +1,26 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, make_dataclass
 from enum import Enum
 
 import pandas as pd
 
+def fields(raw_class):
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+
+
+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str
+
+class Tasks(Enum):
+    arc = Task("arc:challenge", "acc_norm", "ARC")
+    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
+    mmlu = Task("hendrycksTest", "acc", "MMLU")
+    truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
+    winogrande = Task("winogrande", "acc", "Winogrande")
+    gsm8k = Task("gsm8k", "acc", "GSM8K")
+    drop = Task("drop", "f1", "DROP")
+
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -16,39 +34,29 @@ class ColumnContent:
     never_hidden: bool = False
     dummy: bool = False
 
-    precision = ColumnContent("Precision", "str", False) # , True)
-    license = ColumnContent("Hub License", "str", False)
-    params = ColumnContent("#Params (B)", "number", False)
-    likes = ColumnContent("Hub ❤️", "number", False)
-    still_on_hub = ColumnContent("Available on the hub", "bool", False)
-    revision = ColumnContent("Model sha", "str", False, False)
-    dummy = ColumnContent(
-        "model_name_for_query", "str", False, dummy=True
-    ) # dummy col to implement search bar (hidden by custom CSS)
-
+auto_eval_column_dict = []
+# Init
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+#Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# Model information
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# Dummy column for the search bar (hidden by the custom CSS)
+auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
+
+# We use make dataclass to dynamically fill the scores from Tasks
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
@@ -102,17 +110,17 @@ human_baseline_row = {
 }
 
 @dataclass
-class
+class ModelDetails:
     name: str
-    symbol: str
+    symbol: str = "" # emoji, only for the model type
 
 
 class ModelType(Enum):
-    PT =
-    FT =
-    IFT =
-    RL =
-    Unknown =
+    PT = ModelDetails(name="pretrained", symbol="🟢")
+    FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"
@@ -129,23 +137,33 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+class WeightType(Enum):
+    Adapter = ModelDetails("Adapter")
+    Original = ModelDetails("Original")
+    Delta = ModelDetails("Delta")
+
+class Precision(Enum):
+    float16 = ModelDetails("float16")
+    bfloat16 = ModelDetails("bfloat16")
+    qt_8bit = ModelDetails("8bit")
+    qt_4bit = ModelDetails("4bit")
+    qt_GPTQ = ModelDetails("GPTQ")
+    Unknown = ModelDetails("?")
+
+    def from_str(precision):
+        if precision in ["torch.float16", "float16"]:
+            return Precision.float16
+        if precision in ["torch.bfloat16", "bfloat16"]:
+            return Precision.bfloat16
+        if precision in ["8bit"]:
+            return Precision.qt_8bit
+        if precision in ["4bit"]:
+            return Precision.qt_4bit
+        if precision in ["GPTQ", "None"]:
+            return Precision.qt_GPTQ
+        return Precision.Unknown
+
 
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-class Tasks(Enum):
-    arc = Task("arc:challenge", "acc_norm", AutoEvalColumn.arc.name)
-    hellaswag = Task("hellaswag", "acc_norm", AutoEvalColumn.hellaswag.name)
-    mmlu = Task("hendrycksTest", "acc", AutoEvalColumn.mmlu.name)
-    truthfulqa = Task("truthfulqa:mc", "mc2", AutoEvalColumn.truthfulqa.name)
-    winogrande = Task("winogrande", "acc", AutoEvalColumn.winogrande.name)
-    gsm8k = Task("gsm8k", "acc", AutoEvalColumn.gsm8k.name)
-    drop = Task("drop", "f1", AutoEvalColumn.drop.name)
-    nq = Task("nqopen", "em", AutoEvalColumn.nqopen.name)
 
 
 # Column selection
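A quick sketch of how the dynamically built AutoEvalColumn behaves, assuming only the definitions above:

from src.display.utils import AutoEvalColumn, fields

# make_dataclass turns auto_eval_column_dict into a frozen dataclass whose class
# attributes default to ColumnContent instances, so column labels are read via `.name`.
assert AutoEvalColumn.model.name == "Model"
assert AutoEvalColumn.precision.name == "Precision"
assert AutoEvalColumn.arc.name == "ARC"   # one score column per Tasks member

# The new module-level fields() helper replaces dataclasses.fields here: it scans the
# class __dict__ and returns the ColumnContent values themselves.
column_labels = [c.name for c in fields(AutoEvalColumn)]
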
src/leaderboard/read_evals.py
CHANGED
@@ -5,12 +5,12 @@ import os
 from dataclasses import dataclass
 
 import dateutil
-
-
+from datetime import datetime
+from transformers import AutoConfig
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -23,9 +23,9 @@ class EvalResult:
     model: str
     revision: str # commit hash, "" if main
     results: dict
-    precision:
+    precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type:
+    weight_type: WeightType = WeightType.Original # Original or Adapter
     architecture: str = "Unknown" # From config file
     license: str = "?"
     likes: int = 0
@@ -43,9 +43,7 @@ class EvalResult:
         config = data.get("config", data.get("config_general", None))
 
         # Precision
-        precision = config.get("model_dtype")
-        if precision == "None":
-            precision = "GPTQ"
+        precision = Precision.from_str(config.get("model_dtype"))
 
         # Get model and org
         org_and_model = config.get("model_name", config.get("model_args", None))
@@ -54,15 +52,15 @@ class EvalResult:
         if len(org_and_model) == 1:
             org = None
             model = org_and_model[0]
-            result_key = f"{model}_{precision}"
+            result_key = f"{model}_{precision.value.name}"
         else:
             org = org_and_model[0]
             model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision}"
+            result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
 
         still_on_hub, error, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True
+            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
         if model_config is not None:
@@ -112,13 +110,13 @@ class EvalResult:
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision)
+        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
 
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = request.get("weight_type", "
+            self.weight_type = WeightType[request.get("weight_type", "Original")]
             self.license = request.get("license", "?")
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
@@ -131,10 +129,10 @@ class EvalResult:
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision,
+            AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type,
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.dummy.name: self.full_model,
@@ -167,7 +165,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
         if (
-            req_content["status"] in ["FINISHED"
+            req_content["status"] in ["FINISHED"]
            and req_content["precision"] == precision.split(".")[-1]
         ):
            request_file = tmp_request_file
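Precision is now carried around as an enum rather than a raw dtype string; a minimal sketch of the mapping implied by Precision.from_str and of the new result keys (org/model names are hypothetical):

from src.display.utils import Precision

assert Precision.from_str("torch.float16") is Precision.float16
assert Precision.from_str("None") is Precision.qt_GPTQ        # old "None" dtypes map to GPTQ
assert Precision.from_str("something-else") is Precision.Unknown

# Result keys now use the enum's label:
result_key = f"my-org_my-model_{Precision.float16.value.name}"  # "my-org_my-model_float16"
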
src/populate.py
CHANGED
@@ -21,13 +21,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-
-    # df = df[cols].round(decimals=2)
-    for col in cols:
-        if col in df.columns:
-            df[col] = df[col].round(decimals=2)
-        else:
-            df[col] = 0.0
+    df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
src/submission/check_validity.py
CHANGED
@@ -8,6 +8,7 @@ import huggingface_hub
 from huggingface_hub import ModelCard
 from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
+from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config
 
 from src.envs import HAS_HIGHER_RATE_LIMIT
 
@@ -36,9 +37,24 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
     return True, ""
 
 
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False) -> tuple[bool, str]:
+def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
     try:
         config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        if test_tokenizer:
+            tokenizer_config = get_tokenizer_config(model_name)
+            if tokenizer_config is not None:
+                tokenizer_class_candidate = tokenizer_config.get("tokenizer_class", None)
+            else:
+                tokenizer_class_candidate = config.tokenizer_class
+
+
+            tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
+            if tokenizer_class is None:
+                return (
+                    False,
+                    f"uses {tokenizer_class_candidate}, which is not in a transformers release, therefore not supported at the moment.",
+                    None
+                )
         return True, None, config
 
     except ValueError:
@@ -48,7 +64,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
             None
         )
 
-    except Exception:
+    except Exception as e:
         return False, "was not found on hub!", None
 
 
@@ -71,8 +87,7 @@ def get_model_size(model_info: ModelInfo, precision: str):
 def get_model_arch(model_info: ModelInfo):
     return model_info.config.get("architectures", "Unknown")
 
-def user_submission_permission(
-    org_or_user, _ = submission_name.split("/")
+def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
     if org_or_user not in users_to_submission_dates:
         return True, ""
     submission_dates = sorted(users_to_submission_dates[org_or_user])
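is_model_on_hub can now also reject models whose tokenizer class cannot be resolved to one shipped with the installed transformers release. A sketch of a call with the new flag (the model id is a placeholder):

from src.submission.check_validity import is_model_on_hub

ok, error, config = is_model_on_hub(
    model_name="my-org/my-model", revision="main", token=None, trust_remote_code=True, test_tokenizer=True
)
if not ok:
    print(f'Model "my-org/my-model" {error}')  # e.g. "uses SomeTokenizer, which is not in a transformers release, ..."
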
src/submission/submit.py
CHANGED
@@ -30,6 +30,11 @@ def add_new_eval(
     if not REQUESTED_MODELS:
         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
 
+    user_name = ""
+    model_path = model
+    if "/" in model:
+        user_name = model.split("/")[0]
+        model_path = model.split("/")[1]
 
     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
@@ -38,11 +43,12 @@ def add_new_eval(
         return styled_error("Please select a model type.")
 
     # Is the user rate limited?
+    if user_name != "":
+        user_can_submit, error_msg = user_submission_permission(
+            user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
+        )
+        if not user_can_submit:
+            return styled_error(error_msg)
 
     # Did the model authors forbid its submission to the leaderboard?
     if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
@@ -54,12 +60,12 @@ def add_new_eval(
 
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(base_model, revision, H4_TOKEN)
+        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True)
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
 
     if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model, revision)
+        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
 
@@ -99,12 +105,6 @@ def add_new_eval(
         "license": license,
     }
 
-    user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
-
     # Check for duplicate submission
     if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
         return styled_warning("This model has been already submitted.")
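The rate-limit gate now runs only when the submission name carries an org/user prefix, and user_submission_permission takes the submission history and the limits as explicit arguments. A sketch with made-up history and limits; the real call uses the leaderboard's configured RATE_LIMIT_PERIOD and RATE_LIMIT_QUOTA, and the date strings mirror the current_time format written by submit.py:

from src.submission.check_validity import user_submission_permission

history = {"my-org": ["2023-11-20T10:00:00Z", "2023-11-21T09:30:00Z"]}  # hypothetical
can_submit, msg = user_submission_permission(
    "my-org", history, rate_limit_period=7, rate_limit_quota=5          # hypothetical limits
)
if not can_submit:
    print(msg)
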