Sean Cho committed on
Commit
ce5c604
·
1 Parent(s): f73765d

update to latest version

Browse files
.gitattributes CHANGED
@@ -25,7 +25,6 @@
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
 
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
  *.tgz filter=lfs diff=lfs merge=lfs -text
30
  *.wasm filter=lfs diff=lfs merge=lfs -text
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ default_language_version:
16
+ python: python3
17
+
18
+ ci:
19
+ autofix_prs: true
20
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21
+ autoupdate_schedule: quarterly
22
+
23
+ repos:
24
+ - repo: https://github.com/pre-commit/pre-commit-hooks
25
+ rev: v4.3.0
26
+ hooks:
27
+ - id: check-yaml
28
+ - id: check-case-conflict
29
+ - id: detect-private-key
30
+ - id: check-added-large-files
31
+ args: ['--maxkb=1000']
32
+ - id: requirements-txt-fixer
33
+ - id: end-of-file-fixer
34
+ - id: trailing-whitespace
35
+
36
+ - repo: https://github.com/PyCQA/isort
37
+ rev: 5.12.0
38
+ hooks:
39
+ - id: isort
40
+ name: Format imports
41
+
42
+ - repo: https://github.com/psf/black
43
+ rev: 22.12.0
44
+ hooks:
45
+ - id: black
46
+ name: Format code
47
+ additional_dependencies: ['click==8.0.2']
48
+
49
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
50
+ # Ruff version.
51
+ rev: 'v0.0.267'
52
+ hooks:
53
+ - id: ruff
Makefile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: style format
2
+
3
+
4
+ style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
+
9
+
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
  title: Leaderboard Test
3
  emoji: 📉
4
- colorFrom: yellow
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 3.27.0
8
  app_file: app.py
9
- pinned: false
10
  license: apache-2.0
11
  ---
12
 
 
1
  ---
2
  title: Leaderboard Test
3
  emoji: 📉
4
+ colorFrom: green
5
+ colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 3.27.0
8
  app_file: app.py
9
+ pinned: true
10
  license: apache-2.0
11
  ---
12
 
app.py CHANGED
@@ -2,23 +2,33 @@ import json
2
  import os
3
  from datetime import datetime, timezone
4
 
5
-
6
  import gradio as gr
7
- import numpy as np
8
  import pandas as pd
9
  from apscheduler.schedulers.background import BackgroundScheduler
10
  from huggingface_hub import HfApi
11
- from transformers import AutoConfig
12
 
13
- from src.auto_leaderboard.get_model_metadata import apply_metadata
14
- from src.assets.text_content import *
15
- from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
16
- from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
17
  from src.assets.css_html_js import custom_css, get_window_url_params
18
- from src.utils_display import AutoEvalColumn, EvalQueueColumn, fields, styled_error, styled_warning, styled_message
19
- from src.init import get_all_requested_models, load_all_info_from_hub
20
-
21
- pd.set_option('display.precision', 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # clone / pull the lmeh eval data
24
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
@@ -37,20 +47,17 @@ EVAL_RESULTS_PATH = "eval-results"
37
  EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
38
  EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
39
 
40
- api = HfApi()
41
 
42
- def restart_space():
43
- api.restart_space(
44
- repo_id="BearSean/leaderboard-test", token=H4_TOKEN
45
- )
46
 
47
- eval_queue, requested_models, eval_results = load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH)
 
48
 
49
- if not IS_PUBLIC:
50
- eval_queue_private, requested_models_private, eval_results_private = load_all_info_from_hub(PRIVATE_QUEUE_REPO, PRIVATE_RESULTS_REPO, EVAL_REQUESTS_PATH_PRIVATE, EVAL_RESULTS_PATH_PRIVATE)
51
- else:
52
- eval_queue_private, eval_results_private = None, None
53
 
 
54
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
55
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
56
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
@@ -63,116 +70,51 @@ if not IS_PUBLIC:
63
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
64
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
65
 
66
- BENCHMARK_COLS = [c.name for c in [AutoEvalColumn.arc, AutoEvalColumn.hellaswag, AutoEvalColumn.mmlu, AutoEvalColumn.truthfulqa]]
67
-
68
-
69
- def has_no_nan_values(df, columns):
70
- return df[columns].notna().all(axis=1)
71
-
72
-
73
- def has_nan_values(df, columns):
74
- return df[columns].isna().any(axis=1)
75
-
76
-
77
- def get_leaderboard_df():
78
- if eval_results:
79
- print("Pulling evaluation results for the leaderboard.")
80
- eval_results.git_pull()
81
- if eval_results_private:
82
- print("Pulling evaluation results for the leaderboard.")
83
- eval_results_private.git_pull()
84
-
85
- all_data = get_eval_results_dicts(IS_PUBLIC)
86
 
87
- if not IS_PUBLIC:
88
- all_data.append(gpt4_values)
89
- all_data.append(gpt35_values)
 
90
 
91
- all_data.append(baseline)
92
- apply_metadata(all_data) # Populate model type based on known hardcoded values in `metadata.py`
 
 
 
 
 
 
 
93
 
94
- df = pd.DataFrame.from_records(all_data)
95
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
96
- df = df[COLS].round(decimals=2)
97
 
98
- # filter out if any of the benchmarks have not been produced
99
- df = df[has_no_nan_values(df, BENCHMARK_COLS)]
100
- return df
101
 
 
 
102
 
103
- def get_evaluation_queue_df():
104
- if eval_queue:
105
- print("Pulling changes for the evaluation queue.")
106
- eval_queue.git_pull()
107
- if eval_queue_private:
108
- print("Pulling changes for the evaluation queue.")
109
- eval_queue_private.git_pull()
110
 
111
- entries = [
112
- entry
113
- for entry in os.listdir(EVAL_REQUESTS_PATH)
114
- if not entry.startswith(".")
115
- ]
116
- all_evals = []
117
-
118
- for entry in entries:
119
- if ".json" in entry:
120
- file_path = os.path.join(EVAL_REQUESTS_PATH, entry)
121
- with open(file_path) as fp:
122
- data = json.load(fp)
123
-
124
- data["# params"] = "unknown"
125
- data["model"] = make_clickable_model(data["model"])
126
- data["revision"] = data.get("revision", "main")
127
-
128
- all_evals.append(data)
129
- elif ".md" not in entry:
130
- # this is a folder
131
- sub_entries = [
132
- e
133
- for e in os.listdir(f"{EVAL_REQUESTS_PATH}/{entry}")
134
- if not e.startswith(".")
135
- ]
136
- for sub_entry in sub_entries:
137
- file_path = os.path.join(EVAL_REQUESTS_PATH, entry, sub_entry)
138
- with open(file_path) as fp:
139
- data = json.load(fp)
140
-
141
- # data["# params"] = get_n_params(data["model"])
142
- data["model"] = make_clickable_model(data["model"])
143
- all_evals.append(data)
144
-
145
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
146
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
147
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
148
- df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
149
- df_running = pd.DataFrame.from_records(running_list, columns=EVAL_COLS)
150
- df_finished = pd.DataFrame.from_records(finished_list, columns=EVAL_COLS)
151
- return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
152
-
153
-
154
-
155
- original_df = get_leaderboard_df()
156
  leaderboard_df = original_df.copy()
157
  (
158
  finished_eval_queue_df,
159
  running_eval_queue_df,
160
  pending_eval_queue_df,
161
- ) = get_evaluation_queue_df()
162
-
163
- def is_model_on_hub(model_name, revision) -> bool:
164
- try:
165
- AutoConfig.from_pretrained(model_name, revision=revision)
166
- return True, None
167
-
168
- except ValueError as e:
169
- return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
170
-
171
- except Exception as e:
172
- print(f"Could not get the model config from the hub.: {e}")
173
- return False, "was not found on hub!"
174
 
175
 
 
176
  def add_new_eval(
177
  model: str,
178
  base_model: str,
@@ -185,6 +127,14 @@ def add_new_eval(
185
  precision = precision.split(" ")[0]
186
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
187
 
 
 
 
 
 
 
 
 
188
  if model_type is None or model_type == "":
189
  return styled_error("Please select a model type.")
190
 
@@ -196,13 +146,12 @@ def add_new_eval(
196
  base_model_on_hub, error = is_model_on_hub(base_model, revision)
197
  if not base_model_on_hub:
198
  return styled_error(f'Base model "{base_model}" {error}')
199
-
200
 
201
  if not weight_type == "Adapter":
202
  model_on_hub, error = is_model_on_hub(model, revision)
203
  if not model_on_hub:
204
  return styled_error(f'Model "{model}" {error}')
205
-
206
  print("adding new eval")
207
 
208
  eval_entry = {
@@ -227,8 +176,12 @@ def add_new_eval(
227
  os.makedirs(OUT_DIR, exist_ok=True)
228
  out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
229
 
 
 
 
 
230
  # Check for duplicate submission
231
- if out_path.split("eval-queue/")[1].lower() in requested_models:
232
  return styled_warning("This model has been already submitted.")
233
 
234
  with open(out_path, "w") as f:
@@ -238,7 +191,6 @@ def add_new_eval(
238
  path_or_fileobj=out_path,
239
  path_in_repo=out_path.split("eval-queue/")[1],
240
  repo_id=QUEUE_REPO,
241
- token=H4_TOKEN,
242
  repo_type="dataset",
243
  commit_message=f"Add {model} to eval queue",
244
  )
@@ -246,16 +198,19 @@ def add_new_eval(
246
  # remove the local file
247
  os.remove(out_path)
248
 
249
- return styled_message("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list.")
 
 
250
 
251
 
252
- def refresh():
253
- leaderboard_df = get_leaderboard_df()
 
254
  (
255
  finished_eval_queue_df,
256
  running_eval_queue_df,
257
  pending_eval_queue_df,
258
- ) = get_evaluation_queue_df()
259
  return (
260
  leaderboard_df,
261
  finished_eval_queue_df,
@@ -264,47 +219,68 @@ def refresh():
264
  )
265
 
266
 
267
- def search_table(df, leaderboard_table, query):
268
- if AutoEvalColumn.model_type.name in leaderboard_table.columns:
 
 
 
 
 
 
 
 
 
 
 
 
269
  filtered_df = df[
270
  (df[AutoEvalColumn.dummy.name].str.contains(query, case=False))
271
  | (df[AutoEvalColumn.model_type.name].str.contains(query, case=False))
272
- ]
273
  else:
274
  filtered_df = df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
275
- return filtered_df[leaderboard_table.columns]
276
 
277
 
278
- def select_columns(df, columns):
279
- always_here_cols = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
280
- # We use COLS to maintain sorting
281
- filtered_df = df[always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]]
 
 
 
 
 
282
  return filtered_df
283
 
284
- #TODO allow this to filter by values of any columns
285
- def filter_items(df, leaderboard_table, query):
286
- if query == "all":
287
- return df[leaderboard_table.columns]
288
- else:
289
- query = query[0] #take only the emoji character
290
- if AutoEvalColumn.model_type_symbol.name in leaderboard_table.columns:
291
- filtered_df = df[(df[AutoEvalColumn.model_type_symbol.name] == query)]
292
- else:
293
- return leaderboard_table.columns
294
- return filtered_df[leaderboard_table.columns]
295
-
296
- def change_tab(query_param):
297
- query_param = query_param.replace("'", '"')
298
- query_param = json.loads(query_param)
 
 
 
 
 
 
 
 
 
 
 
299
 
300
- if (
301
- isinstance(query_param, dict)
302
- and "tab" in query_param
303
- and query_param["tab"] == "evaluation"
304
- ):
305
- return gr.Tabs.update(selected=1)
306
- else:
307
- return gr.Tabs.update(selected=0)
308
 
309
 
310
  demo = gr.Blocks(css=custom_css)
@@ -315,34 +291,83 @@ with demo:
315
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
316
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
317
  with gr.Row():
318
- shown_columns = gr.CheckboxGroup(
319
- choices = [c for c in COLS if c not in [AutoEvalColumn.dummy.name, AutoEvalColumn.model.name, AutoEvalColumn.model_type_symbol.name]],
320
- value = [c for c in COLS_LITE if c not in [AutoEvalColumn.dummy.name, AutoEvalColumn.model.name, AutoEvalColumn.model_type_symbol.name]],
321
- label="Select columns to show",
322
- elem_id="column-select",
323
- interactive=True,
324
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  with gr.Column(min_width=320):
326
  search_bar = gr.Textbox(
327
- placeholder="🔍 Search for your model and press ENTER...",
328
  show_label=False,
329
  elem_id="search-bar",
330
  )
331
- filter_columns = gr.Radio(
332
- label="⏚ Filter model types",
333
- choices = [
334
- "all",
335
- ModelType.PT.to_str(),
336
- ModelType.FT.to_str(),
337
- ModelType.IFT.to_str(),
338
- ModelType.RL.to_str(),
339
- ],
340
- value="all",
341
- elem_id="filter-columns"
342
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  leaderboard_table = gr.components.Dataframe(
344
- value=leaderboard_df[[AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + shown_columns.value+ [AutoEvalColumn.dummy.name]],
345
- headers=[AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + shown_columns.value + [AutoEvalColumn.dummy.name],
 
 
 
 
 
 
 
 
 
346
  datatype=TYPES,
347
  max_rows=None,
348
  elem_id="leaderboard-table",
@@ -360,11 +385,55 @@ with demo:
360
  )
361
  search_bar.submit(
362
  search_table,
363
- [hidden_leaderboard_table_for_search, leaderboard_table, search_bar],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  leaderboard_table,
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  )
366
- shown_columns.change(select_columns, [hidden_leaderboard_table_for_search, shown_columns], leaderboard_table)
367
- filter_columns.change(filter_items, [hidden_leaderboard_table_for_search, leaderboard_table, filter_columns], leaderboard_table)
368
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
369
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
370
 
@@ -374,7 +443,10 @@ with demo:
374
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
375
 
376
  with gr.Column():
377
- with gr.Accordion(f"✅ 평가 완료 ({len(finished_eval_queue_df)})", open=False):
 
 
 
378
  with gr.Row():
379
  finished_eval_table = gr.components.Dataframe(
380
  value=finished_eval_queue_df,
@@ -382,7 +454,10 @@ with demo:
382
  datatype=EVAL_TYPES,
383
  max_rows=5,
384
  )
385
- with gr.Accordion(f"🔄 평가 진행 대기열 ({len(running_eval_queue_df)})", open=False):
 
 
 
386
  with gr.Row():
387
  running_eval_table = gr.components.Dataframe(
388
  value=running_eval_queue_df,
@@ -391,7 +466,10 @@ with demo:
391
  max_rows=5,
392
  )
393
 
394
- with gr.Accordion(f"⏳ 평가 대기 대기열 ({len(pending_eval_queue_df)})", open=False):
 
 
 
395
  with gr.Row():
396
  pending_eval_table = gr.components.Dataframe(
397
  value=pending_eval_queue_df,
@@ -405,20 +483,16 @@ with demo:
405
  with gr.Row():
406
  with gr.Column():
407
  model_name_textbox = gr.Textbox(label="Model name")
408
- revision_name_textbox = gr.Textbox(
409
- label="revision", placeholder="main"
410
- )
411
- private = gr.Checkbox(
412
- False, label="Private", visible=not IS_PUBLIC
413
- )
414
  model_type = gr.Dropdown(
415
- choices=[
416
  ModelType.PT.to_str(" : "),
417
  ModelType.FT.to_str(" : "),
418
  ModelType.IFT.to_str(" : "),
419
- ModelType.RL.to_str(" : "),
420
- ],
421
- label="Model type",
422
  multiselect=False,
423
  value=None,
424
  interactive=True,
@@ -426,22 +500,26 @@ with demo:
426
 
427
  with gr.Column():
428
  precision = gr.Dropdown(
429
- choices=["float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)"],
430
- label="Precision",
 
 
 
 
 
 
431
  multiselect=False,
432
  value="float16",
433
  interactive=True,
434
  )
435
  weight_type = gr.Dropdown(
436
  choices=["Original", "Delta", "Adapter"],
437
- label="Weights type",
438
  multiselect=False,
439
  value="Original",
440
  interactive=True,
441
  )
442
- base_model_name_textbox = gr.Textbox(
443
- label="Base model (for delta or adapter weights)"
444
- )
445
 
446
  submit_button = gr.Button("제출하고 평가받기")
447
  submission_result = gr.Markdown()
@@ -454,7 +532,7 @@ with demo:
454
  precision,
455
  private,
456
  weight_type,
457
- model_type
458
  ],
459
  submission_result,
460
  )
@@ -470,6 +548,7 @@ with demo:
470
  running_eval_table,
471
  pending_eval_table,
472
  ],
 
473
  )
474
 
475
  with gr.Row():
 
2
  import os
3
  from datetime import datetime, timezone
4
 
 
5
  import gradio as gr
 
6
  import pandas as pd
7
  from apscheduler.schedulers.background import BackgroundScheduler
8
  from huggingface_hub import HfApi
 
9
 
 
 
 
 
10
  from src.assets.css_html_js import custom_css, get_window_url_params
11
+ from src.assets.text_content import (
12
+ CITATION_BUTTON_LABEL,
13
+ CITATION_BUTTON_TEXT,
14
+ EVALUATION_QUEUE_TEXT,
15
+ INTRODUCTION_TEXT,
16
+ LLM_BENCHMARKS_TEXT,
17
+ TITLE,
18
+ )
19
+ from src.display_models.get_model_metadata import DO_NOT_SUBMIT_MODELS, ModelType
20
+ from src.display_models.utils import (
21
+ AutoEvalColumn,
22
+ EvalQueueColumn,
23
+ fields,
24
+ styled_error,
25
+ styled_message,
26
+ styled_warning,
27
+ )
28
+ from src.load_from_hub import get_evaluation_queue_df, get_leaderboard_df, is_model_on_hub, load_all_info_from_hub
29
+ from src.rate_limiting import user_submission_permission
30
+
31
+ pd.set_option("display.precision", 1)
32
 
33
  # clone / pull the lmeh eval data
34
  H4_TOKEN = os.environ.get("H4_TOKEN", None)
 
47
  EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
48
  EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
49
 
50
+ api = HfApi(token=H4_TOKEN)
51
 
 
 
 
 
52
 
53
+ def restart_space():
54
+ api.restart_space(repo_id="BearSean/leaderboard-test", token=H4_TOKEN)
55
 
56
+ # Rate limit variables
57
+ RATE_LIMIT_PERIOD = 7
58
+ RATE_LIMIT_QUOTA = 5
 
59
 
60
+ # Column selection
61
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
62
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
63
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
70
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
71
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
72
 
73
+ BENCHMARK_COLS = [
74
+ c.name
75
+ for c in [
76
+ AutoEvalColumn.arc,
77
+ AutoEvalColumn.hellaswag,
78
+ AutoEvalColumn.mmlu,
79
+ AutoEvalColumn.truthfulqa,
80
+ ]
81
+ ]
 
 
 
 
 
 
 
 
 
 
 
82
 
83
+ ## LOAD INFO FROM HUB
84
+ eval_queue, requested_models, eval_results, users_to_submission_dates = load_all_info_from_hub(
85
+ QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
86
+ )
87
 
88
+ if not IS_PUBLIC:
89
+ (eval_queue_private, requested_models_private, eval_results_private, _) = load_all_info_from_hub(
90
+ PRIVATE_QUEUE_REPO,
91
+ PRIVATE_RESULTS_REPO,
92
+ EVAL_REQUESTS_PATH_PRIVATE,
93
+ EVAL_RESULTS_PATH_PRIVATE,
94
+ )
95
+ else:
96
+ eval_queue_private, eval_results_private = None, None
97
 
98
+ original_df = get_leaderboard_df(eval_results, eval_results_private, COLS, BENCHMARK_COLS)
99
+ models = original_df["model_name_for_query"].tolist() # needed for the models' backlinks to the leaderboard
 
100
 
101
+ # Commented out because it causes infinite restart loops when running locally
102
+ # to_be_dumped = f"models = {repr(models)}\n"
 
103
 
104
+ # with open("models_backlinks.py", "w") as f:
105
+ # f.write(to_be_dumped)
106
 
107
+ # print(to_be_dumped)
 
 
 
 
 
 
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  leaderboard_df = original_df.copy()
110
  (
111
  finished_eval_queue_df,
112
  running_eval_queue_df,
113
  pending_eval_queue_df,
114
+ ) = get_evaluation_queue_df(eval_queue, eval_queue_private, EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
 
117
+ ## INTERACTION FUNCTIONS
118
  def add_new_eval(
119
  model: str,
120
  base_model: str,
 
127
  precision = precision.split(" ")[0]
128
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
129
 
130
+ num_models_submitted_in_period = user_submission_permission(model, users_to_submission_dates, RATE_LIMIT_PERIOD)
131
+ if num_models_submitted_in_period > RATE_LIMIT_QUOTA:
132
+ error_msg = f"Organisation or user `{model.split('/')[0]}`"
133
+ error_msg += f"already has {num_models_submitted_in_period} model requests submitted to the leaderboard "
134
+ error_msg += f"in the last {RATE_LIMIT_PERIOD} days.\n"
135
+ error_msg += "Please wait a couple of days before resubmitting, so that everybody can enjoy using the leaderboard 🤗"
136
+ return styled_error(error_msg)
137
+
138
  if model_type is None or model_type == "":
139
  return styled_error("Please select a model type.")
140
 
 
146
  base_model_on_hub, error = is_model_on_hub(base_model, revision)
147
  if not base_model_on_hub:
148
  return styled_error(f'Base model "{base_model}" {error}')
 
149
 
150
  if not weight_type == "Adapter":
151
  model_on_hub, error = is_model_on_hub(model, revision)
152
  if not model_on_hub:
153
  return styled_error(f'Model "{model}" {error}')
154
+
155
  print("adding new eval")
156
 
157
  eval_entry = {
 
176
  os.makedirs(OUT_DIR, exist_ok=True)
177
  out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
178
 
179
+ # Check if the model has been forbidden:
180
+ if out_path.split("eval-queue/")[1] in DO_NOT_SUBMIT_MODELS:
181
+ return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
182
+
183
  # Check for duplicate submission
184
+ if f"{model}_{revision}_{precision}" in requested_models:
185
  return styled_warning("This model has been already submitted.")
186
 
187
  with open(out_path, "w") as f:
 
191
  path_or_fileobj=out_path,
192
  path_in_repo=out_path.split("eval-queue/")[1],
193
  repo_id=QUEUE_REPO,
 
194
  repo_type="dataset",
195
  commit_message=f"Add {model} to eval queue",
196
  )
 
198
  # remove the local file
199
  os.remove(out_path)
200
 
201
+ return styled_message(
202
+ "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
203
+ )
204
 
205
 
206
+ # Basics
207
+ def refresh() -> list[pd.DataFrame]:
208
+ leaderboard_df = get_leaderboard_df(eval_results, eval_results_private, COLS, BENCHMARK_COLS)
209
  (
210
  finished_eval_queue_df,
211
  running_eval_queue_df,
212
  pending_eval_queue_df,
213
+ ) = get_evaluation_queue_df(eval_queue, eval_queue_private, EVAL_REQUESTS_PATH, EVAL_COLS)
214
  return (
215
  leaderboard_df,
216
  finished_eval_queue_df,
 
219
  )
220
 
221
 
222
+ def change_tab(query_param: str):
223
+ query_param = query_param.replace("'", '"')
224
+ query_param = json.loads(query_param)
225
+
226
+ if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "evaluation":
227
+ return gr.Tabs.update(selected=1)
228
+ else:
229
+ return gr.Tabs.update(selected=0)
230
+
231
+
232
+ # Searching and filtering
233
+ def search_table(df: pd.DataFrame, current_columns_df: pd.DataFrame, query: str) -> pd.DataFrame:
234
+ current_columns = current_columns_df.columns
235
+ if AutoEvalColumn.model_type.name in current_columns:
236
  filtered_df = df[
237
  (df[AutoEvalColumn.dummy.name].str.contains(query, case=False))
238
  | (df[AutoEvalColumn.model_type.name].str.contains(query, case=False))
239
+ ]
240
  else:
241
  filtered_df = df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
242
+ return filtered_df[current_columns]
243
 
244
 
245
+ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
246
+ always_here_cols = [
247
+ AutoEvalColumn.model_type_symbol.name,
248
+ AutoEvalColumn.model.name,
249
+ ]
250
+ # We use COLS to maintain sorting
251
+ filtered_df = df[
252
+ always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
253
+ ]
254
  return filtered_df
255
 
256
+ NUMERIC_INTERVALS = {
257
+ "< 1.5B": (0, 1.5),
258
+ "~3B": (1.5, 5),
259
+ "~7B": (6, 11),
260
+ "~13B": (12, 15),
261
+ "~35B": (16, 55),
262
+ "60B+": (55, 10000),
263
+ }
264
+
265
+ def filter_models(
266
+ df: pd.DataFrame, current_columns_df: pd.DataFrame, type_query: list, size_query: list, show_deleted: bool
267
+ ) -> pd.DataFrame:
268
+ current_columns = current_columns_df.columns
269
+
270
+ # Show all models
271
+ if show_deleted:
272
+ filtered_df = df[current_columns]
273
+ else: # Show only models that are still on the hub
274
+ filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True][current_columns]
275
+
276
+ type_emoji = [t[0] for t in type_query]
277
+ filtered_df = filtered_df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
278
+
279
+ numeric_interval = [NUMERIC_INTERVALS[s] for s in size_query]
280
+ params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
281
+ filtered_df = filtered_df[params_column.between(numeric_interval[0][0], numeric_interval[-1][1])]
282
 
283
+ return filtered_df
 
 
 
 
 
 
 
284
 
285
 
286
  demo = gr.Blocks(css=custom_css)
 
291
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
292
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
293
  with gr.Row():
294
+ with gr.Column():
295
+ with gr.Row():
296
+ shown_columns = gr.CheckboxGroup(
297
+ choices=[
298
+ c
299
+ for c in COLS
300
+ if c
301
+ not in [
302
+ AutoEvalColumn.dummy.name,
303
+ AutoEvalColumn.model.name,
304
+ AutoEvalColumn.model_type_symbol.name,
305
+ AutoEvalColumn.still_on_hub.name,
306
+ ]
307
+ ],
308
+ value=[
309
+ c
310
+ for c in COLS_LITE
311
+ if c
312
+ not in [
313
+ AutoEvalColumn.dummy.name,
314
+ AutoEvalColumn.model.name,
315
+ AutoEvalColumn.model_type_symbol.name,
316
+ AutoEvalColumn.still_on_hub.name,
317
+ ]
318
+ ],
319
+ label="Select columns to show",
320
+ elem_id="column-select",
321
+ interactive=True,
322
+ )
323
+ with gr.Row():
324
+ deleted_models_visibility = gr.Checkbox(
325
+ value=True, label="Show gated/private/deleted models", interactive=True
326
+ )
327
  with gr.Column(min_width=320):
328
  search_bar = gr.Textbox(
329
+ placeholder="🔍 찾고자 하는 모델 명을 입력하세요",
330
  show_label=False,
331
  elem_id="search-bar",
332
  )
333
+ with gr.Box(elem_id="box-filter"):
334
+ filter_columns_type = gr.CheckboxGroup(
335
+ label="Model types",
336
+ choices=[
337
+ ModelType.PT.to_str(),
338
+ ModelType.FT.to_str(),
339
+ ModelType.IFT.to_str(),
340
+ ModelType.RL.to_str(),
341
+ ],
342
+ value=[
343
+ ModelType.PT.to_str(),
344
+ ModelType.FT.to_str(),
345
+ ModelType.IFT.to_str(),
346
+ ModelType.RL.to_str(),
347
+ ],
348
+ interactive=True,
349
+ elem_id="filter-columns-type",
350
+ )
351
+ filter_columns_size = gr.CheckboxGroup(
352
+ label="Model sizes",
353
+ choices=list(NUMERIC_INTERVALS.keys()),
354
+ value=list(NUMERIC_INTERVALS.keys()),
355
+ interactive=True,
356
+ elem_id="filter-columns-size",
357
+ )
358
+
359
  leaderboard_table = gr.components.Dataframe(
360
+ value=leaderboard_df[
361
+ [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
362
+ + shown_columns.value
363
+ + [AutoEvalColumn.dummy.name]
364
+ ],
365
+ headers=[
366
+ AutoEvalColumn.model_type_symbol.name,
367
+ AutoEvalColumn.model.name,
368
+ ]
369
+ + shown_columns.value
370
+ + [AutoEvalColumn.dummy.name],
371
  datatype=TYPES,
372
  max_rows=None,
373
  elem_id="leaderboard-table",
 
385
  )
386
  search_bar.submit(
387
  search_table,
388
+ [
389
+ hidden_leaderboard_table_for_search,
390
+ leaderboard_table,
391
+ search_bar,
392
+ ],
393
+ leaderboard_table,
394
+ )
395
+ shown_columns.change(
396
+ select_columns,
397
+ [hidden_leaderboard_table_for_search, shown_columns],
398
+ leaderboard_table,
399
+ queue=False,
400
+ )
401
+ filter_columns_type.change(
402
+ filter_models,
403
+ [
404
+ hidden_leaderboard_table_for_search,
405
+ leaderboard_table,
406
+ filter_columns_type,
407
+ filter_columns_size,
408
+ deleted_models_visibility,
409
+ ],
410
+ leaderboard_table,
411
+ queue=False,
412
+ )
413
+ filter_columns_size.change(
414
+ filter_models,
415
+ [
416
+ hidden_leaderboard_table_for_search,
417
+ leaderboard_table,
418
+ filter_columns_type,
419
+ filter_columns_size,
420
+ deleted_models_visibility,
421
+ ],
422
  leaderboard_table,
423
+ queue=False,
424
+ )
425
+ deleted_models_visibility.change(
426
+ filter_models,
427
+ [
428
+ hidden_leaderboard_table_for_search,
429
+ leaderboard_table,
430
+ filter_columns_type,
431
+ filter_columns_size,
432
+ deleted_models_visibility,
433
+ ],
434
+ leaderboard_table,
435
+ queue=False,
436
  )
 
 
437
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
438
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
439
 
 
443
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
444
 
445
  with gr.Column():
446
+ with gr.Accordion(
447
+ f"✅ 평가 완료 ({len(finished_eval_queue_df)})",
448
+ open=False,
449
+ ):
450
  with gr.Row():
451
  finished_eval_table = gr.components.Dataframe(
452
  value=finished_eval_queue_df,
 
454
  datatype=EVAL_TYPES,
455
  max_rows=5,
456
  )
457
+ with gr.Accordion(
458
+ f"🔄 평가 진행 대기열 ({len(running_eval_queue_df)})",
459
+ open=False,
460
+ ):
461
  with gr.Row():
462
  running_eval_table = gr.components.Dataframe(
463
  value=running_eval_queue_df,
 
466
  max_rows=5,
467
  )
468
 
469
+ with gr.Accordion(
470
+ f"⏳ 평가 대기 대기열 ({len(pending_eval_queue_df)})",
471
+ open=False,
472
+ ):
473
  with gr.Row():
474
  pending_eval_table = gr.components.Dataframe(
475
  value=pending_eval_queue_df,
 
483
  with gr.Row():
484
  with gr.Column():
485
  model_name_textbox = gr.Textbox(label="Model name")
486
+ revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
487
+ private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
 
 
 
 
488
  model_type = gr.Dropdown(
489
+ choices=[
490
  ModelType.PT.to_str(" : "),
491
  ModelType.FT.to_str(" : "),
492
  ModelType.IFT.to_str(" : "),
493
+ ModelType.RL.to_str(" : "),
494
+ ],
495
+ label="Model type",
496
  multiselect=False,
497
  value=None,
498
  interactive=True,
 
500
 
501
  with gr.Column():
502
  precision = gr.Dropdown(
503
+ choices=[
504
+ "float16",
505
+ "bfloat16",
506
+ "8bit (LLM.int8)",
507
+ "4bit (QLoRA / FP4)",
508
+ "GPTQ"
509
+ ],
510
+ label="Precision",
511
  multiselect=False,
512
  value="float16",
513
  interactive=True,
514
  )
515
  weight_type = gr.Dropdown(
516
  choices=["Original", "Delta", "Adapter"],
517
+ label="Weights type",
518
  multiselect=False,
519
  value="Original",
520
  interactive=True,
521
  )
522
+ base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
 
 
523
 
524
  submit_button = gr.Button("제출하고 평가받기")
525
  submission_result = gr.Markdown()
 
532
  precision,
533
  private,
534
  weight_type,
535
+ model_type,
536
  ],
537
  submission_result,
538
  )
 
548
  running_eval_table,
549
  pending_eval_table,
550
  ],
551
+ api_name='refresh'
552
  )
553
 
554
  with gr.Row():
models_backlinks.py ADDED
@@ -0,0 +1 @@
 
 
1
+ models = ['upstage/Llama-2-70b-instruct-v2', 'upstage/Llama-2-70b-instruct', 'upstage/llama-65b-instruct', 'upstage/llama-65b-instruct', 'upstage/llama-30b-instruct-2048', 'upstage/llama-30b-instruct', 'baseline']
pyproject.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
requirements.txt CHANGED
@@ -24,7 +24,7 @@ gradio_client==0.1.3
24
  h11==0.14.0
25
  httpcore==0.17.0
26
  httpx==0.24.0
27
- huggingface-hub==0.13.4
28
  idna==3.4
29
  Jinja2==3.1.2
30
  jsonschema==4.17.3
@@ -59,7 +59,7 @@ sniffio==1.3.0
59
  starlette==0.26.1
60
  toolz==0.12.0
61
  tqdm==4.65.0
62
- transformers==4.28.1
63
  typing_extensions==4.5.0
64
  tzdata==2023.3
65
  tzlocal==4.3
 
24
  h11==0.14.0
25
  httpcore==0.17.0
26
  httpx==0.24.0
27
+ huggingface-hub==0.16.4
28
  idna==3.4
29
  Jinja2==3.1.2
30
  jsonschema==4.17.3
 
59
  starlette==0.26.1
60
  toolz==0.12.0
61
  tqdm==4.65.0
62
+ transformers==4.32.0
63
  typing_extensions==4.5.0
64
  tzdata==2023.3
65
  tzlocal==4.3
src/assets/css_html_js.py CHANGED
@@ -1,11 +1,4 @@
1
  custom_css = """
2
- #changelog-text {
3
- font-size: 16px !important;
4
- }
5
-
6
- #changelog-text h2 {
7
- font-size: 18px !important;
8
- }
9
 
10
  .markdown-text {
11
  font-size: 16px !important;
@@ -75,6 +68,38 @@ table th:first-child {
75
  #scale-logo .download {
76
  display: none;
77
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  """
79
 
80
  get_window_url_params = """
 
1
  custom_css = """
 
 
 
 
 
 
 
2
 
3
  .markdown-text {
4
  font-size: 16px !important;
 
68
  #scale-logo .download {
69
  display: none;
70
  }
71
+ #filter_type{
72
+ border: 0;
73
+ padding-left: 0;
74
+ padding-top: 0;
75
+ }
76
+ #filter_type label {
77
+ display: flex;
78
+ }
79
+ #filter_type label > span{
80
+ margin-top: var(--spacing-lg);
81
+ margin-right: 0.5em;
82
+ }
83
+ #filter_type label > .wrap{
84
+ width: 103px;
85
+ }
86
+ #filter_type label > .wrap .wrap-inner{
87
+ padding: 2px;
88
+ }
89
+ #filter_type label > .wrap .wrap-inner input{
90
+ width: 1px
91
+ }
92
+ #filter-columns-type{
93
+ border:0;
94
+ padding:0.5;
95
+ }
96
+ #filter-columns-size{
97
+ border:0;
98
+ padding:0.5;
99
+ }
100
+ #box-filter > .form{
101
+ border: 0
102
+ }
103
  """
104
 
105
  get_window_url_params = """
src/assets/hardcoded_evals.py CHANGED
@@ -1,4 +1,4 @@
1
- from src.utils_display import AutoEvalColumn, model_hyperlink
2
 
3
  gpt4_values = {
4
  AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
@@ -6,9 +6,9 @@ gpt4_values = {
6
  AutoEvalColumn.precision.name: None,
7
  AutoEvalColumn.average.name: 84.3,
8
  AutoEvalColumn.arc.name: 96.3,
9
- AutoEvalColumn.hellaswag.name: 95.3,
10
- AutoEvalColumn.mmlu.name: 86.4,
11
- AutoEvalColumn.truthfulqa.name: 59.0,
12
  AutoEvalColumn.dummy.name: "GPT-4",
13
  AutoEvalColumn.model_type.name: "",
14
  }
@@ -19,9 +19,9 @@ gpt35_values = {
19
  AutoEvalColumn.precision.name: None,
20
  AutoEvalColumn.average.name: 71.9,
21
  AutoEvalColumn.arc.name: 85.2,
22
- AutoEvalColumn.hellaswag.name: 85.5,
23
- AutoEvalColumn.mmlu.name: 70.0,
24
- AutoEvalColumn.truthfulqa.name: 47.0,
25
  AutoEvalColumn.dummy.name: "GPT-3.5",
26
  AutoEvalColumn.model_type.name: "",
27
  }
@@ -32,10 +32,9 @@ baseline = {
32
  AutoEvalColumn.precision.name: None,
33
  AutoEvalColumn.average.name: 25.0,
34
  AutoEvalColumn.arc.name: 25.0,
35
- AutoEvalColumn.hellaswag.name: 25.0,
36
- AutoEvalColumn.mmlu.name: 25.0,
37
- AutoEvalColumn.truthfulqa.name: 25.0,
38
  AutoEvalColumn.dummy.name: "baseline",
39
  AutoEvalColumn.model_type.name: "",
40
  }
41
-
 
1
+ from src.display_models.utils import AutoEvalColumn, model_hyperlink
2
 
3
  gpt4_values = {
4
  AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
 
6
  AutoEvalColumn.precision.name: None,
7
  AutoEvalColumn.average.name: 84.3,
8
  AutoEvalColumn.arc.name: 96.3,
9
+ AutoEvalColumn.hellaswag.name: 95.3,
10
+ AutoEvalColumn.mmlu.name: 86.4,
11
+ AutoEvalColumn.truthfulqa.name: 59.0,
12
  AutoEvalColumn.dummy.name: "GPT-4",
13
  AutoEvalColumn.model_type.name: "",
14
  }
 
19
  AutoEvalColumn.precision.name: None,
20
  AutoEvalColumn.average.name: 71.9,
21
  AutoEvalColumn.arc.name: 85.2,
22
+ AutoEvalColumn.hellaswag.name: 85.5,
23
+ AutoEvalColumn.mmlu.name: 70.0,
24
+ AutoEvalColumn.truthfulqa.name: 47.0,
25
  AutoEvalColumn.dummy.name: "GPT-3.5",
26
  AutoEvalColumn.model_type.name: "",
27
  }
 
32
  AutoEvalColumn.precision.name: None,
33
  AutoEvalColumn.average.name: 25.0,
34
  AutoEvalColumn.arc.name: 25.0,
35
+ AutoEvalColumn.hellaswag.name: 25.0,
36
+ AutoEvalColumn.mmlu.name: 25.0,
37
+ AutoEvalColumn.truthfulqa.name: 25.0,
38
  AutoEvalColumn.dummy.name: "baseline",
39
  AutoEvalColumn.model_type.name: "",
40
  }
 
src/assets/text_content.py CHANGED
@@ -1,60 +1,4 @@
1
- from ..auto_leaderboard.model_metadata_type import ModelType
2
-
3
- CHANGELOG_TEXT = f"""
4
- ## [2023-06-19]
5
- - Added model type column
6
- - Hid revision and 8bit columns since all models are the same atm
7
-
8
- ## [2023-06-16]
9
- - Refactored code base
10
- - Added new columns: number of parameters, hub likes, license
11
-
12
- ## [2023-06-13]
13
- - Adjust description for TruthfulQA
14
-
15
- ## [2023-06-12]
16
- - Add Human & GPT-4 Evaluations
17
-
18
- ## [2023-06-05]
19
- - Increase concurrent thread count to 40
20
- - Search models on ENTER
21
-
22
- ## [2023-06-02]
23
- - Add a typeahead search bar
24
- - Use webhooks to automatically spawn a new Space when someone opens a PR
25
- - Start recording `submitted_time` for eval requests
26
- - Limit AutoEvalColumn max-width
27
-
28
- ## [2023-05-30]
29
- - Add a citation button
30
- - Simplify Gradio layout
31
-
32
- ## [2023-05-29]
33
- - Auto-restart every hour for the latest results
34
- - Sync with the internal version (minor style changes)
35
-
36
- ## [2023-05-24]
37
- - Add a baseline that has 25.0 for all values
38
- - Add CHANGELOG
39
-
40
- ## [2023-05-23]
41
- - Fix a CSS issue that made the leaderboard hard to read in dark mode
42
-
43
- ## [2023-05-22]
44
- - Display a success/error message after submitting evaluation requests
45
- - Reject duplicate submission
46
- - Do not display results that have incomplete results
47
- - Display different queues for jobs that are RUNNING, PENDING, FINISHED status
48
-
49
- ## [2023-05-15]
50
- - Fix a typo: from "TruthQA" to "QA"
51
-
52
- ## [2023-05-10]
53
- - Fix a bug that prevented auto-refresh
54
-
55
- ## [2023-05-10]
56
- - Release the leaderboard to public
57
- """
58
 
59
  TITLE = """<h1 align="center" id="space-title">🚀 Open Ko-LLM Leaderboard</h1>"""
60
 
@@ -70,7 +14,7 @@ INTRODUCTION_TEXT = f"""
70
 
71
  LLM_BENCHMARKS_TEXT = f"""
72
  # Context
73
- 뛰어난 LLM 모델들이 앞다투어 공개되고 있지만 이는 대부분 영어 중심의, 영어 문화권에 익숙한 모델입니다. 저희는 한국어 리더보드 🚀 Open Ko-LLM을 운영하여 한국어와 한국 문화의 특성을 반영한 모델을 평가하고자 합니다. 이를 통해 한국어 사용자들이 편리하게 리더보드를 이용하고 참여하여 한국의 연구 수준 향상에 기여할 수 있기를 바랍니다.
74
 
75
  ## Icons
76
  {ModelType.PT.to_str(" : ")} model
@@ -122,7 +66,7 @@ To get more information about quantization, see:
122
  """
123
 
124
  EVALUATION_QUEUE_TEXT = f"""
125
- # 🚀 Open-Ko LLM 리더보드의 평가 큐입니다.
126
  이곳에 추가된 모델들은 곧 자동적으로 KT의 GPU 위에서 평가될 예정입니다!
127
 
128
  ## <모델 제출 전 확인하면 좋은 것들>
 
1
+ from src.display_models.model_metadata_type import ModelType
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  TITLE = """<h1 align="center" id="space-title">🚀 Open Ko-LLM Leaderboard</h1>"""
4
 
 
14
 
15
  LLM_BENCHMARKS_TEXT = f"""
16
  # Context
17
+ 뛰어난 LLM 모델들이 앞다투어 공개되고 있지만 이는 대부분 영어 중심의, 영어 문화권에 익숙한 모델입니다. 저희는 한국어 리더보드 🚀 Open Ko-LLM을 운영하여 한국어와 한국 문화의 특성을 반영한 모델을 평가하고자 합니다. 이를 통해 한국어 사용자들이 편리하게 리더보드를 이용하고 참여하여 한국의 연구 수준 향상에 기여할 수 있기를 바랍니다.
18
 
19
  ## Icons
20
  {ModelType.PT.to_str(" : ")} model
 
66
  """
67
 
68
  EVALUATION_QUEUE_TEXT = f"""
69
+ # 🚀 Open-Ko LLM 리더보드의 평가 큐입니다.
70
  이곳에 추가된 모델들은 곧 자동적으로 KT의 GPU 위에서 평가될 예정입니다!
71
 
72
  ## <모델 제출 전 확인하면 좋은 것들>
src/auto_leaderboard/model_metadata_type.py DELETED
@@ -1,597 +0,0 @@
1
- from dataclasses import dataclass
2
- from enum import Enum
3
- import glob
4
- import json
5
- import os
6
- from typing import Dict, List
7
-
8
- from ..utils_display import AutoEvalColumn
9
-
10
- @dataclass
11
- class ModelInfo:
12
- name: str
13
- symbol: str # emoji
14
-
15
-
16
- class ModelType(Enum):
17
- PT = ModelInfo(name="pretrained", symbol="🟢")
18
- FT = ModelInfo(name="fine-tuned", symbol="🔶")
19
- IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
20
- RL = ModelInfo(name="RL-tuned", symbol="🟦")
21
- Unknown = ModelInfo(name="Unknown, add type to request file!", symbol="?")
22
-
23
- def to_str(self, separator = " "):
24
- return f"{self.value.symbol}{separator}{self.value.name}"
25
-
26
-
27
- TYPE_METADATA: Dict[str, ModelType] = {
28
- 'notstoic/PygmalionCoT-7b': ModelType.IFT,
29
- 'aisquared/dlite-v1-355m': ModelType.IFT,
30
- 'aisquared/dlite-v1-1_5b': ModelType.IFT,
31
- 'aisquared/dlite-v1-774m': ModelType.IFT,
32
- 'aisquared/dlite-v1-124m': ModelType.IFT,
33
- 'aisquared/chopt-2_7b': ModelType.IFT,
34
- 'aisquared/dlite-v2-124m': ModelType.IFT,
35
- 'aisquared/dlite-v2-774m': ModelType.IFT,
36
- 'aisquared/dlite-v2-1_5b': ModelType.IFT,
37
- 'aisquared/chopt-1_3b': ModelType.IFT,
38
- 'aisquared/dlite-v2-355m': ModelType.IFT,
39
- 'augtoma/qCammel-13': ModelType.IFT,
40
- 'Aspik101/Llama-2-7b-hf-instruct-pl-lora_unload': ModelType.IFT,
41
- 'Aspik101/vicuna-7b-v1.3-instruct-pl-lora_unload': ModelType.IFT,
42
- 'TheBloke/alpaca-lora-65B-HF': ModelType.FT,
43
- 'TheBloke/tulu-7B-fp16': ModelType.IFT,
44
- 'TheBloke/guanaco-7B-HF': ModelType.FT,
45
- 'TheBloke/koala-7B-HF': ModelType.FT,
46
- 'TheBloke/wizardLM-7B-HF': ModelType.IFT,
47
- 'TheBloke/airoboros-13B-HF': ModelType.IFT,
48
- 'TheBloke/koala-13B-HF': ModelType.FT,
49
- 'TheBloke/Wizard-Vicuna-7B-Uncensored-HF': ModelType.FT,
50
- 'TheBloke/dromedary-65b-lora-HF': ModelType.IFT,
51
- 'TheBloke/wizardLM-13B-1.0-fp16': ModelType.IFT,
52
- 'TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16': ModelType.FT,
53
- 'TheBloke/Wizard-Vicuna-30B-Uncensored-fp16': ModelType.FT,
54
- 'TheBloke/wizard-vicuna-13B-HF': ModelType.IFT,
55
- 'TheBloke/UltraLM-13B-fp16': ModelType.IFT,
56
- 'TheBloke/OpenAssistant-FT-7-Llama-30B-HF': ModelType.FT,
57
- 'TheBloke/vicuna-13B-1.1-HF': ModelType.IFT,
58
- 'TheBloke/guanaco-13B-HF': ModelType.FT,
59
- 'TheBloke/guanaco-65B-HF': ModelType.FT,
60
- 'TheBloke/airoboros-7b-gpt4-fp16': ModelType.IFT,
61
- 'TheBloke/llama-30b-supercot-SuperHOT-8K-fp16': ModelType.IFT,
62
- 'TheBloke/Llama-2-13B-fp16': ModelType.PT,
63
- 'TheBloke/llama-2-70b-Guanaco-QLoRA-fp16': ModelType.FT,
64
- 'TheBloke/landmark-attention-llama7b-fp16': ModelType.IFT,
65
- 'TheBloke/Planner-7B-fp16': ModelType.IFT,
66
- 'TheBloke/Wizard-Vicuna-13B-Uncensored-HF': ModelType.FT,
67
- 'TheBloke/gpt4-alpaca-lora-13B-HF': ModelType.IFT,
68
- 'TheBloke/gpt4-x-vicuna-13B-HF': ModelType.IFT,
69
- 'TheBloke/gpt4-alpaca-lora_mlp-65B-HF': ModelType.IFT,
70
- 'TheBloke/tulu-13B-fp16': ModelType.IFT,
71
- 'TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16': ModelType.IFT,
72
- 'TheBloke/Llama-2-70B-fp16': ModelType.IFT,
73
- 'TheBloke/WizardLM-30B-fp16': ModelType.IFT,
74
- 'TheBloke/robin-13B-v2-fp16': ModelType.FT,
75
- 'TheBloke/robin-33B-v2-fp16': ModelType.FT,
76
- 'TheBloke/Vicuna-13B-CoT-fp16': ModelType.IFT,
77
- 'TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16': ModelType.IFT,
78
- 'TheBloke/Wizard-Vicuna-30B-Superhot-8K-fp16': ModelType.FT,
79
- 'TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16': ModelType.IFT,
80
- 'TheBloke/GPlatty-30B-SuperHOT-8K-fp16': ModelType.FT,
81
- 'TheBloke/CAMEL-33B-Combined-Data-SuperHOT-8K-fp16': ModelType.IFT,
82
- 'TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16': ModelType.IFT,
83
- 'jphme/orca_mini_v2_ger_7b': ModelType.IFT,
84
- 'Ejafa/vicuna_7B_vanilla_1.1': ModelType.FT,
85
- 'kevinpro/Vicuna-13B-CoT': ModelType.IFT,
86
- 'AlekseyKorshuk/pygmalion-6b-vicuna-chatml': ModelType.FT,
87
- 'AlekseyKorshuk/chatml-pyg-v1': ModelType.FT,
88
- 'concedo/Vicuzard-30B-Uncensored': ModelType.FT,
89
- 'concedo/OPT-19M-ChatSalad': ModelType.FT,
90
- 'concedo/Pythia-70M-ChatSalad': ModelType.FT,
91
- 'digitous/13B-HyperMantis': ModelType.IFT,
92
- 'digitous/Adventien-GPTJ': ModelType.FT,
93
- 'digitous/Alpacino13b': ModelType.IFT,
94
- 'digitous/GPT-R': ModelType.IFT,
95
- 'digitous/Javelin-R': ModelType.IFT,
96
- 'digitous/Javalion-GPTJ': ModelType.IFT,
97
- 'digitous/Javalion-R': ModelType.IFT,
98
- 'digitous/Skegma-GPTJ': ModelType.FT,
99
- 'digitous/Alpacino30b': ModelType.IFT,
100
- 'digitous/Janin-GPTJ': ModelType.FT,
101
- 'digitous/Janin-R': ModelType.FT,
102
- 'digitous/Javelin-GPTJ': ModelType.FT,
103
- 'SaylorTwift/gpt2_test': ModelType.PT,
104
- 'anton-l/gpt-j-tiny-random': ModelType.FT,
105
- 'Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca': ModelType.FT,
106
- 'Lazycuber/pyg-instruct-wizardlm': ModelType.FT,
107
- 'Lazycuber/Janemalion-6B': ModelType.FT,
108
- 'IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1': ModelType.FT,
109
- 'IDEA-CCNL/Ziya-LLaMA-13B-v1': ModelType.IFT,
110
- 'dsvv-cair/alpaca-cleaned-llama-30b-bf16': ModelType.FT,
111
- 'gpt2-medium': ModelType.PT,
112
- 'camel-ai/CAMEL-13B-Combined-Data': ModelType.IFT,
113
- 'camel-ai/CAMEL-13B-Role-Playing-Data': ModelType.FT,
114
- 'camel-ai/CAMEL-33B-Combined-Data': ModelType.IFT,
115
- 'PygmalionAI/pygmalion-6b': ModelType.FT,
116
- 'PygmalionAI/metharme-1.3b': ModelType.IFT,
117
- 'PygmalionAI/pygmalion-1.3b': ModelType.FT,
118
- 'PygmalionAI/pygmalion-350m': ModelType.FT,
119
- 'PygmalionAI/pygmalion-2.7b': ModelType.FT,
120
- 'medalpaca/medalpaca-7b': ModelType.FT,
121
- 'lilloukas/Platypus-30B': ModelType.IFT,
122
- 'lilloukas/GPlatty-30B': ModelType.FT,
123
- 'mncai/chatdoctor': ModelType.FT,
124
- 'chaoyi-wu/MedLLaMA_13B': ModelType.FT,
125
- 'LoupGarou/WizardCoder-Guanaco-15B-V1.0': ModelType.IFT,
126
- 'LoupGarou/WizardCoder-Guanaco-15B-V1.1': ModelType.FT,
127
- 'hakurei/instruct-12b': ModelType.IFT,
128
- 'hakurei/lotus-12B': ModelType.FT,
129
- 'shibing624/chinese-llama-plus-13b-hf': ModelType.IFT,
130
- 'shibing624/chinese-alpaca-plus-7b-hf': ModelType.IFT,
131
- 'shibing624/chinese-alpaca-plus-13b-hf': ModelType.IFT,
132
- 'mosaicml/mpt-7b-instruct': ModelType.IFT,
133
- 'mosaicml/mpt-30b-chat': ModelType.IFT,
134
- 'mosaicml/mpt-7b-storywriter': ModelType.FT,
135
- 'mosaicml/mpt-30b-instruct': ModelType.IFT,
136
- 'mosaicml/mpt-7b-chat': ModelType.IFT,
137
- 'mosaicml/mpt-30b': ModelType.PT,
138
- 'Corianas/111m': ModelType.IFT,
139
- 'Corianas/Quokka_1.3b': ModelType.IFT,
140
- 'Corianas/256_5epoch': ModelType.FT,
141
- 'Corianas/Quokka_256m': ModelType.IFT,
142
- 'Corianas/Quokka_590m': ModelType.IFT,
143
- 'Corianas/gpt-j-6B-Dolly': ModelType.FT,
144
- 'Corianas/Quokka_2.7b': ModelType.IFT,
145
- 'cyberagent/open-calm-7b': ModelType.FT,
146
- 'Aspik101/Nous-Hermes-13b-pl-lora_unload': ModelType.IFT,
147
- 'THUDM/chatglm2-6b': ModelType.IFT,
148
- 'MetaIX/GPT4-X-Alpasta-30b': ModelType.IFT,
149
- 'NYTK/PULI-GPTrio': ModelType.PT,
150
- 'EleutherAI/pythia-1.3b': ModelType.PT,
151
- 'EleutherAI/pythia-2.8b-deduped': ModelType.PT,
152
- 'EleutherAI/gpt-neo-125m': ModelType.PT,
153
- 'EleutherAI/pythia-160m': ModelType.PT,
154
- 'EleutherAI/gpt-neo-2.7B': ModelType.PT,
155
- 'EleutherAI/pythia-1b-deduped': ModelType.PT,
156
- 'EleutherAI/pythia-6.7b': ModelType.PT,
157
- 'EleutherAI/pythia-70m-deduped': ModelType.PT,
158
- 'EleutherAI/gpt-neox-20b': ModelType.PT,
159
- 'EleutherAI/pythia-1.4b-deduped': ModelType.PT,
160
- 'EleutherAI/pythia-2.7b': ModelType.PT,
161
- 'EleutherAI/pythia-6.9b-deduped': ModelType.PT,
162
- 'EleutherAI/pythia-70m': ModelType.PT,
163
- 'EleutherAI/gpt-j-6b': ModelType.PT,
164
- 'EleutherAI/pythia-12b-deduped': ModelType.PT,
165
- 'EleutherAI/gpt-neo-1.3B': ModelType.PT,
166
- 'EleutherAI/pythia-410m-deduped': ModelType.PT,
167
- 'EleutherAI/pythia-160m-deduped': ModelType.PT,
168
- 'EleutherAI/polyglot-ko-12.8b': ModelType.PT,
169
- 'EleutherAI/pythia-12b': ModelType.PT,
170
- 'roneneldan/TinyStories-33M': ModelType.PT,
171
- 'roneneldan/TinyStories-28M': ModelType.PT,
172
- 'roneneldan/TinyStories-1M': ModelType.PT,
173
- 'roneneldan/TinyStories-8M': ModelType.PT,
174
- 'roneneldan/TinyStories-3M': ModelType.PT,
175
- 'jerryjalapeno/nart-100k-7b': ModelType.FT,
176
- 'lmsys/vicuna-13b-v1.3': ModelType.IFT,
177
- 'lmsys/vicuna-7b-v1.3': ModelType.IFT,
178
- 'lmsys/vicuna-13b-v1.1': ModelType.IFT,
179
- 'lmsys/vicuna-13b-delta-v1.1': ModelType.IFT,
180
- 'lmsys/vicuna-7b-delta-v1.1': ModelType.IFT,
181
- 'abhiramtirumala/DialoGPT-sarcastic-medium': ModelType.FT,
182
- 'haonan-li/bactrian-x-llama-13b-merged': ModelType.IFT,
183
- 'Gryphe/MythoLogic-13b': ModelType.IFT,
184
- 'Gryphe/MythoBoros-13b': ModelType.IFT,
185
- 'pillowtalks-ai/delta13b': ModelType.FT,
186
- 'wannaphong/openthaigpt-0.1.0-beta-full-model_for_open_llm_leaderboard': ModelType.FT,
187
- 'bigscience/bloom-7b1': ModelType.PT,
188
- 'bigcode/tiny_starcoder_py': ModelType.PT,
189
- 'bigcode/starcoderplus': ModelType.FT,
190
- 'bigcode/gpt_bigcode-santacoder': ModelType.PT,
191
- 'bigcode/starcoder': ModelType.PT,
192
- 'Open-Orca/OpenOrca-Preview1-13B': ModelType.IFT,
193
- 'microsoft/DialoGPT-large': ModelType.FT,
194
- 'microsoft/DialoGPT-small': ModelType.FT,
195
- 'microsoft/DialoGPT-medium': ModelType.FT,
196
- 'microsoft/CodeGPT-small-py': ModelType.FT,
197
- 'Tincando/fiction_story_generator': ModelType.FT,
198
- 'Pirr/pythia-13b-deduped-green_devil': ModelType.FT,
199
- 'Aeala/GPT4-x-AlpacaDente2-30b': ModelType.FT,
200
- 'Aeala/GPT4-x-AlpacaDente-30b': ModelType.FT,
201
- 'Aeala/GPT4-x-Alpasta-13b': ModelType.FT,
202
- 'Aeala/VicUnlocked-alpaca-30b': ModelType.IFT,
203
- 'Tap-M/Luna-AI-Llama2-Uncensored': ModelType.FT,
204
- 'illuin/test-custom-llama': ModelType.FT,
205
- 'dvruette/oasst-llama-13b-2-epochs': ModelType.FT,
206
- 'dvruette/oasst-gpt-neox-20b-1000-steps': ModelType.FT,
207
- 'dvruette/llama-13b-pretrained-dropout': ModelType.PT,
208
- 'dvruette/llama-13b-pretrained': ModelType.PT,
209
- 'dvruette/llama-13b-pretrained-sft-epoch-1': ModelType.FT,
210
- 'dvruette/llama-13b-pretrained-sft-do2': ModelType.FT,
211
- 'dvruette/oasst-gpt-neox-20b-3000-steps': ModelType.FT,
212
- 'dvruette/oasst-pythia-12b-pretrained-sft': ModelType.FT,
213
- 'dvruette/oasst-pythia-6.9b-4000-steps': ModelType.FT,
214
- 'dvruette/gpt-neox-20b-full-precision': ModelType.FT,
215
- 'dvruette/oasst-llama-13b-1000-steps': ModelType.FT,
216
- 'openlm-research/open_llama_7b_700bt_preview': ModelType.PT,
217
- 'openlm-research/open_llama_7b': ModelType.PT,
218
- 'openlm-research/open_llama_7b_v2': ModelType.PT,
219
- 'openlm-research/open_llama_3b': ModelType.PT,
220
- 'openlm-research/open_llama_13b': ModelType.PT,
221
- 'openlm-research/open_llama_3b_v2': ModelType.PT,
222
- 'PocketDoc/Dans-PileOfSets-Mk1-llama-13b-merged': ModelType.IFT,
223
- 'GeorgiaTechResearchInstitute/galpaca-30b': ModelType.IFT,
224
- 'GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct': ModelType.IFT,
225
- 'databricks/dolly-v2-7b': ModelType.IFT,
226
- 'databricks/dolly-v2-3b': ModelType.IFT,
227
- 'databricks/dolly-v2-12b': ModelType.IFT,
228
- 'Rachneet/gpt2-xl-alpaca': ModelType.FT,
229
- 'Locutusque/gpt2-conversational-or-qa': ModelType.FT,
230
- 'psyche/kogpt': ModelType.FT,
231
- 'NbAiLab/nb-gpt-j-6B-alpaca': ModelType.IFT,
232
- 'Mikael110/llama-2-7b-guanaco-fp16': ModelType.FT,
233
- 'Mikael110/llama-2-13b-guanaco-fp16': ModelType.FT,
234
- 'Fredithefish/CrimsonPajama': ModelType.IFT,
235
- 'Fredithefish/RedPajama-INCITE-Chat-3B-ShareGPT-11K': ModelType.FT,
236
- 'Fredithefish/ScarletPajama-3B-HF': ModelType.FT,
237
- 'Fredithefish/RedPajama-INCITE-Chat-3B-Instruction-Tuning-with-GPT-4': ModelType.IFT,
238
- 'acrastt/RedPajama-INCITE-Chat-Instruct-3B-V1': ModelType.IFT,
239
- 'eachadea/vicuna-13b-1.1': ModelType.FT,
240
- 'eachadea/vicuna-7b-1.1': ModelType.FT,
241
- 'eachadea/vicuna-13b': ModelType.FT,
242
- 'openaccess-ai-collective/wizard-mega-13b': ModelType.IFT,
243
- 'openaccess-ai-collective/manticore-13b': ModelType.IFT,
244
- 'openaccess-ai-collective/manticore-30b-chat-pyg-alpha': ModelType.IFT,
245
- 'openaccess-ai-collective/minotaur-13b': ModelType.IFT,
246
- 'openaccess-ai-collective/minotaur-13b-fixed': ModelType.IFT,
247
- 'openaccess-ai-collective/hippogriff-30b-chat': ModelType.IFT,
248
- 'openaccess-ai-collective/manticore-13b-chat-pyg': ModelType.IFT,
249
- 'pythainlp/wangchanglm-7.5B-sft-enth': ModelType.IFT,
250
- 'pythainlp/wangchanglm-7.5B-sft-en-sharded': ModelType.IFT,
251
- 'euclaise/gpt-neox-122m-minipile-digits': ModelType.FT,
252
- 'stabilityai/StableBeluga1-Delta': ModelType.IFT,
253
- 'stabilityai/stablelm-tuned-alpha-7b': ModelType.IFT,
254
- 'stabilityai/StableBeluga2': ModelType.IFT,
255
- 'stabilityai/StableBeluga-13B': ModelType.IFT,
256
- 'stabilityai/StableBeluga-7B': ModelType.IFT,
257
- 'stabilityai/stablelm-base-alpha-7b': ModelType.PT,
258
- 'stabilityai/stablelm-base-alpha-3b': ModelType.PT,
259
- 'stabilityai/stablelm-tuned-alpha-3b': ModelType.IFT,
260
- 'alibidaran/medical_transcription_generator': ModelType.FT,
261
- 'CalderaAI/30B-Lazarus': ModelType.IFT,
262
- 'CalderaAI/13B-BlueMethod': ModelType.IFT,
263
- 'CalderaAI/13B-Ouroboros': ModelType.IFT,
264
- 'KoboldAI/OPT-13B-Erebus': ModelType.FT,
265
- 'KoboldAI/GPT-J-6B-Janeway': ModelType.FT,
266
- 'KoboldAI/GPT-J-6B-Shinen': ModelType.FT,
267
- 'KoboldAI/fairseq-dense-2.7B': ModelType.PT,
268
- 'KoboldAI/OPT-6B-nerys-v2': ModelType.FT,
269
- 'KoboldAI/GPT-NeoX-20B-Skein': ModelType.FT,
270
- 'KoboldAI/PPO_Pygway-6b-Mix': ModelType.FT,
271
- 'KoboldAI/fairseq-dense-6.7B': ModelType.PT,
272
- 'KoboldAI/fairseq-dense-125M': ModelType.PT,
273
- 'KoboldAI/OPT-13B-Nerybus-Mix': ModelType.FT,
274
- 'KoboldAI/OPT-2.7B-Erebus': ModelType.FT,
275
- 'KoboldAI/OPT-350M-Nerys-v2': ModelType.FT,
276
- 'KoboldAI/OPT-2.7B-Nerys-v2': ModelType.FT,
277
- 'KoboldAI/OPT-2.7B-Nerybus-Mix': ModelType.FT,
278
- 'KoboldAI/OPT-13B-Nerys-v2': ModelType.FT,
279
- 'KoboldAI/GPT-NeoX-20B-Erebus': ModelType.FT,
280
- 'KoboldAI/OPT-6.7B-Erebus': ModelType.FT,
281
- 'KoboldAI/fairseq-dense-355M': ModelType.PT,
282
- 'KoboldAI/OPT-6.7B-Nerybus-Mix': ModelType.FT,
283
- 'KoboldAI/GPT-J-6B-Adventure': ModelType.FT,
284
- 'KoboldAI/OPT-350M-Erebus': ModelType.FT,
285
- 'KoboldAI/GPT-J-6B-Skein': ModelType.FT,
286
- 'KoboldAI/OPT-30B-Erebus': ModelType.FT,
287
- 'klosax/pythia-160m-deduped-step92k-193bt': ModelType.PT,
288
- 'klosax/open_llama_3b_350bt_preview': ModelType.PT,
289
- 'klosax/openllama-3b-350bt': ModelType.PT,
290
- 'klosax/pythia-70m-deduped-step44k-92bt': ModelType.PT,
291
- 'klosax/open_llama_13b_600bt_preview': ModelType.PT,
292
- 'klosax/open_llama_7b_400bt_preview': ModelType.PT,
293
- 'kfkas/Llama-2-ko-7b-Chat': ModelType.IFT,
294
- 'WeOpenML/Alpaca-7B-v1': ModelType.IFT,
295
- 'WeOpenML/PandaLM-Alpaca-7B-v1': ModelType.IFT,
296
- 'TFLai/gpt2-turkish-uncased': ModelType.FT,
297
- 'ehartford/WizardLM-13B-Uncensored': ModelType.IFT,
298
- 'ehartford/dolphin-llama-13b': ModelType.IFT,
299
- 'ehartford/Wizard-Vicuna-30B-Uncensored': ModelType.FT,
300
- 'ehartford/WizardLM-30B-Uncensored': ModelType.IFT,
301
- 'ehartford/Wizard-Vicuna-13B-Uncensored': ModelType.FT,
302
- 'ehartford/WizardLM-7B-Uncensored': ModelType.IFT,
303
- 'ehartford/based-30b': ModelType.FT,
304
- 'ehartford/Wizard-Vicuna-7B-Uncensored': ModelType.FT,
305
- 'wahaha1987/llama_7b_sharegpt94k_fastchat': ModelType.FT,
306
- 'wahaha1987/llama_13b_sharegpt94k_fastchat': ModelType.FT,
307
- 'OpenAssistant/oasst-sft-1-pythia-12b': ModelType.FT,
308
- 'OpenAssistant/stablelm-7b-sft-v7-epoch-3': ModelType.IFT,
309
- 'OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5': ModelType.FT,
310
- 'OpenAssistant/pythia-12b-sft-v8-2.5k-steps': ModelType.IFT,
311
- 'OpenAssistant/pythia-12b-sft-v8-7k-steps': ModelType.IFT,
312
- 'OpenAssistant/pythia-12b-pre-v8-12.5k-steps': ModelType.IFT,
313
- 'OpenAssistant/llama2-13b-orca-8k-3319': ModelType.IFT,
314
- 'junelee/wizard-vicuna-13b': ModelType.FT,
315
- 'BreadAi/gpt-YA-1-1_160M': ModelType.PT,
316
- 'BreadAi/MuseCan': ModelType.PT,
317
- 'BreadAi/MusePy-1-2': ModelType.PT,
318
- 'BreadAi/DiscordPy': ModelType.PT,
319
- 'BreadAi/PM_modelV2': ModelType.PT,
320
- 'BreadAi/gpt-Youtube': ModelType.PT,
321
- 'BreadAi/StoryPy': ModelType.FT,
322
- 'julianweng/Llama-2-7b-chat-orcah': ModelType.FT,
323
- 'AGI-inc/lora_moe_7b_baseline': ModelType.FT,
324
- 'AGI-inc/lora_moe_7b': ModelType.FT,
325
- 'togethercomputer/GPT-NeoXT-Chat-Base-20B': ModelType.IFT,
326
- 'togethercomputer/RedPajama-INCITE-Chat-7B-v0.1': ModelType.IFT,
327
- 'togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1': ModelType.IFT,
328
- 'togethercomputer/RedPajama-INCITE-7B-Base': ModelType.PT,
329
- 'togethercomputer/RedPajama-INCITE-7B-Instruct': ModelType.IFT,
330
- 'togethercomputer/RedPajama-INCITE-Base-3B-v1': ModelType.PT,
331
- 'togethercomputer/Pythia-Chat-Base-7B': ModelType.IFT,
332
- 'togethercomputer/RedPajama-INCITE-Base-7B-v0.1': ModelType.PT,
333
- 'togethercomputer/GPT-JT-6B-v1': ModelType.IFT,
334
- 'togethercomputer/GPT-JT-6B-v0': ModelType.IFT,
335
- 'togethercomputer/RedPajama-INCITE-Chat-3B-v1': ModelType.IFT,
336
- 'togethercomputer/RedPajama-INCITE-7B-Chat': ModelType.IFT,
337
- 'togethercomputer/RedPajama-INCITE-Instruct-3B-v1': ModelType.IFT,
338
- 'Writer/camel-5b-hf': ModelType.IFT,
339
- 'Writer/palmyra-base': ModelType.PT,
340
- 'MBZUAI/LaMini-GPT-1.5B': ModelType.IFT,
341
- 'MBZUAI/lamini-cerebras-111m': ModelType.IFT,
342
- 'MBZUAI/lamini-neo-1.3b': ModelType.IFT,
343
- 'MBZUAI/lamini-cerebras-1.3b': ModelType.IFT,
344
- 'MBZUAI/lamini-cerebras-256m': ModelType.IFT,
345
- 'MBZUAI/LaMini-GPT-124M': ModelType.IFT,
346
- 'MBZUAI/lamini-neo-125m': ModelType.IFT,
347
- 'TehVenom/DiffMerge-DollyGPT-Pygmalion': ModelType.FT,
348
- 'TehVenom/PPO_Shygmalion-6b': ModelType.FT,
349
- 'TehVenom/Dolly_Shygmalion-6b-Dev_V8P2': ModelType.FT,
350
- 'TehVenom/Pygmalion_AlpacaLora-7b': ModelType.FT,
351
- 'TehVenom/PPO_Pygway-V8p4_Dev-6b': ModelType.FT,
352
- 'TehVenom/Dolly_Malion-6b': ModelType.FT,
353
- 'TehVenom/PPO_Shygmalion-V8p4_Dev-6b': ModelType.FT,
354
- 'TehVenom/ChanMalion': ModelType.FT,
355
- 'TehVenom/GPT-J-Pyg_PPO-6B': ModelType.IFT,
356
- 'TehVenom/Pygmalion-13b-Merged': ModelType.FT,
357
- 'TehVenom/Metharme-13b-Merged': ModelType.IFT,
358
- 'TehVenom/Dolly_Shygmalion-6b': ModelType.FT,
359
- 'TehVenom/GPT-J-Pyg_PPO-6B-Dev-V8p4': ModelType.IFT,
360
- 'georgesung/llama2_7b_chat_uncensored': ModelType.FT,
361
- 'vicgalle/gpt2-alpaca': ModelType.IFT,
362
- 'vicgalle/alpaca-7b': ModelType.FT,
363
- 'vicgalle/gpt2-alpaca-gpt4': ModelType.IFT,
364
- 'facebook/opt-350m': ModelType.PT,
365
- 'facebook/opt-125m': ModelType.PT,
366
- 'facebook/xglm-4.5B': ModelType.PT,
367
- 'facebook/opt-2.7b': ModelType.PT,
368
- 'facebook/opt-6.7b': ModelType.PT,
369
- 'facebook/galactica-30b': ModelType.PT,
370
- 'facebook/opt-13b': ModelType.PT,
371
- 'facebook/opt-66b': ModelType.PT,
372
- 'facebook/xglm-7.5B': ModelType.PT,
373
- 'facebook/xglm-564M': ModelType.PT,
374
- 'facebook/opt-30b': ModelType.PT,
375
- 'golaxy/gogpt-7b': ModelType.FT,
376
- 'golaxy/gogpt2-7b': ModelType.FT,
377
- 'golaxy/gogpt-7b-bloom': ModelType.FT,
378
- 'golaxy/gogpt-3b-bloom': ModelType.FT,
379
- 'psmathur/orca_mini_v2_7b': ModelType.IFT,
380
- 'psmathur/orca_mini_7b': ModelType.IFT,
381
- 'psmathur/orca_mini_3b': ModelType.IFT,
382
- 'psmathur/orca_mini_v2_13b': ModelType.IFT,
383
- 'gpt2-xl': ModelType.PT,
384
- 'lxe/Cerebras-GPT-2.7B-Alpaca-SP': ModelType.FT,
385
- 'Monero/Manticore-13b-Chat-Pyg-Guanaco': ModelType.FT,
386
- 'Monero/WizardLM-Uncensored-SuperCOT-StoryTelling-30b': ModelType.IFT,
387
- 'Monero/WizardLM-13b-OpenAssistant-Uncensored': ModelType.IFT,
388
- 'Monero/WizardLM-30B-Uncensored-Guanaco-SuperCOT-30b': ModelType.IFT,
389
- 'jzjiao/opt-1.3b-rlhf': ModelType.FT,
390
- 'HuggingFaceH4/starchat-beta': ModelType.IFT,
391
- 'KnutJaegersberg/gpt-2-xl-EvolInstruct': ModelType.IFT,
392
- 'KnutJaegersberg/megatron-GPT-2-345m-EvolInstruct': ModelType.IFT,
393
- 'KnutJaegersberg/galactica-orca-wizardlm-1.3b': ModelType.IFT,
394
- 'openchat/openchat_8192': ModelType.IFT,
395
- 'openchat/openchat_v2': ModelType.IFT,
396
- 'openchat/openchat_v2_w': ModelType.IFT,
397
- 'ausboss/llama-13b-supercot': ModelType.IFT,
398
- 'ausboss/llama-30b-supercot': ModelType.IFT,
399
- 'Neko-Institute-of-Science/metharme-7b': ModelType.IFT,
400
- 'Neko-Institute-of-Science/pygmalion-7b': ModelType.FT,
401
- 'SebastianSchramm/Cerebras-GPT-111M-instruction': ModelType.IFT,
402
- 'victor123/WizardLM-13B-1.0': ModelType.IFT,
403
- 'OpenBuddy/openbuddy-openllama-13b-v7-fp16': ModelType.FT,
404
- 'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16': ModelType.FT,
405
- 'OpenBuddyEA/openbuddy-llama-30b-v7.1-bf16': ModelType.FT,
406
- 'baichuan-inc/Baichuan-7B': ModelType.PT,
407
- 'tiiuae/falcon-40b-instruct': ModelType.IFT,
408
- 'tiiuae/falcon-40b': ModelType.PT,
409
- 'tiiuae/falcon-7b': ModelType.PT,
410
- 'YeungNLP/firefly-llama-13b': ModelType.FT,
411
- 'YeungNLP/firefly-llama-13b-v1.2': ModelType.FT,
412
- 'YeungNLP/firefly-llama2-13b': ModelType.FT,
413
- 'YeungNLP/firefly-ziya-13b': ModelType.FT,
414
- 'shaohang/Sparse0.5_OPT-1.3': ModelType.FT,
415
- 'xzuyn/Alpacino-SuperCOT-13B': ModelType.IFT,
416
- 'xzuyn/MedicWizard-7B': ModelType.FT,
417
- 'xDAN-AI/xDAN_13b_l2_lora': ModelType.FT,
418
- 'beomi/KoAlpaca-Polyglot-5.8B': ModelType.FT,
419
- 'beomi/llama-2-ko-7b': ModelType.IFT,
420
- 'Salesforce/codegen-6B-multi': ModelType.PT,
421
- 'Salesforce/codegen-16B-nl': ModelType.PT,
422
- 'Salesforce/codegen-6B-nl': ModelType.PT,
423
- 'ai-forever/rugpt3large_based_on_gpt2': ModelType.FT,
424
- 'gpt2-large': ModelType.PT,
425
- 'frank098/orca_mini_3b_juniper': ModelType.FT,
426
- 'frank098/WizardLM_13B_juniper': ModelType.FT,
427
- 'FPHam/Free_Sydney_13b_HF': ModelType.FT,
428
- 'huggingface/llama-13b': ModelType.PT,
429
- 'huggingface/llama-7b': ModelType.PT,
430
- 'huggingface/llama-65b': ModelType.PT,
431
- 'huggingface/llama-30b': ModelType.PT,
432
- 'Henk717/chronoboros-33B': ModelType.IFT,
433
- 'jondurbin/airoboros-13b-gpt4-1.4': ModelType.IFT,
434
- 'jondurbin/airoboros-7b': ModelType.IFT,
435
- 'jondurbin/airoboros-7b-gpt4': ModelType.IFT,
436
- 'jondurbin/airoboros-7b-gpt4-1.1': ModelType.IFT,
437
- 'jondurbin/airoboros-7b-gpt4-1.2': ModelType.IFT,
438
- 'jondurbin/airoboros-7b-gpt4-1.3': ModelType.IFT,
439
- 'jondurbin/airoboros-7b-gpt4-1.4': ModelType.IFT,
440
- 'jondurbin/airoboros-l2-7b-gpt4-1.4.1': ModelType.IFT,
441
- 'jondurbin/airoboros-l2-13b-gpt4-1.4.1': ModelType.IFT,
442
- 'jondurbin/airoboros-l2-70b-gpt4-1.4.1': ModelType.IFT,
443
- 'jondurbin/airoboros-13b': ModelType.IFT,
444
- 'jondurbin/airoboros-33b-gpt4-1.4': ModelType.IFT,
445
- 'jondurbin/airoboros-33b-gpt4-1.2': ModelType.IFT,
446
- 'jondurbin/airoboros-65b-gpt4-1.2': ModelType.IFT,
447
- 'ariellee/SuperPlatty-30B': ModelType.IFT,
448
- 'danielhanchen/open_llama_3b_600bt_preview': ModelType.FT,
449
- 'cerebras/Cerebras-GPT-256M': ModelType.PT,
450
- 'cerebras/Cerebras-GPT-1.3B': ModelType.PT,
451
- 'cerebras/Cerebras-GPT-13B': ModelType.PT,
452
- 'cerebras/Cerebras-GPT-2.7B': ModelType.PT,
453
- 'cerebras/Cerebras-GPT-111M': ModelType.PT,
454
- 'cerebras/Cerebras-GPT-6.7B': ModelType.PT,
455
- 'Yhyu13/oasst-rlhf-2-llama-30b-7k-steps-hf': ModelType.RL,
456
- 'Yhyu13/llama-30B-hf-openassitant': ModelType.FT,
457
- 'NousResearch/Nous-Hermes-Llama2-13b': ModelType.IFT,
458
- 'NousResearch/Nous-Hermes-llama-2-7b': ModelType.IFT,
459
- 'NousResearch/Redmond-Puffin-13B': ModelType.IFT,
460
- 'NousResearch/Nous-Hermes-13b': ModelType.IFT,
461
- 'project-baize/baize-v2-7b': ModelType.IFT,
462
- 'project-baize/baize-v2-13b': ModelType.IFT,
463
- 'LLMs/WizardLM-13B-V1.0': ModelType.FT,
464
- 'LLMs/AlpacaGPT4-7B-elina': ModelType.FT,
465
- 'wenge-research/yayi-7b': ModelType.FT,
466
- 'wenge-research/yayi-7b-llama2': ModelType.FT,
467
- 'wenge-research/yayi-13b-llama2': ModelType.FT,
468
- 'yhyhy3/open_llama_7b_v2_med_instruct': ModelType.IFT,
469
- 'llama-anon/instruct-13b': ModelType.IFT,
470
- 'huggingtweets/jerma985': ModelType.FT,
471
- 'huggingtweets/gladosystem': ModelType.FT,
472
- 'huggingtweets/bladeecity-jerma985': ModelType.FT,
473
- 'huggyllama/llama-13b': ModelType.PT,
474
- 'huggyllama/llama-65b': ModelType.PT,
475
- 'FabbriSimo01/Facebook_opt_1.3b_Quantized': ModelType.PT,
476
- 'upstage/Llama-2-70b-instruct': ModelType.IFT,
477
- 'upstage/Llama-2-70b-instruct-1024': ModelType.IFT,
478
- 'upstage/llama-65b-instruct': ModelType.IFT,
479
- 'upstage/llama-30b-instruct-2048': ModelType.IFT,
480
- 'upstage/llama-30b-instruct': ModelType.IFT,
481
- 'WizardLM/WizardLM-13B-1.0': ModelType.IFT,
482
- 'WizardLM/WizardLM-13B-V1.1': ModelType.IFT,
483
- 'WizardLM/WizardLM-13B-V1.2': ModelType.IFT,
484
- 'WizardLM/WizardLM-30B-V1.0': ModelType.IFT,
485
- 'WizardLM/WizardCoder-15B-V1.0': ModelType.IFT,
486
- 'gpt2': ModelType.PT,
487
- 'keyfan/vicuna-chinese-replication-v1.1': ModelType.IFT,
488
- 'nthngdy/pythia-owt2-70m-100k': ModelType.FT,
489
- 'nthngdy/pythia-owt2-70m-50k': ModelType.FT,
490
- 'quantumaikr/KoreanLM-hf': ModelType.FT,
491
- 'quantumaikr/open_llama_7b_hf': ModelType.FT,
492
- 'quantumaikr/QuantumLM-70B-hf': ModelType.IFT,
493
- 'MayaPH/FinOPT-Lincoln': ModelType.FT,
494
- 'MayaPH/FinOPT-Franklin': ModelType.FT,
495
- 'MayaPH/GodziLLa-30B': ModelType.IFT,
496
- 'MayaPH/GodziLLa-30B-plus': ModelType.IFT,
497
- 'MayaPH/FinOPT-Washington': ModelType.FT,
498
- 'ogimgio/gpt-neo-125m-neurallinguisticpioneers': ModelType.FT,
499
- 'layoric/llama-2-13b-code-alpaca': ModelType.FT,
500
- 'CobraMamba/mamba-gpt-3b': ModelType.FT,
501
- 'CobraMamba/mamba-gpt-3b-v2': ModelType.FT,
502
- 'CobraMamba/mamba-gpt-3b-v3': ModelType.FT,
503
- 'timdettmers/guanaco-33b-merged': ModelType.FT,
504
- 'elinas/chronos-33b': ModelType.IFT,
505
- 'heegyu/RedTulu-Uncensored-3B-0719': ModelType.IFT,
506
- 'heegyu/WizardVicuna-Uncensored-3B-0719': ModelType.IFT,
507
- 'heegyu/WizardVicuna-3B-0719': ModelType.IFT,
508
- 'meta-llama/Llama-2-7b-chat-hf': ModelType.RL,
509
- 'meta-llama/Llama-2-7b-hf': ModelType.PT,
510
- 'meta-llama/Llama-2-13b-chat-hf': ModelType.RL,
511
- 'meta-llama/Llama-2-13b-hf': ModelType.PT,
512
- 'meta-llama/Llama-2-70b-chat-hf': ModelType.RL,
513
- 'meta-llama/Llama-2-70b-hf': ModelType.PT,
514
- 'xhyi/PT_GPTNEO350_ATG': ModelType.FT,
515
- 'h2oai/h2ogpt-gm-oasst1-en-1024-20b': ModelType.FT,
516
- 'h2oai/h2ogpt-gm-oasst1-en-1024-open-llama-7b-preview-400bt': ModelType.FT,
517
- 'h2oai/h2ogpt-oig-oasst1-512-6_9b': ModelType.IFT,
518
- 'h2oai/h2ogpt-oasst1-512-12b': ModelType.IFT,
519
- 'h2oai/h2ogpt-oig-oasst1-256-6_9b': ModelType.IFT,
520
- 'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt': ModelType.FT,
521
- 'h2oai/h2ogpt-oasst1-512-20b': ModelType.IFT,
522
- 'h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2': ModelType.FT,
523
- 'h2oai/h2ogpt-gm-oasst1-en-1024-12b': ModelType.FT,
524
- 'h2oai/h2ogpt-gm-oasst1-multilang-1024-20b': ModelType.FT,
525
- 'bofenghuang/vigogne-13b-instruct': ModelType.IFT,
526
- 'bofenghuang/vigogne-13b-chat': ModelType.FT,
527
- 'bofenghuang/vigogne-2-7b-instruct': ModelType.IFT,
528
- 'bofenghuang/vigogne-7b-instruct': ModelType.IFT,
529
- 'bofenghuang/vigogne-7b-chat': ModelType.FT,
530
- 'Vmware/open-llama-7b-v2-open-instruct': ModelType.IFT,
531
- 'VMware/open-llama-0.7T-7B-open-instruct-v1.1': ModelType.IFT,
532
- 'ewof/koishi-instruct-3b': ModelType.IFT,
533
- 'gywy/llama2-13b-chinese-v1': ModelType.FT,
534
- 'GOAT-AI/GOAT-7B-Community': ModelType.FT,
535
- 'psyche/kollama2-7b': ModelType.FT,
536
- 'TheTravellingEngineer/llama2-7b-hf-guanaco': ModelType.FT,
537
- 'beaugogh/pythia-1.4b-deduped-sharegpt': ModelType.FT,
538
- 'augtoma/qCammel-70-x': ModelType.IFT,
539
- 'Lajonbot/Llama-2-7b-chat-hf-instruct-pl-lora_unload': ModelType.IFT,
540
- 'anhnv125/pygmalion-6b-roleplay': ModelType.FT,
541
- '64bits/LexPodLM-13B': ModelType.FT,
542
- }
543
-
544
-
545
def model_type_from_str(type):
    """Map a free-form model-type description to a ``ModelType`` member.

    ``type`` is searched for either the textual label or the emoji of each
    category; the first category whose marker is found wins, and
    ``ModelType.Unknown`` is returned when none is present.
    """
    # Order matters: categories are probed in the same sequence as before.
    markers_by_type = (
        (("fine-tuned", "🔶"), ModelType.FT),
        (("pretrained", "🟢"), ModelType.PT),
        (("RL-tuned", "🟦"), ModelType.RL),
        (("instruction-tuned", "⭕"), ModelType.IFT),
    )
    for markers, model_type in markers_by_type:
        if any(marker in type for marker in markers):
            return model_type
    return ModelType.Unknown
555
-
556
-
557
def get_model_type(leaderboard_data: List[dict]):
    """Fill the model-type and model-type-symbol columns of every row in place.

    For each row, the matching ``eval-queue/<model>_eval_request_*.json`` file
    is located. When several files match, the ones whose ``status`` is
    ``"FINISHED"`` and whose ``precision`` equals the row's precision are
    considered, and the last such file in reverse-sorted order is kept.

    * No request file: both columns are set to the empty string.
    * Request file with a ``model_type`` key: the type is parsed from it.
    * Request file without that key: ``TYPE_METADATA`` is consulted, falling
      back to ``ModelType.Unknown``.

    A "🔺" is appended to the symbol when the request's ``weight_type`` is not
    ``"Original"`` (except for the ``Unknown`` fallback).
    """
    type_column = AutoEvalColumn.model_type.name
    symbol_column = AutoEvalColumn.model_type_symbol.name

    for model_data in leaderboard_data:
        model_name = model_data["model_name_for_query"]
        request_files = glob.glob(os.path.join("eval-queue", model_name + "_eval_request_*" + ".json"))

        request_file = ""
        if len(request_files) == 1:
            request_file = request_files[0]
        elif len(request_files) > 1:
            for tmp_request_file in sorted(request_files, reverse=True):
                with open(tmp_request_file, "r") as f:
                    req_content = json.load(f)
                if (
                    req_content["status"] == "FINISHED"
                    and req_content["precision"] == model_data["Precision"].split(".")[-1]
                ):
                    # No break: a later match overrides an earlier one.
                    request_file = tmp_request_file

        if request_file == "":
            model_data[type_column] = ""
            model_data[symbol_column] = ""
            continue

        # Read the request once; it used to be opened and parsed twice.
        # I/O and JSON errors still propagate, as they did from the second read.
        with open(request_file, "r") as f:
            request = json.load(f)

        # A missing or unusable "weight_type" means "not a delta" (best effort).
        try:
            is_delta = request["weight_type"] != "Original"
        except Exception:
            is_delta = False
        delta_suffix = "🔺" if is_delta else ""

        try:
            model_type = model_type_from_str(request["model_type"])
            model_data[type_column] = model_type.value.name
            model_data[symbol_column] = model_type.value.symbol + delta_suffix
        except KeyError:
            # Older request files carry no "model_type"; use the static table.
            if model_name in TYPE_METADATA:
                known_type = TYPE_METADATA[model_name]
                model_data[type_column] = known_type.value.name
                model_data[symbol_column] = known_type.value.symbol + delta_suffix
            else:
                model_data[type_column] = ModelType.Unknown.value.name
                model_data[symbol_column] = ModelType.Unknown.value.symbol
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/{auto_leaderboard → display_models}/get_model_metadata.py RENAMED
@@ -1,17 +1,22 @@
1
- import re
 
2
  import os
 
3
  from typing import List
4
 
5
- from src.utils_display import AutoEvalColumn
6
- from src.auto_leaderboard.model_metadata_type import get_model_type
7
-
8
- from huggingface_hub import HfApi
9
  import huggingface_hub
 
 
 
 
 
 
 
10
  api = HfApi(token=os.environ.get("H4_TOKEN", None))
11
 
12
 
13
  def get_model_infos_from_hub(leaderboard_data: List[dict]):
14
- for model_data in leaderboard_data:
15
  model_name = model_data["model_name_for_query"]
16
  try:
17
  model_info = api.model_info(model_name)
@@ -33,15 +38,18 @@ def get_model_license(model_info):
33
  except Exception:
34
  return None
35
 
 
36
def get_model_likes(model_info):
    """Return the ``likes`` attribute of ``model_info``."""
    likes = model_info.likes
    return likes
38
 
 
39
  size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
40
 
 
41
  def get_model_size(model_name, model_info):
42
  # In billions
43
  try:
44
- return round(model_info.safetensors["total"] / 1e9, 3)
45
  except AttributeError:
46
  try:
47
  size_match = re.search(size_pattern, model_name.lower())
@@ -51,6 +59,74 @@ def get_model_size(model_name, model_info):
51
  return None
52
 
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
def apply_metadata(leaderboard_data: List[dict]):
    """Run the metadata passes over ``leaderboard_data``, in order.

    First the model type is resolved, then the hub information is fetched.
    """
    for metadata_pass in (get_model_type, get_model_infos_from_hub):
        metadata_pass(leaderboard_data)
 
 
1
+ import glob
2
+ import json
3
  import os
4
+ import re
5
  from typing import List
6
 
 
 
 
 
7
  import huggingface_hub
8
+ from huggingface_hub import HfApi
9
+ from tqdm import tqdm
10
+
11
+ from src.display_models.model_metadata_flags import DO_NOT_SUBMIT_MODELS, FLAGGED_MODELS
12
+ from src.display_models.model_metadata_type import MODEL_TYPE_METADATA, ModelType, model_type_from_str
13
+ from src.display_models.utils import AutoEvalColumn, model_hyperlink
14
+
15
  api = HfApi(token=os.environ.get("H4_TOKEN", None))
16
 
17
 
18
  def get_model_infos_from_hub(leaderboard_data: List[dict]):
19
+ for model_data in tqdm(leaderboard_data):
20
  model_name = model_data["model_name_for_query"]
21
  try:
22
  model_info = api.model_info(model_name)
 
38
  except Exception:
39
  return None
40
 
41
+
42
def get_model_likes(model_info):
    """Return the ``likes`` attribute of ``model_info``."""
    likes = model_info.likes
    return likes
44
 
45
+
46
  size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
47
 
48
+
49
  def get_model_size(model_name, model_info):
50
  # In billions
51
  try:
52
+ return round(model_info.safetensors["total"] / 1e9, 3)
53
  except AttributeError:
54
  try:
55
  size_match = re.search(size_pattern, model_name.lower())
 
59
  return None
60
 
61
 
62
def get_model_type(leaderboard_data: List[dict]):
    """Fill the model-type and model-type-symbol columns of every row in place.

    The type is read from the ``model_type`` field of the row's request file
    under ``eval-queue``. If that fails for any reason (no file, unreadable
    JSON, missing key, ...), ``MODEL_TYPE_METADATA`` is consulted, and
    ``ModelType.Unknown`` is used when the model is not listed there either.
    """
    for model_data in leaderboard_data:
        model_name = model_data["model_name_for_query"]
        pattern = os.path.join("eval-queue", model_name + "_eval_request_*" + ".json")
        candidates = glob.glob(pattern)

        # Select correct request file (precision)
        request_file = ""
        if len(candidates) == 1:
            request_file = candidates[0]
        elif len(candidates) > 1:
            for candidate in sorted(candidates, reverse=True):
                with open(candidate, "r") as f:
                    req_content = json.load(f)
                if req_content["status"] != "FINISHED":
                    continue
                if req_content["precision"] == model_data["Precision"].split(".")[-1]:
                    # No break: a later match overrides an earlier one.
                    request_file = candidate

        try:
            # An empty request_file fails here and drops into the fallback.
            with open(request_file, "r") as f:
                request = json.load(f)
            resolved_type = model_type_from_str(request["model_type"])
        except Exception:
            if model_name in MODEL_TYPE_METADATA:
                resolved_type = MODEL_TYPE_METADATA[model_name]
            else:
                resolved_type = ModelType.Unknown

        model_data[AutoEvalColumn.model_type.name] = resolved_type.value.name
        model_data[AutoEvalColumn.model_type_symbol.name] = resolved_type.value.symbol
102
+
103
+
104
def flag_models(leaderboard_data: List[dict]):
    """Append a "has been flagged!" notice to the model cell of flagged rows.

    Rows whose ``model_name_for_query`` is a key of ``FLAGGED_MODELS`` get a
    hyperlink to the associated discussion; the discussion number is the last
    path segment of the stored URL. Rows are modified in place.
    """
    for model_data in leaderboard_data:
        model_name = model_data["model_name_for_query"]
        if model_name not in FLAGGED_MODELS:
            continue

        discussion_url = FLAGGED_MODELS[model_name]
        issue_num = discussion_url.rsplit("/", 1)[-1]
        issue_link = model_hyperlink(discussion_url, f"See discussion #{issue_num}")

        model_column = AutoEvalColumn.model.name
        model_data[model_column] = f"{model_data[model_column]} has been flagged! {issue_link}"
115
+
116
+
117
def remove_forbidden_models(leaderboard_data: List[dict]):
    """Drop every row whose model is listed in ``DO_NOT_SUBMIT_MODELS``.

    The list is modified in place and also returned.
    """
    forbidden_positions = [
        position
        for position, row in enumerate(leaderboard_data)
        if row["model_name_for_query"] in DO_NOT_SUBMIT_MODELS
    ]

    # Delete from the back so the remaining positions stay valid.
    for position in forbidden_positions[::-1]:
        del leaderboard_data[position]
    return leaderboard_data
126
+
127
+
128
def apply_metadata(leaderboard_data: List[dict]):
    """Run the metadata passes over ``leaderboard_data``, in order.

    Forbidden models are removed first; the remaining rows then get their
    model type, their hub information and, where applicable, a flag notice.
    """
    remaining_rows = remove_forbidden_models(leaderboard_data)
    for metadata_pass in (get_model_type, get_model_infos_from_hub, flag_models):
        metadata_pass(remaining_rows)
src/display_models/model_metadata_flags.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Models which have been flagged by users as being problematic for one reason or another
2
+ # (Model name to forum discussion link)
3
+ FLAGGED_MODELS = {
4
+ "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
5
+ "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
6
+ "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
7
+ "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/236",
8
+ "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/237",
9
+ "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/215",
10
+ }
11
+
12
+ # Models which have been requested by orgs to not be submitted on the leaderboard
13
+ DO_NOT_SUBMIT_MODELS = [
14
+ "Voicelab/trurl-2-13b", # trained on MMLU
15
+ ]
src/display_models/model_metadata_type.py ADDED
@@ -0,0 +1,553 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+ from typing import Dict
4
+
5
+
6
@dataclass
class ModelInfo:
    """Display metadata for one model category: a label and its emoji."""

    name: str  # human-readable category label
    symbol: str  # emoji shown alongside the label
10
+
11
+
12
class ModelType(Enum):
    """Model categories; each member's value is a ``ModelInfo`` (name + emoji)."""

    PT = ModelInfo(name="pretrained", symbol="🟢")
    FT = ModelInfo(name="fine-tuned", symbol="🔶")
    IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
    RL = ModelInfo(name="RL-tuned", symbol="🟦")
    Unknown = ModelInfo(name="Unknown, add type to request file!", symbol="?")

    def to_str(self, separator=" "):
        """Return the member's symbol and name joined by ``separator``."""
        info = self.value
        return "{}{}{}".format(info.symbol, separator, info.name)
21
+
22
+
23
+ MODEL_TYPE_METADATA: Dict[str, ModelType] = {
24
+ "tiiuae/falcon-180B": ModelType.PT,
25
+ "Qwen/Qwen-7B": ModelType.PT,
26
+ "Qwen/Qwen-7B-Chat": ModelType.RL,
27
+ "notstoic/PygmalionCoT-7b": ModelType.IFT,
28
+ "aisquared/dlite-v1-355m": ModelType.IFT,
29
+ "aisquared/dlite-v1-1_5b": ModelType.IFT,
30
+ "aisquared/dlite-v1-774m": ModelType.IFT,
31
+ "aisquared/dlite-v1-124m": ModelType.IFT,
32
+ "aisquared/chopt-2_7b": ModelType.IFT,
33
+ "aisquared/dlite-v2-124m": ModelType.IFT,
34
+ "aisquared/dlite-v2-774m": ModelType.IFT,
35
+ "aisquared/dlite-v2-1_5b": ModelType.IFT,
36
+ "aisquared/chopt-1_3b": ModelType.IFT,
37
+ "aisquared/dlite-v2-355m": ModelType.IFT,
38
+ "augtoma/qCammel-13": ModelType.IFT,
39
+ "Aspik101/Llama-2-7b-hf-instruct-pl-lora_unload": ModelType.IFT,
40
+ "Aspik101/vicuna-7b-v1.3-instruct-pl-lora_unload": ModelType.IFT,
41
+ "TheBloke/alpaca-lora-65B-HF": ModelType.FT,
42
+ "TheBloke/tulu-7B-fp16": ModelType.IFT,
43
+ "TheBloke/guanaco-7B-HF": ModelType.FT,
44
+ "TheBloke/koala-7B-HF": ModelType.FT,
45
+ "TheBloke/wizardLM-7B-HF": ModelType.IFT,
46
+ "TheBloke/airoboros-13B-HF": ModelType.IFT,
47
+ "TheBloke/koala-13B-HF": ModelType.FT,
48
+ "TheBloke/Wizard-Vicuna-7B-Uncensored-HF": ModelType.FT,
49
+ "TheBloke/dromedary-65b-lora-HF": ModelType.IFT,
50
+ "TheBloke/wizardLM-13B-1.0-fp16": ModelType.IFT,
51
+ "TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16": ModelType.FT,
52
+ "TheBloke/Wizard-Vicuna-30B-Uncensored-fp16": ModelType.FT,
53
+ "TheBloke/wizard-vicuna-13B-HF": ModelType.IFT,
54
+ "TheBloke/UltraLM-13B-fp16": ModelType.IFT,
55
+ "TheBloke/OpenAssistant-FT-7-Llama-30B-HF": ModelType.FT,
56
+ "TheBloke/vicuna-13B-1.1-HF": ModelType.IFT,
57
+ "TheBloke/guanaco-13B-HF": ModelType.FT,
58
+ "TheBloke/guanaco-65B-HF": ModelType.FT,
59
+ "TheBloke/airoboros-7b-gpt4-fp16": ModelType.IFT,
60
+ "TheBloke/llama-30b-supercot-SuperHOT-8K-fp16": ModelType.IFT,
61
+ "TheBloke/Llama-2-13B-fp16": ModelType.PT,
62
+ "TheBloke/llama-2-70b-Guanaco-QLoRA-fp16": ModelType.FT,
63
+ "TheBloke/landmark-attention-llama7b-fp16": ModelType.IFT,
64
+ "TheBloke/Planner-7B-fp16": ModelType.IFT,
65
+ "TheBloke/Wizard-Vicuna-13B-Uncensored-HF": ModelType.FT,
66
+ "TheBloke/gpt4-alpaca-lora-13B-HF": ModelType.IFT,
67
+ "TheBloke/gpt4-x-vicuna-13B-HF": ModelType.IFT,
68
+ "TheBloke/gpt4-alpaca-lora_mlp-65B-HF": ModelType.IFT,
69
+ "TheBloke/tulu-13B-fp16": ModelType.IFT,
70
+ "TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16": ModelType.IFT,
71
+ "TheBloke/Llama-2-70B-fp16": ModelType.IFT,
72
+ "TheBloke/WizardLM-30B-fp16": ModelType.IFT,
73
+ "TheBloke/robin-13B-v2-fp16": ModelType.FT,
74
+ "TheBloke/robin-33B-v2-fp16": ModelType.FT,
75
+ "TheBloke/Vicuna-13B-CoT-fp16": ModelType.IFT,
76
+ "TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16": ModelType.IFT,
77
+ "TheBloke/Wizard-Vicuna-30B-Superhot-8K-fp16": ModelType.FT,
78
+ "TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16": ModelType.IFT,
79
+ "TheBloke/GPlatty-30B-SuperHOT-8K-fp16": ModelType.FT,
80
+ "TheBloke/CAMEL-33B-Combined-Data-SuperHOT-8K-fp16": ModelType.IFT,
81
+ "TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16": ModelType.IFT,
82
+ "jphme/orca_mini_v2_ger_7b": ModelType.IFT,
83
+ "Ejafa/vicuna_7B_vanilla_1.1": ModelType.FT,
84
+ "kevinpro/Vicuna-13B-CoT": ModelType.IFT,
85
+ "AlekseyKorshuk/pygmalion-6b-vicuna-chatml": ModelType.FT,
86
+ "AlekseyKorshuk/chatml-pyg-v1": ModelType.FT,
87
+ "concedo/Vicuzard-30B-Uncensored": ModelType.FT,
88
+ "concedo/OPT-19M-ChatSalad": ModelType.FT,
89
+ "concedo/Pythia-70M-ChatSalad": ModelType.FT,
90
+ "digitous/13B-HyperMantis": ModelType.IFT,
91
+ "digitous/Adventien-GPTJ": ModelType.FT,
92
+ "digitous/Alpacino13b": ModelType.IFT,
93
+ "digitous/GPT-R": ModelType.IFT,
94
+ "digitous/Javelin-R": ModelType.IFT,
95
+ "digitous/Javalion-GPTJ": ModelType.IFT,
96
+ "digitous/Javalion-R": ModelType.IFT,
97
+ "digitous/Skegma-GPTJ": ModelType.FT,
98
+ "digitous/Alpacino30b": ModelType.IFT,
99
+ "digitous/Janin-GPTJ": ModelType.FT,
100
+ "digitous/Janin-R": ModelType.FT,
101
+ "digitous/Javelin-GPTJ": ModelType.FT,
102
+ "SaylorTwift/gpt2_test": ModelType.PT,
103
+ "anton-l/gpt-j-tiny-random": ModelType.FT,
104
+ "Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca": ModelType.FT,
105
+ "Lazycuber/pyg-instruct-wizardlm": ModelType.FT,
106
+ "Lazycuber/Janemalion-6B": ModelType.FT,
107
+ "IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1": ModelType.FT,
108
+ "IDEA-CCNL/Ziya-LLaMA-13B-v1": ModelType.IFT,
109
+ "dsvv-cair/alpaca-cleaned-llama-30b-bf16": ModelType.FT,
110
+ "gpt2-medium": ModelType.PT,
111
+ "camel-ai/CAMEL-13B-Combined-Data": ModelType.IFT,
112
+ "camel-ai/CAMEL-13B-Role-Playing-Data": ModelType.FT,
113
+ "camel-ai/CAMEL-33B-Combined-Data": ModelType.IFT,
114
+ "PygmalionAI/pygmalion-6b": ModelType.FT,
115
+ "PygmalionAI/metharme-1.3b": ModelType.IFT,
116
+ "PygmalionAI/pygmalion-1.3b": ModelType.FT,
117
+ "PygmalionAI/pygmalion-350m": ModelType.FT,
118
+ "PygmalionAI/pygmalion-2.7b": ModelType.FT,
119
+ "medalpaca/medalpaca-7b": ModelType.FT,
120
+ "lilloukas/Platypus-30B": ModelType.IFT,
121
+ "lilloukas/GPlatty-30B": ModelType.FT,
122
+ "mncai/chatdoctor": ModelType.FT,
123
+ "chaoyi-wu/MedLLaMA_13B": ModelType.FT,
124
+ "LoupGarou/WizardCoder-Guanaco-15B-V1.0": ModelType.IFT,
125
+ "LoupGarou/WizardCoder-Guanaco-15B-V1.1": ModelType.FT,
126
+ "hakurei/instruct-12b": ModelType.IFT,
127
+ "hakurei/lotus-12B": ModelType.FT,
128
+ "shibing624/chinese-llama-plus-13b-hf": ModelType.IFT,
129
+ "shibing624/chinese-alpaca-plus-7b-hf": ModelType.IFT,
130
+ "shibing624/chinese-alpaca-plus-13b-hf": ModelType.IFT,
131
+ "mosaicml/mpt-7b-instruct": ModelType.IFT,
132
+ "mosaicml/mpt-30b-chat": ModelType.IFT,
133
+ "mosaicml/mpt-7b-storywriter": ModelType.FT,
134
+ "mosaicml/mpt-30b-instruct": ModelType.IFT,
135
+ "mosaicml/mpt-7b-chat": ModelType.IFT,
136
+ "mosaicml/mpt-30b": ModelType.PT,
137
+ "Corianas/111m": ModelType.IFT,
138
+ "Corianas/Quokka_1.3b": ModelType.IFT,
139
+ "Corianas/256_5epoch": ModelType.FT,
140
+ "Corianas/Quokka_256m": ModelType.IFT,
141
+ "Corianas/Quokka_590m": ModelType.IFT,
142
+ "Corianas/gpt-j-6B-Dolly": ModelType.FT,
143
+ "Corianas/Quokka_2.7b": ModelType.IFT,
144
+ "cyberagent/open-calm-7b": ModelType.FT,
145
+ "Aspik101/Nous-Hermes-13b-pl-lora_unload": ModelType.IFT,
146
+ "THUDM/chatglm2-6b": ModelType.IFT,
147
+ "MetaIX/GPT4-X-Alpasta-30b": ModelType.IFT,
148
+ "NYTK/PULI-GPTrio": ModelType.PT,
149
+ "EleutherAI/pythia-1.3b": ModelType.PT,
150
+ "EleutherAI/pythia-2.8b-deduped": ModelType.PT,
151
+ "EleutherAI/gpt-neo-125m": ModelType.PT,
152
+ "EleutherAI/pythia-160m": ModelType.PT,
153
+ "EleutherAI/gpt-neo-2.7B": ModelType.PT,
154
+ "EleutherAI/pythia-1b-deduped": ModelType.PT,
155
+ "EleutherAI/pythia-6.7b": ModelType.PT,
156
+ "EleutherAI/pythia-70m-deduped": ModelType.PT,
157
+ "EleutherAI/gpt-neox-20b": ModelType.PT,
158
+ "EleutherAI/pythia-1.4b-deduped": ModelType.PT,
159
+ "EleutherAI/pythia-2.7b": ModelType.PT,
160
+ "EleutherAI/pythia-6.9b-deduped": ModelType.PT,
161
+ "EleutherAI/pythia-70m": ModelType.PT,
162
+ "EleutherAI/gpt-j-6b": ModelType.PT,
163
+ "EleutherAI/pythia-12b-deduped": ModelType.PT,
164
+ "EleutherAI/gpt-neo-1.3B": ModelType.PT,
165
+ "EleutherAI/pythia-410m-deduped": ModelType.PT,
166
+ "EleutherAI/pythia-160m-deduped": ModelType.PT,
167
+ "EleutherAI/polyglot-ko-12.8b": ModelType.PT,
168
+ "EleutherAI/pythia-12b": ModelType.PT,
169
+ "roneneldan/TinyStories-33M": ModelType.PT,
170
+ "roneneldan/TinyStories-28M": ModelType.PT,
171
+ "roneneldan/TinyStories-1M": ModelType.PT,
172
+ "roneneldan/TinyStories-8M": ModelType.PT,
173
+ "roneneldan/TinyStories-3M": ModelType.PT,
174
+ "jerryjalapeno/nart-100k-7b": ModelType.FT,
175
+ "lmsys/vicuna-13b-v1.3": ModelType.IFT,
176
+ "lmsys/vicuna-7b-v1.3": ModelType.IFT,
177
+ "lmsys/vicuna-13b-v1.1": ModelType.IFT,
178
+ "lmsys/vicuna-13b-delta-v1.1": ModelType.IFT,
179
+ "lmsys/vicuna-7b-delta-v1.1": ModelType.IFT,
180
+ "abhiramtirumala/DialoGPT-sarcastic-medium": ModelType.FT,
181
+ "haonan-li/bactrian-x-llama-13b-merged": ModelType.IFT,
182
+ "Gryphe/MythoLogic-13b": ModelType.IFT,
183
+ "Gryphe/MythoBoros-13b": ModelType.IFT,
184
+ "pillowtalks-ai/delta13b": ModelType.FT,
185
+ "wannaphong/openthaigpt-0.1.0-beta-full-model_for_open_llm_leaderboard": ModelType.FT,
186
+ "bigscience/bloom-7b1": ModelType.PT,
187
+ "bigcode/tiny_starcoder_py": ModelType.PT,
188
+ "bigcode/starcoderplus": ModelType.FT,
189
+ "bigcode/gpt_bigcode-santacoder": ModelType.PT,
190
+ "bigcode/starcoder": ModelType.PT,
191
+ "Open-Orca/OpenOrca-Preview1-13B": ModelType.IFT,
192
+ "microsoft/DialoGPT-large": ModelType.FT,
193
+ "microsoft/DialoGPT-small": ModelType.FT,
194
+ "microsoft/DialoGPT-medium": ModelType.FT,
195
+ "microsoft/CodeGPT-small-py": ModelType.FT,
196
+ "Tincando/fiction_story_generator": ModelType.FT,
197
+ "Pirr/pythia-13b-deduped-green_devil": ModelType.FT,
198
+ "Aeala/GPT4-x-AlpacaDente2-30b": ModelType.FT,
199
+ "Aeala/GPT4-x-AlpacaDente-30b": ModelType.FT,
200
+ "Aeala/GPT4-x-Alpasta-13b": ModelType.FT,
201
+ "Aeala/VicUnlocked-alpaca-30b": ModelType.IFT,
202
+ "Tap-M/Luna-AI-Llama2-Uncensored": ModelType.FT,
203
+ "illuin/test-custom-llama": ModelType.FT,
204
+ "dvruette/oasst-llama-13b-2-epochs": ModelType.FT,
205
+ "dvruette/oasst-gpt-neox-20b-1000-steps": ModelType.FT,
206
+ "dvruette/llama-13b-pretrained-dropout": ModelType.PT,
207
+ "dvruette/llama-13b-pretrained": ModelType.PT,
208
+ "dvruette/llama-13b-pretrained-sft-epoch-1": ModelType.FT,
209
+ "dvruette/llama-13b-pretrained-sft-do2": ModelType.FT,
210
+ "dvruette/oasst-gpt-neox-20b-3000-steps": ModelType.FT,
211
+ "dvruette/oasst-pythia-12b-pretrained-sft": ModelType.FT,
212
+ "dvruette/oasst-pythia-6.9b-4000-steps": ModelType.FT,
213
+ "dvruette/gpt-neox-20b-full-precision": ModelType.FT,
214
+ "dvruette/oasst-llama-13b-1000-steps": ModelType.FT,
215
+ "openlm-research/open_llama_7b_700bt_preview": ModelType.PT,
216
+ "openlm-research/open_llama_7b": ModelType.PT,
217
+ "openlm-research/open_llama_7b_v2": ModelType.PT,
218
+ "openlm-research/open_llama_3b": ModelType.PT,
219
+ "openlm-research/open_llama_13b": ModelType.PT,
220
+ "openlm-research/open_llama_3b_v2": ModelType.PT,
221
+ "PocketDoc/Dans-PileOfSets-Mk1-llama-13b-merged": ModelType.IFT,
222
+ "GeorgiaTechResearchInstitute/galpaca-30b": ModelType.IFT,
223
+ "GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct": ModelType.IFT,
224
+ "databricks/dolly-v2-7b": ModelType.IFT,
225
+ "databricks/dolly-v2-3b": ModelType.IFT,
226
+ "databricks/dolly-v2-12b": ModelType.IFT,
227
+ "Rachneet/gpt2-xl-alpaca": ModelType.FT,
228
+ "Locutusque/gpt2-conversational-or-qa": ModelType.FT,
229
+ "psyche/kogpt": ModelType.FT,
230
+ "NbAiLab/nb-gpt-j-6B-alpaca": ModelType.IFT,
231
+ "Mikael110/llama-2-7b-guanaco-fp16": ModelType.FT,
232
+ "Mikael110/llama-2-13b-guanaco-fp16": ModelType.FT,
233
+ "Fredithefish/CrimsonPajama": ModelType.IFT,
234
+ "Fredithefish/RedPajama-INCITE-Chat-3B-ShareGPT-11K": ModelType.FT,
235
+ "Fredithefish/ScarletPajama-3B-HF": ModelType.FT,
236
+ "Fredithefish/RedPajama-INCITE-Chat-3B-Instruction-Tuning-with-GPT-4": ModelType.IFT,
237
+ "acrastt/RedPajama-INCITE-Chat-Instruct-3B-V1": ModelType.IFT,
238
+ "eachadea/vicuna-13b-1.1": ModelType.FT,
239
+ "eachadea/vicuna-7b-1.1": ModelType.FT,
240
+ "eachadea/vicuna-13b": ModelType.FT,
241
+ "openaccess-ai-collective/wizard-mega-13b": ModelType.IFT,
242
+ "openaccess-ai-collective/manticore-13b": ModelType.IFT,
243
+ "openaccess-ai-collective/manticore-30b-chat-pyg-alpha": ModelType.IFT,
244
+ "openaccess-ai-collective/minotaur-13b": ModelType.IFT,
245
+ "openaccess-ai-collective/minotaur-13b-fixed": ModelType.IFT,
246
+ "openaccess-ai-collective/hippogriff-30b-chat": ModelType.IFT,
247
+ "openaccess-ai-collective/manticore-13b-chat-pyg": ModelType.IFT,
248
+ "pythainlp/wangchanglm-7.5B-sft-enth": ModelType.IFT,
249
+ "pythainlp/wangchanglm-7.5B-sft-en-sharded": ModelType.IFT,
250
+ "euclaise/gpt-neox-122m-minipile-digits": ModelType.FT,
251
+ "stabilityai/StableBeluga1-Delta": ModelType.IFT,
252
+ "stabilityai/stablelm-tuned-alpha-7b": ModelType.IFT,
253
+ "stabilityai/StableBeluga2": ModelType.IFT,
254
+ "stabilityai/StableBeluga-13B": ModelType.IFT,
255
+ "stabilityai/StableBeluga-7B": ModelType.IFT,
256
+ "stabilityai/stablelm-base-alpha-7b": ModelType.PT,
257
+ "stabilityai/stablelm-base-alpha-3b": ModelType.PT,
258
+ "stabilityai/stablelm-tuned-alpha-3b": ModelType.IFT,
259
+ "alibidaran/medical_transcription_generator": ModelType.FT,
260
+ "CalderaAI/30B-Lazarus": ModelType.IFT,
261
+ "CalderaAI/13B-BlueMethod": ModelType.IFT,
262
+ "CalderaAI/13B-Ouroboros": ModelType.IFT,
263
+ "KoboldAI/OPT-13B-Erebus": ModelType.FT,
264
+ "KoboldAI/GPT-J-6B-Janeway": ModelType.FT,
265
+ "KoboldAI/GPT-J-6B-Shinen": ModelType.FT,
266
+ "KoboldAI/fairseq-dense-2.7B": ModelType.PT,
267
+ "KoboldAI/OPT-6B-nerys-v2": ModelType.FT,
268
+ "KoboldAI/GPT-NeoX-20B-Skein": ModelType.FT,
269
+ "KoboldAI/PPO_Pygway-6b-Mix": ModelType.FT,
270
+ "KoboldAI/fairseq-dense-6.7B": ModelType.PT,
271
+ "KoboldAI/fairseq-dense-125M": ModelType.PT,
272
+ "KoboldAI/OPT-13B-Nerybus-Mix": ModelType.FT,
273
+ "KoboldAI/OPT-2.7B-Erebus": ModelType.FT,
274
+ "KoboldAI/OPT-350M-Nerys-v2": ModelType.FT,
275
+ "KoboldAI/OPT-2.7B-Nerys-v2": ModelType.FT,
276
+ "KoboldAI/OPT-2.7B-Nerybus-Mix": ModelType.FT,
277
+ "KoboldAI/OPT-13B-Nerys-v2": ModelType.FT,
278
+ "KoboldAI/GPT-NeoX-20B-Erebus": ModelType.FT,
279
+ "KoboldAI/OPT-6.7B-Erebus": ModelType.FT,
280
+ "KoboldAI/fairseq-dense-355M": ModelType.PT,
281
+ "KoboldAI/OPT-6.7B-Nerybus-Mix": ModelType.FT,
282
+ "KoboldAI/GPT-J-6B-Adventure": ModelType.FT,
283
+ "KoboldAI/OPT-350M-Erebus": ModelType.FT,
284
+ "KoboldAI/GPT-J-6B-Skein": ModelType.FT,
285
+ "KoboldAI/OPT-30B-Erebus": ModelType.FT,
286
+ "klosax/pythia-160m-deduped-step92k-193bt": ModelType.PT,
287
+ "klosax/open_llama_3b_350bt_preview": ModelType.PT,
288
+ "klosax/openllama-3b-350bt": ModelType.PT,
289
+ "klosax/pythia-70m-deduped-step44k-92bt": ModelType.PT,
290
+ "klosax/open_llama_13b_600bt_preview": ModelType.PT,
291
+ "klosax/open_llama_7b_400bt_preview": ModelType.PT,
292
+ "kfkas/Llama-2-ko-7b-Chat": ModelType.IFT,
293
+ "WeOpenML/Alpaca-7B-v1": ModelType.IFT,
294
+ "WeOpenML/PandaLM-Alpaca-7B-v1": ModelType.IFT,
295
+ "TFLai/gpt2-turkish-uncased": ModelType.FT,
296
+ "ehartford/WizardLM-13B-Uncensored": ModelType.IFT,
297
+ "ehartford/dolphin-llama-13b": ModelType.IFT,
298
+ "ehartford/Wizard-Vicuna-30B-Uncensored": ModelType.FT,
299
+ "ehartford/WizardLM-30B-Uncensored": ModelType.IFT,
300
+ "ehartford/Wizard-Vicuna-13B-Uncensored": ModelType.FT,
301
+ "ehartford/WizardLM-7B-Uncensored": ModelType.IFT,
302
+ "ehartford/based-30b": ModelType.FT,
303
+ "ehartford/Wizard-Vicuna-7B-Uncensored": ModelType.FT,
304
+ "wahaha1987/llama_7b_sharegpt94k_fastchat": ModelType.FT,
305
+ "wahaha1987/llama_13b_sharegpt94k_fastchat": ModelType.FT,
306
+ "OpenAssistant/oasst-sft-1-pythia-12b": ModelType.FT,
307
+ "OpenAssistant/stablelm-7b-sft-v7-epoch-3": ModelType.IFT,
308
+ "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5": ModelType.FT,
309
+ "OpenAssistant/pythia-12b-sft-v8-2.5k-steps": ModelType.IFT,
310
+ "OpenAssistant/pythia-12b-sft-v8-7k-steps": ModelType.IFT,
311
+ "OpenAssistant/pythia-12b-pre-v8-12.5k-steps": ModelType.IFT,
312
+ "OpenAssistant/llama2-13b-orca-8k-3319": ModelType.IFT,
313
+ "junelee/wizard-vicuna-13b": ModelType.FT,
314
+ "BreadAi/gpt-YA-1-1_160M": ModelType.PT,
315
+ "BreadAi/MuseCan": ModelType.PT,
316
+ "BreadAi/MusePy-1-2": ModelType.PT,
317
+ "BreadAi/DiscordPy": ModelType.PT,
318
+ "BreadAi/PM_modelV2": ModelType.PT,
319
+ "BreadAi/gpt-Youtube": ModelType.PT,
320
+ "BreadAi/StoryPy": ModelType.FT,
321
+ "julianweng/Llama-2-7b-chat-orcah": ModelType.FT,
322
+ "AGI-inc/lora_moe_7b_baseline": ModelType.FT,
323
+ "AGI-inc/lora_moe_7b": ModelType.FT,
324
+ "togethercomputer/GPT-NeoXT-Chat-Base-20B": ModelType.IFT,
325
+ "togethercomputer/RedPajama-INCITE-Chat-7B-v0.1": ModelType.IFT,
326
+ "togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1": ModelType.IFT,
327
+ "togethercomputer/RedPajama-INCITE-7B-Base": ModelType.PT,
328
+ "togethercomputer/RedPajama-INCITE-7B-Instruct": ModelType.IFT,
329
+ "togethercomputer/RedPajama-INCITE-Base-3B-v1": ModelType.PT,
330
+ "togethercomputer/Pythia-Chat-Base-7B": ModelType.IFT,
331
+ "togethercomputer/RedPajama-INCITE-Base-7B-v0.1": ModelType.PT,
332
+ "togethercomputer/GPT-JT-6B-v1": ModelType.IFT,
333
+ "togethercomputer/GPT-JT-6B-v0": ModelType.IFT,
334
+ "togethercomputer/RedPajama-INCITE-Chat-3B-v1": ModelType.IFT,
335
+ "togethercomputer/RedPajama-INCITE-7B-Chat": ModelType.IFT,
336
+ "togethercomputer/RedPajama-INCITE-Instruct-3B-v1": ModelType.IFT,
337
+ "Writer/camel-5b-hf": ModelType.IFT,
338
+ "Writer/palmyra-base": ModelType.PT,
339
+ "MBZUAI/LaMini-GPT-1.5B": ModelType.IFT,
340
+ "MBZUAI/lamini-cerebras-111m": ModelType.IFT,
341
+ "MBZUAI/lamini-neo-1.3b": ModelType.IFT,
342
+ "MBZUAI/lamini-cerebras-1.3b": ModelType.IFT,
343
+ "MBZUAI/lamini-cerebras-256m": ModelType.IFT,
344
+ "MBZUAI/LaMini-GPT-124M": ModelType.IFT,
345
+ "MBZUAI/lamini-neo-125m": ModelType.IFT,
346
+ "TehVenom/DiffMerge-DollyGPT-Pygmalion": ModelType.FT,
347
+ "TehVenom/PPO_Shygmalion-6b": ModelType.FT,
348
+ "TehVenom/Dolly_Shygmalion-6b-Dev_V8P2": ModelType.FT,
349
+ "TehVenom/Pygmalion_AlpacaLora-7b": ModelType.FT,
350
+ "TehVenom/PPO_Pygway-V8p4_Dev-6b": ModelType.FT,
351
+ "TehVenom/Dolly_Malion-6b": ModelType.FT,
352
+ "TehVenom/PPO_Shygmalion-V8p4_Dev-6b": ModelType.FT,
353
+ "TehVenom/ChanMalion": ModelType.FT,
354
+ "TehVenom/GPT-J-Pyg_PPO-6B": ModelType.IFT,
355
+ "TehVenom/Pygmalion-13b-Merged": ModelType.FT,
356
+ "TehVenom/Metharme-13b-Merged": ModelType.IFT,
357
+ "TehVenom/Dolly_Shygmalion-6b": ModelType.FT,
358
+ "TehVenom/GPT-J-Pyg_PPO-6B-Dev-V8p4": ModelType.IFT,
359
+ "georgesung/llama2_7b_chat_uncensored": ModelType.FT,
360
+ "vicgalle/gpt2-alpaca": ModelType.IFT,
361
+ "vicgalle/alpaca-7b": ModelType.FT,
362
+ "vicgalle/gpt2-alpaca-gpt4": ModelType.IFT,
363
+ "facebook/opt-350m": ModelType.PT,
364
+ "facebook/opt-125m": ModelType.PT,
365
+ "facebook/xglm-4.5B": ModelType.PT,
366
+ "facebook/opt-2.7b": ModelType.PT,
367
+ "facebook/opt-6.7b": ModelType.PT,
368
+ "facebook/galactica-30b": ModelType.PT,
369
+ "facebook/opt-13b": ModelType.PT,
370
+ "facebook/opt-66b": ModelType.PT,
371
+ "facebook/xglm-7.5B": ModelType.PT,
372
+ "facebook/xglm-564M": ModelType.PT,
373
+ "facebook/opt-30b": ModelType.PT,
374
+ "golaxy/gogpt-7b": ModelType.FT,
375
+ "golaxy/gogpt2-7b": ModelType.FT,
376
+ "golaxy/gogpt-7b-bloom": ModelType.FT,
377
+ "golaxy/gogpt-3b-bloom": ModelType.FT,
378
+ "psmathur/orca_mini_v2_7b": ModelType.IFT,
379
+ "psmathur/orca_mini_7b": ModelType.IFT,
380
+ "psmathur/orca_mini_3b": ModelType.IFT,
381
+ "psmathur/orca_mini_v2_13b": ModelType.IFT,
382
+ "gpt2-xl": ModelType.PT,
383
+ "lxe/Cerebras-GPT-2.7B-Alpaca-SP": ModelType.FT,
384
+ "Monero/Manticore-13b-Chat-Pyg-Guanaco": ModelType.FT,
385
+ "Monero/WizardLM-Uncensored-SuperCOT-StoryTelling-30b": ModelType.IFT,
386
+ "Monero/WizardLM-13b-OpenAssistant-Uncensored": ModelType.IFT,
387
+ "Monero/WizardLM-30B-Uncensored-Guanaco-SuperCOT-30b": ModelType.IFT,
388
+ "jzjiao/opt-1.3b-rlhf": ModelType.FT,
389
+ "HuggingFaceH4/starchat-beta": ModelType.IFT,
390
+ "KnutJaegersberg/gpt-2-xl-EvolInstruct": ModelType.IFT,
391
+ "KnutJaegersberg/megatron-GPT-2-345m-EvolInstruct": ModelType.IFT,
392
+ "KnutJaegersberg/galactica-orca-wizardlm-1.3b": ModelType.IFT,
393
+ "openchat/openchat_8192": ModelType.IFT,
394
+ "openchat/openchat_v2": ModelType.IFT,
395
+ "openchat/openchat_v2_w": ModelType.IFT,
396
+ "ausboss/llama-13b-supercot": ModelType.IFT,
397
+ "ausboss/llama-30b-supercot": ModelType.IFT,
398
+ "Neko-Institute-of-Science/metharme-7b": ModelType.IFT,
399
+ "Neko-Institute-of-Science/pygmalion-7b": ModelType.FT,
400
+ "SebastianSchramm/Cerebras-GPT-111M-instruction": ModelType.IFT,
401
+ "victor123/WizardLM-13B-1.0": ModelType.IFT,
402
+ "OpenBuddy/openbuddy-openllama-13b-v7-fp16": ModelType.FT,
403
+ "OpenBuddy/openbuddy-llama2-13b-v8.1-fp16": ModelType.FT,
404
+ "OpenBuddyEA/openbuddy-llama-30b-v7.1-bf16": ModelType.FT,
405
+ "baichuan-inc/Baichuan-7B": ModelType.PT,
406
+ "tiiuae/falcon-40b-instruct": ModelType.IFT,
407
+ "tiiuae/falcon-40b": ModelType.PT,
408
+ "tiiuae/falcon-7b": ModelType.PT,
409
+ "YeungNLP/firefly-llama-13b": ModelType.FT,
410
+ "YeungNLP/firefly-llama-13b-v1.2": ModelType.FT,
411
+ "YeungNLP/firefly-llama2-13b": ModelType.FT,
412
+ "YeungNLP/firefly-ziya-13b": ModelType.FT,
413
+ "shaohang/Sparse0.5_OPT-1.3": ModelType.FT,
414
+ "xzuyn/Alpacino-SuperCOT-13B": ModelType.IFT,
415
+ "xzuyn/MedicWizard-7B": ModelType.FT,
416
+ "xDAN-AI/xDAN_13b_l2_lora": ModelType.FT,
417
+ "beomi/KoAlpaca-Polyglot-5.8B": ModelType.FT,
418
+ "beomi/llama-2-ko-7b": ModelType.IFT,
419
+ "Salesforce/codegen-6B-multi": ModelType.PT,
420
+ "Salesforce/codegen-16B-nl": ModelType.PT,
421
+ "Salesforce/codegen-6B-nl": ModelType.PT,
422
+ "ai-forever/rugpt3large_based_on_gpt2": ModelType.FT,
423
+ "gpt2-large": ModelType.PT,
424
+ "frank098/orca_mini_3b_juniper": ModelType.FT,
425
+ "frank098/WizardLM_13B_juniper": ModelType.FT,
426
+ "FPHam/Free_Sydney_13b_HF": ModelType.FT,
427
+ "huggingface/llama-13b": ModelType.PT,
428
+ "huggingface/llama-7b": ModelType.PT,
429
+ "huggingface/llama-65b": ModelType.PT,
430
+ "huggingface/llama-30b": ModelType.PT,
431
+ "Henk717/chronoboros-33B": ModelType.IFT,
432
+ "jondurbin/airoboros-13b-gpt4-1.4": ModelType.IFT,
433
+ "jondurbin/airoboros-7b": ModelType.IFT,
434
+ "jondurbin/airoboros-7b-gpt4": ModelType.IFT,
435
+ "jondurbin/airoboros-7b-gpt4-1.1": ModelType.IFT,
436
+ "jondurbin/airoboros-7b-gpt4-1.2": ModelType.IFT,
437
+ "jondurbin/airoboros-7b-gpt4-1.3": ModelType.IFT,
438
+ "jondurbin/airoboros-7b-gpt4-1.4": ModelType.IFT,
439
+ "jondurbin/airoboros-l2-7b-gpt4-1.4.1": ModelType.IFT,
440
+ "jondurbin/airoboros-l2-13b-gpt4-1.4.1": ModelType.IFT,
441
+ "jondurbin/airoboros-l2-70b-gpt4-1.4.1": ModelType.IFT,
442
+ "jondurbin/airoboros-13b": ModelType.IFT,
443
+ "jondurbin/airoboros-33b-gpt4-1.4": ModelType.IFT,
444
+ "jondurbin/airoboros-33b-gpt4-1.2": ModelType.IFT,
445
+ "jondurbin/airoboros-65b-gpt4-1.2": ModelType.IFT,
446
+ "ariellee/SuperPlatty-30B": ModelType.IFT,
447
+ "danielhanchen/open_llama_3b_600bt_preview": ModelType.FT,
448
+ "cerebras/Cerebras-GPT-256M": ModelType.PT,
449
+ "cerebras/Cerebras-GPT-1.3B": ModelType.PT,
450
+ "cerebras/Cerebras-GPT-13B": ModelType.PT,
451
+ "cerebras/Cerebras-GPT-2.7B": ModelType.PT,
452
+ "cerebras/Cerebras-GPT-111M": ModelType.PT,
453
+ "cerebras/Cerebras-GPT-6.7B": ModelType.PT,
454
+ "Yhyu13/oasst-rlhf-2-llama-30b-7k-steps-hf": ModelType.RL,
455
+ "Yhyu13/llama-30B-hf-openassitant": ModelType.FT,
456
+ "NousResearch/Nous-Hermes-Llama2-13b": ModelType.IFT,
457
+ "NousResearch/Nous-Hermes-llama-2-7b": ModelType.IFT,
458
+ "NousResearch/Redmond-Puffin-13B": ModelType.IFT,
459
+ "NousResearch/Nous-Hermes-13b": ModelType.IFT,
460
+ "project-baize/baize-v2-7b": ModelType.IFT,
461
+ "project-baize/baize-v2-13b": ModelType.IFT,
462
+ "LLMs/WizardLM-13B-V1.0": ModelType.FT,
463
+ "LLMs/AlpacaGPT4-7B-elina": ModelType.FT,
464
+ "wenge-research/yayi-7b": ModelType.FT,
465
+ "wenge-research/yayi-7b-llama2": ModelType.FT,
466
+ "wenge-research/yayi-13b-llama2": ModelType.FT,
467
+ "yhyhy3/open_llama_7b_v2_med_instruct": ModelType.IFT,
468
+ "llama-anon/instruct-13b": ModelType.IFT,
469
+ "huggingtweets/jerma985": ModelType.FT,
470
+ "huggingtweets/gladosystem": ModelType.FT,
471
+ "huggingtweets/bladeecity-jerma985": ModelType.FT,
472
+ "huggyllama/llama-13b": ModelType.PT,
473
+ "huggyllama/llama-65b": ModelType.PT,
474
+ "FabbriSimo01/Facebook_opt_1.3b_Quantized": ModelType.PT,
475
+ "upstage/Llama-2-70b-instruct": ModelType.IFT,
476
+ "upstage/Llama-2-70b-instruct-1024": ModelType.IFT,
477
+ "upstage/llama-65b-instruct": ModelType.IFT,
478
+ "upstage/llama-30b-instruct-2048": ModelType.IFT,
479
+ "upstage/llama-30b-instruct": ModelType.IFT,
480
+ "WizardLM/WizardLM-13B-1.0": ModelType.IFT,
481
+ "WizardLM/WizardLM-13B-V1.1": ModelType.IFT,
482
+ "WizardLM/WizardLM-13B-V1.2": ModelType.IFT,
483
+ "WizardLM/WizardLM-30B-V1.0": ModelType.IFT,
484
+ "WizardLM/WizardCoder-15B-V1.0": ModelType.IFT,
485
+ "gpt2": ModelType.PT,
486
+ "keyfan/vicuna-chinese-replication-v1.1": ModelType.IFT,
487
+ "nthngdy/pythia-owt2-70m-100k": ModelType.FT,
488
+ "nthngdy/pythia-owt2-70m-50k": ModelType.FT,
489
+ "quantumaikr/KoreanLM-hf": ModelType.FT,
490
+ "quantumaikr/open_llama_7b_hf": ModelType.FT,
491
+ "quantumaikr/QuantumLM-70B-hf": ModelType.IFT,
492
+ "MayaPH/FinOPT-Lincoln": ModelType.FT,
493
+ "MayaPH/FinOPT-Franklin": ModelType.FT,
494
+ "MayaPH/GodziLLa-30B": ModelType.IFT,
495
+ "MayaPH/GodziLLa-30B-plus": ModelType.IFT,
496
+ "MayaPH/FinOPT-Washington": ModelType.FT,
497
+ "ogimgio/gpt-neo-125m-neurallinguisticpioneers": ModelType.FT,
498
+ "layoric/llama-2-13b-code-alpaca": ModelType.FT,
499
+ "CobraMamba/mamba-gpt-3b": ModelType.FT,
500
+ "CobraMamba/mamba-gpt-3b-v2": ModelType.FT,
501
+ "CobraMamba/mamba-gpt-3b-v3": ModelType.FT,
502
+ "timdettmers/guanaco-33b-merged": ModelType.FT,
503
+ "elinas/chronos-33b": ModelType.IFT,
504
+ "heegyu/RedTulu-Uncensored-3B-0719": ModelType.IFT,
505
+ "heegyu/WizardVicuna-Uncensored-3B-0719": ModelType.IFT,
506
+ "heegyu/WizardVicuna-3B-0719": ModelType.IFT,
507
+ "meta-llama/Llama-2-7b-chat-hf": ModelType.RL,
508
+ "meta-llama/Llama-2-7b-hf": ModelType.PT,
509
+ "meta-llama/Llama-2-13b-chat-hf": ModelType.RL,
510
+ "meta-llama/Llama-2-13b-hf": ModelType.PT,
511
+ "meta-llama/Llama-2-70b-chat-hf": ModelType.RL,
512
+ "meta-llama/Llama-2-70b-hf": ModelType.PT,
513
+ "xhyi/PT_GPTNEO350_ATG": ModelType.FT,
514
+ "h2oai/h2ogpt-gm-oasst1-en-1024-20b": ModelType.FT,
515
+ "h2oai/h2ogpt-gm-oasst1-en-1024-open-llama-7b-preview-400bt": ModelType.FT,
516
+ "h2oai/h2ogpt-oig-oasst1-512-6_9b": ModelType.IFT,
517
+ "h2oai/h2ogpt-oasst1-512-12b": ModelType.IFT,
518
+ "h2oai/h2ogpt-oig-oasst1-256-6_9b": ModelType.IFT,
519
+ "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt": ModelType.FT,
520
+ "h2oai/h2ogpt-oasst1-512-20b": ModelType.IFT,
521
+ "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2": ModelType.FT,
522
+ "h2oai/h2ogpt-gm-oasst1-en-1024-12b": ModelType.FT,
523
+ "h2oai/h2ogpt-gm-oasst1-multilang-1024-20b": ModelType.FT,
524
+ "bofenghuang/vigogne-13b-instruct": ModelType.IFT,
525
+ "bofenghuang/vigogne-13b-chat": ModelType.FT,
526
+ "bofenghuang/vigogne-2-7b-instruct": ModelType.IFT,
527
+ "bofenghuang/vigogne-7b-instruct": ModelType.IFT,
528
+ "bofenghuang/vigogne-7b-chat": ModelType.FT,
529
+ "Vmware/open-llama-7b-v2-open-instruct": ModelType.IFT,
530
+ "VMware/open-llama-0.7T-7B-open-instruct-v1.1": ModelType.IFT,
531
+ "ewof/koishi-instruct-3b": ModelType.IFT,
532
+ "gywy/llama2-13b-chinese-v1": ModelType.FT,
533
+ "GOAT-AI/GOAT-7B-Community": ModelType.FT,
534
+ "psyche/kollama2-7b": ModelType.FT,
535
+ "TheTravellingEngineer/llama2-7b-hf-guanaco": ModelType.FT,
536
+ "beaugogh/pythia-1.4b-deduped-sharegpt": ModelType.FT,
537
+ "augtoma/qCammel-70-x": ModelType.IFT,
538
+ "Lajonbot/Llama-2-7b-chat-hf-instruct-pl-lora_unload": ModelType.IFT,
539
+ "anhnv125/pygmalion-6b-roleplay": ModelType.FT,
540
+ "64bits/LexPodLM-13B": ModelType.FT,
541
+ }
542
+
543
+
544
def model_type_from_str(type):
    """Translate a model-type label into the matching ``ModelType`` member.

    The label is matched by substring, so it may be the plain name
    (e.g. ``"pretrained"``), the emoji marker, or a string containing either.
    Markers are tried in a fixed order and the first hit wins.

    Args:
        type: Label to classify (name kept for backward compatibility even
            though it shadows the builtin).

    Returns:
        The corresponding ``ModelType`` member, or ``ModelType.Unknown`` when
        no marker is found in the label.
    """
    # Order matters: it mirrors the precedence of the original if-chain.
    marker_table = (
        (("fine-tuned", "🔶"), ModelType.FT),
        (("pretrained", "🟢"), ModelType.PT),
        (("RL-tuned", "🟦"), ModelType.RL),
        (("instruction-tuned", "⭕"), ModelType.IFT),
    )
    for markers, model_type in marker_table:
        if any(marker in type for marker in markers):
            return model_type
    return ModelType.Unknown
src/{auto_leaderboard/load_results.py → display_models/read_results.py} RENAMED
@@ -1,14 +1,13 @@
1
- from dataclasses import dataclass
2
-
3
- import glob
4
  import json
5
  import os
 
6
  from typing import Dict, List, Tuple
7
- import dateutil
8
 
9
- from src.utils_display import AutoEvalColumn, make_clickable_model
10
  import numpy as np
11
 
 
 
12
  METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
13
  BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
14
  BENCH_TO_NAME = {
@@ -31,13 +30,15 @@ class EvalResult:
31
  weight_type: str = ""
32
 
33
  def to_dict(self):
 
 
34
  if self.org is not None:
35
  base_model = f"{self.org}/{self.model}"
36
  else:
37
  base_model = f"{self.model}"
38
  data_dict = {}
39
 
40
- data_dict["eval_name"] = self.eval_name # not a column, just a save name
41
  data_dict["weight_type"] = self.weight_type # not a column, just a save name
42
  data_dict[AutoEvalColumn.precision.name] = self.precision
43
  data_dict[AutoEvalColumn.model_type.name] = self.model_type
@@ -45,6 +46,9 @@ class EvalResult:
45
  data_dict[AutoEvalColumn.dummy.name] = base_model
46
  data_dict[AutoEvalColumn.revision.name] = self.revision
47
  data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0
 
 
 
48
 
49
  for benchmark in BENCHMARKS:
50
  if benchmark not in self.results.keys():
@@ -60,10 +64,9 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
60
  with open(json_filepath) as fp:
61
  data = json.load(fp)
62
 
63
-
64
  for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
65
  if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
66
- return None, [] # we skip models with the wrong version
67
 
68
  try:
69
  config = data["config"]
@@ -87,22 +90,29 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
87
  else:
88
  org = model_split[0]
89
  model = model_split[1]
90
- result_key = f"{org}_{model}_{model_sha}_{precision}"
91
 
92
  eval_results = []
93
  for benchmark, metric in zip(BENCHMARKS, METRICS):
94
- accs = np.array([v[metric] for k, v in data["results"].items() if benchmark in k])
95
- if accs.size == 0:
96
  continue
97
  mean_acc = np.mean(accs) * 100.0
98
- eval_results.append(EvalResult(
99
- eval_name=result_key, org=org, model=model, revision=model_sha, results={benchmark: mean_acc}, precision=precision, #todo model_type=, weight_type=
100
- ))
 
 
 
 
 
 
 
101
 
102
  return result_key, eval_results
103
 
104
 
105
- def get_eval_results(is_public) -> List[EvalResult]:
106
  json_filepaths = []
107
 
108
  for root, dir, files in os.walk("eval-results"):
@@ -113,11 +123,11 @@ def get_eval_results(is_public) -> List[EvalResult]:
113
  # Sort the files by date
114
  # store results by precision maybe?
115
  try:
116
- files.sort(key=lambda x: dateutil.parser.parse(x.split("_", 1)[-1][:-5]))
117
  except dateutil.parser._parser.ParserError:
118
  files = [files[-1]]
119
 
120
- #up_to_date = files[-1]
121
  for file in files:
122
  json_filepaths.append(os.path.join(root, file))
123
 
@@ -135,7 +145,7 @@ def get_eval_results(is_public) -> List[EvalResult]:
135
  return eval_results
136
 
137
 
138
- def get_eval_results_dicts(is_public=True) -> List[Dict]:
139
- eval_results = get_eval_results(is_public)
140
 
141
  return [e.to_dict() for e in eval_results]
 
 
 
 
1
  import json
2
  import os
3
+ from dataclasses import dataclass
4
  from typing import Dict, List, Tuple
 
5
 
6
+ import dateutil
7
  import numpy as np
8
 
9
+ from src.display_models.utils import AutoEvalColumn, make_clickable_model
10
+
11
  METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
12
  BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
13
  BENCH_TO_NAME = {
 
30
  weight_type: str = ""
31
 
32
  def to_dict(self):
33
+ from src.load_from_hub import is_model_on_hub
34
+
35
  if self.org is not None:
36
  base_model = f"{self.org}/{self.model}"
37
  else:
38
  base_model = f"{self.model}"
39
  data_dict = {}
40
 
41
+ data_dict["eval_name"] = self.eval_name # not a column, just a save name
42
  data_dict["weight_type"] = self.weight_type # not a column, just a save name
43
  data_dict[AutoEvalColumn.precision.name] = self.precision
44
  data_dict[AutoEvalColumn.model_type.name] = self.model_type
 
46
  data_dict[AutoEvalColumn.dummy.name] = base_model
47
  data_dict[AutoEvalColumn.revision.name] = self.revision
48
  data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0
49
+ data_dict[AutoEvalColumn.still_on_hub.name] = (
50
+ is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
51
+ )
52
 
53
  for benchmark in BENCHMARKS:
54
  if benchmark not in self.results.keys():
 
64
  with open(json_filepath) as fp:
65
  data = json.load(fp)
66
 
 
67
  for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
68
  if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
69
+ return None, [] # we skip models with the wrong version
70
 
71
  try:
72
  config = data["config"]
 
90
  else:
91
  org = model_split[0]
92
  model = model_split[1]
93
+ result_key = f"{org}_{model}_{model_sha}_{precision}"
94
 
95
  eval_results = []
96
  for benchmark, metric in zip(BENCHMARKS, METRICS):
97
+ accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
98
+ if accs.size == 0 or any([acc is None for acc in accs]):
99
  continue
100
  mean_acc = np.mean(accs) * 100.0
101
+ eval_results.append(
102
+ EvalResult(
103
+ eval_name=result_key,
104
+ org=org,
105
+ model=model,
106
+ revision=model_sha,
107
+ results={benchmark: mean_acc},
108
+ precision=precision, # todo model_type=, weight_type=
109
+ )
110
+ )
111
 
112
  return result_key, eval_results
113
 
114
 
115
+ def get_eval_results() -> List[EvalResult]:
116
  json_filepaths = []
117
 
118
  for root, dir, files in os.walk("eval-results"):
 
123
  # Sort the files by date
124
  # store results by precision maybe?
125
  try:
126
+ files.sort(key=lambda x: dateutil.parser.parse(x.split("_", 1)[-1][:-5]))
127
  except dateutil.parser._parser.ParserError:
128
  files = [files[-1]]
129
 
130
+ # up_to_date = files[-1]
131
  for file in files:
132
  json_filepaths.append(os.path.join(root, file))
133
 
 
145
  return eval_results
146
 
147
 
148
+ def get_eval_results_dicts() -> List[Dict]:
149
+ eval_results = get_eval_results()
150
 
151
  return [e.to_dict() for e in eval_results]
src/{utils_display.py → display_models/utils.py} RENAMED
@@ -1,19 +1,27 @@
 
1
  from dataclasses import dataclass
2
 
3
- # These classes are for user facing column names, to avoid having to change them
4
- # all around the code when a modif is needed
 
 
 
 
 
5
  @dataclass
6
  class ColumnContent:
7
  name: str
8
- type: str
9
- displayed_by_default: bool
10
  hidden: bool = False
11
 
 
12
  def fields(raw_class):
13
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
14
 
 
15
  @dataclass(frozen=True)
16
- class AutoEvalColumn: # Auto evals column
17
  model_type_symbol = ColumnContent("T", "str", True)
18
  model = ColumnContent("Model", "markdown", True)
19
  average = ColumnContent("Average ⬆️", "number", True)
@@ -22,15 +30,19 @@ class AutoEvalColumn: # Auto evals column
22
  mmlu = ColumnContent("MMLU", "number", True)
23
  truthfulqa = ColumnContent("TruthfulQA", "number", True)
24
  model_type = ColumnContent("Type", "str", False)
25
- precision = ColumnContent("Precision", "str", False) #, True)
26
  license = ColumnContent("Hub License", "str", False)
27
  params = ColumnContent("#Params (B)", "number", False)
28
  likes = ColumnContent("Hub ❤️", "number", False)
 
29
  revision = ColumnContent("Model sha", "str", False, False)
30
- dummy = ColumnContent("model_name_for_query", "str", True) # dummy col to implement search bar (hidden by custom CSS)
 
 
 
31
 
32
  @dataclass(frozen=True)
33
- class EloEvalColumn: # Elo evals column
34
  model = ColumnContent("Model", "markdown", True)
35
  gpt4 = ColumnContent("GPT-4 (all)", "number", True)
36
  human_all = ColumnContent("Human (all)", "number", True)
@@ -39,7 +51,7 @@ class EloEvalColumn: # Elo evals column
39
 
40
 
41
  @dataclass(frozen=True)
42
- class EvalQueueColumn: # Queue column
43
  model = ColumnContent("model", "markdown", True)
44
  revision = ColumnContent("revision", "str", True)
45
  private = ColumnContent("private", "bool", True)
@@ -47,7 +59,13 @@ class EvalQueueColumn: # Queue column
47
  weight_type = ColumnContent("weight_type", "str", "Original")
48
  status = ColumnContent("status", "str", True)
49
 
50
- LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
 
 
 
 
 
 
51
 
52
 
53
  KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
@@ -84,16 +102,45 @@ def make_clickable_model(model_name):
84
  link = KOALA_LINK
85
  elif model_name == "oasst-12b":
86
  link = OASST_LINK
87
- #else:
88
- # link = MODEL_PAGE
89
-
90
- return model_hyperlink(link, model_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  def styled_error(error):
93
  return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
94
 
 
95
  def styled_warning(warn):
96
  return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
97
 
 
98
  def styled_message(message):
99
- return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
 
 
 
 
 
 
 
 
 
1
+ import os
2
  from dataclasses import dataclass
3
 
4
+ from huggingface_hub import HfApi
5
+
6
+ API = HfApi()
7
+
8
+
9
+ # These classes are for user facing column names, to avoid having to change them
10
+ # all around the code when a modif is needed
11
  @dataclass
12
  class ColumnContent:
13
  name: str
14
+ type: str
15
+ displayed_by_default: bool
16
  hidden: bool = False
17
 
18
+
19
  def fields(raw_class):
20
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
21
 
22
+
23
  @dataclass(frozen=True)
24
+ class AutoEvalColumn: # Auto evals column
25
  model_type_symbol = ColumnContent("T", "str", True)
26
  model = ColumnContent("Model", "markdown", True)
27
  average = ColumnContent("Average ⬆️", "number", True)
 
30
  mmlu = ColumnContent("MMLU", "number", True)
31
  truthfulqa = ColumnContent("TruthfulQA", "number", True)
32
  model_type = ColumnContent("Type", "str", False)
33
+ precision = ColumnContent("Precision", "str", False) # , True)
34
  license = ColumnContent("Hub License", "str", False)
35
  params = ColumnContent("#Params (B)", "number", False)
36
  likes = ColumnContent("Hub ❤️", "number", False)
37
+ still_on_hub = ColumnContent("Available on the hub", "bool", False)
38
  revision = ColumnContent("Model sha", "str", False, False)
39
+ dummy = ColumnContent(
40
+ "model_name_for_query", "str", True
41
+ ) # dummy col to implement search bar (hidden by custom CSS)
42
+
43
 
44
  @dataclass(frozen=True)
45
+ class EloEvalColumn: # Elo evals column
46
  model = ColumnContent("Model", "markdown", True)
47
  gpt4 = ColumnContent("GPT-4 (all)", "number", True)
48
  human_all = ColumnContent("Human (all)", "number", True)
 
51
 
52
 
53
  @dataclass(frozen=True)
54
+ class EvalQueueColumn: # Queue column
55
  model = ColumnContent("model", "markdown", True)
56
  revision = ColumnContent("revision", "str", True)
57
  private = ColumnContent("private", "bool", True)
 
59
  weight_type = ColumnContent("weight_type", "str", "Original")
60
  status = ColumnContent("status", "str", True)
61
 
62
+
63
+ LLAMAS = [
64
+ "huggingface/llama-7b",
65
+ "huggingface/llama-13b",
66
+ "huggingface/llama-30b",
67
+ "huggingface/llama-65b",
68
+ ]
69
 
70
 
71
  KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
 
102
  link = KOALA_LINK
103
  elif model_name == "oasst-12b":
104
  link = OASST_LINK
105
+
106
+ details_model_name = model_name.replace("/", "__")
107
+ details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
108
+
109
+ if not bool(os.getenv("DEBUG", "False")):
110
+ # We only add these checks when not debugging, as they are extremely slow
111
+ print(f"details_link: {details_link}")
112
+ try:
113
+ check_path = list(
114
+ API.list_files_info(
115
+ repo_id=f"open-llm-leaderboard/details_{details_model_name}",
116
+ paths="README.md",
117
+ repo_type="dataset",
118
+ )
119
+ )
120
+ print(f"check_path: {check_path}")
121
+ except Exception as err:
122
+ # No details repo for this model
123
+ print(f"No details repo for this model: {err}")
124
+ return model_hyperlink(link, model_name)
125
+
126
+ return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
127
+
128
 
129
  def styled_error(error):
130
  return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
131
 
132
+
133
  def styled_warning(warn):
134
  return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
135
 
136
+
137
  def styled_message(message):
138
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
139
+
140
+
141
+ def has_no_nan_values(df, columns):
142
+ return df[columns].notna().all(axis=1)
143
+
144
+
145
+ def has_nan_values(df, columns):
146
+ return df[columns].isna().any(axis=1)
src/init.py DELETED
@@ -1,58 +0,0 @@
1
- import os
2
- from huggingface_hub import Repository
3
-
4
- H4_TOKEN = os.environ.get("H4_TOKEN", None)
5
-
6
-
7
- def get_all_requested_models(requested_models_dir):
8
- depth = 1
9
- file_names = []
10
-
11
- for root, dirs, files in os.walk(requested_models_dir):
12
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
13
- if current_depth == depth:
14
- file_names.extend([os.path.join(root, file) for file in files])
15
-
16
- return set([file_name.lower().split("eval-queue/")[1] for file_name in file_names])
17
-
18
- def load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, QUEUE_PATH, RESULTS_PATH):
19
- eval_queue_repo = None
20
- eval_results_repo = None
21
- requested_models = None
22
-
23
- if H4_TOKEN:
24
- print("Pulling evaluation requests and results.")
25
-
26
- eval_queue_repo = Repository(
27
- local_dir=QUEUE_PATH,
28
- clone_from=QUEUE_REPO,
29
- use_auth_token=H4_TOKEN,
30
- repo_type="dataset",
31
- )
32
- eval_queue_repo.git_pull()
33
-
34
- eval_results_repo = Repository(
35
- local_dir=RESULTS_PATH,
36
- clone_from=RESULTS_REPO,
37
- use_auth_token=H4_TOKEN,
38
- repo_type="dataset",
39
- )
40
- eval_results_repo.git_pull()
41
-
42
- requested_models = get_all_requested_models("eval-queue")
43
- else:
44
- print("No HuggingFace token provided. Skipping evaluation requests and results.")
45
-
46
- return eval_queue_repo, requested_models, eval_results_repo
47
-
48
-
49
- #def load_results(model, benchmark, metric):
50
- # file_path = os.path.join("autoevals", model, f"{model}-eval_{benchmark}.json")
51
- # if not os.path.exists(file_path):
52
- # return 0.0, None
53
-
54
- # with open(file_path) as fp:
55
- # data = json.load(fp)
56
- # accs = np.array([v[metric] for k, v in data["results"].items()])
57
- # mean_acc = np.mean(accs)
58
- # return mean_acc, data["config"]["model_args"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/load_from_hub.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+ from huggingface_hub import Repository
6
+ from transformers import AutoConfig
7
+ from collections import defaultdict
8
+
9
+ from src.assets.hardcoded_evals import baseline, gpt4_values, gpt35_values
10
+ from src.display_models.get_model_metadata import apply_metadata
11
+ from src.display_models.read_results import get_eval_results_dicts, make_clickable_model
12
+ from src.display_models.utils import AutoEvalColumn, EvalQueueColumn, has_no_nan_values
13
+
14
# Whether the leaderboard runs in public mode (defaults to True when the variable is unset).
# The value is parsed explicitly because bool() of any non-empty string -- including "False"
# or "0" -- is True, which made it impossible to switch the flag off from the environment.
IS_PUBLIC = os.environ.get("IS_PUBLIC", "True").strip().lower() not in {"", "0", "false", "no", "off"}
15
+
16
+
17
def get_all_requested_models(requested_models_dir: str) -> tuple[set[str], dict[str, list[str]]]:
    """Collect every queued evaluation request and the submission dates per organisation.

    Walks ``requested_models_dir`` and reads each ``*.json`` request file located exactly
    one directory level below it.

    Args:
        requested_models_dir: Root folder containing the evaluation request files.

    Returns:
        A ``(requested_models, users_to_submission_dates)`` pair where
        ``requested_models`` is the set of ``"{model}_{revision}_{precision}"`` identifiers and
        ``users_to_submission_dates`` maps the part of the model name before the first ``/``
        to the list of ``submitted_time`` values of its requests.

    Raises:
        KeyError: If a request file lacks ``model``, ``revision`` or ``precision``.
    """
    depth = 1
    requested_models = set()
    users_to_submission_dates = defaultdict(list)

    for root, _, files in os.walk(requested_models_dir):
        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
        if current_depth != depth:
            continue

        for file in files:
            if not file.endswith(".json"):
                continue
            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                info = json.load(f)
            requested_models.add(f"{info['model']}_{info['revision']}_{info['precision']}")

            # Requests without an organisation prefix or a submission time are not tracked per user.
            if "/" not in info["model"] or "submitted_time" not in info:
                continue
            # Split on the first "/" only: a two-value unpack of split("/") raises ValueError
            # as soon as a model name contains more than one slash.
            organisation = info["model"].split("/", 1)[0]
            users_to_submission_dates[organisation].append(info["submitted_time"])

    return requested_models, users_to_submission_dates
38
+
39
+
40
def load_all_info_from_hub(
    QUEUE_REPO: str, RESULTS_REPO: str, QUEUE_PATH: str, RESULTS_PATH: str
) -> tuple[Repository, set[str], Repository, dict[str, list[str]]]:
    """Clone/pull the request and result dataset repositories and index the queued requests.

    Args:
        QUEUE_REPO: Hub id of the dataset repository holding evaluation requests.
        RESULTS_REPO: Hub id of the dataset repository holding evaluation results.
        QUEUE_PATH: Local directory the request repository is cloned into.
        RESULTS_PATH: Local directory the result repository is cloned into.

    Returns:
        ``(eval_queue_repo, requested_models, eval_results_repo, users_to_submission_dates)``;
        the second and fourth items are the pair produced by ``get_all_requested_models``.
    """
    print("Pulling evaluation requests and results.")

    eval_queue_repo = Repository(
        local_dir=QUEUE_PATH,
        clone_from=QUEUE_REPO,
        repo_type="dataset",
    )
    eval_queue_repo.git_pull()

    eval_results_repo = Repository(
        local_dir=RESULTS_PATH,
        clone_from=RESULTS_REPO,
        repo_type="dataset",
    )
    eval_results_repo.git_pull()

    # Read the requests from the directory that was just cloned instead of a hard-coded
    # "eval-queue" literal, so the function stays correct when QUEUE_PATH differs.
    # normpath drops a trailing separator, which would otherwise skew the depth computation
    # performed by get_all_requested_models.
    requested_models, users_to_submission_dates = get_all_requested_models(os.path.normpath(QUEUE_PATH))

    return eval_queue_repo, requested_models, eval_results_repo, users_to_submission_dates
64
+
65
+
66
def get_leaderboard_df(
    eval_results: Repository, eval_results_private: Repository, cols: list, benchmark_cols: list
) -> pd.DataFrame:
    """Assemble the leaderboard table from the evaluation results.

    Args:
        eval_results: Results repository to pull before reading; skipped when falsy.
        eval_results_private: Private results repository to pull; skipped when falsy.
        cols: Columns to keep in the returned frame, in display order.
        benchmark_cols: Columns that must all be non-NaN for a row to be kept.

    Returns:
        A frame sorted by average score (descending), restricted to ``cols``, rounded to two
        decimals, and without rows missing any benchmark value.
    """
    for repo in (eval_results, eval_results_private):
        if repo:
            print("Pulling evaluation results for the leaderboard.")
            repo.git_pull()

    records = get_eval_results_dicts()

    # Hardcoded reference rows: the GPT rows only appear outside public mode,
    # the baseline row is always present.
    if not IS_PUBLIC:
        records.extend([gpt4_values, gpt35_values])
    records.append(baseline)

    # Populate model type based on known hardcoded values in `metadata.py`
    apply_metadata(records)

    leaderboard = pd.DataFrame.from_records(records)
    leaderboard = leaderboard.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    leaderboard = leaderboard[cols].round(decimals=2)

    # Keep only the rows for which every benchmark has been produced.
    fully_evaluated = has_no_nan_values(leaderboard, benchmark_cols)
    return leaderboard[fully_evaluated]
92
+
93
+
94
def _load_queue_entry(file_path: str) -> dict:
    """Read one request JSON file and fill in the model link and revision display fields."""
    with open(file_path, encoding="utf-8") as fp:
        data = json.load(fp)

    data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
    data[EvalQueueColumn.revision.name] = data.get("revision", "main")
    return data


def get_evaluation_queue_df(
    eval_queue: Repository, eval_queue_private: Repository, save_path: str, cols: list
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Build the finished / running / pending tables of the evaluation queue.

    Request files are read from ``save_path`` itself and from its immediate sub-folders.
    Hidden entries (leading ``.``) and files that are not ``*.json`` are ignored.

    Args:
        eval_queue: Queue repository to pull before reading; skipped when falsy.
        eval_queue_private: Private queue repository to pull; skipped when falsy.
        save_path: Local directory containing the request files.
        cols: Columns of the returned frames.

    Returns:
        ``(df_finished, df_running, df_pending)``. "PENDING" and "RERUN" requests are both
        reported as pending; any status starting with "FINISHED" counts as finished.

    Raises:
        KeyError: If a request file has no ``model`` or ``status`` field.
    """
    if eval_queue:
        print("Pulling changes for the evaluation queue.")
        eval_queue.git_pull()
    if eval_queue_private:
        print("Pulling changes for the evaluation queue.")
        eval_queue_private.git_pull()

    all_evals = []
    for entry in os.listdir(save_path):
        if entry.startswith("."):
            continue
        entry_path = os.path.join(save_path, entry)

        if os.path.isdir(entry_path):
            # One folder per organisation/user: load the request files it contains.
            # Checking isdir explicitly avoids calling os.listdir on stray regular files.
            for sub_entry in os.listdir(entry_path):
                if sub_entry.startswith(".") or not sub_entry.endswith(".json"):
                    continue
                all_evals.append(_load_queue_entry(os.path.join(entry_path, sub_entry)))
        elif entry.endswith(".json"):
            all_evals.append(_load_queue_entry(entry_path))

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]

    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
136
+
137
+
138
def is_model_on_hub(model_name: str, revision: str) -> "tuple[bool, str | None]":
    """Check whether a model configuration can be loaded without executing remote code.

    Args:
        model_name: Identifier passed to ``AutoConfig.from_pretrained``.
        revision: Revision (branch, tag or commit) to look up.

    Returns:
        ``(True, None)`` when the configuration loads, otherwise ``(False, reason)`` where
        ``reason`` is a message fragment describing why the model was rejected. Callers must
        index the tuple; the result itself is always truthy.
    """
    try:
        AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
        return True, None

    except ValueError:
        # NOTE(review): every ValueError is reported as a trust_remote_code requirement --
        # confirm that no other ValueError can be raised while loading a config.
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
        )

    except Exception as e:
        # Best-effort check: any other failure is treated as "not available".
        print(f"Could not get the model config from the hub.: {e}")
        return False, "was not found on hub!"
src/rate_limiting.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from datetime import datetime, timezone, timedelta
3
+
4
+
5
+ def user_submission_permission(submission_name, users_to_submission_dates, rate_limit_period):
6
+ org_or_user, _ = submission_name.split("/")
7
+ if org_or_user not in users_to_submission_dates:
8
+ return 0
9
+ submission_dates = sorted(users_to_submission_dates[org_or_user])
10
+
11
+ time_limit = (datetime.now(timezone.utc) - timedelta(days=rate_limit_period)).strftime("%Y-%m-%dT%H:%M:%SZ")
12
+ submissions_after_timelimit = [d for d in submission_dates if d > time_limit]
13
+
14
+ return len(submissions_after_timelimit)
15
+
16
+