lixuejing committed
Commit a8d737a · 1 Parent(s): a4bb8a8

add rows for llmbenchmark

Files changed (4)
  1. app.py +290 -43
  2. src/about.py +11 -2
  3. src/display/utils.py +53 -7
  4. src/populate.py +2 -0
app.py CHANGED
@@ -22,12 +22,18 @@ from src.display.utils import (
    ModelType,
    fields,
    WeightType,
-    Precision
+    Precision,
+    NUMERIC_INTERVALS
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
-
+from src.tools.collections import update_collections
+from src.tools.plots import (
+    create_metric_plot_obj,
+    create_plot_df,
+    create_scores_df,
+)

def restart_space():
    API.restart_space(repo_id=REPO_ID)
@@ -48,45 +54,163 @@ try:
except Exception:
    restart_space()

-
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
+#raw_data, original_df = get_leaderboard_df(
+leaderboard_df = get_leaderboard_df(
+    results_path=EVAL_RESULTS_PATH,
+    requests_path=EVAL_REQUESTS_PATH,
+    #dynamic_path=DYNAMIC_INFO_FILE_PATH,
+    cols=COLS,
+    benchmark_cols=BENCHMARK_COLS
    )
+#update_collections(original_df.copy())
+#leaderboard_df = original_df.copy()
+
+#plot_df = create_plot_df(create_scores_df(raw_data))
+
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+#return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+#leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
+return leaderboard_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+leaderboard_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
+
+
+# Searching and filtering
+def update_table(
+    hidden_df: pd.DataFrame,
+    columns: list,
+    type_query: list,
+    precision_query: str,
+    size_query: list,
+    hide_models: list,
+    query: str,
+):
+    filtered_df = filter_models(df=hidden_df, type_query=type_query, size_query=size_query, precision_query=precision_query, hide_models=hide_models)
+    filtered_df = filter_queries(query, filtered_df)
+    df = select_columns(filtered_df, columns)
+    return df
+
+
+def load_query(request: gr.Request): # triggered only once at startup => read query parameter if it exists
+    query = request.query_params.get("query") or ""
+    return query, query # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
+
+
+def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
+    return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
+
+
+def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+    always_here_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+    dummy_col = [AutoEvalColumn.dummy.name]
+    #AutoEvalColumn.model_type_symbol.name,
+    #AutoEvalColumn.model.name,
+    # We use COLS to maintain sorting
+    filtered_df = df[
+        always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col
+    ]
+    return filtered_df
+
+
+def filter_queries(query: str, filtered_df: pd.DataFrame):
+    """Added by Abishek"""
+    final_df = []
+    if query != "":
+        queries = [q.strip() for q in query.split(";")]
+        for _q in queries:
+            _q = _q.strip()
+            if _q != "":
+                temp_filtered_df = search_table(filtered_df, _q)
+                if len(temp_filtered_df) > 0:
+                    final_df.append(temp_filtered_df)
+        if len(final_df) > 0:
+            filtered_df = pd.concat(final_df)
+            filtered_df = filtered_df.drop_duplicates(
+                subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
+            )
+
+    return filtered_df
+
+
+def filter_models(
+    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, hide_models: list
+) -> pd.DataFrame:
+    # Show all models
+    if "Private or deleted" in hide_models:
+        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
+    else:
+        filtered_df = df
+
+    if "Contains a merge/moerge" in hide_models:
+        filtered_df = filtered_df[filtered_df[AutoEvalColumn.merged.name] == False]
+
+    if "MoE" in hide_models:
+        filtered_df = filtered_df[filtered_df[AutoEvalColumn.moe.name] == False]
+
+    if "Flagged" in hide_models:
+        filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
+
+    type_emoji = [t[0] for t in type_query]
+    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
+    filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
+
+    numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
+    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
+    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
+    filtered_df = filtered_df.loc[mask]
+
+    return filtered_df
+
+leaderboard_df = filter_models(
+    df=leaderboard_df,
+    type_query=[t.to_str(" : ") for t in ModelType],
+    size_query=list(NUMERIC_INTERVALS.keys()),
+    precision_query=[i.value.name for i in Precision],
+    hide_models=[], # Deleted, merges, flagged, MoEs
+)
+
+#LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+#
+#(
+#    finished_eval_queue_df,
+#    running_eval_queue_df,
+#    pending_eval_queue_df,
+#) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+#def init_leaderboard(dataframe):
+#    if dataframe is None or dataframe.empty:
+#        raise ValueError("Leaderboard DataFrame is empty or None.")
+#    return Leaderboard(
+#        value=dataframe,
+#        datatype=[c.type for c in fields(AutoEvalColumn)],
+#        select_columns=SelectColumns(
+#            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+#            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+#            label="Select Columns to Display:",
+#        ),
+#        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+#        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+#        filter_columns=[
+#            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+#            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+#            ColumnFilter(
+#                AutoEvalColumn.params.name,
+#                type="slider",
+#                min=0.01,
+#                max=150,
+#                label="Select the number of parameters (B)",
+#            ),
+#            ColumnFilter(
+#                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+#            ),
+#        ],
+#        bool_checkboxgroup_label="Hide models",
+#        interactive=False,
+#    )


demo = gr.Blocks(css=custom_css)
@@ -95,8 +219,131 @@ with demo:
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
+        with gr.TabItem("🏅 VLM Benchmark", elem_id="vlm-benchmark-tab-table", id=0):
+            #leaderboard = init_leaderboard(LEADERBOARD_DF)
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            show_label=False,
+                            elem_id="search-bar",
+                        )
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if not c.hidden and not c.never_hidden and not c.dummy
+                            ],
+                            value=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        hide_models = gr.CheckboxGroup(
+                            label="Hide models",
+                            choices = ["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
+                            value=[],
+                            interactive=True
+                        )
+                with gr.Column(min_width=320):
+                    #with gr.Box(elem_id="box-filter"):
+                    filter_columns_type = gr.CheckboxGroup(
+                        label="Model types",
+                        choices=[t.to_str() for t in ModelType],
+                        value=[t.to_str() for t in ModelType],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    filter_columns_precision = gr.CheckboxGroup(
+                        label="Precision",
+                        choices=[i.value.name for i in Precision],
+                        value=[i.value.name for i in Precision],
+                        interactive=True,
+                        elem_id="filter-columns-precision",
+                    )
+                    filter_columns_size = gr.CheckboxGroup(
+                        label="Model sizes (in billions of parameters)",
+                        choices=list(NUMERIC_INTERVALS.keys()),
+                        value=list(NUMERIC_INTERVALS.keys()),
+                        interactive=True,
+                        elem_id="filter-columns-size",
+                    )
+
+            leaderboard_table = gr.components.Dataframe(
+                value=leaderboard_df[
+                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+                    + shown_columns.value
+                    + [AutoEvalColumn.dummy.name]
+                ],
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+                #column_widths=["2%", "33%"]
+            )
+
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=original_df[COLS],
+                headers=COLS,
+                datatype=TYPES,
+                visible=False,
+            )
+            search_bar.submit(
+                update_table,
+                [
+                    hidden_leaderboard_table_for_search,
+                    shown_columns,
+                    filter_columns_type,
+                    filter_columns_precision,
+                    filter_columns_size,
+                    hide_models,
+                    search_bar,
+                ],
+                leaderboard_table,
+            )
+
+            # Define a hidden component that will trigger a reload only if a query parameter has been set
+            hidden_search_bar = gr.Textbox(value="", visible=False)
+            hidden_search_bar.change(
+                update_table,
+                [
+                    hidden_leaderboard_table_for_search,
+                    shown_columns,
+                    filter_columns_type,
+                    filter_columns_precision,
+                    filter_columns_size,
+                    hide_models,
+                    search_bar,
+                ],
+                leaderboard_table,
+            )
+            # Check query parameter once at startup and update search bar + hidden component
+            demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
+
+            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, hide_models]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        filter_columns_type,
+                        filter_columns_precision,
+                        filter_columns_size,
+                        hide_models,
+                        search_bar,
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -201,4 +448,4 @@ with demo:
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
src/about.py CHANGED
@@ -12,8 +12,17 @@ class Task:
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("cmmmu", "acc", "CMMMU")
+    task1 = Task("cmmu", "acc", "CMMU")
+    task2 = Task("cv_bench", "acc", "CV_Bench")
+    task3 = Task("hallusion_bench", "acc", "Hallusion_Bench")
+    task4 = Task("mmmu", "acc", "MMMU")
+    task5 = Task("mmmu_pro_standard", "acc", "MMMU_Pro_Standard")
+    task6 = Task("mmmu_pro_vision", "acc", "MMMU_Pro_Vision")
+    task7 = Task("ocrbench", "acc", "OCRBench")
+    task8 = Task("math_vision", "acc", "Math_Vision")
+    task9 = Task("cvbench", "acc", "CVBench")
+    task10 = Task("ciibench", "acc", "CIIBench")

NUM_FEWSHOT = 0 # Change with your few shot
# ---------------------------------------------------
src/display/utils.py CHANGED
@@ -54,6 +54,33 @@ class EvalQueueColumn: # Queue column
    status = ColumnContent("status", "str", True)

## All the model information that we might need
+
+# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
+# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
+# HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
+# MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
+# TruthfulQA human baseline is 0.94(source: https://arxiv.org/pdf/2109.07958.pdf)
+# Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
+# GSM8K: paper
+# Define the human baselines
+human_baseline_row = {
+    AutoEvalColumn.model.name: "<p>Human performance</p>",
+    AutoEvalColumn.revision.name: "N/A",
+    AutoEvalColumn.precision.name: None,
+    AutoEvalColumn.average.name: 92.75,
+    AutoEvalColumn.merged.name: False,
+    AutoEvalColumn.arc.name: 80.0,
+    AutoEvalColumn.hellaswag.name: 95.0,
+    AutoEvalColumn.mmlu.name: 89.8,
+    AutoEvalColumn.truthfulqa.name: 94.0,
+    AutoEvalColumn.winogrande.name: 94.0,
+    AutoEvalColumn.gsm8k.name: 100,
+    AutoEvalColumn.c_sem.name: 100,
+    AutoEvalColumn.dummy.name: "human_baseline",
+    AutoEvalColumn.model_type.name: "",
+    AutoEvalColumn.flagged.name: False,
+}
+
@dataclass
class ModelDetails:
    name: str
@@ -63,9 +90,9 @@ class ModelDetails:

class ModelType(Enum):
    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
+    chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
+    merges = ModelDetails(name="base merges and moerges", symbol="🤝")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
@@ -77,10 +104,10 @@ class ModelType(Enum):
            return ModelType.FT
        if "pretrained" in type or "🟢" in type:
            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
+        if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
+            return ModelType.chat
+        if "merge" in type or "🤝" in type:
+            return ModelType.merges
        return ModelType.Unknown

class WeightType(Enum):
@@ -91,6 +118,9 @@ class WeightType(Enum):
class Precision(Enum):
    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
+    qt_8bit = ModelDetails("8bit")
+    qt_4bit = ModelDetails("4bit")
+    qt_GPTQ = ModelDetails("GPTQ")
    Unknown = ModelDetails("?")

    def from_str(precision):
@@ -98,6 +128,12 @@ class Precision(Enum):
            return Precision.float16
        if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
+        if precision in ["8bit"]:
+            return Precision.qt_8bit
+        if precision in ["4bit"]:
+            return Precision.qt_4bit
+        if precision in ["GPTQ", "None"]:
+            return Precision.qt_GPTQ
        return Precision.Unknown

# Column selection
@@ -108,3 +144,13 @@ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

BENCHMARK_COLS = [t.value.col_name for t in Tasks]

+NUMERIC_INTERVALS = {
+    "?": pd.Interval(-1, 0, closed="right"),
+    "~1.5": pd.Interval(0, 2, closed="right"),
+    "~3": pd.Interval(2, 4, closed="right"),
+    "~7": pd.Interval(4, 9, closed="right"),
+    "~13": pd.Interval(9, 20, closed="right"),
+    "~35": pd.Interval(20, 45, closed="right"),
+    "~60": pd.Interval(45, 70, closed="right"),
+    "70+": pd.Interval(70, 10000, closed="right"),
+}
src/populate.py CHANGED
@@ -6,12 +6,14 @@ import pandas as pd
from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results
+from src.leaderboard.filter_models import filter_models_flags


def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results"""
    raw_data = get_raw_eval_results(results_path, requests_path)
    all_data_json = [v.to_dict() for v in raw_data]
+    filter_models_flags(all_data_json)

    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
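
For readers tracing how the NUMERIC_INTERVALS buckets added in src/display/utils.py drive the new size filter in app.py, here is a minimal, self-contained sketch. It is illustrative only and not part of the commit; the toy DataFrame, its column names, and the chosen size buckets are hypothetical.

# Illustrative sketch (not part of the commit): parameter-size bucketing as used by filter_models() in app.py.
import pandas as pd

NUMERIC_INTERVALS = {
    "?": pd.Interval(-1, 0, closed="right"),
    "~1.5": pd.Interval(0, 2, closed="right"),
    "~3": pd.Interval(2, 4, closed="right"),
    "~7": pd.Interval(4, 9, closed="right"),
    "~13": pd.Interval(9, 20, closed="right"),
    "~35": pd.Interval(20, 45, closed="right"),
    "~60": pd.Interval(45, 70, closed="right"),
    "70+": pd.Interval(70, 10000, closed="right"),
}

# Hypothetical leaderboard rows; only the parameter-count column matters here.
df = pd.DataFrame({"model": ["tiny-vlm", "mid-vlm", "huge-vlm"], "params_b": [1.3, 7.8, 72.0]})

# Mirror the filter_models() logic: keep rows whose parameter count (in billions)
# falls inside any of the user-selected size buckets.
selected_sizes = ["~7", "70+"]
intervals = pd.IntervalIndex(sorted(NUMERIC_INTERVALS[s] for s in selected_sizes))
params = pd.to_numeric(df["params_b"], errors="coerce")
mask = params.apply(lambda x: any(intervals.contains(x)))
print(df.loc[mask])  # keeps mid-vlm (7.8 B, bucket ~7) and huge-vlm (72 B, bucket 70+)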