nathan-flagged-models-vis #478
by SaylorTwift - opened

Files changed (3):
  1. app.py (+14 -3)
  2. src/display/utils.py (+2 -0)
  3. src/leaderboard/filter_models.py (+14 -0)
app.py CHANGED

@@ -78,9 +78,10 @@ def update_table(
     precision_query: str,
     size_query: list,
     show_deleted: bool,
+    show_flagged: bool,
     query: str,
 ):
-    filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
+    filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted, show_flagged)
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, columns)
     return df
@@ -128,7 +129,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):


 def filter_models(
-    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
+    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool, show_flagged: bool
 ) -> pd.DataFrame:
     # Show all models
     if show_deleted:
@@ -136,6 +137,9 @@ def filter_models(
     else: # Show only still on the hub models
         filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]

+    if not show_flagged:
+        filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
+
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
@@ -147,6 +151,7 @@ def filter_models(

     return filtered_df

+leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision], False, False)

 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -183,6 +188,9 @@ with demo:
                     deleted_models_visibility = gr.Checkbox(
                         value=False, label="Show private/deleted models", interactive=True
                     )
+                    flagged_models_visibility = gr.Checkbox(
+                        value=False, label="Show flagged models", interactive=True
+                    )
                 with gr.Column(min_width=320):
                     #with gr.Box(elem_id="box-filter"):
                     filter_columns_type = gr.CheckboxGroup(
@@ -237,6 +245,7 @@ with demo:
                     filter_columns_precision,
                     filter_columns_size,
                     deleted_models_visibility,
+                    flagged_models_visibility,
                     search_bar,
                 ],
                 leaderboard_table,
@@ -253,6 +262,7 @@ with demo:
                     filter_columns_precision,
                     filter_columns_size,
                     deleted_models_visibility,
+                    flagged_models_visibility,
                     search_bar,
                 ],
                 leaderboard_table,
@@ -260,7 +270,7 @@ with demo:
             # Check query parameter once at startup and update search bar + hidden component
             demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])

-            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
+            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility, flagged_models_visibility]:
                 selector.change(
                     update_table,
                     [
@@ -270,6 +280,7 @@ with demo:
                         filter_columns_precision,
                         filter_columns_size,
                         deleted_models_visibility,
+                        flagged_models_visibility,
                         search_bar,
                     ],
                     leaderboard_table,
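
To see the effect of the new "Show flagged models" checkbox in isolation, here is a minimal sketch of the filtering step added to filter_models, run against a toy DataFrame. The filter_flagged helper and the example data are illustrative assumptions rather than code from this PR; only the boolean mask on the "Flagged" column mirrors the diff above.

import pandas as pd

# Toy stand-in for the leaderboard DataFrame; "Flagged" mirrors AutoEvalColumn.flagged.name.
toy_df = pd.DataFrame(
    {
        "model_name_for_query": ["model-a", "model-b", "model-c"],
        "Flagged": [False, True, False],
    }
)

def filter_flagged(df: pd.DataFrame, show_flagged: bool) -> pd.DataFrame:
    # Same boolean mask as the new branch in filter_models: drop flagged rows
    # unless the checkbox is ticked.
    if not show_flagged:
        return df[df["Flagged"] == False]  # noqa: E712 -- kept as == False to match app.py's style
    return df

print(filter_flagged(toy_df, show_flagged=False)["model_name_for_query"].tolist())  # ['model-a', 'model-c']
print(filter_flagged(toy_df, show_flagged=True)["model_name_for_query"].tolist())   # ['model-a', 'model-b', 'model-c']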
src/display/utils.py CHANGED

@@ -51,6 +51,7 @@ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, False)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])

@@ -80,6 +81,7 @@ baseline_row = {
     AutoEvalColumn.gsm8k.name: 0.21,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
+    AutoEvalColumn.flagged.name: False,
 }

 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
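
For context on this change: judging from how ColumnContent is called elsewhere in utils.py, the container appears to be a small dataclass roughly like the sketch below. The field names and defaults are assumptions inferred from usage, not the actual definition; the point is that the added line registers a boolean "Flagged" column that is not displayed by default.

from dataclasses import dataclass

@dataclass
class ColumnContent:
    # Assumed field names, inferred from the positional and keyword arguments used in utils.py.
    name: str                   # column header shown in the table, e.g. "Flagged"
    type: str                   # cell dtype as a string: "str", "bool", "number", ...
    displayed_by_default: bool  # whether the column appears in the default view
    hidden: bool = False        # assumed meaning: kept out of the user-facing column selector
    dummy: bool = False         # used for the hidden search column

# The new entry then reads as a boolean column that is not shown by default:
flagged_col = ColumnContent("Flagged", "bool", False, False)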
src/leaderboard/filter_models.py CHANGED

@@ -14,6 +14,17 @@ FLAGGED_MODELS = {
     "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
     "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
     "fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/444",
+    "jan-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
+    "rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
+    "rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
+    "GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
+    "GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
+    "GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
+    "viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
+    "GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
+    "janai-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
+    "ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
+    "fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
 }

 # Models which have been requested by orgs to not be submitted on the leaderboard
@@ -36,6 +47,9 @@ def flag_models(leaderboard_data: list[dict]):
             model_data[
                 AutoEvalColumn.model.name
             ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
+            model_data[AutoEvalColumn.flagged.name] = True
+        else:
+            model_data[AutoEvalColumn.flagged.name] = False


 def remove_forbidden_models(leaderboard_data: list[dict]):
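
Taken together, the filter_models.py changes mean every leaderboard row now carries an explicit flag bit that app.py can filter on, in addition to the warning appended to the displayed model name. The simplified sketch below illustrates that behaviour with plain dictionary keys instead of AutoEvalColumn and with a trimmed-down flag list; it is an illustration of the logic, not the module's actual code.

# Hypothetical, trimmed stand-in for FLAGGED_MODELS.
FLAGGED_MODELS_SKETCH = {
    "jan-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
}

def flag_models_sketch(leaderboard_data: list[dict]) -> None:
    # Flagged rows keep the warning appended to the displayed name, and every row now
    # also gets a boolean "Flagged" field so the new checkbox can hide or show them.
    for model_data in leaderboard_data:
        name = model_data["model_name_for_query"]
        if name in FLAGGED_MODELS_SKETCH:
            issue_link = FLAGGED_MODELS_SKETCH[name]
            model_data["model"] = f"{model_data['model']} has been flagged! {issue_link}"
            model_data["Flagged"] = True
        else:
            model_data["Flagged"] = False

rows = [
    {"model_name_for_query": "jan-hq/trinity-v1", "model": "jan-hq/trinity-v1"},
    {"model_name_for_query": "some-org/unflagged-model", "model": "some-org/unflagged-model"},
]
flag_models_sketch(rows)
print([row["Flagged"] for row in rows])  # [True, False]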