Clémentine committed on
Commit
294422e
·
1 Parent(s): 388bfbd

added plots back

Browse files
Files changed (4) hide show
  1. app.py +21 -21
  2. src/display/utils.py +1 -0
  3. src/populate.py +3 -3
  4. src/tools/plots.py +5 -9
app.py CHANGED
@@ -135,9 +135,9 @@ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queu
135
 
136
 
137
  # Data processing for plots now only on demand in the respective Gradio tab
138
- #def load_and_create_plots():
139
- # plot_df = create_plot_df(create_scores_df(leaderboard_df))
140
- # return plot_df
141
 
142
  def init_leaderboard(dataframe):
143
  return Leaderboard(
@@ -182,24 +182,24 @@ with demo:
182
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
183
  leaderboard = init_leaderboard(leaderboard_df)
184
 
185
- #with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
186
- # with gr.Row():
187
- # with gr.Column():
188
- # plot_df = load_and_create_plots()
189
- # chart = create_metric_plot_obj(
190
- # plot_df,
191
- # [AutoEvalColumn.average.name],
192
- # title="Average of Top Scores and Human Baseline Over Time (from last update)",
193
- # )
194
- # gr.Plot(value=chart, min_width=500)
195
- # with gr.Column():
196
- # plot_df = load_and_create_plots()
197
- # chart = create_metric_plot_obj(
198
- # plot_df,
199
- # BENCHMARK_COLS,
200
- # title="Top Scores and Human Baseline Over Time (from last update)",
201
- # )
202
- # gr.Plot(value=chart, min_width=500)
203
 
204
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
205
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
135
 
136
 
137
  # Data processing for plots now only on demand in the respective Gradio tab
138
+ def load_and_create_plots():
139
+ plot_df = create_plot_df(create_scores_df(leaderboard_df))
140
+ return plot_df
141
 
142
  def init_leaderboard(dataframe):
143
  return Leaderboard(
 
182
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
183
  leaderboard = init_leaderboard(leaderboard_df)
184
 
185
+ with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
186
+ with gr.Row():
187
+ with gr.Column():
188
+ plot_df = load_and_create_plots()
189
+ chart = create_metric_plot_obj(
190
+ plot_df,
191
+ [AutoEvalColumn.average.name],
192
+ title="Average of Top Scores and Human Baseline Over Time (from last update)",
193
+ )
194
+ gr.Plot(value=chart, min_width=500)
195
+ with gr.Column():
196
+ plot_df = load_and_create_plots()
197
+ chart = create_metric_plot_obj(
198
+ plot_df,
199
+ BENCHMARK_COLS,
200
+ title="Top Scores and Human Baseline Over Time (from last update)",
201
+ )
202
+ gr.Plot(value=chart, min_width=500)
203
 
204
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
205
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
src/display/utils.py CHANGED
@@ -93,6 +93,7 @@ auto_eval_column_dict.append(
93
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
94
  auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
95
  auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
 
96
  # Dummy column for the search bar (hidden by the custom CSS)
97
  auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
98
 
 
93
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
94
  auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
95
  auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
96
+ auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
97
  # Dummy column for the search bar (hidden by the custom CSS)
98
  auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
99
 
src/populate.py CHANGED
@@ -43,10 +43,10 @@ def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols:
43
  """Retrieve and process leaderboard data."""
44
  all_data_json = leaderboard_dataset.to_dict()
45
  num_items = leaderboard_dataset.num_rows
46
- all_data_json = [{k: all_data_json[k][ix] for k in all_data_json.keys()} for ix in range(num_items)]
47
- filter_models_flags(all_data_json)
48
 
49
- df = pd.DataFrame.from_records(all_data_json)
50
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
51
  df = df[cols].round(decimals=2)
52
  df = df[has_no_nan_values(df, benchmark_cols)]
 
43
  """Retrieve and process leaderboard data."""
44
  all_data_json = leaderboard_dataset.to_dict()
45
  num_items = leaderboard_dataset.num_rows
46
+ all_data_json_list = [{k: all_data_json[k][ix] for k in all_data_json.keys()} for ix in range(num_items)]
47
+ filter_models_flags(all_data_json_list)
48
 
49
+ df = pd.DataFrame.from_records(all_data_json_list)
50
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
51
  df = df[cols].round(decimals=2)
52
  df = df[has_no_nan_values(df, benchmark_cols)]
src/tools/plots.py CHANGED
@@ -28,22 +28,18 @@ def create_scores_df(results_df: list[dict]) -> pd.DataFrame:
28
  last_date = ""
29
  column = task.col_name
30
  for _, row in results_df.iterrows():
31
- current_model = row["full_model"]
32
  # We ignore models that are flagged/no longer on the hub/not finished
33
  to_ignore = (
34
- not row["still_on_hub"]
35
- or not row["not_flagged"]
36
  or current_model in FLAGGED_MODELS
37
- or row["status"] != "FINISHED"
38
  )
39
  if to_ignore:
40
  continue
41
 
42
- current_date = row["date"]
43
- if task.benchmark == "Average":
44
- current_score = np.mean(list(row["results"].values()))
45
- else:
46
- current_score = row["results"][task.benchmark]
47
 
48
  if current_score > current_max:
49
  if current_date == last_date and len(scores[column]) > 0:
 
28
  last_date = ""
29
  column = task.col_name
30
  for _, row in results_df.iterrows():
31
+ current_model = row[AutoEvalColumn.fullname.name]
32
  # We ignore models that are flagged/no longer on the hub/not finished
33
  to_ignore = (
34
+ not row[AutoEvalColumn.still_on_hub.name]
35
+ or not row[AutoEvalColumn.not_flagged.name]
36
  or current_model in FLAGGED_MODELS
 
37
  )
38
  if to_ignore:
39
  continue
40
 
41
+ current_date = row[AutoEvalColumn.date.name]
42
+ current_score = row[task.col_name]
 
 
 
43
 
44
  if current_score > current_max:
45
  if current_date == last_date and len(scores[column]) > 0: