g8a9 committed
Commit 61e7dfb
Parent: 8319d40

update layout

Files changed (4)
  1. app.py +67 -57
  2. src/about.py +19 -17
  3. src/display/utils.py +4 -1
  4. src/leaderboard/read_evals.py +15 -2
app.py CHANGED
@@ -142,6 +142,11 @@ def filter_models(
     return filtered_df
 
 
+shown_columns = [
+    c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden
+]
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -150,56 +155,61 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                            show_label=False,
-                            elem_id="search-bar",
-                        )
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
-                            value=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-                    with gr.Row():
-                        deleted_models_visibility = gr.Checkbox(
-                            value=False, label="Show gated/private/deleted models", interactive=True
-                        )
-                with gr.Column(min_width=320):
-                    # with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Model types",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_precision = gr.CheckboxGroup(
-                        label="Precision",
-                        choices=[i.value.name for i in Precision],
-                        value=[i.value.name for i in Precision],
-                        interactive=True,
-                        elem_id="filter-columns-precision",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )
+                # with gr.Column():
+                # with gr.Row():
+                search_bar = gr.Textbox(
+                    placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                    show_label=False,
+                    elem_id="search-bar",
+                )
+
+                # with gr.Row():
+                # shown_columns = gr.CheckboxGroup(
+                #     choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
+                #     value=[
+                #         c.name
+                #         for c in fields(AutoEvalColumn)
+                #         if c.displayed_by_default and not c.hidden and not c.never_hidden
+                #     ],
+                #     label="Select columns to show",
+                #     elem_id="column-select",
+                #     interactive=True,
+                # )
+                # with gr.Row():
+                # deleted_models_visibility = gr.Checkbox(
+                #     value=False, label="Show gated/private/deleted models", interactive=True
+                # )
+                # with gr.Column(min_width=320):
+                # with gr.Box(elem_id="box-filter"):
+                filter_columns_type = gr.CheckboxGroup(
+                    label="Model types",
+                    choices=[t.to_str() for t in ModelType],
+                    value=[t.to_str() for t in ModelType],
+                    interactive=True,
+                    elem_id="filter-columns-type",
+                )
+                # filter_columns_precision = gr.CheckboxGroup(
+                #     label="Precision",
+                #     choices=[i.value.name for i in Precision],
+                #     value=[i.value.name for i in Precision],
+                #     interactive=True,
+                #     elem_id="filter-columns-precision",
+                # )
+                filter_columns_size = gr.CheckboxGroup(
+                    label="Model sizes (in billions of parameters)",
+                    choices=list(NUMERIC_INTERVALS.keys()),
+                    value=list(NUMERIC_INTERVALS.keys()),
+                    interactive=True,
+                    elem_id="filter-columns-size",
+                )
 
             leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                value=leaderboard_df[
+                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.displayed_by_default]
+                ],  # ,# ] + shown_columns],
+                headers=[
+                    c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.displayed_by_default
+                ],  ##, if c.never_hidden] + shown_columns,
                 datatype=TYPES,
                 elem_id="leaderboard-table",
                 interactive=False,
@@ -217,31 +227,31 @@ with demo:
             update_table,
             [
                 hidden_leaderboard_table_for_search,
-                shown_columns,
+                # None,
                 filter_columns_type,
-                filter_columns_precision,
+                # filter_columns_precision,
                 filter_columns_size,
-                deleted_models_visibility,
+                # None,
                 search_bar,
             ],
             leaderboard_table,
         )
         for selector in [
-            shown_columns,
+            # shown_columns,
             filter_columns_type,
-            filter_columns_precision,
+            # filter_columns_precision,
             filter_columns_size,
-            deleted_models_visibility,
+            # deleted_models_visibility,
         ]:
             selector.change(
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
-                    shown_columns,
+                    # None,
                     filter_columns_type,
-                    filter_columns_precision,
+                    # filter_columns_precision,
                     filter_columns_size,
-                    deleted_models_visibility,
+                    # None,
                     search_bar,
                 ],
                 leaderboard_table,
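
With the column picker removed, the set of displayed columns is now fixed at load time: every `never_hidden` column plus every column marked `displayed_by_default`. A minimal, self-contained sketch of that selection predicate (the `Col` dataclass below is an illustrative stand-in, not the real `ColumnContent` from `src/display/utils.py`):

from dataclasses import dataclass

# Toy stand-in for ColumnContent, used only to illustrate the predicate
# that picks the Dataframe columns after this commit.
@dataclass
class Col:
    name: str
    displayed_by_default: bool = True
    hidden: bool = False
    never_hidden: bool = False

cols = [
    Col("Model", never_hidden=True),
    Col("Avg ⬆️"),
    Col("Precision", displayed_by_default=False),
]

display_cols = [c.name for c in cols if c.never_hidden or c.displayed_by_default]
print(display_cols)  # ['Model', 'Avg ⬆️']
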
src/about.py CHANGED
@@ -7,6 +7,7 @@ class Task:
     benchmark: str
     metric: str
     col_name: str
+    category: str
     higher_is_better: bool = True
     scale_by_100: bool = True
 
@@ -15,23 +16,24 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task1 = Task("ami_2020_aggressiveness", "f1,none", "AMI 2020 Agg")
-    task2 = Task("ami_2020_misogyny", "f1,none", "AMI 2020 Miso")
-    task0 = Task("arc_challenge_ita", "acc_norm,none", "ARC-C")
-    task4 = Task("belebele_ita", "acc_norm,none", "Belebele")
-    task3 = Task("gente_rephrasing", "acc,none", "GeNTE Neutralizing")
-    task12 = Task("haspeede2_hs", "f1,none", "HaSpeeDe2 HS")
-    task13 = Task("haspeede2_stereo", "f1,none", "HaSpeeDe2 Stereo")
-    task5 = Task("hatecheck_ita", "f1,none", "HateCheck")
-    task6 = Task("honest_ita", "acc,none", "HONEST", higher_is_better=False)
-    task14 = Task("ironita_irony", "f1,none", "IronITA Irony")
-    task15 = Task("ironita_sarcasm", "f1,none", "IronITA Sarcasm")
-    task7 = Task("itacola", "mcc,none", "ItaCoLA", scale_by_100=False)
-    task8 = Task("news_sum", "bertscore,none", "News Sum")
-    task16 = Task("sentipolc", "f1,none", "SENTIPOLC")
-    task9 = Task("squad_it", "squad_f1,get-answer", "SQuAD it")
-    task10 = Task("truthfulqa_mc2_ita", "acc,none", "TruthfulQA")
-    task11 = Task("xcopa_it", "acc,none", "XCOPA")
+    task1 = Task("ami_2020_aggressiveness", "f1,none", "AMI 2020 Agg", "NLU")
+    task2 = Task("ami_2020_misogyny", "f1,none", "AMI 2020 Miso", "NLU")
+    task0 = Task("arc_challenge_ita", "acc_norm,none", "ARC-C", "CFK")
+    task4 = Task("belebele_ita", "acc_norm,none", "Belebele", "NLU")
+    task3 = Task("gente_rephrasing", "acc,none", "GeNTE Neutralizing", "BFS")
+    task12 = Task("haspeede2_hs", "f1,none", "HaSpeeDe2 HS", "BFS")
+    task13 = Task("haspeede2_stereo", "f1,none", "HaSpeeDe2 Stereo", "BFS")
+    task5 = Task("hatecheck_ita", "f1,none", "HateCheck", "BFS")
+    task6 = Task("honest_ita", "acc,none", "HONEST", "BFS", higher_is_better=False)
+    task14 = Task("ironita_irony", "f1,none", "IronITA Irony", "NLU")
+    task15 = Task("ironita_sarcasm", "f1,none", "IronITA Sarcasm", "NLU")
+    task7 = Task("itacola", "mcc,none", "ItaCoLA", "NLU", scale_by_100=False)
+    task8 = Task("news_sum", "bertscore,none", "News Sum", "NLU")
+    task16 = Task("sentipolc", "f1,none", "SENTIPOLC", "NLU")
+    task9 = Task("squad_it", "squad_f1,get-answer", "SQuAD it", "CFK")
+    task10 = Task("truthfulqa_mc2_ita", "acc,none", "TruthfulQA", "CFK")
+    task11 = Task("xcopa_it", "acc,none", "XCOPA", "CFK")
+    task17 = Task("hellaswag_ita", "acc_norm,none", "Hellaswag-it", "CFK")
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
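
Each benchmark now carries one of three macro-categories (NLU, CFK, BFS) via the new `category` field. As a quick illustration of how the enum can be grouped by category (a sketch, not part of the commit):

from collections import defaultdict

from src.about import Tasks

# Group leaderboard display names by macro-category (NLU / CFK / BFS).
by_category = defaultdict(list)
for task in Tasks:
    by_category[task.value.category].append(task.value.col_name)

# e.g. by_category["CFK"] -> ["ARC-C", "SQuAD it", "TruthfulQA", "XCOPA", "Hellaswag-it"]
print(dict(by_category))
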
src/display/utils.py CHANGED
@@ -32,7 +32,10 @@ auto_eval_column_dict.append(["training_codebase", ColumnContent, ColumnContent(
 auto_eval_column_dict.append(["training_data", ColumnContent, ColumnContent("Data", "str", True, False)])
 
 # Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg ⬆️", "number", True)])
+auto_eval_column_dict.append(["average_NLU", ColumnContent, ColumnContent("Avg NLU", "number", True)])
+auto_eval_column_dict.append(["average_CFK", ColumnContent, ColumnContent("Avg CFK", "number", True)])
+auto_eval_column_dict.append(["average_BFS", ColumnContent, ColumnContent("Avg BFS", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 
src/leaderboard/read_evals.py CHANGED
@@ -104,7 +104,7 @@ class EvalResult:
             if task.scale_by_100:
                 mean_acc *= 100.0
 
-            results[task.benchmark] = mean_acc
+            results[task.benchmark] = {"value": mean_acc, "category": task.category}
 
         # pdb.set_trace()
 
@@ -141,7 +141,17 @@
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+
+        # compute one average score per category
+        def _get_score_category(category):
+            filtered_scores = [v["value"] for _, v in self.results.items() if v["category"] == category]
+            return sum(filtered_scores) / len(filtered_scores)
+
+        average_NLU = _get_score_category("NLU")
+        average_CFK = _get_score_category("CFK")
+        average_BFS = _get_score_category("BFS")
+        average = (average_NLU + average_CFK + average_BFS) / 3
+
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -150,6 +160,9 @@
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.average_NLU.name: average_NLU,
+            AutoEvalColumn.average_CFK.name: average_CFK,
+            AutoEvalColumn.average_BFS.name: average_BFS,
             AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.params.name: self.num_params,
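
The net effect of the new `to_dict` logic: each task contributes to the mean of its own category, and the headline average is the unweighted mean of the three category means rather than the mean over all individual tasks. A minimal sketch with made-up scores:

# Illustrative values only; mirrors the averaging added in to_dict above.
results = {
    "arc_challenge_ita": {"value": 40.0, "category": "CFK"},
    "belebele_ita": {"value": 60.0, "category": "NLU"},
    "ami_2020_misogyny": {"value": 70.0, "category": "NLU"},
    "hatecheck_ita": {"value": 80.0, "category": "BFS"},
}

def _get_score_category(category):
    scores = [v["value"] for v in results.values() if v["category"] == category]
    return sum(scores) / len(scores)

average_NLU = _get_score_category("NLU")  # (60 + 70) / 2 = 65.0
average_CFK = _get_score_category("CFK")  # 40.0
average_BFS = _get_score_category("BFS")  # 80.0

# Mean of the three category means, not the mean over all four tasks (62.5).
average = (average_NLU + average_CFK + average_BFS) / 3
print(round(average, 2))  # 61.67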