WadoodAbdul committed on
Commit eb6e73c · 1 Parent(s): 7d35d7a

added m2_types and updated documentation

app.py CHANGED
@@ -12,11 +12,14 @@ from src.about import (
12
  INTRODUCTION_TEXT,
13
  LLM_BENCHMARKS_TEXT,
14
  TITLE,
 
15
  )
16
  from src.display.css_html_js import custom_css
17
  from src.display.utils import (
18
- BENCHMARK_COLS,
19
- COLS,
20
  EVAL_COLS,
21
  EVAL_TYPES,
22
  NUMERIC_INTERVALS,
@@ -52,8 +55,11 @@ except Exception:
52
  restart_space()
53
 
54
 
55
- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
56
- leaderboard_df = original_df.copy()
57
 
58
  (
59
  finished_eval_queue_df,
@@ -74,7 +80,7 @@ def update_table(
74
  ):
75
  filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
76
  filtered_df = filter_queries(query, filtered_df)
77
- df = select_columns(filtered_df, columns)
78
  return df
79
 
80
 
@@ -82,13 +88,13 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
82
  return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
83
 
84
 
85
- def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
86
  always_here_cols = [
87
  AutoEvalColumn.model_type_symbol.name,
88
  AutoEvalColumn.model.name,
89
  ]
90
  # We use COLS to maintain sorting
91
- filtered_df = df[always_here_cols + [c for c in COLS if c in df.columns and c in columns]]
92
  return filtered_df
93
 
94
 
@@ -146,6 +152,7 @@ def filter_models(
146
  demo = gr.Blocks(css=custom_css)
147
  with demo:
148
  gr.HTML(TITLE)
 
149
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
150
 
151
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
@@ -160,11 +167,11 @@ with demo:
160
  )
161
  with gr.Row():
162
  shown_columns = gr.CheckboxGroup(
163
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
164
  value=[
165
  c.name
166
  for c in fields(AutoEvalColumn)
167
- if c.displayed_by_default and not c.hidden and not c.never_hidden
168
  ],
169
  label="Select columns to show",
170
  elem_id="column-select",
@@ -197,9 +204,8 @@ with demo:
197
  # interactive=True,
198
  # elem_id="filter-columns-size",
199
  # )
200
-
201
  leaderboard_table = gr.components.Dataframe(
202
- value=leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
203
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
204
  datatype=TYPES,
205
  elem_id="leaderboard-table",
@@ -209,8 +215,8 @@ with demo:
209
 
210
  # Dummy leaderboard for handling the case when the user uses backspace key
211
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
212
- value=original_df[COLS],
213
- headers=COLS,
214
  datatype=TYPES,
215
  visible=False,
216
  )
@@ -254,11 +260,11 @@ with demo:
254
  )
255
  with gr.Row():
256
  shown_columns = gr.CheckboxGroup(
257
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
258
  value=[
259
  c.name
260
  for c in fields(AutoEvalColumn)
261
- if c.displayed_by_default and not c.hidden and not c.never_hidden
262
  ],
263
  label="Select columns to show",
264
  elem_id="column-select",
@@ -293,7 +299,7 @@ with demo:
293
  # )
294
 
295
  leaderboard_table = gr.components.Dataframe(
296
- value=leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
297
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
298
  datatype=TYPES,
299
  elem_id="leaderboard-table",
@@ -303,8 +309,8 @@ with demo:
303
 
304
  # Dummy leaderboard for handling the case when the user uses backspace key
305
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
306
- value=original_df[COLS],
307
- headers=COLS,
308
  datatype=TYPES,
309
  visible=False,
310
  )
@@ -345,87 +351,6 @@ with demo:
345
  with gr.Row():
346
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
347
 
348
- with gr.Column():
349
- with gr.Accordion(
350
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
351
- open=False,
352
- ):
353
- with gr.Row():
354
- finished_eval_table = gr.components.Dataframe(
355
- value=finished_eval_queue_df,
356
- headers=EVAL_COLS,
357
- datatype=EVAL_TYPES,
358
- row_count=5,
359
- )
360
- with gr.Accordion(
361
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
362
- open=False,
363
- ):
364
- with gr.Row():
365
- running_eval_table = gr.components.Dataframe(
366
- value=running_eval_queue_df,
367
- headers=EVAL_COLS,
368
- datatype=EVAL_TYPES,
369
- row_count=5,
370
- )
371
-
372
- with gr.Accordion(
373
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
374
- open=False,
375
- ):
376
- with gr.Row():
377
- pending_eval_table = gr.components.Dataframe(
378
- value=pending_eval_queue_df,
379
- headers=EVAL_COLS,
380
- datatype=EVAL_TYPES,
381
- row_count=5,
382
- )
383
- with gr.Row():
384
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
385
-
386
- with gr.Row():
387
- with gr.Column():
388
- model_name_textbox = gr.Textbox(label="Model name")
389
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
390
- model_type = gr.Dropdown(
391
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
392
- label="Model type",
393
- multiselect=False,
394
- value=None,
395
- interactive=True,
396
- )
397
-
398
- with gr.Column():
399
- precision = gr.Dropdown(
400
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
401
- label="Precision",
402
- multiselect=False,
403
- value="float16",
404
- interactive=True,
405
- )
406
- weight_type = gr.Dropdown(
407
- choices=[i.value.name for i in WeightType],
408
- label="Weights type",
409
- multiselect=False,
410
- value="Original",
411
- interactive=True,
412
- )
413
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
414
-
415
- submit_button = gr.Button("Submit Eval")
416
- submission_result = gr.Markdown()
417
- submit_button.click(
418
- add_new_eval,
419
- [
420
- model_name_textbox,
421
- base_model_name_textbox,
422
- revision_name_textbox,
423
- precision,
424
- weight_type,
425
- model_type,
426
- ],
427
- submission_result,
428
- )
429
 
430
  with gr.Row():
431
  with gr.Accordion("📙 Citation", open=False):
@@ -440,4 +365,4 @@ with demo:
440
  scheduler = BackgroundScheduler()
441
  scheduler.add_job(restart_space, "interval", seconds=1800)
442
  scheduler.start()
443
- demo.queue(default_concurrency_limit=40).launch()
 
12
  INTRODUCTION_TEXT,
13
  LLM_BENCHMARKS_TEXT,
14
  TITLE,
15
+ LOGO
16
  )
17
  from src.display.css_html_js import custom_css
18
  from src.display.utils import (
19
+ DATASET_BENCHMARK_COLS,
20
+ TYPES_BENCHMARK_COLS,
21
+ DATASET_COLS,
22
+ M2_TYPES_COLS,
23
  EVAL_COLS,
24
  EVAL_TYPES,
25
  NUMERIC_INTERVALS,
 
55
  restart_space()
56
 
57
 
58
+ raw_data, datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "datasets")
59
+ datasets_leaderboard_df = datasets_original_df.copy()
60
+
61
+ raw_data, types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, M2_TYPES_COLS, TYPES_BENCHMARK_COLS, "m2_types")
62
+ types_leaderboard_df = types_original_df.copy()
63
 
64
  (
65
  finished_eval_queue_df,
 
80
  ):
81
  filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
82
  filtered_df = filter_queries(query, filtered_df)
83
+ df = select_columns(filtered_df, columns, list(hidden_df.columns))
84
  return df
85
 
86
 
 
88
  return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
89
 
90
 
91
+ def select_columns(df: pd.DataFrame, columns: list, cols: list) -> pd.DataFrame:
92
  always_here_cols = [
93
  AutoEvalColumn.model_type_symbol.name,
94
  AutoEvalColumn.model.name,
95
  ]
96
  # We use COLS to maintain sorting
97
+ filtered_df = df[always_here_cols + [c for c in cols if c in df.columns and c in columns]]
98
  return filtered_df
99
 
100
 
 
152
  demo = gr.Blocks(css=custom_css)
153
  with demo:
154
  gr.HTML(TITLE)
155
+ gr.HTML(LOGO, elem_classes="logo")
156
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
157
 
158
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
 
167
  )
168
  with gr.Row():
169
  shown_columns = gr.CheckboxGroup(
170
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.m2_type_col],
171
  value=[
172
  c.name
173
  for c in fields(AutoEvalColumn)
174
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.m2_type_col
175
  ],
176
  label="Select columns to show",
177
  elem_id="column-select",
 
204
  # interactive=True,
205
  # elem_id="filter-columns-size",
206
  # )
 
207
  leaderboard_table = gr.components.Dataframe(
208
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
209
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
210
  datatype=TYPES,
211
  elem_id="leaderboard-table",
 
215
 
216
  # Dummy leaderboard for handling the case when the user uses backspace key
217
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
218
+ value=datasets_original_df[DATASET_COLS],
219
+ headers=DATASET_COLS,
220
  datatype=TYPES,
221
  visible=False,
222
  )
 
260
  )
261
  with gr.Row():
262
  shown_columns = gr.CheckboxGroup(
263
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.dataset_task_col],
264
  value=[
265
  c.name
266
  for c in fields(AutoEvalColumn)
267
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.dataset_task_col
268
  ],
269
  label="Select columns to show",
270
  elem_id="column-select",
 
299
  # )
300
 
301
  leaderboard_table = gr.components.Dataframe(
302
+ value=types_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
303
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
304
  datatype=TYPES,
305
  elem_id="leaderboard-table",
 
309
 
310
  # Dummy leaderboard for handling the case when the user uses backspace key
311
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
312
+ value=types_original_df[M2_TYPES_COLS],
313
+ headers=M2_TYPES_COLS,
314
  datatype=TYPES,
315
  visible=False,
316
  )
 
351
  with gr.Row():
352
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
353
 
354
 
355
  with gr.Row():
356
  with gr.Accordion("📙 Citation", open=False):
 
365
  scheduler = BackgroundScheduler()
366
  scheduler.add_job(restart_space, "interval", seconds=1800)
367
  scheduler.start()
368
+ demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'])
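For readers skimming the diff, here is a minimal, self-contained sketch of the updated `select_columns` helper that app.py now calls with the subset-specific column list. The literal column names (`"T"`, `"Model"`, `"NCBI"`, `"CONDITION"`) are hypothetical stand-ins for the `AutoEvalColumn` fields, not values taken from the repo.

```python
import pandas as pd

# Hypothetical stand-ins for AutoEvalColumn.model_type_symbol.name and AutoEvalColumn.model.name
ALWAYS_HERE_COLS = ["T", "Model"]

def select_columns(df: pd.DataFrame, columns: list, cols: list) -> pd.DataFrame:
    # `cols` is the subset-specific ordering (DATASET_COLS or M2_TYPES_COLS);
    # `columns` is whatever the user ticked in the "Select columns to show" box.
    return df[ALWAYS_HERE_COLS + [c for c in cols if c in df.columns and c in columns]]

df = pd.DataFrame({"T": ["?"], "Model": ["demo"], "NCBI": [0.81], "CONDITION": [0.79]})
print(select_columns(df, columns=["NCBI"], cols=["NCBI", "CONDITION"]))
# keeps the always-visible columns plus the ticked dataset column only
```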
assets/image.png ADDED
src/about.py CHANGED
@@ -7,8 +7,7 @@ class Task:
7
  benchmark: str
8
  metric: str
9
  col_name: str
10
-
11
-
12
 
13
  # Select your tasks here
14
  # ---------------------------------------------------
@@ -23,6 +22,21 @@ class Tasks(Enum):
23
  # task5 = Task("", "f1", "")
24
  # task6 = Task("", "f1", "")
25
 
26
 
27
  NUM_FEWSHOT = 0 # Change with your few shot
28
  # ---------------------------------------------------
@@ -30,28 +44,33 @@ NUM_FEWSHOT = 0 # Change with your few shot
30
 
31
  # Your leaderboard name
32
  TITLE = """<h1 align="center" id="space-title">MEDICS NER Leaderboard</h1>"""
33
-
34
  # What does your leaderboard evaluate?
35
  INTRODUCTION_TEXT = """
36
  """
37
 
38
  # Which evaluations are you running? how can people reproduce what you have?
39
  LLM_BENCHMARKS_TEXT = f"""
40
  ## About
41
- Named Entity Recogintion is a significant task for information extraction. However, we do not have a open leaderboard to rank the NER capabilities of models in the Bio-Medical domain.
42
 
43
- MEDICS NER leaderboard aims to solve this by quantifying NER performance on open-source datasets.
44
- To keep the evaluation widely relevant the entity types in the dataset are mapped to broader M2 types. More information on this mapping can be found here - M2-DATASETS-ARTICLE-LINK
45
 
46
- ### Tasks
47
- 📈 We evaluate models on X key datasets, encompassing Y entity types
48
- - NCBI - INFO
49
  - CHIA
50
  - BIORED
51
  - BC5CD
52
 
53
  ### Evaluation Metrics
54
-
55
 
56
 
57
  ## Reproducibility
@@ -60,33 +79,7 @@ To reproduce our results, here is the commands you can run:
60
  """
61
 
62
  EVALUATION_QUEUE_TEXT = """
63
- ## Some good practices before submitting a model
64
-
65
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
66
- ```python
67
- from transformers import AutoConfig, AutoModel, AutoTokenizer
68
- config = AutoConfig.from_pretrained("your model name", revision=revision)
69
- model = AutoModel.from_pretrained("your model name", revision=revision)
70
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
71
- ```
72
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
73
-
74
- Note: make sure your model is public!
75
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
76
-
77
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
78
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
79
-
80
- ### 3) Make sure your model has an open license!
81
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
82
-
83
- ### 4) Fill up your model card
84
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
85
-
86
- ## In case of model failure
87
- If your model is displayed in the `FAILED` category, its execution stopped.
88
- Make sure you have followed the above steps first.
89
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
90
  """
91
 
92
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 
7
  benchmark: str
8
  metric: str
9
  col_name: str
10
+
 
11
 
12
  # Select your tasks here
13
  # ---------------------------------------------------
 
22
  # task5 = Task("", "f1", "")
23
  # task6 = Task("", "f1", "")
24
 
25
+ @dataclass
26
+ class M2Type:
27
+ benchmark: str
28
+ metric: str
29
+ col_name: str
30
+
31
+ class M2Types(Enum):
32
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
33
+ type0 = M2Type("condition", "f1", "CONDITION")
34
+ type1 = M2Type("measurement", "f1", "MEASUREMENT")
35
+ type2 = M2Type("drug", "f1", "DRUG")
36
+ type3 = M2Type("procedure", "f1", "PROCEDURE")
37
+ type4 = M2Type("gene", "f1", "GENE")
38
+ type5 = M2Type("gene variant", "f1", "GENE VARIANT")
39
+
40
 
41
  NUM_FEWSHOT = 0 # Change with your few shot
42
  # ---------------------------------------------------
 
44
 
45
  # Your leaderboard name
46
  TITLE = """<h1 align="center" id="space-title">MEDICS NER Leaderboard</h1>"""
47
+ LOGO = """<img src="file/assets/image.png" alt="M2 X HF" width="500" height="333">"""
48
  # What does your leaderboard evaluate?
49
  INTRODUCTION_TEXT = """
50
+ Named Entity Recognition of clinical entities is crucial for advancing natural language processing (NLP) applications in healthcare, as it is foundational for tasks such as information extraction, clinical decision support, and automated documentation.
51
+ The datasets used for this evaluation encompass a wide range of medical entities, including diseases, symptoms, medications, procedures, and anatomical terms. They are sourced from openly available clinical data (including annotations) to ensure comprehensive coverage and to reflect the complexity of real-world medical language. More details about the included datasets can be found in the "About" section.
52
+ The evaluation metrics used in this leaderboard focus primarily on the F1-score, a widely recognized measure of a model's accuracy. More details about the evaluation metrics can be found in the "About" section.
53
  """
54
 
55
  # Which evaluations are you running? how can people reproduce what you have?
56
  LLM_BENCHMARKS_TEXT = f"""
57
+
58
+ Note: The purpose of this evaluation is purely academic and exploratory. The models assessed here have not been approved for clinical use, and their results should not be interpreted as clinically validated. The leaderboard serves as a platform for researchers to compare models, understand their strengths and limitations, and drive further advancements in the field of clinical NLP.
59
+
60
  ## About
61
+ The Named Clinical Entity Recognition Leaderboard is aimed at advancing the field of natural language processing in healthcare. It provides a standardized platform for evaluating and comparing the performance of various language models in recognizing named clinical entities, a critical task for applications such as clinical documentation, decision support, and information extraction. By fostering transparency and facilitating benchmarking, the leaderboard's goal is to drive innovation and improvement in NLP models. It also helps researchers identify the strengths and weaknesses of different approaches, ultimately contributing to the development of more accurate and reliable tools for clinical use. Despite its exploratory nature, the leaderboard aims to play a role in guiding research and ensuring that advancements are grounded in rigorous and comprehensive evaluations.
62
 
63
+ ## How it works
 
64
 
65
+ ### Datasets
66
+ 📈 We evaluate the models on 4 datasets, encompassing 6 entity types:
67
+ - NCBI
68
  - CHIA
69
  - BIORED
70
  - BC5CD
71
 
72
  ### Evaluation Metrics
73
+ We treat NER objects as spans (with character offsets) rather than token-level artifacts. This makes it straightforward to extend the evaluation to nested NER scenarios.
74
 
75
 
76
  ## Reproducibility
 
79
  """
80
 
81
  EVALUATION_QUEUE_TEXT = """
82
+ Follow the steps detailed in the [medics_ner](https://github.com/WadoodAbdul/medics_ner/blob/3b415e9c4c9561ce5168374813072bde36658ff4/docs/submit_to_leaderboard.md) repo to upload your model to the leaderboard.
83
  """
84
 
85
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
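As a quick reference, the new `M2Type`/`M2Types` definitions from this commit can be exercised on their own; the snippet below copies the values from the diff and shows how the entity-type display columns are derived from the enum.

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class M2Type:
    benchmark: str  # key in the results json
    metric: str     # metric key in the results json
    col_name: str   # column name displayed in the leaderboard

class M2Types(Enum):
    type0 = M2Type("condition", "f1", "CONDITION")
    type1 = M2Type("measurement", "f1", "MEASUREMENT")
    type2 = M2Type("drug", "f1", "DRUG")
    type3 = M2Type("procedure", "f1", "PROCEDURE")
    type4 = M2Type("gene", "f1", "GENE")
    type5 = M2Type("gene variant", "f1", "GENE VARIANT")

print([t.value.col_name for t in M2Types])
# ['CONDITION', 'MEASUREMENT', 'DRUG', 'PROCEDURE', 'GENE', 'GENE VARIANT']
```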
src/display/css_html_js.py CHANGED
@@ -1,4 +1,11 @@
1
  custom_css = """
2
 
3
  .markdown-text {
4
  font-size: 16px !important;
 
1
  custom_css = """
2
+ .logo {
3
+ width: 500px;
4
+ height: auto;
5
+ margin: 0 auto;
6
+ max-width: 100%;
7
+ object-fit: contain;
8
+ }
9
 
10
  .markdown-text {
11
  font-size: 16px !important;
src/display/utils.py CHANGED
@@ -4,6 +4,7 @@ from enum import Enum
4
  import pandas as pd
5
 
6
  from src.about import Tasks
 
7
 
8
 
9
  def fields(raw_class):
@@ -20,6 +21,8 @@ class ColumnContent:
20
  displayed_by_default: bool
21
  hidden: bool = False
22
  never_hidden: bool = False
23
 
24
 
25
  ## Leaderboard columns
@@ -30,7 +33,9 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
30
  # Scores
31
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
32
  for task in Tasks:
33
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False)])
34
  # Model information
35
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
36
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -128,7 +133,8 @@ class Precision(Enum):
128
 
129
 
130
  # Column selection
131
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
132
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
133
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
134
  TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
@@ -136,7 +142,8 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
136
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
137
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
138
 
139
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
140
 
141
  NUMERIC_INTERVALS = {
142
  "?": pd.Interval(-1, 0, closed="right"),
 
4
  import pandas as pd
5
 
6
  from src.about import Tasks
7
+ from src.about import M2Types
8
 
9
 
10
  def fields(raw_class):
 
21
  displayed_by_default: bool
22
  hidden: bool = False
23
  never_hidden: bool = False
24
+ dataset_task_col: bool = False
25
+ m2_type_col: bool = False
26
 
27
 
28
  ## Leaderboard columns
 
33
  # Scores
34
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
35
  for task in Tasks:
36
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True)])
37
+ for task in M2Types:
38
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, m2_type_col=True)])
39
  # Model information
40
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
41
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 
133
 
134
 
135
  # Column selection
136
+ DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.m2_type_col]
137
+ M2_TYPES_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col]
138
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
139
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
140
  TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
142
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
143
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
144
 
145
+ DATASET_BENCHMARK_COLS = [t.value.col_name for t in Tasks]
146
+ TYPES_BENCHMARK_COLS = [t.value.col_name for t in M2Types]
147
 
148
  NUMERIC_INTERVALS = {
149
  "?": pd.Interval(-1, 0, closed="right"),
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
12
  from src.submission.check_validity import is_model_on_hub
13
 
14
 
@@ -21,7 +21,8 @@ class EvalResult:
21
  org: str
22
  model: str
23
  revision: str # commit hash, "" if main
24
- results: dict
 
25
  precision: Precision = Precision.Unknown
26
  model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
  weight_type: WeightType = WeightType.Original # Original or Adapter
@@ -42,6 +43,9 @@ class EvalResult:
42
 
43
  # Precision
44
  precision = Precision.from_str(config.get("model_dtype"))
45
 
46
  # Get model and org
47
  org_and_model = config.get("model_name", config.get("model_args", None))
@@ -67,28 +71,44 @@ class EvalResult:
67
  architecture = ";".join(architectures)
68
 
69
  # Extract results available in this file (some results are split in several files)
70
- results = {}
71
  for task in Tasks:
72
  task = task.value
73
 
74
  # We average all scores of a given metric (not all metrics are present in all files)
75
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
  if accs.size == 0 or any([acc is None for acc in accs]):
77
  continue
78
 
79
  mean_acc = np.mean(accs) # * 100.0
80
- results[task.benchmark] = mean_acc
81
 
82
  return self(
83
  eval_name=result_key,
84
  full_model=full_model,
85
  org=org,
86
  model=model,
87
- results=results,
 
88
  precision=precision,
89
  revision=config.get("model_sha", ""),
90
  still_on_hub=still_on_hub,
91
  architecture=architecture,
92
  )
93
 
94
  def update_with_request_file(self, requests_path):
@@ -111,29 +131,54 @@ class EvalResult:
111
  )
112
  print(f" Args used were - {request_file=}, {requests_path=}, {self.full_model=},")
113
 
114
- def to_dict(self):
115
  """Converts the Eval Result to a dict compatible with our dataframe display"""
116
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
117
- data_dict = {
118
- "eval_name": self.eval_name, # not a column, just a save name,
119
- AutoEvalColumn.precision.name: self.precision.value.name,
120
- AutoEvalColumn.model_type.name: self.model_type.value.name,
121
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
122
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
123
- AutoEvalColumn.architecture.name: self.architecture,
124
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
125
- AutoEvalColumn.revision.name: self.revision,
126
- AutoEvalColumn.average.name: average,
127
- AutoEvalColumn.license.name: self.license,
128
- AutoEvalColumn.likes.name: self.likes,
129
- AutoEvalColumn.params.name: self.num_params,
130
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
131
- }
132
-
133
- for task in Tasks:
134
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
135
-
136
- return data_dict
137
 
138
 
139
  def get_request_file_for_model(requests_path, model_name, precision):
@@ -181,15 +226,16 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
181
 
182
  # Store results of same eval together
183
  eval_name = eval_result.eval_name
184
- if eval_name in eval_results.keys():
185
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
186
- else:
187
- eval_results[eval_name] = eval_result
188
 
189
  results = []
 
190
  for v in eval_results.values():
191
  try:
192
- v.to_dict() # we test if the dict version is complete
193
  results.append(v)
194
  except KeyError: # not all eval values present
195
  continue
 
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
+ from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, M2Types
12
  from src.submission.check_validity import is_model_on_hub
13
 
14
 
 
21
  org: str
22
  model: str
23
  revision: str # commit hash, "" if main
24
+ dataset_results: dict
25
+ m2_type_results: dict
26
  precision: Precision = Precision.Unknown
27
  model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
28
  weight_type: WeightType = WeightType.Original # Original or Adapter
 
43
 
44
  # Precision
45
  precision = Precision.from_str(config.get("model_dtype"))
46
+ model_type = ModelType.from_str(config.get("model_type", ""))
47
+ license = config.get("license", "?")
48
+ num_params = config.get("num_params", "?")
49
 
50
  # Get model and org
51
  org_and_model = config.get("model_name", config.get("model_args", None))
 
71
  architecture = ";".join(architectures)
72
 
73
  # Extract results available in this file (some results are split in several files)
74
+ dataset_results = {}
75
  for task in Tasks:
76
  task = task.value
77
 
78
  # We average all scores of a given metric (not all metrics are present in all files)
79
+ accs = np.array([v.get(task.metric, None) for k, v in data["dataset_results"].items() if task.benchmark == k])
80
  if accs.size == 0 or any([acc is None for acc in accs]):
81
  continue
82
 
83
  mean_acc = np.mean(accs) # * 100.0
84
+ dataset_results[task.benchmark] = mean_acc
85
+
86
+ types_results = {}
87
+ for m2_type in M2Types:
88
+ m2_type = m2_type.value
89
+
90
+ # We average all scores of a given metric (not all metrics are present in all files)
91
+ accs = np.array([v.get(m2_type.metric, None) for k, v in data["m2_type_results"].items() if m2_type.benchmark == k])
92
+ if accs.size == 0 or any([acc is None for acc in accs]):
93
+ continue
94
+
95
+ mean_acc = np.mean(accs) # * 100.0
96
+ types_results[m2_type.benchmark] = mean_acc
97
 
98
  return self(
99
  eval_name=result_key,
100
  full_model=full_model,
101
  org=org,
102
  model=model,
103
+ dataset_results=dataset_results,
104
+ m2_type_results=types_results,
105
  precision=precision,
106
  revision=config.get("model_sha", ""),
107
  still_on_hub=still_on_hub,
108
  architecture=architecture,
109
+ model_type=model_type,
110
+ num_params=num_params,
111
+ license=license
112
  )
113
 
114
  def update_with_request_file(self, requests_path):
 
131
  )
132
  print(f" Args used were - {request_file=}, {requests_path=}, {self.full_model=},")
133
 
134
+ def to_dict(self, subset):
135
  """Converts the Eval Result to a dict compatible with our dataframe display"""
136
+ if subset == "datasets":
137
+ average = sum([v for v in self.dataset_results.values() if v is not None]) / len(Tasks)
138
+ data_dict = {
139
+ "eval_name": self.eval_name, # not a column, just a save name,
140
+ AutoEvalColumn.precision.name: self.precision.value.name,
141
+ AutoEvalColumn.model_type.name: self.model_type.value.name,
142
+ AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
143
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
144
+ AutoEvalColumn.architecture.name: self.architecture,
145
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
146
+ AutoEvalColumn.revision.name: self.revision,
147
+ AutoEvalColumn.average.name: average,
148
+ AutoEvalColumn.license.name: self.license,
149
+ AutoEvalColumn.likes.name: self.likes,
150
+ AutoEvalColumn.params.name: self.num_params,
151
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
152
+ }
153
+
154
+ for task in Tasks:
155
+ data_dict[task.value.col_name] = self.dataset_results[task.value.benchmark]
156
+
157
+ return data_dict
158
+
159
+ if subset == "m2_types":
160
+ average = sum([v for v in self.m2_type_results.values() if v is not None]) / len(M2Types)
161
+ data_dict = {
162
+ "eval_name": self.eval_name, # not a column, just a save name,
163
+ AutoEvalColumn.precision.name: self.precision.value.name,
164
+ AutoEvalColumn.model_type.name: self.model_type.value.name,
165
+ AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
166
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
167
+ AutoEvalColumn.architecture.name: self.architecture,
168
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
169
+ AutoEvalColumn.revision.name: self.revision,
170
+ AutoEvalColumn.average.name: average,
171
+ AutoEvalColumn.license.name: self.license,
172
+ AutoEvalColumn.likes.name: self.likes,
173
+ AutoEvalColumn.params.name: self.num_params,
174
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
175
+ }
176
+
177
+ for m2_type in M2Types:
178
+ data_dict[m2_type.value.col_name] = self.m2_type_results[m2_type.value.benchmark]
179
+
180
+ return data_dict
181
+
182
 
183
 
184
  def get_request_file_for_model(requests_path, model_name, precision):
 
226
 
227
  # Store results of same eval together
228
  eval_name = eval_result.eval_name
229
+ # if eval_name in eval_results.keys():
230
+ # eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
231
+ # else:
232
+ eval_results[eval_name] = eval_result
233
 
234
  results = []
235
+ # m2_type_results = []
236
  for v in eval_results.values():
237
  try:
238
+ v.to_dict(subset="datasets") # we test if the dict version is complete
239
  results.append(v)
240
  except KeyError: # not all eval values present
241
  continue
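One behaviour of the subset-aware `to_dict` worth calling out: the average divides by the full task count (`len(Tasks)` or `len(M2Types)`), so any missing benchmark effectively counts as zero. A tiny sketch with made-up scores:

```python
# Hypothetical scores; keys mirror the benchmark names used in the diff.
dataset_results = {"ncbi": 0.80, "chia": 0.74}       # 2 of 4 datasets present
m2_type_results = {"condition": 0.79, "drug": 0.83}  # 2 of 6 entity types present

def subset_average(results: dict, n_expected: int) -> float:
    # Mirrors the diff: None values are skipped in the sum,
    # but the divisor stays the total number of expected benchmarks.
    return sum(v for v in results.values() if v is not None) / n_expected

print(subset_average(dataset_results, n_expected=4))   # averaged over all 4 datasets
print(subset_average(m2_type_results, n_expected=6))   # averaged over all 6 entity types
```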
src/populate.py CHANGED
@@ -8,13 +8,14 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
  raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
 
18
  df = df[cols].round(decimals=2)
19
 
20
  # filter out if any of the benchmarks have not been produced
 
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, subset: str) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
  raw_data = get_raw_eval_results(results_path, requests_path)
14
+ all_data_json = [v.to_dict(subset=subset) for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
+ cols = list(set(df.columns).intersection(set(cols)))
19
  df = df[cols].round(decimals=2)
20
 
21
  # filter out if any of the benchmarks have not been produced
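Finally, the defensive intersection added to `get_leaderboard_df` only keeps requested columns that actually exist in the frame for the chosen subset. A throwaway example (column names are illustrative); note that the set intersection discards ordering, which is acceptable here because app.py re-slices the frame with an explicit column list before display.

```python
import pandas as pd

df = pd.DataFrame({"Model": ["demo"], "Average": [0.805], "NCBI": [0.812]})
cols = ["Model", "Average", "NCBI", "CONDITION"]  # CONDITION belongs to the other subset

# Keep only the requested columns that exist in this subset's DataFrame.
cols = list(set(df.columns).intersection(set(cols)))
print(df[cols].round(decimals=2))
```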