WadoodAbdul committed on
Commit eb6e73c · 1 Parent(s): 7d35d7a

added m2_types and updated documentation

app.py CHANGED
@@ -12,11 +12,14 @@ from src.about import (
12
  INTRODUCTION_TEXT,
13
  LLM_BENCHMARKS_TEXT,
14
  TITLE,
 
15
  )
16
  from src.display.css_html_js import custom_css
17
  from src.display.utils import (
18
- BENCHMARK_COLS,
19
- COLS,
20
  EVAL_COLS,
21
  EVAL_TYPES,
22
  NUMERIC_INTERVALS,
@@ -52,8 +55,11 @@ except Exception:
52
  restart_space()
53
 
54
 
55
- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
56
- leaderboard_df = original_df.copy()
57
 
58
  (
59
  finished_eval_queue_df,
@@ -74,7 +80,7 @@ def update_table(
74
  ):
75
  filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
76
  filtered_df = filter_queries(query, filtered_df)
77
- df = select_columns(filtered_df, columns)
78
  return df
79
 
80
 
@@ -82,13 +88,13 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
82
  return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
83
 
84
 
85
- def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
86
  always_here_cols = [
87
  AutoEvalColumn.model_type_symbol.name,
88
  AutoEvalColumn.model.name,
89
  ]
90
  # We use COLS to maintain sorting
91
- filtered_df = df[always_here_cols + [c for c in COLS if c in df.columns and c in columns]]
92
  return filtered_df
93
 
94
 
@@ -146,6 +152,7 @@ def filter_models(
146
  demo = gr.Blocks(css=custom_css)
147
  with demo:
148
  gr.HTML(TITLE)
 
149
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
150
 
151
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
@@ -160,11 +167,11 @@ with demo:
160
  )
161
  with gr.Row():
162
  shown_columns = gr.CheckboxGroup(
163
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
164
  value=[
165
  c.name
166
  for c in fields(AutoEvalColumn)
167
- if c.displayed_by_default and not c.hidden and not c.never_hidden
168
  ],
169
  label="Select columns to show",
170
  elem_id="column-select",
@@ -197,9 +204,8 @@ with demo:
197
  # interactive=True,
198
  # elem_id="filter-columns-size",
199
  # )
200
-
201
  leaderboard_table = gr.components.Dataframe(
202
- value=leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
203
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
204
  datatype=TYPES,
205
  elem_id="leaderboard-table",
@@ -209,8 +215,8 @@ with demo:
209
 
210
  # Dummy leaderboard for handling the case when the user uses backspace key
211
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
212
- value=original_df[COLS],
213
- headers=COLS,
214
  datatype=TYPES,
215
  visible=False,
216
  )
@@ -254,11 +260,11 @@ with demo:
254
  )
255
  with gr.Row():
256
  shown_columns = gr.CheckboxGroup(
257
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
258
  value=[
259
  c.name
260
  for c in fields(AutoEvalColumn)
261
- if c.displayed_by_default and not c.hidden and not c.never_hidden
262
  ],
263
  label="Select columns to show",
264
  elem_id="column-select",
@@ -293,7 +299,7 @@ with demo:
293
  # )
294
 
295
  leaderboard_table = gr.components.Dataframe(
296
- value=leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
297
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
298
  datatype=TYPES,
299
  elem_id="leaderboard-table",
@@ -303,8 +309,8 @@ with demo:
303
 
304
  # Dummy leaderboard for handling the case when the user uses backspace key
305
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
306
- value=original_df[COLS],
307
- headers=COLS,
308
  datatype=TYPES,
309
  visible=False,
310
  )
@@ -345,87 +351,6 @@ with demo:
345
  with gr.Row():
346
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
347
 
348
- with gr.Column():
349
- with gr.Accordion(
350
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
351
- open=False,
352
- ):
353
- with gr.Row():
354
- finished_eval_table = gr.components.Dataframe(
355
- value=finished_eval_queue_df,
356
- headers=EVAL_COLS,
357
- datatype=EVAL_TYPES,
358
- row_count=5,
359
- )
360
- with gr.Accordion(
361
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
362
- open=False,
363
- ):
364
- with gr.Row():
365
- running_eval_table = gr.components.Dataframe(
366
- value=running_eval_queue_df,
367
- headers=EVAL_COLS,
368
- datatype=EVAL_TYPES,
369
- row_count=5,
370
- )
371
-
372
- with gr.Accordion(
373
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
374
- open=False,
375
- ):
376
- with gr.Row():
377
- pending_eval_table = gr.components.Dataframe(
378
- value=pending_eval_queue_df,
379
- headers=EVAL_COLS,
380
- datatype=EVAL_TYPES,
381
- row_count=5,
382
- )
383
- with gr.Row():
384
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
385
-
386
- with gr.Row():
387
- with gr.Column():
388
- model_name_textbox = gr.Textbox(label="Model name")
389
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
390
- model_type = gr.Dropdown(
391
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
392
- label="Model type",
393
- multiselect=False,
394
- value=None,
395
- interactive=True,
396
- )
397
-
398
- with gr.Column():
399
- precision = gr.Dropdown(
400
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
401
- label="Precision",
402
- multiselect=False,
403
- value="float16",
404
- interactive=True,
405
- )
406
- weight_type = gr.Dropdown(
407
- choices=[i.value.name for i in WeightType],
408
- label="Weights type",
409
- multiselect=False,
410
- value="Original",
411
- interactive=True,
412
- )
413
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
414
-
415
- submit_button = gr.Button("Submit Eval")
416
- submission_result = gr.Markdown()
417
- submit_button.click(
418
- add_new_eval,
419
- [
420
- model_name_textbox,
421
- base_model_name_textbox,
422
- revision_name_textbox,
423
- precision,
424
- weight_type,
425
- model_type,
426
- ],
427
- submission_result,
428
- )
429
 
430
  with gr.Row():
431
  with gr.Accordion("📙 Citation", open=False):
@@ -440,4 +365,4 @@ with demo:
440
  scheduler = BackgroundScheduler()
441
  scheduler.add_job(restart_space, "interval", seconds=1800)
442
  scheduler.start()
443
- demo.queue(default_concurrency_limit=40).launch()
 
12
  INTRODUCTION_TEXT,
13
  LLM_BENCHMARKS_TEXT,
14
  TITLE,
15
+ LOGO
16
  )
17
  from src.display.css_html_js import custom_css
18
  from src.display.utils import (
19
+ DATASET_BENCHMARK_COLS,
20
+ TYPES_BENCHMARK_COLS,
21
+ DATASET_COLS,
22
+ M2_TYPES_COLS,
23
  EVAL_COLS,
24
  EVAL_TYPES,
25
  NUMERIC_INTERVALS,
 
55
  restart_space()
56
 
57
 
58
+ raw_data, datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "datasets")
59
+ datasets_leaderboard_df = datasets_original_df.copy()
60
+
61
+ raw_data, types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, M2_TYPES_COLS, TYPES_BENCHMARK_COLS, "m2_types")
62
+ types_leaderboard_df = types_original_df.copy()
63
 
64
  (
65
  finished_eval_queue_df,
 
80
  ):
81
  filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
82
  filtered_df = filter_queries(query, filtered_df)
83
+ df = select_columns(filtered_df, columns, list(hidden_df.columns))
84
  return df
85
 
86
 
 
88
  return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
89
 
90
 
91
+ def select_columns(df: pd.DataFrame, columns: list, cols: list) -> pd.DataFrame:
92
  always_here_cols = [
93
  AutoEvalColumn.model_type_symbol.name,
94
  AutoEvalColumn.model.name,
95
  ]
96
  # We use COLS to maintain sorting
97
+ filtered_df = df[always_here_cols + [c for c in cols if c in df.columns and c in columns]]
98
  return filtered_df
99
 
100
 
 
152
  demo = gr.Blocks(css=custom_css)
153
  with demo:
154
  gr.HTML(TITLE)
155
+ gr.HTML(LOGO, elem_classes="logo")
156
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
157
 
158
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
 
167
  )
168
  with gr.Row():
169
  shown_columns = gr.CheckboxGroup(
170
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.m2_type_col],
171
  value=[
172
  c.name
173
  for c in fields(AutoEvalColumn)
174
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.m2_type_col
175
  ],
176
  label="Select columns to show",
177
  elem_id="column-select",
 
204
  # interactive=True,
205
  # elem_id="filter-columns-size",
206
  # )
 
207
  leaderboard_table = gr.components.Dataframe(
208
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
209
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
210
  datatype=TYPES,
211
  elem_id="leaderboard-table",
 
215
 
216
  # Dummy leaderboard for handling the case when the user uses backspace key
217
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
218
+ value=datasets_original_df[DATASET_COLS],
219
+ headers=DATASET_COLS,
220
  datatype=TYPES,
221
  visible=False,
222
  )
 
260
  )
261
  with gr.Row():
262
  shown_columns = gr.CheckboxGroup(
263
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.dataset_task_col],
264
  value=[
265
  c.name
266
  for c in fields(AutoEvalColumn)
267
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.dataset_task_col
268
  ],
269
  label="Select columns to show",
270
  elem_id="column-select",
 
299
  # )
300
 
301
  leaderboard_table = gr.components.Dataframe(
302
+ value=types_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
303
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
304
  datatype=TYPES,
305
  elem_id="leaderboard-table",
 
309
 
310
  # Dummy leaderboard for handling the case when the user uses backspace key
311
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
312
+ value=types_original_df[M2_TYPES_COLS],
313
+ headers=M2_TYPES_COLS,
314
  datatype=TYPES,
315
  visible=False,
316
  )
 
351
  with gr.Row():
352
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
353
 
354
 
355
  with gr.Row():
356
  with gr.Accordion("📙 Citation", open=False):
 
365
  scheduler = BackgroundScheduler()
366
  scheduler.add_job(restart_space, "interval", seconds=1800)
367
  scheduler.start()
368
+ demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'])
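For readers skimming the diff, here is a minimal, self-contained sketch of the updated `select_columns` helper that app.py now calls with the subset-specific column list. The literal column names (`"T"`, `"Model"`, `"NCBI"`, `"CONDITION"`) are hypothetical stand-ins for the `AutoEvalColumn` fields, not values taken from the repo.

```python
import pandas as pd

# Hypothetical stand-ins for AutoEvalColumn.model_type_symbol.name and AutoEvalColumn.model.name
ALWAYS_HERE_COLS = ["T", "Model"]

def select_columns(df: pd.DataFrame, columns: list, cols: list) -> pd.DataFrame:
    # `cols` is the subset-specific ordering (DATASET_COLS or M2_TYPES_COLS);
    # `columns` is whatever the user ticked in the "Select columns to show" box.
    return df[ALWAYS_HERE_COLS + [c for c in cols if c in df.columns and c in columns]]

df = pd.DataFrame({"T": ["?"], "Model": ["demo"], "NCBI": [0.81], "CONDITION": [0.79]})
print(select_columns(df, columns=["NCBI"], cols=["NCBI", "CONDITION"]))
# keeps the always-visible columns plus the ticked dataset column only
```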
assets/image.png ADDED
src/about.py CHANGED
@@ -7,8 +7,7 @@ class Task:
7
  benchmark: str
8
  metric: str
9
  col_name: str
10
-
11
-
12
 
13
  # Select your tasks here
14
  # ---------------------------------------------------
@@ -23,6 +22,21 @@ class Tasks(Enum):
23
  # task5 = Task("", "f1", "")
24
  # task6 = Task("", "f1", "")
25
 
26
 
27
  NUM_FEWSHOT = 0 # Change with your few shot
28
  # ---------------------------------------------------
@@ -30,28 +44,33 @@ NUM_FEWSHOT = 0 # Change with your few shot
30
 
31
  # Your leaderboard name
32
  TITLE = """<h1 align="center" id="space-title">MEDICS NER Leaderboard</h1>"""
33
-
34
  # What does your leaderboard evaluate?
35
  INTRODUCTION_TEXT = """
36
  """
37
 
38
  # Which evaluations are you running? how can people reproduce what you have?
39
  LLM_BENCHMARKS_TEXT = f"""
40
  ## About
41
- Named Entity Recogintion is a significant task for information extraction. However, we do not have a open leaderboard to rank the NER capabilities of models in the Bio-Medical domain.
42
 
43
- MEDICS NER leaderboard aims to solve this by quantifying NER performance on open-source datasets.
44
- To keep the evaluation widely relevant the entity types in the dataset are mapped to broader M2 types. More information on this mapping can be found here - M2-DATASETS-ARTICLE-LINK
45
 
46
- ### Tasks
47
- 📈 We evaluate models on X key datasets, encompassing Y entity types
48
- - NCBI - INFO
49
  - CHIA
50
  - BIORED
51
  - BC5CD
52
 
53
  ### Evaluation Metrics
54
-
55
 
56
 
57
  ## Reproducibility
@@ -60,33 +79,7 @@ To reproduce our results, here is the commands you can run:
60
  """
61
 
62
  EVALUATION_QUEUE_TEXT = """
63
- ## Some good practices before submitting a model
64
-
65
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
66
- ```python
67
- from transformers import AutoConfig, AutoModel, AutoTokenizer
68
- config = AutoConfig.from_pretrained("your model name", revision=revision)
69
- model = AutoModel.from_pretrained("your model name", revision=revision)
70
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
71
- ```
72
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
73
-
74
- Note: make sure your model is public!
75
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
76
-
77
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
78
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
79
-
80
- ### 3) Make sure your model has an open license!
81
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
82
-
83
- ### 4) Fill up your model card
84
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
85
-
86
- ## In case of model failure
87
- If your model is displayed in the `FAILED` category, its execution stopped.
88
- Make sure you have followed the above steps first.
89
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
90
  """
91
 
92
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 
7
  benchmark: str
8
  metric: str
9
  col_name: str
10
+
 
11
 
12
  # Select your tasks here
13
  # ---------------------------------------------------
 
22
  # task5 = Task("", "f1", "")
23
  # task6 = Task("", "f1", "")
24
 
25
+ @dataclass
26
+ class M2Type:
27
+ benchmark: str
28
+ metric: str
29
+ col_name: str
30
+
31
+ class M2Types(Enum):
32
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
33
+ type0 = M2Type("condition", "f1", "CONDITION")
34
+ type1 = M2Type("measurement", "f1", "MEASUREMENT")
35
+ type2 = M2Type("drug", "f1", "DRUG")
36
+ type3 = M2Type("procedure", "f1", "PROCEDURE")
37
+ type4 = M2Type("gene", "f1", "GENE")
38
+ type5 = M2Type("gene variant", "f1", "GENE VARIANT")
39
+
40
 
41
  NUM_FEWSHOT = 0 # Change with your few shot
42
  # ---------------------------------------------------
 
44
 
45
  # Your leaderboard name
46
  TITLE = """<h1 align="center" id="space-title">MEDICS NER Leaderboard</h1>"""
47
+ LOGO = """<img src="file/assets/image.png" alt="M2 X HF" width="500" height="333">"""
48
  # What does your leaderboard evaluate?
49
  INTRODUCTION_TEXT = """
50
+ Named Entity Recognition of clinical entities is crucial for advancing natural language processing (NLP) applications in healthcare, as it is foundational for tasks such as information extraction, clinical decision support, and automated documentation.
51
+ The datasets used for this evaluation encompass a wide range of medical entities, including diseases, symptoms, medications, procedures, and anatomical terms. They are sourced from openly available clinical data (including annotations) to ensure comprehensive coverage and to reflect the complexity of real-world medical language. More details about the included datasets can be found in the "About" section.
52
+ The evaluation metrics used in this leaderboard focus primarily on the F1-score, a widely recognized measure of a model's accuracy. More details about the evaluation metrics can be found in the "About" section.
53
  """
54
 
55
  # Which evaluations are you running? how can people reproduce what you have?
56
  LLM_BENCHMARKS_TEXT = f"""
57
+
58
+ Note: The purpose of this evaluation is purely academic and exploratory. The models assessed here have not been approved for clinical use, and their results should not be interpreted as clinically validated. The leaderboard serves as a platform for researchers to compare models, understand their strengths and limitations, and drive further advancements in the field of clinical NLP.
59
+
60
  ## About
61
+ The Named Clinical Entity Recognition Leaderboard is aimed at advancing the field of natural language processing in healthcare. It provides a standardized platform for evaluating and comparing the performance of various language models in recognizing named clinical entities, a critical task for applications such as clinical documentation, decision support, and information extraction. By fostering transparency and facilitating benchmarking, the leaderboard's goal is to drive innovation and improvement in NLP models. It also helps researchers identify the strengths and weaknesses of different approaches, ultimately contributing to the development of more accurate and reliable tools for clinical use. Despite its exploratory nature, the leaderboard aims to play a role in guiding research and ensuring that advancements are grounded in rigorous and comprehensive evaluations.
62
 
63
+ ## How it works
 
64
 
65
+ ### Datasets
66
+ 📈 We evaluate the models on 4 datasets, encompassing 6 entity types:
67
+ - NCBI
68
  - CHIA
69
  - BIORED
70
  - BC5CD
71
 
72
  ### Evaluation Metrics
73
+ We treat NER objects as spans (with character offsets) rather than token-level artifacts. This makes it straightforward to extend the evaluation to nested NER scenarios.
74
 
75
 
76
  ## Reproducibility
 
79
  """
80
 
81
  EVALUATION_QUEUE_TEXT = """
82
+ Follow the steps detailed in the [medics_ner](https://github.com/WadoodAbdul/medics_ner/blob/3b415e9c4c9561ce5168374813072bde36658ff4/docs/submit_to_leaderboard.md) repo to upload your model to the leaderboard.
83
  """
84
 
85
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
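As a quick reference, the new `M2Type`/`M2Types` definitions from this commit can be exercised on their own; the snippet below copies the values from the diff and shows how the entity-type display columns are derived from the enum.

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class M2Type:
    benchmark: str  # key in the results json
    metric: str     # metric key in the results json
    col_name: str   # column name displayed in the leaderboard

class M2Types(Enum):
    type0 = M2Type("condition", "f1", "CONDITION")
    type1 = M2Type("measurement", "f1", "MEASUREMENT")
    type2 = M2Type("drug", "f1", "DRUG")
    type3 = M2Type("procedure", "f1", "PROCEDURE")
    type4 = M2Type("gene", "f1", "GENE")
    type5 = M2Type("gene variant", "f1", "GENE VARIANT")

print([t.value.col_name for t in M2Types])
# ['CONDITION', 'MEASUREMENT', 'DRUG', 'PROCEDURE', 'GENE', 'GENE VARIANT']
```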
src/display/css_html_js.py CHANGED
@@ -1,4 +1,11 @@
1
  custom_css = """
2
 
3
  .markdown-text {
4
  font-size: 16px !important;
 
1
  custom_css = """
2
+ .logo {
3
+ width: 500px;
4
+ height: auto;
5
+ margin: 0 auto;
6
+ max-width: 100%;
7
+ object-fit: contain;
8
+ }
9
 
10
  .markdown-text {
11
  font-size: 16px !important;
src/display/utils.py CHANGED
@@ -4,6 +4,7 @@ from enum import Enum
4
  import pandas as pd
5
 
6
  from src.about import Tasks
 
7
 
8
 
9
  def fields(raw_class):
@@ -20,6 +21,8 @@ class ColumnContent:
20
  displayed_by_default: bool
21
  hidden: bool = False
22
  never_hidden: bool = False
23
 
24
 
25
  ## Leaderboard columns
@@ -30,7 +33,9 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
30
  # Scores
31
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
32
  for task in Tasks:
33
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False)])
34
  # Model information
35
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
36
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -128,7 +133,8 @@ class Precision(Enum):
128
 
129
 
130
  # Column selection
131
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
132
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
133
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
134
  TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
@@ -136,7 +142,8 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
136
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
137
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
138
 
139
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
140
 
141
  NUMERIC_INTERVALS = {
142
  "?": pd.Interval(-1, 0, closed="right"),
 
4
  import pandas as pd
5
 
6
  from src.about import Tasks
7
+ from src.about import M2Types
8
 
9
 
10
  def fields(raw_class):
 
21
  displayed_by_default: bool
22
  hidden: bool = False
23
  never_hidden: bool = False
24
+ dataset_task_col: bool = False
25
+ m2_type_col: bool = False
26
 
27
 
28
  ## Leaderboard columns
 
33
  # Scores
34
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
35
  for task in Tasks:
36
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True)])
37
+ for task in M2Types:
38
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, m2_type_col=True)])
39
  # Model information
40
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
41
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 
133
 
134
 
135
  # Column selection
136
+ DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.m2_type_col]
137
+ M2_TYPES_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col]
138
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
139
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
140
  TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
142
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
143
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
144
 
145
+ DATASET_BENCHMARK_COLS = [t.value.col_name for t in Tasks]
146
+ TYPES_BENCHMARK_COLS = [t.value.col_name for t in M2Types]
147
 
148
  NUMERIC_INTERVALS = {
149
  "?": pd.Interval(-1, 0, closed="right"),
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
12
  from src.submission.check_validity import is_model_on_hub
13
 
14
 
@@ -21,7 +21,8 @@ class EvalResult:
21
  org: str
22
  model: str
23
  revision: str # commit hash, "" if main
24
- results: dict
 
25
  precision: Precision = Precision.Unknown
26
  model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
  weight_type: WeightType = WeightType.Original # Original or Adapter
@@ -42,6 +43,9 @@ class EvalResult:
42
 
43
  # Precision
44
  precision = Precision.from_str(config.get("model_dtype"))
45
 
46
  # Get model and org
47
  org_and_model = config.get("model_name", config.get("model_args", None))
@@ -67,28 +71,44 @@ class EvalResult:
67
  architecture = ";".join(architectures)
68
 
69
  # Extract results available in this file (some results are split in several files)
70
- results = {}
71
  for task in Tasks:
72
  task = task.value
73
 
74
  # We average all scores of a given metric (not all metrics are present in all files)
75
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
  if accs.size == 0 or any([acc is None for acc in accs]):
77
  continue
78
 
79
  mean_acc = np.mean(accs) # * 100.0
80
- results[task.benchmark] = mean_acc
81
 
82
  return self(
83
  eval_name=result_key,
84
  full_model=full_model,
85
  org=org,
86
  model=model,
87
- results=results,
 
88
  precision=precision,
89
  revision=config.get("model_sha", ""),
90
  still_on_hub=still_on_hub,
91
  architecture=architecture,
92
  )
93
 
94
  def update_with_request_file(self, requests_path):
@@ -111,29 +131,54 @@ class EvalResult:
111
  )
112
  print(f" Args used were - {request_file=}, {requests_path=}, {self.full_model=},")
113
 
114
- def to_dict(self):
115
  """Converts the Eval Result to a dict compatible with our dataframe display"""
116
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
117
- data_dict = {
118
- "eval_name": self.eval_name, # not a column, just a save name,
119
- AutoEvalColumn.precision.name: self.precision.value.name,
120
- AutoEvalColumn.model_type.name: self.model_type.value.name,
121
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
122
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
123
- AutoEvalColumn.architecture.name: self.architecture,
124
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
125
- AutoEvalColumn.revision.name: self.revision,
126
- AutoEvalColumn.average.name: average,
127
- AutoEvalColumn.license.name: self.license,
128
- AutoEvalColumn.likes.name: self.likes,
129
- AutoEvalColumn.params.name: self.num_params,
130
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
131
- }
132
-
133
- for task in Tasks:
134
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
135
-
136
- return data_dict
137
 
138
 
139
  def get_request_file_for_model(requests_path, model_name, precision):
@@ -181,15 +226,16 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
181
 
182
  # Store results of same eval together
183
  eval_name = eval_result.eval_name
184
- if eval_name in eval_results.keys():
185
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
186
- else:
187
- eval_results[eval_name] = eval_result
188
 
189
  results = []
 
190
  for v in eval_results.values():
191
  try:
192
- v.to_dict() # we test if the dict version is complete
193
  results.append(v)
194
  except KeyError: # not all eval values present
195
  continue
 
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
+ from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, M2Types
12
  from src.submission.check_validity import is_model_on_hub
13
 
14
 
 
21
  org: str
22
  model: str
23
  revision: str # commit hash, "" if main
24
+ dataset_results: dict
25
+ m2_type_results: dict
26
  precision: Precision = Precision.Unknown
27
  model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
28
  weight_type: WeightType = WeightType.Original # Original or Adapter
 
43
 
44
  # Precision
45
  precision = Precision.from_str(config.get("model_dtype"))
46
+ model_type = ModelType.from_str(config.get("model_type", ""))
47
+ license = config.get("license", "?")
48
+ num_params = config.get("num_params", "?")
49
 
50
  # Get model and org
51
  org_and_model = config.get("model_name", config.get("model_args", None))
 
71
  architecture = ";".join(architectures)
72
 
73
  # Extract results available in this file (some results are split in several files)
74
+ dataset_results = {}
75
  for task in Tasks:
76
  task = task.value
77
 
78
  # We average all scores of a given metric (not all metrics are present in all files)
79
+ accs = np.array([v.get(task.metric, None) for k, v in data["dataset_results"].items() if task.benchmark == k])
80
  if accs.size == 0 or any([acc is None for acc in accs]):
81
  continue
82
 
83
  mean_acc = np.mean(accs) # * 100.0
84
+ dataset_results[task.benchmark] = mean_acc
85
+
86
+ types_results = {}
87
+ for m2_type in M2Types:
88
+ m2_type = m2_type.value
89
+
90
+ # We average all scores of a given metric (not all metrics are present in all files)
91
+ accs = np.array([v.get(m2_type.metric, None) for k, v in data["m2_type_results"].items() if m2_type.benchmark == k])
92
+ if accs.size == 0 or any([acc is None for acc in accs]):
93
+ continue
94
+
95
+ mean_acc = np.mean(accs) # * 100.0
96
+ types_results[m2_type.benchmark] = mean_acc
97
 
98
  return self(
99
  eval_name=result_key,
100
  full_model=full_model,
101
  org=org,
102
  model=model,
103
+ dataset_results=dataset_results,
104
+ m2_type_results=types_results,
105
  precision=precision,
106
  revision=config.get("model_sha", ""),
107
  still_on_hub=still_on_hub,
108
  architecture=architecture,
109
+ model_type=model_type,
110
+ num_params=num_params,
111
+ license=license
112
  )
113
 
114
  def update_with_request_file(self, requests_path):
 
131
  )
132
  print(f" Args used were - {request_file=}, {requests_path=}, {self.full_model=},")
133
 
134
+ def to_dict(self, subset):
135
  """Converts the Eval Result to a dict compatible with our dataframe display"""
136
+ if subset == "datasets":
137
+ average = sum([v for v in self.dataset_results.values() if v is not None]) / len(Tasks)
138
+ data_dict = {
139
+ "eval_name": self.eval_name, # not a column, just a save name,
140
+ AutoEvalColumn.precision.name: self.precision.value.name,
141
+ AutoEvalColumn.model_type.name: self.model_type.value.name,
142
+ AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
143
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
144
+ AutoEvalColumn.architecture.name: self.architecture,
145
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
146
+ AutoEvalColumn.revision.name: self.revision,
147
+ AutoEvalColumn.average.name: average,
148
+ AutoEvalColumn.license.name: self.license,
149
+ AutoEvalColumn.likes.name: self.likes,
150
+ AutoEvalColumn.params.name: self.num_params,
151
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
152
+ }
153
+
154
+ for task in Tasks:
155
+ data_dict[task.value.col_name] = self.dataset_results[task.value.benchmark]
156
+
157
+ return data_dict
158
+
159
+ if subset == "m2_types":
160
+ average = sum([v for v in self.m2_type_results.values() if v is not None]) / len(M2Types)
161
+ data_dict = {
162
+ "eval_name": self.eval_name, # not a column, just a save name,
163
+ AutoEvalColumn.precision.name: self.precision.value.name,
164
+ AutoEvalColumn.model_type.name: self.model_type.value.name,
165
+ AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
166
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
167
+ AutoEvalColumn.architecture.name: self.architecture,
168
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
169
+ AutoEvalColumn.revision.name: self.revision,
170
+ AutoEvalColumn.average.name: average,
171
+ AutoEvalColumn.license.name: self.license,
172
+ AutoEvalColumn.likes.name: self.likes,
173
+ AutoEvalColumn.params.name: self.num_params,
174
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
175
+ }
176
+
177
+ for m2_type in M2Types:
178
+ data_dict[m2_type.value.col_name] = self.m2_type_results[m2_type.value.benchmark]
179
+
180
+ return data_dict
181
+
182
 
183
 
184
  def get_request_file_for_model(requests_path, model_name, precision):
 
226
 
227
  # Store results of same eval together
228
  eval_name = eval_result.eval_name
229
+ # if eval_name in eval_results.keys():
230
+ # eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
231
+ # else:
232
+ eval_results[eval_name] = eval_result
233
 
234
  results = []
235
+ # m2_type_results = []
236
  for v in eval_results.values():
237
  try:
238
+ v.to_dict(subset="datasets") # we test if the dict version is complete
239
  results.append(v)
240
  except KeyError: # not all eval values present
241
  continue
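One behaviour of the subset-aware `to_dict` worth calling out: the average divides by the full task count (`len(Tasks)` or `len(M2Types)`), so any missing benchmark effectively counts as zero. A tiny sketch with made-up scores:

```python
# Hypothetical scores; keys mirror the benchmark names used in the diff.
dataset_results = {"ncbi": 0.80, "chia": 0.74}       # 2 of 4 datasets present
m2_type_results = {"condition": 0.79, "drug": 0.83}  # 2 of 6 entity types present

def subset_average(results: dict, n_expected: int) -> float:
    # Mirrors the diff: None values are skipped in the sum,
    # but the divisor stays the total number of expected benchmarks.
    return sum(v for v in results.values() if v is not None) / n_expected

print(subset_average(dataset_results, n_expected=4))   # averaged over all 4 datasets
print(subset_average(m2_type_results, n_expected=6))   # averaged over all 6 entity types
```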
src/populate.py CHANGED
@@ -8,13 +8,14 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
  raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
 
18
  df = df[cols].round(decimals=2)
19
 
20
  # filter out if any of the benchmarks have not been produced
 
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, subset: str) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
  raw_data = get_raw_eval_results(results_path, requests_path)
14
+ all_data_json = [v.to_dict(subset=subset) for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
+ cols = list(set(df.columns).intersection(set(cols)))
19
  df = df[cols].round(decimals=2)
20
 
21
  # filter out if any of the benchmarks have not been produced
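Finally, the defensive intersection added to `get_leaderboard_df` only keeps requested columns that actually exist in the frame for the chosen subset. A throwaway example (column names are illustrative); note that the set intersection discards ordering, which is acceptable here because app.py re-slices the frame with an explicit column list before display.

```python
import pandas as pd

df = pd.DataFrame({"Model": ["demo"], "Average": [0.805], "NCBI": [0.812]})
cols = ["Model", "Average", "NCBI", "CONDITION"]  # CONDITION belongs to the other subset

# Keep only the requested columns that exist in this subset's DataFrame.
cols = list(set(df.columns).intersection(set(cols)))
print(df[cols].round(decimals=2))
```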