“WadoodAbdul”
commited on
Commit
·
215aa92
1
Parent(s):
4b57d62
added model submission functionality
Browse files- app.py +158 -1
- src/about.py +38 -10
- src/display/utils.py +14 -5
- src/submission/check_validity.py +15 -5
- src/submission/submit.py +102 -17
app.py CHANGED

@@ -27,13 +27,14 @@ from src.display.utils import (
     AutoEvalColumn,
     ModelType,
     ModelArch,
+    PromptTemplateName,
     Precision,
     WeightType,
     fields,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+from src.submission.submit import add_new_eval, PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG


 def restart_space():

@@ -155,7 +156,61 @@ def filter_models(

     return filtered_df

+def change_submit_request_form(model_architecture):
+    match model_architecture:
+        case "Encoder":
+            return (
+                gr.Textbox(label="Threshold for gliner models", visible=False),
+                gr.Radio(
+                    choices=["True", "False"],
+                    label="Load GLiNER Tokenizer",
+                    visible=False
+                ),
+                gr.Dropdown(
+                    choices=[prompt_template.value for prompt_template in PromptTemplateName],
+                    label="Prompt for generation",
+                    multiselect=False,
+                    # value="HTML Highlighted Spans",
+                    interactive=True,
+                    visible=False
+                )
+            )
+        case "Decoder":
+            return (
+                gr.Textbox(label="Threshold for gliner models", visible=False),
+                gr.Radio(
+                    choices=["True", "False"],
+                    label="Load GLiNER Tokenizer",
+                    visible=False
+                ),
+                gr.Dropdown(
+                    choices=[prompt_template.value for prompt_template in PromptTemplateName],
+                    label="Prompt for generation",
+                    multiselect=False,
+                    # value="HTML Highlighted Spans",
+                    interactive=True,
+                    visible=True
+                )
+            )
+        case "GLiNER Encoder":
+            return (
+                gr.Textbox(label="Threshold for gliner models", visible=True),
+                gr.Radio(
+                    choices=["True", "False"],
+                    label="Load GLiNER Tokenizer",
+                    visible=True
+                ),
+                gr.Dropdown(
+                    choices=[prompt_template.value for prompt_template in PromptTemplateName],
+                    label="Prompt for generation",
+                    multiselect=False,
+                    # value="HTML Highlighted Spans",
+                    interactive=True,
+                    visible=False
+                )
+            )

+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)

@@ -370,6 +425,108 @@ with demo:
             with gr.Row():
                 gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

+            with gr.Column():
+                with gr.Accordion(
+                    f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        finished_eval_table = gr.components.Dataframe(
+                            value=finished_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+                with gr.Accordion(
+                    f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        running_eval_table = gr.components.Dataframe(
+                            value=running_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+
+                with gr.Accordion(
+                    f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        pending_eval_table = gr.components.Dataframe(
+                            value=pending_eval_queue_df,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+            with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+            with gr.Row():
+                with gr.Column():
+
+                    model_name_textbox = gr.Textbox(label="Model name")
+
+                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+
+                    model_arch = gr.Radio(
+                        choices=[t.to_str(" : ") for t in ModelArch if t != ModelArch.Unknown],
+                        label="Model Architecture",
+                    )
+
+                    model_type = gr.Dropdown(
+                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                        label="Model type",
+                        multiselect=False,
+                        value=None,
+                        interactive=True,
+                    )
+
+                with gr.Column():
+                    label_normalization_map = gr.Textbox(lines=6, label="Label Normalization Map", placeholder=PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG)
+                    gliner_threshold = gr.Textbox(label="Threshold for GLiNER models", visible=False)
+                    gliner_tokenizer_bool = gr.Radio(
+                        choices=["True", "False"],
+                        label="Load GLiNER Tokenizer",
+                        visible=False
+                    )
+                    prompt_name = gr.Dropdown(
+                        choices=[prompt_template.value for prompt_template in PromptTemplateName],
+                        label="Prompt for generation",
+                        multiselect=False,
+                        value="HTML Highlighted Spans",
+                        interactive=True,
+                        visible=False
+                    )  # should be a dropdown
+
+                    # parsing_function - this is tied to the prompt & therefore does not need to be specified
+                    # generation_parameters = gr.Textbox(label="Generation params in json format") just default for now
+
+            model_arch.change(fn=change_submit_request_form, inputs=model_arch, outputs=[
+                gliner_threshold,
+                gliner_tokenizer_bool,
+                prompt_name])
+
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    # base_model_name_textbox,
+                    revision_name_textbox,
+                    model_arch,
+                    label_normalization_map,
+                    gliner_threshold,
+                    gliner_tokenizer_bool,
+                    prompt_name,
+                    # weight_type,
+                    model_type,
+                ],
+                submission_result,
+            )
+

     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
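Note: the dynamic submission form above works by wiring the Model Architecture radio's `.change` event to a handler that returns one update per dependent component. Below is a minimal, self-contained sketch of the same pattern, not the Space's actual code: the component labels are illustrative, and it uses `gr.update(visible=...)`, which is interchangeable with returning freshly constructed components as `change_submit_request_form` does.

import gradio as gr

def toggle_fields(arch):
    # Show GLiNER-specific fields only for the GLiNER choice,
    # and the prompt dropdown only for the decoder choice.
    return (
        gr.update(visible=arch == "GLiNER Encoder"),  # threshold textbox
        gr.update(visible=arch == "GLiNER Encoder"),  # tokenizer radio
        gr.update(visible=arch == "Decoder"),         # prompt dropdown
    )

with gr.Blocks() as demo:
    arch = gr.Radio(["Encoder", "Decoder", "GLiNER Encoder"], label="Model Architecture")
    threshold = gr.Textbox(label="Threshold", visible=False)
    tokenizer = gr.Radio(["True", "False"], label="Load GLiNER Tokenizer", visible=False)
    prompt = gr.Dropdown(["universal_ner"], label="Prompt for generation", visible=False)
    arch.change(fn=toggle_fields, inputs=arch, outputs=[threshold, tokenizer, prompt])

if __name__ == "__main__":
    demo.launch()

Running this locally shows the GLiNER fields appearing only for the GLiNER choice and the prompt dropdown only for decoders, mirroring the behaviour added in this commit.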
src/about.py CHANGED

@@ -43,27 +43,32 @@ NUM_FEWSHOT = 0 # Change with your few shot


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title"> NER Leaderboard</h1>"""
+TITLE = """""" #<h1 align="center" id="space-title"> NER Leaderboard</h1>"""
 LOGO = """<img src="file/assets/image.png" alt="Clinical X HF" width="500" height="333">"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Named Entity Recognition of clinical entities is crucial for advancing natural language processing (NLP) applications in healthcare as […]
-
-The […]
+The main goal of the Named Clinical Entity Recognition Leaderboard is to evaluate and benchmark the performance of various language models in accurately identifying and classifying named clinical entities across diverse medical domains. This task is crucial for advancing natural language processing (NLP) applications in healthcare, as accurate entity recognition is foundational for tasks such as information extraction, clinical decision support, and automated documentation.
+
+The datasets used for this evaluation encompass a wide range of medical entities, including diseases, symptoms, medications, procedures and anatomical terms. These datasets are sourced from openly available clinical data (including annotations) to ensure comprehensive coverage and reflect the complexity of real-world medical language. More details about the datasets included can be found below ("About" section).
+
+The evaluation metrics used in this leaderboard focus primarily on the F1-score, a widely recognized measure of a model's accuracy. The different modes of evaluation are also described below.
+
+Disclaimer: It is important to note that the purpose of this evaluation is purely academic and exploratory. The models assessed here have not been approved for clinical use, and their results should not be interpreted as clinically validated. The leaderboard serves as a platform for researchers to compare models, understand their strengths and limitations, and drive further advancements in the field of clinical NLP.
 """

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""

-#### Disclaimer & Advisory
-
-It is important to note that the purpose of this evaluation is purely academic and exploratory. The models assessed here have not been approved for clinical use, and their results should not be interpreted as clinically validated. The leaderboard serves as a platform for researchers to compare models, understand their strengths and limitations, and drive further advancements in the field of clinical NLP.
-
-## About
 The Named Clinical Entity Recognition Leaderboard is aimed at advancing the field of natural language processing in healthcare. It provides a standardized platform for evaluating and comparing the performance of various language models in recognizing named clinical entities, a critical task for applications such as clinical documentation, decision support, and information extraction. By fostering transparency and facilitating benchmarking, the leaderboard's goal is to drive innovation and improvement in NLP models. It also helps researchers identify the strengths and weaknesses of different approaches, ultimately contributing to the development of more accurate and reliable tools for clinical use. Despite its exploratory nature, the leaderboard aims to play a role in guiding research and ensuring that advancements are grounded in rigorous and comprehensive evaluations.
+## About

 ## How it works

+### Evaluation method and metrics
+When training a Named Entity Recognition (NER) system, the most common evaluation methods involve measuring precision, recall, and F1-score at the token level. While these metrics are useful for fine-tuning the NER system, evaluating the predicted named entities for downstream tasks requires metrics at the full named-entity level. We include both evaluation methods: token-based and span-based. We provide an example below which helps in understanding the difference between the methods.
+Example Sentence: "The patient was diagnosed with a skin cancer disease."
+For simplicity, let's assume the an example sentence which contains 10 tokens, with a single two-token disease entity (as shown in the figure below).
+
 ### Datasets
 📈 We evaluate the models on 4 datasets, encompassing 6 entity types
 - [NCBI](https://huggingface.co/datasets/m42-health/clinical_ncbi)

@@ -81,7 +86,30 @@ To reproduce our results, follow the steps detailed [here](https://github.com/Wa
 """

 EVALUATION_QUEUE_TEXT = """
-
+
+Currently, the benchmark supports evaluation for models hosted on the huggingface hub and of type encoder, decoder or gliner type models.
+If your model needs a custom implementation, follow the steps outlined in the [medics_ner](https://github.com/WadoodAbdul/medics_ner/blob/master/docs/custom_model_implementation.md) repo or reach out to our team!
+
+
+### Fields Explanation
+
+#### Model Type:
+- Fine-Tuned: If the training data consisted of any split/variation of the datasets on the leaderboard.
+- Zero-Shot: If the model did not have any exposure to the datasets on the leaderboard while training.
+
+#### Model Architecture:
+- Encoder: The standard transformer encoder architecture with a token classification head on top.
+- Decoder: Transformer based autoregressive token generation model.
+- GLiNER: Architecture outlined in the [GLiNER Paper](https://arxiv.org/abs/2311.08526)
+
+#### Label Normalization Map:
+Not all models have been tuned to output the ner label names in the clinical datasets on this leaderboard. Some models cater to the same entity names with a synonym of it.
+The normalization map can be used to ensure that the models's output are aligned with the labels expected in the datasets.
+
+Note: Multiple model labels can be mapped to a single entity type in the leaderboard dataset. Ex: 'synonym' and 'disease' to 'condition'
+
+
+Upon successful submission of your request, your model's result would be updated on the leaderboard within 5 working days!
 """

 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
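Note: the "Label Normalization Map" described in EVALUATION_QUEUE_TEXT maps a model's own label names onto the entity types each leaderboard dataset expects, and several model labels may collapse into a single dataset label. A hypothetical illustration of how such a map could be applied follows; it is not the leaderboard's actual evaluation code, and the label names are made up.

# Hypothetical label normalization map: dataset -> {model label -> dataset label}.
label_normalization_map = {
    "NCBI": {"disease": "condition", "disorder": "condition"},
    "BC5CDR": {"disease": "condition", "chemical": "drug"},
}

def normalize(dataset: str, predicted_label: str) -> str:
    """Map a model's predicted label to the dataset's expected label, if a mapping exists."""
    return label_normalization_map.get(dataset, {}).get(predicted_label, predicted_label)

print(normalize("BC5CDR", "chemical"))  # -> "drug"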
src/display/utils.py CHANGED

@@ -60,8 +60,9 @@ class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
     revision = ColumnContent("revision", "str", True)
     private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
+    architecture = ColumnContent("model_architecture", "bool", True)
+    # precision = ColumnContent("precision", "str", True)
+    # weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)


@@ -104,7 +105,7 @@ class ModelType(Enum):
 class ModelArch(Enum):
     Encoder = ModelDetails("Encoder")
     Decoder = ModelDetails("Decoder")
-
+    GLiNEREncoder = ModelDetails("GLiNER Encoder")
     Unknown = ModelDetails(name="Other", symbol="?")

     def to_str(self, separator=" "):

@@ -116,8 +117,8 @@ class ModelArch(Enum):
             return ModelArch.Encoder
         if "decoder" in type:
             return ModelArch.Decoder
-        if "[…]
-            return ModelArch.[…]
+        if "GLiNEREncoder" in type:
+            return ModelArch.GLiNEREncoder
         # if "unknown" in type:
         #     return ModelArch.Unknown
         return ModelArch.Unknown

@@ -154,6 +155,14 @@ class Precision(Enum):
         return Precision.Unknown


+class PromptTemplateName(Enum):
+    UniversalNERTemplate = "universal_ner"
+    LLMHTMLHighlightedSpansTemplate = "llm_html_highlighted_spans"
+    LLamaNERTemplate = "llama_70B_ner_v0.3"
+    MixtralNERTemplate = "mixtral_ner_v0.3.jinja"
+
+
+
 # Column selection
 DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.clinical_type_col]
 Clinical_TYPES_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col]
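Note: the new `PromptTemplateName` enum is consumed in two places in this commit: app.py builds the prompt dropdown from its values, and submit.py validates the submitted template name against them. A short standalone usage sketch follows; the enum is re-declared here only for illustration, since the Space imports it from src.display.utils.

from enum import Enum

# Standalone re-declaration for illustration only.
class PromptTemplateName(Enum):
    UniversalNERTemplate = "universal_ner"
    LLMHTMLHighlightedSpansTemplate = "llm_html_highlighted_spans"
    LLamaNERTemplate = "llama_70B_ner_v0.3"
    MixtralNERTemplate = "mixtral_ner_v0.3.jinja"

# How app.py builds the dropdown choices and how submit.py checks a submitted name.
choices = [template.value for template in PromptTemplateName]
assert "universal_ner" in choices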
src/submission/check_validity.py CHANGED

@@ -59,14 +59,24 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
     return False, "was not found on hub!", None


-def get_model_size(model_info: ModelInfo, precision: str):
+def get_model_size(model_info: ModelInfo, precision: str=None):
     """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
+    size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
+
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
     except (AttributeError, TypeError):
-        […]
-    […]
-    […]
+        try:
+            size_match = re.search(size_pattern, model_info.id.lower())
+            model_size = size_match.group(0)
+            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+        except AttributeError:
+            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+
+    if precision:
+        size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
+    else:
+        size_factor = 1
     model_size = size_factor * model_size
     return model_size

@@ -88,7 +98,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
             continue
         with open(os.path.join(root, file), "r") as f:
             info = json.load(f)
-            file_names.append(f"{info['model']}_{info['revision']}[…]
+            file_names.append(f"{info['model']}_{info['revision']}")

         # Select organisation
         if info["model"].count("/") == 0 or "submitted_time" not in info:
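Note: the reworked `get_model_size` falls back to parsing a parameter count out of the repository id when safetensors metadata is unavailable. Below is a standalone sketch of that fallback using a hypothetical repo id; the real function reads `model_info.id` and `model_info.safetensors`.

import re

# Matches strings like "7b", "1.5b", or "350m" anywhere in a repo id.
size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")

def size_from_repo_id(repo_id: str) -> float:
    match = re.search(size_pattern, repo_id.lower())
    if match is None:
        return 0  # unknown sizes are reported as 0
    raw = match.group(0)
    # "7b" -> 7.0 billion params, "350m" -> 0.35 billion params
    return round(float(raw[:-1]) if raw[-1] == "b" else float(raw[:-1]) / 1e3, 3)

print(size_from_repo_id("some-org/clinical-ner-7b"))    # 7.0
print(size_from_repo_id("some-org/clinical-ner-350m"))  # 0.35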
src/submission/submit.py CHANGED

@@ -1,5 +1,6 @@
 import json
 import os
+import ast
 from datetime import datetime, timezone

 from src.display.formatting import styled_error, styled_message, styled_warning

@@ -10,18 +11,57 @@ from src.submission.check_validity import (
     get_model_size,
     is_model_on_hub,
 )
+from src.display.utils import PromptTemplateName

 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None

+PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG = """{
+    "NCBI" : {
+        "" : "condition"
+        },
+    "CHIA" : {
+        "" : "condition"
+        "" : "drug"
+        "" : "procedure"
+        "" : "measurement"
+        },
+    "BIORED" : {
+        "" : "condition"
+        "" : "drug"
+        "" : "gene"
+        "" : "gene variant"
+        },
+    "BC5CDR" : {
+        "" : "condition"
+        "" : "drug"
+        }
+}
+
+"""
+
 def add_new_eval(
     model: str,
-    base_model: str,
+    # base_model: str,
     revision: str,
-    precision: str,
-    weight_type: str,
+    # precision: str,
+    # weight_type: str,
+    model_arch: str,
+    label_normalization_map: str,
+    gliner_threshold:str,
+    gliner_tokenizer_bool:str,
+    prompt_template_name:str,
     model_type: str,
 ):
+    """
+    Saves request if valid else returns the error.
+    Validity is checked based on -
+    - model's existence on hub
+    - necessary info on the model's card
+    - label normalization is a valid python dict and contains the keys for all datasets
+    - threshold for gliner is a valid float
+
+    """
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
     if not REQUESTED_MODELS:

@@ -33,26 +73,32 @@ def add_new_eval(
     user_name = model.split("/")[0]
     model_path = model.split("/")[1]

-    precision = precision.split(" ")[0]
+    # precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

     if model_type is None or model_type == "":
         return styled_error("Please select a model type.")
+
+    model_type = model_type.split(":")[-1].strip()

     # Does the model actually exist?
     if revision == "":
         revision = "main"

-    # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        […]
-        […]
-        […]
+    # # Is the model on the hub?
+    # if weight_type in ["Delta", "Adapter"]:
+    #     base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
+    #     if not base_model_on_hub:
+    #         return styled_error(f'Base model "{base_model}" {error}')

-    if not […]
+    if not model_arch == "GLiNER Encoder":
         model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
+    else:
+        if len(list(API.list_models(model_name=model))) !=1:
+            return styled_error(f'Model "{model}" does not exist on the hub!')
+

     # Is the model info correctly filled?
     try:

@@ -60,7 +106,7 @@ def add_new_eval(
     except Exception:
         return styled_error("Could not get your model information. Please fill it up properly.")

-    model_size = get_model_size(model_info=model_info[…]
+    model_size = get_model_size(model_info=model_info)

     # Were the model card and license filled?
     try:

@@ -72,15 +118,52 @@ def add_new_eval(
     if not modelcard_OK:
         return styled_error(error_msg)

+    # Verify the inference config now
+    try:
+        label_normalization_map = ast.literal_eval(label_normalization_map)
+    except Exception as e:
+        return styled_error("Please enter a valid json for the labe; normalization map")
+
+    inference_config = {
+        # "model_arch" : model_arch,
+        "label_normalization_map": label_normalization_map,
+    }
+
+    match model_arch:
+        case "Encoder":
+            pass
+        case "Decoder":
+            if not prompt_template_name in [prompt_template.value for prompt_template in PromptTemplateName]:
+                return styled_error("Prompt template name is invalid")
+            inference_config = {
+                **inference_config,
+                "prompt_template_name": prompt_template_name,
+            }
+        case "GLiNER Encoder":
+            try:
+                gliner_threshold = float(gliner_threshold)
+                gliner_tokenizer_bool = ast.literal_eval(gliner_tokenizer_bool)
+                inference_config = {
+                    **inference_config,
+                    "gliner_threshold": gliner_threshold,
+                    "gliner_tokenizer_bool" : gliner_tokenizer_bool
+                }
+            except Exception as e:
+                return styled_error("Please enter a valid float for the threshold")
+        case _:
+            return styled_error("Model Architecture is invalid")
+
     # Seems good, creating the eval
     print("Adding new eval")

+
     eval_entry = {
         "model": model,
-        "base_model": base_model,
+        # "base_model": base_model,
         "revision": revision,
-        "precision": precision,
-        "weight_type": weight_type,
+        # "precision": precision,
+        # "weight_type": weight_type,
+        "model_architecture": model_arch,
         "status": "PENDING",
         "submitted_time": current_time,
         "model_type": model_type,

@@ -88,16 +171,18 @@ def add_new_eval(
         "params": model_size,
         "license": license,
         "private": False,
+        "inference_config":inference_config,
     }

     # Check for duplicate submission
-    […]
-    […]
+
+    if f"{model}_{revision}" in REQUESTED_MODELS:
+        return styled_warning("This model has been already submitted. Add the revision if the model has been updated.")

     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}[…]
+    out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request.json"

     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))