Spaces:
Running
Running
Andrea Seveso
committed on
Commit
·
f728b4f
1
Parent(s):
539e451
Colored dataframe
Browse files- app.py +6 -3
- src/about.py +11 -0
app.py
CHANGED
@@ -12,6 +12,8 @@ from src.about import (
|
|
12 |
LLM_BENCHMARKS_TEXT,
|
13 |
TITLE,
|
14 |
EVALUATION_QUEUE_TEXT,
|
|
|
|
|
15 |
)
|
16 |
from src.display.css_html_js import custom_css
|
17 |
from src.display.utils import (
|
@@ -129,11 +131,13 @@ def filter_models(
|
|
129 |
|
130 |
def get_macro_area_data():
|
131 |
dataset = pd.read_csv("src/macro_area.csv", sep=',')
|
|
|
132 |
return dataset
|
133 |
|
134 |
|
135 |
def get_question_format_data():
|
136 |
dataset = pd.read_csv("src/question_format.csv", sep=',')
|
|
|
137 |
return dataset
|
138 |
|
139 |
|
@@ -233,13 +237,12 @@ with demo:
|
|
233 |
with gr.TabItem('In Depth Evaluation'):
|
234 |
|
235 |
gr.Markdown('''# In Depth Evaluation''')
|
236 |
-
gr.Markdown(
|
237 |
gr.Dataframe(get_question_format_data())
|
238 |
|
239 |
with gr.TabItem('Evaluation by Macro Area'):
|
240 |
gr.Markdown('''# Macro Area evaluation''')
|
241 |
-
gr.Markdown(
|
242 |
-
'''This table shows the evaluation of the models by macro area. The evaluation is based on the following metrics:''')
|
243 |
gr.Dataframe(get_macro_area_data())
|
244 |
|
245 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
|
|
12 |
LLM_BENCHMARKS_TEXT,
|
13 |
TITLE,
|
14 |
EVALUATION_QUEUE_TEXT,
|
15 |
+
QUESTION_FORMAT_TEXT,
|
16 |
+
MACRO_AREA_TEXT,
|
17 |
)
|
18 |
from src.display.css_html_js import custom_css
|
19 |
from src.display.utils import (
|
|
|
131 |
|
132 |
def get_macro_area_data():
|
133 |
dataset = pd.read_csv("src/macro_area.csv", sep=',')
|
134 |
+
dataset = dataset.style.highlight_max(color='lightgreen', axis=0)
|
135 |
return dataset
|
136 |
|
137 |
|
138 |
def get_question_format_data():
|
139 |
dataset = pd.read_csv("src/question_format.csv", sep=',')
|
140 |
+
dataset = dataset.style.highlight_max(color='lightgreen', axis=0)
|
141 |
return dataset
|
142 |
|
143 |
|
|
|
237 |
with gr.TabItem('In Depth Evaluation'):
|
238 |
|
239 |
gr.Markdown('''# In Depth Evaluation''')
|
240 |
+
gr.Markdown(QUESTION_FORMAT_TEXT)
|
241 |
gr.Dataframe(get_question_format_data())
|
242 |
|
243 |
with gr.TabItem('Evaluation by Macro Area'):
|
244 |
gr.Markdown('''# Macro Area evaluation''')
|
245 |
+
gr.Markdown(MACRO_AREA_TEXT)
|
|
|
246 |
gr.Dataframe(get_macro_area_data())
|
247 |
|
248 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
src/about.py
CHANGED
@@ -28,8 +28,11 @@ TITLE = """<h1 align="center" id="space-title">👩🏫Invalsi Leaderboard
|
|
28 |
# What does your leaderboard evaluate?
|
29 |
INTRODUCTION_TEXT = """
|
30 |
Welcome into <a href="https://crispresearch.it/"><b>CRISP Bicocca</b></a> Invalsi Leaderboard page!
|
|
|
31 |
We adapted the INVALSI benchmark for automated LLM evaluation, which involves rigorous adaptation of the test format to suit automated processing while retaining the essence of the original tests. In this leaderboard, we provide a detailed assessment of current LLMs, offering a crucial reference point for the academic community.
|
|
|
32 |
Researchers are invited to submit their models for ongoing evaluation, ensuring the benchmark remains a current and valuable resource.
|
|
|
33 |
For more information on the benchmark, please refer to our arXiv paper <a href="https://arxiv.org/abs/SOON"><b>here</b></a> and read the "About" section.
|
34 |
"""
|
35 |
|
@@ -244,3 +247,11 @@ This is a leaderboard for Open LLMs, and we'd love for as many people as possibl
|
|
244 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
245 |
CITATION_BUTTON_TEXT = r"""
|
246 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
# What does your leaderboard evaluate?
|
29 |
INTRODUCTION_TEXT = """
|
30 |
Welcome into <a href="https://crispresearch.it/"><b>CRISP Bicocca</b></a> Invalsi Leaderboard page!
|
31 |
+
|
32 |
We adapted the INVALSI benchmark for automated LLM evaluation, which involves rigorous adaptation of the test format to suit automated processing while retaining the essence of the original tests. In this leaderboard, we provide a detailed assessment of current LLMs, offering a crucial reference point for the academic community.
|
33 |
+
|
34 |
Researchers are invited to submit their models for ongoing evaluation, ensuring the benchmark remains a current and valuable resource.
|
35 |
+
|
36 |
For more information on the benchmark, please refer to our arXiv paper <a href="https://arxiv.org/abs/SOON"><b>here</b></a> and read the "About" section.
|
37 |
"""
|
38 |
|
|
|
247 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
248 |
CITATION_BUTTON_TEXT = r"""
|
249 |
"""
|
250 |
+
|
251 |
+
# Markdown shown above the question-format results table.
QUESTION_FORMAT_TEXT = """
Question Format evaluation
"""

# Markdown shown above the macro-area results table.
# The opening delimiter is exactly three quotes: a fourth quote would become
# a literal '"' at the start of the rendered text.
MACRO_AREA_TEXT = """
This table shows the evaluation of the models by macro area. The evaluation is based on the following metrics:
"""
|