Leaderboard / app.py
chiayewken's picture
Update en results (claude-3.5-sonnet)
8b2adbb
raw
history blame
10.1 kB
import gradio as gr
import pandas as pd
import os
from huggingface_hub import snapshot_download, login
from apscheduler.schedulers.background import BackgroundScheduler
from src.display.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
CONTACT_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
SUB_TITLE,
)
from src.display.css_html_js import custom_css
from src.envs import API
from src.leaderboard.load_results import load_data
def restart_space() -> None:
    """Ask the shared ``API`` client to restart the "Auto-Arena/Leaderboard" Space.

    Takes no arguments so it can be handed directly to a scheduler as a job
    callable.
    """
    space_repo_id = "Auto-Arena/Leaderboard"
    API.restart_space(repo_id=space_repo_id)
# Result snapshots bundled with the app: one CSV per leaderboard language.
csv_path = "./src/results/auto-arena-llms-results-20240624.csv"
csv_path_chinese = "./src/results/auto-arena-llms-results-chinese-20240531.csv"

# Loaded once at import time; the UI below only ever filters these frames.
df_results = load_data(csv_path)
df_results_chinese = load_data(csv_path_chinese)

# Columns known to the leaderboard and the subset actually displayed
# (currently identical, kept as two independent lists).
all_columns = ['Rank', 'Model', 'From', 'Open?', 'Params(B)', 'Cost', 'Score']
show_columns = list(all_columns)

# Display datatype for each entry of `show_columns`, in the same order;
# passed as `datatype` to the visible Dataframe components.
TYPES = ['number', 'markdown', 'str', 'str', 'str', 'str', 'number']

# Unfiltered copies restricted to the displayed columns; these back the hidden
# tables that every search/filter request starts from.
df_results_init = df_results.copy()[show_columns]
df_results_chinese_init = df_results_chinese.copy()[show_columns]
def update_table(
    hidden_df: pd.DataFrame,
    open_query: list,
    query: str,
) -> pd.DataFrame:
    """Recompute the visible leaderboard from the unfiltered table.

    Args:
        hidden_df: Full, unfiltered leaderboard; it is copied, never mutated.
        open_query: Selected checkbox values, each ``'open'`` or ``'closed'``.
        query: Search-box text; multiple terms are separated by ``;``.

    Returns:
        The rows whose ``'Open?'`` value matches the selection and that pass
        ``filter_queries``, with exact duplicate rows removed and columns
        restricted to ``show_columns``.

    Raises:
        KeyError: If ``open_query`` contains anything other than ``'open'``
            or ``'closed'``.
    """
    working = hidden_df.copy()

    # The checkbox labels differ from the values stored in the 'Open?' column.
    open_labels = {'open': 'Yes', 'closed': 'No'}
    wanted = [open_labels[choice] for choice in open_query]
    working = working[working['Open?'].isin(wanted)]

    working = filter_queries(query, working)

    # Several search terms can match the same row, so de-duplicate before
    # projecting onto the displayed columns.
    deduplicated = working.drop_duplicates()
    return deduplicated[show_columns]
def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``'Model'`` value contains ``query``.

    The match is a case-insensitive *literal* substring test. The query comes
    straight from the search box, so it must not be interpreted as a regular
    expression: text such as ``(`` or ``c++`` would otherwise raise
    ``re.error``.

    Args:
        df: Table with a string ``'Model'`` column.
        query: Text to look for.

    Returns:
        The matching rows, with their original index and column order. Rows
        whose ``'Model'`` value is missing never match.
    """
    # na=False keeps the mask strictly boolean when a model name is missing;
    # a NaN in the mask would make the boolean indexing below fail.
    matches = df['Model'].str.contains(query, case=False, regex=False, na=False)
    return df[matches]
def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
    """Keep the rows of ``filtered_df`` that match any ``;``-separated term.

    Each non-blank term is looked up with ``search_table``; the per-term
    results are concatenated in term order, so a row matched by several terms
    appears more than once.

    Args:
        query: Raw search text, e.g. ``"gpt; claude"``.
        filtered_df: Table to search.

    Returns:
        The concatenated matches. When the query is empty, contains only
        blank terms, or no term matches anything, ``filtered_df`` is returned
        unchanged.
    """
    per_term_hits = []
    for term in (piece.strip() for piece in query.split(";")):
        if term == "":
            continue
        hits = search_table(filtered_df, term)
        if len(hits) > 0:
            per_term_hits.append(hits)

    if not per_term_hits:
        return filtered_df
    return pd.concat(per_term_hits)
# NOTE(review): the nesting below is reconstructed from the statement order;
# confirm the layout (rows/columns, position of the contact text) visually.

# Placeholder shown in every search box.
_SEARCH_PLACEHOLDER = (
    " 🔍 Search for models you are interested in "
    "(separate multiple models with `;`) and press ENTER..."
)


def _build_leaderboard_tab(
    tab_label: str,
    tab_id: int,
    shown_df: pd.DataFrame,
    source_df: pd.DataFrame,
) -> None:
    """Add one leaderboard tab to the enclosing ``gr.Tabs`` context.

    The tab holds a search box, an open/closed checkbox group, the visible
    results table and a hidden table with the unfiltered data. Submitting the
    search box or changing the checkboxes calls ``update_table`` on the hidden
    table and writes the result to the visible one.

    Args:
        tab_label: Title of the tab.
        tab_id: Identifier of the tab within the tab group.
        shown_df: Data initially displayed in the visible table.
        source_df: Unfiltered data that every filter request starts from.
    """
    with gr.TabItem(tab_label, elem_id="llm-benchmark-Sum", id=tab_id):
        # meta-info
        with gr.Row():
            with gr.Column():
                search_bar = gr.Textbox(
                    placeholder=_SEARCH_PLACEHOLDER,
                    show_label=False,
                    elem_id="search-bar",
                )
            with gr.Column():
                open_query = gr.CheckboxGroup(
                    choices=["open", "closed"],
                    value=["open", "closed"],
                    label="open-source OR closed-source models?",
                    elem_id="open-select",
                    interactive=True,
                )

        leaderboard_table = gr.components.Dataframe(
            value=shown_df,
            datatype=TYPES,
            elem_id="leaderboard-table",
            interactive=False,
            visible=True,
        )
        gr.Markdown("The \"Cost\" column is calculated as USD / Million tokens of output.")

        # Never displayed: it only carries the unfiltered rows to update_table
        # so that filters are always applied to the complete data.
        hidden_leaderboard_table_for_search = gr.components.Dataframe(
            value=source_df,
            interactive=False,
            visible=False,
        )

        # Both triggers recompute the table from the same three inputs.
        search_bar.submit(
            update_table,
            [hidden_leaderboard_table_for_search, open_query, search_bar],
            leaderboard_table,
        )
        open_query.change(
            update_table,
            [hidden_leaderboard_table_for_search, open_query, search_bar],
            leaderboard_table,
        )


demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.HTML(SUB_TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # the first tab
        _build_leaderboard_tab("English", 0, df_results, df_results_init)
        _build_leaderboard_tab("Chinese", 1, df_results_chinese, df_results_chinese_init)

        # Disabled "About" tab:
        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

    # Disabled citation box:
    # with gr.Row():
    #     with gr.Accordion("📙 Citation", open=False):
    #         citation_button = gr.Textbox(
    #             value=CITATION_BUTTON_TEXT,
    #             label=CITATION_BUTTON_LABEL,
    #             lines=20,
    #             elem_id="citation-button",
    #             show_copy_button=True,
    #         )

    gr.Markdown(CONTACT_TEXT, elem_classes="markdown-text")
# Start the periodic restart job (every 1800 seconds) *before* launching.
# When run as a script, launch() blocks the main thread while the server is
# up, so anything placed after it never executes. The app is therefore
# launched exactly once, as the final statement, with the request queue
# configured.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch(share=True)