# FinMTEB Benchmark Leaderboard — Gradio Space application.
from collections import defaultdict

import gradio as gr
import pandas as pd
def parse_excel(file_path):
    """Read the benchmark workbook into lookup tables.

    Sheets are named ``<task>_<lang>`` with lang in {'en', 'zh'}; the first
    column holds model names, an optional 'URL' column holds model links, and
    every remaining column is a dataset score.

    Returns a tuple of:
      * task_data: {task: {lang: {model: {dataset: score}}}}
      * sorted list of all model names
      * {task: {(dataset, lang), ...}}
      * {model: url}
    """
    workbook = pd.ExcelFile(file_path)
    task_data = defaultdict(lambda: defaultdict(dict))
    model_names = set()
    datasets_by_task = defaultdict(set)
    model_urls = {}  # model name -> homepage URL
    for sheet in workbook.sheet_names:
        if '_' not in sheet:
            continue  # not a <task>_<lang> sheet
        task, lang = sheet.rsplit('_', 1)
        if lang not in ['en', 'zh']:
            continue
        frame = workbook.parse(sheet)
        has_url = 'URL' in frame.columns
        url_column = frame['URL'].tolist() if has_url else [None] * len(frame)
        names = frame.iloc[:, 0].tolist()
        if has_url:
            score_cols = [c for c in frame.columns[1:] if c != 'URL']
        else:
            score_cols = frame.columns[1:].tolist()
        # Remember a URL for every model row that provides a non-empty one.
        for name, url in zip(names, url_column):
            if url and pd.notnull(url):
                model_urls[name] = url
        model_names.update(names)
        datasets_by_task[task].update([(c, lang) for c in score_cols])
        # One {dataset: score} dict per model for this task/language pair.
        for _, record in frame.iterrows():
            values = record[score_cols].tolist() if score_cols else []
            task_data[task][lang][record.iloc[0]] = dict(zip(score_cols, values))
    return task_data, sorted(model_names), dict(datasets_by_task), model_urls
def calculate_averages(task_data, all_models):
    """Compute per-language and per-task/per-language mean scores.

    Args:
        task_data: {task: {lang: {model: {dataset: score}}}} as produced by
            ``parse_excel``.
        all_models: iterable of model names to include.

    Returns:
        (overall, per_task) where overall is {lang: {model: mean score}}
        and per_task is {task: {lang: {model: mean score}}}. Models with no
        valid scores average to 0.0.
    """
    lang_overall_avg = defaultdict(lambda: defaultdict(list))
    task_lang_avg = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for task, langs in task_data.items():
        for lang, models in langs.items():
            for model in all_models:
                if model in models:
                    # BUG FIX: drop missing (NaN) cells — a single empty Excel
                    # cell previously turned the whole average into NaN.
                    scores = [s for s in models[model].values() if pd.notnull(s)]
                    lang_overall_avg[lang][model].extend(scores)
                    task_lang_avg[task][lang][model].extend(scores)
    overall = {
        lang: {
            model: sum(scores)/len(scores) if scores else 0.0
            for model, scores in models.items()
        }
        for lang, models in lang_overall_avg.items()
    }
    processed_task_avg = defaultdict(dict)
    for task, langs in task_lang_avg.items():
        for lang, models in langs.items():
            processed_task_avg[task][lang] = {
                model: sum(scores)/len(scores) if scores else 0.0
                for model, scores in models.items()
            }
    return overall, processed_task_avg
def filter_models(search_term, models=None):
    """Case-insensitively filter model names by substring.

    Args:
        search_term: text typed in the search box; falsy means "no filter".
        models: optional iterable of names to search; defaults to the
            module-level ``all_models`` (kept for backward compatibility).

    Returns:
        A new list of matching names (never the shared global list itself,
        so callers cannot mutate module state by accident).
    """
    pool = all_models if models is None else models
    if not search_term:
        return list(pool)
    term = search_term.lower()
    return [m for m in pool if term in m.lower()]
def create_lang_view(lang, models):
    """Build the overview DataFrame for one language ('en' or 'zh').

    Reads module globals ``model_urls``, ``overall_avg`` and ``task_avg``.
    All-zero rows (models with no data for this language) are hidden and the
    result is sorted by the overall average, descending. Returns a one-cell
    "Status" frame when nothing matches.
    """
    # Render each model as an HTML link when its URL is known.
    model_links = [
        f'<a href="{model_urls.get(m, "#")}" target="_blank">{m}</a>'
        if model_urls.get(m) else m
        for m in models
    ]
    # BUG FIX: overall_avg is a plain dict, so overall_avg[lang] raised
    # KeyError when the workbook had no sheets for this language; use .get().
    lang_avg = overall_avg.get(lang, {})
    df_data = {
        "Model": model_links,
        f"Overall ({lang.upper()})": [
            round(lang_avg.get(m, 0), 3)
            for m in models
        ]
    }
    # One column per task, in stable alphabetical order.
    for task in sorted(task_avg.keys()):
        task_scores = []
        for m in models:
            score = task_avg[task].get(lang, {}).get(m, 0)
            task_scores.append(round(score, 3))
        df_data[task] = task_scores
    df = pd.DataFrame(df_data)
    if not df.empty:
        # Drop models whose every numeric cell is 0 (no data), then rank.
        numeric_cols = df.columns[df.columns != "Model"]
        df = df[~(df[numeric_cols] == 0).all(axis=1)]
        df = df.sort_values(by=f"Overall ({lang.upper()})", ascending=False)
        df.reset_index(drop=True, inplace=True)
    return df if not df.empty else pd.DataFrame({"Status": [f"No {lang.upper()} data matching criteria..."]})
def create_overall_view(search_term=None):
    """Return the (English, Chinese) overview tables for a search term."""
    visible = filter_models(search_term)
    return create_lang_view('en', visible), create_lang_view('zh', visible)
def create_task_view(task_name, search_term=None):
    """Build [english_df, chinese_df] leaderboards for one task tab.

    Reads module globals ``task_data`` and ``model_urls``. For each language
    a table of per-dataset scores plus an "Avg." column is produced; models
    with no data collapse to all-zero rows and are filtered out. A one-cell
    "Status" frame is returned for a language with no data at all.
    """
    task_langs = task_data.get(task_name, {})
    dfs = []
    filtered_models = filter_models(search_term)
    model_links = [
        f'<a href="{model_urls.get(m, "#")}" target="_blank">{m}</a>'
        if model_urls.get(m) else m
        for m in filtered_models
    ]
    for lang in ['en', 'zh']:
        lang_data = task_langs.get(lang, {})
        if lang_data:
            # All models in a sheet share the same dataset columns, so the
            # first model's keys define the column order.
            models_in_lang = list(lang_data.keys())
            datasets = sorted(lang_data[models_in_lang[0]].keys()) if models_in_lang else []
            # PERF FIX: collect plain dict rows and build the DataFrame once,
            # instead of pd.concat inside the loop (quadratic copying).
            rows = []
            for link, model in zip(model_links, filtered_models):
                row_data = {"Model": link}
                if model in lang_data:
                    scores = []
                    for ds in datasets:
                        score = lang_data[model].get(ds, 0.0)
                        row_data[ds] = round(score, 3)
                        scores.append(score)
                    # Average over unrounded scores, rounded only for display.
                    row_data["Avg."] = round(sum(scores)/len(scores) if scores else 0.0, 3)
                else:
                    row_data.update({ds: 0.0 for ds in datasets})
                    row_data["Avg."] = 0.0
                rows.append(row_data)
            df = pd.DataFrame(rows, columns=["Model", "Avg."] + datasets)
            numeric_cols = df.columns[df.columns != "Model"]
            if not numeric_cols.empty:
                # Hide all-zero rows, then rank by the per-task average.
                df = df[~(df[numeric_cols] == 0).all(axis=1)]
                df = df.sort_values(by="Avg.", ascending=False)
                df.reset_index(drop=True, inplace=True)
        else:
            df = pd.DataFrame({"Status": ["There is no data for this language.."]})
        dfs.append(df)
    return dfs
# Load the benchmark workbook once at import time. These module-level
# globals are read by the view functions above and by the UI below.
task_data, all_models, all_datasets, model_urls = parse_excel('benchmark.xlsx')
overall_avg, task_avg = calculate_averages(task_data, all_models)
# ---------------------------------------------------------------------------
# Gradio UI: an overview tab, one tab per task, and a submission-info tab.
# NOTE(review): several Markdown strings below contain mojibake (e.g. "π°",
# "π¬"), apparently emoji corrupted by an encoding round-trip — confirm the
# intended characters against the original file before changing them.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Benchmark Leaderboard", css=""".search-box {margin-bottom: 20px}
.gradio-container {max-width: 100% !important}
.dataframe {width: 100% !important}""") as demo:
    gr.Markdown("# π° FinMTEB Benchmark Leaderboard")
    gr.Markdown("**Finance** Massive Text Embedding Benchmark (FinMTEB), an embedding benchmark consists of 64 financial domain-specific text datasets, across English and Chinese, spanning seven different tasks.")
    gr.Markdown("---")
    gr.Markdown("π If you feel our work helpful, please cite the following paper: [FinMTEB: Finance Massive Text Embedding Benchmark](https://arxiv.org/abs/2502.10990)")
    gr.Markdown("Github: [FinMTEB](https://github.com/yixuantt/FinMTEB/blob/main/README.md)")
    # Single search box shared by the overview tab and every task tab.
    search = gr.Textbox(
        placeholder="π Enter the model name...",
        label="model_search",
        show_label=False,
        elem_classes=["search-box"]
    )
    with gr.Tabs() as main_tabs:
        with gr.Tab("π Overview"):
            with gr.Column(elem_classes=["lang-section"]):
                gr.Markdown("### English Datasets")
                en_table = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])
            with gr.Column(elem_classes=["lang-section"]):
                gr.Markdown("### Chinese Datasets")
                zh_table = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])
            # Re-filter both overview tables as the user types.
            search.change(
                create_overall_view,
                inputs=search,
                outputs=[en_table, zh_table]
            )
            # Populate the tables once on initial page load.
            demo.load(
                lambda: create_overall_view(),
                outputs=[en_table, zh_table]
            )
        # One tab per task found in the workbook.
        for task_name in task_data:
            with gr.Tab(task_name):
                with gr.Column():
                    gr.Markdown("### English Datasets")
                    en_display = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])
                with gr.Column():
                    gr.Markdown("### Chinese Datasets")
                    zh_display = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])
                # tn=task_name binds the loop variable at definition time,
                # avoiding the late-binding-closure pitfall.
                search.change(
                    lambda term, tn=task_name: create_task_view(tn, term),
                    inputs=search,
                    outputs=[en_display, zh_display]
                )
                demo.load(
                    lambda tn=task_name: create_task_view(tn),
                    outputs=[en_display, zh_display]
                )
        with gr.Tab("π¬ Submit"):
            gr.Markdown("---")
            gr.Markdown("For the results report, please send the results to **[email protected]**")
            gr.Markdown("π Thanks for your contribution!")

if __name__ == "__main__":
    demo.launch()