# FinMTEB leaderboard app (app.py)
# Author: yixuantt — commit 7d636df
import pandas as pd
import gradio as gr
from collections import defaultdict
def parse_excel(file_path):
xls = pd.ExcelFile(file_path)
task_data = defaultdict(lambda: defaultdict(dict))
all_models = set()
all_datasets = defaultdict(set)
model_urls = {} # ε­˜ε‚¨ζ¨‘εž‹URL
for sheet_name in xls.sheet_names:
if '_' not in sheet_name:
continue
task_name, lang = sheet_name.rsplit('_', 1)
if lang not in ['en', 'zh']:
continue
df = xls.parse(sheet_name)
has_url = 'URL' in df.columns
urls = df['URL'].tolist() if has_url else [None] * len(df)
models = df.iloc[:, 0].tolist()
datasets = [col for col in df.columns[1:] if col != 'URL'] if has_url else df.columns[1:].tolist()
for model, url in zip(models, urls):
if url and pd.notnull(url):
model_urls[model] = url
all_models.update(models)
all_datasets[task_name].update([(d, lang) for d in datasets])
for idx, row in df.iterrows():
model = row.iloc[0]
scores = row[datasets].tolist() if datasets else []
task_data[task_name][lang][model] = dict(zip(datasets, scores))
return task_data, sorted(all_models), dict(all_datasets), model_urls
def calculate_averages(task_data, all_models):
    """Compute per-language overall averages and per-task averages.

    Args:
        task_data: mapping task -> lang -> model -> {dataset: score}.
        all_models: iterable of model names to include; models found in
            task_data but absent from this list are ignored.

    Returns:
        ``(overall, task_avg)`` where ``overall[lang][model]`` is the mean of
        all of the model's scores in that language, and
        ``task_avg[task][lang][model]`` is the mean within a single task.
    """
    # O(1) membership instead of a list scan per (task, lang, model).
    wanted = set(all_models)
    lang_overall_avg = defaultdict(lambda: defaultdict(list))
    task_lang_avg = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for task, langs in task_data.items():
        for lang, models in langs.items():
            # Iterate the models actually present rather than all_models.
            for model, dataset_scores in models.items():
                if model not in wanted:
                    continue
                scores = list(dataset_scores.values())
                lang_overall_avg[lang][model].extend(scores)
                task_lang_avg[task][lang][model].extend(scores)
    overall = {
        lang: {
            model: sum(scores) / len(scores) if scores else 0.0
            for model, scores in models.items()
        }
        for lang, models in lang_overall_avg.items()
    }
    processed_task_avg = defaultdict(dict)
    for task, langs in task_lang_avg.items():
        for lang, models in langs.items():
            processed_task_avg[task][lang] = {
                model: sum(scores) / len(scores) if scores else 0.0
                for model, scores in models.items()
            }
    return overall, processed_task_avg
def filter_models(search_term, models=None):
    """Return the models whose name contains search_term (case-insensitive).

    Args:
        search_term: substring to look for; a falsy value disables filtering.
        models: candidate model names; defaults to the module-level
            ``all_models`` (backward-compatible generalization).

    Returns:
        The full candidate list when search_term is falsy, otherwise a new
        list of matching names in their original order.
    """
    candidates = all_models if models is None else models
    if not search_term:
        return candidates
    needle = search_term.lower()  # hoisted out of the loop
    return [m for m in candidates if needle in m.lower()]
def create_lang_view(lang, models, urls=None, overall=None, per_task=None):
    """Build the overview leaderboard DataFrame for one language.

    Args:
        lang: language code ('en' or 'zh').
        models: model names to include as rows.
        urls: model -> URL map; defaults to the module-level ``model_urls``.
        overall: lang -> model -> overall average; defaults to ``overall_avg``.
        per_task: task -> lang -> model -> average; defaults to ``task_avg``.

    Returns:
        A DataFrame sorted by the language's overall score with all-zero rows
        dropped, or a one-cell "Status" frame when nothing matches.
    """
    urls = model_urls if urls is None else urls
    overall = overall_avg if overall is None else overall
    per_task = task_avg if per_task is None else per_task
    # Render each model name as a link when a URL is known.
    model_links = [
        f'<a href="{urls.get(m, "#")}" target="_blank">{m}</a>'
        if urls.get(m) else m
        for m in models
    ]
    # .get(lang, {}) guards against a language with no parsed sheets
    # (the original overall_avg[lang] raised KeyError in that case).
    lang_overall = overall.get(lang, {})
    df_data = {
        "Model": model_links,
        f"Overall ({lang.upper()})": [
            round(lang_overall.get(m, 0), 3) for m in models
        ],
    }
    for task in sorted(per_task.keys()):
        task_lang = per_task[task].get(lang, {})
        df_data[task] = [round(task_lang.get(m, 0), 3) for m in models]
    df = pd.DataFrame(df_data)
    if not df.empty:
        numeric_cols = df.columns[df.columns != "Model"]
        # Hide models with no data at all (every numeric cell zero),
        # then rank by the overall score.
        df = df[~(df[numeric_cols] == 0).all(axis=1)]
        df = df.sort_values(by=f"Overall ({lang.upper()})", ascending=False)
        df.reset_index(drop=True, inplace=True)
    return df if not df.empty else pd.DataFrame({"Status": [f"No {lang.upper()} data matching criteria..."]})
def create_overall_view(search_term=None):
    """Return the (English, Chinese) overview tables for the given filter."""
    models = filter_models(search_term)
    return create_lang_view('en', models), create_lang_view('zh', models)
def create_task_view(task_name, search_term=None):
    """Build the per-task (English, Chinese) score tables.

    Args:
        task_name: key into the module-level ``task_data`` mapping.
        search_term: optional substring filter applied to model names.

    Returns:
        ``[en_df, zh_df]``; a language with no dataset columns yields a
        one-cell "Status" frame instead.
    """
    task_langs = task_data.get(task_name, {})
    filtered_models = filter_models(search_term)
    model_links = [
        f'<a href="{model_urls.get(m, "#")}" target="_blank">{m}</a>'
        if model_urls.get(m) else m
        for m in filtered_models
    ]
    dfs = []
    for lang in ['en', 'zh']:
        lang_data = task_langs.get(lang, {})
        datasets = []
        if lang_data:
            # Column set is taken from the first model's score dict;
            # assumes every model in a sheet shares the same datasets.
            first_model = next(iter(lang_data))
            datasets = sorted(lang_data[first_model].keys())
        if not datasets:
            dfs.append(pd.DataFrame({"Status": ["There is no data for this language.."]}))
            continue
        # Collect all rows first: one DataFrame construction instead of a
        # quadratic pd.concat inside the loop.
        rows = []
        for link, model in zip(model_links, filtered_models):
            row_data = {"Model": link}
            if model in lang_data:
                scores = [lang_data[model].get(ds, 0.0) for ds in datasets]
                row_data.update({ds: round(s, 3) for ds, s in zip(datasets, scores)})
                # Average is computed from the unrounded scores.
                row_data["Avg."] = round(sum(scores) / len(scores) if scores else 0.0, 3)
            else:
                row_data.update({ds: 0.0 for ds in datasets})
                row_data["Avg."] = 0.0
            rows.append(row_data)
        # Explicit columns keep the frame well-formed even with zero rows.
        df = pd.DataFrame(rows, columns=["Model", "Avg."] + datasets)
        numeric_cols = df.columns[df.columns != "Model"]
        if not numeric_cols.empty:
            # Hide models with no scores at all, then rank by task average.
            df = df[~(df[numeric_cols] == 0).all(axis=1)]
            df = df.sort_values(by="Avg.", ascending=False)
            df.reset_index(drop=True, inplace=True)
        dfs.append(df)
    return dfs
# Load the benchmark workbook once at import time; these module-level globals
# back every view function above.
task_data, all_models, all_datasets, model_urls = parse_excel('benchmark.xlsx')
overall_avg, task_avg = calculate_averages(task_data, all_models)
# --- Gradio UI: one shared search box, an Overview tab, one tab per task ---
with gr.Blocks(title="Benchmark Leaderboard", css=""".search-box {margin-bottom: 20px}
.gradio-container {max-width: 100% !important}
.dataframe {width: 100% !important}""") as demo:
    gr.Markdown("# πŸ’° FinMTEB Benchmark Leaderboard")
    gr.Markdown("**Finance** Massive Text Embedding Benchmark (FinMTEB), an embedding benchmark consists of 64 financial domain-specific text datasets, across English and Chinese, spanning seven different tasks.")
    gr.Markdown("---")
    gr.Markdown("πŸ“– If you feel our work helpful, please cite the following paper: [FinMTEB: Finance Massive Text Embedding Benchmark](https://arxiv.org/abs/2502.10990)")
    gr.Markdown("Github: [FinMTEB](https://github.com/yixuantt/FinMTEB/blob/main/README.md)")
    # Single search box shared by the Overview tab and every task tab.
    search = gr.Textbox(
        placeholder="πŸ” Enter the model name...",
        label="model_search",
        show_label=False,
        elem_classes=["search-box"]
    )
    with gr.Tabs() as main_tabs:
        with gr.Tab("πŸ“Š Overview"):
            with gr.Column(elem_classes=["lang-section"]):
                gr.Markdown("### English Datasets")
                en_table = gr.DataFrame(interactive=False,datatype=["markdown", "markdown", "html"])
            with gr.Column(elem_classes=["lang-section"]):
                gr.Markdown("### Chinese Datasets")
                zh_table = gr.DataFrame(interactive=False,datatype=["markdown", "markdown", "html"])
            # Re-render both overview tables whenever the search text changes.
            search.change(
                create_overall_view,
                inputs=search,
                outputs=[en_table, zh_table]
            )
            # Initial population on page load (no search filter).
            demo.load(
                lambda: create_overall_view(),
                outputs=[en_table, zh_table]
            )
        # One tab per task discovered in the workbook.
        for task_name in task_data:
            with gr.Tab(task_name):
                with gr.Column():
                    gr.Markdown("### English Datasets")
                    en_display = gr.DataFrame(interactive=False,datatype=["markdown", "markdown", "html"])
                with gr.Column():
                    gr.Markdown("### Chinese Datasets")
                    zh_display = gr.DataFrame(interactive=False,datatype=["markdown", "markdown", "html"])
                # tn=task_name binds the current task at lambda-definition time,
                # avoiding the late-binding closure pitfall inside this loop.
                search.change(
                    lambda term, tn=task_name: create_task_view(tn, term),
                    inputs=search,
                    outputs=[en_display, zh_display]
                )
                demo.load(
                    lambda tn=task_name: create_task_view(tn),
                    outputs=[en_display, zh_display]
                )
        with gr.Tab("πŸ“¬ Submit"):
            gr.Markdown("---")
            gr.Markdown("For the results report, please send the results to **[email protected]**")
            gr.Markdown("😊 Thanks for your contribution!")

if __name__ == "__main__":
    demo.launch()