import pandas as pd
import gradio as gr
from collections import defaultdict


def parse_excel(file_path):
    """Parse benchmark sheets named "<task>_<lang>" into nested score dicts."""
    xls = pd.ExcelFile(file_path)
    task_data = defaultdict(lambda: defaultdict(dict))
    all_models = set()
    all_datasets = defaultdict(set)
    model_urls = {}  # store model URLs

    for sheet_name in xls.sheet_names:
        if '_' not in sheet_name:
            continue
        task_name, lang = sheet_name.rsplit('_', 1)
        if lang not in ['en', 'zh']:
            continue

        df = xls.parse(sheet_name)
        has_url = 'URL' in df.columns
        urls = df['URL'].tolist() if has_url else [None] * len(df)
        models = df.iloc[:, 0].tolist()
        datasets = [col for col in df.columns[1:] if col != 'URL'] if has_url else df.columns[1:].tolist()

        for model, url in zip(models, urls):
            if url and pd.notnull(url):
                model_urls[model] = url

        all_models.update(models)
        all_datasets[task_name].update([(d, lang) for d in datasets])

        for idx, row in df.iterrows():
            model = row.iloc[0]
            scores = row[datasets].tolist() if datasets else []
            task_data[task_name][lang][model] = dict(zip(datasets, scores))

    return task_data, sorted(all_models), dict(all_datasets), model_urls


def calculate_averages(task_data, all_models):
    """Compute per-language overall averages and per-task, per-language averages."""
    lang_overall_avg = defaultdict(lambda: defaultdict(list))
    task_lang_avg = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    for task, langs in task_data.items():
        for lang, models in langs.items():
            for model in all_models:
                if model in models:
                    scores = list(models[model].values())
                    lang_overall_avg[lang][model].extend(scores)
                    task_lang_avg[task][lang][model].extend(scores)

    overall = {
        lang: {
            model: sum(scores) / len(scores) if scores else 0.0
            for model, scores in models.items()
        }
        for lang, models in lang_overall_avg.items()
    }

    processed_task_avg = defaultdict(dict)
    for task, langs in task_lang_avg.items():
        for lang, models in langs.items():
            processed_task_avg[task][lang] = {
                model: sum(scores) / len(scores) if scores else 0.0
                for model, scores in models.items()
            }

    return overall, processed_task_avg


def filter_models(search_term):
    """Case-insensitive substring filter over the global model list."""
    if not search_term:
        return all_models
    return [m for m in all_models if search_term.lower() in m.lower()]


def create_lang_view(lang, models):
    """Build the overview table for one language: overall average plus per-task averages."""
    model_links = [
        f'<a href="{model_urls[m]}" target="_blank">{m}</a>' if model_urls.get(m) else m
        for m in models
    ]
    df_data = {
        "Model": model_links,
        f"Overall ({lang.upper()})": [
            round(overall_avg.get(lang, {}).get(m, 0), 3) for m in models
        ]
    }
    for task in sorted(task_avg.keys()):
        task_scores = []
        for m in models:
            score = task_avg[task].get(lang, {}).get(m, 0)
            task_scores.append(round(score, 3))
        df_data[task] = task_scores

    df = pd.DataFrame(df_data)
    if not df.empty:
        # Drop models with no scores in this language, then rank by overall average.
        numeric_cols = df.columns[df.columns != "Model"]
        df = df[~(df[numeric_cols] == 0).all(axis=1)]
        df = df.sort_values(by=f"Overall ({lang.upper()})", ascending=False)
        df.reset_index(drop=True, inplace=True)
    return df if not df.empty else pd.DataFrame({"Status": [f"No {lang.upper()} data matching the criteria..."]})


def create_overall_view(search_term=None):
    filtered_models = filter_models(search_term)
    en_df = create_lang_view('en', filtered_models)
    zh_df = create_lang_view('zh', filtered_models)
    return en_df, zh_df


def create_task_view(task_name, search_term=None):
    """Build per-dataset tables (one per language) for a single task."""
    task_langs = task_data.get(task_name, {})
    dfs = []
    filtered_models = filter_models(search_term)
    model_links = [
        f'<a href="{model_urls[m]}" target="_blank">{m}</a>' if model_urls.get(m) else m
        for m in filtered_models
    ]
    for lang in ['en', 'zh']:
        lang_data = task_langs.get(lang, {})
        datasets = []
        if lang_data:
            models_in_lang = list(lang_data.keys())
            if models_in_lang:
                datasets = sorted(lang_data[models_in_lang[0]].keys())
            df = pd.DataFrame(columns=["Model", "Avg."] + datasets)
            for i, model in enumerate(filtered_models):
                row_data = {"Model": model_links[i]}
                scores = []
                if model in lang_data:
                    for ds in datasets:
                        score = lang_data[model].get(ds, 0.0)
                        row_data[ds] = round(score, 3)
                        scores.append(score)
                    row_data["Avg."] = round(sum(scores) / len(scores) if scores else 0.0, 3)
                else:
                    row_data.update({ds: 0.0 for ds in datasets})
                    row_data["Avg."] = 0.0
                df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)
            if datasets:
                df = df[["Model", "Avg."] + datasets]
            numeric_cols = df.columns[df.columns != "Model"]
            if not numeric_cols.empty:
                # Hide models with no scores for this task/language combination.
                df = df[~(df[numeric_cols] == 0).all(axis=1)]
            df = df.sort_values(by="Avg.", ascending=False)
            df.reset_index(drop=True, inplace=True)
        else:
            df = pd.DataFrame({"Status": ["There is no data for this language."]})
        dfs.append(df)
    return dfs


task_data, all_models, all_datasets, model_urls = parse_excel('benchmark.xlsx')
overall_avg, task_avg = calculate_averages(task_data, all_models)

with gr.Blocks(
    title="Benchmark Leaderboard",
    css="""
    .search-box {margin-bottom: 20px}
    .gradio-container {max-width: 100% !important}
    .dataframe {width: 100% !important}
    """
) as demo:
    gr.Markdown("# 💰 FinMTEB Benchmark Leaderboard")
    gr.Markdown("The **Finance** Massive Text Embedding Benchmark (FinMTEB) is an embedding benchmark consisting of 64 financial domain-specific text datasets across English and Chinese, spanning seven different tasks.")
    gr.Markdown("---")
    gr.Markdown("📖 If you find our work helpful, please cite the following paper: [FinMTEB: Finance Massive Text Embedding Benchmark](https://arxiv.org/abs/2502.10990)")
    gr.Markdown("GitHub: [FinMTEB](https://github.com/yixuantt/FinMTEB/blob/main/README.md)")

    search = gr.Textbox(
        placeholder="🔍 Enter the model name...",
        label="model_search",
        show_label=False,
        elem_classes=["search-box"]
    )

    with gr.Tabs() as main_tabs:
        with gr.Tab("📊 Overview"):
            with gr.Column(elem_classes=["lang-section"]):
                gr.Markdown("### English Datasets")
                en_table = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])
            with gr.Column(elem_classes=["lang-section"]):
                gr.Markdown("### Chinese Datasets")
                zh_table = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])
            search.change(
                create_overall_view,
                inputs=search,
                outputs=[en_table, zh_table]
            )
            demo.load(
                lambda: create_overall_view(),
                outputs=[en_table, zh_table]
            )

        # One tab per task; lambda default arguments bind task_name at loop time.
        for task_name in task_data:
            with gr.Tab(task_name):
                with gr.Column():
                    gr.Markdown("### English Datasets")
                    en_display = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])
                with gr.Column():
                    gr.Markdown("### Chinese Datasets")
                    zh_display = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])
                search.change(
                    lambda term, tn=task_name: create_task_view(tn, term),
                    inputs=search,
                    outputs=[en_display, zh_display]
                )
                demo.load(
                    lambda tn=task_name: create_task_view(tn),
                    outputs=[en_display, zh_display]
                )

        with gr.Tab("📬 Submit"):
            gr.Markdown("---")
            gr.Markdown("To report results, please send them to **ytangch@connect.ust.hk**")
            gr.Markdown("😊 Thanks for your contribution!")

if __name__ == "__main__":
    demo.launch()
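

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the app, never called): parse_excel above
# assumes each sheet in benchmark.xlsx is named "<task>_<lang>" with lang
# "en" or "zh", that the first column holds model names, that an optional
# "URL" column holds model links, and that every remaining column holds the
# numeric scores for one dataset. The helper below writes a minimal file in
# that shape; the task, model, and dataset names are hypothetical
# placeholders, not real FinMTEB entries.
# ---------------------------------------------------------------------------
def _write_sample_benchmark(path='benchmark.xlsx'):
    sample = pd.DataFrame({
        'Model': ['model-a', 'model-b'],               # first column: model names
        'URL': ['https://example.com/model-a', None],  # optional link column
        'DatasetA': [0.71, 0.65],                      # one score column per dataset
        'DatasetB': [0.58, 0.62],
    })
    with pd.ExcelWriter(path) as writer:
        # Sheet names encode task and language as "<task>_<lang>".
        sample.to_excel(writer, sheet_name='Retrieval_en', index=False)
        sample.to_excel(writer, sheet_name='Retrieval_zh', index=False)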