|
import gradio as gr |
|
import pandas as pd |
|
import numpy as np |
|
|
|
|
|
# Robustness results; every list has one entry per model, aligned by position
# with "Model Name".
#
# "Baseline" holds plain scores. Each "(Δ)" column holds strings of the form
# "<score> (<change>)", where the change is written next to an arrow, or as
# "0.0" when no change is reported.
#
# NOTE(review): the header symbol and the arrows were mis-encoded in the
# previous revision (every marker had collapsed to the same garbage character).
# The direction of each arrow below was reconstructed from the sign of
# (score - baseline) for the same model: "↓" when the score is lower than the
# baseline, "↑" when it is higher. Confirm against the published table.
robustness_data = {
    "Model Name": [
        "Gemini 2.0 Flash Exp",
        "Gemini 1.5 Pro 002",
        "OpenAI GPT-4o",
        "OpenAI o1",
        "OpenAI o3-mini",
        "DeepSeek-R1-Distill-Llama-8B",
        "DeepSeek-R1-Distill-Qwen-14B",
        "DeepSeek-R1-Distill-Qwen-32B",
        "DeepSeek-R1-Distill-Llama-70B",
        "DeepSeek-R1",
        "Meta-Llama-3.1-8B-Instruct",
        "Meta-Llama-3.1-70B-Instruct",
        "Meta-Llama-3.3-70B-Instruct",
        "Qwen2.5-7B-Instruct",
        "Qwen2.5-14B-Instruct",
        "Qwen2.5-32B-Instruct",
        "Qwen2.5-72B-Instruct",
        "Qwen2.5-7B-Instruct-1M",
        "Qwen2.5-14B-Instruct-1M",
        "Nemotron-70B-Instruct-HF",
        "Phi-3-mini-128k-Instruct",
        "Phi-3-small-128k-Instruct",
        "Phi-3-medium-128k-Instruct",
        "Palmyra-Fin-128k-Instruct",
    ],
    "Baseline": [
        0.95, 0.96, 0.95, 0.97, 0.98, 0.83, 0.95, 0.95,
        0.96, 0.94, 0.91, 0.94, 0.95, 0.92, 0.95, 0.95,
        0.94, 0.91, 0.95, 0.94, 0.86, 0.88, 0.89, 0.96,
    ],
    "Misspelled (Δ)": [
        "0.95 (0.0)", "0.95 (0.0)", "0.94 (↓0.01)", "0.95 (↓0.02)",
        "0.96 (↓0.02)", "0.85 (↑0.02)", "0.90 (↓0.05)", "0.97 (↑0.02)",
        "0.97 (↑0.01)", "0.94 (0.0)", "0.90 (↓0.01)", "0.92 (↓0.02)",
        "0.92 (↓0.03)", "0.91 (↓0.01)", "0.94 (↓0.01)", "0.94 (0.0)",
        "0.94 (0.0)", "0.91 (0.0)", "0.92 (↓0.03)", "0.94 (0.0)",
        "0.85 (↓0.01)", "0.84 (↓0.04)", "0.84 (↓0.05)", "0.93 (↓0.03)",
    ],
    "Incomplete (Δ)": [
        "0.95 (0.0)", "0.94 (↓0.02)", "0.94 (↓0.01)", "0.94 (↓0.03)",
        "0.96 (↓0.02)", "0.82 (↓0.01)", "0.92 (↓0.03)", "0.95 (0.0)",
        "0.95 (↓0.01)", "0.93 (↓0.01)", "0.86 (↓0.05)", "0.94 (0.0)",
        "0.93 (↓0.02)", "0.90 (↓0.02)", "0.94 (↓0.01)", "0.93 (↓0.02)",
        "0.93 (↓0.01)", "0.91 (0.0)", "0.91 (↓0.04)", "0.93 (↓0.01)",
        "0.78 (↓0.08)", "0.78 (↓0.10)", "0.84 (↓0.05)", "0.92 (↓0.04)",
    ],
    "Out-of-Domain (Δ)": [
        "0.88 (↓0.07)", "0.92 (↓0.04)", "0.92 (↓0.03)", "0.89 (↓0.08)",
        "0.95 (↓0.03)", "0.87 (↑0.04)", "0.93 (↓0.02)", "0.92 (↓0.03)",
        "0.94 (↓0.02)", "0.91 (↓0.03)", "0.82 (↓0.09)", "0.87 (↓0.07)",
        "0.90 (↓0.05)", "0.85 (↓0.07)", "0.94 (↓0.01)", "0.92 (↓0.03)",
        "0.92 (↓0.02)", "0.86 (↓0.05)", "0.91 (↓0.04)", "0.90 (↓0.04)",
        "0.79 (↓0.07)", "0.83 (↓0.05)", "0.81 (↓0.08)", "0.90 (↓0.06)",
    ],
    "OCR Context (Δ)": [
        "0.91 (↓0.04)", "0.92 (↓0.04)", "0.95 (0.0)", "0.94 (↓0.03)",
        "0.90 (↓0.08)", "0.72 (↓0.11)", "0.86 (↓0.09)", "0.89 (↓0.06)",
        "0.93 (↓0.03)", "0.88 (↓0.06)", "0.80 (↓0.11)", "0.88 (↓0.06)",
        "0.89 (↓0.06)", "0.80 (↓0.12)", "0.88 (↓0.07)", "0.92 (↓0.03)",
        "0.91 (↓0.03)", "0.77 (↓0.14)", "0.89 (↓0.06)", "0.91 (↓0.03)",
        "0.69 (↓0.17)", "0.78 (↓0.10)", "0.72 (↓0.17)", "0.89 (↓0.07)",
    ],
    "Robustness (Δ)": [
        "0.83 (↓0.12)", "0.84 (↓0.12)", "0.85 (↓0.10)", "0.81 (↓0.16)",
        "0.90 (↓0.08)", "0.64 (↓0.19)", "0.82 (↓0.13)", "0.86 (↓0.09)",
        "0.89 (↓0.07)", "0.80 (↓0.14)", "0.70 (↓0.21)", "0.80 (↓0.14)",
        "0.82 (↓0.13)", "0.75 (↓0.17)", "0.86 (↓0.09)", "0.85 (↓0.10)",
        "0.84 (↓0.10)", "0.74 (↓0.17)", "0.80 (↓0.15)", "0.82 (↓0.12)",
        "0.58 (↓0.28)", "0.70 (↓0.18)", "0.63 (↓0.26)", "0.83 (↓0.13)",
    ],
}
|
|
|
|
|
# Context-grounding results, written one model per row for readability and
# transposed below into the column-oriented mapping the rest of the file uses.
_CONTEXT_GROUNDING_COLUMNS = (
    "Model Name",
    "Irrelevant Ctx",
    "No Ctx",
    "Ctx Grounding QA",
    "Ctx Grounding TG",
    "Ctx Grounding",
    "Robustness",
    "Compliance",
)

# Each row follows the column order of _CONTEXT_GROUNDING_COLUMNS.
_CONTEXT_GROUNDING_ROWS = (
    ("Gemini 2.0 Flash Exp", 0.81, 0.66, 0.77, 0.46, 0.74, 0.83, 0.76),
    ("Gemini 1.5 Pro 002", 0.74, 0.64, 0.72, 0.52, 0.69, 0.84, 0.72),
    ("OpenAI GPT-4o", 0.52, 0.43, 0.50, 0.25, 0.47, 0.85, 0.52),
    ("OpenAI o1", 0.56, 0.55, 0.57, 0.45, 0.55, 0.81, 0.59),
    ("OpenAI o3-mini", 0.67, 0.51, 0.63, 0.27, 0.59, 0.90, 0.63),
    ("DeepSeek-R1-Distill-Llama-8B", 0.32, 0.27, 0.30, 0.25, 0.30, 0.64, 0.34),
    ("DeepSeek-R1-Distill-Qwen-14B", 0.49, 0.21, 0.36, 0.27, 0.35, 0.82, 0.40),
    ("DeepSeek-R1-Distill-Qwen-32B", 0.54, 0.24, 0.40, 0.35, 0.39, 0.86, 0.44),
    ("DeepSeek-R1-Distill-Llama-70B", 0.50, 0.27, 0.41, 0.22, 0.38, 0.89, 0.43),
    ("DeepSeek-R1", 0.51, 0.22, 0.39, 0.20, 0.37, 0.80, 0.41),
    ("Meta-Llama-3.1-8B-Instruct", 0.67, 0.63, 0.70, 0.27, 0.65, 0.70, 0.66),
    ("Meta-Llama-3.1-70B-Instruct", 0.46, 0.37, 0.48, 0.37, 0.47, 0.80, 0.51),
    ("Meta-Llama-3.3-70B-Instruct", 0.50, 0.40, 0.47, 0.31, 0.45, 0.82, 0.49),
    ("Qwen2.5-7B-Instruct", 0.75, 0.64, 0.75, 0.31, 0.70, 0.75, 0.71),
    ("Qwen2.5-14B-Instruct", 0.75, 0.61, 0.70, 0.55, 0.68, 0.86, 0.71),
    ("Qwen2.5-32B-Instruct", 0.89, 0.68, 0.82, 0.55, 0.79, 0.85, 0.80),
    ("Qwen2.5-72B-Instruct", 0.69, 0.60, 0.68, 0.39, 0.64, 0.84, 0.67),
    ("Qwen2.5-7B-Instruct-1M", 0.63, 0.58, 0.65, 0.29, 0.60, 0.74, 0.62),
    ("Qwen2.5-14B-Instruct-1M", 0.78, 0.53, 0.69, 0.37, 0.65, 0.80, 0.68),
    ("Nemotron-70B-Instruct-HF", 0.52, 0.48, 0.52, 0.39, 0.50, 0.82, 0.54),
    ("Phi-3-mini-128k-Instruct", 0.54, 0.34, 0.47, 0.24, 0.44, 0.58, 0.46),
    ("Phi-3-small-128k-Instruct", 0.37, 0.26, 0.34, 0.10, 0.31, 0.70, 0.35),
    ("Phi-3-medium-128k-Instruct", 0.36, 0.25, 0.33, 0.14, 0.30, 0.63, 0.34),
    ("Palmyra-Fin-128k-Instruct", 0.95, 0.66, 0.83, 0.65, 0.80, 0.83, 0.81),
)

# Transpose rows into {column name: list of values}; insertion order follows
# _CONTEXT_GROUNDING_COLUMNS and every value list keeps the row order above.
context_grounding_data = {
    column: list(values)
    for column, values in zip(_CONTEXT_GROUNDING_COLUMNS, zip(*_CONTEXT_GROUNDING_ROWS))
}
|
|
|
|
|
# Columns highlighted when the caller does not pass ``columns``.
_DEFAULT_HIGHLIGHT_COLUMNS = frozenset(
    {
        "Baseline",
        "Irrelevant Ctx",
        "No Ctx",
        "Ctx Grounding QA",
        "Ctx Grounding TG",
        "Ctx Grounding",
        "Robustness",
        "Compliance",
    }
)


def _leading_score(cell):
    """Return the number at the start of *cell* as a float.

    Cells are either plain numbers or strings shaped like "0.95 (0.0)", in
    which case only the part before " (" is parsed.
    """
    text = str(cell)
    if " (" in text:
        return float(text.split(" (")[0])
    return float(cell)


def format_table(df, *, columns=None):
    """Return a copy of *df* with the best score of each column in bold.

    For every selected column the largest leading score is located and each
    cell that equals it is replaced by the string ``**<cell>**``. All other
    cells, all other columns, and *df* itself are left untouched.

    Args:
        df: Table with a "Model Name" column (never highlighted) plus score
            columns whose cells are numbers or "score (delta)" strings.
        columns: Optional iterable of column names to highlight. When omitted,
            the built-in list of score columns is used, which keeps the
            original behaviour.

    Returns:
        A new DataFrame; highlighted columns hold a mix of the original cell
        values and ``**...**`` strings.

    Raises:
        ValueError: If a cell in a selected column cannot be parsed as a
            number.
    """
    # NOTE(review): headers that carry a "(...)" suffix, such as the delta
    # columns of the robustness table, are not in the default set, so they are
    # only highlighted when passed explicitly via ``columns`` - confirm that
    # this is intended.
    selected = _DEFAULT_HIGHLIGHT_COLUMNS if columns is None else frozenset(columns)
    styled = df.copy()

    for col in df.columns:
        if col == "Model Name" or col not in selected:
            continue

        cells = df[col].tolist()
        # Parse every cell exactly once; the result drives both the maximum
        # and the per-cell comparison.
        scores = np.array([_leading_score(cell) for cell in cells], dtype=float)

        # Nothing to highlight in an empty or all-missing column.
        if scores.size == 0 or np.isnan(scores).all():
            continue

        # nanmax ignores missing values, so one NaN cell no longer prevents
        # the rest of the column from being highlighted.
        best = np.nanmax(scores)
        styled[col] = [
            f"**{cell}**" if is_best else cell
            for cell, is_best in zip(cells, scores == best)
        ]

    return styled
|
|
|
|
|
def create_leaderboard():
    """Build the Gradio app that shows the two leaderboard tables.

    The module-level ``robustness_data`` and ``context_grounding_data``
    mappings are loaded into DataFrames, passed through ``format_table`` and
    rendered as two ``gr.DataFrame`` components, each in its own tab inside
    its own column of a single row.

    Returns:
        The constructed ``gr.Blocks`` instance. It is not launched here; the
        caller decides when to call ``launch()``.
    """
    # Styling for the components tagged with the "custom-table" class.
    table_css = """
    .custom-table {
        font-size: 16px; /* Increase font size for readability */
        line-height: 2; /* Increase line height for longer rows */
        max-height: 600px; /* Set maximum height for scrolling if needed */
        overflow-y: auto; /* Enable vertical scrolling if content exceeds height */
        border-collapse: collapse;
    }
    .custom-table th, .custom-table td {
        padding: 12px; /* Increase padding for spacing */
        border: 1px solid #ddd;
    }
    .custom-table th {
        background-color: #f5f5f5;
        font-weight: bold;
    }
    """

    # (heading, table) pairs; the heading doubles as tab title and table label.
    panels = (
        ("Robustness Results", format_table(pd.DataFrame(robustness_data))),
        ("Context Grounding Results", format_table(pd.DataFrame(context_grounding_data))),
    )

    # The stylesheet is handed to the constructor instead of being assigned to
    # ``demo.css`` afterwards, so it is part of the app from the moment it is
    # built rather than depending on when the attribute is read.
    with gr.Blocks(
        theme=gr.themes.Soft(),
        title="Financial Model Performance Leaderboard",
        css=table_css,
    ) as demo:
        gr.Markdown("# Financial Model Performance Leaderboard")

        with gr.Row():
            for heading, table in panels:
                with gr.Column():
                    with gr.Tab(heading):
                        # NOTE(review): format_table marks the best cells with
                        # Markdown "**...**", but no ``datatype`` is passed
                        # here - confirm the markers render as bold rather
                        # than as literal asterisks.
                        gr.DataFrame(
                            value=table,
                            label=heading,
                            wrap=True,
                            elem_classes=["custom-table"],
                        )

    return demo
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the leaderboard UI, then start the Gradio
    # server. Nothing is launched when the module is merely imported.
    demo = create_leaderboard()
    demo.launch()