import gradio as gr
import pandas as pd
import numpy as np
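# Assumes pandas, numpy, and a reasonably recent Gradio release (the `elem_classes`
# argument and Markdown cell rendering used below are not available in very old versions).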
# Data for Table 1: Robustness Results (unchanged)
robustness_data = {
"Model Name": [
"Gemini 2.0 Flash Exp", "Gemini 1.5 Pro 002", "OpenAI GPT-4o", "OpenAI o1", "OpenAI o3-mini",
"DeepSeek-R1-Distill-Llama-8B", "DeepSeek-R1-Distill-Qwen-14B", "DeepSeek-R1-Distill-Qwen-32B",
"DeepSeek-R1-Distill-Llama-70B", "DeepSeek-R1", "Meta-Llama-3.1-8B-Instruct",
"Meta-Llama-3.1-70B-Instruct", "Meta-Llama-3.3-70B-Instruct", "Qwen2.5-7B-Instruct",
"Qwen2.5-14B-Instruct", "Qwen2.5-32B-Instruct", "Qwen2.5-72B-Instruct", "Qwen2.5-7B-Instruct-1M",
"Qwen2.5-14B-Instruct-1M", "Nemotron-70B-Instruct-HF", "Phi-3-mini-128k-Instruct",
"Phi-3-small-128k-Instruct", "Phi-3-medium-128k-Instruct", "Palmyra-Fin-128k-Instruct"
],
"Baseline": [0.95, 0.96, 0.95, 0.97, 0.98, 0.83, 0.95, 0.95, 0.96, 0.94, 0.91, 0.94, 0.95, 0.92, 0.95, 0.95, 0.94, 0.91, 0.95, 0.94, 0.86, 0.88, 0.89, 0.96],
"Misspelled (Ξ”)": ["0.95 (0.0)", "0.95 (0.0)", "0.94 (↓0.01)", "0.95 (↓0.02)", "0.96 (↓0.02)", "0.85 (↑0.02)", "0.90 (↓0.05)", "0.97 (↑0.02)", "0.97 (↑0.01)", "0.94 (0.0)", "0.90 (↓0.01)", "0.92 (↓0.02)", "0.92 (↓0.03)", "0.91 (↓0.01)", "0.94 (↓0.01)", "0.94 (0.0)", "0.94 (0.0)", "0.91 (0.0)", "0.92 (↓0.03)", "0.94 (0.0)", "0.85 (↓0.01)", "0.84 (↓0.04)", "0.84 (↓0.05)", "0.93 (↓0.03)"],
"Incomplete (Ξ”)": ["0.95 (0.0)", "0.94 (↓0.02)", "0.94 (↓0.01)", "0.94 (↓0.03)", "0.96 (↓0.02)", "0.82 (↓0.01)", "0.92 (↓0.03)", "0.95 (0.0)", "0.95 (↓0.01)", "0.93 (↓0.01)", "0.86 (↓0.05)", "0.94 (0.0)", "0.93 (↓0.02)", "0.90 (↓0.02)", "0.94 (↓0.01)", "0.93 (↓0.02)", "0.93 (↓0.01)", "0.91 (0.0)", "0.91 (↓0.04)", "0.93 (↓0.01)", "0.78 (↓0.08)", "0.78 (↓0.10)", "0.84 (↓0.05)", "0.92 (↓0.04)"],
"Out-of-Domain (Ξ”)": ["0.88 (↓0.07)", "0.92 (↓0.04)", "0.92 (↓0.03)", "0.89 (↓0.08)", "0.95 (↓0.03)", "0.87 (↑0.04)", "0.93 (↓0.02)", "0.92 (↓0.03)", "0.94 (↓0.02)", "0.91 (↓0.03)", "0.82 (↓0.09)", "0.87 (↓0.07)", "0.90 (↓0.05)", "0.85 (↓0.07)", "0.94 (↓0.01)", "0.92 (↓0.03)", "0.92 (↓0.02)", "0.86 (↓0.05)", "0.91 (↓0.04)", "0.90 (↓0.04)", "0.79 (↓0.07)", "0.83 (↓0.05)", "0.81 (↓0.08)", "0.90 (↓0.06)"],
"OCR Context (Ξ”)": ["0.91 (↓0.04)", "0.92 (↓0.04)", "0.95 (0.0)", "0.94 (↓0.03)", "0.90 (↓0.08)", "0.72 (↓0.11)", "0.86 (↓0.09)", "0.89 (↓0.06)", "0.93 (↓0.03)", "0.88 (↓0.06)", "0.80 (↓0.11)", "0.88 (↓0.06)", "0.89 (↓0.06)", "0.80 (↓0.12)", "0.88 (↓0.07)", "0.92 (↓0.03)", "0.91 (↓0.03)", "0.77 (↓0.14)", "0.89 (↓0.06)", "0.91 (↓0.03)", "0.69 (↓0.17)", "0.78 (↓0.10)", "0.72 (↓0.17)", "0.89 (↓0.07)"],
"Robustness (Ξ”)": ["0.83 (↓0.12)", "0.84 (↓0.12)", "0.85 (↓0.10)", "0.81 (↓0.16)", "0.90 (↓0.08)", "0.64 (↓0.19)", "0.82 (↓0.13)", "0.86 (↓0.09)", "0.89 (↓0.07)", "0.80 (↓0.14)", "0.70 (↓0.21)", "0.80 (↓0.14)", "0.82 (↓0.13)", "0.75 (↓0.17)", "0.86 (↓0.09)", "0.85 (↓0.10)", "0.84 (↓0.10)", "0.74 (↓0.17)", "0.80 (↓0.15)", "0.82 (↓0.12)", "0.58 (↓0.28)", "0.70 (↓0.18)", "0.63 (↓0.26)", "0.83 (↓0.13)"]
}
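# Each delta column above stores strings such as "0.95 (↓0.02)": the score under that
# perturbation, followed by its change relative to the "Baseline" column.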
# Data for Table 2: Context Grounding Results (unchanged)
context_grounding_data = {
"Model Name": [
"Gemini 2.0 Flash Exp", "Gemini 1.5 Pro 002", "OpenAI GPT-4o", "OpenAI o1", "OpenAI o3-mini",
"DeepSeek-R1-Distill-Llama-8B", "DeepSeek-R1-Distill-Qwen-14B", "DeepSeek-R1-Distill-Qwen-32B",
"DeepSeek-R1-Distill-Llama-70B", "DeepSeek-R1", "Meta-Llama-3.1-8B-Instruct",
"Meta-Llama-3.1-70B-Instruct", "Meta-Llama-3.3-70B-Instruct", "Qwen2.5-7B-Instruct",
"Qwen2.5-14B-Instruct", "Qwen2.5-32B-Instruct", "Qwen2.5-72B-Instruct", "Qwen2.5-7B-Instruct-1M",
"Qwen2.5-14B-Instruct-1M", "Nemotron-70B-Instruct-HF", "Phi-3-mini-128k-Instruct",
"Phi-3-small-128k-Instruct", "Phi-3-medium-128k-Instruct", "Palmyra-Fin-128k-Instruct"
],
"Irrelevant Ctx": [0.81, 0.74, 0.52, 0.56, 0.67, 0.32, 0.49, 0.54, 0.50, 0.51, 0.67, 0.46, 0.50, 0.75, 0.75, 0.89, 0.69, 0.63, 0.78, 0.52, 0.54, 0.37, 0.36, 0.95],
"No Ctx": [0.66, 0.64, 0.43, 0.55, 0.51, 0.27, 0.21, 0.24, 0.27, 0.22, 0.63, 0.37, 0.40, 0.64, 0.61, 0.68, 0.60, 0.58, 0.53, 0.48, 0.34, 0.26, 0.25, 0.66],
"Ctx Grounding QA": [0.77, 0.72, 0.50, 0.57, 0.63, 0.30, 0.36, 0.40, 0.41, 0.39, 0.70, 0.48, 0.47, 0.75, 0.70, 0.82, 0.68, 0.65, 0.69, 0.52, 0.47, 0.34, 0.33, 0.83],
"Ctx Grounding TG": [0.46, 0.52, 0.25, 0.45, 0.27, 0.25, 0.27, 0.35, 0.22, 0.20, 0.27, 0.37, 0.31, 0.31, 0.55, 0.55, 0.39, 0.29, 0.37, 0.39, 0.24, 0.10, 0.14, 0.65],
"Ctx Grounding": [0.74, 0.69, 0.47, 0.55, 0.59, 0.30, 0.35, 0.39, 0.38, 0.37, 0.65, 0.47, 0.45, 0.70, 0.68, 0.79, 0.64, 0.60, 0.65, 0.50, 0.44, 0.31, 0.30, 0.80],
"Robustness": [0.83, 0.84, 0.85, 0.81, 0.90, 0.64, 0.82, 0.86, 0.89, 0.80, 0.70, 0.80, 0.82, 0.75, 0.86, 0.85, 0.84, 0.74, 0.80, 0.82, 0.58, 0.70, 0.63, 0.83],
"Compliance": [0.76, 0.72, 0.52, 0.59, 0.63, 0.34, 0.40, 0.44, 0.43, 0.41, 0.66, 0.51, 0.49, 0.71, 0.71, 0.80, 0.67, 0.62, 0.68, 0.54, 0.46, 0.35, 0.34, 0.81]
}
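# Note: the "Robustness" values here repeat the aggregate "Robustness (Δ)" scores from Table 1.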
# Function to bold the highest score per column (excluding "Model Name")
def format_table(df):
    styled_df = df.copy()
    numeric_columns = [col for col in df.columns if col != "Model Name"]
    for col in numeric_columns:
        if col in ["Baseline", "Irrelevant Ctx", "No Ctx", "Ctx Grounding QA", "Ctx Grounding TG", "Ctx Grounding", "Robustness", "Compliance"]:
            # Convert string values (e.g., "0.95 (0.0)") to float for comparison, or use direct float values
            if any(" (" in str(x) for x in df[col]):
                # Handle string values with deltas (e.g., "0.95 (0.0)")
                values = [float(str(x).split(" (")[0]) for x in df[col]]
            else:
                # Handle direct float values
                values = df[col].astype(float)
            max_value = np.max(values)
            # Wrap the column's top score in ** so it renders bold when the table is shown as Markdown
            styled_df[col] = df[col].apply(
                lambda x: f"**{x}**"
                if (float(str(x).split(" (")[0]) if " (" in str(x) else float(x)) == max_value
                else x
            )
    return styled_df
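# Minimal illustration of format_table's behaviour on a toy frame (not data used by the app):
#   toy = pd.DataFrame({"Model Name": ["A", "B"], "Baseline": [0.95, 0.97]})
#   format_table(toy)["Baseline"].tolist()  # -> [0.95, "**0.97**"]
# Delta columns such as "Misspelled (Δ)" are not in the list above and are left untouched.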
# Function to create the Gradio interface
def create_leaderboard():
    # Convert data to DataFrames
    robustness_df = pd.DataFrame(robustness_data)
    context_grounding_df = pd.DataFrame(context_grounding_data)
    # Format tables to bold highest scores
    robustness_df = format_table(robustness_df)
    context_grounding_df = format_table(context_grounding_df)
    # Create Gradio interface with a nice theme
    with gr.Blocks(theme=gr.themes.Soft(), title="Financial Model Performance Leaderboard") as demo:
        gr.Markdown("# Financial Model Performance Leaderboard")
        with gr.Row():
            with gr.Column():
                with gr.Tab("Robustness Results"):
                    gr.DataFrame(
                        value=robustness_df,
                        label="Robustness Results",
                        wrap=True,
                        datatype="markdown",  # render cells as Markdown so the **bold** top scores display
                        elem_classes=["custom-table"]  # Custom CSS class for styling
                    )
            with gr.Column():
                with gr.Tab("Context Grounding Results"):
                    gr.DataFrame(
                        value=context_grounding_df,
                        label="Context Grounding Results",
                        wrap=True,
                        datatype="markdown",  # render cells as Markdown so the **bold** top scores display
                        elem_classes=["custom-table"]  # Custom CSS class for styling
                    )
    # Custom CSS for better table appearance (larger font, spacing, and height)
    demo.css = """
    .custom-table {
        font-size: 16px; /* Increase font size for readability */
        line-height: 2; /* Increase line height for longer rows */
        max-height: 600px; /* Set maximum height for scrolling if needed */
        overflow-y: auto; /* Enable vertical scrolling if content exceeds height */
        border-collapse: collapse;
    }
    .custom-table th, .custom-table td {
        padding: 12px; /* Increase padding for spacing */
        border: 1px solid #ddd;
    }
    .custom-table th {
        background-color: #f5f5f5;
        font-weight: bold;
    }
    """
    return demo
# Launch the Gradio app
if __name__ == "__main__":
    demo = create_leaderboard()
    demo.launch()
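    # Optional (not part of the original app): demo.launch() also accepts arguments such as
    # share=True (temporary public link) or server_name="0.0.0.0" (listen on all interfaces)
    # if the leaderboard needs to be reachable from other machines.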