import gradio as gr
import pandas as pd

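# Precomputed benchmark results, stored as a pickled pandas DataFrame.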
all_results = pd.read_pickle("final_df.pkl")


def get_accuracy_dataframe(df_mother, category):
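    """Build the leaderboard table for one benchmark category.

    Returns a DataFrame with one row per model: overall accuracy plus
    accuracy per difficulty level, expressed as percentages.
    """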
    # Restrict to the requested benchmark category
    df = df_mother[df_mother["category"] == category].copy()

    # Calculate overall accuracy per model
    df["is_answer_correct"] = df["is_answer_correct"].astype(float)
    model_accuracy = df.groupby("model")["is_answer_correct"].mean().reset_index()

    # Calculate model accuracy per difficulty level
    df["difficulty_level"] = df["difficulty_level"].astype(int)
    model_accuracy_per_level = (
        df.groupby(["model", "difficulty_level"])["is_answer_correct"]
        .mean()
        .reset_index()
    )
    model_accuracy_per_level_df = model_accuracy_per_level.pivot(
        index="model", columns="difficulty_level", values="is_answer_correct"
    )

    # Merge overall accuracy and per-level accuracy into a single DataFrame
    # ("model" is a column on the left and the index on the right;
    # pandas merge matches index level names as well as columns)
    model_accuracy_df = model_accuracy.merge(model_accuracy_per_level_df, on="model")
    model_accuracy_df.rename(
        columns={"is_answer_correct": "Overall Accuracy"}, inplace=True
    )

    # Keep only the bare model name, dropping any "org/" prefix
    model_accuracy_df["model"] = model_accuracy_df["model"].apply(
        lambda x: x.split("/")[-1]
    )

    # Ensure all expected difficulty levels are present
    expected_levels = [1, 2, 3, 4]  # Adjust based on your data
    for level in expected_levels:
        if level not in model_accuracy_df.columns:
            # Fill missing levels with None so the column layout is stable
            model_accuracy_df[level] = None

    # Rename columns to include levels
    level_columns = {level: f"Level {level} Accuracy" for level in expected_levels}
    model_accuracy_df.rename(columns=level_columns, inplace=True)

    # Convert accuracies to percentages, rounded to one decimal place
    # (DataFrame.applymap was renamed DataFrame.map in pandas >= 2.1)
    model_accuracy_df = model_accuracy_df.applymap(
        lambda x: round(x * 100, 1) if isinstance(x, float) else x
    )

    # Replace the headers with icon-decorated names (assumes the column
    # order: model, overall, then levels 1-4)
    model_accuracy_df.columns = [
        "πŸ€– Model Name",
        "⭐ Overall",
        "πŸ“ˆ Level 1",
        "πŸ” Level 2",
        "πŸ“˜ Level 3",
        "πŸ”¬ Level 4",
    ]

    model_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)

    return model_accuracy_df


# Benchmark categories present in the results:
# "1shot", "CoT", "Textonly", "vision", "vision-CoT"
accuracy_df_textonly = get_accuracy_dataframe(all_results, "Textonly")
accuracy_df_cot = get_accuracy_dataframe(all_results, "CoT")
accuracy_df_vision = get_accuracy_dataframe(all_results, "vision")
accuracy_df_vision_cot = get_accuracy_dataframe(all_results, "vision-CoT")
accuracy_df_1shot = get_accuracy_dataframe(all_results, "1shot")


# Define the column names with icons
headers_with_icons = [
    "πŸ€– Model Name",
    "⭐ Overall",
    "πŸ“ˆ Level 1",
    "πŸ” Level 2",
    "πŸ“˜ Level 3",
    "πŸ”¬ Level 4",
]


def make_heatmap_loader(category):
    """Return a `.select` handler that loads the per-model heatmap image
    for the given benchmark category."""

    def load_heatmap(evt: gr.SelectData):
        # evt.value is the value of the clicked cell (expected to be the
        # model name when a row's name cell is selected)
        return gr.Image(f"./heatmaps/{evt.value}_{category}.jpg")

    return load_heatmap


# The loader factory is used directly in the `.select` calls below.

with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")
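    # One tab per benchmark setting; selecting a row in a leaderboard
    # loads the corresponding model's heatmap below the table.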

    # Text-only Benchmark
    with gr.Tab("Text-only Benchmark"):
        leader_board_textonly = gr.Dataframe(
            accuracy_df_textonly, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_textonly = gr.Image(label="", show_label=False)
        leader_board_textonly.select(
            fn=make_heatmap_loader("Textonly"), outputs=[heatmap_image_textonly]
        )

    # CoT Benchmark
    with gr.Tab("CoT Benchmark"):
        leader_board_cot = gr.Dataframe(accuracy_df_cot, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image_cot = gr.Image(label="", show_label=False)
        leader_board_cot.select(
            fn=make_heatmap_loader("CoT"), outputs=[heatmap_image_cot]
        )

    # Vision Benchmark
    with gr.Tab("Vision Benchmark"):
        leader_board_vision = gr.Dataframe(
            accuracy_df_vision, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision = gr.Image(label="", show_label=False)
        leader_board_vision.select(
            fn=make_heatmap_loader("vision"), outputs=[heatmap_image_vision]
        )

    # Vision-CoT Benchmark
    with gr.Tab("Vision-CoT Benchmark"):
        leader_board_vision_cot = gr.Dataframe(
            accuracy_df_vision_cot, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision_cot = gr.Image(label="", show_label=False)
        leader_board_vision_cot.select(
            fn=make_heatmap_loader("vision-CoT"), outputs=[heatmap_image_vision_cot]
        )

    # 1shot Benchmark
    with gr.Tab("1shot Benchmark"):
        leader_board_1shot = gr.Dataframe(accuracy_df_1shot, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image_1shot = gr.Image(label="", show_label=False)
        leader_board_1shot.select(
            fn=make_heatmap_loader("1shot"), outputs=[heatmap_image_1shot]
        )

demo.launch()