import gradio as gr
import pandas as pd

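# Precomputed benchmark results, stored as a pickled pandas DataFrame.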
all_results = pd.read_pickle("final_df.pkl")


def get_accuracy_dataframe(df_mother, category):
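    """Build the leaderboard table for one benchmark category.

    Returns a DataFrame with one row per model: overall accuracy plus
    accuracy per difficulty level, expressed as percentages.
    """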
    # Restrict to the requested benchmark category
    df = df_mother[df_mother["category"] == category].copy()

    # Calculate overall accuracy per model
    df["is_answer_correct"] = df["is_answer_correct"].astype(float)
    model_accuracy = df.groupby("model")["is_answer_correct"].mean().reset_index()

    # Calculate model accuracy per difficulty level
    df["difficulty_level"] = df["difficulty_level"].astype(int)
    model_accuracy_per_level = (
        df.groupby(["model", "difficulty_level"])["is_answer_correct"]
        .mean()
        .reset_index()
    )
    model_accuracy_per_level_df = model_accuracy_per_level.pivot(
        index="model", columns="difficulty_level", values="is_answer_correct"
    )

    # Merge overall accuracy and per-level accuracy into a single DataFrame
    # ("model" is a column on the left and the index on the right;
    # pandas merge matches index level names as well as columns)
    model_accuracy_df = model_accuracy.merge(model_accuracy_per_level_df, on="model")
    model_accuracy_df.rename(
        columns={"is_answer_correct": "Overall Accuracy"}, inplace=True
    )

    # Keep only the bare model name, dropping any "org/" prefix
    model_accuracy_df["model"] = model_accuracy_df["model"].apply(
        lambda x: x.split("/")[-1]
    )

    # Ensure all expected difficulty levels are present
    expected_levels = [1, 2, 3, 4]  # Adjust based on your data
    for level in expected_levels:
        if level not in model_accuracy_df.columns:
            # Fill missing levels with None so the column layout is stable
            model_accuracy_df[level] = None

    # Rename columns to include levels
    level_columns = {level: f"Level {level} Accuracy" for level in expected_levels}
    model_accuracy_df.rename(columns=level_columns, inplace=True)

    # Convert accuracies to percentages, rounded to one decimal place
    # (DataFrame.applymap was renamed DataFrame.map in pandas >= 2.1)
    model_accuracy_df = model_accuracy_df.applymap(
        lambda x: round(x * 100, 1) if isinstance(x, float) else x
    )

    # Replace the headers with icon-decorated names (assumes the column
    # order: model, overall, then levels 1-4)
    model_accuracy_df.columns = [
        "πŸ€– Model Name",
        "⭐ Overall",
        "πŸ“ˆ Level 1",
        "πŸ” Level 2",
        "πŸ“˜ Level 3",
        "πŸ”¬ Level 4",
    ]

    model_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)

    return model_accuracy_df


# Benchmark categories present in the results:
# "1shot", "CoT", "Textonly", "vision", "vision-CoT"
accuracy_df_textonly = get_accuracy_dataframe(all_results, "Textonly")
accuracy_df_cot = get_accuracy_dataframe(all_results, "CoT")
accuracy_df_vision = get_accuracy_dataframe(all_results, "vision")
accuracy_df_vision_cot = get_accuracy_dataframe(all_results, "vision-CoT")
accuracy_df_1shot = get_accuracy_dataframe(all_results, "1shot")


# Define the column names with icons
headers_with_icons = [
    "πŸ€– Model Name",
    "⭐ Overall",
    "πŸ“ˆ Level 1",
    "πŸ” Level 2",
    "πŸ“˜ Level 3",
    "πŸ”¬ Level 4",
]


def make_heatmap_loader(category):
    """Return a `.select` handler that loads the per-model heatmap image
    for the given benchmark category."""

    def load_heatmap(evt: gr.SelectData):
        # evt.value is the value of the clicked cell (expected to be the
        # model name when a row's name cell is selected)
        return gr.Image(f"./heatmaps/{evt.value}_{category}.jpg")

    return load_heatmap


# The loader factory is used directly in the `.select` calls below.

with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")
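    # One tab per benchmark setting; selecting a row in a leaderboard
    # loads the corresponding model's heatmap below the table.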

    # Text-only Benchmark
    with gr.Tab("Text-only Benchmark"):
        leader_board_textonly = gr.Dataframe(
            accuracy_df_textonly, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_textonly = gr.Image(label="", show_label=False)
        leader_board_textonly.select(
            fn=make_heatmap_loader("Textonly"), outputs=[heatmap_image_textonly]
        )

    # CoT Benchmark
    with gr.Tab("CoT Benchmark"):
        leader_board_cot = gr.Dataframe(accuracy_df_cot, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image_cot = gr.Image(label="", show_label=False)
        leader_board_cot.select(
            fn=make_heatmap_loader("CoT"), outputs=[heatmap_image_cot]
        )

    # Vision Benchmark
    with gr.Tab("Vision Benchmark"):
        leader_board_vision = gr.Dataframe(
            accuracy_df_vision, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision = gr.Image(label="", show_label=False)
        leader_board_vision.select(
            fn=make_heatmap_loader("vision"), outputs=[heatmap_image_vision]
        )

    # Vision-CoT Benchmark
    with gr.Tab("Vision-CoT Benchmark"):
        leader_board_vision_cot = gr.Dataframe(
            accuracy_df_vision_cot, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision_cot = gr.Image(label="", show_label=False)
        leader_board_vision_cot.select(
            fn=make_heatmap_loader("vision-CoT"), outputs=[heatmap_image_vision_cot]
        )

    # 1shot Benchmark
    with gr.Tab("1shot Benchmark"):
        leader_board_1shot = gr.Dataframe(accuracy_df_1shot, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image_1shot = gr.Image(label="", show_label=False)
        leader_board_1shot.select(
            fn=make_heatmap_loader("1shot"), outputs=[heatmap_image_1shot]
        )

demo.launch()