Commit: update

Files changed:
- .gitattributes +12 -0
- app.py +86 -40
- results-cot/gpt-3.5-CoT.csv +3 -0
- results-cot/gpt-3.5-CoT.jpg +3 -0
- results-cot/gpt-3.5-CoT.pkl +3 -0
- results-cot/gpt-3.5-CoT.png +3 -0
- results-cot/gpt-4v-CoT-Azure.csv +3 -0
- results-cot/gpt-4v-CoT-Azure.jpg +3 -0
- results-cot/gpt-4v-CoT-Azure.pkl +3 -0
- results-cot/gpt-4v-CoT-Azure.png +3 -0
- results-vision-CoT/gemini-pro-vision-CoT.csv +3 -0
- results-vision-CoT/gemini-pro-vision-CoT.jpg +3 -0
- results-vision-CoT/gemini-pro-vision-CoT.pkl +3 -0
- results-vision-CoT/gemini-pro-vision-CoT.png +3 -0
.gitattributes
CHANGED
@@ -103,3 +103,15 @@ results-vision/claude-3-opus-vision.png filter=lfs diff=lfs merge=lfs -text
 results-vision/gemini-pro-vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
 results-vision/gemini-pro-vision.pkl filter=lfs diff=lfs merge=lfs -text
 results-vision/gpt-4v-CoT.png filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.jpg filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.jpg filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.png filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.png filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.png filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.pkl filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.pkl filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.pkl filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.csv filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.csv filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.csv filter=lfs diff=lfs merge=lfs -text
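The twelve new patterns route every CoT result artifact (.jpg, .png, .pkl, .csv) through Git LFS, so the repository stores small pointer files instead of the blobs themselves; the pointer format is visible in the ADDED files at the end of this commit. As a minimal sketch of how to read these entries back out, the following helper is illustrative only: lfs_tracked_paths is not part of the repo, and it assumes each line has the "<path> filter=lfs diff=lfs merge=lfs -text" shape shown above.

def lfs_tracked_paths(gitattributes_path: str = ".gitattributes") -> list[str]:
    # Collect the first token of every line that carries the LFS filter attribute.
    paths = []
    with open(gitattributes_path, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if parts and "filter=lfs" in parts[1:]:
                paths.append(parts[0])
    return paths

# Example: lfs_tracked_paths() would now include "results-cot/gpt-3.5-CoT.csv".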
app.py
CHANGED
@@ -6,19 +6,30 @@ from glob import glob
 csv_results = glob("results/*.pkl")
 # Load vision benchmark results
 vision_results = glob("results-vision/*.pkl")
+# Load CoT text benchmark results
+cot_text_results = glob("results-cot/*.pkl")
+# Load CoT vision benchmark results
+cot_vision_results = glob("results-vision-CoT/*.pkl")
 
 # Load the csv files into a dict with keys being name of the file and values being the data
 data = {file: pd.read_pickle(file) for file in csv_results}
 # Load the vision files into a dict
 vision_data = {file: pd.read_pickle(file) for file in vision_results}
+# Load the CoT text files into a dict
+cot_text_data = {file: pd.read_pickle(file) for file in cot_text_results}
+# Load the CoT vision files into a dict
+cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}
+
 
 def calculate_accuracy(df):
     return df["parsed_judge_response"].mean() * 100
 
+
 def accuracy_breakdown(df):
     # 4 level accuracy
     return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values
 
+
 # Define the column names with icons
 headers_with_icons = [
     "🤖 Model Name",
@@ -29,16 +40,6 @@ headers_with_icons = [
     "🔬 Level 4",
 ]
 
-# Process text benchmark data
-accuracy = {file: calculate_accuracy(data[file]) for file in data}
-data_for_df = []
-
-for file, df in data.items():
-    overall_accuracy = round(calculate_accuracy(df), 2)
-    breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
-    model_name = file.split("/")[-1].replace(".pkl", "")
-    data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
-
 column_names = [
     "Model Name",
     "Overall Accuracy",
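For context on the removed inline loop (and on process_data, which replaces it in the next hunk): parsed_judge_response holds 0/1 judge verdicts, so its mean is an accuracy, and grouping by difficulty_level yields the four per-level columns. A minimal sketch on invented toy data; the real result pickles carry more columns than these two.

import pandas as pd

# Toy stand-in for one model's result pickle (values invented for illustration).
toy = pd.DataFrame({
    "parsed_judge_response": [1, 0, 1, 1, 0, 1, 1, 0],
    "difficulty_level":      [1, 1, 2, 2, 3, 3, 4, 4],
})

overall = toy["parsed_judge_response"].mean() * 100  # 62.5
per_level = (toy.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values
# array([ 50., 100.,  50.,  50.])  -> Levels 1 through 4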
@@ -48,46 +49,65 @@ column_names = [
     "Level 4 Accuracy",
 ]
 
-#
-
-
-
-
-
-
-
-
 
 
-# Process vision benchmark data
-vision_data_for_df = []
 
-
-
-
-
 
-# vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
-# vision_accuracy_df.columns = headers_with_icons
-# vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
-
-# Do the same for vision_accuracy_df
-vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
-vision_accuracy_df = vision_accuracy_df.round(1)  # Round to one decimal place
-vision_accuracy_df = vision_accuracy_df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
-vision_accuracy_df.columns = headers_with_icons
-vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+# Function to process data
+def process_data(data):
+    data_for_df = []
+    for file, df in data.items():
+        overall_accuracy = round(calculate_accuracy(df), 2)
+        breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
+        model_name = file.split("/")[-1].replace(".pkl", "")
+        data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
+    return data_for_df
+
+
+# Process all data
+text_data_for_df = process_data(data)
+vision_data_for_df = process_data(vision_data)
+cot_text_data_for_df = process_data(cot_text_data)
+cot_vision_data_for_df = process_data(cot_vision_data)
+
+# Create DataFrames
+accuracy_df = pd.DataFrame(text_data_for_df, columns=column_names)
+vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
+cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names)
+cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)
+
+# Function to finalize DataFrame
+def finalize_df(df):
+    df = df.round(1)  # Round to one decimal place
+    df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
+    df.columns = headers_with_icons
+    df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+    return df
+
+
+# Finalize all DataFrames
+accuracy_df = finalize_df(accuracy_df)
+vision_accuracy_df = finalize_df(vision_accuracy_df)
+cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
+cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)
 
 def load_heatmap(evt: gr.SelectData):
     heatmap_image = gr.Image(f"results/{evt.value}.jpg")
     return heatmap_image
 
+
 def load_vision_heatmap(evt: gr.SelectData):
     heatmap_image = gr.Image(f"results-vision/{evt.value}.jpg")
     return heatmap_image
 
+
+def load_cot_heatmap(evt: gr.SelectData):
+    heatmap_image = gr.Image(f"results-cot/{evt.value}.jpg")
+    return heatmap_image
+
+
+def load_cot_vision_heatmap(evt: gr.SelectData):
+    heatmap_image = gr.Image(f"results-vision-CoT/{evt.value}.jpg")
+    return heatmap_image
+
+
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
     with gr.Tab("Text-only Benchmark"):
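Two hedged notes on finalize_df as introduced above: pandas 2.1 deprecated DataFrame.applymap in favor of DataFrame.map, and because the sort runs after the values have been formatted into strings, a score like "9.9" orders above "85.0" lexicographically. A sketch of an equivalent that sorts while the column is still numeric; this is a suggestion, not what the commit ships.

def finalize_df(df):
    # Sort while "Overall Accuracy" is still numeric, then format for display.
    df = df.sort_values(by="Overall Accuracy", ascending=False)
    fmt = lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x
    # DataFrame.map exists from pandas 2.1; fall back to applymap on older versions.
    df = df.map(fmt) if hasattr(df, "map") else df.applymap(fmt)
    df.columns = headers_with_icons
    return df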
@@ -99,9 +119,35 @@ with gr.Blocks() as demo:
 
     with gr.Tab("Vision Benchmark"):
         gr.Markdown("# Vision Benchmark Leaderboard")
-        leader_board_vision = gr.Dataframe(vision_accuracy_df, headers=headers_with_icons)
+        leader_board_vision = gr.Dataframe(
+            vision_accuracy_df, headers=headers_with_icons
+        )
         gr.Markdown("## Heatmap")
         heatmap_image_vision = gr.Image(label="", show_label=False)
-        leader_board_vision.select(fn=load_vision_heatmap, outputs=[heatmap_image_vision])
+        leader_board_vision.select(
+            fn=load_vision_heatmap, outputs=[heatmap_image_vision]
+        )
+
+    with gr.Tab("CoT Text-only Benchmark"):
+        gr.Markdown("# CoT Text-only Leaderboard")
+        cot_leader_board_text = gr.Dataframe(
+            cot_text_accuracy_df, headers=headers_with_icons
+        )
+        gr.Markdown("## Heatmap")
+        cot_heatmap_image_text = gr.Image(label="", show_label=False)
+        cot_leader_board_text.select(
+            fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
+        )
+
+    with gr.Tab("CoT Vision Benchmark"):
+        gr.Markdown("# CoT Vision Benchmark Leaderboard")
+        cot_leader_board_vision = gr.Dataframe(
+            cot_vision_accuracy_df, headers=headers_with_icons
+        )
+        gr.Markdown("## Heatmap")
+        cot_heatmap_image_vision = gr.Image(label="", show_label=False)
+        cot_leader_board_vision.select(
+            fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision]
+        )
 
-demo.launch()
+demo.launch()
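The select wiring that this hunk repeats across three tabs relies on Gradio invoking the handler with a gr.SelectData event whose .value is the clicked cell; each loader interpolates that value into a heatmap path, which is why clicking a model-name cell resolves to the matching .jpg. A stripped-down, self-contained sketch of the same pattern (one tab, invented score data), assuming a Gradio 4.x environment like the Space itself:

import gradio as gr
import pandas as pd

# Invented single-row board; the real app builds these frames from the pickles.
scores = pd.DataFrame({"🤖 Model Name": ["model-a"], "⭐ Overall": ["62.5"]})

def load_heatmap(evt: gr.SelectData):
    # evt.value carries the clicked cell's contents.
    return gr.Image(f"results/{evt.value}.jpg")

with gr.Blocks() as demo:
    board = gr.Dataframe(scores)
    heatmap = gr.Image(label="", show_label=False)
    board.select(fn=load_heatmap, outputs=[heatmap])

demo.launch()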
results-cot/gpt-3.5-CoT.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25400229561733404647fa6aa2ab0372a8507f8c32a17339e9566a57c2618c93
+size 14472393

results-cot/gpt-3.5-CoT.jpg
ADDED
(binary image stored via Git LFS; preview not shown)

results-cot/gpt-3.5-CoT.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a5429ee7014934ba056f77e642157cb5ed3305246b6bfb6a335dc6cd874b4fd
+size 14487910

results-cot/gpt-3.5-CoT.png
ADDED
(binary image stored via Git LFS; preview not shown)

results-cot/gpt-4v-CoT-Azure.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04b4de1a7a4280354c89609d15282109ee60f8f58129960dc0edbb046b12a5c6
+size 6374181

results-cot/gpt-4v-CoT-Azure.jpg
ADDED
(binary image stored via Git LFS; preview not shown)

results-cot/gpt-4v-CoT-Azure.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52ae5e417e011db84976acd51a024eae7ccea1e686b7f3f0e8158cd77be4f847
+size 6320889

results-cot/gpt-4v-CoT-Azure.png
ADDED
(binary image stored via Git LFS; preview not shown)

results-vision-CoT/gemini-pro-vision-CoT.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ebebe1d6caee19a4f714bf13eaba72e7a0b5d15281c407cd4dc53a2820ad312
+size 6184119

results-vision-CoT/gemini-pro-vision-CoT.jpg
ADDED
(binary image stored via Git LFS; preview not shown)

results-vision-CoT/gemini-pro-vision-CoT.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:049d575dbad9da04496fea752e19f915bcec445b13f3010f9c67544012c936ff
+size 6144275

results-vision-CoT/gemini-pro-vision-CoT.png
ADDED
(binary image stored via Git LFS; preview not shown)
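Every ADDED file above is a three-line LFS pointer (version, oid, size); the actual binaries land only after git lfs pull. As a hedged sketch, the oid and size from a pointer can be checked against the materialized file; verify_lfs_object is an illustrative name, not something in this repo.

import hashlib
import os

def verify_lfs_object(path: str, expected_oid: str, expected_size: int) -> bool:
    # Stream in 1 MiB chunks so multi-megabyte artifacts stay out of memory.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid and os.path.getsize(path) == expected_size

# Values copied from the results-cot/gpt-3.5-CoT.csv pointer above:
# verify_lfs_object(
#     "results-cot/gpt-3.5-CoT.csv",
#     "25400229561733404647fa6aa2ab0372a8507f8c32a17339e9566a57c2618c93",
#     14472393,
# )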