taesiri committed
Commit 16b6bb4
1 Parent(s): b6df3c4
.gitattributes CHANGED
@@ -103,3 +103,15 @@ results-vision/claude-3-opus-vision.png filter=lfs diff=lfs merge=lfs -text
 results-vision/gemini-pro-vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
 results-vision/gemini-pro-vision.pkl filter=lfs diff=lfs merge=lfs -text
 results-vision/gpt-4v-CoT.png filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.jpg filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.jpg filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.png filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.png filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.png filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.pkl filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.pkl filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.pkl filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.csv filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.csv filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.csv filter=lfs diff=lfs merge=lfs -text
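Note: each added line binds one literal path to the Git LFS filter, so the new CoT result artifacts are stored as pointers rather than in-repo blobs. A minimal sketch, assuming a hypothetical is_lfs_tracked helper and the literal patterns above, of checking coverage from Python:

# Sketch: confirm a path is covered by the .gitattributes LFS rules above.
# is_lfs_tracked and the trimmed pattern list are illustrative, not part of this repo.
from fnmatch import fnmatch

LFS_PATTERNS = [
    "results-cot/gpt-3.5-CoT.jpg",
    "results-cot/gpt-3.5-CoT.pkl",
    "results-vision-CoT/gemini-pro-vision-CoT.csv",
]

def is_lfs_tracked(path: str) -> bool:
    # The patterns here are literal paths, so fnmatch degenerates to string
    # equality; it would also handle glob rules such as "results-cot/*.pkl".
    return any(fnmatch(path, pattern) for pattern in LFS_PATTERNS)

print(is_lfs_tracked("results-cot/gpt-3.5-CoT.jpg"))  # True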
app.py CHANGED
@@ -6,19 +6,30 @@ from glob import glob
 csv_results = glob("results/*.pkl")
 # Load vision benchmark results
 vision_results = glob("results-vision/*.pkl")
+# Load CoT text benchmark results
+cot_text_results = glob("results-cot/*.pkl")
+# Load CoT vision benchmark results
+cot_vision_results = glob("results-vision-CoT/*.pkl")
 
 # Load the csv files into a dict with keys being name of the file and values being the data
 data = {file: pd.read_pickle(file) for file in csv_results}
 # Load the vision files into a dict
 vision_data = {file: pd.read_pickle(file) for file in vision_results}
+# Load the CoT text files into a dict
+cot_text_data = {file: pd.read_pickle(file) for file in cot_text_results}
+# Load the CoT vision files into a dict
+cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}
+
 
 def calculate_accuracy(df):
     return df["parsed_judge_response"].mean() * 100
 
+
 def accuracy_breakdown(df):
     # 4 level accuracy
     return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values
 
+
 # Define the column names with icons
 headers_with_icons = [
     "🤖 Model Name",
@@ -29,16 +40,6 @@ headers_with_icons = [
     "🔬 Level 4",
 ]
 
-# Process text benchmark data
-accuracy = {file: calculate_accuracy(data[file]) for file in data}
-data_for_df = []
-
-for file, df in data.items():
-    overall_accuracy = round(calculate_accuracy(df), 2)
-    breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
-    model_name = file.split("/")[-1].replace(".pkl", "")
-    data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
-
 column_names = [
     "Model Name",
     "Overall Accuracy",
@@ -48,46 +49,65 @@ column_names = [
     "Level 4 Accuracy",
 ]
 
-# accuracy_df = pd.DataFrame(data_for_df, columns=column_names)
-# accuracy_df.columns = headers_with_icons
-# accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
-
-# After creating the DataFrame and before sorting
-accuracy_df = pd.DataFrame(data_for_df, columns=column_names)
-accuracy_df = accuracy_df.round(1)  # Round to one decimal place
-accuracy_df = accuracy_df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
-accuracy_df.columns = headers_with_icons
-accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+# Function to process data
+def process_data(data):
+    data_for_df = []
+    for file, df in data.items():
+        overall_accuracy = round(calculate_accuracy(df), 2)
+        breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
+        model_name = file.split("/")[-1].replace(".pkl", "")
+        data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
+    return data_for_df
+
+
+# Process all data
+text_data_for_df = process_data(data)
+vision_data_for_df = process_data(vision_data)
+cot_text_data_for_df = process_data(cot_text_data)
+cot_vision_data_for_df = process_data(cot_vision_data)
+
+# Create DataFrames
+accuracy_df = pd.DataFrame(text_data_for_df, columns=column_names)
+vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
+cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names)
+cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)
 
+# Function to finalize DataFrame
+def finalize_df(df):
+    df = df.round(1)  # Round to one decimal place
+    df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
+    df.columns = headers_with_icons
+    df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+    return df
 
-# Process vision benchmark data
-vision_data_for_df = []
 
-for file, df in vision_data.items():
-    overall_accuracy = round(calculate_accuracy(df), 2)
-    breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
-    model_name = file.split("/")[-1].replace(".pkl", "")
-    vision_data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
+# Finalize all DataFrames
+accuracy_df = finalize_df(accuracy_df)
+vision_accuracy_df = finalize_df(vision_accuracy_df)
+cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
+cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)
 
-# vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
-# vision_accuracy_df.columns = headers_with_icons
-# vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
-
-# Do the same for vision_accuracy_df
-vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
-vision_accuracy_df = vision_accuracy_df.round(1)  # Round to one decimal place
-vision_accuracy_df = vision_accuracy_df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
-vision_accuracy_df.columns = headers_with_icons
-vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
 
 def load_heatmap(evt: gr.SelectData):
     heatmap_image = gr.Image(f"results/{evt.value}.jpg")
     return heatmap_image
 
+
 def load_vision_heatmap(evt: gr.SelectData):
     heatmap_image = gr.Image(f"results-vision/{evt.value}.jpg")
     return heatmap_image
 
+
+def load_cot_heatmap(evt: gr.SelectData):
+    heatmap_image = gr.Image(f"results-cot/{evt.value}.jpg")
+    return heatmap_image
+
+
+def load_cot_vision_heatmap(evt: gr.SelectData):
+    heatmap_image = gr.Image(f"results-vision-CoT/{evt.value}.jpg")
    return heatmap_image
+
+
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
     with gr.Tab("Text-only Benchmark"):
@@ -99,9 +119,35 @@ with gr.Blocks() as demo:
 
     with gr.Tab("Vision Benchmark"):
         gr.Markdown("# Vision Benchmark Leaderboard")
-        leader_board_vision = gr.Dataframe(vision_accuracy_df, headers=headers_with_icons)
+        leader_board_vision = gr.Dataframe(
+            vision_accuracy_df, headers=headers_with_icons
+        )
         gr.Markdown("## Heatmap")
         heatmap_image_vision = gr.Image(label="", show_label=False)
-        leader_board_vision.select(fn=load_vision_heatmap, outputs=[heatmap_image_vision])
+        leader_board_vision.select(
+            fn=load_vision_heatmap, outputs=[heatmap_image_vision]
+        )
+
+    with gr.Tab("CoT Text-only Benchmark"):
+        gr.Markdown("# CoT Text-only Leaderboard")
+        cot_leader_board_text = gr.Dataframe(
+            cot_text_accuracy_df, headers=headers_with_icons
+        )
+        gr.Markdown("## Heatmap")
+        cot_heatmap_image_text = gr.Image(label="", show_label=False)
+        cot_leader_board_text.select(
+            fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
+        )
+
+    with gr.Tab("CoT Vision Benchmark"):
+        gr.Markdown("# CoT Vision Benchmark Leaderboard")
+        cot_leader_board_vision = gr.Dataframe(
+            cot_vision_accuracy_df, headers=headers_with_icons
+        )
+        gr.Markdown("## Heatmap")
+        cot_heatmap_image_vision = gr.Image(label="", show_label=False)
+        cot_leader_board_vision.select(
+            fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision]
+        )
 
-demo.launch()
+demo.launch()
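The refactor above collapses the duplicated text and vision scoring blocks into two helpers: process_data builds one [model, overall, level 1-4] row per results file, and finalize_df rounds, formats, and sorts the leaderboard. A minimal sketch on a synthetic frame (the judge responses and difficulty levels below are invented) showing what one row looks like:

# Sketch: exercise the scoring helpers from app.py on synthetic data.
# The toy judge responses and difficulty levels are made up for illustration.
import pandas as pd

toy = pd.DataFrame({
    "parsed_judge_response": [1, 0, 1, 1, 0, 1, 1, 1],  # 1 = judged correct
    "difficulty_level": [1, 1, 2, 2, 3, 3, 4, 4],
})

def calculate_accuracy(df):
    return df["parsed_judge_response"].mean() * 100

def accuracy_breakdown(df):
    return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values

overall = round(calculate_accuracy(toy), 2)              # 75.0
levels = [round(a, 2) for a in accuracy_breakdown(toy)]  # [50.0, 100.0, 50.0, 100.0]
print(["toy-model", overall] + levels)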
results-cot/gpt-3.5-CoT.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25400229561733404647fa6aa2ab0372a8507f8c32a17339e9566a57c2618c93
+size 14472393
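Each ADDED .csv and .pkl entry in this commit is a Git LFS pointer file like the one above: three lines giving the spec version, the sha256 object id, and the payload size in bytes. A minimal sketch of reading one (parse_pointer is a hypothetical helper, not part of this repo):

# Sketch: parse the three-line Git LFS pointer format shown above.
def parse_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "oid": fields["oid"].split(":", 1)[1],  # strip the "sha256:" prefix
        "size": int(fields["size"]),
    }

pointer_text = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:25400229561733404647fa6aa2ab0372a8507f8c32a17339e9566a57c2618c93\n"
    "size 14472393\n"
)
print(parse_pointer(pointer_text)["size"])  # 14472393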
results-cot/gpt-3.5-CoT.jpg ADDED
Git LFS Details
  • SHA256: ecdcdb3508a90af17ff384cb0a0e1065e33ffad5a69b813a51bfb0bf8287dc92
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
results-cot/gpt-3.5-CoT.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a5429ee7014934ba056f77e642157cb5ed3305246b6bfb6a335dc6cd874b4fd
+size 14487910
results-cot/gpt-3.5-CoT.png ADDED
Git LFS Details
  • SHA256: 6ef6e2275200de4bb9853889b68935806cc3f74716007d812808c49c1c19d46f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.02 MB
results-cot/gpt-4v-CoT-Azure.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04b4de1a7a4280354c89609d15282109ee60f8f58129960dc0edbb046b12a5c6
+size 6374181
results-cot/gpt-4v-CoT-Azure.jpg ADDED
Git LFS Details
  • SHA256: 6d63da74c747dc220638351069b927925aaa34e580e2c00e70dd29e0d2cefebb
  • Pointer size: 132 Bytes
  • Size of remote file: 1.33 MB
results-cot/gpt-4v-CoT-Azure.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52ae5e417e011db84976acd51a024eae7ccea1e686b7f3f0e8158cd77be4f847
+size 6320889
results-cot/gpt-4v-CoT-Azure.png ADDED
Git LFS Details
  • SHA256: b8a96d76a726ab67813368f0a630576aee5cda6b5264c2edc65af93932fe4a32
  • Pointer size: 132 Bytes
  • Size of remote file: 1.01 MB
results-vision-CoT/gemini-pro-vision-CoT.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ebebe1d6caee19a4f714bf13eaba72e7a0b5d15281c407cd4dc53a2820ad312
+size 6184119
results-vision-CoT/gemini-pro-vision-CoT.jpg ADDED
Git LFS Details
  • SHA256: fed7a1736c7550edca80305d90c975e36da47331bc67f824c23b6bb5525289b4
  • Pointer size: 132 Bytes
  • Size of remote file: 1.33 MB
results-vision-CoT/gemini-pro-vision-CoT.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:049d575dbad9da04496fea752e19f915bcec445b13f3010f9c67544012c936ff
+size 6144275
results-vision-CoT/gemini-pro-vision-CoT.png ADDED
Git LFS Details
  • SHA256: 49ab8af8d2e3d2fb671b375a830808eb92a84e0faef35d2844f8eed62bd6acf5
  • Pointer size: 132 Bytes
  • Size of remote file: 1.01 MB
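For the image entries, the page shows only the Git LFS details (SHA256, pointer size, remote size), not the pixels themselves. A minimal sketch, assuming the file has already been pulled from LFS into the working tree, of checking a local copy against the SHA256 listed above:

# Sketch: verify a pulled LFS file against the SHA256 from "Git LFS Details".
# The path and expected digest are taken from the last entry above.
import hashlib

def sha256_of(path: str) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 16), b""):
            digest.update(chunk)
    return digest.hexdigest()

expected = "49ab8af8d2e3d2fb671b375a830808eb92a84e0faef35d2844f8eed62bd6acf5"
assert sha256_of("results-vision-CoT/gemini-pro-vision-CoT.png") == expected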