taesiri committed on
Commit
3427ab9
•
1 Parent(s): d02e6ef
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +50 -0
  2. app.py +106 -124
  3. results/CodeLlama-70b-Instruct-hf.jpg → heatmaps/CodeLlama-70b-Instruct-hf_CoT.jpg +2 -2
  4. results/GPT-4-0125-preview.png → heatmaps/CodeLlama-70b-Instruct-hf_Textonly.jpg +2 -2
  5. results/CodeLlama-70b-Instruct-hf.png → heatmaps/Llama-2-70b-chat-hf_CoT.jpg +2 -2
  6. results/GPT-4-0125-preview.jpg → heatmaps/Llama-2-70b-chat-hf_Textonly.jpg +2 -2
  7. heatmaps/Llama-3-70b-chat-hf_CoT.jpg +3 -0
  8. heatmaps/Llama-3-70b-chat-hf_Textonly.jpg +3 -0
  9. heatmaps/Mistral-7B-Instruct-v0.2_CoT.jpg +3 -0
  10. heatmaps/Mistral-7B-Instruct-v0.2_Textonly.jpg +3 -0
  11. heatmaps/Mixtral-8x7B-Instruct-v0.1_CoT.jpg +3 -0
  12. heatmaps/Mixtral-8x7B-Instruct-v0.1_Textonly.jpg +3 -0
  13. heatmaps/Qwen1.5-72B-Chat_CoT.jpg +3 -0
  14. heatmaps/Qwen1.5-72B-Chat_Textonly.jpg +3 -0
  15. heatmaps/Yi-34B-Chat_CoT.jpg +3 -0
  16. heatmaps/Yi-34B-Chat_Textonly.jpg +3 -0
  17. heatmaps/claude-3-haiku-20240307_1shot.jpg +3 -0
  18. heatmaps/claude-3-haiku-20240307_CoT.jpg +3 -0
  19. heatmaps/claude-3-haiku-20240307_Textonly.jpg +3 -0
  20. heatmaps/claude-3-haiku-20240307_vision-CoT.jpg +3 -0
  21. heatmaps/claude-3-haiku-20240307_vision.jpg +3 -0
  22. heatmaps/claude-3-opus-20240229_CoT.jpg +3 -0
  23. heatmaps/claude-3-opus-20240229_Textonly.jpg +3 -0
  24. heatmaps/claude-3-opus-20240229_vision-CoT.jpg +3 -0
  25. heatmaps/claude-3-opus-20240229_vision.jpg +3 -0
  26. heatmaps/claude-3-sonnet-20240229_CoT.jpg +3 -0
  27. heatmaps/claude-3-sonnet-20240229_Textonly.jpg +3 -0
  28. heatmaps/claude-3-sonnet-20240229_vision-CoT.jpg +3 -0
  29. heatmaps/claude-3-sonnet-20240229_vision.jpg +3 -0
  30. heatmaps/dbrx-instruct_CoT.jpg +3 -0
  31. heatmaps/dbrx-instruct_Textonly.jpg +3 -0
  32. heatmaps/deepseek-llm-67b-chat_CoT.jpg +3 -0
  33. heatmaps/deepseek-llm-67b-chat_Textonly.jpg +3 -0
  34. heatmaps/gemini-pro_CoT.jpg +3 -0
  35. heatmaps/gemini-pro_vision-CoT.jpg +3 -0
  36. heatmaps/gemini-pro_vision.jpg +3 -0
  37. heatmaps/gemma-7b-it_CoT.jpg +3 -0
  38. heatmaps/gemma-7b-it_Textonly.jpg +3 -0
  39. heatmaps/gpt-3.5-0613_CoT.jpg +3 -0
  40. heatmaps/gpt-3.5-0613_Textonly.jpg +3 -0
  41. heatmaps/gpt-3.5-turbo-0125_1shot.jpg +3 -0
  42. heatmaps/gpt-3.5-turbo-0125_CoT.jpg +3 -0
  43. heatmaps/gpt-3.5-turbo-0125_Textonly.jpg +3 -0
  44. heatmaps/gpt-4-0125-preview_CoT.jpg +3 -0
  45. heatmaps/gpt-4-0125-preview_Textonly.jpg +3 -0
  46. heatmaps/gpt-4-1106_CoT.jpg +3 -0
  47. heatmaps/gpt-4-1106_Textonly.jpg +3 -0
  48. heatmaps/gpt-4-turbo-2024-04-09_CoT.jpg +3 -0
  49. results/gpt-4-turbo-2024-04-09.jpg → heatmaps/gpt-4-turbo-2024-04-09_Textonly.jpg +0 -0
  50. heatmaps/gpt-4-vision-preview_vision-CoT.jpg +3 -0
.gitattributes CHANGED
@@ -284,3 +284,53 @@ results/gpt-4-1106.pkl filter=lfs diff=lfs merge=lfs -text
284
  results/Llama-3-70b-chat-hf.pkl filter=lfs diff=lfs merge=lfs -text
285
  results/dbrx-instruct.pkl filter=lfs diff=lfs merge=lfs -text
286
  results/gpt-3.5-0613.pkl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  results/Llama-3-70b-chat-hf.pkl filter=lfs diff=lfs merge=lfs -text
285
  results/dbrx-instruct.pkl filter=lfs diff=lfs merge=lfs -text
286
  results/gpt-3.5-0613.pkl filter=lfs diff=lfs merge=lfs -text
287
+ final_df.pkl filter=lfs diff=lfs merge=lfs -text
288
+ heatmaps/claude-3-haiku-20240307_vision.jpg filter=lfs diff=lfs merge=lfs -text
289
+ heatmaps/Mixtral-8x7B-Instruct-v0.1_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
290
+ heatmaps/claude-3-haiku-20240307_vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
291
+ heatmaps/gpt-4-1106_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
292
+ heatmaps/gpt-4-vision-preview_vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
293
+ heatmaps/dbrx-instruct_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
294
+ heatmaps/Llama-2-70b-chat-hf_CoT.jpg filter=lfs diff=lfs merge=lfs -text
295
+ heatmaps/Llama-3-70b-chat-hf_CoT.jpg filter=lfs diff=lfs merge=lfs -text
296
+ heatmaps/Llama-3-70b-chat-hf_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
297
+ heatmaps/Mistral-7B-Instruct-v0.2_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
298
+ heatmaps/claude-3-sonnet-20240229_vision.jpg filter=lfs diff=lfs merge=lfs -text
299
+ heatmaps/CodeLlama-70b-Instruct-hf_CoT.jpg filter=lfs diff=lfs merge=lfs -text
300
+ heatmaps/claude-3-haiku-20240307_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
301
+ heatmaps/gemma-7b-it_CoT.jpg filter=lfs diff=lfs merge=lfs -text
302
+ heatmaps/gpt-4-0125-preview_CoT.jpg filter=lfs diff=lfs merge=lfs -text
303
+ heatmaps/claude-3-haiku-20240307_CoT.jpg filter=lfs diff=lfs merge=lfs -text
304
+ heatmaps/Mistral-7B-Instruct-v0.2_CoT.jpg filter=lfs diff=lfs merge=lfs -text
305
+ heatmaps/Qwen1.5-72B-Chat_CoT.jpg filter=lfs diff=lfs merge=lfs -text
306
+ heatmaps/gpt-3.5-turbo-0125_1shot.jpg filter=lfs diff=lfs merge=lfs -text
307
+ heatmaps/claude-3-sonnet-20240229_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
308
+ heatmaps/Mixtral-8x7B-Instruct-v0.1_CoT.jpg filter=lfs diff=lfs merge=lfs -text
309
+ heatmaps/gemma-7b-it_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
310
+ heatmaps/gpt-4-0125-preview_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
311
+ heatmaps/gpt-4-turbo-2024-04-09_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
312
+ heatmaps/claude-3-haiku-20240307_1shot.jpg filter=lfs diff=lfs merge=lfs -text
313
+ heatmaps/claude-3-opus-20240229_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
314
+ heatmaps/deepseek-llm-67b-chat_CoT.jpg filter=lfs diff=lfs merge=lfs -text
315
+ heatmaps/claude-3-opus-20240229_vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
316
+ heatmaps/gemini-pro_vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
317
+ heatmaps/gpt-3.5-0613_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
318
+ heatmaps/gpt-4-vision-preview_vision.jpg filter=lfs diff=lfs merge=lfs -text
319
+ heatmaps/gemini-pro_CoT.jpg filter=lfs diff=lfs merge=lfs -text
320
+ heatmaps/CodeLlama-70b-Instruct-hf_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
321
+ heatmaps/Qwen1.5-72B-Chat_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
322
+ heatmaps/Yi-34B-Chat_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
323
+ heatmaps/claude-3-sonnet-20240229_vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
324
+ heatmaps/dbrx-instruct_CoT.jpg filter=lfs diff=lfs merge=lfs -text
325
+ heatmaps/gemini-pro_vision.jpg filter=lfs diff=lfs merge=lfs -text
326
+ heatmaps/gpt-3.5-0613_CoT.jpg filter=lfs diff=lfs merge=lfs -text
327
+ heatmaps/gpt-4-1106_CoT.jpg filter=lfs diff=lfs merge=lfs -text
328
+ heatmaps/gpt-4-turbo-2024-04-09_CoT.jpg filter=lfs diff=lfs merge=lfs -text
329
+ heatmaps/Llama-2-70b-chat-hf_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
330
+ heatmaps/claude-3-opus-20240229_CoT.jpg filter=lfs diff=lfs merge=lfs -text
331
+ heatmaps/gpt-3.5-turbo-0125_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
332
+ heatmaps/gpt-3.5-turbo-0125_CoT.jpg filter=lfs diff=lfs merge=lfs -text
333
+ heatmaps/Yi-34B-Chat_CoT.jpg filter=lfs diff=lfs merge=lfs -text
334
+ heatmaps/claude-3-opus-20240229_vision.jpg filter=lfs diff=lfs merge=lfs -text
335
+ heatmaps/claude-3-sonnet-20240229_CoT.jpg filter=lfs diff=lfs merge=lfs -text
336
+ heatmaps/deepseek-llm-67b-chat_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -7,27 +7,50 @@ import pandas as pd
7
  import seaborn as sns
8
  from matplotlib.colors import BoundaryNorm, ListedColormap
9
 
10
- all_results = pd.read_pickle("all_results.pkl")
11
 
12
 
13
- def get_accuracy_dataframe(df):
14
  # Calculate overall model accuracy
15
- df['parsed_judge_response'] = df['parsed_judge_response'].astype(float)
16
- model_accuracy = df.groupby('model_name')['parsed_judge_response'].mean().reset_index()
17
-
 
 
18
  # Calculate model accuracy per difficulty level
19
- df['difficulty_level'] = df['difficulty_level'].astype(int)
20
- model_accuracy_per_level = df.groupby(['model_name', 'difficulty_level'])['parsed_judge_response'].mean().reset_index()
21
- model_accuracy_per_level_df = model_accuracy_per_level.pivot(index='model_name', columns='difficulty_level', values='parsed_judge_response')
22
-
 
 
 
 
 
 
23
  # Merge overall accuracy and level-based accuracy into a single DataFrame
24
- model_accuracy_df = model_accuracy.merge(model_accuracy_per_level_df, on='model_name')
25
- model_accuracy_df.rename(columns={1: 'level_1', 2: 'level_2', 3: 'level_3', 4: 'level_4', 5: 'level_5'}, inplace=True)
26
- model_accuracy_df.rename(columns={'parsed_judge_response': 'Accuracy'}, inplace=True)
27
-
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  # Multiply by 100 and format to one decimal point
29
- model_accuracy_df = model_accuracy_df.applymap(lambda x: round(x * 100, 1) if isinstance(x, float) else x)
30
-
 
 
31
  # Add headers with icons
32
  model_accuracy_df.columns = [
33
  "πŸ€– Model Name",
@@ -40,13 +63,15 @@ def get_accuracy_dataframe(df):
40
 
41
  model_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
42
 
43
- # Add a new column at the beginning for the rank
44
- model_accuracy_df.insert(0, '#', range(1, len(model_accuracy_df) + 1))
45
-
46
  return model_accuracy_df
47
 
48
 
49
- accuracy_df = get_accuracy_dataframe(all_results)
 
 
 
 
 
50
 
51
 
52
  # Define the column names with icons
@@ -68,126 +93,83 @@ column_names = [
68
  "Level 4 Accuracy",
69
  ]
70
 
71
- def load_heatmap(evt: gr.SelectData):
72
- heatmap_image = gr.Image(f"results/{evt.value}.jpg")
 
 
73
  return heatmap_image
74
 
75
 
 
 
 
76
 
77
- # # Function to process data
78
- # def process_data(data):
79
- # data_for_df = []
80
- # for file, df in data.items():
81
- # overall_accuracy = round(calculate_accuracy(df), 2)
82
- # breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
83
- # model_name = file.split("/")[-1].replace(".pkl", "")
84
- # data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
85
- # return data_for_df
86
 
 
 
 
87
 
88
- # # Function to finalize DataFrame
89
- # def finalize_df(df):
90
- # df = df.round(1) # Round to one decimal place
91
- # df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
92
- # df.columns = headers_with_icons
93
- # df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
94
- # # add a new column with the order (index)
95
- # df["#"] = range(1, len(df) + 1)
96
- # # bring rank to the first column
97
- # cols = df.columns.tolist()
98
- # cols = cols[-1:] + cols[:-1]
99
- # df = df[cols]
100
 
101
- # return df
 
 
102
 
103
 
104
- def load_heatmap(evt: gr.SelectData):
105
- heatmap_image = gr.Image(f"results/{evt.value}.jpg")
106
  return heatmap_image
107
 
108
 
 
 
109
  with gr.Blocks() as demo:
110
  gr.Markdown("# FSM Benchmark Leaderboard")
 
 
111
  with gr.Tab("Text-only Benchmark"):
112
- leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  gr.Markdown("## Heatmap")
114
- heatmap_image_qwen = gr.Image(label="", show_label=False)
115
- leader_board.select(fn=load_heatmap, outputs=[heatmap_image_qwen])
116
-
117
- # with gr.Tab("Vision Benchmark", visible=False):
118
- # gr.Markdown("# Vision Benchmark Leaderboard")
119
- # leader_board_vision = gr.Dataframe(
120
- # vision_accuracy_df, headers=headers_with_icons
121
- # )
122
- # gr.Markdown("## Heatmap")
123
- # heatmap_image_vision = gr.Image(label="", show_label=False)
124
- # leader_board_vision.select(
125
- # fn=load_vision_heatmap, outputs=[heatmap_image_vision]
126
- # )
127
-
128
- # with gr.Tab("Text-only Benchmark (CoT)", visible=False):
129
- # gr.Markdown("# Text-only Leaderboard (CoT)")
130
- # cot_leader_board_text = gr.Dataframe(
131
- # cot_text_accuracy_df, headers=headers_with_icons
132
- # )
133
- # gr.Markdown("## Heatmap")
134
- # cot_heatmap_image_text = gr.Image(label="", show_label=False)
135
- # cot_leader_board_text.select(
136
- # fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
137
- # )
138
-
139
- # with gr.Tab("Constraint Text-only Results (CoT)", visible=False):
140
- # gr.Markdown("## Constraint Text-only Leaderboard by first substrin (CoT)")
141
- # included_models_cot = gr.CheckboxGroup(
142
- # label="Models to include",
143
- # choices=all_cot_text_only_models,
144
- # value=all_cot_text_only_models,
145
- # interactive=True,
146
- # )
147
- # with gr.Row():
148
- # number_of_queries_cot = gr.Textbox(label="Number of included queries")
149
- # number_of_fsms_cot = gr.Textbox(label="Number of included FSMs")
150
-
151
- # constrained_leader_board_text_cot = gr.Dataframe()
152
- # constrained_leader_board_plot_cot = gr.Plot()
153
-
154
- # with gr.Tab("Majority Vote (Subset 1)", visible=False):
155
- # gr.Markdown("## Majority Vote (Subset 1)")
156
- # intersection_leader_board = gr.Dataframe(
157
- # intersection_df_acc, headers=headers_with_icons
158
- # )
159
- # heatmap_image = gr.Plot(label="Model Heatmap")
160
-
161
- # with gr.Tab("Text-only Benchmark (deprecated)", visible=False):
162
- # gr.Markdown("# Text-only Leaderboard")
163
- # leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
164
- # gr.Markdown("## Heatmap")
165
- # heatmap_image = gr.Image(label="", show_label=False)
166
- # leader_board.select(fn=load_heatmap, outputs=[heatmap_image])
167
-
168
- # # ============ Callbacks ============
169
-
170
- # included_models_cot.select(
171
- # fn=calculate_order_by_first_substring_cot,
172
- # inputs=[included_models_cot],
173
- # outputs=[
174
- # constrained_leader_board_text_cot,
175
- # number_of_queries_cot,
176
- # number_of_fsms_cot,
177
- # ],
178
- # queue=True,
179
- # )
180
-
181
- # constrained_leader_board_text.select(
182
- # fn=show_constraint_heatmap, outputs=[constrained_leader_board_plot]
183
- # )
184
-
185
- # constrained_leader_board_text_cot.select(
186
- # fn=show_constraint_heatmap_cot, outputs=[constrained_leader_board_plot_cot]
187
- # )
188
-
189
- # intersection_leader_board.select(
190
- # fn=show_intersection_heatmap, outputs=[heatmap_image]
191
- # )
192
 
193
  demo.launch()
 
7
  import seaborn as sns
8
  from matplotlib.colors import BoundaryNorm, ListedColormap
9
 
10
+ all_results = pd.read_pickle("final_df.pkl")
11
 
12
 
13
+ def get_accuracy_dataframe(df_mother, category):
14
  # Calculate overall model accuracy
15
+ # filter for category only
16
+ df = df_mother[df_mother["category"] == category].copy()
17
+ df["is_answer_correct"] = df["is_answer_correct"].astype(float)
18
+ model_accuracy = df.groupby("model")["is_answer_correct"].mean().reset_index()
19
+
20
  # Calculate model accuracy per difficulty level
21
+ df["difficulty_level"] = df["difficulty_level"].astype(int)
22
+ model_accuracy_per_level = (
23
+ df.groupby(["model", "difficulty_level"])["is_answer_correct"]
24
+ .mean()
25
+ .reset_index()
26
+ )
27
+ model_accuracy_per_level_df = model_accuracy_per_level.pivot(
28
+ index="model", columns="difficulty_level", values="is_answer_correct"
29
+ )
30
+
31
  # Merge overall accuracy and level-based accuracy into a single DataFrame
32
+ model_accuracy_df = model_accuracy.merge(model_accuracy_per_level_df, on="model")
33
+ model_accuracy_df.rename(
34
+ columns={"is_answer_correct": "Overall Accuracy"}, inplace=True
35
+ )
36
+
37
+ # Ensure all expected difficulty levels are present
38
+ expected_levels = [1, 2, 3, 4] # Adjust based on your data
39
+ for level in expected_levels:
40
+ if level not in model_accuracy_df.columns:
41
+ model_accuracy_df[
42
+ level
43
+ ] = None # Fill missing levels with None or an appropriate value
44
+
45
+ # Rename columns to include levels
46
+ level_columns = {level: f"Level {level} Accuracy" for level in expected_levels}
47
+ model_accuracy_df.rename(columns=level_columns, inplace=True)
48
+
49
  # Multiply by 100 and format to one decimal point
50
+ model_accuracy_df = model_accuracy_df.applymap(
51
+ lambda x: round(x * 100, 1) if isinstance(x, float) else x
52
+ )
53
+
54
  # Add headers with icons
55
  model_accuracy_df.columns = [
56
  "πŸ€– Model Name",
 
63
 
64
  model_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
65
 
 
 
 
66
  return model_accuracy_df
67
 
68
 
69
+ # categories = array(['1shot', 'CoT', 'Textonly', 'vision', 'vision-CoT'], dtype=object)
70
+ accuracy_df_textonly = get_accuracy_dataframe(all_results, "Textonly")
71
+ accuracy_df_cot = get_accuracy_dataframe(all_results, "CoT")
72
+ accuracy_df_vision = get_accuracy_dataframe(all_results, "vision")
73
+ accuracy_df_vision_cot = get_accuracy_dataframe(all_results, "vision-CoT")
74
+ accuracy_df_1shot = get_accuracy_dataframe(all_results, "1shot")
75
 
76
 
77
  # Define the column names with icons
 
93
  "Level 4 Accuracy",
94
  ]
95
 
96
+
97
+ def load_heatmap_textonly(evt: gr.SelectData):
98
+ print(f"./heatmaps/{evt.value}_Textonly.jpg")
99
+ heatmap_image = gr.Image(f"./heatmaps/{evt.value}_Textonly.jpg")
100
  return heatmap_image
101
 
102
 
103
+ def load_heatmap_cot(evt: gr.SelectData):
104
+ heatmap_image = gr.Image(f"./heatmaps/{evt.value}_CoT.jpg")
105
+ return heatmap_image
106
 
 
 
 
 
 
 
 
 
 
107
 
108
+ def load_heatmap_vision(evt: gr.SelectData):
109
+ heatmap_image = gr.Image(f"./heatmaps/{evt.value}_vision.jpg")
110
+ return heatmap_image
111
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
+ def load_heatmap_vision_cot(evt: gr.SelectData):
114
+ heatmap_image = gr.Image(f"./heatmaps/{evt.value}_vision-CoT.jpg")
115
+ return heatmap_image
116
 
117
 
118
+ def load_heatmap_1shot(evt: gr.SelectData):
119
+ heatmap_image = gr.Image(f"./heatmaps/{evt.value}_1shot.jpg")
120
  return heatmap_image
121
 
122
 
123
+ # Then, use these functions in the corresponding select method calls:
124
+
125
  with gr.Blocks() as demo:
126
  gr.Markdown("# FSM Benchmark Leaderboard")
127
+
128
+ # Text-only Benchmark
129
  with gr.Tab("Text-only Benchmark"):
130
+ leader_board_textonly = gr.Dataframe(
131
+ accuracy_df_textonly, headers=headers_with_icons
132
+ )
133
+ gr.Markdown("## Heatmap")
134
+ heatmap_image_textonly = gr.Image(label="", show_label=False)
135
+ leader_board_textonly.select(
136
+ fn=load_heatmap_textonly, outputs=[heatmap_image_textonly]
137
+ )
138
+
139
+ # CoT Benchmark
140
+ with gr.Tab("CoT Benchmark"):
141
+ leader_board_cot = gr.Dataframe(accuracy_df_cot, headers=headers_with_icons)
142
+ gr.Markdown("## Heatmap")
143
+ heatmap_image_cot = gr.Image(label="", show_label=False)
144
+ leader_board_cot.select(fn=load_heatmap_cot, outputs=[heatmap_image_cot])
145
+
146
+ # Vision Benchmark
147
+ with gr.Tab("Vision Benchmark"):
148
+ leader_board_vision = gr.Dataframe(
149
+ accuracy_df_vision, headers=headers_with_icons
150
+ )
151
+ gr.Markdown("## Heatmap")
152
+ heatmap_image_vision = gr.Image(label="", show_label=False)
153
+ leader_board_vision.select(
154
+ fn=load_heatmap_vision, outputs=[heatmap_image_vision]
155
+ )
156
+
157
+ # Vision-CoT Benchmark
158
+ with gr.Tab("Vision-CoT Benchmark"):
159
+ leader_board_vision_cot = gr.Dataframe(
160
+ accuracy_df_vision_cot, headers=headers_with_icons
161
+ )
162
+ gr.Markdown("## Heatmap")
163
+ heatmap_image_vision_cot = gr.Image(label="", show_label=False)
164
+ leader_board_vision_cot.select(
165
+ fn=load_heatmap_vision_cot, outputs=[heatmap_image_vision_cot]
166
+ )
167
+
168
+ # 1shot Benchmark
169
+ with gr.Tab("1shot Benchmark"):
170
+ leader_board_1shot = gr.Dataframe(accuracy_df_1shot, headers=headers_with_icons)
171
  gr.Markdown("## Heatmap")
172
+ heatmap_image_1shot = gr.Image(label="", show_label=False)
173
+ leader_board_1shot.select(fn=load_heatmap_1shot, outputs=[heatmap_image_1shot])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
  demo.launch()
results/CodeLlama-70b-Instruct-hf.jpg → heatmaps/CodeLlama-70b-Instruct-hf_CoT.jpg RENAMED
File without changes
results/GPT-4-0125-preview.png → heatmaps/CodeLlama-70b-Instruct-hf_Textonly.jpg RENAMED
File without changes
results/CodeLlama-70b-Instruct-hf.png → heatmaps/Llama-2-70b-chat-hf_CoT.jpg RENAMED
File without changes
results/GPT-4-0125-preview.jpg → heatmaps/Llama-2-70b-chat-hf_Textonly.jpg RENAMED
File without changes
heatmaps/Llama-3-70b-chat-hf_CoT.jpg ADDED

Git LFS Details

  • SHA256: 45e4e0dbdb6ecf372246158de9708088ce189d420c3cb7e8e101565802209833
  • Pointer size: 132 Bytes
  • Size of remote file: 1.28 MB
heatmaps/Llama-3-70b-chat-hf_Textonly.jpg ADDED

Git LFS Details

  • SHA256: 255d9bb5c182e5947463e7cec5bea78d8aef3b038bb74fa400aaa2c2d7cbe02a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.29 MB
heatmaps/Mistral-7B-Instruct-v0.2_CoT.jpg ADDED

Git LFS Details

  • SHA256: a8cedc95db3dfedb187d1c690cb5d78a08547f083e276cf3c4946a9dab8d2fb9
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/Mistral-7B-Instruct-v0.2_Textonly.jpg ADDED

Git LFS Details

  • SHA256: 8cfc569d1baca9826b010eeb90af43b0077ea533d7fc8c1ae494671b255cdf1f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/Mixtral-8x7B-Instruct-v0.1_CoT.jpg ADDED

Git LFS Details

  • SHA256: 7fd1c72aafdf96102115d66d1ef4e8941cd3e75007d4769f253913d92fd3ee11
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/Mixtral-8x7B-Instruct-v0.1_Textonly.jpg ADDED

Git LFS Details

  • SHA256: 7433170eae759affb90bd714dcc655831503d483c82f9878bd3d5607a4448476
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/Qwen1.5-72B-Chat_CoT.jpg ADDED

Git LFS Details

  • SHA256: 6b607383ae11aa64e0058dbfbc294c1366054ac3bd2910d7803de604ee75dcf5
  • Pointer size: 132 Bytes
  • Size of remote file: 1.31 MB
heatmaps/Qwen1.5-72B-Chat_Textonly.jpg ADDED

Git LFS Details

  • SHA256: ec21974c9b3987b565a2e152af9c2b33b1b15d054389a3ab8ca4635d67830c8c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.31 MB
heatmaps/Yi-34B-Chat_CoT.jpg ADDED

Git LFS Details

  • SHA256: e486fb5d66e68c02cfefe02527ce9c9768742c3aefa98f218a4f45fa5f390bd2
  • Pointer size: 132 Bytes
  • Size of remote file: 1.33 MB
heatmaps/Yi-34B-Chat_Textonly.jpg ADDED

Git LFS Details

  • SHA256: 277071c8daf563feff3de2f0ce819bc56a3d2d2ba87d7862ff7684f1abd02bdb
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/claude-3-haiku-20240307_1shot.jpg ADDED

Git LFS Details

  • SHA256: ba1d76eeccd60fab64bdba46234b5445bc8e827765c78f86cf1f7139cf5ac392
  • Pointer size: 132 Bytes
  • Size of remote file: 1.3 MB
heatmaps/claude-3-haiku-20240307_CoT.jpg ADDED

Git LFS Details

  • SHA256: c30a5e503b394d209b6b626efec0f8240061b2aa2f84043d2172179127cb1021
  • Pointer size: 132 Bytes
  • Size of remote file: 1.3 MB
heatmaps/claude-3-haiku-20240307_Textonly.jpg ADDED

Git LFS Details

  • SHA256: 11ceaabc9a0e37b39f419dbdc06d9db85154084ed752c857e4d6b66daa829741
  • Pointer size: 132 Bytes
  • Size of remote file: 1.3 MB
heatmaps/claude-3-haiku-20240307_vision-CoT.jpg ADDED

Git LFS Details

  • SHA256: c03395379bdfaf4f2a199a0b3b13b297d78a2e1a70dd0c466c155b2d1d76d5d0
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/claude-3-haiku-20240307_vision.jpg ADDED

Git LFS Details

  • SHA256: 3f1237260665f554651e781141211f0a713167f68e61a6683ad652cc1f00020c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/claude-3-opus-20240229_CoT.jpg ADDED

Git LFS Details

  • SHA256: 382fa0ff6881fcc78056d4025343f521d5e047c370e6b4d20cbbb6ef140a1e46
  • Pointer size: 132 Bytes
  • Size of remote file: 1.21 MB
heatmaps/claude-3-opus-20240229_Textonly.jpg ADDED

Git LFS Details

  • SHA256: 576298ca335a6da82c6c7c4144639a4a92c2b902674430fab1eb38d91d93e5bb
  • Pointer size: 132 Bytes
  • Size of remote file: 1.21 MB
heatmaps/claude-3-opus-20240229_vision-CoT.jpg ADDED

Git LFS Details

  • SHA256: 1a6ac07ac9337a62d6b0152e6b3582405f5ded6197ec31e0370952f2df38c3be
  • Pointer size: 132 Bytes
  • Size of remote file: 1.33 MB
heatmaps/claude-3-opus-20240229_vision.jpg ADDED

Git LFS Details

  • SHA256: 5056abaef945199d6ef4e3833ae20ca763c80aff44ca59d991567637dee4d237
  • Pointer size: 132 Bytes
  • Size of remote file: 1.33 MB
heatmaps/claude-3-sonnet-20240229_CoT.jpg ADDED

Git LFS Details

  • SHA256: 97f3fcc436adf32351392db6fe0b36969554ecfef9f5a6be87d6cb73b59d3840
  • Pointer size: 132 Bytes
  • Size of remote file: 1.33 MB
heatmaps/claude-3-sonnet-20240229_Textonly.jpg ADDED

Git LFS Details

  • SHA256: 6e3d02435e530cf058e48fd6262d23131656aec109f09c64349fac30338a4988
  • Pointer size: 132 Bytes
  • Size of remote file: 1.29 MB
heatmaps/claude-3-sonnet-20240229_vision-CoT.jpg ADDED

Git LFS Details

  • SHA256: f63f154b6c312cd298c32cf805a00118922dd8fd7da1169a8b64d415740a1ebc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.33 MB
heatmaps/claude-3-sonnet-20240229_vision.jpg ADDED

Git LFS Details

  • SHA256: ea55e964a1547d4462364c8f41ecbaf870af6cb151589bc45af03bbcc171afad
  • Pointer size: 132 Bytes
  • Size of remote file: 1.28 MB
heatmaps/dbrx-instruct_CoT.jpg ADDED

Git LFS Details

  • SHA256: 8720a093a238c8312629c46c0b54e62aa1a95059dce9ec5b269508450acd21a6
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/dbrx-instruct_Textonly.jpg ADDED

Git LFS Details

  • SHA256: 208c1caeaa0c862c0cadb4a6f017906f1e1925383b09d6b9929732a71618984e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/deepseek-llm-67b-chat_CoT.jpg ADDED

Git LFS Details

  • SHA256: 0299b8c3d112641c428a5c77d368b92a8560908221b214a63b91a8f2ce97a070
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/deepseek-llm-67b-chat_Textonly.jpg ADDED

Git LFS Details

  • SHA256: a59e661ad07580f7bd280be46e26caeec50fa082920bd4e0e7ad5be1653fb116
  • Pointer size: 132 Bytes
  • Size of remote file: 1.33 MB
heatmaps/gemini-pro_CoT.jpg ADDED

Git LFS Details

  • SHA256: e73ac717609221026e584cbf2ba76538d6091459eccb63763142faa8cd233a82
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/gemini-pro_vision-CoT.jpg ADDED

Git LFS Details

  • SHA256: 116b2a07fb740560e59471347f4c6da6fbb251ef0a6cda7f4c3cd9cf8ef2beac
  • Pointer size: 132 Bytes
  • Size of remote file: 1.33 MB
heatmaps/gemini-pro_vision.jpg ADDED

Git LFS Details

  • SHA256: 36cc139756d58b144d03cffdd23df7a07a71f1a3b77d8544b35d24afbece3a10
  • Pointer size: 132 Bytes
  • Size of remote file: 1.33 MB
heatmaps/gemma-7b-it_CoT.jpg ADDED

Git LFS Details

  • SHA256: a7abc637ee1f0206737eeecef445c785c8de7d7a8830e1a4dd3934453c497dc1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.34 MB
heatmaps/gemma-7b-it_Textonly.jpg ADDED

Git LFS Details

  • SHA256: e436e4d353b0de47a639aaf10fe1aa61ea59e9020b83924a6f38f023177c1631
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/gpt-3.5-0613_CoT.jpg ADDED

Git LFS Details

  • SHA256: e36f991250d11adf955df905756e4abb8313fb2f46a06bb6e4fb1c1f117dd332
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/gpt-3.5-0613_Textonly.jpg ADDED

Git LFS Details

  • SHA256: 57cc450248c9ce976f6eb76d148c75ff1087499a585c2dd12fd8c3c27e1f1ce2
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/gpt-3.5-turbo-0125_1shot.jpg ADDED

Git LFS Details

  • SHA256: 45770b4c8e836e17cccb2ad73ed7894802f070fbfb760c166881a9ada6eccaa5
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/gpt-3.5-turbo-0125_CoT.jpg ADDED

Git LFS Details

  • SHA256: 8ebed5226148f5222839dd16aec869786bf608e66320a77eadaaa2633808189c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/gpt-3.5-turbo-0125_Textonly.jpg ADDED

Git LFS Details

  • SHA256: 13f8baca85b9833cad9236fc7b94e0f645f5d9979d200eb49b086b2219ac7b7e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
heatmaps/gpt-4-0125-preview_CoT.jpg ADDED

Git LFS Details

  • SHA256: fa4af344d9bdf4aac858531ac565bc09235774629e0f4a6485f13b01bbb98835
  • Pointer size: 132 Bytes
  • Size of remote file: 1.23 MB
heatmaps/gpt-4-0125-preview_Textonly.jpg ADDED

Git LFS Details

  • SHA256: d339d6e1527fa18f390ca34ab0ba833a9a187109396c638f2e4ec1c16be11f3a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.24 MB
heatmaps/gpt-4-1106_CoT.jpg ADDED

Git LFS Details

  • SHA256: ce05c6404989f6ec8653bfd7fe9839c536cdce24ef75f6cd1fdb4800eba766c3
  • Pointer size: 132 Bytes
  • Size of remote file: 1.23 MB
heatmaps/gpt-4-1106_Textonly.jpg ADDED

Git LFS Details

  • SHA256: ddf8c95dc35b8277b7a2b5e9a08e366f0152af08ee78c398f12bc083a29adb94
  • Pointer size: 132 Bytes
  • Size of remote file: 1.24 MB
heatmaps/gpt-4-turbo-2024-04-09_CoT.jpg ADDED

Git LFS Details

  • SHA256: 7b3e95c1c0459f483504cf4c34034ddb3a68bac4cbab158627af90034686b6d3
  • Pointer size: 132 Bytes
  • Size of remote file: 1.22 MB
results/gpt-4-turbo-2024-04-09.jpg → heatmaps/gpt-4-turbo-2024-04-09_Textonly.jpg RENAMED
File without changes
heatmaps/gpt-4-vision-preview_vision-CoT.jpg ADDED

Git LFS Details

  • SHA256: 910c2d4820c49249d143e390f500aa149f68ba00a56c3fcb0c61b2485e93d3ab
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB