taesiri committed on
Commit
c1f8e27
1 Parent(s): 16b6bb4
Files changed (1)
  1. app.py +84 -0
app.py CHANGED
 
@@ -2,6 +2,7 @@ import gradio as gr
import pandas as pd
from glob import glob

+
# Load text benchmark results
csv_results = glob("results/*.pkl")
# Load vision benchmark results
 
@@ -11,6 +12,35 @@ cot_text_results = glob("results-cot/*.pkl")
# Load CoT vision benchmark results
cot_vision_results = glob("results-vision-CoT/*.pkl")

+ # Function to load data and add the model type and name
+ def load_data(files, model_type):
+     data = []
+     for file in files:
+         df = pd.read_pickle(file)
+         df["Model Type"] = model_type
+         df["Model Name"] = file.split("/")[-1].replace(".pkl", "")
+         data.append(df)
+     return pd.concat(data, ignore_index=True)
+
+
+ # Load and label all data
+ data = load_data(csv_results, "Text Only")
+ vision_data = load_data(vision_results, "Vision")
+ cot_text_data = load_data(cot_text_results, "CoT Text Only")
+ cot_vision_data = load_data(cot_vision_results, "CoT Vision")
+
+ # Combine all data into a single DataFrame
+ all_data = pd.concat(
+     [data, vision_data, cot_text_data, cot_vision_data], ignore_index=True
+ )
+
+ all_model_names = all_data['Model Name'].unique()
+ all_text_only_model_names = list(all_data[all_data['Model Type'] == 'Text Only']['Model Name'].unique())
+ print(all_text_only_model_names)
+
+ ## Continue with the old code --
+ # TODO: Update me to read from all_data later
+
# Load the csv files into a dict with keys being name of the file and values being the data
data = {file: pd.read_pickle(file) for file in csv_results}
# Load the vision files into a dict
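For reference, a minimal sketch of what the new load_data helper yields for a single file; the path results/gpt-4.pkl and the pickle's contents are hypothetical:

# Hypothetical usage of the load_data helper above; each pickle is
# assumed to hold a per-query judge-results DataFrame.
df = load_data(["results/gpt-4.pkl"], "Text Only")
# df keeps the pickle's original columns and gains two labels:
#   df["Model Type"] == "Text Only"
#   df["Model Name"] == "gpt-4"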
 
@@ -88,6 +118,8 @@ cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)


+
+
def load_heatmap(evt: gr.SelectData):
    heatmap_image = gr.Image(f"results/{evt.value}.jpg")
    return heatmap_image
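A note on the heatmap loaders: gr.SelectData supplies the clicked cell's text as evt.value, which is used as the image stem under results/. The actual wiring appears in the Blocks section below; as a sketch with hypothetical variable names:

# Hypothetical wiring: clicking a leaderboard cell fires the select
# event, and evt.value (the cell text) picks the heatmap image.
leader_board.select(fn=load_heatmap, outputs=[heatmap_image])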
 
@@ -108,6 +140,48 @@ def load_cot_vision_heatmap(evt: gr.SelectData):
    return heatmap_image


+ def calculate_order_by_first_substring(selected_models):
+
+     first_columns = all_data[all_data['substring_index'] == 1]
+     query_ids_df = first_columns[first_columns['Model Type'] == 'Text Only']
+
+
+     # Filter to include only the selected models
+     query_ids_df = query_ids_df[query_ids_df['Model Name'].isin(selected_models)]
+
+     print(len(query_ids_df))
+
+     query_ids_df = query_ids_df.groupby('query_id').filter(lambda x: x['parsed_judge_response'].eq(1).all())
+
+     print(len(query_ids_df))
+
+     query_ids = query_ids_df.query_id.unique()
+     # print('query_ids', len(query_ids))
+
+     # Collect the fsm_ids solved by every selected model on the first substring
+     fsm_ids = query_ids_df.fsm_id.unique()
+     print('fsm_ids', len(fsm_ids), "FSMs are solvable by every selected model on the first substring")
+
+
+     # Now filter all_data down to these fsm_ids (text only), then calculate each model's accuracy from parsed_judge_response
+
+     text_only = all_data[all_data['Model Type'] == 'Text Only']
+     text_only_filtered = text_only[text_only['fsm_id'].isin(fsm_ids)]
+     # Print the number of query_ids left in text_only_filtered
+     print(f"Number of query_ids from text_only_filtered: {len(text_only_filtered.query_id.unique())}")
+
+     text_only_filtered = text_only_filtered.groupby(['Model Name'])['parsed_judge_response'].mean().reset_index()
+     text_only_filtered['Accuracy'] = text_only_filtered['parsed_judge_response'] * 100
+     text_only_filtered.drop('parsed_judge_response', axis=1, inplace=True)
+     text_only_filtered = text_only_filtered.sort_values('Accuracy', ascending=False)
+
+     # round to two decimal places
+     text_only_filtered['Accuracy'] = text_only_filtered['Accuracy'].apply(lambda x: round(x, 2))
+
+     return text_only_filtered
+
+
+
with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")
    with gr.Tab("Text-only Benchmark"):
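The groupby/filter step in calculate_order_by_first_substring keeps a query_id only when every selected model's row for it has parsed_judge_response == 1. An equivalent formulation, assuming parsed_judge_response is strictly 0/1 with no missing values (a sketch, not part of this commit):

# A query_id survives only if the minimum judge response across the
# selected models is 1, i.e. every model solved its first substring.
solved = query_ids_df.groupby('query_id')['parsed_judge_response'].transform('min').eq(1)
query_ids_df = query_ids_df[solved]

transform('min') broadcasts each group's minimum back to its rows, avoiding the per-group Python lambda, which is typically faster on large frames.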
 
@@ -150,4 +224,14 @@ with gr.Blocks() as demo:
            fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision]
        )

+     with gr.Tab("Constraint Text-only Results"):
+         gr.Markdown("## Constraint Text-only Leaderboard by first substring")
+         included_models = gr.CheckboxGroup(
+             label="Models to include", choices=all_text_only_model_names, value=all_text_only_model_names
+         )
+         constrained_leader_board_text = gr.Dataframe()
+
+
+         included_models.input(fn=calculate_order_by_first_substring, inputs=[included_models], outputs=[constrained_leader_board_text])
+
demo.launch()
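As wired, the constrained leaderboard only refreshes on checkbox input, so the new tab's table starts empty until the user changes the selection. A hypothetical addition (not in this commit) that would populate it on page load, placed inside the with gr.Blocks() block:

# Hypothetical: render the constrained leaderboard once at startup.
demo.load(
    fn=calculate_order_by_first_substring,
    inputs=[included_models],
    outputs=[constrained_leader_board_text],
)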