Spaces:

FSMBench
/

Leaderboard

Sleeping

App Files Files Community

taesiri committed on Apr 18

Commit

0ec0046

•

1 Parent(s): ed1f48a

update

Browse files

Files changed (2) hide show

app.py +95 -0
intersection_results/gpt-3.5-judge-by_Qwen_5times_intersection_subset_1.pkl +3 -0

app.py CHANGED Viewed

@@ -7,6 +7,13 @@ from matplotlib.colors import ListedColormap, BoundaryNorm
 from glob import glob
 import os
 # Load text benchmark results
 csv_results = glob("results/*.pkl")
 # Load vision benchmark results
@@ -16,6 +23,7 @@ cot_text_results = glob("results-cot/*.pkl")
 # Load CoT vision benchmark results
 # cot_vision_results = glob("results-vision-CoT/*.pkl")
 # Function to load data, add model type and name
 def load_data(files, model_type):
     data = []
@@ -62,6 +70,18 @@ cot_text_data = {file: pd.read_pickle(file) for file in cot_text_results}
 # cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}
 def calculate_accuracy(df):
     return df["parsed_judge_response"].mean() * 100
@@ -90,6 +110,7 @@ column_names = [
     "Level 4 Accuracy",
 ]
 # Function to process data
 def process_data(data):
     data_for_df = []
@@ -113,6 +134,7 @@ vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
 cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names)
 # cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)
 # Function to finalize DataFrame
 def finalize_df(df):
     df = df.round(1)  # Round to one decimal place
@@ -327,6 +349,63 @@ def generate_heatmap_for_specific_model_cot(model_name):
     return fig
 def show_constraint_heatmap(evt: gr.SelectData):
     model_name = evt.value
     return generate_heatmap_for_specific_model(model_name)
@@ -337,6 +416,11 @@ def show_constraint_heatmap_cot(evt: gr.SelectData):
     return generate_heatmap_for_specific_model_cot(model_name)
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
     with gr.Tab("Text-only Benchmark"):
@@ -417,6 +501,13 @@ with gr.Blocks() as demo:
         constrained_leader_board_text_cot = gr.Dataframe()
         constrained_leader_board_plot_cot = gr.Plot()
     included_models_cot.select(
         fn=calculate_order_by_first_substring_cot,
         inputs=[included_models_cot],
@@ -436,4 +527,8 @@ with gr.Blocks() as demo:
         fn=show_constraint_heatmap_cot, outputs=[constrained_leader_board_plot_cot]
     )
     demo.launch()

 from glob import glob
 import os
+import matplotlib.pyplot as plt
+import seaborn as sns
+from matplotlib.colors import ListedColormap, BoundaryNorm
+import pandas as pd
 # Load text benchmark results
 csv_results = glob("results/*.pkl")
 # Load vision benchmark results
 # Load CoT vision benchmark results
 # cot_vision_results = glob("results-vision-CoT/*.pkl")
 # Function to load data, add model type and name
 def load_data(files, model_type):
     data = []
 # cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}
+intersection_df = pd.read_pickle(
+    "./intersection_results/gpt-3.5-judge-by_Qwen_5times_intersection_subset_1.pkl"
+)
+# accuracy for each model
+intersection_df_acc = (
+    intersection_df.groupby("model_name")["parsed_judge_response"].mean().reset_index()
+)
+intersection_df_acc["Accuracy"] = intersection_df_acc["parsed_judge_response"] * 100
+intersection_df_acc.drop("parsed_judge_response", axis=1, inplace=True)
+intersection_df_acc.sort_values("Accuracy", ascending=False, inplace=True)
 def calculate_accuracy(df):
     return df["parsed_judge_response"].mean() * 100
     "Level 4 Accuracy",
 ]
 # Function to process data
 def process_data(data):
     data_for_df = []
 cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names)
 # cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)
 # Function to finalize DataFrame
 def finalize_df(df):
     df = df.round(1)  # Round to one decimal place
     return fig
+def generate_heatmap_for_intersection_model(model_name):
+    global intersection_df
+    cmap = ListedColormap(["lightblue", "red", "green"])
+    bounds = [-1.5, -0.5, 0.5, 1.5]
+    norm = BoundaryNorm(bounds, cmap.N)
+    # Filter for a specific model
+    model_df = intersection_df[intersection_df["model_name"] == model_name].copy()
+    if model_df.empty:
+        print(f"No data found for model {model_name}. Skipping heatmap generation.")
+        return None
+    model_df["fsm_info"] = model_df.apply(
+        lambda x: f"{x['num_states']} states, {x['num_alphabet']} alphabet", axis=1
+    )
+    model_df = model_df.sort_values(by=["num_states", "num_alphabet"])
+    pivot_df = (
+        model_df.pivot_table(
+            index="fsm_info",
+            columns="substring_index",
+            values="parsed_judge_response",
+            aggfunc="first",
+        )
+        .fillna(-1)
+        .astype(float)
+    )
+    # Dynamically adjust figure size
+    num_rows, num_cols = pivot_df.shape
+    fig_width = max(12, num_cols * 0.5)
+    fig_height = max(8, num_rows * 0.4)
+    fig, ax = plt.subplots(figsize=(fig_width, fig_height))
+    sns.heatmap(
+        pivot_df,
+        cmap=cmap,
+        linewidths=1,
+        linecolor="black",
+        norm=norm,
+        cbar=False,
+        square=True,
+        ax=ax,
+    )
+    plt.title(f"Heatmap for Model: {model_name}", fontsize=12)
+    plt.xlabel("Substring Index")
+    plt.ylabel("FSM (States, Alphabet)")
+    plt.xticks(rotation=45)
+    sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)
+    plt.close(fig)  # Prevent it from showing immediately
+    return fig
 def show_constraint_heatmap(evt: gr.SelectData):
     model_name = evt.value
     return generate_heatmap_for_specific_model(model_name)
     return generate_heatmap_for_specific_model_cot(model_name)
+def show_intersection_heatmap(evt: gr.SelectData):
+    model_name = evt.value
+    return generate_heatmap_for_intersection_model(model_name)
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
     with gr.Tab("Text-only Benchmark"):
         constrained_leader_board_text_cot = gr.Dataframe()
         constrained_leader_board_plot_cot = gr.Plot()
+    with gr.Tab("Majority Vote (Subset 1)"):
+        gr.Markdown("## Majority Vote (Subset 1)")
+        intersection_leader_board = gr.Dataframe(
+            intersection_df_acc, headers=headers_with_icons
+        )
+        heatmap_image = gr.Plot(label="Model Heatmap")
     included_models_cot.select(
         fn=calculate_order_by_first_substring_cot,
         inputs=[included_models_cot],
         fn=show_constraint_heatmap_cot, outputs=[constrained_leader_board_plot_cot]
     )
+    intersection_leader_board.select(
+        fn=show_intersection_heatmap, outputs=[heatmap_image]
+    )
     demo.launch()

intersection_results/gpt-3.5-judge-by_Qwen_5times_intersection_subset_1.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1cc52129234d9667a4cc388bd1da3a2021f1bbb7ea556e20ee6d5e159b2b1a8
+size 1482609