cccjc committed
Commit eeb88fb · 1 Parent(s): 0d5512e

Add more model results, add single-image setting table

app.py CHANGED
@@ -1,27 +1,35 @@
 import gradio as gr
-from utils import get_leaderboard_data, SUPER_GROUPS, MODEL_GROUPS
+from utils import DefaultDataLoader, CoreSingleDataLoader
 import os
 from constants import *
 
 # Get the directory of the current script
 current_dir = os.path.dirname(os.path.abspath(__file__))
 
-# Construct the path to the CSS file
-css_file = os.path.join(current_dir, "static", "css", "style.css")
+# Construct paths to CSS files
+base_css_file = os.path.join(current_dir, "static", "css", "style.css")
+default_css_file = os.path.join(current_dir, "static", "css", "default.css")
+core_single_css_file = os.path.join(current_dir, "static", "css", "core_single.css")
 
-# Read the CSS file
-with open(css_file, "r") as f:
-    css = f.read()
+# Read CSS files
+with open(base_css_file, "r") as f:
+    base_css = f.read()
+with open(default_css_file, "r") as f:
+    default_css = f.read()
+with open(core_single_css_file, "r") as f:
+    core_single_css = f.read()
 
-def update_leaderboard(selected_super_group, selected_model_group):
-    headers, data = get_leaderboard_data(selected_super_group, selected_model_group)
-    return gr.Dataframe(
-        value=data,
-        headers=headers,
-        datatype=["str"] + ["number"] * (len(headers) - 1),
-    )
+# Initialize data loaders
+default_loader = DefaultDataLoader()
+core_single_loader = CoreSingleDataLoader()
 
-with gr.Blocks(css=css) as block:
+with gr.Blocks() as block:
+    # Add a style element that we'll update
+    css_style = gr.HTML(
+        f"<style>{base_css}\n{default_css}</style>",
+        visible=False
+    )
+
     gr.Markdown(
         LEADERBOARD_INTRODUCTION
     )
@@ -39,24 +47,36 @@ with gr.Blocks(css=css) as block:
             TABLE_INTRODUCTION
         )
 
+        with gr.Row():
+            table_selector = gr.Radio(
+                choices=["Default", "Core Single-image"],
+                label="Select table to display",
+                value="Default"
+            )
+
+        # Define different captions for each table
+        default_caption = "**Table 1: MEGA-Bench full results.** <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> $\\text{Overall} \\ = \\ \\frac{\\max(\\text{Core w/o CoT}, \\ \\text{Core w/ CoT}) \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$"
+        core_single_image_caption = "**Table 2: MEGA-Bench Core Single-image results.** <br> This subset contains 273 single-image tasks from the Core set of the benchmark. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. Compared to the default table, some models with only single-image support are added."
+
+        caption_component = gr.Markdown(
+            value=default_caption,
+            elem_classes="table-caption",
+            latex_delimiters=[{"left": "$", "right": "$", "display": False}],
+        )
+
         with gr.Row():
             super_group_selector = gr.Radio(
-                choices=list(SUPER_GROUPS.keys()),
+                choices=list(default_loader.SUPER_GROUPS.keys()),
                 label="Select a dimension to display breakdown results. We use different column colors to distinguish the overall benchmark scores and breakdown results.",
-                value=list(SUPER_GROUPS.keys())[0]
+                value=list(default_loader.SUPER_GROUPS.keys())[0]
             )
            model_group_selector = gr.Radio(
-                choices=list(MODEL_GROUPS.keys()),
+                choices=list(default_loader.BASE_MODEL_GROUPS.keys()),
                 label="Select a model group",
                 value="All"
            )
 
-        initial_headers, initial_data = get_leaderboard_data(list(SUPER_GROUPS.keys())[0], "All")
-        gr.Markdown(
-            "**Table 1: MEGA-Bench full results.** <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> $\\text{Overall} \\ = \\ \\frac{\\max(\\text{Core w/o CoT}, \\ \\text{Core w/ CoT}) \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$",
-            elem_classes="table-caption",
-            latex_delimiters=[ {"left": "$", "right": "$", "display": False }],
-        )
+        initial_headers, initial_data = default_loader.get_leaderboard_data(list(default_loader.SUPER_GROUPS.keys())[0], "All")
         data_component = gr.Dataframe(
            value=initial_data,
            headers=initial_headers,
@@ -65,10 +85,61 @@
            elem_classes="custom-dataframe",
            max_height=1200,
        )
+
+        def update_table_and_caption(table_type, super_group, model_group):
+            if table_type == "Default":
+                headers, data = default_loader.get_leaderboard_data(super_group, model_group)
+                caption = default_caption
+                current_css = f"{base_css}\n{default_css}"
+            else:  # Core Single-image
+                headers, data = core_single_loader.get_leaderboard_data(super_group, model_group)
+                caption = core_single_image_caption
+                current_css = f"{base_css}\n{core_single_css}"
+
+            return [
+                gr.Dataframe(
+                    value=data,
+                    headers=headers,
+                    datatype=["str"] + ["number"] * (len(headers) - 1),
+                ),
+                caption,
+                f"<style>{current_css}</style>"
+            ]
+
+        def update_selectors(table_type):
+            loader = default_loader if table_type == "Default" else core_single_loader
+            return [
+                gr.Radio(choices=list(loader.SUPER_GROUPS.keys())),
+                gr.Radio(choices=list(loader.MODEL_GROUPS.keys()))
+            ]
+
         refresh_button = gr.Button("Refresh")
-        refresh_button.click(fn=update_leaderboard, inputs=[super_group_selector, model_group_selector], outputs=[data_component])
-        super_group_selector.change(fn=update_leaderboard, inputs=[super_group_selector, model_group_selector], outputs=[data_component])
-        model_group_selector.change(fn=update_leaderboard, inputs=[super_group_selector, model_group_selector], outputs=[data_component])
+
+        # Update click and change handlers to include caption updates
+        refresh_button.click(
+            fn=update_table_and_caption,
+            inputs=[table_selector, super_group_selector, model_group_selector],
+            outputs=[data_component, caption_component, css_style]
+        )
+        super_group_selector.change(
+            fn=update_table_and_caption,
+            inputs=[table_selector, super_group_selector, model_group_selector],
+            outputs=[data_component, caption_component, css_style]
+        )
+        model_group_selector.change(
+            fn=update_table_and_caption,
+            inputs=[table_selector, super_group_selector, model_group_selector],
+            outputs=[data_component, caption_component, css_style]
+        )
+        table_selector.change(
+            fn=update_selectors,
+            inputs=[table_selector],
+            outputs=[super_group_selector, model_group_selector]
+        ).then(
+            fn=update_table_and_caption,
+            inputs=[table_selector, super_group_selector, model_group_selector],
+            outputs=[data_component, caption_component, css_style]
+        )
 
     with gr.TabItem("📝 Data Information", elem_id="qa-tab-table2", id=2):
         gr.Markdown(DATA_INFO, elem_classes="markdown-text")
static/css/core_single.css ADDED
@@ -0,0 +1,57 @@
+.custom-dataframe thead th:nth-child(-n+2),
+.custom-dataframe tbody td:nth-child(-n+2) {
+    background-color: var(--global-column-background) !important;
+}
+
+.custom-dataframe thead th:nth-child(n+3),
+.custom-dataframe tbody td:nth-child(n+3) {
+    background-color: var(--dimension-column-background) !important;
+}
+
+.custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+2) {
+    background-color: var(--row-even-global) !important;
+}
+
+.custom-dataframe tbody tr:nth-child(even) td:nth-child(n+3) {
+    background-color: var(--row-even-dimension) !important;
+}
+
+/* Dark mode styles */
+@media (prefers-color-scheme: dark) {
+    .custom-dataframe {
+        color: var(--text-color) !important;
+        background-color: var(--background-color) !important;
+    }
+
+    .custom-dataframe thead th {
+        background-color: var(--header-background) !important;
+        color: var(--text-color) !important;
+    }
+
+    .custom-dataframe tbody td {
+        background-color: var(--background-color) !important;
+        color: var(--text-color) !important;
+    }
+
+    .custom-dataframe thead th:nth-child(-n+2),
+    .custom-dataframe tbody td:nth-child(-n+2) {
+        background-color: var(--global-column-background) !important;
+    }
+
+    .custom-dataframe thead th:nth-child(n+3),
+    .custom-dataframe tbody td:nth-child(n+3) {
+        background-color: var(--dimension-column-background) !important;
+    }
+
+    .custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+2) {
+        background-color: var(--row-even-global) !important;
+    }
+
+    .custom-dataframe tbody tr:nth-child(even) td:nth-child(n+3) {
+        background-color: var(--row-even-dimension) !important;
+    }
+
+    .custom-dataframe tbody tr:hover td {
+        background-color: var(--hover-background) !important;
+    }
+}
static/css/default.css ADDED
@@ -0,0 +1,57 @@
+.custom-dataframe thead th:nth-child(-n+5),
+.custom-dataframe tbody td:nth-child(-n+5) {
+    background-color: var(--global-column-background) !important;
+}
+
+.custom-dataframe thead th:nth-child(n+6),
+.custom-dataframe tbody td:nth-child(n+6) {
+    background-color: var(--dimension-column-background) !important;
+}
+
+.custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+5) {
+    background-color: var(--row-even-global) !important;
+}
+
+.custom-dataframe tbody tr:nth-child(even) td:nth-child(n+6) {
+    background-color: var(--row-even-dimension) !important;
+}
+
+/* Dark mode styles */
+@media (prefers-color-scheme: dark) {
+    .custom-dataframe {
+        color: var(--text-color) !important;
+        background-color: var(--background-color) !important;
+    }
+
+    .custom-dataframe thead th {
+        background-color: var(--header-background) !important;
+        color: var(--text-color) !important;
+    }
+
+    .custom-dataframe tbody td {
+        background-color: var(--background-color) !important;
+        color: var(--text-color) !important;
+    }
+
+    .custom-dataframe thead th:nth-child(-n+5),
+    .custom-dataframe tbody td:nth-child(-n+5) {
+        background-color: var(--global-column-background) !important;
+    }
+
+    .custom-dataframe thead th:nth-child(n+6),
+    .custom-dataframe tbody td:nth-child(n+6) {
+        background-color: var(--dimension-column-background) !important;
+    }
+
+    .custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+5) {
+        background-color: var(--row-even-global) !important;
+    }
+
+    .custom-dataframe tbody tr:nth-child(even) td:nth-child(n+6) {
+        background-color: var(--row-even-dimension) !important;
+    }
+
+    .custom-dataframe tbody tr:hover td {
+        background-color: var(--hover-background) !important;
+    }
+}
static/css/style.css CHANGED
@@ -40,64 +40,6 @@
     color: var(--text-color) !important;
 }
 
-.custom-dataframe thead th:nth-child(-n+5),
-.custom-dataframe tbody td:nth-child(-n+5) {
-    background-color: var(--global-column-background) !important;
-}
-
-.custom-dataframe thead th:nth-child(n+6),
-.custom-dataframe tbody td:nth-child(n+6) {
-    background-color: var(--dimension-column-background) !important;
-}
-
-.custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+5) {
-    background-color: var(--row-even-global) !important;
-}
-
-.custom-dataframe tbody tr:nth-child(even) td:nth-child(n+6) {
-    background-color: var(--row-even-dimension) !important;
-}
-
-/* Dark mode styles */
-@media (prefers-color-scheme: dark) {
-    .custom-dataframe {
-        color: var(--text-color) !important;
-        background-color: var(--background-color) !important;
-    }
-
-    .custom-dataframe thead th {
-        background-color: var(--header-background) !important;
-        color: var(--text-color) !important;
-    }
-
-    .custom-dataframe tbody td {
-        background-color: var(--background-color) !important;
-        color: var(--text-color) !important;
-    }
-
-    .custom-dataframe thead th:nth-child(-n+5),
-    .custom-dataframe tbody td:nth-child(-n+5) {
-        background-color: var(--global-column-background) !important;
-    }
-
-    .custom-dataframe thead th:nth-child(n+6),
-    .custom-dataframe tbody td:nth-child(n+6) {
-        background-color: var(--dimension-column-background) !important;
-    }
-
-    .custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+5) {
-        background-color: var(--row-even-global) !important;
-    }
-
-    .custom-dataframe tbody tr:nth-child(even) td:nth-child(n+6) {
-        background-color: var(--row-even-dimension) !important;
-    }
-
-    .custom-dataframe tbody tr:hover td {
-        background-color: var(--hover-background) !important;
-    }
-}
-
 .table-caption {
     text-align: center;
     margin-top: 10px;
static/eval_results/Core_SI/all_model_keywords_stats.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Core_SI/all_summary.json ADDED
@@ -0,0 +1,227 @@
+{
+    "Aquila_VL_2B": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.20770364903712493,
+        "micro_mean_score": 0.20333142638522636,
+        "missing_tasks": []
+    },
+    "Aria": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.3178882776147889,
+        "micro_mean_score": 0.3101511832828904,
+        "missing_tasks": []
+    },
+    "Claude_3.5": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4116,
+        "macro_mean_score": 0.520276385877485,
+        "micro_mean_score": 0.520276385877485
+    },
+    "Claude_3.5_new": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4116,
+        "macro_mean_score": 0.5462752278980763,
+        "micro_mean_score": 0.5462752278980763
+    },
+    "GPT_4o": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4116,
+        "macro_mean_score": 0.5529953662872719,
+        "micro_mean_score": 0.5529953662872719
+    },
+    "GPT_4o_mini": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4116,
+        "macro_mean_score": 0.44285970964797233,
+        "micro_mean_score": 0.44285970964797233
+    },
+    "Gemini_1.5_flash_002": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4116,
+        "macro_mean_score": 0.42188460865574384,
+        "micro_mean_score": 0.42188460865574384
+    },
+    "Gemini_1.5_pro_002": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4116,
+        "macro_mean_score": 0.4914311038229404,
+        "micro_mean_score": 0.4914311038229404
+    },
+    "Idefics3": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.08941182847569326,
+        "micro_mean_score": 0.08779475233900695,
+        "missing_tasks": []
+    },
+    "InternVL2_2B": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.12069001041308772,
+        "micro_mean_score": 0.11842605219090299,
+        "missing_tasks": []
+    },
+    "InternVL2_76B": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.3998616568018755,
+        "micro_mean_score": 0.39149064302628933,
+        "missing_tasks": []
+    },
+    "InternVL2_8B": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.27650612401825575,
+        "micro_mean_score": 0.27119471729837735,
+        "missing_tasks": []
+    },
+    "Llama_3_2_11B": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.20789144960796493,
+        "micro_mean_score": 0.20163641703273802,
+        "missing_tasks": []
+    },
+    "MiniCPM_v2.6": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.23230765810722817,
+        "micro_mean_score": 0.22684118052665975,
+        "missing_tasks": []
+    },
+    "Molmo_72B": {
+        "num_eval_tasks": 270,
+        "num_eval_samples": 4073,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4331,
+        "macro_mean_score": 0.36480000609384927,
+        "micro_mean_score": 0.36205779758110807,
+        "missing_tasks": [
+            "table_understanding",
+            "MMSoc_Misinformation_PolitiFact",
+            "planning_screenshot_termes"
+        ]
+    },
+    "Molmo_7B_D": {
+        "num_eval_tasks": 272,
+        "num_eval_samples": 4102,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4362,
+        "macro_mean_score": 0.2098088446992518,
+        "micro_mean_score": 0.20550929661464645,
+        "missing_tasks": [
+            "MMSoc_Misinformation_PolitiFact"
+        ]
+    },
+    "NVLM": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.32989872890926025,
+        "micro_mean_score": 0.32315683713111915,
+        "missing_tasks": []
+    },
+    "POINTS_7B": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.25511317681632334,
+        "micro_mean_score": 0.24927711632415062,
+        "missing_tasks": []
+    },
+    "Phi-3.5-vision": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.2561274958722834,
+        "micro_mean_score": 0.2504214576875906,
+        "missing_tasks": []
+    },
+    "Pixtral_12B": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.3436942439614412,
+        "micro_mean_score": 0.3373564384613738,
+        "missing_tasks": []
+    },
+    "Qwen2_VL_2B": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.22787906973244856,
+        "micro_mean_score": 0.2234748515064842,
+        "missing_tasks": []
+    },
+    "Qwen2_VL_72B": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.4730536307784527,
+        "micro_mean_score": 0.4659830915476831,
+        "missing_tasks": []
+    },
+    "Qwen2_VL_7B": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.3538656561495699,
+        "micro_mean_score": 0.34581250459157137,
+        "missing_tasks": []
+    },
+    "llava_onevision_72B": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.312618242621264,
+        "micro_mean_score": 0.3098623876487132,
+        "missing_tasks": []
+    },
+    "llava_onevision_7B": {
+        "num_eval_tasks": 273,
+        "num_eval_samples": 4116,
+        "num_not_eval_samples": 0,
+        "num_total_samples": 4377,
+        "macro_mean_score": 0.23683339637631812,
+        "micro_mean_score": 0.23283041278687175,
+        "missing_tasks": []
+    }
+}
static/eval_results/{all_model_keywords_stats.json → Default/all_model_keywords_stats.json} RENAMED
The diff for this file is too large to render. See raw diff
 
static/eval_results/{all_summary.json → Default/all_summary.json} RENAMED
@@ -5,16 +5,16 @@
             "num_eval_samples": 6539,
             "num_not_eval_samples": 0,
             "num_total_samples": 6961,
-            "macro_mean_score": 0.5203470034386184,
-            "micro_mean_score": 0.514305381949725
+            "macro_mean_score": 0.5203440930873326,
+            "micro_mean_score": 0.514302640282204
         },
         "core_cot": {
             "num_eval_tasks": 440,
             "num_eval_samples": 6539,
             "num_not_eval_samples": 0,
             "num_total_samples": 6961,
-            "macro_mean_score": 0.5265059698578094,
-            "micro_mean_score": 0.5236365938368621
+            "macro_mean_score": 0.5265030595065238,
+            "micro_mean_score": 0.5236338521693411
         },
         "open": {
             "num_eval_tasks": 65,
@@ -23,7 +23,7 @@
             "macro_mean_score": 0.6478225794744895,
             "micro_mean_score": 0.665391229578676
         },
-        "overall_score": 0.542120979016392
+        "overall_score": 0.5421184432647768
     },
     "Gemini_1.5_pro_002": {
         "core_noncot": {
@@ -117,8 +117,8 @@
             "num_eval_samples": 6539,
             "num_not_eval_samples": 0,
             "num_total_samples": 6961,
-            "macro_mean_score": 0.525918992480593,
-            "micro_mean_score": 0.5230784020211157
+            "macro_mean_score": 0.5259191914020757,
+            "micro_mean_score": 0.5230785894131227
         },
         "open": {
             "num_eval_tasks": 65,
@@ -127,7 +127,7 @@
             "macro_mean_score": 0.6563419761104125,
             "micro_mean_score": 0.6724419604471196
         },
-        "overall_score": 0.5427061091854214
+        "overall_score": 0.5427062825031487
     },
     "GPT_4o_mini": {
         "core_noncot": {
@@ -492,5 +492,83 @@
             "micro_mean_score": 0.3947549441100602
         },
         "overall_score": 0.25566537510391796
+    },
+    "InternVL2_2B": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.09089701489596874,
+            "micro_mean_score": 0.09036328295381871
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.13141974398938763,
+            "micro_mean_score": 0.13063500716262516
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 1224,
+            "macro_mean_score": 0.23864417043743646,
+            "micro_mean_score": 0.24901117798796224
+        },
+        "overall_score": 0.14522090778963154
+    },
+    "Qwen2_VL_2B": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.16448220309703876,
+            "micro_mean_score": 0.1610710186451323
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.20877163406364055,
+            "micro_mean_score": 0.20561526268932287
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 1224,
+            "macro_mean_score": 0.3154302566225611,
+            "micro_mean_score": 0.33856405846947557
+        },
+        "overall_score": 0.22249997162072932
+    },
+    "Aquila_VL_2B": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.16317824309838627,
+            "micro_mean_score": 0.16198837245148487
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.159970161379836,
+            "micro_mean_score": 0.15844711671722148
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 1224,
+            "macro_mean_score": 0.24567572098570653,
+            "micro_mean_score": 0.2704213241616509
+        },
+        "overall_score": 0.17379673035120966
     }
 }
utils.py CHANGED
@@ -1,21 +1,8 @@
 import pandas as pd
-import gradio as gr
-import csv
 import json
-import os
-import shutil
-from huggingface_hub import Repository
-import numpy as np
+from typing import Dict, Any, Tuple
 
-# Load the JSON data
-with open("./static/eval_results/all_model_keywords_stats.json", "r") as f:
-    MODEL_DATA = json.load(f)
-
-with open("./static/eval_results/all_summary.json", "r") as f:
-    SUMMARY_DATA = json.load(f)
-
-
-# Define model name mapping
+# Keep all the constant mappings outside the class
 MODEL_NAME_MAP = {
     "Claude_3.5_new": "Claude-3.5-Sonnet (1022)",
     "GPT_4o": "GPT-4o (0513)",
@@ -36,9 +23,14 @@ MODEL_NAME_MAP = {
     "Phi-3.5-vision": "Phi-3.5-Vision",
     "MiniCPM_v2.6": "MiniCPM-V2.6",
     "Idefics3": "Idefics3-8B-Llama3",
+    "Aquila_VL_2B": "Aquila-VL-2B-llava-qwen",
+    "POINTS_7B": "POINTS-Qwen2.5-7B",
+    "Qwen2_VL_2B": "Qwen2-VL-2B",
+    "InternVL2_2B": "InternVL2-2B",
+    "Molmo_7B_D": "Molmo-7B-D-0924",
+    "Molmo_72B": "Molmo-72B-0924",
 }
 
-# Custom name mapping for dimensions and keywords
 DIMENSION_NAME_MAP = {
     "skills": "Skills",
     "input_format": "Input Format",
@@ -91,59 +83,157 @@ KEYWORD_NAME_MAP = {
     "video": "Video",
 }
 
-# Extract super groups (dimensions) and their keywords
-SUPER_GROUPS = {DIMENSION_NAME_MAP[dim]: [KEYWORD_NAME_MAP.get(k, k) for k in MODEL_DATA[next(iter(MODEL_DATA))][dim].keys()]
-                for dim in MODEL_DATA[next(iter(MODEL_DATA))]}
+class BaseDataLoader:
+    # Define the base MODEL_GROUPS structure
+    BASE_MODEL_GROUPS = {
+        "All": list(MODEL_NAME_MAP.keys()),
+        "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM'],
+        "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
+        "Proprietary Flagship models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002'],
+        "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini'],
+        "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM'],
+        "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
+    }
+
+    def __init__(self):
+        self.MODEL_DATA = self._load_model_data()
+        self.SUMMARY_DATA = self._load_summary_data()
+        self.SUPER_GROUPS = self._initialize_super_groups()
+        self.MODEL_GROUPS = self._initialize_model_groups()
+
+    def _initialize_super_groups(self):
+        # Define the desired order of super groups
+
+        groups = {DIMENSION_NAME_MAP[dim]: [KEYWORD_NAME_MAP.get(k, k) for k in self.MODEL_DATA[next(iter(self.MODEL_DATA))][dim].keys()]
+                  for dim in self.MODEL_DATA[next(iter(self.MODEL_DATA))]}
+
+        order = ["Skills", "Application", "Output Format", "Input Format", "Visual Input Number"]
+        # Sort the dictionary based on the predefined order
+        return {k: groups[k] for k in order if k in groups}
+
+    def _initialize_model_groups(self) -> Dict[str, list]:
+        # Get the list of available models from the loaded data
+        available_models = set(self.MODEL_DATA.keys())
+
+        # Create filtered groups based on available models
+        filtered_groups = {}
+        for group_name, models in self.BASE_MODEL_GROUPS.items():
+            if group_name == "All":
+                filtered_groups[group_name] = sorted(list(available_models))
+            else:
+                filtered_models = [model for model in models if model in available_models]
+                if filtered_models:  # Only include group if it has models
+                    filtered_groups[group_name] = filtered_models
+
+        return filtered_groups
+
+    def _load_model_data(self) -> Dict[str, Any]:
+        raise NotImplementedError("Subclasses must implement _load_model_data")
+
+    def _load_summary_data(self) -> Dict[str, Any]:
+        raise NotImplementedError("Subclasses must implement _load_summary_data")
+
+    def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
+        raise NotImplementedError("Subclasses must implement get_df")
+
+    def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
+        raise NotImplementedError("Subclasses must implement get_leaderboard_data")
+
+
+class DefaultDataLoader(BaseDataLoader):
+    def __init__(self):
+        super().__init__()
+
+    def _load_model_data(self) -> Dict[str, Any]:
+        with open("./static/eval_results/Default/all_model_keywords_stats.json", "r") as f:
+            return json.load(f)
 
+    def _load_summary_data(self) -> Dict[str, Any]:
+        with open("./static/eval_results/Default/all_summary.json", "r") as f:
+            return json.load(f)
+
+    def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
+        original_dimension = get_original_dimension(selected_super_group)
+        data = []
+        for model in self.MODEL_GROUPS[selected_model_group]:
+            model_data = self.MODEL_DATA[model]
+            summary = self.SUMMARY_DATA[model]
+            core_noncot_score = summary["core_noncot"]["macro_mean_score"]
+            core_cot_score = summary["core_cot"]["macro_mean_score"]
+            row = {
+                "Models": get_display_model_name(model),
+                "Overall": round(summary["overall_score"] * 100, 2),
+                "Core(w/o CoT)": round(core_noncot_score * 100, 2),
+                "Core(w/ CoT)": round(core_cot_score * 100, 2),
+                "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
+            }
+            for keyword in self.SUPER_GROUPS[selected_super_group]:
+                original_keyword = get_original_keyword(keyword)
+                if original_dimension in model_data and original_keyword in model_data[original_dimension]:
+                    row[keyword] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
+                else:
+                    row[keyword] = None
+            data.append(row)
+
+        df = pd.DataFrame(data)
+        df = df.sort_values(by="Overall", ascending=False)
+        return df
+
+    def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
+        df = self.get_df(selected_super_group, selected_model_group)
+        headers = ["Models", "Overall", "Core(w/o CoT)", "Core(w/ CoT)", "Open-ended"] + self.SUPER_GROUPS[selected_super_group]
+        data = df[headers].values.tolist()
+        return headers, data
+
+
+class CoreSingleDataLoader(BaseDataLoader):
+    def __init__(self):
+        super().__init__()
+
+    def _load_model_data(self) -> Dict[str, Any]:
+        with open("./static/eval_results/Core_SI/all_model_keywords_stats.json", "r") as f:
+            return json.load(f)
+
+    def _load_summary_data(self) -> Dict[str, Any]:
+        with open("./static/eval_results/Core_SI/all_summary.json", "r") as f:
+            return json.load(f)
+
+    def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
+        original_dimension = get_original_dimension(selected_super_group)
+        data = []
+        for model in self.MODEL_GROUPS[selected_model_group]:
+            model_data = self.MODEL_DATA[model]
+            summary = self.SUMMARY_DATA[model]
+            core_si_score = summary["macro_mean_score"]
+            row = {
+                "Models": get_display_model_name(model),
+                "Core SI": round(core_si_score * 100, 2),
+            }
+            for keyword in self.SUPER_GROUPS[selected_super_group]:
+                original_keyword = get_original_keyword(keyword)
+                if original_dimension in model_data and original_keyword in model_data[original_dimension]:
+                    row[keyword] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
+                else:
+                    row[keyword] = None
+            data.append(row)
+
+        df = pd.DataFrame(data)
+        df = df.sort_values(by="Core SI", ascending=False)
+        return df
+
+    def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
+        df = self.get_df(selected_super_group, selected_model_group)
+        headers = ["Models", "Core SI"] + self.SUPER_GROUPS[selected_super_group]
+        data = df[headers].values.tolist()
+        return headers, data
+
+
+# Keep your helper functions
 def get_original_dimension(mapped_dimension):
     return next(k for k, v in DIMENSION_NAME_MAP.items() if v == mapped_dimension)
 
 def get_original_keyword(mapped_keyword):
     return next((k for k, v in KEYWORD_NAME_MAP.items() if v == mapped_keyword), mapped_keyword)
 
-# Define model groups
-MODEL_GROUPS = {
-    "All": list(MODEL_DATA.keys()),
-    "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM'],
-    "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
-    "Proprietary Flagship models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002'],
-    "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini'],
-    "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM'],
-    "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
-}
-
 def get_display_model_name(model_name):
     return MODEL_NAME_MAP.get(model_name, model_name)
-
-def get_df(selected_super_group, selected_model_group):
-    original_dimension = get_original_dimension(selected_super_group)
-    data = []
-    for model in MODEL_GROUPS[selected_model_group]:
-        model_data = MODEL_DATA[model]
-        summary = SUMMARY_DATA[model]
-        core_noncot_score = summary["core_noncot"]["macro_mean_score"]
-        core_cot_score = summary["core_cot"]["macro_mean_score"]
-        row = {
-            "Models": get_display_model_name(model),  # Use the mapped name
-            "Overall": round(summary["overall_score"] * 100, 2),
-            "Core(w/o CoT)": round(core_noncot_score * 100, 2),
-            "Core(w/ CoT)": round(core_cot_score * 100, 2),
-            "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
-        }
-        for keyword in SUPER_GROUPS[selected_super_group]:
-            original_keyword = get_original_keyword(keyword)
-            if original_dimension in model_data and original_keyword in model_data[original_dimension]:
-                row[keyword] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
-            else:
-                row[keyword] = None
-        data.append(row)
-
-    df = pd.DataFrame(data)
-    df = df.sort_values(by="Overall", ascending=False)
-    return df
-
-def get_leaderboard_data(selected_super_group, selected_model_group):
-    df = get_df(selected_super_group, selected_model_group)
-    headers = ["Models", "Overall", "Core(w/o CoT)", "Core(w/ CoT)", "Open-ended"] + SUPER_GROUPS[selected_super_group]
-    data = df[headers].values.tolist()
-    return headers, data