cccjc committed on
Commit
b971fc0
·
1 Parent(s): f7c7d6f

add gemini-flash-2.0-exp

Browse files
constants.py CHANGED
@@ -118,6 +118,7 @@ MODEL_NAME_MAP = {
118
  "InternVL2_5_8B": "InternVL2.5-8B",
119
  "Grok-2-vision-1212": "Grok-2-vision-1212",
120
  "Gemini-2.0-thinking": "Gemini-2.0-flash-thinking",
 
121
  "Gemini-exp-1206": "Gemini-exp-1206",
122
  }
123
 
@@ -209,15 +210,16 @@ MODEL_URLS = {
209
  "Grok-2-vision-1212": "https://x.ai/blog/grok-1212",
210
  "Gemini-2.0-thinking": "https://ai.google.dev/gemini-api/docs/thinking-mode",
211
  "Gemini-exp-1206": "https://blog.google/feed/gemini-exp-1206/",
 
212
  }
213
 
214
  # Define the base MODEL_GROUPS structure
215
  BASE_MODEL_GROUPS = {
216
  "All": list(MODEL_NAME_MAP.keys()),
217
- "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', 'Molmo_72B', 'InternVL2_5_78B', 'Grok-2-vision-1212', "Gemini-2.0-thinking", "Gemini-exp-1206"],
218
- "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B"],
219
- "Proprietary Flagship models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Grok-2-vision-1212', "Gemini-2.0-thinking", "Gemini-exp-1206"],
220
- "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini'],
221
  "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', "Molmo_72B", "InternVL2_5_78B"],
222
  "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B"]
223
  }
 
118
  "InternVL2_5_8B": "InternVL2.5-8B",
119
  "Grok-2-vision-1212": "Grok-2-vision-1212",
120
  "Gemini-2.0-thinking": "Gemini-2.0-flash-thinking",
121
+ "Gemini-Flash-2.0-exp": "Gemini-Flash-2.0-exp",
122
  "Gemini-exp-1206": "Gemini-exp-1206",
123
  }
124
 
 
210
  "Grok-2-vision-1212": "https://x.ai/blog/grok-1212",
211
  "Gemini-2.0-thinking": "https://ai.google.dev/gemini-api/docs/thinking-mode",
212
  "Gemini-exp-1206": "https://blog.google/feed/gemini-exp-1206/",
213
+ "Gemini-Flash-2.0-exp": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/#gemini-2-0-flash",
214
  }
215
 
216
  # Define the base MODEL_GROUPS structure
217
  BASE_MODEL_GROUPS = {
218
  "All": list(MODEL_NAME_MAP.keys()),
219
+ "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', 'Molmo_72B', 'InternVL2_5_78B', 'Grok-2-vision-1212', "Gemini-exp-1206"],
220
+ "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B", "Gemini-2.0-thinking", "Gemini-Flash-2.0-exp"],
221
+ "Proprietary Flagship models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Grok-2-vision-1212', "Gemini-exp-1206"],
222
+ "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', "Gemini-Flash-2.0-exp", "Gemini-2.0-thinking"],
223
  "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', "Molmo_72B", "InternVL2_5_78B"],
224
  "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B"]
225
  }
static/eval_results/Default/Gemini-Flash-2.0-exp/summary_and_keyword_stats.json ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_summary": {
3
+ "core": {
4
+ "num_eval_tasks": 440,
5
+ "num_eval_samples": 6531,
6
+ "macro_mean_score": 0.5407469682493938
7
+ },
8
+ "open": {
9
+ "num_eval_tasks": 65,
10
+ "num_eval_samples": 1158,
11
+ "macro_mean_score": 0.632968160070652
12
+ },
13
+ "overall_score": 0.552617022642229
14
+ },
15
+ "keyword_stats": {
16
+ "skills": {
17
+ "Object Recognition and Classification": {
18
+ "count": 303,
19
+ "num_samples": 4745,
20
+ "tasks": [],
21
+ "average_score": 0.5782979256943778
22
+ },
23
+ "Language Understanding and Generation": {
24
+ "count": 154,
25
+ "num_samples": 2503,
26
+ "tasks": [],
27
+ "average_score": 0.6135285008667205
28
+ },
29
+ "Commonsense and Social Reasoning": {
30
+ "count": 51,
31
+ "num_samples": 853,
32
+ "tasks": [],
33
+ "average_score": 0.6466029104690044
34
+ },
35
+ "Scene and Event Understanding": {
36
+ "count": 154,
37
+ "num_samples": 2467,
38
+ "tasks": [],
39
+ "average_score": 0.574739062334026
40
+ },
41
+ "Domain-Specific Knowledge and Skills": {
42
+ "count": 77,
43
+ "num_samples": 1385,
44
+ "tasks": [],
45
+ "average_score": 0.5672870648345623
46
+ },
47
+ "Spatial and Temporal Reasoning": {
48
+ "count": 152,
49
+ "num_samples": 2434,
50
+ "tasks": [],
51
+ "average_score": 0.41187777635464107
52
+ },
53
+ "Ethical and Safety Reasoning": {
54
+ "count": 15,
55
+ "num_samples": 245,
56
+ "tasks": [],
57
+ "average_score": 0.6641203007518797
58
+ },
59
+ "Text Recognition (OCR)": {
60
+ "count": 137,
61
+ "num_samples": 2232,
62
+ "tasks": [],
63
+ "average_score": 0.59555536867041
64
+ },
65
+ "Mathematical and Logical Reasoning": {
66
+ "count": 109,
67
+ "num_samples": 1908,
68
+ "tasks": [],
69
+ "average_score": 0.47799127604909475
70
+ },
71
+ "Planning and Decision Making": {
72
+ "count": 37,
73
+ "num_samples": 576,
74
+ "tasks": [],
75
+ "average_score": 0.258060016054743
76
+ }
77
+ },
78
+ "input_format": {
79
+ "Photographs": {
80
+ "count": 143,
81
+ "num_samples": 2243,
82
+ "tasks": [],
83
+ "average_score": 0.5805088447248222
84
+ },
85
+ "Artistic and Creative Content": {
86
+ "count": 32,
87
+ "num_samples": 540,
88
+ "tasks": [],
89
+ "average_score": 0.5716301924585377
90
+ },
91
+ "Videos": {
92
+ "count": 43,
93
+ "num_samples": 698,
94
+ "tasks": [],
95
+ "average_score": 0.5056938373064443
96
+ },
97
+ "Diagrams and Data Visualizations": {
98
+ "count": 101,
99
+ "num_samples": 1717,
100
+ "tasks": [],
101
+ "average_score": 0.5282795652139135
102
+ },
103
+ "Text-Based Images and Documents": {
104
+ "count": 82,
105
+ "num_samples": 1294,
106
+ "tasks": [],
107
+ "average_score": 0.49897892967383023
108
+ },
109
+ "User Interface Screenshots": {
110
+ "count": 93,
111
+ "num_samples": 1511,
112
+ "tasks": [],
113
+ "average_score": 0.6095454500115458
114
+ },
115
+ "3D Models and Aerial Imagery": {
116
+ "count": 11,
117
+ "num_samples": 169,
118
+ "tasks": [],
119
+ "average_score": 0.4601450277175127
120
+ }
121
+ },
122
+ "output_format": {
123
+ "contextual_formatted_text": {
124
+ "count": 98,
125
+ "num_samples": 1511,
126
+ "tasks": [],
127
+ "average_score": 0.5296197886902184
128
+ },
129
+ "open_ended_output": {
130
+ "count": 80,
131
+ "num_samples": 1449,
132
+ "tasks": [],
133
+ "average_score": 0.6089096459304206
134
+ },
135
+ "structured_output": {
136
+ "count": 110,
137
+ "num_samples": 1713,
138
+ "tasks": [],
139
+ "average_score": 0.4969695210701893
140
+ },
141
+ "numerical_data": {
142
+ "count": 49,
143
+ "num_samples": 862,
144
+ "tasks": [],
145
+ "average_score": 0.5234237145984363
146
+ },
147
+ "multiple_choice": {
148
+ "count": 85,
149
+ "num_samples": 1363,
150
+ "tasks": [],
151
+ "average_score": 0.5912336308948948
152
+ },
153
+ "exact_text": {
154
+ "count": 83,
155
+ "num_samples": 1274,
156
+ "tasks": [],
157
+ "average_score": 0.5769496085438593
158
+ }
159
+ },
160
+ "input_num": {
161
+ "1-image": {
162
+ "count": 315,
163
+ "num_samples": 5215,
164
+ "tasks": [],
165
+ "average_score": 0.5772282223913567
166
+ },
167
+ "video": {
168
+ "count": 43,
169
+ "num_samples": 698,
170
+ "tasks": [],
171
+ "average_score": 0.5056938373064443
172
+ },
173
+ "4-5 images": {
174
+ "count": 34,
175
+ "num_samples": 520,
176
+ "tasks": [],
177
+ "average_score": 0.4884965564296879
178
+ },
179
+ "9-image or more": {
180
+ "count": 41,
181
+ "num_samples": 623,
182
+ "tasks": [],
183
+ "average_score": 0.6019278856911068
184
+ },
185
+ "6-8 images": {
186
+ "count": 21,
187
+ "num_samples": 314,
188
+ "tasks": [],
189
+ "average_score": 0.4508408919123204
190
+ },
191
+ "2-3 images": {
192
+ "count": 51,
193
+ "num_samples": 802,
194
+ "tasks": [],
195
+ "average_score": 0.4851820865640744
196
+ }
197
+ },
198
+ "app": {
199
+ "Knowledge": {
200
+ "count": 97,
201
+ "num_samples": 1602,
202
+ "tasks": [],
203
+ "average_score": 0.625268184642034
204
+ },
205
+ "Information_Extraction": {
206
+ "count": 72,
207
+ "num_samples": 1119,
208
+ "tasks": [],
209
+ "average_score": 0.6396914731086236
210
+ },
211
+ "Perception": {
212
+ "count": 145,
213
+ "num_samples": 2310,
214
+ "tasks": [],
215
+ "average_score": 0.5719180669097592
216
+ },
217
+ "Coding": {
218
+ "count": 31,
219
+ "num_samples": 474,
220
+ "tasks": [],
221
+ "average_score": 0.4948051263516159
222
+ },
223
+ "Science": {
224
+ "count": 29,
225
+ "num_samples": 574,
226
+ "tasks": [],
227
+ "average_score": 0.5678501530733836
228
+ },
229
+ "Planning": {
230
+ "count": 78,
231
+ "num_samples": 1237,
232
+ "tasks": [],
233
+ "average_score": 0.3603705820786078
234
+ },
235
+ "Metrics": {
236
+ "count": 20,
237
+ "num_samples": 309,
238
+ "tasks": [],
239
+ "average_score": 0.6566188392926626
240
+ },
241
+ "Mathematics": {
242
+ "count": 33,
243
+ "num_samples": 547,
244
+ "tasks": [],
245
+ "average_score": 0.49656912770604317
246
+ }
247
+ }
248
+ }
249
+ }
static/eval_results/Default/Gemini-Flash-2.0-exp/task_results.json ADDED
@@ -0,0 +1,3537 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "name": "ascii_art_30",
4
+ "score": 0.21428571428571427,
5
+ "eval_type": "llm",
6
+ "num_demo": 1,
7
+ "num_query": 14
8
+ },
9
+ {
10
+ "name": "humor_explanation",
11
+ "score": 0.8533333333333335,
12
+ "eval_type": "llm",
13
+ "num_demo": 1,
14
+ "num_query": 15
15
+ },
16
+ {
17
+ "name": "funqa_unexpected_action_humor_video",
18
+ "score": 0.3733333333333333,
19
+ "eval_type": "llm",
20
+ "num_demo": 1,
21
+ "num_query": 15
22
+ },
23
+ {
24
+ "name": "science_figure_explanation",
25
+ "score": 0.872413793103448,
26
+ "eval_type": "llm",
27
+ "num_demo": 1,
28
+ "num_query": 29
29
+ },
30
+ {
31
+ "name": "video_qa",
32
+ "score": 0.8571428571428573,
33
+ "eval_type": "llm",
34
+ "num_demo": 1,
35
+ "num_query": 14
36
+ },
37
+ {
38
+ "name": "vibe_eval_phrase",
39
+ "score": 0.7571428571428573,
40
+ "eval_type": "llm",
41
+ "num_demo": 1,
42
+ "num_query": 14
43
+ },
44
+ {
45
+ "name": "traffic_accident_analysis",
46
+ "score": 0.6714285714285714,
47
+ "eval_type": "llm",
48
+ "num_demo": 1,
49
+ "num_query": 14
50
+ },
51
+ {
52
+ "name": "figurative_speech_explanation",
53
+ "score": 0.8344827586206895,
54
+ "eval_type": "llm",
55
+ "num_demo": 1,
56
+ "num_query": 29
57
+ },
58
+ {
59
+ "name": "video_detail_description",
60
+ "score": 0.5210526315789474,
61
+ "eval_type": "llm",
62
+ "num_demo": 1,
63
+ "num_query": 19
64
+ },
65
+ {
66
+ "name": "table2latex_complex",
67
+ "score": 0.8111111111111111,
68
+ "eval_type": "llm",
69
+ "num_demo": 1,
70
+ "num_query": 9
71
+ },
72
+ {
73
+ "name": "unusual_images",
74
+ "score": 0.889655172413793,
75
+ "eval_type": "llm",
76
+ "num_demo": 1,
77
+ "num_query": 29
78
+ },
79
+ {
80
+ "name": "funqa_unexpected_action_creative_video",
81
+ "score": 0.3866666666666667,
82
+ "eval_type": "llm",
83
+ "num_demo": 1,
84
+ "num_query": 15
85
+ },
86
+ {
87
+ "name": "art_explanation",
88
+ "score": 0.4655172413793103,
89
+ "eval_type": "llm",
90
+ "num_demo": 1,
91
+ "num_query": 29
92
+ },
93
+ {
94
+ "name": "ocr_open_ended_qa",
95
+ "score": 0.7965517241379308,
96
+ "eval_type": "llm",
97
+ "num_demo": 1,
98
+ "num_query": 29
99
+ },
100
+ {
101
+ "name": "video_short_title",
102
+ "score": 0.6928571428571428,
103
+ "eval_type": "llm",
104
+ "num_demo": 1,
105
+ "num_query": 14
106
+ },
107
+ {
108
+ "name": "bar_chart_interpretation",
109
+ "score": 0.6655172413793102,
110
+ "eval_type": "llm",
111
+ "num_demo": 1,
112
+ "num_query": 29
113
+ },
114
+ {
115
+ "name": "activitynetqa",
116
+ "score": 0.4473684210526316,
117
+ "eval_type": "llm",
118
+ "num_demo": 1,
119
+ "num_query": 19
120
+ },
121
+ {
122
+ "name": "scibench_w_solution_open_ended",
123
+ "score": 0.618,
124
+ "eval_type": "llm",
125
+ "num_demo": 1,
126
+ "num_query": 25
127
+ },
128
+ {
129
+ "name": "doc_vqa",
130
+ "score": 0.7562500000000001,
131
+ "eval_type": "llm",
132
+ "num_demo": 1,
133
+ "num_query": 16
134
+ },
135
+ {
136
+ "name": "video2notes",
137
+ "score": 0.7000000000000001,
138
+ "eval_type": "llm",
139
+ "num_demo": 1,
140
+ "num_query": 14
141
+ },
142
+ {
143
+ "name": "GUI_Chat_Hard",
144
+ "score": 0.4115384615384616,
145
+ "eval_type": "llm",
146
+ "num_demo": 1,
147
+ "num_query": 26
148
+ },
149
+ {
150
+ "name": "image_humor_understanding",
151
+ "score": 0.882758620689655,
152
+ "eval_type": "llm",
153
+ "num_demo": 1,
154
+ "num_query": 29
155
+ },
156
+ {
157
+ "name": "video_summary",
158
+ "score": 0.692857142857143,
159
+ "eval_type": "llm",
160
+ "num_demo": 1,
161
+ "num_query": 14
162
+ },
163
+ {
164
+ "name": "defeasible_reasoning",
165
+ "score": 0.8620689655172413,
166
+ "eval_type": "llm",
167
+ "num_demo": 1,
168
+ "num_query": 29
169
+ },
170
+ {
171
+ "name": "funny_image_title",
172
+ "score": 0.6499999999999998,
173
+ "eval_type": "llm",
174
+ "num_demo": 1,
175
+ "num_query": 14
176
+ },
177
+ {
178
+ "name": "video_content_follow_up",
179
+ "score": 0.7285714285714286,
180
+ "eval_type": "llm",
181
+ "num_demo": 1,
182
+ "num_query": 14
183
+ },
184
+ {
185
+ "name": "nextqa_oe",
186
+ "score": 0.4157894736842105,
187
+ "eval_type": "llm",
188
+ "num_demo": 1,
189
+ "num_query": 19
190
+ },
191
+ {
192
+ "name": "tweets_captioning",
193
+ "score": 0.5499999999999999,
194
+ "eval_type": "llm",
195
+ "num_demo": 1,
196
+ "num_query": 14
197
+ },
198
+ {
199
+ "name": "wikihow_complex_task_completion",
200
+ "score": 0.7555555555555555,
201
+ "eval_type": "llm",
202
+ "num_demo": 1,
203
+ "num_query": 9
204
+ },
205
+ {
206
+ "name": "graph_interpretation",
207
+ "score": 0.8413793103448274,
208
+ "eval_type": "llm",
209
+ "num_demo": 1,
210
+ "num_query": 29
211
+ },
212
+ {
213
+ "name": "generated_video_artifacts",
214
+ "score": 0.3625,
215
+ "eval_type": "llm",
216
+ "num_demo": 1,
217
+ "num_query": 16
218
+ },
219
+ {
220
+ "name": "meme_explain",
221
+ "score": 0.8857142857142858,
222
+ "eval_type": "llm",
223
+ "num_demo": 1,
224
+ "num_query": 14
225
+ },
226
+ {
227
+ "name": "funqa_unexpected_action_magic_video",
228
+ "score": 0.6466666666666667,
229
+ "eval_type": "llm",
230
+ "num_demo": 1,
231
+ "num_query": 15
232
+ },
233
+ {
234
+ "name": "guess_image_generation_prompt",
235
+ "score": 0.8263157894736844,
236
+ "eval_type": "llm",
237
+ "num_demo": 1,
238
+ "num_query": 19
239
+ },
240
+ {
241
+ "name": "visualization_with_code",
242
+ "score": 0.6714285714285715,
243
+ "eval_type": "llm",
244
+ "num_demo": 1,
245
+ "num_query": 14
246
+ },
247
+ {
248
+ "name": "iq_test_open_ended",
249
+ "score": 0.6689655172413791,
250
+ "eval_type": "llm",
251
+ "num_demo": 1,
252
+ "num_query": 29
253
+ },
254
+ {
255
+ "name": "electrocardiogram",
256
+ "score": 0.3857142857142857,
257
+ "eval_type": "llm",
258
+ "num_demo": 1,
259
+ "num_query": 14
260
+ },
261
+ {
262
+ "name": "image_captioning_with_additional_requirements",
263
+ "score": 0.9285714285714287,
264
+ "eval_type": "llm",
265
+ "num_demo": 1,
266
+ "num_query": 14
267
+ },
268
+ {
269
+ "name": "docci_image_description_long",
270
+ "score": 0.7928571428571428,
271
+ "eval_type": "llm",
272
+ "num_demo": 1,
273
+ "num_query": 14
274
+ },
275
+ {
276
+ "name": "paper_review_writing",
277
+ "score": 0.5866666666666666,
278
+ "eval_type": "llm",
279
+ "num_demo": 1,
280
+ "num_query": 15
281
+ },
282
+ {
283
+ "name": "sceneqa_scene_transition_video",
284
+ "score": 0.34285714285714286,
285
+ "eval_type": "llm",
286
+ "num_demo": 1,
287
+ "num_query": 14
288
+ },
289
+ {
290
+ "name": "GUI_Chat_Easy",
291
+ "score": 0.5769230769230769,
292
+ "eval_type": "llm",
293
+ "num_demo": 1,
294
+ "num_query": 26
295
+ },
296
+ {
297
+ "name": "bridge_strategies_advanced",
298
+ "score": 0.15714285714285717,
299
+ "eval_type": "llm",
300
+ "num_demo": 1,
301
+ "num_query": 14
302
+ },
303
+ {
304
+ "name": "bridge_strategies_worldclass",
305
+ "score": 0.08571428571428572,
306
+ "eval_type": "llm",
307
+ "num_demo": 1,
308
+ "num_query": 14
309
+ },
310
+ {
311
+ "name": "bridge_strategies_expert",
312
+ "score": 0.19999999999999998,
313
+ "eval_type": "llm",
314
+ "num_demo": 1,
315
+ "num_query": 14
316
+ },
317
+ {
318
+ "name": "multi_lingual_manual_explanation_scooter_Arabic",
319
+ "score": 0.4928571428571428,
320
+ "eval_type": "llm",
321
+ "num_demo": 1,
322
+ "num_query": 14
323
+ },
324
+ {
325
+ "name": "multi_lingual_manual_explanation_scooter_Russian",
326
+ "score": 0.46428571428571425,
327
+ "eval_type": "llm",
328
+ "num_demo": 1,
329
+ "num_query": 14
330
+ },
331
+ {
332
+ "name": "multi_lingual_manual_explanation_scooter_French",
333
+ "score": 0.4785714285714286,
334
+ "eval_type": "llm",
335
+ "num_demo": 1,
336
+ "num_query": 14
337
+ },
338
+ {
339
+ "name": "multi_lingual_manual_explanation_scooter_Spanish",
340
+ "score": 0.5142857142857143,
341
+ "eval_type": "llm",
342
+ "num_demo": 1,
343
+ "num_query": 14
344
+ },
345
+ {
346
+ "name": "multi_lingual_manual_explanation_scooter_Chinese",
347
+ "score": 0.5285714285714286,
348
+ "eval_type": "llm",
349
+ "num_demo": 1,
350
+ "num_query": 14
351
+ },
352
+ {
353
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
354
+ "score": 0.7428571428571429,
355
+ "eval_type": "llm",
356
+ "num_demo": 1,
357
+ "num_query": 14
358
+ },
359
+ {
360
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
361
+ "score": 0.6857142857142857,
362
+ "eval_type": "llm",
363
+ "num_demo": 1,
364
+ "num_query": 14
365
+ },
366
+ {
367
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
368
+ "score": 0.7357142857142858,
369
+ "eval_type": "llm",
370
+ "num_demo": 1,
371
+ "num_query": 14
372
+ },
373
+ {
374
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
375
+ "score": 0.7000000000000001,
376
+ "eval_type": "llm",
377
+ "num_demo": 1,
378
+ "num_query": 14
379
+ },
380
+ {
381
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
382
+ "score": 0.6857142857142857,
383
+ "eval_type": "llm",
384
+ "num_demo": 1,
385
+ "num_query": 14
386
+ },
387
+ {
388
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
389
+ "score": 0.7285714285714285,
390
+ "eval_type": "llm",
391
+ "num_demo": 1,
392
+ "num_query": 14
393
+ },
394
+ {
395
+ "name": "table_understanding_fetaqa",
396
+ "score": 0.5928571428571429,
397
+ "eval_type": "llm",
398
+ "num_demo": 1,
399
+ "num_query": 14
400
+ },
401
+ {
402
+ "name": "red_teaming_celebrity",
403
+ "score": 0.8550000000000001,
404
+ "eval_type": "llm",
405
+ "num_demo": 0,
406
+ "num_query": 20
407
+ },
408
+ {
409
+ "name": "red_teaming_captcha",
410
+ "score": 0.13157894736842107,
411
+ "eval_type": "llm",
412
+ "num_demo": 1,
413
+ "num_query": 19
414
+ },
415
+ {
416
+ "name": "red_teaming_jailbreak",
417
+ "score": 0.5850000000000001,
418
+ "eval_type": "llm",
419
+ "num_demo": 0,
420
+ "num_query": 20
421
+ },
422
+ {
423
+ "name": "red_teaming_visualmisleading",
424
+ "score": 0.8789473684210528,
425
+ "eval_type": "llm",
426
+ "num_demo": 1,
427
+ "num_query": 19
428
+ },
429
+ {
430
+ "name": "red_teaming_visual_order_A",
431
+ "score": 0.8947368421052634,
432
+ "eval_type": "llm",
433
+ "num_demo": 1,
434
+ "num_query": 19
435
+ },
436
+ {
437
+ "name": "red_teaming_racial",
438
+ "score": 0.7600000000000001,
439
+ "eval_type": "llm",
440
+ "num_demo": 0,
441
+ "num_query": 20
442
+ },
443
+ {
444
+ "name": "red_teaming_visual_order_B",
445
+ "score": 0.9,
446
+ "eval_type": "llm",
447
+ "num_demo": 1,
448
+ "num_query": 19
449
+ },
450
+ {
451
+ "name": "red_teaming_politics",
452
+ "score": 0.695,
453
+ "eval_type": "llm",
454
+ "num_demo": 0,
455
+ "num_query": 20
456
+ },
457
+ {
458
+ "name": "brand_logo_recognition_and_elaboration",
459
+ "score": 0.88,
460
+ "eval_type": "rule",
461
+ "num_demo": 1,
462
+ "num_query": 25
463
+ },
464
+ {
465
+ "name": "exchange_rate_estimate_plot",
466
+ "score": 0.9733571428571428,
467
+ "eval_type": "rule",
468
+ "num_demo": 1,
469
+ "num_query": 14
470
+ },
471
+ {
472
+ "name": "places365_similar_scene_retrieval",
473
+ "score": 0.7142857142857143,
474
+ "eval_type": "rule",
475
+ "num_demo": 1,
476
+ "num_query": 14
477
+ },
478
+ {
479
+ "name": "math_parity",
480
+ "score": 0.8,
481
+ "eval_type": "rule",
482
+ "num_demo": 1,
483
+ "num_query": 15
484
+ },
485
+ {
486
+ "name": "av_human_multiview_counting",
487
+ "score": 0.06666666666666667,
488
+ "eval_type": "rule",
489
+ "num_demo": 1,
490
+ "num_query": 15
491
+ },
492
+ {
493
+ "name": "traffic_future_prediction_from_line_plot",
494
+ "score": 0.6567894736842107,
495
+ "eval_type": "rule",
496
+ "num_demo": 1,
497
+ "num_query": 19
498
+ },
499
+ {
500
+ "name": "graph_chordless_cycle",
501
+ "score": 0.42857142857142855,
502
+ "eval_type": "rule",
503
+ "num_demo": 1,
504
+ "num_query": 14
505
+ },
506
+ {
507
+ "name": "multi_load_type_prediction_from_plot",
508
+ "score": 0.4761904761904761,
509
+ "eval_type": "rule",
510
+ "num_demo": 1,
511
+ "num_query": 14
512
+ },
513
+ {
514
+ "name": "youtube_video_info_parsing",
515
+ "score": 0.7976190476190476,
516
+ "eval_type": "rule",
517
+ "num_demo": 1,
518
+ "num_query": 14
519
+ },
520
+ {
521
+ "name": "super_clevr_scene_understanding",
522
+ "score": 0.5,
523
+ "eval_type": "rule",
524
+ "num_demo": 1,
525
+ "num_query": 14
526
+ },
527
+ {
528
+ "name": "figureqa",
529
+ "score": 0.42857142857142855,
530
+ "eval_type": "rule",
531
+ "num_demo": 1,
532
+ "num_query": 14
533
+ },
534
+ {
535
+ "name": "code_programming_test_advanced",
536
+ "score": 0.1111111111111111,
537
+ "eval_type": "rule",
538
+ "num_demo": 1,
539
+ "num_query": 18
540
+ },
541
+ {
542
+ "name": "face_keypoint_detection",
543
+ "score": 0.5787325110618333,
544
+ "eval_type": "rule",
545
+ "num_demo": 1,
546
+ "num_query": 14
547
+ },
548
+ {
549
+ "name": "widerface_face_count_and_event_classification",
550
+ "score": 0.7142857142857143,
551
+ "eval_type": "rule",
552
+ "num_demo": 1,
553
+ "num_query": 14
554
+ },
555
+ {
556
+ "name": "average_humidity_estimate_plot",
557
+ "score": 0.8413333333333332,
558
+ "eval_type": "rule",
559
+ "num_demo": 1,
560
+ "num_query": 15
561
+ },
562
+ {
563
+ "name": "code_output_result",
564
+ "score": 0.5714285714285714,
565
+ "eval_type": "rule",
566
+ "num_demo": 1,
567
+ "num_query": 14
568
+ },
569
+ {
570
+ "name": "clevrer_video_moving_object_count",
571
+ "score": 0.3333333333333333,
572
+ "eval_type": "rule",
573
+ "num_demo": 1,
574
+ "num_query": 21
575
+ },
576
+ {
577
+ "name": "weather_info_parsing",
578
+ "score": 0.896825396825397,
579
+ "eval_type": "rule",
580
+ "num_demo": 1,
581
+ "num_query": 14
582
+ },
583
+ {
584
+ "name": "egocentric_analysis_single_image",
585
+ "score": 0.4444444444444444,
586
+ "eval_type": "rule",
587
+ "num_demo": 1,
588
+ "num_query": 9
589
+ },
590
+ {
591
+ "name": "logo2k_same_type_logo_retrieval",
592
+ "score": 0.9642857142857143,
593
+ "eval_type": "rule",
594
+ "num_demo": 1,
595
+ "num_query": 14
596
+ },
597
+ {
598
+ "name": "graph_hamiltonian_cycle",
599
+ "score": 0.45357142857142857,
600
+ "eval_type": "rule",
601
+ "num_demo": 1,
602
+ "num_query": 14
603
+ },
604
+ {
605
+ "name": "waybill_number_sequence_extraction",
606
+ "score": 0.7857142857142857,
607
+ "eval_type": "rule",
608
+ "num_demo": 1,
609
+ "num_query": 14
610
+ },
611
+ {
612
+ "name": "graph_maxflow",
613
+ "score": 0.4,
614
+ "eval_type": "rule",
615
+ "num_demo": 1,
616
+ "num_query": 15
617
+ },
618
+ {
619
+ "name": "clevrer_object_existence_video",
620
+ "score": 0.5625,
621
+ "eval_type": "rule",
622
+ "num_demo": 1,
623
+ "num_query": 16
624
+ },
625
+ {
626
+ "name": "TV_show_info_parsing",
627
+ "score": 0.8015873015873015,
628
+ "eval_type": "rule",
629
+ "num_demo": 1,
630
+ "num_query": 14
631
+ },
632
+ {
633
+ "name": "sta_action_localization_video",
634
+ "score": 0.3125,
635
+ "eval_type": "rule",
636
+ "num_demo": 1,
637
+ "num_query": 16
638
+ },
639
+ {
640
+ "name": "visual_dialog_image_guessing",
641
+ "score": 0.9333333333333333,
642
+ "eval_type": "rule",
643
+ "num_demo": 1,
644
+ "num_query": 15
645
+ },
646
+ {
647
+ "name": "insect_order_classification",
648
+ "score": 0.2,
649
+ "eval_type": "rule",
650
+ "num_demo": 1,
651
+ "num_query": 15
652
+ },
653
+ {
654
+ "name": "electricity_plot_future_prediction",
655
+ "score": 0.8230157894736841,
656
+ "eval_type": "rule",
657
+ "num_demo": 1,
658
+ "num_query": 19
659
+ },
660
+ {
661
+ "name": "perception_test_video_character_order",
662
+ "score": 0.875,
663
+ "eval_type": "rule",
664
+ "num_demo": 1,
665
+ "num_query": 16
666
+ },
667
+ {
668
+ "name": "chemistry_exams_v",
669
+ "score": 0.35714285714285715,
670
+ "eval_type": "rule",
671
+ "num_demo": 1,
672
+ "num_query": 14
673
+ },
674
+ {
675
+ "name": "vlnqa_egocentric_navigation_video",
676
+ "score": 0.4375,
677
+ "eval_type": "rule",
678
+ "num_demo": 1,
679
+ "num_query": 16
680
+ },
681
+ {
682
+ "name": "finance_table_understanding",
683
+ "score": 0.6428571428571429,
684
+ "eval_type": "rule",
685
+ "num_demo": 1,
686
+ "num_query": 14
687
+ },
688
+ {
689
+ "name": "funsd_document_qa",
690
+ "score": 0.7857142857142857,
691
+ "eval_type": "rule",
692
+ "num_demo": 1,
693
+ "num_query": 14
694
+ },
695
+ {
696
+ "name": "star_object_interaction_video",
697
+ "score": 0.625,
698
+ "eval_type": "rule",
699
+ "num_demo": 1,
700
+ "num_query": 16
701
+ },
702
+ {
703
+ "name": "video_to_camera_trajectory_retrieval",
704
+ "score": 0.42857142857142855,
705
+ "eval_type": "rule",
706
+ "num_demo": 1,
707
+ "num_query": 14
708
+ },
709
+ {
710
+ "name": "vibe_eval_open",
711
+ "score": 0.0,
712
+ "eval_type": "rule",
713
+ "num_demo": 1,
714
+ "num_query": 14
715
+ },
716
+ {
717
+ "name": "question_solution_solving",
718
+ "score": 0.2857142857142857,
719
+ "eval_type": "rule",
720
+ "num_demo": 1,
721
+ "num_query": 14
722
+ },
723
+ {
724
+ "name": "graph_theory",
725
+ "score": 0.2857142857142857,
726
+ "eval_type": "rule",
727
+ "num_demo": 1,
728
+ "num_query": 14
729
+ },
730
+ {
731
+ "name": "code_programming_test_hard",
732
+ "score": 0.14285714285714285,
733
+ "eval_type": "rule",
734
+ "num_demo": 1,
735
+ "num_query": 14
736
+ },
737
+ {
738
+ "name": "geometry_analytic",
739
+ "score": 0.14285714285714285,
740
+ "eval_type": "rule",
741
+ "num_demo": 1,
742
+ "num_query": 14
743
+ },
744
+ {
745
+ "name": "3d_fragments_understanding",
746
+ "score": 0.42857142857142855,
747
+ "eval_type": "rule",
748
+ "num_demo": 1,
749
+ "num_query": 14
750
+ },
751
+ {
752
+ "name": "geometry_length",
753
+ "score": 0.42857142857142855,
754
+ "eval_type": "rule",
755
+ "num_demo": 1,
756
+ "num_query": 14
757
+ },
758
+ {
759
+ "name": "algebra",
760
+ "score": 0.2857142857142857,
761
+ "eval_type": "rule",
762
+ "num_demo": 1,
763
+ "num_query": 14
764
+ },
765
+ {
766
+ "name": "chess_puzzle_single_step",
767
+ "score": 0.0,
768
+ "eval_type": "rule",
769
+ "num_demo": 1,
770
+ "num_query": 15
771
+ },
772
+ {
773
+ "name": "action_sequence_understanding",
774
+ "score": 0.8571428571428571,
775
+ "eval_type": "rule",
776
+ "num_demo": 1,
777
+ "num_query": 14
778
+ },
779
+ {
780
+ "name": "emotion_recognition",
781
+ "score": 0.6428571428571429,
782
+ "eval_type": "rule",
783
+ "num_demo": 1,
784
+ "num_query": 14
785
+ },
786
+ {
787
+ "name": "chess_winner_identification",
788
+ "score": 0.4,
789
+ "eval_type": "rule",
790
+ "num_demo": 1,
791
+ "num_query": 15
792
+ },
793
+ {
794
+ "name": "perception_test_object_shuffle_video",
795
+ "score": 0.25,
796
+ "eval_type": "rule",
797
+ "num_demo": 1,
798
+ "num_query": 16
799
+ },
800
+ {
801
+ "name": "physical_property_reasoning",
802
+ "score": 1.0,
803
+ "eval_type": "rule",
804
+ "num_demo": 1,
805
+ "num_query": 14
806
+ },
807
+ {
808
+ "name": "humor_understand_caption_match",
809
+ "score": 0.7333333333333333,
810
+ "eval_type": "rule",
811
+ "num_demo": 1,
812
+ "num_query": 15
813
+ },
814
+ {
815
+ "name": "coco_object_detection_by_query_property",
816
+ "score": 0.6698026360094582,
817
+ "eval_type": "rule",
818
+ "num_demo": 1,
819
+ "num_query": 14
820
+ },
821
+ {
822
+ "name": "cam_traj_to_video_selection",
823
+ "score": 0.5,
824
+ "eval_type": "rule",
825
+ "num_demo": 1,
826
+ "num_query": 14
827
+ },
828
+ {
829
+ "name": "multilingual_game_info_parsing",
830
+ "score": 0.7857142857142857,
831
+ "eval_type": "rule",
832
+ "num_demo": 1,
833
+ "num_query": 14
834
+ },
835
+ {
836
+ "name": "mnist_pattern",
837
+ "score": 0.8571428571428571,
838
+ "eval_type": "rule",
839
+ "num_demo": 1,
840
+ "num_query": 14
841
+ },
842
+ {
843
+ "name": "cheapest_flight_identification",
844
+ "score": 0.42857142857142855,
845
+ "eval_type": "rule",
846
+ "num_demo": 1,
847
+ "num_query": 14
848
+ },
849
+ {
850
+ "name": "newspaper_page_parse_and_count",
851
+ "score": 0.4,
852
+ "eval_type": "rule",
853
+ "num_demo": 1,
854
+ "num_query": 15
855
+ },
856
+ {
857
+ "name": "clevrer_moving_direction_video",
858
+ "score": 0.1875,
859
+ "eval_type": "rule",
860
+ "num_demo": 1,
861
+ "num_query": 16
862
+ },
863
+ {
864
+ "name": "dvqa",
865
+ "score": 0.7894736842105263,
866
+ "eval_type": "rule",
867
+ "num_demo": 1,
868
+ "num_query": 19
869
+ },
870
+ {
871
+ "name": "science_basic_physics",
872
+ "score": 0.8,
873
+ "eval_type": "rule",
874
+ "num_demo": 1,
875
+ "num_query": 15
876
+ },
877
+ {
878
+ "name": "electricity_future_prediction_from_table",
879
+ "score": 0.737017543859649,
880
+ "eval_type": "rule",
881
+ "num_demo": 1,
882
+ "num_query": 19
883
+ },
884
+ {
885
+ "name": "physics_exams_v",
886
+ "score": 0.42857142857142855,
887
+ "eval_type": "rule",
888
+ "num_demo": 1,
889
+ "num_query": 14
890
+ },
891
+ {
892
+ "name": "license_plate_recognition",
893
+ "score": 0.7857142857142857,
894
+ "eval_type": "rule",
895
+ "num_demo": 1,
896
+ "num_query": 14
897
+ },
898
+ {
899
+ "name": "snli_ve_visual_entailment",
900
+ "score": 0.8666666666666667,
901
+ "eval_type": "rule",
902
+ "num_demo": 1,
903
+ "num_query": 15
904
+ },
905
+ {
906
+ "name": "game_info_retrieval",
907
+ "score": 0.5714285714285714,
908
+ "eval_type": "rule",
909
+ "num_demo": 1,
910
+ "num_query": 14
911
+ },
912
+ {
913
+ "name": "places365_scene_type_classification",
914
+ "score": 0.9285714285714286,
915
+ "eval_type": "rule",
916
+ "num_demo": 1,
917
+ "num_query": 14
918
+ },
919
+ {
920
+ "name": "music_info_retrieval",
921
+ "score": 0.42857142857142855,
922
+ "eval_type": "rule",
923
+ "num_demo": 1,
924
+ "num_query": 14
925
+ },
926
+ {
927
+ "name": "3d_indoor_scene_text_bbox_selection",
928
+ "score": 0.35714285714285715,
929
+ "eval_type": "rule",
930
+ "num_demo": 1,
931
+ "num_query": 14
932
+ },
933
+ {
934
+ "name": "code_error_line_identification",
935
+ "score": 0.35714285714285715,
936
+ "eval_type": "rule",
937
+ "num_demo": 1,
938
+ "num_query": 14
939
+ },
940
+ {
941
+ "name": "geometry_descriptive",
942
+ "score": 0.07142857142857142,
943
+ "eval_type": "rule",
944
+ "num_demo": 1,
945
+ "num_query": 14
946
+ },
947
+ {
948
+ "name": "top_rated_hotel_identification",
949
+ "score": 0.7857142857142857,
950
+ "eval_type": "rule",
951
+ "num_demo": 1,
952
+ "num_query": 14
953
+ },
954
+ {
955
+ "name": "science_molecule_chemistry",
956
+ "score": 0.9333333333333333,
957
+ "eval_type": "rule",
958
+ "num_demo": 1,
959
+ "num_query": 15
960
+ },
961
+ {
962
+ "name": "face_identity_matching",
963
+ "score": 0.8666666666666667,
964
+ "eval_type": "rule",
965
+ "num_demo": 1,
966
+ "num_query": 15
967
+ },
968
+ {
969
+ "name": "game_info_parsing",
970
+ "score": 0.9285714285714286,
971
+ "eval_type": "rule",
972
+ "num_demo": 1,
973
+ "num_query": 14
974
+ },
975
+ {
976
+ "name": "music_info_parsing",
977
+ "score": 0.6071428571428571,
978
+ "eval_type": "rule",
979
+ "num_demo": 1,
980
+ "num_query": 14
981
+ },
982
+ {
983
+ "name": "deciphering_oracle_bone",
984
+ "score": 0.07142857142857142,
985
+ "eval_type": "rule",
986
+ "num_demo": 1,
987
+ "num_query": 14
988
+ },
989
+ {
990
+ "name": "video_content_reasoning",
991
+ "score": 0.2222222222222222,
992
+ "eval_type": "rule",
993
+ "num_demo": 1,
994
+ "num_query": 9
995
+ },
996
+ {
997
+ "name": "multilingual_movie_info_parsing",
998
+ "score": 0.7346938775510203,
999
+ "eval_type": "rule",
1000
+ "num_demo": 1,
1001
+ "num_query": 14
1002
+ },
1003
+ {
1004
+ "name": "iconqa_count_and_reasoning",
1005
+ "score": 0.631578947368421,
1006
+ "eval_type": "rule",
1007
+ "num_demo": 1,
1008
+ "num_query": 19
1009
+ },
1010
+ {
1011
+ "name": "graph_connectivity",
1012
+ "score": 0.8,
1013
+ "eval_type": "rule",
1014
+ "num_demo": 1,
1015
+ "num_query": 15
1016
+ },
1017
+ {
1018
+ "name": "graph_shortest_path_planar",
1019
+ "score": 0.35714285714285715,
1020
+ "eval_type": "rule",
1021
+ "num_demo": 1,
1022
+ "num_query": 14
1023
+ },
1024
+ {
1025
+ "name": "famous_building_recognition",
1026
+ "score": 0.9375,
1027
+ "eval_type": "rule",
1028
+ "num_demo": 1,
1029
+ "num_query": 16
1030
+ },
1031
+ {
1032
+ "name": "muma_theory_of_mind_belief_of_goal",
1033
+ "score": 0.6666666666666666,
1034
+ "eval_type": "rule",
1035
+ "num_demo": 1,
1036
+ "num_query": 15
1037
+ },
1038
+ {
1039
+ "name": "signboard_identification",
1040
+ "score": 0.7857142857142857,
1041
+ "eval_type": "rule",
1042
+ "num_demo": 1,
1043
+ "num_query": 14
1044
+ },
1045
+ {
1046
+ "name": "geometry_transformation",
1047
+ "score": 0.2857142857142857,
1048
+ "eval_type": "rule",
1049
+ "num_demo": 1,
1050
+ "num_query": 14
1051
+ },
1052
+ {
1053
+ "name": "image_style_recognition",
1054
+ "score": 1.0,
1055
+ "eval_type": "rule",
1056
+ "num_demo": 1,
1057
+ "num_query": 14
1058
+ },
1059
+ {
1060
+ "name": "math_convexity_value_estimation",
1061
+ "score": 0.5207276876255985,
1062
+ "eval_type": "rule",
1063
+ "num_demo": 1,
1064
+ "num_query": 15
1065
+ },
1066
+ {
1067
+ "name": "av_vehicle_multiview_counting",
1068
+ "score": 0.2,
1069
+ "eval_type": "rule",
1070
+ "num_demo": 1,
1071
+ "num_query": 15
1072
+ },
1073
+ {
1074
+ "name": "long_string_letter_recognition",
1075
+ "score": 0.21428571428571427,
1076
+ "eval_type": "rule",
1077
+ "num_demo": 1,
1078
+ "num_query": 14
1079
+ },
1080
+ {
1081
+ "name": "3d_indoor_scene_text_bbox_prediction",
1082
+ "score": 0.19380819004670155,
1083
+ "eval_type": "rule",
1084
+ "num_demo": 1,
1085
+ "num_query": 14
1086
+ },
1087
+ {
1088
+ "name": "movie_info_parsing",
1089
+ "score": 0.7142857142857143,
1090
+ "eval_type": "rule",
1091
+ "num_demo": 1,
1092
+ "num_query": 14
1093
+ },
1094
+ {
1095
+ "name": "av_view_identification",
1096
+ "score": 0.26666666666666666,
1097
+ "eval_type": "rule",
1098
+ "num_demo": 1,
1099
+ "num_query": 15
1100
+ },
1101
+ {
1102
+ "name": "handwritten_math_expression_extraction",
1103
+ "score": 0.6428571428571429,
1104
+ "eval_type": "rule",
1105
+ "num_demo": 1,
1106
+ "num_query": 14
1107
+ },
1108
+ {
1109
+ "name": "geometry_solid",
1110
+ "score": 0.5,
1111
+ "eval_type": "rule",
1112
+ "num_demo": 1,
1113
+ "num_query": 14
1114
+ },
1115
+ {
1116
+ "name": "arc_agi",
1117
+ "score": 0.0,
1118
+ "eval_type": "rule",
1119
+ "num_demo": 1,
1120
+ "num_query": 14
1121
+ },
1122
+ {
1123
+ "name": "animal_pose_estimation",
1124
+ "score": 0.3668898610562918,
1125
+ "eval_type": "rule",
1126
+ "num_demo": 1,
1127
+ "num_query": 14
1128
+ },
1129
+ {
1130
+ "name": "single_person_pose_estimation",
1131
+ "score": 0.298785583093911,
1132
+ "eval_type": "rule",
1133
+ "num_demo": 1,
1134
+ "num_query": 14
1135
+ },
1136
+ {
1137
+ "name": "next_action_prediction",
1138
+ "score": 0.42857142857142855,
1139
+ "eval_type": "rule",
1140
+ "num_demo": 1,
1141
+ "num_query": 14
1142
+ },
1143
+ {
1144
+ "name": "human_relationship_reasoning",
1145
+ "score": 1.0,
1146
+ "eval_type": "rule",
1147
+ "num_demo": 1,
1148
+ "num_query": 14
1149
+ },
1150
+ {
1151
+ "name": "graph_shortest_path_kamada_kawai",
1152
+ "score": 0.42857142857142855,
1153
+ "eval_type": "rule",
1154
+ "num_demo": 1,
1155
+ "num_query": 14
1156
+ },
1157
+ {
1158
+ "name": "geometry_area",
1159
+ "score": 0.42857142857142855,
1160
+ "eval_type": "rule",
1161
+ "num_demo": 1,
1162
+ "num_query": 14
1163
+ },
1164
+ {
1165
+ "name": "coco_person_detection",
1166
+ "score": 0.6026048935518167,
1167
+ "eval_type": "rule",
1168
+ "num_demo": 1,
1169
+ "num_query": 14
1170
+ },
1171
+ {
1172
+ "name": "chart_vqa",
1173
+ "score": 0.7857142857142857,
1174
+ "eval_type": "rule",
1175
+ "num_demo": 1,
1176
+ "num_query": 14
1177
+ },
1178
+ {
1179
+ "name": "IAM_line_ocr_and_locate",
1180
+ "score": 0.6713087080689123,
1181
+ "eval_type": "rule",
1182
+ "num_demo": 1,
1183
+ "num_query": 14
1184
+ },
1185
+ {
1186
+ "name": "hotel_booking_confirmation_parsing",
1187
+ "score": 0.7071428571428571,
1188
+ "eval_type": "rule",
1189
+ "num_demo": 1,
1190
+ "num_query": 14
1191
+ },
1192
+ {
1193
+ "name": "clevrer_video_moving_object_property_recognition",
1194
+ "score": 0.5625,
1195
+ "eval_type": "rule",
1196
+ "num_demo": 1,
1197
+ "num_query": 16
1198
+ },
1199
+ {
1200
+ "name": "ili_ratio_future_prediction",
1201
+ "score": 0.16085714285714284,
1202
+ "eval_type": "rule",
1203
+ "num_demo": 1,
1204
+ "num_query": 14
1205
+ },
1206
+ {
1207
+ "name": "landmark_check_two_images",
1208
+ "score": 0.7777777777777779,
1209
+ "eval_type": "rule",
1210
+ "num_demo": 1,
1211
+ "num_query": 15
1212
+ },
1213
+ {
1214
+ "name": "graph_hamiltonian_path",
1215
+ "score": 0.324404761904762,
1216
+ "eval_type": "rule",
1217
+ "num_demo": 1,
1218
+ "num_query": 14
1219
+ },
1220
+ {
1221
+ "name": "vizwiz_quality_accessment_for_blind",
1222
+ "score": 0.35714285714285715,
1223
+ "eval_type": "rule",
1224
+ "num_demo": 1,
1225
+ "num_query": 14
1226
+ },
1227
+ {
1228
+ "name": "nlvr2_two_image_compare_qa",
1229
+ "score": 0.7857142857142857,
1230
+ "eval_type": "rule",
1231
+ "num_demo": 1,
1232
+ "num_query": 14
1233
+ },
1234
+ {
1235
+ "name": "electricity_load_estimate_plot",
1236
+ "score": 0.7926428571428572,
1237
+ "eval_type": "rule",
1238
+ "num_demo": 1,
1239
+ "num_query": 14
1240
+ },
1241
+ {
1242
+ "name": "weather_info_retrieval",
1243
+ "score": 0.8571428571428571,
1244
+ "eval_type": "rule",
1245
+ "num_demo": 1,
1246
+ "num_query": 14
1247
+ },
1248
+ {
1249
+ "name": "tqa_textbook_qa",
1250
+ "score": 0.8571428571428571,
1251
+ "eval_type": "rule",
1252
+ "num_demo": 1,
1253
+ "num_query": 14
1254
+ },
1255
+ {
1256
+ "name": "stock_info_parsing",
1257
+ "score": 0.9663865546218489,
1258
+ "eval_type": "rule",
1259
+ "num_demo": 1,
1260
+ "num_query": 14
1261
+ },
1262
+ {
1263
+ "name": "math_exams_v",
1264
+ "score": 0.5,
1265
+ "eval_type": "rule",
1266
+ "num_demo": 1,
1267
+ "num_query": 14
1268
+ },
1269
+ {
1270
+ "name": "coco_ood_global_image_retrieval_by_query_property",
1271
+ "score": 0.7928571428571428,
1272
+ "eval_type": "rule",
1273
+ "num_demo": 1,
1274
+ "num_query": 14
1275
+ },
1276
+ {
1277
+ "name": "quizlet_question_solving",
1278
+ "score": 0.6428571428571429,
1279
+ "eval_type": "rule",
1280
+ "num_demo": 1,
1281
+ "num_query": 14
1282
+ },
1283
+ {
1284
+ "name": "newspaper_ocr_in_query_box",
1285
+ "score": 0.5333333333333333,
1286
+ "eval_type": "rule",
1287
+ "num_demo": 1,
1288
+ "num_query": 15
1289
+ },
1290
+ {
1291
+ "name": "clevr_arithmetic",
1292
+ "score": 0.5789473684210527,
1293
+ "eval_type": "rule",
1294
+ "num_demo": 1,
1295
+ "num_query": 19
1296
+ },
1297
+ {
1298
+ "name": "stock_info_retrieval",
1299
+ "score": 0.8571428571428571,
1300
+ "eval_type": "rule",
1301
+ "num_demo": 1,
1302
+ "num_query": 14
1303
+ },
1304
+ {
1305
+ "name": "mvsa_sentiment_classification",
1306
+ "score": 0.7857142857142857,
1307
+ "eval_type": "rule",
1308
+ "num_demo": 1,
1309
+ "num_query": 14
1310
+ },
1311
+ {
1312
+ "name": "code_programming_extremely_hard",
1313
+ "score": 0.0625,
1314
+ "eval_type": "rule",
1315
+ "num_demo": 1,
1316
+ "num_query": 16
1317
+ },
1318
+ {
1319
+ "name": "av_multicamera_tracking_predict_bbox",
1320
+ "score": 0.04470906681625265,
1321
+ "eval_type": "rule",
1322
+ "num_demo": 1,
1323
+ "num_query": 14
1324
+ },
1325
+ {
1326
+ "name": "egocentric_spatial_reasoning",
1327
+ "score": 0.5555555555555556,
1328
+ "eval_type": "rule",
1329
+ "num_demo": 1,
1330
+ "num_query": 9
1331
+ },
1332
+ {
1333
+ "name": "stock_price_future_prediction",
1334
+ "score": 0.8637857142857143,
1335
+ "eval_type": "rule",
1336
+ "num_demo": 1,
1337
+ "num_query": 14
1338
+ },
1339
+ {
1340
+ "name": "Ad_count_detection",
1341
+ "score": 0.42857142857142855,
1342
+ "eval_type": "rule",
1343
+ "num_demo": 1,
1344
+ "num_query": 14
1345
+ },
1346
+ {
1347
+ "name": "recover_masked_word_in_figure",
1348
+ "score": 0.21428571428571427,
1349
+ "eval_type": "rule",
1350
+ "num_demo": 1,
1351
+ "num_query": 14
1352
+ },
1353
+ {
1354
+ "name": "polygon_interior_angles",
1355
+ "score": 0.0,
1356
+ "eval_type": "rule",
1357
+ "num_demo": 1,
1358
+ "num_query": 14
1359
+ },
1360
+ {
1361
+ "name": "web_action_grounding",
1362
+ "score": 0.6428571428571429,
1363
+ "eval_type": "rule",
1364
+ "num_demo": 1,
1365
+ "num_query": 14
1366
+ },
1367
+ {
1368
+ "name": "latex_complex_formula_convertion",
1369
+ "score": 0.29411764705882354,
1370
+ "eval_type": "rule",
1371
+ "num_demo": 1,
1372
+ "num_query": 17
1373
+ },
1374
+ {
1375
+ "name": "transit_map_intersection_points",
1376
+ "score": 0.3898809523809524,
1377
+ "eval_type": "rule",
1378
+ "num_demo": 1,
1379
+ "num_query": 14
1380
+ },
1381
+ {
1382
+ "name": "paper_review_acceptance",
1383
+ "score": 0.4666666666666667,
1384
+ "eval_type": "rule",
1385
+ "num_demo": 1,
1386
+ "num_query": 15
1387
+ },
1388
+ {
1389
+ "name": "arxiv_vqa",
1390
+ "score": 0.7857142857142857,
1391
+ "eval_type": "rule",
1392
+ "num_demo": 1,
1393
+ "num_query": 14
1394
+ },
1395
+ {
1396
+ "name": "perception_test_video_action_count",
1397
+ "score": 0.3125,
1398
+ "eval_type": "rule",
1399
+ "num_demo": 1,
1400
+ "num_query": 16
1401
+ },
1402
+ {
1403
+ "name": "code_match_problem",
1404
+ "score": 1.0,
1405
+ "eval_type": "rule",
1406
+ "num_demo": 1,
1407
+ "num_query": 14
1408
+ },
1409
+ {
1410
+ "name": "vln_hindi_next_step",
1411
+ "score": 0.2,
1412
+ "eval_type": "rule",
1413
+ "num_demo": 1,
1414
+ "num_query": 15
1415
+ },
1416
+ {
1417
+ "name": "medical_image_artifacts_indentification",
1418
+ "score": 0.21428571428571427,
1419
+ "eval_type": "rule",
1420
+ "num_demo": 1,
1421
+ "num_query": 14
1422
+ },
1423
+ {
1424
+ "name": "song_title_identification_from_lyrics",
1425
+ "score": 0.5714285714285714,
1426
+ "eval_type": "rule",
1427
+ "num_demo": 1,
1428
+ "num_query": 14
1429
+ },
1430
+ {
1431
+ "name": "medical_abdomen_endscopy_organ_recognition",
1432
+ "score": 0.3154761904761904,
1433
+ "eval_type": "rule",
1434
+ "num_demo": 1,
1435
+ "num_query": 14
1436
+ },
1437
+ {
1438
+ "name": "move_pos_to_pos_hanoi_4_pole",
1439
+ "score": 0.03571428571428571,
1440
+ "eval_type": "rule",
1441
+ "num_demo": 1,
1442
+ "num_query": 14
1443
+ },
1444
+ {
1445
+ "name": "video_camera_motion_description",
1446
+ "score": 0.21428571428571427,
1447
+ "eval_type": "rule",
1448
+ "num_demo": 1,
1449
+ "num_query": 14
1450
+ },
1451
+ {
1452
+ "name": "actor_recognition_in_Movie",
1453
+ "score": 0.9285714285714286,
1454
+ "eval_type": "rule",
1455
+ "num_demo": 1,
1456
+ "num_query": 14
1457
+ },
1458
+ {
1459
+ "name": "bongard_problem",
1460
+ "score": 0.42105263157894735,
1461
+ "eval_type": "rule",
1462
+ "num_demo": 1,
1463
+ "num_query": 19
1464
+ },
1465
+ {
1466
+ "name": "ascii_art_understanding",
1467
+ "score": 0.7142857142857143,
1468
+ "eval_type": "rule",
1469
+ "num_demo": 1,
1470
+ "num_query": 14
1471
+ },
1472
+ {
1473
+ "name": "calendar_schedule_suggestion",
1474
+ "score": 0.6428571428571429,
1475
+ "eval_type": "rule",
1476
+ "num_demo": 1,
1477
+ "num_query": 14
1478
+ },
1479
+ {
1480
+ "name": "geometry_reasoning_overlapped_circle",
1481
+ "score": 0.75,
1482
+ "eval_type": "rule",
1483
+ "num_demo": 1,
1484
+ "num_query": 14
1485
+ },
1486
+ {
1487
+ "name": "planning_screenshot_barman",
1488
+ "score": 0.26666666666666666,
1489
+ "eval_type": "rule",
1490
+ "num_demo": 1,
1491
+ "num_query": 15
1492
+ },
1493
+ {
1494
+ "name": "planning_screenshot_floortile",
1495
+ "score": 0.0,
1496
+ "eval_type": "rule",
1497
+ "num_demo": 1,
1498
+ "num_query": 15
1499
+ },
1500
+ {
1501
+ "name": "nextqa_mc",
1502
+ "score": 0.7894736842105263,
1503
+ "eval_type": "rule",
1504
+ "num_demo": 1,
1505
+ "num_query": 19
1506
+ },
1507
+ {
1508
+ "name": "graph_isomorphism",
1509
+ "score": 0.5333333333333333,
1510
+ "eval_type": "rule",
1511
+ "num_demo": 1,
1512
+ "num_query": 15
1513
+ },
1514
+ {
1515
+ "name": "code_programming_test_easy",
1516
+ "score": 0.375,
1517
+ "eval_type": "rule",
1518
+ "num_demo": 1,
1519
+ "num_query": 24
1520
+ },
1521
+ {
1522
+ "name": "biology_exams_v",
1523
+ "score": 0.6428571428571429,
1524
+ "eval_type": "rule",
1525
+ "num_demo": 1,
1526
+ "num_query": 14
1527
+ },
1528
+ {
1529
+ "name": "long_string_number_recognition",
1530
+ "score": 1.0,
1531
+ "eval_type": "rule",
1532
+ "num_demo": 1,
1533
+ "num_query": 14
1534
+ },
1535
+ {
1536
+ "name": "kvqa_knowledge_aware_qa",
1537
+ "score": 0.42105263157894735,
1538
+ "eval_type": "rule",
1539
+ "num_demo": 1,
1540
+ "num_query": 19
1541
+ },
1542
+ {
1543
+ "name": "video_grounding_temporal",
1544
+ "score": 0.26666666666666666,
1545
+ "eval_type": "rule",
1546
+ "num_demo": 1,
1547
+ "num_query": 15
1548
+ },
1549
+ {
1550
+ "name": "math_breakpoint",
1551
+ "score": 0.8,
1552
+ "eval_type": "rule",
1553
+ "num_demo": 1,
1554
+ "num_query": 15
1555
+ },
1556
+ {
1557
+ "name": "ancient_map_understanding",
1558
+ "score": 0.8571428571428571,
1559
+ "eval_type": "rule",
1560
+ "num_demo": 1,
1561
+ "num_query": 14
1562
+ },
1563
+ {
1564
+ "name": "landmark_recognition_and_qa",
1565
+ "score": 0.7111111111111111,
1566
+ "eval_type": "rule",
1567
+ "num_demo": 1,
1568
+ "num_query": 15
1569
+ },
1570
+ {
1571
+ "name": "code_execution",
1572
+ "score": 0.8125,
1573
+ "eval_type": "rule",
1574
+ "num_demo": 1,
1575
+ "num_query": 16
1576
+ },
1577
+ {
1578
+ "name": "music_sheet_format_QA",
1579
+ "score": 0.6428571428571429,
1580
+ "eval_type": "rule",
1581
+ "num_demo": 1,
1582
+ "num_query": 14
1583
+ },
1584
+ {
1585
+ "name": "code_solution_compare",
1586
+ "score": 0.35714285714285715,
1587
+ "eval_type": "rule",
1588
+ "num_demo": 1,
1589
+ "num_query": 14
1590
+ },
1591
+ {
1592
+ "name": "annoying_word_search",
1593
+ "score": 0.0,
1594
+ "eval_type": "rule",
1595
+ "num_demo": 1,
1596
+ "num_query": 14
1597
+ },
1598
+ {
1599
+ "name": "interpret_force_perspective_illusion",
1600
+ "score": 0.6666666666666666,
1601
+ "eval_type": "rule",
1602
+ "num_demo": 1,
1603
+ "num_query": 15
1604
+ },
1605
+ {
1606
+ "name": "muma_theory_of_mind_social_goal",
1607
+ "score": 0.4666666666666667,
1608
+ "eval_type": "rule",
1609
+ "num_demo": 1,
1610
+ "num_query": 15
1611
+ },
1612
+ {
1613
+ "name": "healthcare_info_judgement",
1614
+ "score": 0.9285714285714286,
1615
+ "eval_type": "rule",
1616
+ "num_demo": 1,
1617
+ "num_query": 14
1618
+ },
1619
+ {
1620
+ "name": "photo_sharing_image_retrieval",
1621
+ "score": 0.7142857142857143,
1622
+ "eval_type": "rule",
1623
+ "num_demo": 1,
1624
+ "num_query": 14
1625
+ },
1626
+ {
1627
+ "name": "multiview_reasoning_camera_moving",
1628
+ "score": 0.42857142857142855,
1629
+ "eval_type": "rule",
1630
+ "num_demo": 1,
1631
+ "num_query": 14
1632
+ },
1633
+ {
1634
+ "name": "geometry_plot_position_relationship",
1635
+ "score": 0.8571428571428571,
1636
+ "eval_type": "rule",
1637
+ "num_demo": 1,
1638
+ "num_query": 14
1639
+ },
1640
+ {
1641
+ "name": "map_diagram_qa",
1642
+ "score": 0.35714285714285715,
1643
+ "eval_type": "rule",
1644
+ "num_demo": 1,
1645
+ "num_query": 14
1646
+ },
1647
+ {
1648
+ "name": "movie_info_retrieval",
1649
+ "score": 0.9285714285714286,
1650
+ "eval_type": "rule",
1651
+ "num_demo": 1,
1652
+ "num_query": 14
1653
+ },
1654
+ {
1655
+ "name": "vln_identify_location",
1656
+ "score": 0.2121212121212121,
1657
+ "eval_type": "rule",
1658
+ "num_demo": 1,
1659
+ "num_query": 15
1660
+ },
1661
+ {
1662
+ "name": "pmc_vqa_medical_image_qa",
1663
+ "score": 0.5789473684210527,
1664
+ "eval_type": "rule",
1665
+ "num_demo": 1,
1666
+ "num_query": 19
1667
+ },
1668
+ {
1669
+ "name": "medical_polyp_segmentation_single_object_rater",
1670
+ "score": 0.35714285714285715,
1671
+ "eval_type": "rule",
1672
+ "num_demo": 1,
1673
+ "num_query": 14
1674
+ },
1675
+ {
1676
+ "name": "logical_reasoning_2D_views_of_3D_shapes",
1677
+ "score": 0.21428571428571427,
1678
+ "eval_type": "rule",
1679
+ "num_demo": 1,
1680
+ "num_query": 14
1681
+ },
1682
+ {
1683
+ "name": "medical_blood_vessels_recognition",
1684
+ "score": 0.7142857142857143,
1685
+ "eval_type": "rule",
1686
+ "num_demo": 1,
1687
+ "num_query": 14
1688
+ },
1689
+ {
1690
+ "name": "medical_abdomen_MRI_organ_recognition",
1691
+ "score": 0.3988095238095238,
1692
+ "eval_type": "rule",
1693
+ "num_demo": 1,
1694
+ "num_query": 14
1695
+ },
1696
+ {
1697
+ "name": "relative_depth_of_different_points",
1698
+ "score": 0.6428571428571429,
1699
+ "eval_type": "rule",
1700
+ "num_demo": 1,
1701
+ "num_query": 14
1702
+ },
1703
+ {
1704
+ "name": "google_streetview_circle_sorting",
1705
+ "score": 0.06666666666666667,
1706
+ "eval_type": "rule",
1707
+ "num_demo": 1,
1708
+ "num_query": 15
1709
+ },
1710
+ {
1711
+ "name": "location_vqa",
1712
+ "score": 0.5,
1713
+ "eval_type": "rule",
1714
+ "num_demo": 1,
1715
+ "num_query": 14
1716
+ },
1717
+ {
1718
+ "name": "topological_sort",
1719
+ "score": 0.0,
1720
+ "eval_type": "rule",
1721
+ "num_demo": 1,
1722
+ "num_query": 14
1723
+ },
1724
+ {
1725
+ "name": "mindmap_elements_parsing",
1726
+ "score": 0.35714285714285715,
1727
+ "eval_type": "rule",
1728
+ "num_demo": 1,
1729
+ "num_query": 14
1730
+ },
1731
+ {
1732
+ "name": "code_add_tag",
1733
+ "score": 0.6,
1734
+ "eval_type": "rule",
1735
+ "num_demo": 1,
1736
+ "num_query": 15
1737
+ },
1738
+ {
1739
+ "name": "painting_QA",
1740
+ "score": 0.7857142857142857,
1741
+ "eval_type": "rule",
1742
+ "num_demo": 1,
1743
+ "num_query": 14
1744
+ },
1745
+ {
1746
+ "name": "vln_identify_robot",
1747
+ "score": 0.4,
1748
+ "eval_type": "rule",
1749
+ "num_demo": 1,
1750
+ "num_query": 15
1751
+ },
1752
+ {
1753
+ "name": "planning_visual_barman",
1754
+ "score": 0.0,
1755
+ "eval_type": "rule",
1756
+ "num_demo": 1,
1757
+ "num_query": 15
1758
+ },
1759
+ {
1760
+ "name": "TRANCE_physics_reasoning_view",
1761
+ "score": 0.21428571428571427,
1762
+ "eval_type": "rule",
1763
+ "num_demo": 1,
1764
+ "num_query": 14
1765
+ },
1766
+ {
1767
+ "name": "code_translation_hard",
1768
+ "score": 0.05952380952380952,
1769
+ "eval_type": "rule",
1770
+ "num_demo": 1,
1771
+ "num_query": 14
1772
+ },
1773
+ {
1774
+ "name": "scibench_fundamental_wo_solution",
1775
+ "score": 0.42857142857142855,
1776
+ "eval_type": "rule",
1777
+ "num_demo": 1,
1778
+ "num_query": 49
1779
+ },
1780
+ {
1781
+ "name": "2d_image_jigsaw_puzzle_easy",
1782
+ "score": 0.1976190476190476,
1783
+ "eval_type": "rule",
1784
+ "num_demo": 1,
1785
+ "num_query": 14
1786
+ },
1787
+ {
1788
+ "name": "geometry_reasoning_nested_squares",
1789
+ "score": 0.42857142857142855,
1790
+ "eval_type": "rule",
1791
+ "num_demo": 1,
1792
+ "num_query": 14
1793
+ },
1794
+ {
1795
+ "name": "font_recognition",
1796
+ "score": 0.07142857142857142,
1797
+ "eval_type": "rule",
1798
+ "num_demo": 1,
1799
+ "num_query": 14
1800
+ },
1801
+ {
1802
+ "name": "rocks_samples_compare",
1803
+ "score": 0.6428571428571429,
1804
+ "eval_type": "rule",
1805
+ "num_demo": 1,
1806
+ "num_query": 14
1807
+ },
1808
+ {
1809
+ "name": "mensa_iq_test",
1810
+ "score": 0.4372549019607843,
1811
+ "eval_type": "rule",
1812
+ "num_demo": 1,
1813
+ "num_query": 17
1814
+ },
1815
+ {
1816
+ "name": "dish_ingredient_match",
1817
+ "score": 0.14285714285714285,
1818
+ "eval_type": "rule",
1819
+ "num_demo": 1,
1820
+ "num_query": 14
1821
+ },
1822
+ {
1823
+ "name": "flowchart_code_generation",
1824
+ "score": 0.5555555555555556,
1825
+ "eval_type": "rule",
1826
+ "num_demo": 1,
1827
+ "num_query": 9
1828
+ },
1829
+ {
1830
+ "name": "geometry_reasoning_count_line_intersections",
1831
+ "score": 0.6428571428571429,
1832
+ "eval_type": "rule",
1833
+ "num_demo": 1,
1834
+ "num_query": 14
1835
+ },
1836
+ {
1837
+ "name": "functionality_matching_in_different_objects",
1838
+ "score": 0.7142857142857143,
1839
+ "eval_type": "rule",
1840
+ "num_demo": 1,
1841
+ "num_query": 14
1842
+ },
1843
+ {
1844
+ "name": "worldle",
1845
+ "score": 0.5549424986515583,
1846
+ "eval_type": "rule",
1847
+ "num_demo": 1,
1848
+ "num_query": 14
1849
+ },
1850
+ {
1851
+ "name": "stackoverflow_debug_QA",
1852
+ "score": 0.6428571428571429,
1853
+ "eval_type": "rule",
1854
+ "num_demo": 1,
1855
+ "num_query": 14
1856
+ },
1857
+ {
1858
+ "name": "logical_reasoning_find_odd_one_out",
1859
+ "score": 0.2857142857142857,
1860
+ "eval_type": "rule",
1861
+ "num_demo": 1,
1862
+ "num_query": 14
1863
+ },
1864
+ {
1865
+ "name": "circuit_diagram_understanding",
1866
+ "score": 0.5333333333333333,
1867
+ "eval_type": "rule",
1868
+ "num_demo": 1,
1869
+ "num_query": 15
1870
+ },
1871
+ {
1872
+ "name": "web_action_prediction",
1873
+ "score": 0.8571428571428571,
1874
+ "eval_type": "rule",
1875
+ "num_demo": 1,
1876
+ "num_query": 14
1877
+ },
1878
+ {
1879
+ "name": "medical_retrieval_given_surgeon_activity",
1880
+ "score": 0.42857142857142855,
1881
+ "eval_type": "rule",
1882
+ "num_demo": 1,
1883
+ "num_query": 14
1884
+ },
1885
+ {
1886
+ "name": "google_streetview_line_sorting",
1887
+ "score": 0.26666666666666666,
1888
+ "eval_type": "rule",
1889
+ "num_demo": 1,
1890
+ "num_query": 15
1891
+ },
1892
+ {
1893
+ "name": "recipe_image_ordering",
1894
+ "score": 0.8571428571428571,
1895
+ "eval_type": "rule",
1896
+ "num_demo": 1,
1897
+ "num_query": 14
1898
+ },
1899
+ {
1900
+ "name": "music_sheet_sentiment",
1901
+ "score": 0.5,
1902
+ "eval_type": "rule",
1903
+ "num_demo": 1,
1904
+ "num_query": 14
1905
+ },
1906
+ {
1907
+ "name": "TRANCE_physics_reasoning_basic",
1908
+ "score": 0.7058823529411765,
1909
+ "eval_type": "rule",
1910
+ "num_demo": 1,
1911
+ "num_query": 17
1912
+ },
1913
+ {
1914
+ "name": "signage_navigation",
1915
+ "score": 0.6428571428571429,
1916
+ "eval_type": "rule",
1917
+ "num_demo": 1,
1918
+ "num_query": 14
1919
+ },
1920
+ {
1921
+ "name": "go_capture_stone",
1922
+ "score": 0.13333333333333333,
1923
+ "eval_type": "rule",
1924
+ "num_demo": 1,
1925
+ "num_query": 15
1926
+ },
1927
+ {
1928
+ "name": "google_streetview_direction_understanding",
1929
+ "score": 0.5714285714285714,
1930
+ "eval_type": "rule",
1931
+ "num_demo": 1,
1932
+ "num_query": 14
1933
+ },
1934
+ {
1935
+ "name": "code_translation_advanced",
1936
+ "score": 0.14285714285714285,
1937
+ "eval_type": "rule",
1938
+ "num_demo": 1,
1939
+ "num_query": 14
1940
+ },
1941
+ {
1942
+ "name": "code_retrieval",
1943
+ "score": 0.42857142857142855,
1944
+ "eval_type": "rule",
1945
+ "num_demo": 1,
1946
+ "num_query": 14
1947
+ },
1948
+ {
1949
+ "name": "comic_page_ordering",
1950
+ "score": 0.0,
1951
+ "eval_type": "rule",
1952
+ "num_demo": 1,
1953
+ "num_query": 14
1954
+ },
1955
+ {
1956
+ "name": "movie_retrieval_by_actor",
1957
+ "score": 0.9285714285714286,
1958
+ "eval_type": "rule",
1959
+ "num_demo": 1,
1960
+ "num_query": 14
1961
+ },
1962
+ {
1963
+ "name": "webpage_code_understanding",
1964
+ "score": 0.6666666666666666,
1965
+ "eval_type": "rule",
1966
+ "num_demo": 1,
1967
+ "num_query": 9
1968
+ },
1969
+ {
1970
+ "name": "counting_multi_image",
1971
+ "score": 0.6428571428571429,
1972
+ "eval_type": "rule",
1973
+ "num_demo": 1,
1974
+ "num_query": 14
1975
+ },
1976
+ {
1977
+ "name": "video_eval_visual_pref",
1978
+ "score": 0.5625,
1979
+ "eval_type": "rule",
1980
+ "num_demo": 1,
1981
+ "num_query": 16
1982
+ },
1983
+ {
1984
+ "name": "visual_correspondance_in_two_images",
1985
+ "score": 0.7142857142857143,
1986
+ "eval_type": "rule",
1987
+ "num_demo": 1,
1988
+ "num_query": 14
1989
+ },
1990
+ {
1991
+ "name": "forensic_detection_of_different_images",
1992
+ "score": 0.7142857142857143,
1993
+ "eval_type": "rule",
1994
+ "num_demo": 1,
1995
+ "num_query": 14
1996
+ },
1997
+ {
1998
+ "name": "sign_language",
1999
+ "score": 0.0,
2000
+ "eval_type": "rule",
2001
+ "num_demo": 1,
2002
+ "num_query": 14
2003
+ },
2004
+ {
2005
+ "name": "monthly_weather_days_count",
2006
+ "score": 0.42857142857142855,
2007
+ "eval_type": "rule",
2008
+ "num_demo": 1,
2009
+ "num_query": 14
2010
+ },
2011
+ {
2012
+ "name": "medical_counting_lymphocytes",
2013
+ "score": 0.0,
2014
+ "eval_type": "rule",
2015
+ "num_demo": 1,
2016
+ "num_query": 14
2017
+ },
2018
+ {
2019
+ "name": "weather_map_climate_type_temperature_parsing",
2020
+ "score": 0.6428571428571429,
2021
+ "eval_type": "rule",
2022
+ "num_demo": 1,
2023
+ "num_query": 14
2024
+ },
2025
+ {
2026
+ "name": "knowledge_sign_recognition",
2027
+ "score": 0.4444444444444444,
2028
+ "eval_type": "rule",
2029
+ "num_demo": 1,
2030
+ "num_query": 9
2031
+ },
2032
+ {
2033
+ "name": "top_video_creator_identification",
2034
+ "score": 0.35714285714285715,
2035
+ "eval_type": "rule",
2036
+ "num_demo": 1,
2037
+ "num_query": 14
2038
+ },
2039
+ {
2040
+ "name": "code_visualization_output_understanding",
2041
+ "score": 0.5,
2042
+ "eval_type": "rule",
2043
+ "num_demo": 1,
2044
+ "num_query": 10
2045
+ },
2046
+ {
2047
+ "name": "TRANCE_physics_reasoning_event",
2048
+ "score": 0.14285714285714285,
2049
+ "eval_type": "rule",
2050
+ "num_demo": 1,
2051
+ "num_query": 14
2052
+ },
2053
+ {
2054
+ "name": "rebus",
2055
+ "score": 0.43478260869565216,
2056
+ "eval_type": "rule",
2057
+ "num_demo": 1,
2058
+ "num_query": 23
2059
+ },
2060
+ {
2061
+ "name": "vln_tegulu_next_step",
2062
+ "score": 0.13333333333333333,
2063
+ "eval_type": "rule",
2064
+ "num_demo": 1,
2065
+ "num_query": 15
2066
+ },
2067
+ {
2068
+ "name": "video_grounding_spatial",
2069
+ "score": 0.5,
2070
+ "eval_type": "rule",
2071
+ "num_demo": 1,
2072
+ "num_query": 14
2073
+ },
2074
+ {
2075
+ "name": "ishihara_test",
2076
+ "score": 0.19285714285714287,
2077
+ "eval_type": "rule",
2078
+ "num_demo": 1,
2079
+ "num_query": 14
2080
+ },
2081
+ {
2082
+ "name": "music_sheet_name",
2083
+ "score": 0.5333333333333333,
2084
+ "eval_type": "rule",
2085
+ "num_demo": 1,
2086
+ "num_query": 15
2087
+ },
2088
+ {
2089
+ "name": "paper_vqa",
2090
+ "score": 0.42857142857142855,
2091
+ "eval_type": "rule",
2092
+ "num_demo": 1,
2093
+ "num_query": 14
2094
+ },
2095
+ {
2096
+ "name": "vln_english_next_step",
2097
+ "score": 0.13333333333333333,
2098
+ "eval_type": "rule",
2099
+ "num_demo": 1,
2100
+ "num_query": 15
2101
+ },
2102
+ {
2103
+ "name": "google_streetview_circle_reasoning",
2104
+ "score": 0.4,
2105
+ "eval_type": "rule",
2106
+ "num_demo": 1,
2107
+ "num_query": 15
2108
+ },
2109
+ {
2110
+ "name": "product_ocr_qa",
2111
+ "score": 0.42857142857142855,
2112
+ "eval_type": "rule",
2113
+ "num_demo": 1,
2114
+ "num_query": 14
2115
+ },
2116
+ {
2117
+ "name": "multilingual_news_qa",
2118
+ "score": 0.7857142857142857,
2119
+ "eval_type": "rule",
2120
+ "num_demo": 1,
2121
+ "num_query": 14
2122
+ },
2123
+ {
2124
+ "name": "geometry_reasoning_circled_letter",
2125
+ "score": 0.7857142857142857,
2126
+ "eval_type": "rule",
2127
+ "num_demo": 1,
2128
+ "num_query": 14
2129
+ },
2130
+ {
2131
+ "name": "music_sheet_author",
2132
+ "score": 0.125,
2133
+ "eval_type": "rule",
2134
+ "num_demo": 1,
2135
+ "num_query": 16
2136
+ },
2137
+ {
2138
+ "name": "GUI_Act_Web_Single",
2139
+ "score": 0.026658939437844104,
2140
+ "eval_type": "rule",
2141
+ "num_demo": 1,
2142
+ "num_query": 14
2143
+ },
2144
+ {
2145
+ "name": "planning_visual_blocksworld",
2146
+ "score": 0.13333333333333333,
2147
+ "eval_type": "rule",
2148
+ "num_demo": 1,
2149
+ "num_query": 15
2150
+ },
2151
+ {
2152
+ "name": "game_platform_support_identification",
2153
+ "score": 0.9642857142857143,
2154
+ "eval_type": "rule",
2155
+ "num_demo": 1,
2156
+ "num_query": 14
2157
+ },
2158
+ {
2159
+ "name": "GUI_Act_Mobile_swipe",
2160
+ "score": 0.5319532782862981,
2161
+ "eval_type": "rule",
2162
+ "num_demo": 1,
2163
+ "num_query": 13
2164
+ },
2165
+ {
2166
+ "name": "mahjong",
2167
+ "score": 0.0,
2168
+ "eval_type": "rule",
2169
+ "num_demo": 1,
2170
+ "num_query": 14
2171
+ },
2172
+ {
2173
+ "name": "booking_web_recommendation",
2174
+ "score": 0.7117630385487528,
2175
+ "eval_type": "rule",
2176
+ "num_demo": 1,
2177
+ "num_query": 14
2178
+ },
2179
+ {
2180
+ "name": "video_eval_dynamic_pref",
2181
+ "score": 0.9375,
2182
+ "eval_type": "rule",
2183
+ "num_demo": 1,
2184
+ "num_query": 16
2185
+ },
2186
+ {
2187
+ "name": "extract_webpage_headline",
2188
+ "score": 0.5714285714285714,
2189
+ "eval_type": "rule",
2190
+ "num_demo": 1,
2191
+ "num_query": 14
2192
+ },
2193
+ {
2194
+ "name": "video_action_recognition",
2195
+ "score": 0.8214285714285714,
2196
+ "eval_type": "rule",
2197
+ "num_demo": 1,
2198
+ "num_query": 14
2199
+ },
2200
+ {
2201
+ "name": "planning_screenshot_storage",
2202
+ "score": 0.26666666666666666,
2203
+ "eval_type": "rule",
2204
+ "num_demo": 1,
2205
+ "num_query": 15
2206
+ },
2207
+ {
2208
+ "name": "scibench_calculus_wo_solution",
2209
+ "score": 0.4489795918367347,
2210
+ "eval_type": "rule",
2211
+ "num_demo": 1,
2212
+ "num_query": 49
2213
+ },
2214
+ {
2215
+ "name": "knowledge_graph_understanding",
2216
+ "score": 0.8,
2217
+ "eval_type": "rule",
2218
+ "num_demo": 1,
2219
+ "num_query": 15
2220
+ },
2221
+ {
2222
+ "name": "media_homepage_profile",
2223
+ "score": 0.21281397174254318,
2224
+ "eval_type": "rule",
2225
+ "num_demo": 1,
2226
+ "num_query": 14
2227
+ },
2228
+ {
2229
+ "name": "image_translation_en2cn",
2230
+ "score": 0.4747283144053553,
2231
+ "eval_type": "rule",
2232
+ "num_demo": 1,
2233
+ "num_query": 9
2234
+ },
2235
+ {
2236
+ "name": "google_streetview_line_reasoning",
2237
+ "score": 0.6,
2238
+ "eval_type": "rule",
2239
+ "num_demo": 1,
2240
+ "num_query": 15
2241
+ },
2242
+ {
2243
+ "name": "realworld_qa_en2cn",
2244
+ "score": 0.7142857142857143,
2245
+ "eval_type": "rule",
2246
+ "num_demo": 1,
2247
+ "num_query": 14
2248
+ },
2249
+ {
2250
+ "name": "code_translation_easy",
2251
+ "score": 0.5952380952380951,
2252
+ "eval_type": "rule",
2253
+ "num_demo": 1,
2254
+ "num_query": 14
2255
+ },
2256
+ {
2257
+ "name": "soccer_offside",
2258
+ "score": 0.5555555555555556,
2259
+ "eval_type": "rule",
2260
+ "num_demo": 1,
2261
+ "num_query": 9
2262
+ },
2263
+ {
2264
+ "name": "planning_visual_floortile",
2265
+ "score": 0.0,
2266
+ "eval_type": "rule",
2267
+ "num_demo": 1,
2268
+ "num_query": 15
2269
+ },
2270
+ {
2271
+ "name": "planning_visual_storage",
2272
+ "score": 0.0,
2273
+ "eval_type": "rule",
2274
+ "num_demo": 1,
2275
+ "num_query": 15
2276
+ },
2277
+ {
2278
+ "name": "video_segments_reordering",
2279
+ "score": 0.14285714285714285,
2280
+ "eval_type": "rule",
2281
+ "num_demo": 1,
2282
+ "num_query": 14
2283
+ },
2284
+ {
2285
+ "name": "medical_parasite_detection",
2286
+ "score": 0.7142857142857143,
2287
+ "eval_type": "rule",
2288
+ "num_demo": 1,
2289
+ "num_query": 14
2290
+ },
2291
+ {
2292
+ "name": "video_intent_recognition",
2293
+ "score": 0.7142857142857143,
2294
+ "eval_type": "rule",
2295
+ "num_demo": 1,
2296
+ "num_query": 14
2297
+ },
2298
+ {
2299
+ "name": "geometry_reasoning_grid",
2300
+ "score": 0.75,
2301
+ "eval_type": "rule",
2302
+ "num_demo": 1,
2303
+ "num_query": 14
2304
+ },
2305
+ {
2306
+ "name": "GUI_Act_Web_Multi",
2307
+ "score": 0.23629435627595965,
2308
+ "eval_type": "rule",
2309
+ "num_demo": 1,
2310
+ "num_query": 14
2311
+ },
2312
+ {
2313
+ "name": "chinese_idiom_recognition",
2314
+ "score": 0.7142857142857143,
2315
+ "eval_type": "rule",
2316
+ "num_demo": 1,
2317
+ "num_query": 14
2318
+ },
2319
+ {
2320
+ "name": "relative_reflectance_of_different_regions",
2321
+ "score": 0.21428571428571427,
2322
+ "eval_type": "rule",
2323
+ "num_demo": 1,
2324
+ "num_query": 14
2325
+ },
2326
+ {
2327
+ "name": "number_comparison",
2328
+ "score": 1.0,
2329
+ "eval_type": "rule",
2330
+ "num_demo": 1,
2331
+ "num_query": 14
2332
+ },
2333
+ {
2334
+ "name": "entertainment_web_game_style",
2335
+ "score": 0.7857142857142857,
2336
+ "eval_type": "rule",
2337
+ "num_demo": 1,
2338
+ "num_query": 14
2339
+ },
2340
+ {
2341
+ "name": "media_recommend_solutions_stackoverflow",
2342
+ "score": 0.7857142857142857,
2343
+ "eval_type": "rule",
2344
+ "num_demo": 1,
2345
+ "num_query": 14
2346
+ },
2347
+ {
2348
+ "name": "orchestra_score_recognition",
2349
+ "score": 0.21428571428571427,
2350
+ "eval_type": "rule",
2351
+ "num_demo": 1,
2352
+ "num_query": 14
2353
+ },
2354
+ {
2355
+ "name": "planning_visual_termes",
2356
+ "score": 0.0,
2357
+ "eval_type": "rule",
2358
+ "num_demo": 1,
2359
+ "num_query": 15
2360
+ },
2361
+ {
2362
+ "name": "video_eval_factual_pref",
2363
+ "score": 0.7857142857142857,
2364
+ "eval_type": "rule",
2365
+ "num_demo": 1,
2366
+ "num_query": 14
2367
+ },
2368
+ {
2369
+ "name": "planning_screenshot_blocksworld",
2370
+ "score": 0.3333333333333333,
2371
+ "eval_type": "rule",
2372
+ "num_demo": 1,
2373
+ "num_query": 15
2374
+ },
2375
+ {
2376
+ "name": "icon_arithmetic_puzzle",
2377
+ "score": 0.6785714285714286,
2378
+ "eval_type": "rule",
2379
+ "num_demo": 1,
2380
+ "num_query": 14
2381
+ },
2382
+ {
2383
+ "name": "planning_visual_grippers",
2384
+ "score": 0.2,
2385
+ "eval_type": "rule",
2386
+ "num_demo": 1,
2387
+ "num_query": 15
2388
+ },
2389
+ {
2390
+ "name": "planning_screenshot_grippers",
2391
+ "score": 0.4,
2392
+ "eval_type": "rule",
2393
+ "num_demo": 1,
2394
+ "num_query": 15
2395
+ },
2396
+ {
2397
+ "name": "character_recognition_in_TV_shows",
2398
+ "score": 0.6428571428571429,
2399
+ "eval_type": "rule",
2400
+ "num_demo": 1,
2401
+ "num_query": 14
2402
+ },
2403
+ {
2404
+ "name": "highest_discount_game_price_identification",
2405
+ "score": 1.0,
2406
+ "eval_type": "rule",
2407
+ "num_demo": 1,
2408
+ "num_query": 14
2409
+ },
2410
+ {
2411
+ "name": "remaining_playback_time_calculation",
2412
+ "score": 0.0,
2413
+ "eval_type": "rule",
2414
+ "num_demo": 1,
2415
+ "num_query": 14
2416
+ },
2417
+ {
2418
+ "name": "medical_cell_recognition",
2419
+ "score": 0.5714285714285714,
2420
+ "eval_type": "rule",
2421
+ "num_demo": 1,
2422
+ "num_query": 14
2423
+ },
2424
+ {
2425
+ "name": "pokemon_3D_recognition",
2426
+ "score": 0.7333333333333333,
2427
+ "eval_type": "rule",
2428
+ "num_demo": 1,
2429
+ "num_query": 15
2430
+ },
2431
+ {
2432
+ "name": "chess_find_legal_moves",
2433
+ "score": 0.20779353910574844,
2434
+ "eval_type": "rule",
2435
+ "num_demo": 1,
2436
+ "num_query": 14
2437
+ },
2438
+ {
2439
+ "name": "rocks_samples_identify",
2440
+ "score": 0.4,
2441
+ "eval_type": "rule",
2442
+ "num_demo": 1,
2443
+ "num_query": 15
2444
+ },
2445
+ {
2446
+ "name": "paper_review_rating",
2447
+ "score": 0.7641625001389684,
2448
+ "eval_type": "rule",
2449
+ "num_demo": 1,
2450
+ "num_query": 15
2451
+ },
2452
+ {
2453
+ "name": "distinguish_ai_generated_image",
2454
+ "score": 0.8421052631578947,
2455
+ "eval_type": "rule",
2456
+ "num_demo": 1,
2457
+ "num_query": 19
2458
+ },
2459
+ {
2460
+ "name": "autonomous_driving_scene_analysis",
2461
+ "score": 1.0,
2462
+ "eval_type": "rule",
2463
+ "num_demo": 1,
2464
+ "num_query": 14
2465
+ },
2466
+ {
2467
+ "name": "code_translation_Python",
2468
+ "score": 0.5416666666666667,
2469
+ "eval_type": "rule",
2470
+ "num_demo": 1,
2471
+ "num_query": 16
2472
+ },
2473
+ {
2474
+ "name": "counting_single_image",
2475
+ "score": 0.7857142857142857,
2476
+ "eval_type": "rule",
2477
+ "num_demo": 1,
2478
+ "num_query": 14
2479
+ },
2480
+ {
2481
+ "name": "MMMU_pro_exam_screenshot",
2482
+ "score": 0.40404040404040403,
2483
+ "eval_type": "rule",
2484
+ "num_demo": 1,
2485
+ "num_query": 99
2486
+ },
2487
+ {
2488
+ "name": "GUI_Act_Mobile_tap",
2489
+ "score": 0.2857142857142857,
2490
+ "eval_type": "rule",
2491
+ "num_demo": 1,
2492
+ "num_query": 14
2493
+ },
2494
+ {
2495
+ "name": "road_map_find_highway_between_two_place",
2496
+ "score": 0.8235294117647058,
2497
+ "eval_type": "rule",
2498
+ "num_demo": 1,
2499
+ "num_query": 17
2500
+ },
2501
+ {
2502
+ "name": "waldo",
2503
+ "score": 0.001960708071909715,
2504
+ "eval_type": "rule",
2505
+ "num_demo": 1,
2506
+ "num_query": 18
2507
+ },
2508
+ {
2509
+ "name": "clevrer_physics",
2510
+ "score": 0.45,
2511
+ "eval_type": "rule",
2512
+ "num_demo": 1,
2513
+ "num_query": 20
2514
+ },
2515
+ {
2516
+ "name": "chess_sygyzy_endgames",
2517
+ "score": 0.15739022881880024,
2518
+ "eval_type": "rule",
2519
+ "num_demo": 1,
2520
+ "num_query": 14
2521
+ },
2522
+ {
2523
+ "name": "llavaguard",
2524
+ "score": 0.5357142857142857,
2525
+ "eval_type": "rule",
2526
+ "num_demo": 1,
2527
+ "num_query": 14
2528
+ },
2529
+ {
2530
+ "name": "photoshop_operation",
2531
+ "score": 0.29523809523809524,
2532
+ "eval_type": "rule",
2533
+ "num_demo": 1,
2534
+ "num_query": 14
2535
+ },
2536
+ {
2537
+ "name": "MMMU_physics_chemistry_selected",
2538
+ "score": 0.8571428571428571,
2539
+ "eval_type": "rule",
2540
+ "num_demo": 1,
2541
+ "num_query": 14
2542
+ },
2543
+ {
2544
+ "name": "medical_multi_organ_segmentation_rater",
2545
+ "score": 0.35714285714285715,
2546
+ "eval_type": "rule",
2547
+ "num_demo": 1,
2548
+ "num_query": 14
2549
+ },
2550
+ {
2551
+ "name": "cultural_vqa",
2552
+ "score": 0.5333333333333333,
2553
+ "eval_type": "rule",
2554
+ "num_demo": 1,
2555
+ "num_query": 15
2556
+ },
2557
+ {
2558
+ "name": "medical_content_based_retrieval_radiology",
2559
+ "score": 0.9285714285714286,
2560
+ "eval_type": "rule",
2561
+ "num_demo": 1,
2562
+ "num_query": 14
2563
+ },
2564
+ {
2565
+ "name": "logical_reasoning_fit_pattern",
2566
+ "score": 0.2857142857142857,
2567
+ "eval_type": "rule",
2568
+ "num_demo": 1,
2569
+ "num_query": 14
2570
+ },
2571
+ {
2572
+ "name": "planning_screenshot_tyreworld",
2573
+ "score": 1.0,
2574
+ "eval_type": "rule",
2575
+ "num_demo": 1,
2576
+ "num_query": 15
2577
+ },
2578
+ {
2579
+ "name": "tv_show_retrieval_by_character",
2580
+ "score": 0.8571428571428571,
2581
+ "eval_type": "rule",
2582
+ "num_demo": 1,
2583
+ "num_query": 14
2584
+ },
2585
+ {
2586
+ "name": "music_sheet_note_count",
2587
+ "score": 0.058823529411764705,
2588
+ "eval_type": "rule",
2589
+ "num_demo": 1,
2590
+ "num_query": 17
2591
+ },
2592
+ {
2593
+ "name": "semantic_matching_of_two_images",
2594
+ "score": 0.35714285714285715,
2595
+ "eval_type": "rule",
2596
+ "num_demo": 1,
2597
+ "num_query": 14
2598
+ },
2599
+ {
2600
+ "name": "medical_keywords_based_retrieval_non_radiology",
2601
+ "score": 1.0,
2602
+ "eval_type": "rule",
2603
+ "num_demo": 1,
2604
+ "num_query": 14
2605
+ },
2606
+ {
2607
+ "name": "booking_web_rating",
2608
+ "score": 1.0,
2609
+ "eval_type": "rule",
2610
+ "num_demo": 1,
2611
+ "num_query": 14
2612
+ },
2613
+ {
2614
+ "name": "planning_screenshot_termes",
2615
+ "score": 0.0,
2616
+ "eval_type": "rule",
2617
+ "num_demo": 1,
2618
+ "num_query": 15
2619
+ },
2620
+ {
2621
+ "name": "geographic_remote_sensing_land_cover",
2622
+ "score": 0.6428571428571429,
2623
+ "eval_type": "rule",
2624
+ "num_demo": 1,
2625
+ "num_query": 14
2626
+ },
2627
+ {
2628
+ "name": "logical_reasoning_2d_folding",
2629
+ "score": 0.07142857142857142,
2630
+ "eval_type": "rule",
2631
+ "num_demo": 1,
2632
+ "num_query": 14
2633
+ },
2634
+ {
2635
+ "name": "hashtag_recommendation",
2636
+ "score": 0.9166666666666666,
2637
+ "eval_type": "rule",
2638
+ "num_demo": 1,
2639
+ "num_query": 14
2640
+ },
2641
+ {
2642
+ "name": "visual_prediction_rater_plane_segmentation",
2643
+ "score": 0.5111111111111112,
2644
+ "eval_type": "rule",
2645
+ "num_demo": 1,
2646
+ "num_query": 15
2647
+ },
2648
+ {
2649
+ "name": "multiple_states_identify_europe",
2650
+ "score": 0.7857142857142857,
2651
+ "eval_type": "rule",
2652
+ "num_demo": 1,
2653
+ "num_query": 14
2654
+ },
2655
+ {
2656
+ "name": "multiple_states_identify_americas",
2657
+ "score": 0.7142857142857143,
2658
+ "eval_type": "rule",
2659
+ "num_demo": 1,
2660
+ "num_query": 14
2661
+ },
2662
+ {
2663
+ "name": "visual_prediction_rater_surface_normal_estimation",
2664
+ "score": 0.6904761904761905,
2665
+ "eval_type": "rule",
2666
+ "num_demo": 1,
2667
+ "num_query": 14
2668
+ },
2669
+ {
2670
+ "name": "adapted_cvbench_distance",
2671
+ "score": 0.8571428571428571,
2672
+ "eval_type": "rule",
2673
+ "num_demo": 1,
2674
+ "num_query": 14
2675
+ },
2676
+ {
2677
+ "name": "visual_prediction_rater_openable_part_segmentation",
2678
+ "score": 0.21428571428571427,
2679
+ "eval_type": "rule",
2680
+ "num_demo": 1,
2681
+ "num_query": 14
2682
+ },
2683
+ {
2684
+ "name": "adapted_cvbench_count",
2685
+ "score": 0.42857142857142855,
2686
+ "eval_type": "rule",
2687
+ "num_demo": 1,
2688
+ "num_query": 14
2689
+ },
2690
+ {
2691
+ "name": "adapted_cvbench_depth",
2692
+ "score": 1.0,
2693
+ "eval_type": "rule",
2694
+ "num_demo": 1,
2695
+ "num_query": 14
2696
+ },
2697
+ {
2698
+ "name": "visual_prediction_rater_3d_assembled_quality_understanding",
2699
+ "score": 0.5714285714285714,
2700
+ "eval_type": "rule",
2701
+ "num_demo": 1,
2702
+ "num_query": 14
2703
+ },
2704
+ {
2705
+ "name": "adapted_cvbench_relation",
2706
+ "score": 0.5714285714285714,
2707
+ "eval_type": "rule",
2708
+ "num_demo": 1,
2709
+ "num_query": 14
2710
+ },
2711
+ {
2712
+ "name": "visual_prediction_rater_semantic_segmentation",
2713
+ "score": 0.5208333333333333,
2714
+ "eval_type": "rule",
2715
+ "num_demo": 1,
2716
+ "num_query": 16
2717
+ },
2718
+ {
2719
+ "name": "symbolic_graphics_programs_computer_aided_design",
2720
+ "score": 0.2857142857142857,
2721
+ "eval_type": "rule",
2722
+ "num_demo": 1,
2723
+ "num_query": 14
2724
+ },
2725
+ {
2726
+ "name": "visual_prediction_rater_depth_estimation",
2727
+ "score": 0.619047619047619,
2728
+ "eval_type": "rule",
2729
+ "num_demo": 1,
2730
+ "num_query": 14
2731
+ },
2732
+ {
2733
+ "name": "visual_prediction_rater_novel_view_synthesis",
2734
+ "score": 0.2857142857142857,
2735
+ "eval_type": "rule",
2736
+ "num_demo": 1,
2737
+ "num_query": 14
2738
+ },
2739
+ {
2740
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
2741
+ "score": 0.1111111111111111,
2742
+ "eval_type": "rule",
2743
+ "num_demo": 1,
2744
+ "num_query": 18
2745
+ },
2746
+ {
2747
+ "name": "table_understanding_complex_question_answering",
2748
+ "score": 0.5714285714285714,
2749
+ "eval_type": "rule",
2750
+ "num_demo": 1,
2751
+ "num_query": 14
2752
+ },
2753
+ {
2754
+ "name": "visual_prediction_rater_panoptic_segmentation",
2755
+ "score": 0.5476190476190476,
2756
+ "eval_type": "rule",
2757
+ "num_demo": 1,
2758
+ "num_query": 14
2759
+ },
2760
+ {
2761
+ "name": "table_understanding_fact_verification",
2762
+ "score": 0.9047619047619048,
2763
+ "eval_type": "rule",
2764
+ "num_demo": 1,
2765
+ "num_query": 14
2766
+ },
2767
+ {
2768
+ "name": "panel_images_multi_question",
2769
+ "score": 0.8095238095238095,
2770
+ "eval_type": "rule",
2771
+ "num_demo": 1,
2772
+ "num_query": 14
2773
+ },
2774
+ {
2775
+ "name": "multiple_states_identify_asia",
2776
+ "score": 0.9000000000000001,
2777
+ "eval_type": "rule",
2778
+ "num_demo": 1,
2779
+ "num_query": 14
2780
+ },
2781
+ {
2782
+ "name": "panel_images_single_question",
2783
+ "score": 1.0,
2784
+ "eval_type": "rule",
2785
+ "num_demo": 1,
2786
+ "num_query": 14
2787
+ },
2788
+ {
2789
+ "name": "multiple_states_identify_africa",
2790
+ "score": 0.8142857142857143,
2791
+ "eval_type": "rule",
2792
+ "num_demo": 1,
2793
+ "num_query": 14
2794
+ },
2795
+ {
2796
+ "name": "MMSoc_Misinformation_GossipCop",
2797
+ "score": 0.5714285714285714,
2798
+ "eval_type": "rule",
2799
+ "num_demo": 1,
2800
+ "num_query": 14
2801
+ },
2802
+ {
2803
+ "name": "MMSoc_HatefulMemes",
2804
+ "score": 0.7142857142857143,
2805
+ "eval_type": "rule",
2806
+ "num_demo": 1,
2807
+ "num_query": 14
2808
+ },
2809
+ {
2810
+ "name": "poetry_petrarchian_sonnet_optional_meter",
2811
+ "score": 0.0,
2812
+ "eval_type": "rule",
2813
+ "num_demo": 0,
2814
+ "num_query": 15
2815
+ },
2816
+ {
2817
+ "name": "MMSoc_Memotion",
2818
+ "score": 0.6117647058823531,
2819
+ "eval_type": "rule",
2820
+ "num_demo": 1,
2821
+ "num_query": 17
2822
+ },
2823
+ {
2824
+ "name": "poetry_haiku",
2825
+ "score": 0.8666666666666667,
2826
+ "eval_type": "rule",
2827
+ "num_demo": 0,
2828
+ "num_query": 15
2829
+ },
2830
+ {
2831
+ "name": "MMSoc_Misinformation_PolitiFact",
2832
+ "score": 0.7857142857142857,
2833
+ "eval_type": "rule",
2834
+ "num_demo": 1,
2835
+ "num_query": 14
2836
+ },
2837
+ {
2838
+ "name": "poetry_shakespearean_sonnet",
2839
+ "score": 0.06666666666666667,
2840
+ "eval_type": "rule",
2841
+ "num_demo": 0,
2842
+ "num_query": 15
2843
+ },
2844
+ {
2845
+ "name": "poetry_acrostic_alliteration",
2846
+ "score": 0.7333333333333333,
2847
+ "eval_type": "rule",
2848
+ "num_demo": 0,
2849
+ "num_query": 15
2850
+ },
2851
+ {
2852
+ "name": "screenshot_lighteval_math",
2853
+ "score": 0.6666666666666666,
2854
+ "eval_type": "rule",
2855
+ "num_demo": 1,
2856
+ "num_query": 15
2857
+ },
2858
+ {
2859
+ "name": "poetry_acrostic",
2860
+ "score": 0.9333333333333333,
2861
+ "eval_type": "rule",
2862
+ "num_demo": 0,
2863
+ "num_query": 15
2864
+ },
2865
+ {
2866
+ "name": "poetry_limerick",
2867
+ "score": 0.5333333333333333,
2868
+ "eval_type": "rule",
2869
+ "num_demo": 0,
2870
+ "num_query": 15
2871
+ },
2872
+ {
2873
+ "name": "screenshot_theoremqa",
2874
+ "score": 0.8571428571428571,
2875
+ "eval_type": "rule",
2876
+ "num_demo": 1,
2877
+ "num_query": 14
2878
+ },
2879
+ {
2880
+ "name": "poetry_custom_rhyming_scheme",
2881
+ "score": 0.2,
2882
+ "eval_type": "rule",
2883
+ "num_demo": 0,
2884
+ "num_query": 15
2885
+ },
2886
+ {
2887
+ "name": "text_entity_replace",
2888
+ "score": 0.6428571428571429,
2889
+ "eval_type": "rule",
2890
+ "num_demo": 1,
2891
+ "num_query": 14
2892
+ },
2893
+ {
2894
+ "name": "background_change",
2895
+ "score": 0.8571428571428571,
2896
+ "eval_type": "rule",
2897
+ "num_demo": 1,
2898
+ "num_query": 14
2899
+ },
2900
+ {
2901
+ "name": "face_attribute_edit",
2902
+ "score": 0.5,
2903
+ "eval_type": "rule",
2904
+ "num_demo": 1,
2905
+ "num_query": 14
2906
+ },
2907
+ {
2908
+ "name": "face_swap",
2909
+ "score": 0.5,
2910
+ "eval_type": "rule",
2911
+ "num_demo": 1,
2912
+ "num_query": 14
2913
+ },
2914
+ {
2915
+ "name": "text_style",
2916
+ "score": 0.6428571428571429,
2917
+ "eval_type": "rule",
2918
+ "num_demo": 1,
2919
+ "num_query": 14
2920
+ },
2921
+ {
2922
+ "name": "number_puzzle_sudoku",
2923
+ "score": 0.0,
2924
+ "eval_type": "rule",
2925
+ "num_demo": 1,
2926
+ "num_query": 15
2927
+ },
2928
+ {
2929
+ "name": "out_of_context",
2930
+ "score": 0.8571428571428571,
2931
+ "eval_type": "rule",
2932
+ "num_demo": 1,
2933
+ "num_query": 14
2934
+ },
2935
+ {
2936
+ "name": "clip_stable_diffusion_generate",
2937
+ "score": 0.5714285714285714,
2938
+ "eval_type": "rule",
2939
+ "num_demo": 1,
2940
+ "num_query": 14
2941
+ },
2942
+ {
2943
+ "name": "veracity",
2944
+ "score": 0.8571428571428571,
2945
+ "eval_type": "rule",
2946
+ "num_demo": 1,
2947
+ "num_query": 14
2948
+ },
2949
+ {
2950
+ "name": "counterfactual_arithmetic",
2951
+ "score": 0.8571428571428571,
2952
+ "eval_type": "rule",
2953
+ "num_demo": 1,
2954
+ "num_query": 14
2955
+ },
2956
+ {
2957
+ "name": "maze_2d_8x8",
2958
+ "score": 0.0,
2959
+ "eval_type": "rule",
2960
+ "num_demo": 1,
2961
+ "num_query": 14
2962
+ },
2963
+ {
2964
+ "name": "shape_composition_shapes",
2965
+ "score": 0.47176870748299315,
2966
+ "eval_type": "rule",
2967
+ "num_demo": 1,
2968
+ "num_query": 14
2969
+ },
2970
+ {
2971
+ "name": "shape_composition_colours",
2972
+ "score": 0.4498299319727891,
2973
+ "eval_type": "rule",
2974
+ "num_demo": 1,
2975
+ "num_query": 14
2976
+ },
2977
+ {
2978
+ "name": "autorater_aesthetics",
2979
+ "score": 1.0,
2980
+ "eval_type": "rule",
2981
+ "num_demo": 1,
2982
+ "num_query": 14
2983
+ },
2984
+ {
2985
+ "name": "autorater_unmask",
2986
+ "score": 0.5714285714285714,
2987
+ "eval_type": "rule",
2988
+ "num_demo": 1,
2989
+ "num_query": 14
2990
+ },
2991
+ {
2992
+ "name": "number_puzzle_kakuro_5x5",
2993
+ "score": 0.0,
2994
+ "eval_type": "rule",
2995
+ "num_demo": 1,
2996
+ "num_query": 15
2997
+ },
2998
+ {
2999
+ "name": "autorater_semantics",
3000
+ "score": 0.9285714285714286,
3001
+ "eval_type": "rule",
3002
+ "num_demo": 1,
3003
+ "num_query": 14
3004
+ },
3005
+ {
3006
+ "name": "app_interactive_operations_iphone_settings",
3007
+ "score": 0.7142857142857143,
3008
+ "eval_type": "rule",
3009
+ "num_demo": 1,
3010
+ "num_query": 14
3011
+ },
3012
+ {
3013
+ "name": "app_interactive_operations_amazon",
3014
+ "score": 0.9285714285714286,
3015
+ "eval_type": "rule",
3016
+ "num_demo": 1,
3017
+ "num_query": 14
3018
+ },
3019
+ {
3020
+ "name": "autorater_motion_guided_editing",
3021
+ "score": 0.14285714285714285,
3022
+ "eval_type": "rule",
3023
+ "num_demo": 1,
3024
+ "num_query": 14
3025
+ },
3026
+ {
3027
+ "name": "app_interactive_operations_tiktok",
3028
+ "score": 0.6428571428571429,
3029
+ "eval_type": "rule",
3030
+ "num_demo": 1,
3031
+ "num_query": 14
3032
+ },
3033
+ {
3034
+ "name": "autorater_artifact",
3035
+ "score": 0.6428571428571429,
3036
+ "eval_type": "rule",
3037
+ "num_demo": 1,
3038
+ "num_query": 14
3039
+ },
3040
+ {
3041
+ "name": "app_interactive_operations_ppt",
3042
+ "score": 0.7857142857142857,
3043
+ "eval_type": "rule",
3044
+ "num_demo": 1,
3045
+ "num_query": 14
3046
+ },
3047
+ {
3048
+ "name": "autorater_mask",
3049
+ "score": 0.6428571428571429,
3050
+ "eval_type": "rule",
3051
+ "num_demo": 1,
3052
+ "num_query": 14
3053
+ },
3054
+ {
3055
+ "name": "app_interactive_operations_alipay",
3056
+ "score": 0.6470588235294118,
3057
+ "eval_type": "rule",
3058
+ "num_demo": 1,
3059
+ "num_query": 17
3060
+ },
3061
+ {
3062
+ "name": "autorater_subject",
3063
+ "score": 0.6428571428571429,
3064
+ "eval_type": "rule",
3065
+ "num_demo": 1,
3066
+ "num_query": 14
3067
+ },
3068
+ {
3069
+ "name": "app_interactive_operations_leetcode",
3070
+ "score": 0.5714285714285714,
3071
+ "eval_type": "rule",
3072
+ "num_demo": 1,
3073
+ "num_query": 14
3074
+ },
3075
+ {
3076
+ "name": "app_interactive_operations_excel",
3077
+ "score": 0.8571428571428571,
3078
+ "eval_type": "rule",
3079
+ "num_demo": 1,
3080
+ "num_query": 14
3081
+ },
3082
+ {
3083
+ "name": "app_interactive_operations_zoom",
3084
+ "score": 0.6666666666666666,
3085
+ "eval_type": "rule",
3086
+ "num_demo": 1,
3087
+ "num_query": 15
3088
+ },
3089
+ {
3090
+ "name": "autorater_control",
3091
+ "score": 0.9285714285714286,
3092
+ "eval_type": "rule",
3093
+ "num_demo": 1,
3094
+ "num_query": 14
3095
+ },
3096
+ {
3097
+ "name": "app_interactive_operations_youtube",
3098
+ "score": 0.7857142857142857,
3099
+ "eval_type": "rule",
3100
+ "num_demo": 1,
3101
+ "num_query": 14
3102
+ },
3103
+ {
3104
+ "name": "app_interactive_operations_twitter",
3105
+ "score": 0.6428571428571429,
3106
+ "eval_type": "rule",
3107
+ "num_demo": 1,
3108
+ "num_query": 14
3109
+ },
3110
+ {
3111
+ "name": "app_interactive_operations_word",
3112
+ "score": 0.5714285714285714,
3113
+ "eval_type": "rule",
3114
+ "num_demo": 1,
3115
+ "num_query": 14
3116
+ },
3117
+ {
3118
+ "name": "autorater_3d_model_texturing",
3119
+ "score": 0.7857142857142857,
3120
+ "eval_type": "rule",
3121
+ "num_demo": 1,
3122
+ "num_query": 14
3123
+ },
3124
+ {
3125
+ "name": "app_interactive_operations_instagram",
3126
+ "score": 0.7857142857142857,
3127
+ "eval_type": "rule",
3128
+ "num_demo": 1,
3129
+ "num_query": 14
3130
+ },
3131
+ {
3132
+ "name": "app_interactive_operations_notes",
3133
+ "score": 0.5714285714285714,
3134
+ "eval_type": "rule",
3135
+ "num_demo": 1,
3136
+ "num_query": 14
3137
+ },
3138
+ {
3139
+ "name": "autorater_artifact_reason",
3140
+ "score": 0.6666666666666666,
3141
+ "eval_type": "rule",
3142
+ "num_demo": 0,
3143
+ "num_query": 15
3144
+ },
3145
+ {
3146
+ "name": "chess_puzzles_crushing",
3147
+ "score": 0.0,
3148
+ "eval_type": "rule",
3149
+ "num_demo": 1,
3150
+ "num_query": 14
3151
+ },
3152
+ {
3153
+ "name": "app_layout_understanding_amazon",
3154
+ "score": 0.5714285714285714,
3155
+ "eval_type": "rule",
3156
+ "num_demo": 1,
3157
+ "num_query": 14
3158
+ },
3159
+ {
3160
+ "name": "chess_puzzles_checkmate",
3161
+ "score": 0.0,
3162
+ "eval_type": "rule",
3163
+ "num_demo": 1,
3164
+ "num_query": 14
3165
+ },
3166
+ {
3167
+ "name": "app_layout_understanding_instagram",
3168
+ "score": 0.7142857142857143,
3169
+ "eval_type": "rule",
3170
+ "num_demo": 1,
3171
+ "num_query": 14
3172
+ },
3173
+ {
3174
+ "name": "chess_puzzles_equality",
3175
+ "score": 0.0,
3176
+ "eval_type": "rule",
3177
+ "num_demo": 1,
3178
+ "num_query": 15
3179
+ },
3180
+ {
3181
+ "name": "app_layout_understanding_zoom",
3182
+ "score": 0.6,
3183
+ "eval_type": "rule",
3184
+ "num_demo": 1,
3185
+ "num_query": 15
3186
+ },
3187
+ {
3188
+ "name": "app_layout_understanding_notes",
3189
+ "score": 0.5,
3190
+ "eval_type": "rule",
3191
+ "num_demo": 1,
3192
+ "num_query": 14
3193
+ },
3194
+ {
3195
+ "name": "app_layout_understanding_word",
3196
+ "score": 0.6428571428571429,
3197
+ "eval_type": "rule",
3198
+ "num_demo": 1,
3199
+ "num_query": 14
3200
+ },
3201
+ {
3202
+ "name": "app_layout_understanding_twitter",
3203
+ "score": 0.7142857142857143,
3204
+ "eval_type": "rule",
3205
+ "num_demo": 1,
3206
+ "num_query": 14
3207
+ },
3208
+ {
3209
+ "name": "app_layout_understanding_iphone_settings",
3210
+ "score": 0.8571428571428571,
3211
+ "eval_type": "rule",
3212
+ "num_demo": 1,
3213
+ "num_query": 14
3214
+ },
3215
+ {
3216
+ "name": "app_layout_understanding_youtube",
3217
+ "score": 0.7857142857142857,
3218
+ "eval_type": "rule",
3219
+ "num_demo": 1,
3220
+ "num_query": 14
3221
+ },
3222
+ {
3223
+ "name": "app_layout_understanding_leetcode",
3224
+ "score": 0.6428571428571429,
3225
+ "eval_type": "rule",
3226
+ "num_demo": 1,
3227
+ "num_query": 14
3228
+ },
3229
+ {
3230
+ "name": "app_layout_understanding_ppt",
3231
+ "score": 0.7142857142857143,
3232
+ "eval_type": "rule",
3233
+ "num_demo": 1,
3234
+ "num_query": 14
3235
+ },
3236
+ {
3237
+ "name": "app_layout_understanding_tiktok",
3238
+ "score": 0.8571428571428571,
3239
+ "eval_type": "rule",
3240
+ "num_demo": 1,
3241
+ "num_query": 14
3242
+ },
3243
+ {
3244
+ "name": "app_layout_understanding_alipay",
3245
+ "score": 0.8235294117647058,
3246
+ "eval_type": "rule",
3247
+ "num_demo": 1,
3248
+ "num_query": 17
3249
+ },
3250
+ {
3251
+ "name": "app_layout_understanding_excel",
3252
+ "score": 0.7142857142857143,
3253
+ "eval_type": "rule",
3254
+ "num_demo": 1,
3255
+ "num_query": 14
3256
+ },
3257
+ {
3258
+ "name": "ocr_resume_employer_plain",
3259
+ "score": 0.6428571428571429,
3260
+ "eval_type": "rule",
3261
+ "num_demo": 1,
3262
+ "num_query": 14
3263
+ },
3264
+ {
3265
+ "name": "ocr_article_journal",
3266
+ "score": 0.7857142857142857,
3267
+ "eval_type": "rule",
3268
+ "num_demo": 1,
3269
+ "num_query": 14
3270
+ },
3271
+ {
3272
+ "name": "ocr_resume_experience_plain",
3273
+ "score": 0.7142857142857143,
3274
+ "eval_type": "rule",
3275
+ "num_demo": 1,
3276
+ "num_query": 14
3277
+ },
3278
+ {
3279
+ "name": "ball_cup_swap_3",
3280
+ "score": 0.21428571428571427,
3281
+ "eval_type": "rule",
3282
+ "num_demo": 1,
3283
+ "num_query": 14
3284
+ },
3285
+ {
3286
+ "name": "ocr_table_to_markdown",
3287
+ "score": 0.9285714285714286,
3288
+ "eval_type": "rule",
3289
+ "num_demo": 1,
3290
+ "num_query": 14
3291
+ },
3292
+ {
3293
+ "name": "ocr_math_text_latex",
3294
+ "score": 0.42857142857142855,
3295
+ "eval_type": "rule",
3296
+ "num_demo": 1,
3297
+ "num_query": 14
3298
+ },
3299
+ {
3300
+ "name": "ocr_table_to_latex",
3301
+ "score": 0.7142857142857143,
3302
+ "eval_type": "rule",
3303
+ "num_demo": 1,
3304
+ "num_query": 14
3305
+ },
3306
+ {
3307
+ "name": "ocr_resume_school_plain",
3308
+ "score": 0.8571428571428571,
3309
+ "eval_type": "rule",
3310
+ "num_demo": 1,
3311
+ "num_query": 14
3312
+ },
3313
+ {
3314
+ "name": "ocr_article_authors",
3315
+ "score": 0.8214285714285714,
3316
+ "eval_type": "rule",
3317
+ "num_demo": 1,
3318
+ "num_query": 14
3319
+ },
3320
+ {
3321
+ "name": "ocr_table_to_html",
3322
+ "score": 0.7142857142857143,
3323
+ "eval_type": "rule",
3324
+ "num_demo": 1,
3325
+ "num_query": 14
3326
+ },
3327
+ {
3328
+ "name": "ocr_resume_skill_plain",
3329
+ "score": 0.5714285714285714,
3330
+ "eval_type": "rule",
3331
+ "num_demo": 1,
3332
+ "num_query": 14
3333
+ },
3334
+ {
3335
+ "name": "ocr_table_to_csv",
3336
+ "score": 0.6428571428571429,
3337
+ "eval_type": "rule",
3338
+ "num_demo": 1,
3339
+ "num_query": 14
3340
+ },
3341
+ {
3342
+ "name": "crossword_mini_5x5",
3343
+ "score": 0.6714285714285715,
3344
+ "eval_type": "rule",
3345
+ "num_demo": 1,
3346
+ "num_query": 14
3347
+ },
3348
+ {
3349
+ "name": "ocr_math_equation",
3350
+ "score": 0.42857142857142855,
3351
+ "eval_type": "rule",
3352
+ "num_demo": 1,
3353
+ "num_query": 14
3354
+ },
3355
+ {
3356
+ "name": "contain_repeat_length",
3357
+ "score": 0.4666666666666667,
3358
+ "eval_type": "rule",
3359
+ "num_demo": 0,
3360
+ "num_query": 15
3361
+ },
3362
+ {
3363
+ "name": "multi_contain_position_only",
3364
+ "score": 0.26666666666666666,
3365
+ "eval_type": "rule",
3366
+ "num_demo": 0,
3367
+ "num_query": 15
3368
+ },
3369
+ {
3370
+ "name": "contain_position_images",
3371
+ "score": 0.3333333333333333,
3372
+ "eval_type": "rule",
3373
+ "num_demo": 0,
3374
+ "num_query": 15
3375
+ },
3376
+ {
3377
+ "name": "xor_images",
3378
+ "score": 0.8666666666666667,
3379
+ "eval_type": "rule",
3380
+ "num_demo": 0,
3381
+ "num_query": 15
3382
+ },
3383
+ {
3384
+ "name": "contain_position_length",
3385
+ "score": 0.8666666666666667,
3386
+ "eval_type": "rule",
3387
+ "num_demo": 0,
3388
+ "num_query": 15
3389
+ },
3390
+ {
3391
+ "name": "multi_contain_repeat",
3392
+ "score": 0.0,
3393
+ "eval_type": "rule",
3394
+ "num_demo": 0,
3395
+ "num_query": 15
3396
+ },
3397
+ {
3398
+ "name": "multi_contain_repeat_position_only_length",
3399
+ "score": 0.06666666666666667,
3400
+ "eval_type": "rule",
3401
+ "num_demo": 0,
3402
+ "num_query": 15
3403
+ },
3404
+ {
3405
+ "name": "pictionary_skribbl_io",
3406
+ "score": 0.3,
3407
+ "eval_type": "rule",
3408
+ "num_demo": 1,
3409
+ "num_query": 20
3410
+ },
3411
+ {
3412
+ "name": "pictionary_doodle_guess",
3413
+ "score": 0.8,
3414
+ "eval_type": "rule",
3415
+ "num_demo": 1,
3416
+ "num_query": 15
3417
+ },
3418
+ {
3419
+ "name": "contain_contain_images",
3420
+ "score": 0.9333333333333333,
3421
+ "eval_type": "rule",
3422
+ "num_demo": 0,
3423
+ "num_query": 15
3424
+ },
3425
+ {
3426
+ "name": "pictionary_genai_output_chinese",
3427
+ "score": 0.35714285714285715,
3428
+ "eval_type": "rule",
3429
+ "num_demo": 1,
3430
+ "num_query": 14
3431
+ },
3432
+ {
3433
+ "name": "pictionary_cartoon_drawing_guess",
3434
+ "score": 0.8571428571428571,
3435
+ "eval_type": "rule",
3436
+ "num_demo": 1,
3437
+ "num_query": 14
3438
+ },
3439
+ {
3440
+ "name": "contain_length",
3441
+ "score": 0.5333333333333333,
3442
+ "eval_type": "rule",
3443
+ "num_demo": 0,
3444
+ "num_query": 15
3445
+ },
3446
+ {
3447
+ "name": "pictionary_chinese_food_img2en",
3448
+ "score": 0.7857142857142857,
3449
+ "eval_type": "rule",
3450
+ "num_demo": 1,
3451
+ "num_query": 14
3452
+ },
3453
+ {
3454
+ "name": "contain_contain_length",
3455
+ "score": 0.9333333333333333,
3456
+ "eval_type": "rule",
3457
+ "num_demo": 0,
3458
+ "num_query": 15
3459
+ },
3460
+ {
3461
+ "name": "reward_models_t2i_reward",
3462
+ "score": 0.7857142857142857,
3463
+ "eval_type": "rule",
3464
+ "num_demo": 1,
3465
+ "num_query": 14
3466
+ },
3467
+ {
3468
+ "name": "reward_models_i2t_reward",
3469
+ "score": 0.5714285714285714,
3470
+ "eval_type": "rule",
3471
+ "num_demo": 1,
3472
+ "num_query": 14
3473
+ },
3474
+ {
3475
+ "name": "memorization_chinese_celebrity",
3476
+ "score": 0.5714285714285714,
3477
+ "eval_type": "rule",
3478
+ "num_demo": 1,
3479
+ "num_query": 14
3480
+ },
3481
+ {
3482
+ "name": "memorization_papers",
3483
+ "score": 0.6,
3484
+ "eval_type": "rule",
3485
+ "num_demo": 1,
3486
+ "num_query": 15
3487
+ },
3488
+ {
3489
+ "name": "memorization_famous_treaty",
3490
+ "score": 0.8214285714285714,
3491
+ "eval_type": "rule",
3492
+ "num_demo": 1,
3493
+ "num_query": 14
3494
+ },
3495
+ {
3496
+ "name": "memorization_indian_celebrity",
3497
+ "score": 0.8214285714285714,
3498
+ "eval_type": "rule",
3499
+ "num_demo": 1,
3500
+ "num_query": 14
3501
+ },
3502
+ {
3503
+ "name": "research_website_parsing_blogpost",
3504
+ "score": 0.07142857142857142,
3505
+ "eval_type": "rule",
3506
+ "num_demo": 1,
3507
+ "num_query": 14
3508
+ },
3509
+ {
3510
+ "name": "research_website_parsing_publication",
3511
+ "score": 0.07142857142857142,
3512
+ "eval_type": "rule",
3513
+ "num_demo": 1,
3514
+ "num_query": 14
3515
+ },
3516
+ {
3517
+ "name": "research_website_parsing_homepage",
3518
+ "score": 0.21428571428571427,
3519
+ "eval_type": "rule",
3520
+ "num_demo": 1,
3521
+ "num_query": 14
3522
+ },
3523
+ {
3524
+ "name": "video_motion_matching_real_3D",
3525
+ "score": 0.6428571428571429,
3526
+ "eval_type": "rule",
3527
+ "num_demo": 1,
3528
+ "num_query": 14
3529
+ },
3530
+ {
3531
+ "name": "video_motion_matching_3D_real",
3532
+ "score": 0.6,
3533
+ "eval_type": "rule",
3534
+ "num_demo": 1,
3535
+ "num_query": 15
3536
+ }
3537
+ ]
static/eval_results/SI/Gemini-Flash-2.0-exp/summary_and_keyword_stats.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_summary": {
3
+ "core": {
4
+ "num_eval_tasks": 273,
5
+ "num_eval_samples": 4108,
6
+ "macro_mean_score": 0.564920890288835
7
+ },
8
+ "open": {
9
+ "num_eval_tasks": 42,
10
+ "num_eval_samples": 808,
11
+ "macro_mean_score": 0.6572258810577452
12
+ },
13
+ "overall_score": 0.5772282223913563
14
+ },
15
+ "keyword_stats": {
16
+ "skills": {
17
+ "Object Recognition and Classification": {
18
+ "count": 172,
19
+ "num_samples": 2704,
20
+ "tasks": [],
21
+ "average_score": 0.596725933625478
22
+ },
23
+ "Domain-Specific Knowledge and Skills": {
24
+ "count": 46,
25
+ "num_samples": 896,
26
+ "tasks": [],
27
+ "average_score": 0.5808105220673139
28
+ },
29
+ "Mathematical and Logical Reasoning": {
30
+ "count": 91,
31
+ "num_samples": 1628,
32
+ "tasks": [],
33
+ "average_score": 0.5140802058578953
34
+ },
35
+ "Spatial and Temporal Reasoning": {
36
+ "count": 78,
37
+ "num_samples": 1270,
38
+ "tasks": [],
39
+ "average_score": 0.45162462384154295
40
+ },
41
+ "Text Recognition (OCR)": {
42
+ "count": 101,
43
+ "num_samples": 1680,
44
+ "tasks": [],
45
+ "average_score": 0.6001986640309387
46
+ },
47
+ "Scene and Event Understanding": {
48
+ "count": 60,
49
+ "num_samples": 1004,
50
+ "tasks": [],
51
+ "average_score": 0.6719749185963775
52
+ },
53
+ "Language Understanding and Generation": {
54
+ "count": 102,
55
+ "num_samples": 1707,
56
+ "tasks": [],
57
+ "average_score": 0.6431550536063879
58
+ },
59
+ "Commonsense and Social Reasoning": {
60
+ "count": 38,
61
+ "num_samples": 652,
62
+ "tasks": [],
63
+ "average_score": 0.6799316098253394
64
+ },
65
+ "Planning and Decision Making": {
66
+ "count": 23,
67
+ "num_samples": 355,
68
+ "tasks": [],
69
+ "average_score": 0.20199150554951897
70
+ },
71
+ "Ethical and Safety Reasoning": {
72
+ "count": 10,
73
+ "num_samples": 170,
74
+ "tasks": [],
75
+ "average_score": 0.7211804511278196
76
+ }
77
+ },
78
+ "input_format": {
79
+ "Photographs": {
80
+ "count": 83,
81
+ "num_samples": 1310,
82
+ "tasks": [],
83
+ "average_score": 0.6249728230779608
84
+ },
85
+ "Diagrams and Data Visualizations": {
86
+ "count": 88,
87
+ "num_samples": 1523,
88
+ "tasks": [],
89
+ "average_score": 0.5662031698153198
90
+ },
91
+ "User Interface Screenshots": {
92
+ "count": 67,
93
+ "num_samples": 1117,
94
+ "tasks": [],
95
+ "average_score": 0.5760042013946798
96
+ },
97
+ "Text-Based Images and Documents": {
98
+ "count": 53,
99
+ "num_samples": 847,
100
+ "tasks": [],
101
+ "average_score": 0.5055178839706232
102
+ },
103
+ "Artistic and Creative Content": {
104
+ "count": 22,
105
+ "num_samples": 388,
106
+ "tasks": [],
107
+ "average_score": 0.6451175637719202
108
+ },
109
+ "3D Models and Aerial Imagery": {
110
+ "count": 2,
111
+ "num_samples": 30,
112
+ "tasks": [],
113
+ "average_score": 0.27547552359477934
114
+ }
115
+ },
116
+ "output_format": {
117
+ "structured_output": {
118
+ "count": 72,
119
+ "num_samples": 1120,
120
+ "tasks": [],
121
+ "average_score": 0.5687115330249284
122
+ },
123
+ "numerical_data": {
124
+ "count": 39,
125
+ "num_samples": 694,
126
+ "tasks": [],
127
+ "average_score": 0.5489576369705605
128
+ },
129
+ "multiple_choice": {
130
+ "count": 33,
131
+ "num_samples": 567,
132
+ "tasks": [],
133
+ "average_score": 0.6389872753509118
134
+ },
135
+ "contextual_formatted_text": {
136
+ "count": 63,
137
+ "num_samples": 972,
138
+ "tasks": [],
139
+ "average_score": 0.5354828682148746
140
+ },
141
+ "exact_text": {
142
+ "count": 57,
143
+ "num_samples": 876,
144
+ "tasks": [],
145
+ "average_score": 0.5603798147704114
146
+ },
147
+ "open_ended_output": {
148
+ "count": 51,
149
+ "num_samples": 986,
150
+ "tasks": [],
151
+ "average_score": 0.641307090345096
152
+ }
153
+ },
154
+ "input_num": {
155
+ "1-image": {
156
+ "count": 315,
157
+ "num_samples": 5215,
158
+ "tasks": [],
159
+ "average_score": 0.5772282223913566
160
+ }
161
+ },
162
+ "app": {
163
+ "Knowledge": {
164
+ "count": 77,
165
+ "num_samples": 1291,
166
+ "tasks": [],
167
+ "average_score": 0.6310985524386292
168
+ },
169
+ "Perception": {
170
+ "count": 82,
171
+ "num_samples": 1318,
172
+ "tasks": [],
173
+ "average_score": 0.6639417814110704
174
+ },
175
+ "Mathematics": {
176
+ "count": 30,
177
+ "num_samples": 497,
178
+ "tasks": [],
179
+ "average_score": 0.5009952551800727
180
+ },
181
+ "Information_Extraction": {
182
+ "count": 41,
183
+ "num_samples": 639,
184
+ "tasks": [],
185
+ "average_score": 0.6501282930308486
186
+ },
187
+ "Science": {
188
+ "count": 22,
189
+ "num_samples": 469,
190
+ "tasks": [],
191
+ "average_score": 0.5602180589214083
192
+ },
193
+ "Planning": {
194
+ "count": 44,
195
+ "num_samples": 712,
196
+ "tasks": [],
197
+ "average_score": 0.31397734418769446
198
+ },
199
+ "Coding": {
200
+ "count": 16,
201
+ "num_samples": 244,
202
+ "tasks": [],
203
+ "average_score": 0.5677003092903828
204
+ },
205
+ "Metrics": {
206
+ "count": 3,
207
+ "num_samples": 45,
208
+ "tasks": [],
209
+ "average_score": 0.626984126984127
210
+ }
211
+ }
212
+ }
213
+ }
static/eval_results/SI/Gemini-Flash-2.0-exp/task_results.json ADDED
@@ -0,0 +1,2207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "name": "brand_logo_recognition_and_elaboration",
4
+ "score": 0.88,
5
+ "eval_type": "rule",
6
+ "num_demo": 1,
7
+ "num_query": 25
8
+ },
9
+ {
10
+ "name": "exchange_rate_estimate_plot",
11
+ "score": 0.9733571428571428,
12
+ "eval_type": "rule",
13
+ "num_demo": 1,
14
+ "num_query": 14
15
+ },
16
+ {
17
+ "name": "math_parity",
18
+ "score": 0.8,
19
+ "eval_type": "rule",
20
+ "num_demo": 1,
21
+ "num_query": 15
22
+ },
23
+ {
24
+ "name": "traffic_future_prediction_from_line_plot",
25
+ "score": 0.6567894736842107,
26
+ "eval_type": "rule",
27
+ "num_demo": 1,
28
+ "num_query": 19
29
+ },
30
+ {
31
+ "name": "graph_chordless_cycle",
32
+ "score": 0.42857142857142855,
33
+ "eval_type": "rule",
34
+ "num_demo": 1,
35
+ "num_query": 14
36
+ },
37
+ {
38
+ "name": "youtube_video_info_parsing",
39
+ "score": 0.7976190476190476,
40
+ "eval_type": "rule",
41
+ "num_demo": 1,
42
+ "num_query": 14
43
+ },
44
+ {
45
+ "name": "super_clevr_scene_understanding",
46
+ "score": 0.5,
47
+ "eval_type": "rule",
48
+ "num_demo": 1,
49
+ "num_query": 14
50
+ },
51
+ {
52
+ "name": "figureqa",
53
+ "score": 0.42857142857142855,
54
+ "eval_type": "rule",
55
+ "num_demo": 1,
56
+ "num_query": 14
57
+ },
58
+ {
59
+ "name": "face_keypoint_detection",
60
+ "score": 0.5787325110618333,
61
+ "eval_type": "rule",
62
+ "num_demo": 1,
63
+ "num_query": 14
64
+ },
65
+ {
66
+ "name": "widerface_face_count_and_event_classification",
67
+ "score": 0.7142857142857143,
68
+ "eval_type": "rule",
69
+ "num_demo": 1,
70
+ "num_query": 14
71
+ },
72
+ {
73
+ "name": "average_humidity_estimate_plot",
74
+ "score": 0.8413333333333332,
75
+ "eval_type": "rule",
76
+ "num_demo": 1,
77
+ "num_query": 15
78
+ },
79
+ {
80
+ "name": "weather_info_parsing",
81
+ "score": 0.896825396825397,
82
+ "eval_type": "rule",
83
+ "num_demo": 1,
84
+ "num_query": 14
85
+ },
86
+ {
87
+ "name": "egocentric_analysis_single_image",
88
+ "score": 0.4444444444444444,
89
+ "eval_type": "rule",
90
+ "num_demo": 1,
91
+ "num_query": 9
92
+ },
93
+ {
94
+ "name": "waybill_number_sequence_extraction",
95
+ "score": 0.7857142857142857,
96
+ "eval_type": "rule",
97
+ "num_demo": 1,
98
+ "num_query": 14
99
+ },
100
+ {
101
+ "name": "graph_maxflow",
102
+ "score": 0.4,
103
+ "eval_type": "rule",
104
+ "num_demo": 1,
105
+ "num_query": 15
106
+ },
107
+ {
108
+ "name": "TV_show_info_parsing",
109
+ "score": 0.8015873015873015,
110
+ "eval_type": "rule",
111
+ "num_demo": 1,
112
+ "num_query": 14
113
+ },
114
+ {
115
+ "name": "insect_order_classification",
116
+ "score": 0.2,
117
+ "eval_type": "rule",
118
+ "num_demo": 1,
119
+ "num_query": 15
120
+ },
121
+ {
122
+ "name": "electricity_plot_future_prediction",
123
+ "score": 0.8230157894736841,
124
+ "eval_type": "rule",
125
+ "num_demo": 1,
126
+ "num_query": 19
127
+ },
128
+ {
129
+ "name": "chemistry_exams_v",
130
+ "score": 0.35714285714285715,
131
+ "eval_type": "rule",
132
+ "num_demo": 1,
133
+ "num_query": 14
134
+ },
135
+ {
136
+ "name": "finance_table_understanding",
137
+ "score": 0.6428571428571429,
138
+ "eval_type": "rule",
139
+ "num_demo": 1,
140
+ "num_query": 14
141
+ },
142
+ {
143
+ "name": "funsd_document_qa",
144
+ "score": 0.7857142857142857,
145
+ "eval_type": "rule",
146
+ "num_demo": 1,
147
+ "num_query": 14
148
+ },
149
+ {
150
+ "name": "vibe_eval_open",
151
+ "score": 0.0,
152
+ "eval_type": "rule",
153
+ "num_demo": 1,
154
+ "num_query": 14
155
+ },
156
+ {
157
+ "name": "question_solution_solving",
158
+ "score": 0.2857142857142857,
159
+ "eval_type": "rule",
160
+ "num_demo": 1,
161
+ "num_query": 14
162
+ },
163
+ {
164
+ "name": "graph_theory",
165
+ "score": 0.2857142857142857,
166
+ "eval_type": "rule",
167
+ "num_demo": 1,
168
+ "num_query": 14
169
+ },
170
+ {
171
+ "name": "geometry_analytic",
172
+ "score": 0.14285714285714285,
173
+ "eval_type": "rule",
174
+ "num_demo": 1,
175
+ "num_query": 14
176
+ },
177
+ {
178
+ "name": "geometry_length",
179
+ "score": 0.42857142857142855,
180
+ "eval_type": "rule",
181
+ "num_demo": 1,
182
+ "num_query": 14
183
+ },
184
+ {
185
+ "name": "algebra",
186
+ "score": 0.2857142857142857,
187
+ "eval_type": "rule",
188
+ "num_demo": 1,
189
+ "num_query": 14
190
+ },
191
+ {
192
+ "name": "chess_puzzle_single_step",
193
+ "score": 0.0,
194
+ "eval_type": "rule",
195
+ "num_demo": 1,
196
+ "num_query": 15
197
+ },
198
+ {
199
+ "name": "chess_winner_identification",
200
+ "score": 0.4,
201
+ "eval_type": "rule",
202
+ "num_demo": 1,
203
+ "num_query": 15
204
+ },
205
+ {
206
+ "name": "physical_property_reasoning",
207
+ "score": 1.0,
208
+ "eval_type": "rule",
209
+ "num_demo": 1,
210
+ "num_query": 14
211
+ },
212
+ {
213
+ "name": "humor_understand_caption_match",
214
+ "score": 0.7333333333333333,
215
+ "eval_type": "rule",
216
+ "num_demo": 1,
217
+ "num_query": 15
218
+ },
219
+ {
220
+ "name": "coco_object_detection_by_query_property",
221
+ "score": 0.6698026360094582,
222
+ "eval_type": "rule",
223
+ "num_demo": 1,
224
+ "num_query": 14
225
+ },
226
+ {
227
+ "name": "multilingual_game_info_parsing",
228
+ "score": 0.7857142857142857,
229
+ "eval_type": "rule",
230
+ "num_demo": 1,
231
+ "num_query": 14
232
+ },
233
+ {
234
+ "name": "mnist_pattern",
235
+ "score": 0.8571428571428571,
236
+ "eval_type": "rule",
237
+ "num_demo": 1,
238
+ "num_query": 14
239
+ },
240
+ {
241
+ "name": "newspaper_page_parse_and_count",
242
+ "score": 0.4,
243
+ "eval_type": "rule",
244
+ "num_demo": 1,
245
+ "num_query": 15
246
+ },
247
+ {
248
+ "name": "dvqa",
249
+ "score": 0.7894736842105263,
250
+ "eval_type": "rule",
251
+ "num_demo": 1,
252
+ "num_query": 19
253
+ },
254
+ {
255
+ "name": "science_basic_physics",
256
+ "score": 0.8,
257
+ "eval_type": "rule",
258
+ "num_demo": 1,
259
+ "num_query": 15
260
+ },
261
+ {
262
+ "name": "electricity_future_prediction_from_table",
263
+ "score": 0.737017543859649,
264
+ "eval_type": "rule",
265
+ "num_demo": 1,
266
+ "num_query": 19
267
+ },
268
+ {
269
+ "name": "physics_exams_v",
270
+ "score": 0.42857142857142855,
271
+ "eval_type": "rule",
272
+ "num_demo": 1,
273
+ "num_query": 14
274
+ },
275
+ {
276
+ "name": "license_plate_recognition",
277
+ "score": 0.7857142857142857,
278
+ "eval_type": "rule",
279
+ "num_demo": 1,
280
+ "num_query": 14
281
+ },
282
+ {
283
+ "name": "snli_ve_visual_entailment",
284
+ "score": 0.8666666666666667,
285
+ "eval_type": "rule",
286
+ "num_demo": 1,
287
+ "num_query": 15
288
+ },
289
+ {
290
+ "name": "places365_scene_type_classification",
291
+ "score": 0.9285714285714286,
292
+ "eval_type": "rule",
293
+ "num_demo": 1,
294
+ "num_query": 14
295
+ },
296
+ {
297
+ "name": "3d_indoor_scene_text_bbox_selection",
298
+ "score": 0.35714285714285715,
299
+ "eval_type": "rule",
300
+ "num_demo": 1,
301
+ "num_query": 14
302
+ },
303
+ {
304
+ "name": "geometry_descriptive",
305
+ "score": 0.07142857142857142,
306
+ "eval_type": "rule",
307
+ "num_demo": 1,
308
+ "num_query": 14
309
+ },
310
+ {
311
+ "name": "top_rated_hotel_identification",
312
+ "score": 0.7857142857142857,
313
+ "eval_type": "rule",
314
+ "num_demo": 1,
315
+ "num_query": 14
316
+ },
317
+ {
318
+ "name": "science_molecule_chemistry",
319
+ "score": 0.9333333333333333,
320
+ "eval_type": "rule",
321
+ "num_demo": 1,
322
+ "num_query": 15
323
+ },
324
+ {
325
+ "name": "game_info_parsing",
326
+ "score": 0.9285714285714286,
327
+ "eval_type": "rule",
328
+ "num_demo": 1,
329
+ "num_query": 14
330
+ },
331
+ {
332
+ "name": "music_info_parsing",
333
+ "score": 0.6071428571428571,
334
+ "eval_type": "rule",
335
+ "num_demo": 1,
336
+ "num_query": 14
337
+ },
338
+ {
339
+ "name": "deciphering_oracle_bone",
340
+ "score": 0.07142857142857142,
341
+ "eval_type": "rule",
342
+ "num_demo": 1,
343
+ "num_query": 14
344
+ },
345
+ {
346
+ "name": "multilingual_movie_info_parsing",
347
+ "score": 0.7346938775510203,
348
+ "eval_type": "rule",
349
+ "num_demo": 1,
350
+ "num_query": 14
351
+ },
352
+ {
353
+ "name": "iconqa_count_and_reasoning",
354
+ "score": 0.631578947368421,
355
+ "eval_type": "rule",
356
+ "num_demo": 1,
357
+ "num_query": 19
358
+ },
359
+ {
360
+ "name": "graph_connectivity",
361
+ "score": 0.8,
362
+ "eval_type": "rule",
363
+ "num_demo": 1,
364
+ "num_query": 15
365
+ },
366
+ {
367
+ "name": "graph_shortest_path_planar",
368
+ "score": 0.35714285714285715,
369
+ "eval_type": "rule",
370
+ "num_demo": 1,
371
+ "num_query": 14
372
+ },
373
+ {
374
+ "name": "famous_building_recognition",
375
+ "score": 0.9375,
376
+ "eval_type": "rule",
377
+ "num_demo": 1,
378
+ "num_query": 16
379
+ },
380
+ {
381
+ "name": "signboard_identification",
382
+ "score": 0.7857142857142857,
383
+ "eval_type": "rule",
384
+ "num_demo": 1,
385
+ "num_query": 14
386
+ },
387
+ {
388
+ "name": "geometry_transformation",
389
+ "score": 0.2857142857142857,
390
+ "eval_type": "rule",
391
+ "num_demo": 1,
392
+ "num_query": 14
393
+ },
394
+ {
395
+ "name": "image_style_recognition",
396
+ "score": 1.0,
397
+ "eval_type": "rule",
398
+ "num_demo": 1,
399
+ "num_query": 14
400
+ },
401
+ {
402
+ "name": "math_convexity_value_estimation",
403
+ "score": 0.5207276876255985,
404
+ "eval_type": "rule",
405
+ "num_demo": 1,
406
+ "num_query": 15
407
+ },
408
+ {
409
+ "name": "long_string_letter_recognition",
410
+ "score": 0.21428571428571427,
411
+ "eval_type": "rule",
412
+ "num_demo": 1,
413
+ "num_query": 14
414
+ },
415
+ {
416
+ "name": "3d_indoor_scene_text_bbox_prediction",
417
+ "score": 0.19380819004670155,
418
+ "eval_type": "rule",
419
+ "num_demo": 1,
420
+ "num_query": 14
421
+ },
422
+ {
423
+ "name": "movie_info_parsing",
424
+ "score": 0.7142857142857143,
425
+ "eval_type": "rule",
426
+ "num_demo": 1,
427
+ "num_query": 14
428
+ },
429
+ {
430
+ "name": "handwritten_math_expression_extraction",
431
+ "score": 0.6428571428571429,
432
+ "eval_type": "rule",
433
+ "num_demo": 1,
434
+ "num_query": 14
435
+ },
436
+ {
437
+ "name": "geometry_solid",
438
+ "score": 0.5,
439
+ "eval_type": "rule",
440
+ "num_demo": 1,
441
+ "num_query": 14
442
+ },
443
+ {
444
+ "name": "animal_pose_estimation",
445
+ "score": 0.3668898610562918,
446
+ "eval_type": "rule",
447
+ "num_demo": 1,
448
+ "num_query": 14
449
+ },
450
+ {
451
+ "name": "single_person_pose_estimation",
452
+ "score": 0.298785583093911,
453
+ "eval_type": "rule",
454
+ "num_demo": 1,
455
+ "num_query": 14
456
+ },
457
+ {
458
+ "name": "human_relationship_reasoning",
459
+ "score": 1.0,
460
+ "eval_type": "rule",
461
+ "num_demo": 1,
462
+ "num_query": 14
463
+ },
464
+ {
465
+ "name": "graph_shortest_path_kamada_kawai",
466
+ "score": 0.42857142857142855,
467
+ "eval_type": "rule",
468
+ "num_demo": 1,
469
+ "num_query": 14
470
+ },
471
+ {
472
+ "name": "geometry_area",
473
+ "score": 0.42857142857142855,
474
+ "eval_type": "rule",
475
+ "num_demo": 1,
476
+ "num_query": 14
477
+ },
478
+ {
479
+ "name": "coco_person_detection",
480
+ "score": 0.6026048935518167,
481
+ "eval_type": "rule",
482
+ "num_demo": 1,
483
+ "num_query": 14
484
+ },
485
+ {
486
+ "name": "chart_vqa",
487
+ "score": 0.7857142857142857,
488
+ "eval_type": "rule",
489
+ "num_demo": 1,
490
+ "num_query": 14
491
+ },
492
+ {
493
+ "name": "hotel_booking_confirmation_parsing",
494
+ "score": 0.7071428571428571,
495
+ "eval_type": "rule",
496
+ "num_demo": 1,
497
+ "num_query": 14
498
+ },
499
+ {
500
+ "name": "ili_ratio_future_prediction",
501
+ "score": 0.16085714285714284,
502
+ "eval_type": "rule",
503
+ "num_demo": 1,
504
+ "num_query": 14
505
+ },
506
+ {
507
+ "name": "nlvr2_two_image_compare_qa",
508
+ "score": 0.7857142857142857,
509
+ "eval_type": "rule",
510
+ "num_demo": 1,
511
+ "num_query": 14
512
+ },
513
+ {
514
+ "name": "electricity_load_estimate_plot",
515
+ "score": 0.7926428571428572,
516
+ "eval_type": "rule",
517
+ "num_demo": 1,
518
+ "num_query": 14
519
+ },
520
+ {
521
+ "name": "tqa_textbook_qa",
522
+ "score": 0.8571428571428571,
523
+ "eval_type": "rule",
524
+ "num_demo": 1,
525
+ "num_query": 14
526
+ },
527
+ {
528
+ "name": "stock_info_parsing",
529
+ "score": 0.9663865546218489,
530
+ "eval_type": "rule",
531
+ "num_demo": 1,
532
+ "num_query": 14
533
+ },
534
+ {
535
+ "name": "math_exams_v",
536
+ "score": 0.5,
537
+ "eval_type": "rule",
538
+ "num_demo": 1,
539
+ "num_query": 14
540
+ },
541
+ {
542
+ "name": "quizlet_question_solving",
543
+ "score": 0.6428571428571429,
544
+ "eval_type": "rule",
545
+ "num_demo": 1,
546
+ "num_query": 14
547
+ },
548
+ {
549
+ "name": "newspaper_ocr_in_query_box",
550
+ "score": 0.5333333333333333,
551
+ "eval_type": "rule",
552
+ "num_demo": 1,
553
+ "num_query": 15
554
+ },
555
+ {
556
+ "name": "mvsa_sentiment_classification",
557
+ "score": 0.7857142857142857,
558
+ "eval_type": "rule",
559
+ "num_demo": 1,
560
+ "num_query": 14
561
+ },
562
+ {
563
+ "name": "egocentric_spatial_reasoning",
564
+ "score": 0.5555555555555556,
565
+ "eval_type": "rule",
566
+ "num_demo": 1,
567
+ "num_query": 9
568
+ },
569
+ {
570
+ "name": "stock_price_future_prediction",
571
+ "score": 0.8637857142857143,
572
+ "eval_type": "rule",
573
+ "num_demo": 1,
574
+ "num_query": 14
575
+ },
576
+ {
577
+ "name": "Ad_count_detection",
578
+ "score": 0.42857142857142855,
579
+ "eval_type": "rule",
580
+ "num_demo": 1,
581
+ "num_query": 14
582
+ },
583
+ {
584
+ "name": "recover_masked_word_in_figure",
585
+ "score": 0.21428571428571427,
586
+ "eval_type": "rule",
587
+ "num_demo": 1,
588
+ "num_query": 14
589
+ },
590
+ {
591
+ "name": "polygon_interior_angles",
592
+ "score": 0.0,
593
+ "eval_type": "rule",
594
+ "num_demo": 1,
595
+ "num_query": 14
596
+ },
597
+ {
598
+ "name": "web_action_grounding",
599
+ "score": 0.6428571428571429,
600
+ "eval_type": "rule",
601
+ "num_demo": 1,
602
+ "num_query": 14
603
+ },
604
+ {
605
+ "name": "latex_complex_formula_convertion",
606
+ "score": 0.29411764705882354,
607
+ "eval_type": "rule",
608
+ "num_demo": 1,
609
+ "num_query": 17
610
+ },
611
+ {
612
+ "name": "transit_map_intersection_points",
613
+ "score": 0.3898809523809524,
614
+ "eval_type": "rule",
615
+ "num_demo": 1,
616
+ "num_query": 14
617
+ },
618
+ {
619
+ "name": "arxiv_vqa",
620
+ "score": 0.7857142857142857,
621
+ "eval_type": "rule",
622
+ "num_demo": 1,
623
+ "num_query": 14
624
+ },
625
+ {
626
+ "name": "medical_image_artifacts_indentification",
627
+ "score": 0.21428571428571427,
628
+ "eval_type": "rule",
629
+ "num_demo": 1,
630
+ "num_query": 14
631
+ },
632
+ {
633
+ "name": "song_title_identification_from_lyrics",
634
+ "score": 0.5714285714285714,
635
+ "eval_type": "rule",
636
+ "num_demo": 1,
637
+ "num_query": 14
638
+ },
639
+ {
640
+ "name": "actor_recognition_in_Movie",
641
+ "score": 0.9285714285714286,
642
+ "eval_type": "rule",
643
+ "num_demo": 1,
644
+ "num_query": 14
645
+ },
646
+ {
647
+ "name": "bongard_problem",
648
+ "score": 0.42105263157894735,
649
+ "eval_type": "rule",
650
+ "num_demo": 1,
651
+ "num_query": 19
652
+ },
653
+ {
654
+ "name": "ascii_art_understanding",
655
+ "score": 0.7142857142857143,
656
+ "eval_type": "rule",
657
+ "num_demo": 1,
658
+ "num_query": 14
659
+ },
660
+ {
661
+ "name": "calendar_schedule_suggestion",
662
+ "score": 0.6428571428571429,
663
+ "eval_type": "rule",
664
+ "num_demo": 1,
665
+ "num_query": 14
666
+ },
667
+ {
668
+ "name": "geometry_reasoning_overlapped_circle",
669
+ "score": 0.75,
670
+ "eval_type": "rule",
671
+ "num_demo": 1,
672
+ "num_query": 14
673
+ },
674
+ {
675
+ "name": "planning_screenshot_barman",
676
+ "score": 0.26666666666666666,
677
+ "eval_type": "rule",
678
+ "num_demo": 1,
679
+ "num_query": 15
680
+ },
681
+ {
682
+ "name": "planning_screenshot_floortile",
683
+ "score": 0.0,
684
+ "eval_type": "rule",
685
+ "num_demo": 1,
686
+ "num_query": 15
687
+ },
688
+ {
689
+ "name": "graph_isomorphism",
690
+ "score": 0.5333333333333333,
691
+ "eval_type": "rule",
692
+ "num_demo": 1,
693
+ "num_query": 15
694
+ },
695
+ {
696
+ "name": "code_programming_test_easy",
697
+ "score": 0.375,
698
+ "eval_type": "rule",
699
+ "num_demo": 1,
700
+ "num_query": 24
701
+ },
702
+ {
703
+ "name": "biology_exams_v",
704
+ "score": 0.6428571428571429,
705
+ "eval_type": "rule",
706
+ "num_demo": 1,
707
+ "num_query": 14
708
+ },
709
+ {
710
+ "name": "long_string_number_recognition",
711
+ "score": 1.0,
712
+ "eval_type": "rule",
713
+ "num_demo": 1,
714
+ "num_query": 14
715
+ },
716
+ {
717
+ "name": "kvqa_knowledge_aware_qa",
718
+ "score": 0.42105263157894735,
719
+ "eval_type": "rule",
720
+ "num_demo": 1,
721
+ "num_query": 19
722
+ },
723
+ {
724
+ "name": "math_breakpoint",
725
+ "score": 0.8,
726
+ "eval_type": "rule",
727
+ "num_demo": 1,
728
+ "num_query": 15
729
+ },
730
+ {
731
+ "name": "landmark_recognition_and_qa",
732
+ "score": 0.7111111111111111,
733
+ "eval_type": "rule",
734
+ "num_demo": 1,
735
+ "num_query": 15
736
+ },
737
+ {
738
+ "name": "code_execution",
739
+ "score": 0.8125,
740
+ "eval_type": "rule",
741
+ "num_demo": 1,
742
+ "num_query": 16
743
+ },
744
+ {
745
+ "name": "music_sheet_format_QA",
746
+ "score": 0.6428571428571429,
747
+ "eval_type": "rule",
748
+ "num_demo": 1,
749
+ "num_query": 14
750
+ },
751
+ {
752
+ "name": "annoying_word_search",
753
+ "score": 0.0,
754
+ "eval_type": "rule",
755
+ "num_demo": 1,
756
+ "num_query": 14
757
+ },
758
+ {
759
+ "name": "interpret_force_perspective_illusion",
760
+ "score": 0.6666666666666666,
761
+ "eval_type": "rule",
762
+ "num_demo": 1,
763
+ "num_query": 15
764
+ },
765
+ {
766
+ "name": "healthcare_info_judgement",
767
+ "score": 0.9285714285714286,
768
+ "eval_type": "rule",
769
+ "num_demo": 1,
770
+ "num_query": 14
771
+ },
772
+ {
773
+ "name": "geometry_plot_position_relationship",
774
+ "score": 0.8571428571428571,
775
+ "eval_type": "rule",
776
+ "num_demo": 1,
777
+ "num_query": 14
778
+ },
779
+ {
780
+ "name": "map_diagram_qa",
781
+ "score": 0.35714285714285715,
782
+ "eval_type": "rule",
783
+ "num_demo": 1,
784
+ "num_query": 14
785
+ },
786
+ {
787
+ "name": "pmc_vqa_medical_image_qa",
788
+ "score": 0.5789473684210527,
789
+ "eval_type": "rule",
790
+ "num_demo": 1,
791
+ "num_query": 19
792
+ },
793
+ {
794
+ "name": "medical_blood_vessels_recognition",
795
+ "score": 0.7142857142857143,
796
+ "eval_type": "rule",
797
+ "num_demo": 1,
798
+ "num_query": 14
799
+ },
800
+ {
801
+ "name": "relative_depth_of_different_points",
802
+ "score": 0.6428571428571429,
803
+ "eval_type": "rule",
804
+ "num_demo": 1,
805
+ "num_query": 14
806
+ },
807
+ {
808
+ "name": "location_vqa",
809
+ "score": 0.5,
810
+ "eval_type": "rule",
811
+ "num_demo": 1,
812
+ "num_query": 14
813
+ },
814
+ {
815
+ "name": "topological_sort",
816
+ "score": 0.0,
817
+ "eval_type": "rule",
818
+ "num_demo": 1,
819
+ "num_query": 14
820
+ },
821
+ {
822
+ "name": "mindmap_elements_parsing",
823
+ "score": 0.35714285714285715,
824
+ "eval_type": "rule",
825
+ "num_demo": 1,
826
+ "num_query": 14
827
+ },
828
+ {
829
+ "name": "scibench_fundamental_wo_solution",
830
+ "score": 0.42857142857142855,
831
+ "eval_type": "rule",
832
+ "num_demo": 1,
833
+ "num_query": 49
834
+ },
835
+ {
836
+ "name": "geometry_reasoning_nested_squares",
837
+ "score": 0.42857142857142855,
838
+ "eval_type": "rule",
839
+ "num_demo": 1,
840
+ "num_query": 14
841
+ },
842
+ {
843
+ "name": "font_recognition",
844
+ "score": 0.07142857142857142,
845
+ "eval_type": "rule",
846
+ "num_demo": 1,
847
+ "num_query": 14
848
+ },
849
+ {
850
+ "name": "mensa_iq_test",
851
+ "score": 0.4372549019607843,
852
+ "eval_type": "rule",
853
+ "num_demo": 1,
854
+ "num_query": 17
855
+ },
856
+ {
857
+ "name": "flowchart_code_generation",
858
+ "score": 0.5555555555555556,
859
+ "eval_type": "rule",
860
+ "num_demo": 1,
861
+ "num_query": 9
862
+ },
863
+ {
864
+ "name": "geometry_reasoning_count_line_intersections",
865
+ "score": 0.6428571428571429,
866
+ "eval_type": "rule",
867
+ "num_demo": 1,
868
+ "num_query": 14
869
+ },
870
+ {
871
+ "name": "stackoverflow_debug_QA",
872
+ "score": 0.6428571428571429,
873
+ "eval_type": "rule",
874
+ "num_demo": 1,
875
+ "num_query": 14
876
+ },
877
+ {
878
+ "name": "logical_reasoning_find_odd_one_out",
879
+ "score": 0.2857142857142857,
880
+ "eval_type": "rule",
881
+ "num_demo": 1,
882
+ "num_query": 14
883
+ },
884
+ {
885
+ "name": "circuit_diagram_understanding",
886
+ "score": 0.5333333333333333,
887
+ "eval_type": "rule",
888
+ "num_demo": 1,
889
+ "num_query": 15
890
+ },
891
+ {
892
+ "name": "web_action_prediction",
893
+ "score": 0.8571428571428571,
894
+ "eval_type": "rule",
895
+ "num_demo": 1,
896
+ "num_query": 14
897
+ },
898
+ {
899
+ "name": "signage_navigation",
900
+ "score": 0.6428571428571429,
901
+ "eval_type": "rule",
902
+ "num_demo": 1,
903
+ "num_query": 14
904
+ },
905
+ {
906
+ "name": "go_capture_stone",
907
+ "score": 0.13333333333333333,
908
+ "eval_type": "rule",
909
+ "num_demo": 1,
910
+ "num_query": 15
911
+ },
912
+ {
913
+ "name": "webpage_code_understanding",
914
+ "score": 0.6666666666666666,
915
+ "eval_type": "rule",
916
+ "num_demo": 1,
917
+ "num_query": 9
918
+ },
919
+ {
920
+ "name": "monthly_weather_days_count",
921
+ "score": 0.42857142857142855,
922
+ "eval_type": "rule",
923
+ "num_demo": 1,
924
+ "num_query": 14
925
+ },
926
+ {
927
+ "name": "medical_counting_lymphocytes",
928
+ "score": 0.0,
929
+ "eval_type": "rule",
930
+ "num_demo": 1,
931
+ "num_query": 14
932
+ },
933
+ {
934
+ "name": "weather_map_climate_type_temperature_parsing",
935
+ "score": 0.6428571428571429,
936
+ "eval_type": "rule",
937
+ "num_demo": 1,
938
+ "num_query": 14
939
+ },
940
+ {
941
+ "name": "top_video_creator_identification",
942
+ "score": 0.35714285714285715,
943
+ "eval_type": "rule",
944
+ "num_demo": 1,
945
+ "num_query": 14
946
+ },
947
+ {
948
+ "name": "rebus",
949
+ "score": 0.43478260869565216,
950
+ "eval_type": "rule",
951
+ "num_demo": 1,
952
+ "num_query": 23
953
+ },
954
+ {
955
+ "name": "ishihara_test",
956
+ "score": 0.19285714285714287,
957
+ "eval_type": "rule",
958
+ "num_demo": 1,
959
+ "num_query": 14
960
+ },
961
+ {
962
+ "name": "paper_vqa",
963
+ "score": 0.42857142857142855,
964
+ "eval_type": "rule",
965
+ "num_demo": 1,
966
+ "num_query": 14
967
+ },
968
+ {
969
+ "name": "product_ocr_qa",
970
+ "score": 0.42857142857142855,
971
+ "eval_type": "rule",
972
+ "num_demo": 1,
973
+ "num_query": 14
974
+ },
975
+ {
976
+ "name": "geometry_reasoning_circled_letter",
977
+ "score": 0.7857142857142857,
978
+ "eval_type": "rule",
979
+ "num_demo": 1,
980
+ "num_query": 14
981
+ },
982
+ {
983
+ "name": "GUI_Act_Web_Single",
984
+ "score": 0.026658939437844104,
985
+ "eval_type": "rule",
986
+ "num_demo": 1,
987
+ "num_query": 14
988
+ },
989
+ {
990
+ "name": "game_platform_support_identification",
991
+ "score": 0.9642857142857143,
992
+ "eval_type": "rule",
993
+ "num_demo": 1,
994
+ "num_query": 14
995
+ },
996
+ {
997
+ "name": "GUI_Act_Mobile_swipe",
998
+ "score": 0.5319532782862981,
999
+ "eval_type": "rule",
1000
+ "num_demo": 1,
1001
+ "num_query": 13
1002
+ },
1003
+ {
1004
+ "name": "mahjong",
1005
+ "score": 0.0,
1006
+ "eval_type": "rule",
1007
+ "num_demo": 1,
1008
+ "num_query": 14
1009
+ },
1010
+ {
1011
+ "name": "extract_webpage_headline",
1012
+ "score": 0.5714285714285714,
1013
+ "eval_type": "rule",
1014
+ "num_demo": 1,
1015
+ "num_query": 14
1016
+ },
1017
+ {
1018
+ "name": "planning_screenshot_storage",
1019
+ "score": 0.26666666666666666,
1020
+ "eval_type": "rule",
1021
+ "num_demo": 1,
1022
+ "num_query": 15
1023
+ },
1024
+ {
1025
+ "name": "scibench_calculus_wo_solution",
1026
+ "score": 0.4489795918367347,
1027
+ "eval_type": "rule",
1028
+ "num_demo": 1,
1029
+ "num_query": 49
1030
+ },
1031
+ {
1032
+ "name": "knowledge_graph_understanding",
1033
+ "score": 0.8,
1034
+ "eval_type": "rule",
1035
+ "num_demo": 1,
1036
+ "num_query": 15
1037
+ },
1038
+ {
1039
+ "name": "image_translation_en2cn",
1040
+ "score": 0.4747283144053553,
1041
+ "eval_type": "rule",
1042
+ "num_demo": 1,
1043
+ "num_query": 9
1044
+ },
1045
+ {
1046
+ "name": "realworld_qa_en2cn",
1047
+ "score": 0.7142857142857143,
1048
+ "eval_type": "rule",
1049
+ "num_demo": 1,
1050
+ "num_query": 14
1051
+ },
1052
+ {
1053
+ "name": "soccer_offside",
1054
+ "score": 0.5555555555555556,
1055
+ "eval_type": "rule",
1056
+ "num_demo": 1,
1057
+ "num_query": 9
1058
+ },
1059
+ {
1060
+ "name": "planning_visual_storage",
1061
+ "score": 0.0,
1062
+ "eval_type": "rule",
1063
+ "num_demo": 1,
1064
+ "num_query": 15
1065
+ },
1066
+ {
1067
+ "name": "geometry_reasoning_grid",
1068
+ "score": 0.75,
1069
+ "eval_type": "rule",
1070
+ "num_demo": 1,
1071
+ "num_query": 14
1072
+ },
1073
+ {
1074
+ "name": "GUI_Act_Web_Multi",
1075
+ "score": 0.23629435627595965,
1076
+ "eval_type": "rule",
1077
+ "num_demo": 1,
1078
+ "num_query": 14
1079
+ },
1080
+ {
1081
+ "name": "chinese_idiom_recognition",
1082
+ "score": 0.7142857142857143,
1083
+ "eval_type": "rule",
1084
+ "num_demo": 1,
1085
+ "num_query": 14
1086
+ },
1087
+ {
1088
+ "name": "relative_reflectance_of_different_regions",
1089
+ "score": 0.21428571428571427,
1090
+ "eval_type": "rule",
1091
+ "num_demo": 1,
1092
+ "num_query": 14
1093
+ },
1094
+ {
1095
+ "name": "number_comparison",
1096
+ "score": 1.0,
1097
+ "eval_type": "rule",
1098
+ "num_demo": 1,
1099
+ "num_query": 14
1100
+ },
1101
+ {
1102
+ "name": "entertainment_web_game_style",
1103
+ "score": 0.7857142857142857,
1104
+ "eval_type": "rule",
1105
+ "num_demo": 1,
1106
+ "num_query": 14
1107
+ },
1108
+ {
1109
+ "name": "orchestra_score_recognition",
1110
+ "score": 0.21428571428571427,
1111
+ "eval_type": "rule",
1112
+ "num_demo": 1,
1113
+ "num_query": 14
1114
+ },
1115
+ {
1116
+ "name": "planning_screenshot_blocksworld",
1117
+ "score": 0.3333333333333333,
1118
+ "eval_type": "rule",
1119
+ "num_demo": 1,
1120
+ "num_query": 15
1121
+ },
1122
+ {
1123
+ "name": "icon_arithmetic_puzzle",
1124
+ "score": 0.6785714285714286,
1125
+ "eval_type": "rule",
1126
+ "num_demo": 1,
1127
+ "num_query": 14
1128
+ },
1129
+ {
1130
+ "name": "planning_screenshot_grippers",
1131
+ "score": 0.4,
1132
+ "eval_type": "rule",
1133
+ "num_demo": 1,
1134
+ "num_query": 15
1135
+ },
1136
+ {
1137
+ "name": "character_recognition_in_TV_shows",
1138
+ "score": 0.6428571428571429,
1139
+ "eval_type": "rule",
1140
+ "num_demo": 1,
1141
+ "num_query": 14
1142
+ },
1143
+ {
1144
+ "name": "highest_discount_game_price_identification",
1145
+ "score": 1.0,
1146
+ "eval_type": "rule",
1147
+ "num_demo": 1,
1148
+ "num_query": 14
1149
+ },
1150
+ {
1151
+ "name": "remaining_playback_time_calculation",
1152
+ "score": 0.0,
1153
+ "eval_type": "rule",
1154
+ "num_demo": 1,
1155
+ "num_query": 14
1156
+ },
1157
+ {
1158
+ "name": "medical_cell_recognition",
1159
+ "score": 0.5714285714285714,
1160
+ "eval_type": "rule",
1161
+ "num_demo": 1,
1162
+ "num_query": 14
1163
+ },
1164
+ {
1165
+ "name": "chess_find_legal_moves",
1166
+ "score": 0.20779353910574844,
1167
+ "eval_type": "rule",
1168
+ "num_demo": 1,
1169
+ "num_query": 14
1170
+ },
1171
+ {
1172
+ "name": "distinguish_ai_generated_image",
1173
+ "score": 0.8421052631578947,
1174
+ "eval_type": "rule",
1175
+ "num_demo": 1,
1176
+ "num_query": 19
1177
+ },
1178
+ {
1179
+ "name": "autonomous_driving_scene_analysis",
1180
+ "score": 1.0,
1181
+ "eval_type": "rule",
1182
+ "num_demo": 1,
1183
+ "num_query": 14
1184
+ },
1185
+ {
1186
+ "name": "counting_single_image",
1187
+ "score": 0.7857142857142857,
1188
+ "eval_type": "rule",
1189
+ "num_demo": 1,
1190
+ "num_query": 14
1191
+ },
1192
+ {
1193
+ "name": "MMMU_pro_exam_screenshot",
1194
+ "score": 0.40404040404040403,
1195
+ "eval_type": "rule",
1196
+ "num_demo": 1,
1197
+ "num_query": 99
1198
+ },
1199
+ {
1200
+ "name": "GUI_Act_Mobile_tap",
1201
+ "score": 0.2857142857142857,
1202
+ "eval_type": "rule",
1203
+ "num_demo": 1,
1204
+ "num_query": 14
1205
+ },
1206
+ {
1207
+ "name": "road_map_find_highway_between_two_place",
1208
+ "score": 0.8235294117647058,
1209
+ "eval_type": "rule",
1210
+ "num_demo": 1,
1211
+ "num_query": 17
1212
+ },
1213
+ {
1214
+ "name": "clevrer_physics",
1215
+ "score": 0.45,
1216
+ "eval_type": "rule",
1217
+ "num_demo": 1,
1218
+ "num_query": 20
1219
+ },
1220
+ {
1221
+ "name": "chess_sygyzy_endgames",
1222
+ "score": 0.15739022881880024,
1223
+ "eval_type": "rule",
1224
+ "num_demo": 1,
1225
+ "num_query": 14
1226
+ },
1227
+ {
1228
+ "name": "llavaguard",
1229
+ "score": 0.5357142857142857,
1230
+ "eval_type": "rule",
1231
+ "num_demo": 1,
1232
+ "num_query": 14
1233
+ },
1234
+ {
1235
+ "name": "MMMU_physics_chemistry_selected",
1236
+ "score": 0.8571428571428571,
1237
+ "eval_type": "rule",
1238
+ "num_demo": 1,
1239
+ "num_query": 14
1240
+ },
1241
+ {
1242
+ "name": "medical_multi_organ_segmentation_rater",
1243
+ "score": 0.35714285714285715,
1244
+ "eval_type": "rule",
1245
+ "num_demo": 1,
1246
+ "num_query": 14
1247
+ },
1248
+ {
1249
+ "name": "cultural_vqa",
1250
+ "score": 0.5333333333333333,
1251
+ "eval_type": "rule",
1252
+ "num_demo": 1,
1253
+ "num_query": 15
1254
+ },
1255
+ {
1256
+ "name": "logical_reasoning_fit_pattern",
1257
+ "score": 0.2857142857142857,
1258
+ "eval_type": "rule",
1259
+ "num_demo": 1,
1260
+ "num_query": 14
1261
+ },
1262
+ {
1263
+ "name": "planning_screenshot_tyreworld",
1264
+ "score": 1.0,
1265
+ "eval_type": "rule",
1266
+ "num_demo": 1,
1267
+ "num_query": 15
1268
+ },
1269
+ {
1270
+ "name": "music_sheet_note_count",
1271
+ "score": 0.058823529411764705,
1272
+ "eval_type": "rule",
1273
+ "num_demo": 1,
1274
+ "num_query": 17
1275
+ },
1276
+ {
1277
+ "name": "planning_screenshot_termes",
1278
+ "score": 0.0,
1279
+ "eval_type": "rule",
1280
+ "num_demo": 1,
1281
+ "num_query": 15
1282
+ },
1283
+ {
1284
+ "name": "hashtag_recommendation",
1285
+ "score": 0.9166666666666666,
1286
+ "eval_type": "rule",
1287
+ "num_demo": 1,
1288
+ "num_query": 14
1289
+ },
1290
+ {
1291
+ "name": "multiple_states_identify_europe",
1292
+ "score": 0.7857142857142857,
1293
+ "eval_type": "rule",
1294
+ "num_demo": 1,
1295
+ "num_query": 14
1296
+ },
1297
+ {
1298
+ "name": "multiple_states_identify_americas",
1299
+ "score": 0.7142857142857143,
1300
+ "eval_type": "rule",
1301
+ "num_demo": 1,
1302
+ "num_query": 14
1303
+ },
1304
+ {
1305
+ "name": "adapted_cvbench_distance",
1306
+ "score": 0.8571428571428571,
1307
+ "eval_type": "rule",
1308
+ "num_demo": 1,
1309
+ "num_query": 14
1310
+ },
1311
+ {
1312
+ "name": "adapted_cvbench_count",
1313
+ "score": 0.42857142857142855,
1314
+ "eval_type": "rule",
1315
+ "num_demo": 1,
1316
+ "num_query": 14
1317
+ },
1318
+ {
1319
+ "name": "adapted_cvbench_depth",
1320
+ "score": 1.0,
1321
+ "eval_type": "rule",
1322
+ "num_demo": 1,
1323
+ "num_query": 14
1324
+ },
1325
+ {
1326
+ "name": "adapted_cvbench_relation",
1327
+ "score": 0.5714285714285714,
1328
+ "eval_type": "rule",
1329
+ "num_demo": 1,
1330
+ "num_query": 14
1331
+ },
1332
+ {
1333
+ "name": "symbolic_graphics_programs_computer_aided_design",
1334
+ "score": 0.2857142857142857,
1335
+ "eval_type": "rule",
1336
+ "num_demo": 1,
1337
+ "num_query": 14
1338
+ },
1339
+ {
1340
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
1341
+ "score": 0.1111111111111111,
1342
+ "eval_type": "rule",
1343
+ "num_demo": 1,
1344
+ "num_query": 18
1345
+ },
1346
+ {
1347
+ "name": "table_understanding_complex_question_answering",
1348
+ "score": 0.5714285714285714,
1349
+ "eval_type": "rule",
1350
+ "num_demo": 1,
1351
+ "num_query": 14
1352
+ },
1353
+ {
1354
+ "name": "table_understanding_fact_verification",
1355
+ "score": 0.9047619047619048,
1356
+ "eval_type": "rule",
1357
+ "num_demo": 1,
1358
+ "num_query": 14
1359
+ },
1360
+ {
1361
+ "name": "panel_images_multi_question",
1362
+ "score": 0.8095238095238095,
1363
+ "eval_type": "rule",
1364
+ "num_demo": 1,
1365
+ "num_query": 14
1366
+ },
1367
+ {
1368
+ "name": "multiple_states_identify_asia",
1369
+ "score": 0.9000000000000001,
1370
+ "eval_type": "rule",
1371
+ "num_demo": 1,
1372
+ "num_query": 14
1373
+ },
1374
+ {
1375
+ "name": "panel_images_single_question",
1376
+ "score": 1.0,
1377
+ "eval_type": "rule",
1378
+ "num_demo": 1,
1379
+ "num_query": 14
1380
+ },
1381
+ {
1382
+ "name": "multiple_states_identify_africa",
1383
+ "score": 0.8142857142857143,
1384
+ "eval_type": "rule",
1385
+ "num_demo": 1,
1386
+ "num_query": 14
1387
+ },
1388
+ {
1389
+ "name": "MMSoc_Misinformation_GossipCop",
1390
+ "score": 0.5714285714285714,
1391
+ "eval_type": "rule",
1392
+ "num_demo": 1,
1393
+ "num_query": 14
1394
+ },
1395
+ {
1396
+ "name": "MMSoc_HatefulMemes",
1397
+ "score": 0.7142857142857143,
1398
+ "eval_type": "rule",
1399
+ "num_demo": 1,
1400
+ "num_query": 14
1401
+ },
1402
+ {
1403
+ "name": "poetry_petrarchian_sonnet_optional_meter",
1404
+ "score": 0.0,
1405
+ "eval_type": "rule",
1406
+ "num_demo": 0,
1407
+ "num_query": 15
1408
+ },
1409
+ {
1410
+ "name": "MMSoc_Memotion",
1411
+ "score": 0.6117647058823531,
1412
+ "eval_type": "rule",
1413
+ "num_demo": 1,
1414
+ "num_query": 17
1415
+ },
1416
+ {
1417
+ "name": "poetry_haiku",
1418
+ "score": 0.8666666666666667,
1419
+ "eval_type": "rule",
1420
+ "num_demo": 0,
1421
+ "num_query": 15
1422
+ },
1423
+ {
1424
+ "name": "MMSoc_Misinformation_PolitiFact",
1425
+ "score": 0.7857142857142857,
1426
+ "eval_type": "rule",
1427
+ "num_demo": 1,
1428
+ "num_query": 14
1429
+ },
1430
+ {
1431
+ "name": "poetry_shakespearean_sonnet",
1432
+ "score": 0.06666666666666667,
1433
+ "eval_type": "rule",
1434
+ "num_demo": 0,
1435
+ "num_query": 15
1436
+ },
1437
+ {
1438
+ "name": "poetry_acrostic_alliteration",
1439
+ "score": 0.7333333333333333,
1440
+ "eval_type": "rule",
1441
+ "num_demo": 0,
1442
+ "num_query": 15
1443
+ },
1444
+ {
1445
+ "name": "screenshot_lighteval_math",
1446
+ "score": 0.6666666666666666,
1447
+ "eval_type": "rule",
1448
+ "num_demo": 1,
1449
+ "num_query": 15
1450
+ },
1451
+ {
1452
+ "name": "poetry_acrostic",
1453
+ "score": 0.9333333333333333,
1454
+ "eval_type": "rule",
1455
+ "num_demo": 0,
1456
+ "num_query": 15
1457
+ },
1458
+ {
1459
+ "name": "poetry_limerick",
1460
+ "score": 0.5333333333333333,
1461
+ "eval_type": "rule",
1462
+ "num_demo": 0,
1463
+ "num_query": 15
1464
+ },
1465
+ {
1466
+ "name": "screenshot_theoremqa",
1467
+ "score": 0.8571428571428571,
1468
+ "eval_type": "rule",
1469
+ "num_demo": 1,
1470
+ "num_query": 14
1471
+ },
1472
+ {
1473
+ "name": "poetry_custom_rhyming_scheme",
1474
+ "score": 0.2,
1475
+ "eval_type": "rule",
1476
+ "num_demo": 0,
1477
+ "num_query": 15
1478
+ },
1479
+ {
1480
+ "name": "text_entity_replace",
1481
+ "score": 0.6428571428571429,
1482
+ "eval_type": "rule",
1483
+ "num_demo": 1,
1484
+ "num_query": 14
1485
+ },
1486
+ {
1487
+ "name": "background_change",
1488
+ "score": 0.8571428571428571,
1489
+ "eval_type": "rule",
1490
+ "num_demo": 1,
1491
+ "num_query": 14
1492
+ },
1493
+ {
1494
+ "name": "face_attribute_edit",
1495
+ "score": 0.5,
1496
+ "eval_type": "rule",
1497
+ "num_demo": 1,
1498
+ "num_query": 14
1499
+ },
1500
+ {
1501
+ "name": "face_swap",
1502
+ "score": 0.5,
1503
+ "eval_type": "rule",
1504
+ "num_demo": 1,
1505
+ "num_query": 14
1506
+ },
1507
+ {
1508
+ "name": "text_style",
1509
+ "score": 0.6428571428571429,
1510
+ "eval_type": "rule",
1511
+ "num_demo": 1,
1512
+ "num_query": 14
1513
+ },
1514
+ {
1515
+ "name": "number_puzzle_sudoku",
1516
+ "score": 0.0,
1517
+ "eval_type": "rule",
1518
+ "num_demo": 1,
1519
+ "num_query": 15
1520
+ },
1521
+ {
1522
+ "name": "out_of_context",
1523
+ "score": 0.8571428571428571,
1524
+ "eval_type": "rule",
1525
+ "num_demo": 1,
1526
+ "num_query": 14
1527
+ },
1528
+ {
1529
+ "name": "clip_stable_diffusion_generate",
1530
+ "score": 0.5714285714285714,
1531
+ "eval_type": "rule",
1532
+ "num_demo": 1,
1533
+ "num_query": 14
1534
+ },
1535
+ {
1536
+ "name": "veracity",
1537
+ "score": 0.8571428571428571,
1538
+ "eval_type": "rule",
1539
+ "num_demo": 1,
1540
+ "num_query": 14
1541
+ },
1542
+ {
1543
+ "name": "counterfactual_arithmetic",
1544
+ "score": 0.8571428571428571,
1545
+ "eval_type": "rule",
1546
+ "num_demo": 1,
1547
+ "num_query": 14
1548
+ },
1549
+ {
1550
+ "name": "maze_2d_8x8",
1551
+ "score": 0.0,
1552
+ "eval_type": "rule",
1553
+ "num_demo": 1,
1554
+ "num_query": 14
1555
+ },
1556
+ {
1557
+ "name": "shape_composition_shapes",
1558
+ "score": 0.47176870748299315,
1559
+ "eval_type": "rule",
1560
+ "num_demo": 1,
1561
+ "num_query": 14
1562
+ },
1563
+ {
1564
+ "name": "shape_composition_colours",
1565
+ "score": 0.4498299319727891,
1566
+ "eval_type": "rule",
1567
+ "num_demo": 1,
1568
+ "num_query": 14
1569
+ },
1570
+ {
1571
+ "name": "number_puzzle_kakuro_5x5",
1572
+ "score": 0.0,
1573
+ "eval_type": "rule",
1574
+ "num_demo": 1,
1575
+ "num_query": 15
1576
+ },
1577
+ {
1578
+ "name": "autorater_artifact",
1579
+ "score": 0.6428571428571429,
1580
+ "eval_type": "rule",
1581
+ "num_demo": 1,
1582
+ "num_query": 14
1583
+ },
1584
+ {
1585
+ "name": "autorater_artifact_reason",
1586
+ "score": 0.6666666666666666,
1587
+ "eval_type": "rule",
1588
+ "num_demo": 0,
1589
+ "num_query": 15
1590
+ },
1591
+ {
1592
+ "name": "chess_puzzles_crushing",
1593
+ "score": 0.0,
1594
+ "eval_type": "rule",
1595
+ "num_demo": 1,
1596
+ "num_query": 14
1597
+ },
1598
+ {
1599
+ "name": "app_layout_understanding_amazon",
1600
+ "score": 0.5714285714285714,
1601
+ "eval_type": "rule",
1602
+ "num_demo": 1,
1603
+ "num_query": 14
1604
+ },
1605
+ {
1606
+ "name": "chess_puzzles_checkmate",
1607
+ "score": 0.0,
1608
+ "eval_type": "rule",
1609
+ "num_demo": 1,
1610
+ "num_query": 14
1611
+ },
1612
+ {
1613
+ "name": "app_layout_understanding_instagram",
1614
+ "score": 0.7142857142857143,
1615
+ "eval_type": "rule",
1616
+ "num_demo": 1,
1617
+ "num_query": 14
1618
+ },
1619
+ {
1620
+ "name": "chess_puzzles_equality",
1621
+ "score": 0.0,
1622
+ "eval_type": "rule",
1623
+ "num_demo": 1,
1624
+ "num_query": 15
1625
+ },
1626
+ {
1627
+ "name": "app_layout_understanding_zoom",
1628
+ "score": 0.6,
1629
+ "eval_type": "rule",
1630
+ "num_demo": 1,
1631
+ "num_query": 15
1632
+ },
1633
+ {
1634
+ "name": "app_layout_understanding_notes",
1635
+ "score": 0.5,
1636
+ "eval_type": "rule",
1637
+ "num_demo": 1,
1638
+ "num_query": 14
1639
+ },
1640
+ {
1641
+ "name": "app_layout_understanding_word",
1642
+ "score": 0.6428571428571429,
1643
+ "eval_type": "rule",
1644
+ "num_demo": 1,
1645
+ "num_query": 14
1646
+ },
1647
+ {
1648
+ "name": "app_layout_understanding_twitter",
1649
+ "score": 0.7142857142857143,
1650
+ "eval_type": "rule",
1651
+ "num_demo": 1,
1652
+ "num_query": 14
1653
+ },
1654
+ {
1655
+ "name": "app_layout_understanding_iphone_settings",
1656
+ "score": 0.8571428571428571,
1657
+ "eval_type": "rule",
1658
+ "num_demo": 1,
1659
+ "num_query": 14
1660
+ },
1661
+ {
1662
+ "name": "app_layout_understanding_youtube",
1663
+ "score": 0.7857142857142857,
1664
+ "eval_type": "rule",
1665
+ "num_demo": 1,
1666
+ "num_query": 14
1667
+ },
1668
+ {
1669
+ "name": "app_layout_understanding_leetcode",
1670
+ "score": 0.6428571428571429,
1671
+ "eval_type": "rule",
1672
+ "num_demo": 1,
1673
+ "num_query": 14
1674
+ },
1675
+ {
1676
+ "name": "app_layout_understanding_ppt",
1677
+ "score": 0.7142857142857143,
1678
+ "eval_type": "rule",
1679
+ "num_demo": 1,
1680
+ "num_query": 14
1681
+ },
1682
+ {
1683
+ "name": "app_layout_understanding_tiktok",
1684
+ "score": 0.8571428571428571,
1685
+ "eval_type": "rule",
1686
+ "num_demo": 1,
1687
+ "num_query": 14
1688
+ },
1689
+ {
1690
+ "name": "app_layout_understanding_alipay",
1691
+ "score": 0.8235294117647058,
1692
+ "eval_type": "rule",
1693
+ "num_demo": 1,
1694
+ "num_query": 17
1695
+ },
1696
+ {
1697
+ "name": "app_layout_understanding_excel",
1698
+ "score": 0.7142857142857143,
1699
+ "eval_type": "rule",
1700
+ "num_demo": 1,
1701
+ "num_query": 14
1702
+ },
1703
+ {
1704
+ "name": "ocr_resume_employer_plain",
1705
+ "score": 0.6428571428571429,
1706
+ "eval_type": "rule",
1707
+ "num_demo": 1,
1708
+ "num_query": 14
1709
+ },
1710
+ {
1711
+ "name": "ocr_article_journal",
1712
+ "score": 0.7857142857142857,
1713
+ "eval_type": "rule",
1714
+ "num_demo": 1,
1715
+ "num_query": 14
1716
+ },
1717
+ {
1718
+ "name": "ocr_resume_experience_plain",
1719
+ "score": 0.7142857142857143,
1720
+ "eval_type": "rule",
1721
+ "num_demo": 1,
1722
+ "num_query": 14
1723
+ },
1724
+ {
1725
+ "name": "ocr_table_to_markdown",
1726
+ "score": 0.9285714285714286,
1727
+ "eval_type": "rule",
1728
+ "num_demo": 1,
1729
+ "num_query": 14
1730
+ },
1731
+ {
1732
+ "name": "ocr_math_text_latex",
1733
+ "score": 0.42857142857142855,
1734
+ "eval_type": "rule",
1735
+ "num_demo": 1,
1736
+ "num_query": 14
1737
+ },
1738
+ {
1739
+ "name": "ocr_table_to_latex",
1740
+ "score": 0.7142857142857143,
1741
+ "eval_type": "rule",
1742
+ "num_demo": 1,
1743
+ "num_query": 14
1744
+ },
1745
+ {
1746
+ "name": "ocr_resume_school_plain",
1747
+ "score": 0.8571428571428571,
1748
+ "eval_type": "rule",
1749
+ "num_demo": 1,
1750
+ "num_query": 14
1751
+ },
1752
+ {
1753
+ "name": "ocr_article_authors",
1754
+ "score": 0.8214285714285714,
1755
+ "eval_type": "rule",
1756
+ "num_demo": 1,
1757
+ "num_query": 14
1758
+ },
1759
+ {
1760
+ "name": "ocr_table_to_html",
1761
+ "score": 0.7142857142857143,
1762
+ "eval_type": "rule",
1763
+ "num_demo": 1,
1764
+ "num_query": 14
1765
+ },
1766
+ {
1767
+ "name": "ocr_resume_skill_plain",
1768
+ "score": 0.5714285714285714,
1769
+ "eval_type": "rule",
1770
+ "num_demo": 1,
1771
+ "num_query": 14
1772
+ },
1773
+ {
1774
+ "name": "ocr_table_to_csv",
1775
+ "score": 0.6428571428571429,
1776
+ "eval_type": "rule",
1777
+ "num_demo": 1,
1778
+ "num_query": 14
1779
+ },
1780
+ {
1781
+ "name": "crossword_mini_5x5",
1782
+ "score": 0.6714285714285715,
1783
+ "eval_type": "rule",
1784
+ "num_demo": 1,
1785
+ "num_query": 14
1786
+ },
1787
+ {
1788
+ "name": "ocr_math_equation",
1789
+ "score": 0.42857142857142855,
1790
+ "eval_type": "rule",
1791
+ "num_demo": 1,
1792
+ "num_query": 14
1793
+ },
1794
+ {
1795
+ "name": "contain_repeat_length",
1796
+ "score": 0.4666666666666667,
1797
+ "eval_type": "rule",
1798
+ "num_demo": 0,
1799
+ "num_query": 15
1800
+ },
1801
+ {
1802
+ "name": "contain_position_length",
1803
+ "score": 0.8666666666666667,
1804
+ "eval_type": "rule",
1805
+ "num_demo": 0,
1806
+ "num_query": 15
1807
+ },
1808
+ {
1809
+ "name": "pictionary_skribbl_io",
1810
+ "score": 0.3,
1811
+ "eval_type": "rule",
1812
+ "num_demo": 1,
1813
+ "num_query": 20
1814
+ },
1815
+ {
1816
+ "name": "pictionary_doodle_guess",
1817
+ "score": 0.8,
1818
+ "eval_type": "rule",
1819
+ "num_demo": 1,
1820
+ "num_query": 15
1821
+ },
1822
+ {
1823
+ "name": "pictionary_genai_output_chinese",
1824
+ "score": 0.35714285714285715,
1825
+ "eval_type": "rule",
1826
+ "num_demo": 1,
1827
+ "num_query": 14
1828
+ },
1829
+ {
1830
+ "name": "pictionary_cartoon_drawing_guess",
1831
+ "score": 0.8571428571428571,
1832
+ "eval_type": "rule",
1833
+ "num_demo": 1,
1834
+ "num_query": 14
1835
+ },
1836
+ {
1837
+ "name": "contain_length",
1838
+ "score": 0.5333333333333333,
1839
+ "eval_type": "rule",
1840
+ "num_demo": 0,
1841
+ "num_query": 15
1842
+ },
1843
+ {
1844
+ "name": "pictionary_chinese_food_img2en",
1845
+ "score": 0.7857142857142857,
1846
+ "eval_type": "rule",
1847
+ "num_demo": 1,
1848
+ "num_query": 14
1849
+ },
1850
+ {
1851
+ "name": "contain_contain_length",
1852
+ "score": 0.9333333333333333,
1853
+ "eval_type": "rule",
1854
+ "num_demo": 0,
1855
+ "num_query": 15
1856
+ },
1857
+ {
1858
+ "name": "reward_models_i2t_reward",
1859
+ "score": 0.5714285714285714,
1860
+ "eval_type": "rule",
1861
+ "num_demo": 1,
1862
+ "num_query": 14
1863
+ },
1864
+ {
1865
+ "name": "memorization_chinese_celebrity",
1866
+ "score": 0.5714285714285714,
1867
+ "eval_type": "rule",
1868
+ "num_demo": 1,
1869
+ "num_query": 14
1870
+ },
1871
+ {
1872
+ "name": "memorization_papers",
1873
+ "score": 0.6,
1874
+ "eval_type": "rule",
1875
+ "num_demo": 1,
1876
+ "num_query": 15
1877
+ },
1878
+ {
1879
+ "name": "memorization_famous_treaty",
1880
+ "score": 0.8214285714285714,
1881
+ "eval_type": "rule",
1882
+ "num_demo": 1,
1883
+ "num_query": 14
1884
+ },
1885
+ {
1886
+ "name": "memorization_indian_celebrity",
1887
+ "score": 0.8214285714285714,
1888
+ "eval_type": "rule",
1889
+ "num_demo": 1,
1890
+ "num_query": 14
1891
+ },
1892
+ {
1893
+ "name": "research_website_parsing_blogpost",
1894
+ "score": 0.07142857142857142,
1895
+ "eval_type": "rule",
1896
+ "num_demo": 1,
1897
+ "num_query": 14
1898
+ },
1899
+ {
1900
+ "name": "research_website_parsing_publication",
1901
+ "score": 0.07142857142857142,
1902
+ "eval_type": "rule",
1903
+ "num_demo": 1,
1904
+ "num_query": 14
1905
+ },
1906
+ {
1907
+ "name": "research_website_parsing_homepage",
1908
+ "score": 0.21428571428571427,
1909
+ "eval_type": "rule",
1910
+ "num_demo": 1,
1911
+ "num_query": 14
1912
+ },
1913
+ {
1914
+ "name": "ascii_art_30",
1915
+ "score": 0.21428571428571427,
1916
+ "eval_type": "llm",
1917
+ "num_demo": 1,
1918
+ "num_query": 14
1919
+ },
1920
+ {
1921
+ "name": "humor_explanation",
1922
+ "score": 0.8533333333333335,
1923
+ "eval_type": "llm",
1924
+ "num_demo": 1,
1925
+ "num_query": 15
1926
+ },
1927
+ {
1928
+ "name": "science_figure_explanation",
1929
+ "score": 0.872413793103448,
1930
+ "eval_type": "llm",
1931
+ "num_demo": 1,
1932
+ "num_query": 29
1933
+ },
1934
+ {
1935
+ "name": "vibe_eval_phrase",
1936
+ "score": 0.7571428571428573,
1937
+ "eval_type": "llm",
1938
+ "num_demo": 1,
1939
+ "num_query": 14
1940
+ },
1941
+ {
1942
+ "name": "traffic_accident_analysis",
1943
+ "score": 0.6714285714285714,
1944
+ "eval_type": "llm",
1945
+ "num_demo": 1,
1946
+ "num_query": 14
1947
+ },
1948
+ {
1949
+ "name": "figurative_speech_explanation",
1950
+ "score": 0.8344827586206895,
1951
+ "eval_type": "llm",
1952
+ "num_demo": 1,
1953
+ "num_query": 29
1954
+ },
1955
+ {
1956
+ "name": "table2latex_complex",
1957
+ "score": 0.8111111111111111,
1958
+ "eval_type": "llm",
1959
+ "num_demo": 1,
1960
+ "num_query": 9
1961
+ },
1962
+ {
1963
+ "name": "unusual_images",
1964
+ "score": 0.889655172413793,
1965
+ "eval_type": "llm",
1966
+ "num_demo": 1,
1967
+ "num_query": 29
1968
+ },
1969
+ {
1970
+ "name": "art_explanation",
1971
+ "score": 0.4655172413793103,
1972
+ "eval_type": "llm",
1973
+ "num_demo": 1,
1974
+ "num_query": 29
1975
+ },
1976
+ {
1977
+ "name": "ocr_open_ended_qa",
1978
+ "score": 0.7965517241379308,
1979
+ "eval_type": "llm",
1980
+ "num_demo": 1,
1981
+ "num_query": 29
1982
+ },
1983
+ {
1984
+ "name": "bar_chart_interpretation",
1985
+ "score": 0.6655172413793102,
1986
+ "eval_type": "llm",
1987
+ "num_demo": 1,
1988
+ "num_query": 29
1989
+ },
1990
+ {
1991
+ "name": "scibench_w_solution_open_ended",
1992
+ "score": 0.618,
1993
+ "eval_type": "llm",
1994
+ "num_demo": 1,
1995
+ "num_query": 25
1996
+ },
1997
+ {
1998
+ "name": "GUI_Chat_Hard",
1999
+ "score": 0.4115384615384616,
2000
+ "eval_type": "llm",
2001
+ "num_demo": 1,
2002
+ "num_query": 26
2003
+ },
2004
+ {
2005
+ "name": "image_humor_understanding",
2006
+ "score": 0.882758620689655,
2007
+ "eval_type": "llm",
2008
+ "num_demo": 1,
2009
+ "num_query": 29
2010
+ },
2011
+ {
2012
+ "name": "defeasible_reasoning",
2013
+ "score": 0.8620689655172413,
2014
+ "eval_type": "llm",
2015
+ "num_demo": 1,
2016
+ "num_query": 29
2017
+ },
2018
+ {
2019
+ "name": "funny_image_title",
2020
+ "score": 0.6499999999999998,
2021
+ "eval_type": "llm",
2022
+ "num_demo": 1,
2023
+ "num_query": 14
2024
+ },
2025
+ {
2026
+ "name": "tweets_captioning",
2027
+ "score": 0.5499999999999999,
2028
+ "eval_type": "llm",
2029
+ "num_demo": 1,
2030
+ "num_query": 14
2031
+ },
2032
+ {
2033
+ "name": "graph_interpretation",
2034
+ "score": 0.8413793103448274,
2035
+ "eval_type": "llm",
2036
+ "num_demo": 1,
2037
+ "num_query": 29
2038
+ },
2039
+ {
2040
+ "name": "meme_explain",
2041
+ "score": 0.8857142857142858,
2042
+ "eval_type": "llm",
2043
+ "num_demo": 1,
2044
+ "num_query": 14
2045
+ },
2046
+ {
2047
+ "name": "guess_image_generation_prompt",
2048
+ "score": 0.8263157894736844,
2049
+ "eval_type": "llm",
2050
+ "num_demo": 1,
2051
+ "num_query": 19
2052
+ },
2053
+ {
2054
+ "name": "visualization_with_code",
2055
+ "score": 0.6714285714285715,
2056
+ "eval_type": "llm",
2057
+ "num_demo": 1,
2058
+ "num_query": 14
2059
+ },
2060
+ {
2061
+ "name": "iq_test_open_ended",
2062
+ "score": 0.6689655172413791,
2063
+ "eval_type": "llm",
2064
+ "num_demo": 1,
2065
+ "num_query": 29
2066
+ },
2067
+ {
2068
+ "name": "electrocardiogram",
2069
+ "score": 0.3857142857142857,
2070
+ "eval_type": "llm",
2071
+ "num_demo": 1,
2072
+ "num_query": 14
2073
+ },
2074
+ {
2075
+ "name": "image_captioning_with_additional_requirements",
2076
+ "score": 0.9285714285714287,
2077
+ "eval_type": "llm",
2078
+ "num_demo": 1,
2079
+ "num_query": 14
2080
+ },
2081
+ {
2082
+ "name": "docci_image_description_long",
2083
+ "score": 0.7928571428571428,
2084
+ "eval_type": "llm",
2085
+ "num_demo": 1,
2086
+ "num_query": 14
2087
+ },
2088
+ {
2089
+ "name": "GUI_Chat_Easy",
2090
+ "score": 0.5769230769230769,
2091
+ "eval_type": "llm",
2092
+ "num_demo": 1,
2093
+ "num_query": 26
2094
+ },
2095
+ {
2096
+ "name": "bridge_strategies_advanced",
2097
+ "score": 0.15714285714285717,
2098
+ "eval_type": "llm",
2099
+ "num_demo": 1,
2100
+ "num_query": 14
2101
+ },
2102
+ {
2103
+ "name": "bridge_strategies_worldclass",
2104
+ "score": 0.08571428571428572,
2105
+ "eval_type": "llm",
2106
+ "num_demo": 1,
2107
+ "num_query": 14
2108
+ },
2109
+ {
2110
+ "name": "bridge_strategies_expert",
2111
+ "score": 0.19999999999999998,
2112
+ "eval_type": "llm",
2113
+ "num_demo": 1,
2114
+ "num_query": 14
2115
+ },
2116
+ {
2117
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
2118
+ "score": 0.7428571428571429,
2119
+ "eval_type": "llm",
2120
+ "num_demo": 1,
2121
+ "num_query": 14
2122
+ },
2123
+ {
2124
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
2125
+ "score": 0.6857142857142857,
2126
+ "eval_type": "llm",
2127
+ "num_demo": 1,
2128
+ "num_query": 14
2129
+ },
2130
+ {
2131
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
2132
+ "score": 0.7357142857142858,
2133
+ "eval_type": "llm",
2134
+ "num_demo": 1,
2135
+ "num_query": 14
2136
+ },
2137
+ {
2138
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
2139
+ "score": 0.7000000000000001,
2140
+ "eval_type": "llm",
2141
+ "num_demo": 1,
2142
+ "num_query": 14
2143
+ },
2144
+ {
2145
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
2146
+ "score": 0.6857142857142857,
2147
+ "eval_type": "llm",
2148
+ "num_demo": 1,
2149
+ "num_query": 14
2150
+ },
2151
+ {
2152
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
2153
+ "score": 0.7285714285714285,
2154
+ "eval_type": "llm",
2155
+ "num_demo": 1,
2156
+ "num_query": 14
2157
+ },
2158
+ {
2159
+ "name": "table_understanding_fetaqa",
2160
+ "score": 0.5928571428571429,
2161
+ "eval_type": "llm",
2162
+ "num_demo": 1,
2163
+ "num_query": 14
2164
+ },
2165
+ {
2166
+ "name": "red_teaming_celebrity",
2167
+ "score": 0.8550000000000001,
2168
+ "eval_type": "llm",
2169
+ "num_demo": 0,
2170
+ "num_query": 20
2171
+ },
2172
+ {
2173
+ "name": "red_teaming_captcha",
2174
+ "score": 0.13157894736842107,
2175
+ "eval_type": "llm",
2176
+ "num_demo": 1,
2177
+ "num_query": 19
2178
+ },
2179
+ {
2180
+ "name": "red_teaming_jailbreak",
2181
+ "score": 0.5850000000000001,
2182
+ "eval_type": "llm",
2183
+ "num_demo": 0,
2184
+ "num_query": 20
2185
+ },
2186
+ {
2187
+ "name": "red_teaming_visualmisleading",
2188
+ "score": 0.8789473684210528,
2189
+ "eval_type": "llm",
2190
+ "num_demo": 1,
2191
+ "num_query": 19
2192
+ },
2193
+ {
2194
+ "name": "red_teaming_racial",
2195
+ "score": 0.7600000000000001,
2196
+ "eval_type": "llm",
2197
+ "num_demo": 0,
2198
+ "num_query": 20
2199
+ },
2200
+ {
2201
+ "name": "red_teaming_politics",
2202
+ "score": 0.695,
2203
+ "eval_type": "llm",
2204
+ "num_demo": 0,
2205
+ "num_query": 20
2206
+ }
2207
+ ]