update results & separate results organization

This view is limited to 50 files because it contains too many changes.
- app.py +2 -1
- constants.py +1 -1
- static/eval_results/Default/Aquila_VL_2B/summary_results.json +251 -0
- static/eval_results/Default/Aquila_VL_2B/task_results.json +0 -0
- static/eval_results/Default/Aria/summary_results.json +251 -0
- static/eval_results/Default/Aria/task_results.json +0 -0
- static/eval_results/Default/Claude_3.5/summary_results.json +251 -0
- static/eval_results/Default/Claude_3.5/task_results.json +0 -0
- static/eval_results/Default/Claude_3.5_new/summary_results.json +251 -0
- static/eval_results/Default/Claude_3.5_new/task_results.json +0 -0
- static/eval_results/Default/GPT_4o/summary_results.json +251 -0
- static/eval_results/Default/GPT_4o/task_results.json +0 -0
- static/eval_results/Default/GPT_4o_mini/summary_results.json +251 -0
- static/eval_results/Default/GPT_4o_mini/task_results.json +0 -0
- static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json +251 -0
- static/eval_results/Default/Gemini_1.5_flash_002/task_results.json +0 -0
- static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json +251 -0
- static/eval_results/Default/Gemini_1.5_pro_002/task_results.json +0 -0
- static/eval_results/Default/Idefics3/summary_results.json +251 -0
- static/eval_results/Default/Idefics3/task_results.json +0 -0
- static/eval_results/Default/InternVL2_2B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_2B/task_results.json +0 -0
- static/eval_results/Default/InternVL2_5_2B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_5_2B/task_results.json +0 -0
- static/eval_results/Default/InternVL2_5_78B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_5_78B/task_results.json +0 -0
- static/eval_results/Default/InternVL2_76B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_76B/task_results.json +0 -0
- static/eval_results/Default/InternVL2_8B/summary_results.json +251 -0
- static/eval_results/Default/InternVL2_8B/task_results.json +0 -0
- static/eval_results/Default/Llama_3_2_11B/summary_results.json +251 -0
- static/eval_results/Default/Llama_3_2_11B/task_results.json +0 -0
- static/eval_results/Default/Mammoth_VL/summary_results.json +251 -0
- static/eval_results/Default/Mammoth_VL/task_results.json +0 -0
- static/eval_results/Default/MiniCPM_v2.6/summary_results.json +251 -0
- static/eval_results/Default/MiniCPM_v2.6/task_results.json +0 -0
- static/eval_results/Default/NVLM/summary_results.json +251 -0
- static/eval_results/Default/NVLM/task_results.json +0 -0
- static/eval_results/Default/Phi-3.5-vision/summary_results.json +251 -0
- static/eval_results/Default/Phi-3.5-vision/task_results.json +0 -0
- static/eval_results/Default/Pixtral_12B/summary_results.json +251 -0
- static/eval_results/Default/Pixtral_12B/task_results.json +0 -0
- static/eval_results/Default/Qwen2_VL_2B/summary_results.json +251 -0
- static/eval_results/Default/Qwen2_VL_2B/task_results.json +0 -0
- static/eval_results/Default/Qwen2_VL_72B/summary_results.json +251 -0
- static/eval_results/Default/Qwen2_VL_72B/task_results.json +0 -0
- static/eval_results/Default/Qwen2_VL_7B/summary_results.json +251 -0
- static/eval_results/Default/Qwen2_VL_7B/task_results.json +0 -0
- static/eval_results/Default/all_model_keywords_stats.json +0 -0
- static/eval_results/Default/all_summary.json +0 -525
app.py
CHANGED
@@ -55,7 +55,8 @@ with gr.Blocks() as block:
     )
 
     # Define different captions for each table
-    default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> $\\text{Overall} \\ = \\ \\frac{\\text{Core} \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$ "
+    default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> Different from the results in our paper, we only use the Core results with CoT prompting here for clarity and compatibility with the released data. <br> $\\text{Overall} \\ = \\ \\frac{\\text{Core} \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$ "
+
     single_image_caption = "**Table 2: MEGA-Bench Single-image setting results.** The number in the parentheses is the number of tasks in each keyword. <br> This subset contains 273 single-image tasks from the Core set and 42 single-image tasks from the Open-ended set. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. <br> Compared to the default table, some models with only single-image support are added."
 
     caption_component = gr.Markdown(
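The `Overall` metric in this caption is a task-count-weighted average of the Core and Open-ended macro scores. A minimal sketch in plain Python (reading one of the summary files added in this commit; the variable names are mine) that reproduces the stored `overall_score`:

```python
# Recompute the "Overall" score defined in default_caption from one of the
# per-model summary files added in this commit.
import json

N_CORE, N_OPEN = 440, 65  # task counts stated in the caption

with open("static/eval_results/Default/Aquila_VL_2B/summary_results.json") as f:
    summary = json.load(f)["model_summary"]

core = summary["core"]["macro_mean_score"]        # rule-based metrics
open_ended = summary["open"]["macro_mean_score"]  # VLM-judge (GPT-4o-0806) scores

overall = (core * N_CORE + open_ended * N_OPEN) / (N_CORE + N_OPEN)
print(overall)                   # 0.17100157004197775
print(summary["overall_score"])  # matches the stored value exactly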
constants.py
CHANGED
@@ -28,7 +28,7 @@ We aim to provide cost-effective and accurate evaluation for multimodal models,
 
 ## 📊🔍 Results & Takeaways from Evaluating Top Models
 
-- GPT-4o (0513) and Claude 3.5 Sonnet (1022) lead the benchmark. Claude 3.5 Sonnet (1022) improves over Claude 3.5 Sonnet (…
+- GPT-4o (0513) and Claude 3.5 Sonnet (1022) lead the benchmark. Claude 3.5 Sonnet (1022) improves over Claude 3.5 Sonnet (0620) obviously in planning tasks (application dimension) and UI/Infographics inputs (input format dimension).
 - Qwen2-VL stands out among open-source models, and its flagship model gets close to some proprietary flagship models
 - Chain-of-Thought (CoT) prompting improves proprietary models but has limited impact on open-source models
 - Gemini 1.5 Flash performs the best among all the evaluated efficiency models, but struggles with UI and document tasks
static/eval_results/Default/Aquila_VL_2B/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.159970161379836, "micro_mean_score": 0.15844711671722148 },
    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.24567572098570653, "micro_mean_score": 0.2704213241616509 },
    "overall_score": 0.17100157004197775
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.1796551584774396 },
      "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.1263506560912463 },
      "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.1775085349123463 },
      "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.2114933522881099 },
      "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.16251700109869488 },
      "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.26453155444796583 },
      "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.3729498746867168 },
      "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.19090788408036002 },
      "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.16500679466160564 },
      "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.03972686819521137 }
    },
    "input_format": {
      "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.07035116566014021 },
      "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.11915109312705179 },
      "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.18915652635850314 },
      "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.21939978337316163 },
      "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.17643260913333875 },
      "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.2438396314831894 },
      "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.08989401697906672 }
    },
    "output_format": {
      "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.12241197113963243 },
      "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.10758402844431432 },
      "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.19372082302321905 },
      "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.19201243810115767 },
      "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.23278612647548963 },
      "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.21664527852608348 }
    },
    "input_num": {
      "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.12138133030990172 },
      "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.01221681479628382 },
      "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.17994400163273605 },
      "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.21939978337316163 },
      "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.18212149746318507 },
      "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.21563163558700174 }
    },
    "app": {
      "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.0981320856519089 },
      "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.0557399538308785 },
      "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.1351126472094214 },
      "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.2025034827431662 },
      "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.29326275059361956 },
      "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.22529225586731416 },
      "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.23810497886903373 },
      "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.17867138975396438 }
    }
  }
}

static/eval_results/Default/Aquila_VL_2B/task_results.json
ADDED
The diff for this file is too large to render. See raw diff
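Every entry under `keyword_stats` maps a keyword to `count`, `num_samples`, an (empty here) `tasks` list, and `average_score`. A small sketch, assuming the directory layout of this commit, that ranks one model's application-dimension keywords by score:

```python
# Rank the "app" (application dimension) keywords for one model
# from the keyword_stats block of its summary file.
import json

path = "static/eval_results/Default/Aquila_VL_2B/summary_results.json"
with open(path) as f:
    apps = json.load(f)["keyword_stats"]["app"]

for name, stats in sorted(apps.items(), key=lambda kv: -kv[1]["average_score"]):
    print(f"{name:<25} {stats['count']:>3} tasks  {stats['average_score']:.3f}")
```

For Aquila_VL_2B this surfaces the pattern visible in the raw numbers: Metrics and Knowledge score highest, while Planning is by far the weakest application category.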
static/eval_results/Default/Aria/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.289073788209904, "micro_mean_score": 0.2859007507765791 },
    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5103725263180767, "micro_mean_score": 0.5349957007738607 },
    "overall_score": 0.31755778420402525
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.3153649050553317 },
      "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.34425736922415495 },
      "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.3921740378709932 },
      "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.37623282710622424 },
      "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.271674311347156 },
      "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.46313777834281344 },
      "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.5692180451127821 },
      "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.3152064038837139 },
      "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.23851147782276536 },
      "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.11246568298589892 }
    },
    "input_format": {
      "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.28561724084490353 },
      "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.2505346698796475 },
      "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.3040414715952029 },
      "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.41865640360591405 },
      "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.3622713579911698 },
      "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.35872259826035346 },
      "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.1509096092007215 }
    },
    "output_format": {
      "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.2846987779732631 },
      "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.2899384042262363 },
      "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.27412885527802433 },
      "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.3117275816801635 },
      "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.4523860109667709 },
      "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.310055869988487 }
    },
    "input_num": {
      "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.18301681783824644 },
      "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.26651659725352617 },
      "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.34236220565522313 },
      "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.41865640360591405 },
      "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.19142683154129833 },
      "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.2596336265133595 }
    },
    "app": {
      "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.3929243812973524 },
      "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.1403503245041943 },
      "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.25367910605102256 },
      "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.3494812758481046 },
      "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.3662927672998609 },
      "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.28616079233761366 },
      "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.3953949223279651 },
      "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.26097385403450996 }
    }
  }
}

static/eval_results/Default/Aria/task_results.json
ADDED
The diff for this file is too large to render. See raw diff
static/eval_results/Default/Claude_3.5/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.5040975742801586, "micro_mean_score": 0.5002259116666758 },
    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.6373907158949892, "micro_mean_score": 0.6569647463456579 },
    "overall_score": 0.5212541172602853
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.5405089647404562 },
      "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.6082834220752651 },
      "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.5745077617490254 },
      "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.5450038475783499 },
      "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.4767692987630454 },
      "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5756126284078804 },
      "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6969774436090224 },
      "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.5278843049497918 },
      "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.4082144793870471 },
      "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.23803578664609892 }
    },
    "input_format": {
      "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.5691641481808987 },
      "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.4795267886975966 },
      "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.525848282456283 },
      "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.508735695828719 },
      "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5699094130430454 },
      "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.5096772701625744 },
      "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.4429640420975014 }
    },
    "output_format": {
      "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.5066797418318023 },
      "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.4971460788134188 },
      "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.5278127103234661 },
      "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.4490020843308984 },
      "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.5838224169821388 },
      "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.5456152399978661 }
    },
    "input_num": {
      "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.46300075585789874 },
      "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.5414381873407914 },
      "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.5373019912310933 },
      "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.508735695828719 },
      "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.4422556748863689 },
      "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.49311554035078103 }
    },
    "app": {
      "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.6663170946790707 },
      "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.3382015835012861 },
      "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.5194010220575684 },
      "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.532329797132399 },
      "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.5808831682303479 },
      "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.513474611293123 },
      "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.5507075880782885 },
      "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.47461998432626556 }
    }
  }
}

static/eval_results/Default/Claude_3.5/task_results.json
ADDED
The diff for this file is too large to render. See raw diff
static/eval_results/Default/Claude_3.5_new/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.5259191914020757, "micro_mean_score": 0.5230785894131227 },
    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.6563419761104125, "micro_mean_score": 0.6724419604471196 },
    "overall_score": 0.5427062825031487
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.5690045172520449 },
      "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.6220681231036606 },
      "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.6077980666415158 },
      "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.5511440615639541 },
      "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.4885536652013625 },
      "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5908204006544897 },
      "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6569473684210526 },
      "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.5486763511384175 },
      "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.4315385951907387 },
      "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.2909419331017877 }
    },
    "input_format": {
      "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.6048192628845258 },
      "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.48924295292319175 },
      "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.556418710368288 },
      "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4946691340754988 },
      "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5558756390298104 },
      "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.5425198547046186 },
      "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.44210335381541843 }
    },
    "output_format": {
      "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.5187252051932875 },
      "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.5071121107460066 },
      "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.5387340524651681 },
      "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.4824302644151348 },
      "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.6242798397166945 },
      "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.5782691045270721 }
    },
    "input_num": {
      "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.4630277507828528 },
      "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.5914338446093256 },
      "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.5636254729390459 },
      "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4946691340754988 },
      "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.4828123870640382 },
      "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.48756636014597515 }
    },
    "app": {
      "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.6590137441693218 },
      "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.39901670035164916 },
      "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.5166853031535193 },
      "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.5561634744977417 },
      "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.6123769274172342 },
      "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.5512015158810595 },
      "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.565796566886933 },
      "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.4763267502912362 }
    }
  }
}

static/eval_results/Default/Claude_3.5_new/task_results.json
ADDED
The diff for this file is too large to render. See raw diff
static/eval_results/Default/GPT_4o/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.5265030595065238, "micro_mean_score": 0.5236338521693411 },
    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.6478225794744895, "micro_mean_score": 0.665391229578676 },
    "overall_score": 0.5421184432647768
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.5630758211022604 },
      "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.6216411634729735 },
      "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.616018277142757 },
      "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.5823101249498799 },
      "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.44177544539510955 },
      "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.6345458069232931 },
      "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6795263157894738 },
      "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.5514924675940659 },
      "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.39435038953269674 },
      "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.22934807257231926 }
    },
    "input_format": {
      "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.608083455060831 },
      "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.491325251564869 },
      "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.4999089647103332 },
      "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.5315979872161023 },
      "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5641404607063637 },
      "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.5613545677222056 },
      "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.47760591698367955 }
    },
    "output_format": {
      "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.5388690453811203 },
      "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.48037685656449847 },
      "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.5994159671881645 },
      "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.44606605087301393 },
      "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.6274371950293718 },
      "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.5448877153826162 }
    },
    "input_num": {
      "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.4751133786848073 },
      "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.5343350103400748 },
      "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.5672657028463585 },
      "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.5315979872161023 },
      "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.4500928191484624 },
      "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.4908653289106883 }
    },
    "app": {
      "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.7056027785545881 },
      "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.33202130899313653 },
      "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.5032849161169843 },
      "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.5510350848991218 },
      "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.6095778863474799 },
      "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.5283797185155754 },
      "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.6135723164021851 },
      "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.44047720383044436 }
    }
  }
}

static/eval_results/Default/GPT_4o/task_results.json
ADDED
The diff for this file is too large to render. See raw diff
static/eval_results/Default/GPT_4o_mini/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.40767494558789397, "micro_mean_score": 0.40431644154143376 },
    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.586537827213665, "micro_mean_score": 0.6133276010318144 },
    "overall_score": 0.43069690064863675
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.4492982787524939 },
      "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.49026056071002017 },
      "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.5168957112681365 },
      "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.46731791428406805 },
      "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.3406008235342885 },
      "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5572925295284307 },
      "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6902380952380953 },
      "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.4189154010048976 },
      "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.2943206715105082 },
      "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.19422793560945503 }
    },
    "input_format": {
      "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.47202628409684394 },
      "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.3624496929166193 },
      "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.38946844562183286 },
      "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.45508480503584553 },
      "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.47569921440672464 },
      "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.465175334092545 },
      "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.29410984789062117 }
    },
    "output_format": {
      "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.41242028190533997 },
      "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.3906415365938764 },
      "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.44244772638735347 },
      "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.3629944944697668 },
      "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.5713834131825314 },
      "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.39874839531459466 }
    },
    "input_num": {
      "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.3359977324263039 },
      "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.4305788513381019 },
      "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.46343334374251277 },
      "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.45508480503584553 },
      "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.24651576711552803 },
      "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.36981497185070983 }
    },
    "app": {
      "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.5666618234843734 },
      "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.2420320329702607 },
      "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.3458483931206892 },
      "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.43590838051817093 },
      "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.5176671720617656 },
      "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.3554299482098288 },
      "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.5399167524341886 },
      "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.32918280841495845 }
    }
  }
}

static/eval_results/Default/GPT_4o_mini/task_results.json
ADDED
The diff for this file is too large to render. See raw diff
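All model directories share the same `summary_results.json` schema, so the leaderboard ordering can be rebuilt from the committed files alone. A sketch under that assumption:

```python
# Rebuild the leaderboard ordering from the per-model summaries
# committed under static/eval_results/Default/.
import json
import pathlib

rows = []
for f in pathlib.Path("static/eval_results/Default").glob("*/summary_results.json"):
    overall = json.loads(f.read_text())["model_summary"]["overall_score"]
    rows.append((f.parent.name, overall))

for model, score in sorted(rows, key=lambda r: -r[1]):
    print(f"{model:<22} {score:.4f}")
```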
static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": {"num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.4189319021967416, "micro_mean_score": 0.41567515414375245},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5691365176285039, "micro_mean_score": 0.5987532244196045},
    "overall_score": 0.4382651695295427
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": {"count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.46355333176347063},
      "Text Recognition (OCR)": {"count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.4431807648811706},
      "Language Understanding and Generation": {"count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.4975887290434539},
      "Scene and Event Understanding": {"count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.49409642663278297},
      "Mathematical and Logical Reasoning": {"count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.38033540105052427},
      "Commonsense and Social Reasoning": {"count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5621166766717235},
      "Ethical and Safety Reasoning": {"count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6570726817042606},
      "Domain-Specific Knowledge and Skills": {"count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.4480877005302385},
      "Spatial and Temporal Reasoning": {"count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.3338006749329557},
      "Planning and Decision Making": {"count": 37, "num_samples": 577, "tasks": [], "average_score": 0.16197013296986068}
    },
    "input_format": {
      "User Interface Screenshots": {"count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.3971534837718938},
      "Text-Based Images and Documents": {"count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.3448204918940882},
      "Diagrams and Data Visualizations": {"count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.43525833484767545},
      "Videos": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4837362543956792},
      "Artistic and Creative Content": {"count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5111257660425502},
      "Photographs": {"count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.49366013155105076},
      "3D Models and Aerial Imagery": {"count": 11, "num_samples": 169, "tasks": [], "average_score": 0.4001983820478609}
    },
    "output_format": {
      "contextual_formatted_text": {"count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.386988040250785},
      "structured_output": {"count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.3884226428206387},
      "exact_text": {"count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.4425893080900246},
      "numerical_data": {"count": 49, "num_samples": 862, "tasks": [], "average_score": 0.42223626366392253},
      "open_ended_output": {"count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.5390305634303021},
      "multiple_choice": {"count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.472066557554629}
    },
    "input_num": {
      "6-8 images": {"count": 21, "num_samples": 314, "tasks": [], "average_score": 0.3666950113378685},
      "9-image or more": {"count": 41, "num_samples": 623, "tasks": [], "average_score": 0.44571360028283974},
      "1-image": {"count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.45400479933257654},
      "video": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4837362543956792},
      "4-5 images": {"count": 34, "num_samples": 520, "tasks": [], "average_score": 0.35161402777057993},
      "2-3 images": {"count": 51, "num_samples": 802, "tasks": [], "average_score": 0.3839609821519984}
    },
    "app": {
      "Information_Extraction": {"count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.4822341581959653},
      "Planning": {"count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.26434115361219657},
      "Coding": {"count": 31, "num_samples": 474, "tasks": [], "average_score": 0.3677547363031234},
      "Perception": {"count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.4640301382180305},
      "Metrics": {"count": 20, "num_samples": 309, "tasks": [], "average_score": 0.5348199655361041},
      "Science": {"count": 29, "num_samples": 574, "tasks": [], "average_score": 0.4890240042560499},
      "Knowledge": {"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.5126038207415967},
      "Mathematics": {"count": 33, "num_samples": 547, "tasks": [], "average_score": 0.384818434165593}
    }
  }
}
static/eval_results/Default/Gemini_1.5_flash_002/task_results.json
ADDED
The diff for this file is too large to render.
See raw diff
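Since each model directory under static/eval_results/Default/ now carries a summary_results.json with this schema, a ranking can be built by globbing them. A sketch of that aggregation (illustrative only; the Space's actual app.py may do this differently):

```python
import glob
import json
import os

# Collect (model, overall, core macro, open macro) for every model summary
# added under static/eval_results/Default/ and print a simple leaderboard.
rows = []
for path in glob.glob("static/eval_results/Default/*/summary_results.json"):
    model = os.path.basename(os.path.dirname(path))  # e.g. "Gemini_1.5_flash_002"
    with open(path) as f:
        ms = json.load(f)["model_summary"]
    rows.append((model, ms["overall_score"],
                 ms["core"]["macro_mean_score"], ms["open"]["macro_mean_score"]))

for model, overall, core, open_ in sorted(rows, key=lambda r: r[1], reverse=True):
    print(f"{model:24s} overall={overall:.4f} core={core:.4f} open={open_:.4f}")
```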
static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": {"num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.4822473962867704, "micro_mean_score": 0.4764805563057179},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5858190649927173, "micro_mean_score": 0.6104901117798793},
    "overall_score": 0.4955784031499121
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": {"count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.5202055934299538},
      "Text Recognition (OCR)": {"count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.5017043129027509},
      "Language Understanding and Generation": {"count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.5532599716027446},
      "Scene and Event Understanding": {"count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.546753787203128},
      "Mathematical and Logical Reasoning": {"count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.425969084163906},
      "Commonsense and Social Reasoning": {"count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5751012914154264},
      "Ethical and Safety Reasoning": {"count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6982330827067671},
      "Domain-Specific Knowledge and Skills": {"count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.513647745999633},
      "Spatial and Temporal Reasoning": {"count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.3845337030093212},
      "Planning and Decision Making": {"count": 37, "num_samples": 577, "tasks": [], "average_score": 0.23899503258223884}
    },
    "input_format": {
      "User Interface Screenshots": {"count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.4625032188638111},
      "Text-Based Images and Documents": {"count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.4292353723689881},
      "Diagrams and Data Visualizations": {"count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.4869625906903554},
      "Videos": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.5028718355967439},
      "Artistic and Creative Content": {"count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5584779204331461},
      "Photographs": {"count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.55005349042813},
      "3D Models and Aerial Imagery": {"count": 11, "num_samples": 169, "tasks": [], "average_score": 0.4292127751495457}
    },
    "output_format": {
      "contextual_formatted_text": {"count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.44896309957892694},
      "structured_output": {"count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.44418591808616864},
      "exact_text": {"count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.5146447350354234},
      "numerical_data": {"count": 49, "num_samples": 862, "tasks": [], "average_score": 0.4688623462674191},
      "open_ended_output": {"count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.5580414823700747},
      "multiple_choice": {"count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.5538255562099124}
    },
    "input_num": {
      "6-8 images": {"count": 21, "num_samples": 314, "tasks": [], "average_score": 0.39066515495086923},
      "9-image or more": {"count": 41, "num_samples": 623, "tasks": [], "average_score": 0.5370278962809547},
      "1-image": {"count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.5034399620483027},
      "video": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.5028718355967439},
      "4-5 images": {"count": 34, "num_samples": 520, "tasks": [], "average_score": 0.4885398161821004},
      "2-3 images": {"count": 51, "num_samples": 802, "tasks": [], "average_score": 0.45544217378728585}
    },
    "app": {
      "Information_Extraction": {"count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.5421439953094952},
      "Planning": {"count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.3335324339429373},
      "Coding": {"count": 31, "num_samples": 474, "tasks": [], "average_score": 0.43465181771633377},
      "Perception": {"count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.5250631828331306},
      "Metrics": {"count": 20, "num_samples": 309, "tasks": [], "average_score": 0.5821004797173627},
      "Science": {"count": 29, "num_samples": 574, "tasks": [], "average_score": 0.5124355410095621},
      "Knowledge": {"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.5722329455291694},
      "Mathematics": {"count": 33, "num_samples": 547, "tasks": [], "average_score": 0.41210885517904977}
    }
  }
}
static/eval_results/Default/Gemini_1.5_pro_002/task_results.json
ADDED
The diff for this file is too large to render.
See raw diff
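The keyword_stats block also supports per-dimension comparisons across models, e.g. which model leads each "app" category (Planning, Coding, Perception, and so on). A sketch under the same path assumptions as above, again illustrative rather than the leaderboard's own rendering code:

```python
import glob
import json
import os
from collections import defaultdict

# Pivot one keyword_stats dimension (here "app") across all model summaries.
table = defaultdict(dict)  # category -> {model: average_score}
for path in glob.glob("static/eval_results/Default/*/summary_results.json"):
    model = os.path.basename(os.path.dirname(path))
    with open(path) as f:
        stats = json.load(f)["keyword_stats"]["app"]
    for category, entry in stats.items():
        table[category][model] = entry["average_score"]

for category, scores in sorted(table.items()):
    best = max(scores, key=scores.get)
    print(f"{category:24s} best: {best} ({scores[best]:.3f})")
```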
static/eval_results/Default/Idefics3/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": {"num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.08956972487602757, "micro_mean_score": 0.08982225274252693},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.3210866162255635, "micro_mean_score": 0.35649183147033553},
    "overall_score": 0.11936892871309657
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": {"count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.123378776179585},
      "Text Recognition (OCR)": {"count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.09602065544451607},
      "Language Understanding and Generation": {"count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.1661543932339007},
      "Scene and Event Understanding": {"count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.13018902877020821},
      "Mathematical and Logical Reasoning": {"count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.11200133210641629},
      "Commonsense and Social Reasoning": {"count": 51, "num_samples": 855, "tasks": [], "average_score": 0.1837120314657304},
      "Ethical and Safety Reasoning": {"count": 15, "num_samples": 245, "tasks": [], "average_score": 0.2364085213032582},
      "Domain-Specific Knowledge and Skills": {"count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.15239546294916975},
      "Spatial and Temporal Reasoning": {"count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.08255834173646705},
      "Planning and Decision Making": {"count": 37, "num_samples": 577, "tasks": [], "average_score": 0.03149369112824262}
    },
    "input_format": {
      "User Interface Screenshots": {"count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.06151607584357764},
      "Text-Based Images and Documents": {"count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.10124344675801887},
      "Diagrams and Data Visualizations": {"count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.14147248511867794},
      "Videos": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.15942387460900312},
      "Artistic and Creative Content": {"count": 32, "num_samples": 541, "tasks": [], "average_score": 0.17458268378399872},
      "Photographs": {"count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.13442937440893113},
      "3D Models and Aerial Imagery": {"count": 11, "num_samples": 169, "tasks": [], "average_score": 0.02766884416043467}
    },
    "output_format": {
      "contextual_formatted_text": {"count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.15513016850044997},
      "structured_output": {"count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.03757596375966502},
      "exact_text": {"count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.05386631116442094},
      "numerical_data": {"count": 49, "num_samples": 862, "tasks": [], "average_score": 0.0760949224506388},
      "open_ended_output": {"count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.2987797010800956},
      "multiple_choice": {"count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.10403841600436024}
    },
    "input_num": {
      "6-8 images": {"count": 21, "num_samples": 314, "tasks": [], "average_score": 0.0661753590325019},
      "9-image or more": {"count": 41, "num_samples": 623, "tasks": [], "average_score": 0.09190674791720088},
      "1-image": {"count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.12345439179884048},
      "video": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.15942387460900312},
      "4-5 images": {"count": 34, "num_samples": 520, "tasks": [], "average_score": 0.11382786944230487},
      "2-3 images": {"count": 51, "num_samples": 802, "tasks": [], "average_score": 0.10803808254834846}
    },
    "app": {
      "Information_Extraction": {"count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.11450308988278819},
      "Planning": {"count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.04671278220005028},
      "Coding": {"count": 31, "num_samples": 474, "tasks": [], "average_score": 0.0978814644137225},
      "Perception": {"count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.13283830731528018},
      "Metrics": {"count": 20, "num_samples": 309, "tasks": [], "average_score": 0.09697463995668018},
      "Science": {"count": 29, "num_samples": 574, "tasks": [], "average_score": 0.1840497279921703},
      "Knowledge": {"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.1605667124060194},
      "Mathematics": {"count": 33, "num_samples": 547, "tasks": [], "average_score": 0.09835465288235297}
    }
  }
}
static/eval_results/Default/Idefics3/task_results.json
ADDED
The diff for this file is too large to render.
See raw diff
static/eval_results/Default/InternVL2_2B/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": {"num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.13141974398938763, "micro_mean_score": 0.13063500716262516},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.23864417043743646, "micro_mean_score": 0.24901117798796224},
    "overall_score": 0.14522090778963154
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": {"count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.14491178903291552},
      "Text Recognition (OCR)": {"count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.12126906675624163},
      "Language Understanding and Generation": {"count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.16912754929321935},
      "Scene and Event Understanding": {"count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.18542274192083463},
      "Mathematical and Logical Reasoning": {"count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.13923308734553164},
      "Commonsense and Social Reasoning": {"count": 51, "num_samples": 855, "tasks": [], "average_score": 0.23992252224543772},
      "Ethical and Safety Reasoning": {"count": 15, "num_samples": 245, "tasks": [], "average_score": 0.3420927318295739},
      "Domain-Specific Knowledge and Skills": {"count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.14807577209152425},
      "Spatial and Temporal Reasoning": {"count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.13036555933925006},
      "Planning and Decision Making": {"count": 37, "num_samples": 577, "tasks": [], "average_score": 0.01727799227799228}
    },
    "input_format": {
      "User Interface Screenshots": {"count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.057021136657850864},
      "Text-Based Images and Documents": {"count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.10504085961245285},
      "Diagrams and Data Visualizations": {"count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.1625198552182714},
      "Videos": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.18999779001767986},
      "Artistic and Creative Content": {"count": 32, "num_samples": 541, "tasks": [], "average_score": 0.1487677475708977},
      "Photographs": {"count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.2011727338536935},
      "3D Models and Aerial Imagery": {"count": 11, "num_samples": 169, "tasks": [], "average_score": 0.11886936592818943}
    },
    "output_format": {
      "contextual_formatted_text": {"count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.1131404778887607},
      "structured_output": {"count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.05739750616837997},
      "exact_text": {"count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.15465451663650032},
      "numerical_data": {"count": 49, "num_samples": 862, "tasks": [], "average_score": 0.16044698450090833},
      "open_ended_output": {"count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.21429521387724249},
      "multiple_choice": {"count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.2128614316540013}
    },
    "input_num": {
      "6-8 images": {"count": 21, "num_samples": 314, "tasks": [], "average_score": 0.03658352229780801},
      "9-image or more": {"count": 41, "num_samples": 623, "tasks": [], "average_score": 0.05757839721254354},
      "1-image": {"count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.15225683687839608},
      "video": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.18999779001767986},
      "4-5 images": {"count": 34, "num_samples": 520, "tasks": [], "average_score": 0.17677460549936644},
      "2-3 images": {"count": 51, "num_samples": 802, "tasks": [], "average_score": 0.158165588340436}
    },
    "app": {
      "Information_Extraction": {"count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.08722661966805},
      "Planning": {"count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.04102853815875594},
      "Coding": {"count": 31, "num_samples": 474, "tasks": [], "average_score": 0.11264043251709285},
      "Perception": {"count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.17001758160301803},
      "Metrics": {"count": 20, "num_samples": 309, "tasks": [], "average_score": 0.3332891958712894},
      "Science": {"count": 29, "num_samples": 574, "tasks": [], "average_score": 0.1686125516807394},
      "Knowledge": {"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.21169137106199268},
      "Mathematics": {"count": 33, "num_samples": 547, "tasks": [], "average_score": 0.10975764217070672}
    }
  }
}
static/eval_results/Default/InternVL2_2B/task_results.json
ADDED
The diff for this file is too large to render.
See raw diff
static/eval_results/Default/InternVL2_5_2B/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": {"num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.17806821966478364, "micro_mean_score": 0.17708809739236367},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.2738430375585404, "micro_mean_score": 0.2905417024935512},
    "overall_score": 0.19039567147289096
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": {"count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.19614682488147464},
      "Text Recognition (OCR)": {"count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.18910947570579717},
      "Language Understanding and Generation": {"count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.20543964378430513},
      "Scene and Event Understanding": {"count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.23636598588530347},
      "Mathematical and Logical Reasoning": {"count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.15691382827270517},
      "Commonsense and Social Reasoning": {"count": 51, "num_samples": 855, "tasks": [], "average_score": 0.28604169870255614},
      "Ethical and Safety Reasoning": {"count": 15, "num_samples": 245, "tasks": [], "average_score": 0.4248446115288219},
      "Domain-Specific Knowledge and Skills": {"count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.18745928331343714},
      "Spatial and Temporal Reasoning": {"count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.15097551654513372},
      "Planning and Decision Making": {"count": 37, "num_samples": 577, "tasks": [], "average_score": 0.030568378443583684}
    },
    "input_format": {
      "User Interface Screenshots": {"count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.13898447520398388},
      "Text-Based Images and Documents": {"count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.13154711942685113},
      "Diagrams and Data Visualizations": {"count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.18343540213068474},
      "Videos": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.20755556526976354},
      "Artistic and Creative Content": {"count": 32, "num_samples": 541, "tasks": [], "average_score": 0.15983467048343838},
      "Photographs": {"count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.26888883087046195},
      "3D Models and Aerial Imagery": {"count": 11, "num_samples": 169, "tasks": [], "average_score": 0.12906517409932386}
    },
    "output_format": {
      "contextual_formatted_text": {"count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.14702422379343882},
      "structured_output": {"count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.15324148486802894},
      "exact_text": {"count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.19977956414542175},
      "numerical_data": {"count": 49, "num_samples": 862, "tasks": [], "average_score": 0.1665590610582109},
      "open_ended_output": {"count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.2529339759528222},
      "multiple_choice": {"count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.23420071687554841}
    },
    "input_num": {
      "6-8 images": {"count": 21, "num_samples": 314, "tasks": [], "average_score": 0.09651832955404382},
      "9-image or more": {"count": 41, "num_samples": 623, "tasks": [], "average_score": 0.0784280378818194},
      "1-image": {"count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.21260786581183966},
      "video": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.20755556526976354},
      "4-5 images": {"count": 34, "num_samples": 520, "tasks": [], "average_score": 0.138285387531761},
      "2-3 images": {"count": 51, "num_samples": 802, "tasks": [], "average_score": 0.20214332169825855}
    },
    "app": {
      "Information_Extraction": {"count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.18128339685489062},
      "Planning": {"count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.053153113565753},
      "Coding": {"count": 31, "num_samples": 474, "tasks": [], "average_score": 0.12416116984428181},
      "Perception": {"count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.22449772657901465},
      "Metrics": {"count": 20, "num_samples": 309, "tasks": [], "average_score": 0.3762336977650326},
      "Science": {"count": 29, "num_samples": 574, "tasks": [], "average_score": 0.19222024833691936},
      "Knowledge": {"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.25056132494721467},
      "Mathematics": {"count": 33, "num_samples": 547, "tasks": [], "average_score": 0.15596334442569906}
    }
  }
}
static/eval_results/Default/InternVL2_5_2B/task_results.json
ADDED
The diff for this file is too large to render.
See raw diff
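With two generations of the same family in this commit (InternVL2_2B above and InternVL2_5_2B here), per-category deltas fall out directly from the shared schema. Another illustrative sketch, relying only on the paths and keys present in these files:

```python
import json

# Per-category "app" deltas between two models from this diff
# (InternVL2_2B -> InternVL2_5_2B). The category keys are identical across
# the files, so a plain key-wise difference suffices.
def load_app_stats(model):
    path = f"static/eval_results/Default/{model}/summary_results.json"
    with open(path) as f:
        return json.load(f)["keyword_stats"]["app"]

old, new = load_app_stats("InternVL2_2B"), load_app_stats("InternVL2_5_2B")
for category in old:
    delta = new[category]["average_score"] - old[category]["average_score"]
    print(f"{category:24s} {delta:+.4f}")
```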
static/eval_results/Default/InternVL2_5_78B/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": {"num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.44132952988532753, "micro_mean_score": 0.4397079059379812},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5538024772749066, "micro_mean_score": 0.5776870163370592},
    "overall_score": 0.4558062458859664
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": {"count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.46893853078050696},
      "Text Recognition (OCR)": {"count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.5220829627238773},
      "Language Understanding and Generation": {"count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.4933134095077618},
      "Scene and Event Understanding": {"count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.477971701185214},
      "Mathematical and Logical Reasoning": {"count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.3936387335462224},
      "Commonsense and Social Reasoning": {"count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5610278744213835},
      "Ethical and Safety Reasoning": {"count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6072907268170428},
      "Domain-Specific Knowledge and Skills": {"count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.44533550848682696},
      "Spatial and Temporal Reasoning": {"count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.3548055654857457},
      "Planning and Decision Making": {"count": 37, "num_samples": 577, "tasks": [], "average_score": 0.22852234519925363}
    },
    "input_format": {
      "User Interface Screenshots": {"count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.4910486370158392},
      "Text-Based Images and Documents": {"count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.39410061025954557},
      "Diagrams and Data Visualizations": {"count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.43424133240430957},
      "Videos": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.5300255483670417},
      "Artistic and Creative Content": {"count": 32, "num_samples": 541, "tasks": [], "average_score": 0.4793195260560365},
      "Photographs": {"count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.4622918421665308},
      "3D Models and Aerial Imagery": {"count": 11, "num_samples": 169, "tasks": [], "average_score": 0.3729954065847296}
    },
    "output_format": {
      "contextual_formatted_text": {"count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.4226567593431527},
      "structured_output": {"count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.4149806887502539},
      "exact_text": {"count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.4904285184890861},
      "numerical_data": {"count": 49, "num_samples": 862, "tasks": [], "average_score": 0.4348674018783908},
      "open_ended_output": {"count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.5124942746906233},
      "multiple_choice": {"count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.4717682857925982}
    },
    "input_num": {
      "6-8 images": {"count": 21, "num_samples": 314, "tasks": [], "average_score": 0.20496909081092754},
      "9-image or more": {"count": 41, "num_samples": 623, "tasks": [], "average_score": 0.4184724897299287},
      "1-image": {"count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.4951997132559491},
      "video": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.5300255483670417},
      "4-5 images": {"count": 34, "num_samples": 520, "tasks": [], "average_score": 0.286105084660728},
      "2-3 images": {"count": 51, "num_samples": 802, "tasks": [], "average_score": 0.39635000103107665}
    },
    "app": {
      "Information_Extraction": {"count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.5401547630322637},
      "Planning": {"count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.26403470419652064},
      "Coding": {"count": 31, "num_samples": 474, "tasks": [], "average_score": 0.3933356676003734},
      "Perception": {"count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.5168098196770042},
      "Metrics": {"count": 20, "num_samples": 309, "tasks": [], "average_score": 0.47731479110938463},
      "Science": {"count": 29, "num_samples": 574, "tasks": [], "average_score": 0.4388571290145052},
      "Knowledge": {"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.5034762755043025},
      "Mathematics": {"count": 33, "num_samples": 547, "tasks": [], "average_score": 0.37742798395328586}
    }
  }
}
static/eval_results/Default/InternVL2_5_78B/task_results.json
ADDED
The diff for this file is too large to render.
See raw diff
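A note on the two means that recur in model_summary: macro_mean_score and micro_mean_score stay close in these files because task sizes are fairly uniform. The sketch below shows the standard definitions such names usually denote; this is an assumption, since the scoring code itself is not part of this diff. Macro averages the per-task scores; micro weights each task by its sample count.

```python
# Standard macro vs. micro averaging over per-task scores (assumed semantics).
def macro_micro(task_scores, task_sizes):
    macro = sum(task_scores) / len(task_scores)
    micro = sum(s * n for s, n in zip(task_scores, task_sizes)) / sum(task_sizes)
    return macro, micro

# Toy example: a large, easy task pulls the micro mean above the macro mean.
print(macro_micro([0.2, 0.6], [10, 90]))  # (0.4, 0.56)
```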
static/eval_results/Default/InternVL2_76B/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": {"num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.3562710424410931, "micro_mean_score": 0.35129859801162616},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5192997443033639, "micro_mean_score": 0.5421324161650903},
    "overall_score": 0.3772549347599992
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": {"count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.38193012983650343},
      "Text Recognition (OCR)": {"count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.41315219763443384},
      "Language Understanding and Generation": {"count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.43665980552577693},
      "Scene and Event Understanding": {"count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.4265623936500962},
      "Mathematical and Logical Reasoning": {"count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.2975890791763991},
      "Commonsense and Social Reasoning": {"count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5257990949897898},
      "Ethical and Safety Reasoning": {"count": 15, "num_samples": 245, "tasks": [], "average_score": 0.5779473684210527},
      "Domain-Specific Knowledge and Skills": {"count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.33287081421166276},
      "Spatial and Temporal Reasoning": {"count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.2949505390920417},
      "Planning and Decision Making": {"count": 37, "num_samples": 577, "tasks": [], "average_score": 0.17036496432397477}
    },
    "input_format": {
      "User Interface Screenshots": {"count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.3634339625985008},
      "Text-Based Images and Documents": {"count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.31396468806559114},
      "Diagrams and Data Visualizations": {"count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.3473756113126343},
      "Videos": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.395893002855977},
      "Artistic and Creative Content": {"count": 32, "num_samples": 541, "tasks": [], "average_score": 0.44982107744035305},
      "Photographs": {"count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.42875248733027654},
      "3D Models and Aerial Imagery": {"count": 11, "num_samples": 169, "tasks": [], "average_score": 0.2868239162778749}
    },
    "output_format": {
      "contextual_formatted_text": {"count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.3630499545707523},
      "structured_output": {"count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.3476691827105281},
      "exact_text": {"count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.3943337471922549},
      "numerical_data": {"count": 49, "num_samples": 862, "tasks": [], "average_score": 0.29244088978470345},
      "open_ended_output": {"count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.45822072478616577},
      "multiple_choice": {"count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.3879326330400817}
    },
    "input_num": {
      "6-8 images": {"count": 21, "num_samples": 314, "tasks": [], "average_score": 0.20309901738473166},
      "9-image or more": {"count": 41, "num_samples": 623, "tasks": [], "average_score": 0.34771123515123364},
      "1-image": {"count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.4145693044465943},
      "video": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.395893002855977},
      "4-5 images": {"count": 34, "num_samples": 520, "tasks": [], "average_score": 0.24403942809507134},
      "2-3 images": {"count": 51, "num_samples": 802, "tasks": [], "average_score": 0.3153417935059416}
    },
    "app": {
      "Information_Extraction": {"count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.4306947454508794},
      "Planning": {"count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.2132321995754061},
      "Coding": {"count": 31, "num_samples": 474, "tasks": [], "average_score": 0.2953329718984368},
      "Perception": {"count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.42202934355552685},
      "Metrics": {"count": 20, "num_samples": 309, "tasks": [], "average_score": 0.47409276729986083},
      "Science": {"count": 29, "num_samples": 574, "tasks": [], "average_score": 0.30014798153766264},
      "Knowledge": {"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.4625649385962016},
      "Mathematics": {"count": 33, "num_samples": 547, "tasks": [], "average_score": 0.2868813944130515}
    }
  }
}
static/eval_results/Default/InternVL2_76B/task_results.json
ADDED
The diff for this file is too large to render.
See raw diff
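Note: every model directory added in this commit shares one layout — static/eval_results/Default/<Model>/summary_results.json for the aggregate scores and static/eval_results/Default/<Model>/task_results.json for per-task details. A minimal Python sketch (illustrative only, not part of this commit) for reading the summaries back, assuming that layout:

    import json
    from pathlib import Path

    RESULTS_DIR = Path("static/eval_results/Default")

    def load_summaries(results_dir: Path = RESULTS_DIR) -> dict:
        # Map each model directory name to its parsed summary_results.json.
        summaries = {}
        for summary_file in sorted(results_dir.glob("*/summary_results.json")):
            summaries[summary_file.parent.name] = json.loads(summary_file.read_text())
        return summaries

    for model, data in load_summaries().items():
        print(model, data["model_summary"]["overall_score"])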
static/eval_results/Default/InternVL2_8B/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": {"num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.25956581776451815, "micro_mean_score": 0.2546984460483302},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1165, "macro_mean_score": 0.3978571701460552, "micro_mean_score": 0.4108583690987125},
    "overall_score": 0.2773656948037259
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": {"count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.2817247716997634},
      "Text Recognition (OCR)": {"count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.280559214034858},
      "Language Understanding and Generation": {"count": 154, "num_samples": 2511, "tasks": [], "average_score": 0.32020728060179815},
      "Scene and Event Understanding": {"count": 154, "num_samples": 2469, "tasks": [], "average_score": 0.325593535916075},
      "Mathematical and Logical Reasoning": {"count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.24118253695139918},
      "Commonsense and Social Reasoning": {"count": 51, "num_samples": 855, "tasks": [], "average_score": 0.39684007367798446},
      "Ethical and Safety Reasoning": {"count": 15, "num_samples": 245, "tasks": [], "average_score": 0.4700852130325815},
      "Domain-Specific Knowledge and Skills": {"count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.27052668526005397},
      "Spatial and Temporal Reasoning": {"count": 152, "num_samples": 2439, "tasks": [], "average_score": 0.23189345356483618},
      "Planning and Decision Making": {"count": 37, "num_samples": 577, "tasks": [], "average_score": 0.08260405712900723}
    },
    "input_format": {
      "User Interface Screenshots": {"count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.22800928556370195},
      "Text-Based Images and Documents": {"count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.2013779290163996},
      "Diagrams and Data Visualizations": {"count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.2804429603269583},
      "Videos": {"count": 43, "num_samples": 700, "tasks": [], "average_score": 0.34791358240562653},
      "Artistic and Creative Content": {"count": 32, "num_samples": 541, "tasks": [], "average_score": 0.2942163420306113},
      "Photographs": {"count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.3388056726588417},
      "3D Models and Aerial Imagery": {"count": 11, "num_samples": 169, "tasks": [], "average_score": 0.10933317885944857}
    },
    "output_format": {
      "contextual_formatted_text": {"count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.250804626773504},
      "structured_output": {"count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.2522493284864019},
      "exact_text": {"count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.27414636444623874},
      "numerical_data": {"count": 49, "num_samples": 862, "tasks": [], "average_score": 0.22381302045502052},
      "open_ended_output": {"count": 80, "num_samples": 1456, "tasks": [], "average_score": 0.3537549824897016},
      "multiple_choice": {"count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.30261189962428353}
    },
    "input_num": {
      "6-8 images": {"count": 21, "num_samples": 314, "tasks": [], "average_score": 0.15434618291761149},
      "9-image or more": {"count": 41, "num_samples": 623, "tasks": [], "average_score": 0.19872104324302098},
      "1-image": {"count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.30088711082969344},
      "video": {"count": 43, "num_samples": 700, "tasks": [], "average_score": 0.34791358240562653},
      "4-5 images": {"count": 34, "num_samples": 520, "tasks": [], "average_score": 0.17725087609332119},
      "2-3 images": {"count": 51, "num_samples": 802, "tasks": [], "average_score": 0.2532272454839157}
    },
    "app": {
      "Information_Extraction": {"count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.29129840423784176},
      "Planning": {"count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.12166926715781588},
      "Coding": {"count": 31, "num_samples": 474, "tasks": [], "average_score": 0.24700310231619527},
      "Perception": {"count": 145, "num_samples": 2315, "tasks": [], "average_score": 0.3214666523378005},
      "Metrics": {"count": 20, "num_samples": 309, "tasks": [], "average_score": 0.3995660275981844},
      "Science": {"count": 29, "num_samples": 574, "tasks": [], "average_score": 0.24614711281861912},
      "Knowledge": {"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.3393895915929317},
      "Mathematics": {"count": 33, "num_samples": 547, "tasks": [], "average_score": 0.22078333222564453}
    }
  }
}
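The model_summary numbers in these files are internally consistent: overall_score equals the macro mean over all 505 tasks, i.e. the task-count-weighted average of the core (440 tasks) and open (65 tasks) macro_mean_score values. This is inferred from the numbers themselves, not taken from the evaluation code; a quick Python check:

    def overall_score(core_macro: float, open_macro: float,
                      core_tasks: int = 440, open_tasks: int = 65) -> float:
        # Task-count-weighted combination of the two macro means.
        return (core_tasks * core_macro + open_tasks * open_macro) / (core_tasks + open_tasks)

    # InternVL2_8B, using the summary above:
    # overall_score(0.25956581776451815, 0.3978571701460552)
    # -> 0.2773656948037259 (matches the file's overall_score up to float rounding)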
static/eval_results/Default/InternVL2_8B/task_results.json
ADDED
The diff for this file is too large to render.
See raw diff
static/eval_results/Default/Llama_3_2_11B/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": {"num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.15999641916771298, "micro_mean_score": 0.15809331016967038},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.3173342406187366, "micro_mean_score": 0.3487962166809973},
    "overall_score": 0.1802478219287358
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": {"count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.1907604552173455},
      "Text Recognition (OCR)": {"count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.14328677752263275},
      "Language Understanding and Generation": {"count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.19646404502647707},
      "Scene and Event Understanding": {"count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.22399113135844315},
      "Mathematical and Logical Reasoning": {"count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.13303760019716085},
      "Commonsense and Social Reasoning": {"count": 51, "num_samples": 855, "tasks": [], "average_score": 0.323153603297999},
      "Ethical and Safety Reasoning": {"count": 15, "num_samples": 245, "tasks": [], "average_score": 0.4260501253132832},
      "Domain-Specific Knowledge and Skills": {"count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.1770852858056774},
      "Spatial and Temporal Reasoning": {"count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.15366454315378308},
      "Planning and Decision Making": {"count": 37, "num_samples": 577, "tasks": [], "average_score": 0.06563884729522687}
    },
    "input_format": {
      "User Interface Screenshots": {"count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.11886347847341794},
      "Text-Based Images and Documents": {"count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.11489351406848371},
      "Diagrams and Data Visualizations": {"count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.1693681214060816},
      "Videos": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.2123769209846321},
      "Artistic and Creative Content": {"count": 32, "num_samples": 541, "tasks": [], "average_score": 0.2520175802062012},
      "Photographs": {"count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.2485354956932213},
      "3D Models and Aerial Imagery": {"count": 11, "num_samples": 169, "tasks": [], "average_score": 0.06418655520777307}
    },
    "output_format": {
      "contextual_formatted_text": {"count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.12417283740525839},
      "structured_output": {"count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.16374180545556977},
      "exact_text": {"count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.1576236804437753},
      "numerical_data": {"count": 49, "num_samples": 862, "tasks": [], "average_score": 0.15014439824913947},
      "open_ended_output": {"count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.3003142292328822},
      "multiple_choice": {"count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.19270157739425633}
    },
    "input_num": {
      "6-8 images": {"count": 21, "num_samples": 314, "tasks": [], "average_score": 0.1463246409674981},
      "9-image or more": {"count": 41, "num_samples": 623, "tasks": [], "average_score": 0.0732004839476103},
      "1-image": {"count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.1960107191983825},
      "video": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.2123769209846321},
      "4-5 images": {"count": 34, "num_samples": 520, "tasks": [], "average_score": 0.1351857051327849},
      "2-3 images": {"count": 51, "num_samples": 802, "tasks": [], "average_score": 0.18586695387250338}
    },
    "app": {
      "Information_Extraction": {"count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.17288724679416761},
      "Planning": {"count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.08100042975820579},
      "Coding": {"count": 31, "num_samples": 474, "tasks": [], "average_score": 0.0575426944971537},
      "Perception": {"count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.19899465185565898},
      "Metrics": {"count": 20, "num_samples": 309, "tasks": [], "average_score": 0.254316961351997},
      "Science": {"count": 29, "num_samples": 574, "tasks": [], "average_score": 0.162801811963855},
      "Knowledge": {"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.28055776664538923},
      "Mathematics": {"count": 33, "num_samples": 547, "tasks": [], "average_score": 0.13937853323074623}
    }
  }
}
static/eval_results/Default/Llama_3_2_11B/task_results.json
ADDED
The diff for this file is too large to render.
See raw diff
static/eval_results/Default/Mammoth_VL/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": {"num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.264052880412689, "micro_mean_score": 0.2626894374387823},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.37992668750165337, "micro_mean_score": 0.40120378331900275},
    "overall_score": 0.27896733083008046
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": {"count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.30194776127683565},
      "Text Recognition (OCR)": {"count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.2365295791606494},
      "Language Understanding and Generation": {"count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.2993927028494267},
      "Scene and Event Understanding": {"count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.3366347826116991},
      "Mathematical and Logical Reasoning": {"count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.2408454736444444},
      "Commonsense and Social Reasoning": {"count": 51, "num_samples": 855, "tasks": [], "average_score": 0.37895522991264047},
      "Ethical and Safety Reasoning": {"count": 15, "num_samples": 245, "tasks": [], "average_score": 0.48003508771929826},
      "Domain-Specific Knowledge and Skills": {"count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.27232427744946475},
      "Spatial and Temporal Reasoning": {"count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.24522937191710698},
      "Planning and Decision Making": {"count": 37, "num_samples": 577, "tasks": [], "average_score": 0.11457024299726488}
    },
    "input_format": {
      "User Interface Screenshots": {"count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.18941525254390731},
      "Text-Based Images and Documents": {"count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.1718334741390191},
      "Diagrams and Data Visualizations": {"count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.28108187023954245},
      "Videos": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.3391119999611432},
      "Artistic and Creative Content": {"count": 32, "num_samples": 541, "tasks": [], "average_score": 0.36434285930327387},
      "Photographs": {"count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.36915384448504296},
      "3D Models and Aerial Imagery": {"count": 11, "num_samples": 169, "tasks": [], "average_score": 0.15940750469262005}
    },
    "output_format": {
      "contextual_formatted_text": {"count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.2456942956200745},
      "structured_output": {"count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.21586513216389874},
      "exact_text": {"count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.29359048024032264},
      "numerical_data": {"count": 49, "num_samples": 862, "tasks": [], "average_score": 0.2646677074112521},
      "open_ended_output": {"count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.34733130661096645},
      "multiple_choice": {"count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.3286125236284589}
    },
    "input_num": {
      "6-8 images": {"count": 21, "num_samples": 314, "tasks": [], "average_score": 0.16358654572940287},
      "9-image or more": {"count": 41, "num_samples": 623, "tasks": [], "average_score": 0.25463059203015115},
      "1-image": {"count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.2919119209789575},
      "video": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.3391119999611432},
      "4-5 images": {"count": 34, "num_samples": 520, "tasks": [], "average_score": 0.20016011839130254},
      "2-3 images": {"count": 51, "num_samples": 802, "tasks": [], "average_score": 0.2679179451692527}
    },
    "app": {
      "Information_Extraction": {"count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.23600902063965679},
      "Planning": {"count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.15326915093278803},
      "Coding": {"count": 31, "num_samples": 474, "tasks": [], "average_score": 0.20668466311255687},
      "Perception": {"count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.33348955971237954},
      "Metrics": {"count": 20, "num_samples": 309, "tasks": [], "average_score": 0.3759170425350556},
      "Science": {"count": 29, "num_samples": 574, "tasks": [], "average_score": 0.23894961766260706},
      "Knowledge": {"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.351703435685048},
      "Mathematics": {"count": 33, "num_samples": 547, "tasks": [], "average_score": 0.26074348700688493}
    }
  }
}
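Because the count and num_samples fields of every keyword_stats category are identical across the model summaries in this commit (the task partition is fixed by the benchmark), the per-category average_score values are directly comparable between models. A sketch (illustrative, not part of this commit) that collects one keyword group into a model-by-category table:

    import json
    from pathlib import Path

    def category_table(results_dir: str, group: str = "app") -> dict:
        # model name -> {category: average_score} for one keyword_stats group
        table = {}
        for path in sorted(Path(results_dir).glob("*/summary_results.json")):
            stats = json.loads(path.read_text())["keyword_stats"][group]
            table[path.parent.name] = {cat: v["average_score"] for cat, v in stats.items()}
        return table

    # e.g. category_table("static/eval_results/Default")["Mammoth_VL"]["Planning"]
    # -> 0.15326915093278803 (the value in the Mammoth_VL summary above)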
static/eval_results/Default/Mammoth_VL/task_results.json
ADDED
The diff for this file is too large to render.
See raw diff
static/eval_results/Default/MiniCPM_v2.6/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": {"num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.22955895202146906, "micro_mean_score": 0.22560399396899078},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.41728623355613875, "micro_mean_score": 0.43452278589853827},
    "overall_score": 0.2537218694467236
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": {"count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.2604967101191775},
      "Text Recognition (OCR)": {"count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.2500331562865158},
      "Language Understanding and Generation": {"count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.3003169369011028},
      "Scene and Event Understanding": {"count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.31808748114668184},
      "Mathematical and Logical Reasoning": {"count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.18281637763548025},
      "Commonsense and Social Reasoning": {"count": 51, "num_samples": 855, "tasks": [], "average_score": 0.40732197204308807},
      "Ethical and Safety Reasoning": {"count": 15, "num_samples": 245, "tasks": [], "average_score": 0.48798245614035085},
      "Domain-Specific Knowledge and Skills": {"count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.23723675736151562},
      "Spatial and Temporal Reasoning": {"count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.1968926733821904},
      "Planning and Decision Making": {"count": 37, "num_samples": 577, "tasks": [], "average_score": 0.08735883237069725}
    },
    "input_format": {
      "User Interface Screenshots": {"count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.21195711598986072},
      "Text-Based Images and Documents": {"count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.18639148159043903},
      "Diagrams and Data Visualizations": {"count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.21578309681746147},
      "Videos": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.3527537836840162},
      "Artistic and Creative Content": {"count": 32, "num_samples": 541, "tasks": [], "average_score": 0.3096882575625531},
      "Photographs": {"count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.3176880312524649},
      "3D Models and Aerial Imagery": {"count": 11, "num_samples": 169, "tasks": [], "average_score": 0.0755920550038197}
    },
    "output_format": {
      "contextual_formatted_text": {"count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.23506388020592064},
      "structured_output": {"count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.1781127776443048},
      "exact_text": {"count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.2551275278138797},
      "numerical_data": {"count": 49, "num_samples": 862, "tasks": [], "average_score": 0.20833171754655547},
      "open_ended_output": {"count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.36473950920880716},
      "multiple_choice": {"count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.293386806641223}
    },
    "input_num": {
      "6-8 images": {"count": 21, "num_samples": 314, "tasks": [], "average_score": 0.13955971277399848},
      "9-image or more": {"count": 41, "num_samples": 623, "tasks": [], "average_score": 0.23596215721092323},
      "1-image": {"count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.26319603880798287},
      "video": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.3527537836840162},
      "4-5 images": {"count": 34, "num_samples": 520, "tasks": [], "average_score": 0.17888270664238365},
      "2-3 images": {"count": 51, "num_samples": 802, "tasks": [], "average_score": 0.22288558250834017}
    },
    "app": {
      "Information_Extraction": {"count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.2666989364424082},
      "Planning": {"count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.11693267119342445},
      "Coding": {"count": 31, "num_samples": 474, "tasks": [], "average_score": 0.15342045420318667},
      "Perception": {"count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.29243044121840894},
      "Metrics": {"count": 20, "num_samples": 309, "tasks": [], "average_score": 0.3777897246686755},
      "Science": {"count": 29, "num_samples": 574, "tasks": [], "average_score": 0.25714862989687987},
      "Knowledge": {"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.33187729423141027},
      "Mathematics": {"count": 33, "num_samples": 547, "tasks": [], "average_score": 0.16493399805627715}
    }
  }
}
static/eval_results/Default/MiniCPM_v2.6/task_results.json
ADDED
The diff for this file is too large to render.
See raw diff
static/eval_results/Default/NVLM/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": {"num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.21589726765847422, "micro_mean_score": 0.21406043849932396},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.3478114310231307, "micro_mean_score": 0.3947549441100602},
    "overall_score": 0.23287631838857856
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": {"count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.21591473223174515},
      "Text Recognition (OCR)": {"count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.27426258729618225},
      "Language Understanding and Generation": {"count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.284874072963892},
      "Scene and Event Understanding": {"count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.2134087963800149},
      "Mathematical and Logical Reasoning": {"count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.2525993645909815},
      "Commonsense and Social Reasoning": {"count": 51, "num_samples": 855, "tasks": [], "average_score": 0.4029543142569604},
      "Ethical and Safety Reasoning": {"count": 15, "num_samples": 245, "tasks": [], "average_score": 0.4317142857142857},
      "Domain-Specific Knowledge and Skills": {"count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.2442484196551863},
      "Spatial and Temporal Reasoning": {"count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.1424318574406695},
      "Planning and Decision Making": {"count": 37, "num_samples": 577, "tasks": [], "average_score": 0.046798309600525674}
    },
    "input_format": {
      "User Interface Screenshots": {"count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.19655048708297065},
      "Text-Based Images and Documents": {"count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.18621338396242557},
      "Diagrams and Data Visualizations": {"count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.2922667531642391},
      "Videos": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.0},
      "Artistic and Creative Content": {"count": 32, "num_samples": 541, "tasks": [], "average_score": 0.3447361496776569},
      "Photographs": {"count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.29674507895195534},
      "3D Models and Aerial Imagery": {"count": 11, "num_samples": 169, "tasks": [], "average_score": 0.09716389574493003}
    },
    "output_format": {
      "contextual_formatted_text": {"count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.19684666506287793},
      "structured_output": {"count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.2199792859352912},
      "exact_text": {"count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.25164831125437204},
      "numerical_data": {"count": 49, "num_samples": 862, "tasks": [], "average_score": 0.2396831363622878},
      "open_ended_output": {"count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.3215948035793096},
      "multiple_choice": {"count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.1853526865291571}
    },
    "input_num": {
      "6-8 images": {"count": 21, "num_samples": 314, "tasks": [], "average_score": 0.0},
      "9-image or more": {"count": 41, "num_samples": 623, "tasks": [], "average_score": 0.0},
      "1-image": {"count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.3352056263801705},
      "video": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.0},
      "4-5 images": {"count": 34, "num_samples": 520, "tasks": [], "average_score": 0.038244047619047615},
      "2-3 images": {"count": 51, "num_samples": 802, "tasks": [], "average_score": 0.2100484481849172}
    },
    "app": {
      "Information_Extraction": {"count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.15704252277801936},
      "Planning": {"count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.06688589450465973},
      "Coding": {"count": 31, "num_samples": 474, "tasks": [], "average_score": 0.2292747206409446},
      "Perception": {"count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.2689383226748064},
      "Metrics": {"count": 20, "num_samples": 309, "tasks": [], "average_score": 0.18857142857142856},
      "Science": {"count": 29, "num_samples": 574, "tasks": [], "average_score": 0.23682040748983965},
      "Knowledge": {"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.3656649917873737},
      "Mathematics": {"count": 33, "num_samples": 547, "tasks": [], "average_score": 0.26866914106442213}
    }
  }
}
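Note that NVLM's summary above reports average_score 0.0 for "Videos" and for the "6-8 images", "9-image or more", and "video" buckets. A flat zero more plausibly marks input configurations the model was not run on (or could not accept) than measured performance; that reading is an assumption — the files themselves do not distinguish the two cases. A consumer may therefore want to flag exact zeros before averaging across categories:

    def flag_zero_categories(keyword_stats: dict) -> list:
        # (group, category) pairs whose average_score is exactly 0.0 --
        # possibly "unsupported input" rather than a true score (assumption).
        return [(group, cat)
                for group, cats in keyword_stats.items()
                for cat, entries in cats.items()
                if entries["average_score"] == 0.0]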
static/eval_results/Default/NVLM/task_results.json
ADDED
The diff for this file is too large to render.
See raw diff
static/eval_results/Default/Phi-3.5-vision/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": {"num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.22995297916629392, "micro_mean_score": 0.22708502951025372},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.3947914647737769, "micro_mean_score": 0.42459157351676696},
    "overall_score": 0.2511698139474551
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": {"count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.2550326045763433},
      "Text Recognition (OCR)": {"count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.24395249720074527},
      "Language Understanding and Generation": {"count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.2858236369733704},
      "Scene and Event Understanding": {"count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.29876274710122536},
      "Mathematical and Logical Reasoning": {"count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.21972896566746963},
      "Commonsense and Social Reasoning": {"count": 51, "num_samples": 855, "tasks": [], "average_score": 0.37513466171380355},
      "Ethical and Safety Reasoning": {"count": 15, "num_samples": 245, "tasks": [], "average_score": 0.4713934837092732},
      "Domain-Specific Knowledge and Skills": {"count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.25475240046465697},
      "Spatial and Temporal Reasoning": {"count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.20386233377001492},
      "Planning and Decision Making": {"count": 37, "num_samples": 577, "tasks": [], "average_score": 0.06657701969095552}
    },
    "input_format": {
      "User Interface Screenshots": {"count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.16556787388989183},
      "Text-Based Images and Documents": {"count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.17989790940001513},
      "Diagrams and Data Visualizations": {"count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.2671646581690049},
      "Videos": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.24920333780186898},
      "Artistic and Creative Content": {"count": 32, "num_samples": 541, "tasks": [], "average_score": 0.3057560384411286},
      "Photographs": {"count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.3341992361416253},
      "3D Models and Aerial Imagery": {"count": 11, "num_samples": 169, "tasks": [], "average_score": 0.12884156381685322}
    },
    "output_format": {
      "contextual_formatted_text": {"count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.20494682188374266},
      "structured_output": {"count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.21180084406324556},
      "exact_text": {"count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.2609992615064841},
      "numerical_data": {"count": 49, "num_samples": 862, "tasks": [], "average_score": 0.2149689274645855},
      "open_ended_output": {"count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.365192668303297},
      "multiple_choice": {"count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.2593652357274648}
    },
    "input_num": {
      "6-8 images": {"count": 21, "num_samples": 314, "tasks": [], "average_score": 0.10107709750566891},
      "9-image or more": {"count": 41, "num_samples": 623, "tasks": [], "average_score": 0.11861055655587921},
      "1-image": {"count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.2824151476986241},
      "video": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.24920333780186898},
      "4-5 images": {"count": 34, "num_samples": 520, "tasks": [], "average_score": 0.1980440594073205},
      "2-3 images": {"count": 51, "num_samples": 802, "tasks": [], "average_score": 0.2636292373854696}
    },
    "app": {
      "Information_Extraction": {"count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.20747122167273002},
      "Planning": {"count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.08602953103518936},
      "Coding": {"count": 31, "num_samples": 474, "tasks": [], "average_score": 0.20136893467064246},
      "Perception": {"count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.30979039348232706},
      "Metrics": {"count": 20, "num_samples": 309, "tasks": [], "average_score": 0.3495072422622861},
      "Science": {"count": 29, "num_samples": 574, "tasks": [], "average_score": 0.25858403958844717},
      "Knowledge": {"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.3357218088688187},
      "Mathematics": {"count": 33, "num_samples": 547, "tasks": [], "average_score": 0.21140555087788399}
    }
  }
}
static/eval_results/Default/Phi-3.5-vision/task_results.json
ADDED
The diff for this file is too large to render.
See raw diff
static/eval_results/Default/Pixtral_12B/summary_results.json
ADDED
@@ -0,0 +1,251 @@
{
  "model_summary": {
    "core": {"num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.31362045151669854, "micro_mean_score": 0.3100986209078182},
    "open": {"num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.4566234428542061, "micro_mean_score": 0.4870593293207223},
    "overall_score": 0.33202677713439754
  },
  "keyword_stats": {
    "skills": {
      "Object Recognition and Classification": {"count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.34184129499032456},
      "Text Recognition (OCR)": {"count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.37667712211439836},
      "Language Understanding and Generation": {"count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.37896441862738645},
      "Scene and Event Understanding": {"count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.37077191302051077},
      "Mathematical and Logical Reasoning": {"count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.2843861774995234},
      "Commonsense and Social Reasoning": {"count": 51, "num_samples": 855, "tasks": [], "average_score": 0.4098150360139686},
      "Ethical and Safety Reasoning": {"count": 15, "num_samples": 245, "tasks": [], "average_score": 0.533077694235589},
      "Domain-Specific Knowledge and Skills": {"count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.3372902862054838},
      "Spatial and Temporal Reasoning": {"count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.25372282838901716},
      "Planning and Decision Making": {"count": 37, "num_samples": 577, "tasks": [], "average_score": 0.09524894246403817}
    },
    "input_format": {
      "User Interface Screenshots": {"count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.2972619996610934},
      "Text-Based Images and Documents": {"count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.28304049684103855},
      "Diagrams and Data Visualizations": {"count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.33523333364720703},
      "Videos": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.3988260865341648},
      "Artistic and Creative Content": {"count": 32, "num_samples": 541, "tasks": [], "average_score": 0.39117521970978353},
      "Photographs": {"count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.35583482417594536},
      "3D Models and Aerial Imagery": {"count": 11, "num_samples": 169, "tasks": [], "average_score": 0.21897822147396953}
    },
    "output_format": {
      "contextual_formatted_text": {"count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.3436473210057542},
      "structured_output": {"count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.28979044279399635},
      "exact_text": {"count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.33530850344530555},
      "numerical_data": {"count": 49, "num_samples": 862, "tasks": [], "average_score": 0.30160980000905374},
      "open_ended_output": {"count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.4166613092238044},
      "multiple_choice": {"count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.30796171250186904}
    },
    "input_num": {
      "6-8 images": {"count": 21, "num_samples": 314, "tasks": [], "average_score": 0.22871315192743763},
      "9-image or more": {"count": 41, "num_samples": 623, "tasks": [], "average_score": 0.21669652626580332},
      "1-image": {"count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.36087312117067055},
      "video": {"count": 43, "num_samples": 698, "tasks": [], "average_score": 0.3988260865341648},
      "4-5 images": {"count": 34, "num_samples": 520, "tasks": [], "average_score": 0.24616927284658197},
      "2-3 images": {"count": 51, "num_samples": 802, "tasks": [], "average_score": 0.2900329121369093}
    },
    "app": {
      "Information_Extraction": {"count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.42652313209316933},
      "Planning": {"count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.1209559708312353},
      "Coding": {"count": 31, "num_samples": 474, "tasks": [], "average_score": 0.25678368121442124},
      "Perception": {"count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.37605128363484847},
      "Metrics": {"count": 20, "num_samples": 309, "tasks": [], "average_score": 0.4576088857728113},
      "Science": {"count": 29, "num_samples": 574, "tasks": [], "average_score": 0.3464929909487855},
      "Knowledge": {"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.3858431845580602},
      "Mathematics": {"count": 33, "num_samples": 547, "tasks": [], "average_score": 0.2549787156825223}
    }
  }
}
static/eval_results/Default/Pixtral_12B/task_results.json ADDED
The diff for this file is too large to render. See raw diff.
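Each of the new per-model directories follows the same layout: a `summary_results.json` carrying the scores shown above, next to a larger `task_results.json`. As a quick orientation, here is a minimal sketch of how one of these summary files can be inspected; the path and key names are taken from this diff, and nothing beyond the structure visible above is assumed:

```python
import json

# Directory layout introduced by this change: one folder per model.
path = "static/eval_results/Default/Pixtral_12B/summary_results.json"

with open(path) as f:
    summary = json.load(f)

# Headline number for the model.
print("overall:", summary["model_summary"]["overall_score"])

# keyword_stats buckets tasks along five axes (skills, input_format,
# output_format, input_num, app); each bucket records a task count,
# a sample count, and an average score.
for axis, buckets in summary["keyword_stats"].items():
    for name, stats in buckets.items():
        print(f"{axis}/{name}: {stats['average_score']:.3f} "
              f"({stats['count']} tasks, {stats['num_samples']} samples)")
```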
static/eval_results/Default/Qwen2_VL_2B/summary_results.json ADDED
@@ -0,0 +1,251 @@
+{
+  "model_summary": {
+    "core": {
+      "num_eval_tasks": 440,
+      "num_eval_samples": 6539,
+      "macro_mean_score": 0.20877163406364055,
+      "micro_mean_score": 0.20561526268932287
+    },
+    "open": {
+      "num_eval_tasks": 65,
+      "num_eval_samples": 1163,
+      "macro_mean_score": 0.3154302566225611,
+      "micro_mean_score": 0.33856405846947557
+    },
+    "overall_score": 0.22249997162072932
+  },
+  "keyword_stats": {
+    "skills": {
+      "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.22236161923122505 },
+      "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.23701014663017753 },
+      "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.25669221785292334 },
+      "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.26526414975225454 },
+      "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.17623548305581763 },
+      "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.31250702198481506 },
+      "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.4140676691729323 },
+      "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.20802820480076603 },
+      "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.17320633068307653 },
+      "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.06209506566980099 }
+    },
+    "input_format": {
+      "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.190837839372028 },
+      "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.16287824421269087 },
+      "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.19640906475019812 },
+      "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.2520741776922928 },
+      "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.24883076673424442 },
+      "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.2877316297453947 },
+      "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.13398525561847363 }
+    },
+    "output_format": {
+      "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.1624451002757208 },
+      "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.20960092816529263 },
+      "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.19986806708136184 },
+      "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.2201024015934558 },
+      "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.30248748033122763 },
+      "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.256631742010999 }
+    },
+    "input_num": {
+      "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.07681405895691609 },
+      "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.10526691703628158 },
+      "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.25018977062352593 },
+      "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.2520741776922928 },
+      "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.17435940889565366 },
+      "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.21286783416184518 }
+    },
+    "app": {
+      "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.2521972668785968 },
+      "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.06967138760493456 },
+      "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.16996250112948405 },
+      "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.27603334911345223 },
+      "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.31002436092347696 },
+      "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.21061929716065056 },
+      "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.2656728023444808 },
+      "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.16356158787929762 }
+    }
+  }
+}
static/eval_results/Default/Qwen2_VL_2B/task_results.json ADDED
The diff for this file is too large to render. See raw diff.
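Across the summary files in this change, `overall_score` is consistent with a task-count-weighted blend of the core and open macro scores (440 core tasks, 65 open tasks). A small check using the Qwen2_VL_2B numbers above; note that the weighting rule is inferred from the data, not stated anywhere in this diff:

```python
# Inferred relation: overall = (n_core * core + n_open * open) / (n_core + n_open)
n_core, n_open = 440, 65
core_macro = 0.20877163406364055  # Qwen2_VL_2B core macro_mean_score
open_macro = 0.3154302566225611   # Qwen2_VL_2B open macro_mean_score

overall = (n_core * core_macro + n_open * open_macro) / (n_core + n_open)
print(overall)  # 0.2224999716..., matching the file's overall_score
```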
static/eval_results/Default/Qwen2_VL_72B/summary_results.json ADDED
@@ -0,0 +1,251 @@
+{
+  "model_summary": {
+    "core": {
+      "num_eval_tasks": 440,
+      "num_eval_samples": 6539,
+      "macro_mean_score": 0.4542376574527161,
+      "micro_mean_score": 0.4501201906164793
+    },
+    "open": {
+      "num_eval_tasks": 65,
+      "num_eval_samples": 1163,
+      "macro_mean_score": 0.5639771804231668,
+      "micro_mean_score": 0.5835339638865004
+    },
+    "overall_score": 0.4683625465479226
+  },
+  "keyword_stats": {
+    "skills": {
+      "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.48669152179713876 },
+      "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.5291932917937967 },
+      "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.53654503409075 },
+      "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.4931554892760308 },
+      "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.3908023665629473 },
+      "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5668846347262286 },
+      "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6121127819548872 },
+      "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.4493794346300551 },
+      "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.33622171962424363 },
+      "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.21642754068858566 }
+    },
+    "input_format": {
+      "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.5263730250833892 },
+      "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.42759570727857965 },
+      "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.4228561177227288 },
+      "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4780253686541936 },
+      "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5070774860945021 },
+      "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.4807292191169126 },
+      "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.38847545874852984 }
+    },
+    "output_format": {
+      "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.4359156358804688 },
+      "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.43781407268698613 },
+      "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.49080138099759946 },
+      "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.42481004254128113 },
+      "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.5132810622684265 },
+      "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.5062248706593999 }
+    },
+    "input_num": {
+      "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.3063303099017385 },
+      "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.523959576707116 },
+      "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.4879791577413812 },
+      "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4780253686541936 },
+      "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.34846161336322395 },
+      "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.44101149919132854 }
+    },
+    "app": {
+      "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.5663587858366833 },
+      "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.3067825586087303 },
+      "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.4121566368482877 },
+      "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.5176521211872086 },
+      "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.5030444649397028 },
+      "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.45616267568458396 },
+      "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.5047683071464567 },
+      "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.3553838743540432 }
+    }
+  }
+}
static/eval_results/Default/Qwen2_VL_72B/task_results.json ADDED
The diff for this file is too large to render. See raw diff.
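The summaries report both a `macro_mean_score` and a `micro_mean_score`. The two values are consistent with the usual convention: macro averages per-task scores with equal weight, while micro weights each task by its sample count, so large tasks pull the micro value harder. A toy illustration (numbers invented for the example, not taken from the files):

```python
# Two tasks of very different sizes make the distinction obvious.
tasks = [
    {"num_samples": 10, "score": 0.9},
    {"num_samples": 90, "score": 0.1},
]

macro = sum(t["score"] for t in tasks) / len(tasks)
micro = sum(t["score"] * t["num_samples"] for t in tasks) / sum(
    t["num_samples"] for t in tasks
)
print(macro, micro)  # 0.5 vs 0.18
```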
static/eval_results/Default/Qwen2_VL_7B/summary_results.json ADDED
@@ -0,0 +1,251 @@
+{
+  "model_summary": {
+    "core": {
+      "num_eval_tasks": 440,
+      "num_eval_samples": 6539,
+      "macro_mean_score": 0.3293449599230247,
+      "micro_mean_score": 0.325331493515679
+    },
+    "open": {
+      "num_eval_tasks": 65,
+      "num_eval_samples": 1170,
+      "macro_mean_score": 0.43955105763038577,
+      "micro_mean_score": 0.45508547008546996
+    },
+    "overall_score": 0.34352990319228904
+  },
+  "keyword_stats": {
+    "skills": {
+      "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.3506773570484231 },
+      "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.38363163370919123 },
+      "Language Understanding and Generation": { "count": 154, "num_samples": 2511, "tasks": [], "average_score": 0.3882785389756705 },
+      "Scene and Event Understanding": { "count": 154, "num_samples": 2469, "tasks": [], "average_score": 0.38292659892379843 },
+      "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.2730765188348748 },
+      "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.4625711182912848 },
+      "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.5287318295739348 },
+      "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.32297080808954215 },
+      "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2439, "tasks": [], "average_score": 0.2561357336105554 },
+      "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.12651411144309255 }
+    },
+    "input_format": {
+      "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.35229497847636093 },
+      "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.2881996369284258 },
+      "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.3162917354476226 },
+      "Videos": { "count": 43, "num_samples": 700, "tasks": [], "average_score": 0.3555910609857979 },
+      "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.3513518594470202 },
+      "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.39509504888372243 },
+      "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.19173322639974366 }
+    },
+    "output_format": {
+      "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.3118818521697947 },
+      "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.3323478338046426 },
+      "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.31975345327634014 },
+      "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.3207400992620562 },
+      "open_ended_output": { "count": 80, "num_samples": 1456, "tasks": [], "average_score": 0.39680785337230745 },
+      "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.38069986029874947 }
+    },
+    "input_num": {
+      "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.21448412698412703 },
+      "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.34991843422677277 },
+      "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.36487656334089386 },
+      "video": { "count": 43, "num_samples": 700, "tasks": [], "average_score": 0.3555910609857979 },
+      "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.23950364354876252 },
+      "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.31886513111201115 }
+    },
+    "app": {
+      "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.3972495309304478 },
+      "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.18098305857595157 },
+      "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.30887234822244314 },
+      "Perception": { "count": 145, "num_samples": 2315, "tasks": [], "average_score": 0.39256038521661607 },
+      "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.44924313486983725 },
+      "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.2880278656037017 },
+      "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.4015531477048036 },
+      "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.24179792538224956 }
+    }
+  }
+}
static/eval_results/Default/Qwen2_VL_7B/task_results.json ADDED
The diff for this file is too large to render. See raw diff.
static/eval_results/Default/all_model_keywords_stats.json DELETED
The diff for this file is too large to render. See raw diff.
static/eval_results/Default/all_summary.json DELETED
@@ -1,525 +0,0 @@
-{
-  "GPT_4o": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.5203440930873326, "micro_mean_score": 0.514302640282204 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.5265030595065238, "micro_mean_score": 0.5236338521693411 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.6478225794744895, "micro_mean_score": 0.665391229578676 },
-    "overall_score": 0.5421184432647768
-  },
-  "Gemini_1.5_pro_002": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.4699992918320008, "micro_mean_score": 0.4651116133689296 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.4822473962867704, "micro_mean_score": 0.4764805563057179 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5858190649927173, "micro_mean_score": 0.6104901117798793 },
-    "overall_score": 0.4955784031499121
-  },
-  "Gemini_1.5_flash_002": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.41898948981774853, "micro_mean_score": 0.4127376993779598 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.4189319021967416, "micro_mean_score": 0.41567515414375245 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5691365176285039, "micro_mean_score": 0.5987532244196045 },
-    "overall_score": 0.43831534488249924
-  },
-  "Claude_3.5": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.48800427486796155, "micro_mean_score": 0.4814327812005499 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.5040975742801586, "micro_mean_score": 0.5002259116666758 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.6373907158949892, "micro_mean_score": 0.6569647463456579 },
-    "overall_score": 0.5212541172602853
-  },
-  "Claude_3.5_new": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.4919657684484185, "micro_mean_score": 0.4874520567007144 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.5259191914020757, "micro_mean_score": 0.5230785894131227 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.6563419761104125, "micro_mean_score": 0.6724419604471196 },
-    "overall_score": 0.5427062825031487
-  },
-  "GPT_4o_mini": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.39854757130003565, "micro_mean_score": 0.3936551517403452 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.40767494558789397, "micro_mean_score": 0.40431644154143376 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.586537827213665, "micro_mean_score": 0.6133276010318144 },
-    "overall_score": 0.43069690064863675
-  },
-  "Qwen2_VL_72B": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.46406654108789214, "micro_mean_score": 0.4584702152011697 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.4542376574527161, "micro_mean_score": 0.4501201906164793 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5639771804231668, "micro_mean_score": 0.5835339638865004 },
-    "overall_score": 0.4769263263488681
-  },
-  "Qwen2_VL_7B": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.3480020832611913, "micro_mean_score": 0.3441858958345098 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.3293449599230247, "micro_mean_score": 0.325331493515679 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1170, "macro_mean_score": 0.43955105763038577, "micro_mean_score": 0.45508547008546996 },
-    "overall_score": 0.3597856146156421
-  },
-  "llava_onevision_72B": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.3199332158220174, "micro_mean_score": 0.31770770553892647 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.2974368415462532, "micro_mean_score": 0.2956217833156672 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.4599484231632498, "micro_mean_score": 0.4850386930352536 },
-    "overall_score": 0.33795497518277007
-  },
-  "llava_onevision_7B": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.22409531510496777, "micro_mean_score": 0.22238854298563537 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.21362697219149712, "micro_mean_score": 0.21073910058505504 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.33979975321921935, "micro_mean_score": 0.36474634565778147 },
-    "overall_score": 0.23898796555531696
-  },
-  "InternVL2_76B": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.3502244283768534, "micro_mean_score": 0.3456783051732046 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.3562710424410931, "micro_mean_score": 0.35129859801162616 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5192997443033639, "micro_mean_score": 0.5421324161650903 },
-    "overall_score": 0.3772549347599992
-  },
-  "InternVL2_8B": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.25956581776451815, "micro_mean_score": 0.2546984460483302 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.24090301358258295, "micro_mean_score": 0.23819084111520938 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1165, "macro_mean_score": 0.3978571701460552, "micro_mean_score": 0.4108583690987125 },
-    "overall_score": 0.2773656948037259
-  },
-  "MiniCPM_v2.6": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.2287645706203155, "micro_mean_score": 0.2249087742955901 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.22955895202146906, "micro_mean_score": 0.22560399396899078 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.41728623355613875, "micro_mean_score": 0.43452278589853827 },
-    "overall_score": 0.2537218694467236
-  },
-  "Phi-3.5-vision": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.23271251159409778, "micro_mean_score": 0.2296262323791101 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.22995297916629392, "micro_mean_score": 0.22708502951025372 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.3947914647737769, "micro_mean_score": 0.42459157351676696 },
-    "overall_score": 0.25357415903306635
-  },
-  "Pixtral_12B": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.31905695620134694, "micro_mean_score": 0.31556607913724777 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.31362045151669854, "micro_mean_score": 0.3100986209078182 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.4566234428542061, "micro_mean_score": 0.4870593293207223 },
-    "overall_score": 0.33676353369131895
-  },
-  "Llama_3_2_11B": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.10044261716549671, "micro_mean_score": 0.09980638766828835 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.15999641916771298, "micro_mean_score": 0.15809331016967038 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.3173342406187366, "micro_mean_score": 0.3487962166809973 },
-    "overall_score": 0.1802478219287358
-  },
-  "Idefics3": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.11118980301103833, "micro_mean_score": 0.11201785633274061 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.08956972487602757, "micro_mean_score": 0.08982225274252693 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.3210866162255635, "micro_mean_score": 0.35649183147033553 },
-    "overall_score": 0.138206224513898
-  },
-  "Aria": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.30485930718699694, "micro_mean_score": 0.3016713629035311 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.289073788209904, "micro_mean_score": 0.2859007507765791 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5103725263180767, "micro_mean_score": 0.5349957007738607 },
-    "overall_score": 0.3313115037088191
-  },
-  "NVLM": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.2420528895703979, "micro_mean_score": 0.23838419989257642 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.21589726765847422, "micro_mean_score": 0.21406043849932396 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.3478114310231307, "micro_mean_score": 0.3947549441100602 },
-    "overall_score": 0.25566537510391796
-  },
-  "InternVL2_2B": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.09089701489596874, "micro_mean_score": 0.09036328295381871 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.13141974398938763, "micro_mean_score": 0.13063500716262516 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.23864417043743646, "micro_mean_score": 0.24901117798796224 },
-    "overall_score": 0.14522090778963154
-  },
-  "Qwen2_VL_2B": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.16448220309703876, "micro_mean_score": 0.1610710186451323 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.20877163406364055, "micro_mean_score": 0.20561526268932287 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.3154302566225611, "micro_mean_score": 0.33856405846947557 },
-    "overall_score": 0.22249997162072932
-  },
-  "Aquila_VL_2B": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.16317824309838627, "micro_mean_score": 0.16198837245148487 },
-    "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.159970161379836, "micro_mean_score": 0.15844711671722148 },
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.24567572098570653, "micro_mean_score": 0.2704213241616509 },
-    "overall_score": 0.17379673035120966
-  },
-  "Mammoth_VL": {
-    "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.264052880412689, "micro_mean_score": 0.2626894374387823 },
-    "core_cot": null,
-    "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.37992668750165337, "micro_mean_score": 0.40120378331900275 },
-    "overall_score": 0.27896733083008046
-  }
-}
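The deleted aggregate is not lost information: for the models visible in this diff, each entry's `core_cot` and `open` blocks reappear as the `core` and `open` blocks of the new per-model `summary_results.json` files, which is presumably what made the monolithic file redundant. A hedged sketch of how an `all_summary`-style dictionary could be rebuilt from the new layout (glob pattern and key names follow the files shown above):

```python
import glob
import json
import os

all_summary = {}
for path in sorted(glob.glob("static/eval_results/Default/*/summary_results.json")):
    model = os.path.basename(os.path.dirname(path))
    with open(path) as f:
        model_summary = json.load(f)["model_summary"]
    # The per-model files keep a single "core" block (matching the old
    # core_cot entries for the models shown in this diff) instead of
    # separate CoT / non-CoT results.
    all_summary[model] = {
        "core": model_summary["core"],
        "open": model_summary["open"],
        "overall_score": model_summary["overall_score"],
    }

print(json.dumps(all_summary, indent=2))
```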