cccjc committed
Commit 2a2ba62 · Parent: b4acc8e

update results & separate results organization

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. app.py +2 -1
  2. constants.py +1 -1
  3. static/eval_results/Default/Aquila_VL_2B/summary_results.json +251 -0
  4. static/eval_results/Default/Aquila_VL_2B/task_results.json +0 -0
  5. static/eval_results/Default/Aria/summary_results.json +251 -0
  6. static/eval_results/Default/Aria/task_results.json +0 -0
  7. static/eval_results/Default/Claude_3.5/summary_results.json +251 -0
  8. static/eval_results/Default/Claude_3.5/task_results.json +0 -0
  9. static/eval_results/Default/Claude_3.5_new/summary_results.json +251 -0
  10. static/eval_results/Default/Claude_3.5_new/task_results.json +0 -0
  11. static/eval_results/Default/GPT_4o/summary_results.json +251 -0
  12. static/eval_results/Default/GPT_4o/task_results.json +0 -0
  13. static/eval_results/Default/GPT_4o_mini/summary_results.json +251 -0
  14. static/eval_results/Default/GPT_4o_mini/task_results.json +0 -0
  15. static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json +251 -0
  16. static/eval_results/Default/Gemini_1.5_flash_002/task_results.json +0 -0
  17. static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json +251 -0
  18. static/eval_results/Default/Gemini_1.5_pro_002/task_results.json +0 -0
  19. static/eval_results/Default/Idefics3/summary_results.json +251 -0
  20. static/eval_results/Default/Idefics3/task_results.json +0 -0
  21. static/eval_results/Default/InternVL2_2B/summary_results.json +251 -0
  22. static/eval_results/Default/InternVL2_2B/task_results.json +0 -0
  23. static/eval_results/Default/InternVL2_5_2B/summary_results.json +251 -0
  24. static/eval_results/Default/InternVL2_5_2B/task_results.json +0 -0
  25. static/eval_results/Default/InternVL2_5_78B/summary_results.json +251 -0
  26. static/eval_results/Default/InternVL2_5_78B/task_results.json +0 -0
  27. static/eval_results/Default/InternVL2_76B/summary_results.json +251 -0
  28. static/eval_results/Default/InternVL2_76B/task_results.json +0 -0
  29. static/eval_results/Default/InternVL2_8B/summary_results.json +251 -0
  30. static/eval_results/Default/InternVL2_8B/task_results.json +0 -0
  31. static/eval_results/Default/Llama_3_2_11B/summary_results.json +251 -0
  32. static/eval_results/Default/Llama_3_2_11B/task_results.json +0 -0
  33. static/eval_results/Default/Mammoth_VL/summary_results.json +251 -0
  34. static/eval_results/Default/Mammoth_VL/task_results.json +0 -0
  35. static/eval_results/Default/MiniCPM_v2.6/summary_results.json +251 -0
  36. static/eval_results/Default/MiniCPM_v2.6/task_results.json +0 -0
  37. static/eval_results/Default/NVLM/summary_results.json +251 -0
  38. static/eval_results/Default/NVLM/task_results.json +0 -0
  39. static/eval_results/Default/Phi-3.5-vision/summary_results.json +251 -0
  40. static/eval_results/Default/Phi-3.5-vision/task_results.json +0 -0
  41. static/eval_results/Default/Pixtral_12B/summary_results.json +251 -0
  42. static/eval_results/Default/Pixtral_12B/task_results.json +0 -0
  43. static/eval_results/Default/Qwen2_VL_2B/summary_results.json +251 -0
  44. static/eval_results/Default/Qwen2_VL_2B/task_results.json +0 -0
  45. static/eval_results/Default/Qwen2_VL_72B/summary_results.json +251 -0
  46. static/eval_results/Default/Qwen2_VL_72B/task_results.json +0 -0
  47. static/eval_results/Default/Qwen2_VL_7B/summary_results.json +251 -0
  48. static/eval_results/Default/Qwen2_VL_7B/task_results.json +0 -0
  49. static/eval_results/Default/all_model_keywords_stats.json +0 -0
  50. static/eval_results/Default/all_summary.json +0 -525
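
The commit moves the results into one folder per model under `static/eval_results/Default/`, each holding a `summary_results.json` and a `task_results.json`, while the monolithic `all_summary.json` is emptied (+0 -525 above). A minimal sketch, not part of the commit, of how the separated layout can be traversed:

```python
import json
from pathlib import Path

results_root = Path("static/eval_results/Default")

# Every model directory holds summary_results.json and task_results.json;
# the remaining top-level entries (e.g. all_model_keywords_stats.json) are plain files.
for model_dir in sorted(p for p in results_root.iterdir() if p.is_dir()):
    with (model_dir / "summary_results.json").open() as f:
        summary = json.load(f)
    print(f"{model_dir.name}: overall = {summary['model_summary']['overall_score']:.4f}")
```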
app.py CHANGED
@@ -55,7 +55,8 @@ with gr.Blocks() as block:
     )
 
     # Define different captions for each table
-    default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> $\\text{Overall} \\ = \\ \\frac{\\max(\\text{Core w/o CoT}, \\ \\text{Core w/ CoT}) \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$"
+    default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> Different from the results in our paper, we only use the Core results with CoT prompting here for clarity and compatibility with the released data. <br> $\\text{Overall} \\ = \\ \\frac{\\text{Core} \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$"
+
     single_image_caption = "**Table 2: MEGA-Bench Single-image setting results.** The number in the parentheses is the number of tasks in each keyword. <br> This subset contains 273 single-image tasks from the Core set and 42 single-image tasks from the Open-ended set. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. <br> Compared to the default table, some models with only single-image support are added."
 
     caption_component = gr.Markdown(
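
The Overall value in the new caption is a task-count-weighted average of the Core (w/ CoT) and Open-ended macro scores. A minimal sketch of the arithmetic, checked against the Aquila_VL_2B summary added below:

```python
# Overall = (Core * N_core + Open-ended * N_open) / (N_core + N_open)
N_CORE, N_OPEN = 440, 65

def overall_score(core_macro: float, open_macro: float) -> float:
    return (core_macro * N_CORE + open_macro * N_OPEN) / (N_CORE + N_OPEN)

# macro_mean_score values from Aquila_VL_2B/summary_results.json:
print(overall_score(0.159970161379836, 0.24567572098570653))
# -> ~0.17100157, matching that file's "overall_score"
```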
constants.py CHANGED
@@ -28,7 +28,7 @@ We aim to provide cost-effective and accurate evaluation for multimodal models,
 
 ## 📊🔍 Results & Takeaways from Evaluating Top Models
 
-- GPT-4o (0513) and Claude 3.5 Sonnet (1022) lead the benchmark. Claude 3.5 Sonnet (1022) improves over Claude 3.5 Sonnet (0622) obviously in planning tasks (application dimension) and UI/Infographics inputs (input format dimension).
+- GPT-4o (0513) and Claude 3.5 Sonnet (1022) lead the benchmark. Claude 3.5 Sonnet (1022) improves over Claude 3.5 Sonnet (0620) obviously in planning tasks (application dimension) and UI/Infographics inputs (input format dimension).
 - Qwen2-VL stands out among open-source models, and its flagship model gets close to some proprietary flagship models
 - Chain-of-Thought (CoT) prompting improves proprietary models but has limited impact on open-source models
 - Gemini 1.5 Flash performs the best among all the evaluated efficiency models, but struggles with UI and document tasks
static/eval_results/Default/Aquila_VL_2B/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+   "model_summary": {
+     "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.159970161379836, "micro_mean_score": 0.15844711671722148 },
+     "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.24567572098570653, "micro_mean_score": 0.2704213241616509 },
+     "overall_score": 0.17100157004197775
+   },
+   "keyword_stats": {
+     "skills": {
+       "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.1796551584774396 },
+       "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.1263506560912463 },
+       "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.1775085349123463 },
+       "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.2114933522881099 },
+       "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.16251700109869488 },
+       "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.26453155444796583 },
+       "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.3729498746867168 },
+       "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.19090788408036002 },
+       "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.16500679466160564 },
+       "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.03972686819521137 }
+     },
+     "input_format": {
+       "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.07035116566014021 },
+       "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.11915109312705179 },
+       "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.18915652635850314 },
+       "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.21939978337316163 },
+       "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.17643260913333875 },
+       "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.2438396314831894 },
+       "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.08989401697906672 }
+     },
+     "output_format": {
+       "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.12241197113963243 },
+       "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.10758402844431432 },
+       "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.19372082302321905 },
+       "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.19201243810115767 },
+       "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.23278612647548963 },
+       "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.21664527852608348 }
+     },
+     "input_num": {
+       "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.12138133030990172 },
+       "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.01221681479628382 },
+       "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.17994400163273605 },
+       "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.21939978337316163 },
+       "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.18212149746318507 },
+       "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.21563163558700174 }
+     },
+     "app": {
+       "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.0981320856519089 },
+       "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.0557399538308785 },
+       "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.1351126472094214 },
+       "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.2025034827431662 },
+       "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.29326275059361956 },
+       "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.22529225586731416 },
+       "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.23810497886903373 },
+       "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.17867138975396438 }
+     }
+   }
+ }
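
Each `summary_results.json` follows the same schema as the Aquila_VL_2B file above: a `model_summary` block plus `keyword_stats` grouped into the skills, input_format, output_format, input_num, and app dimensions. A minimal sketch of reading one breakdown:

```python
import json

with open("static/eval_results/Default/Aquila_VL_2B/summary_results.json") as f:
    summary = json.load(f)

# keyword_stats maps each dimension to {keyword: {count, num_samples, tasks, average_score}}
for dimension, keywords in summary["keyword_stats"].items():
    name, stats = max(keywords.items(), key=lambda kv: kv[1]["average_score"])
    print(f"{dimension}: best keyword = {name} "
          f"({stats['average_score']:.3f} over {stats['count']} tasks)")
```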
static/eval_results/Default/Aquila_VL_2B/task_results.json ADDED
The diff for this file is too large to render.
 
static/eval_results/Default/Aria/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+   "model_summary": {
+     "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.289073788209904, "micro_mean_score": 0.2859007507765791 },
+     "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5103725263180767, "micro_mean_score": 0.5349957007738607 },
+     "overall_score": 0.31755778420402525
+   },
+   "keyword_stats": {
+     "skills": {
+       "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.3153649050553317 },
+       "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.34425736922415495 },
+       "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.3921740378709932 },
+       "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.37623282710622424 },
+       "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.271674311347156 },
+       "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.46313777834281344 },
+       "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.5692180451127821 },
+       "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.3152064038837139 },
+       "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.23851147782276536 },
+       "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.11246568298589892 }
+     },
+     "input_format": {
+       "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.28561724084490353 },
+       "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.2505346698796475 },
+       "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.3040414715952029 },
+       "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.41865640360591405 },
+       "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.3622713579911698 },
+       "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.35872259826035346 },
+       "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.1509096092007215 }
+     },
+     "output_format": {
+       "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.2846987779732631 },
+       "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.2899384042262363 },
+       "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.27412885527802433 },
+       "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.3117275816801635 },
+       "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.4523860109667709 },
+       "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.310055869988487 }
+     },
+     "input_num": {
+       "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.18301681783824644 },
+       "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.26651659725352617 },
+       "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.34236220565522313 },
+       "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.41865640360591405 },
+       "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.19142683154129833 },
+       "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.2596336265133595 }
+     },
+     "app": {
+       "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.3929243812973524 },
+       "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.1403503245041943 },
+       "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.25367910605102256 },
+       "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.3494812758481046 },
+       "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.3662927672998609 },
+       "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.28616079233761366 },
+       "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.3953949223279651 },
+       "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.26097385403450996 }
+     }
+   }
+ }
static/eval_results/Default/Aria/task_results.json ADDED
The diff for this file is too large to render.
 
static/eval_results/Default/Claude_3.5/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+   "model_summary": {
+     "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.5040975742801586, "micro_mean_score": 0.5002259116666758 },
+     "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.6373907158949892, "micro_mean_score": 0.6569647463456579 },
+     "overall_score": 0.5212541172602853
+   },
+   "keyword_stats": {
+     "skills": {
+       "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.5405089647404562 },
+       "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.6082834220752651 },
+       "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.5745077617490254 },
+       "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.5450038475783499 },
+       "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.4767692987630454 },
+       "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5756126284078804 },
+       "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6969774436090224 },
+       "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.5278843049497918 },
+       "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.4082144793870471 },
+       "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.23803578664609892 }
+     },
+     "input_format": {
+       "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.5691641481808987 },
+       "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.4795267886975966 },
+       "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.525848282456283 },
+       "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.508735695828719 },
+       "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5699094130430454 },
+       "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.5096772701625744 },
+       "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.4429640420975014 }
+     },
+     "output_format": {
+       "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.5066797418318023 },
+       "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.4971460788134188 },
+       "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.5278127103234661 },
+       "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.4490020843308984 },
+       "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.5838224169821388 },
+       "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.5456152399978661 }
+     },
+     "input_num": {
+       "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.46300075585789874 },
+       "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.5414381873407914 },
+       "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.5373019912310933 },
+       "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.508735695828719 },
+       "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.4422556748863689 },
+       "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.49311554035078103 }
+     },
+     "app": {
+       "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.6663170946790707 },
+       "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.3382015835012861 },
+       "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.5194010220575684 },
+       "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.532329797132399 },
+       "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.5808831682303479 },
+       "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.513474611293123 },
+       "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.5507075880782885 },
+       "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.47461998432626556 }
+     }
+   }
+ }
static/eval_results/Default/Claude_3.5/task_results.json ADDED
The diff for this file is too large to render.
 
static/eval_results/Default/Claude_3.5_new/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+   "model_summary": {
+     "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.5259191914020757, "micro_mean_score": 0.5230785894131227 },
+     "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.6563419761104125, "micro_mean_score": 0.6724419604471196 },
+     "overall_score": 0.5427062825031487
+   },
+   "keyword_stats": {
+     "skills": {
+       "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.5690045172520449 },
+       "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.6220681231036606 },
+       "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.6077980666415158 },
+       "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.5511440615639541 },
+       "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.4885536652013625 },
+       "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5908204006544897 },
+       "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6569473684210526 },
+       "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.5486763511384175 },
+       "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.4315385951907387 },
+       "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.2909419331017877 }
+     },
+     "input_format": {
+       "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.6048192628845258 },
+       "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.48924295292319175 },
+       "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.556418710368288 },
+       "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4946691340754988 },
+       "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5558756390298104 },
+       "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.5425198547046186 },
+       "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.44210335381541843 }
+     },
+     "output_format": {
+       "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.5187252051932875 },
+       "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.5071121107460066 },
+       "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.5387340524651681 },
+       "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.4824302644151348 },
+       "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.6242798397166945 },
+       "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.5782691045270721 }
+     },
+     "input_num": {
+       "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.4630277507828528 },
+       "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.5914338446093256 },
+       "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.5636254729390459 },
+       "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4946691340754988 },
+       "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.4828123870640382 },
+       "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.48756636014597515 }
+     },
+     "app": {
+       "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.6590137441693218 },
+       "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.39901670035164916 },
+       "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.5166853031535193 },
+       "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.5561634744977417 },
+       "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.6123769274172342 },
+       "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.5512015158810595 },
+       "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.565796566886933 },
+       "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.4763267502912362 }
+     }
+   }
+ }
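
Taken together, the two Claude summaries illustrate the planning improvement noted in constants.py. A minimal sketch of the comparison, assuming the `Claude_3.5` and `Claude_3.5_new` folders correspond to the 0620 and 1022 Sonnet releases:

```python
import json

def app_score(model: str, app: str) -> float:
    with open(f"static/eval_results/Default/{model}/summary_results.json") as f:
        return json.load(f)["keyword_stats"]["app"][app]["average_score"]

old = app_score("Claude_3.5", "Planning")      # 0.3382... (assumed 0620)
new = app_score("Claude_3.5_new", "Planning")  # 0.3990... (assumed 1022)
print(f"Planning: {old:.4f} -> {new:.4f} (+{new - old:.4f})")
```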
static/eval_results/Default/Claude_3.5_new/task_results.json ADDED
The diff for this file is too large to render.
 
static/eval_results/Default/GPT_4o/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+   "model_summary": {
+     "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.5265030595065238, "micro_mean_score": 0.5236338521693411 },
+     "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.6478225794744895, "micro_mean_score": 0.665391229578676 },
+     "overall_score": 0.5421184432647768
+   },
+   "keyword_stats": {
+     "skills": {
+       "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.5630758211022604 },
+       "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.6216411634729735 },
+       "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.616018277142757 },
+       "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.5823101249498799 },
+       "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.44177544539510955 },
+       "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.6345458069232931 },
+       "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6795263157894738 },
+       "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.5514924675940659 },
+       "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.39435038953269674 },
+       "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.22934807257231926 }
+     },
+     "input_format": {
+       "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.608083455060831 },
+       "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.491325251564869 },
+       "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.4999089647103332 },
+       "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.5315979872161023 },
+       "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5641404607063637 },
+       "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.5613545677222056 },
+       "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.47760591698367955 }
+     },
+     "output_format": {
+       "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.5388690453811203 },
+       "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.48037685656449847 },
+       "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.5994159671881645 },
+       "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.44606605087301393 },
+       "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.6274371950293718 },
+       "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.5448877153826162 }
+     },
+     "input_num": {
+       "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.4751133786848073 },
+       "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.5343350103400748 },
+       "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.5672657028463585 },
+       "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.5315979872161023 },
+       "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.4500928191484624 },
+       "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.4908653289106883 }
+     },
+     "app": {
+       "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.7056027785545881 },
+       "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.33202130899313653 },
+       "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.5032849161169843 },
+       "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.5510350848991218 },
+       "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.6095778863474799 },
+       "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.5283797185155754 },
+       "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.6135723164021851 },
+       "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.44047720383044436 }
+     }
+   }
+ }
static/eval_results/Default/GPT_4o/task_results.json ADDED
The diff for this file is too large to render.
 
static/eval_results/Default/GPT_4o_mini/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+   "model_summary": {
+     "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.40767494558789397, "micro_mean_score": 0.40431644154143376 },
+     "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.586537827213665, "micro_mean_score": 0.6133276010318144 },
+     "overall_score": 0.43069690064863675
+   },
+   "keyword_stats": {
+     "skills": {
+       "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.4492982787524939 },
+       "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.49026056071002017 },
+       "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.5168957112681365 },
+       "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.46731791428406805 },
+       "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.3406008235342885 },
+       "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5572925295284307 },
+       "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6902380952380953 },
+       "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.4189154010048976 },
+       "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.2943206715105082 },
+       "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.19422793560945503 }
+     },
+     "input_format": {
+       "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.47202628409684394 },
+       "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.3624496929166193 },
+       "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.38946844562183286 },
+       "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.45508480503584553 },
+       "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.47569921440672464 },
+       "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.465175334092545 },
+       "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.29410984789062117 }
+     },
+     "output_format": {
+       "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.41242028190533997 },
+       "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.3906415365938764 },
+       "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.44244772638735347 },
+       "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.3629944944697668 },
+       "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.5713834131825314 },
+       "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.39874839531459466 }
+     },
+     "input_num": {
+       "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.3359977324263039 },
+       "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.4305788513381019 },
+       "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.46343334374251277 },
+       "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.45508480503584553 },
+       "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.24651576711552803 },
+       "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.36981497185070983 }
+     },
+     "app": {
+       "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.5666618234843734 },
+       "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.2420320329702607 },
+       "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.3458483931206892 },
+       "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.43590838051817093 },
+       "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.5176671720617656 },
+       "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.3554299482098288 },
+       "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.5399167524341886 },
+       "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.32918280841495845 }
+     }
+   }
+ }
static/eval_results/Default/GPT_4o_mini/task_results.json ADDED
The diff for this file is too large to render.
 
static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+   "model_summary": {
+     "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, "macro_mean_score": 0.4189319021967416, "micro_mean_score": 0.41567515414375245 },
+     "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5691365176285039, "micro_mean_score": 0.5987532244196045 },
+     "overall_score": 0.4382651695295427
+   },
+   "keyword_stats": {
+     "skills": {
+       "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.46355333176347063 },
+       "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.4431807648811706 },
+       "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.4975887290434539 },
+       "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.49409642663278297 },
+       "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.38033540105052427 },
+       "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5621166766717235 },
+       "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6570726817042606 },
+       "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.4480877005302385 },
+       "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.3338006749329557 },
+       "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.16197013296986068 }
+     },
+     "input_format": {
+       "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.3971534837718938 },
+       "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.3448204918940882 },
+       "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.43525833484767545 },
+       "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4837362543956792 },
+       "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5111257660425502 },
+       "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.49366013155105076 },
+       "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.4001983820478609 }
+     },
+     "output_format": {
+       "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.386988040250785 },
+       "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.3884226428206387 },
+       "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.4425893080900246 },
+       "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.42223626366392253 },
+       "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.5390305634303021 },
+       "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.472066557554629 }
+     },
+     "input_num": {
+       "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.3666950113378685 },
+       "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.44571360028283974 },
+       "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.45400479933257654 },
+       "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4837362543956792 },
+       "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.35161402777057993 },
+       "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.3839609821519984 }
+     },
+     "app": {
+       "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.4822341581959653 },
+       "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.26434115361219657 },
+       "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.3677547363031234 },
+       "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.4640301382180305 },
+       "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.5348199655361041 },
+       "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.4890240042560499 },
+       "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.5126038207415967 },
+       "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.384818434165593 }
+     }
+   }
+ }
static/eval_results/Default/Gemini_1.5_flash_002/task_results.json ADDED
The diff for this file is too large to render.
 
static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.4822473962867704,
+ "micro_mean_score": 0.4764805563057179
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.5858190649927173,
+ "micro_mean_score": 0.6104901117798793
+ },
+ "overall_score": 0.4955784031499121
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.5202055934299538
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.5017043129027509
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.5532599716027446
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.546753787203128
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.425969084163906
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.5751012914154264
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.6982330827067671
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.513647745999633
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.3845337030093212
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.23899503258223884
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.4625032188638111
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.4292353723689881
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.4869625906903554
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.5028718355967439
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.5584779204331461
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.55005349042813
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.4292127751495457
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.44896309957892694
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.44418591808616864
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.5146447350354234
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.4688623462674191
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.5580414823700747
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.5538255562099124
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.39066515495086923
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.5370278962809547
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.5034399620483027
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.5028718355967439
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.4885398161821004
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.45544217378728585
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.5421439953094952
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.3335324339429373
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.43465181771633377
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.5250631828331306
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.5821004797173627
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.5124355410095621
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.5722329455291694
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.41210885517904977
+ }
+ }
+ }
+ }
static/eval_results/Default/Gemini_1.5_pro_002/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/Idefics3/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.08956972487602757,
+ "micro_mean_score": 0.08982225274252693
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.3210866162255635,
+ "micro_mean_score": 0.35649183147033553
+ },
+ "overall_score": 0.11936892871309657
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.123378776179585
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.09602065544451607
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.1661543932339007
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.13018902877020821
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.11200133210641629
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.1837120314657304
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.2364085213032582
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.15239546294916975
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.08255834173646705
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.03149369112824262
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.06151607584357764
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.10124344675801887
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.14147248511867794
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.15942387460900312
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.17458268378399872
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.13442937440893113
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.02766884416043467
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.15513016850044997
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.03757596375966502
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.05386631116442094
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.0760949224506388
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.2987797010800956
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.10403841600436024
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.0661753590325019
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.09190674791720088
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.12345439179884048
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.15942387460900312
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.11382786944230487
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.10803808254834846
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.11450308988278819
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.04671278220005028
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.0978814644137225
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.13283830731528018
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.09697463995668018
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.1840497279921703
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.1605667124060194
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.09835465288235297
+ }
+ }
+ }
+ }
static/eval_results/Default/Idefics3/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/InternVL2_2B/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.13141974398938763,
+ "micro_mean_score": 0.13063500716262516
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.23864417043743646,
+ "micro_mean_score": 0.24901117798796224
+ },
+ "overall_score": 0.14522090778963154
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.14491178903291552
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.12126906675624163
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.16912754929321935
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.18542274192083463
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.13923308734553164
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.23992252224543772
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.3420927318295739
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.14807577209152425
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.13036555933925006
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.01727799227799228
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.057021136657850864
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.10504085961245285
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.1625198552182714
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.18999779001767986
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.1487677475708977
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.2011727338536935
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.11886936592818943
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.1131404778887607
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.05739750616837997
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.15465451663650032
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.16044698450090833
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.21429521387724249
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.2128614316540013
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.03658352229780801
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.05757839721254354
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.15225683687839608
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.18999779001767986
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.17677460549936644
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.158165588340436
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.08722661966805
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.04102853815875594
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.11264043251709285
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.17001758160301803
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3332891958712894
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.1686125516807394
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.21169137106199268
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.10975764217070672
+ }
+ }
+ }
+ }
static/eval_results/Default/InternVL2_2B/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/InternVL2_5_2B/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.17806821966478364,
+ "micro_mean_score": 0.17708809739236367
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.2738430375585404,
+ "micro_mean_score": 0.2905417024935512
+ },
+ "overall_score": 0.19039567147289096
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.19614682488147464
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.18910947570579717
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.20543964378430513
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.23636598588530347
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.15691382827270517
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.28604169870255614
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.4248446115288219
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.18745928331343714
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.15097551654513372
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.030568378443583684
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.13898447520398388
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.13154711942685113
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.18343540213068474
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.20755556526976354
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.15983467048343838
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.26888883087046195
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.12906517409932386
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.14702422379343882
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.15324148486802894
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.19977956414542175
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.1665590610582109
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.2529339759528222
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.23420071687554841
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.09651832955404382
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.0784280378818194
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.21260786581183966
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.20755556526976354
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.138285387531761
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.20214332169825855
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.18128339685489062
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.053153113565753
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.12416116984428181
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.22449772657901465
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3762336977650326
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.19222024833691936
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.25056132494721467
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.15596334442569906
+ }
+ }
+ }
+ }
static/eval_results/Default/InternVL2_5_2B/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/InternVL2_5_78B/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.44132952988532753,
+ "micro_mean_score": 0.4397079059379812
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.5538024772749066,
+ "micro_mean_score": 0.5776870163370592
+ },
+ "overall_score": 0.4558062458859664
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.46893853078050696
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.5220829627238773
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.4933134095077618
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.477971701185214
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.3936387335462224
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.5610278744213835
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.6072907268170428
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.44533550848682696
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.3548055654857457
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.22852234519925363
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.4910486370158392
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.39410061025954557
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.43424133240430957
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.5300255483670417
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.4793195260560365
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.4622918421665308
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.3729954065847296
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.4226567593431527
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.4149806887502539
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.4904285184890861
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.4348674018783908
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.5124942746906233
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.4717682857925982
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.20496909081092754
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.4184724897299287
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.4951997132559491
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.5300255483670417
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.286105084660728
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.39635000103107665
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.5401547630322637
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.26403470419652064
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.3933356676003734
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.5168098196770042
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.47731479110938463
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.4388571290145052
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.5034762755043025
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.37742798395328586
+ }
+ }
+ }
+ }
static/eval_results/Default/InternVL2_5_78B/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/InternVL2_76B/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.3562710424410931,
+ "micro_mean_score": 0.35129859801162616
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.5192997443033639,
+ "micro_mean_score": 0.5421324161650903
+ },
+ "overall_score": 0.3772549347599992
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.38193012983650343
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.41315219763443384
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.43665980552577693
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.4265623936500962
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.2975890791763991
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.5257990949897898
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.5779473684210527
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.33287081421166276
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.2949505390920417
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.17036496432397477
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.3634339625985008
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.31396468806559114
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.3473756113126343
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.395893002855977
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.44982107744035305
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.42875248733027654
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.2868239162778749
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.3630499545707523
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.3476691827105281
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.3943337471922549
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.29244088978470345
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.45822072478616577
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.3879326330400817
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.20309901738473166
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.34771123515123364
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.4145693044465943
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.395893002855977
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.24403942809507134
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.3153417935059416
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.4306947454508794
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.2132321995754061
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.2953329718984368
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.42202934355552685
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.47409276729986083
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.30014798153766264
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.4625649385962016
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.2868813944130515
+ }
+ }
+ }
+ }
static/eval_results/Default/InternVL2_76B/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
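Because every model directory carries the same keyword_stats categories, the per-category averages can be pulled side by side across models. A minimal sketch, assuming the static/eval_results/Default/&lt;model&gt;/summary_results.json layout used by the paths in this commit (the script itself is illustrative, not part of the repo):

```python
import glob
import json
import os

# Collect each model's per-application average_score from keyword_stats["app"].
rows = {}
for path in glob.glob("static/eval_results/Default/*/summary_results.json"):
    model = os.path.basename(os.path.dirname(path))
    with open(path) as f:
        app_stats = json.load(f)["keyword_stats"]["app"]
    rows[model] = {app: round(v["average_score"], 3)
                   for app, v in app_stats.items()}

for model, scores in sorted(rows.items()):
    print(model, scores)
```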
static/eval_results/Default/InternVL2_8B/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.25956581776451815,
+ "micro_mean_score": 0.2546984460483302
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1165,
+ "macro_mean_score": 0.3978571701460552,
+ "micro_mean_score": 0.4108583690987125
+ },
+ "overall_score": 0.2773656948037259
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.2817247716997634
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.280559214034858
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2511,
+ "tasks": [],
+ "average_score": 0.32020728060179815
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2469,
+ "tasks": [],
+ "average_score": 0.325593535916075
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.24118253695139918
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.39684007367798446
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.4700852130325815
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.27052668526005397
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2439,
+ "tasks": [],
+ "average_score": 0.23189345356483618
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.08260405712900723
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.22800928556370195
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.2013779290163996
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.2804429603269583
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 700,
+ "tasks": [],
+ "average_score": 0.34791358240562653
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.2942163420306113
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.3388056726588417
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.10933317885944857
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.250804626773504
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.2522493284864019
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.27414636444623874
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.22381302045502052
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1456,
+ "tasks": [],
+ "average_score": 0.3537549824897016
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.30261189962428353
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.15434618291761149
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.19872104324302098
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.30088711082969344
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 700,
+ "tasks": [],
+ "average_score": 0.34791358240562653
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.17725087609332119
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.2532272454839157
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.29129840423784176
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.12166926715781588
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.24700310231619527
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2315,
+ "tasks": [],
+ "average_score": 0.3214666523378005
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3995660275981844
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.24614711281861912
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.3393895915929317
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.22078333222564453
+ }
+ }
+ }
+ }
static/eval_results/Default/InternVL2_8B/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/Llama_3_2_11B/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.15999641916771298,
+ "micro_mean_score": 0.15809331016967038
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.3173342406187366,
+ "micro_mean_score": 0.3487962166809973
+ },
+ "overall_score": 0.1802478219287358
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.1907604552173455
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.14328677752263275
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.19646404502647707
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.22399113135844315
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.13303760019716085
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.323153603297999
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.4260501253132832
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.1770852858056774
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.15366454315378308
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.06563884729522687
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.11886347847341794
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.11489351406848371
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.1693681214060816
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.2123769209846321
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.2520175802062012
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.2485354956932213
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.06418655520777307
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.12417283740525839
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.16374180545556977
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.1576236804437753
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.15014439824913947
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.3003142292328822
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.19270157739425633
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.1463246409674981
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.0732004839476103
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.1960107191983825
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.2123769209846321
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.1351857051327849
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.18586695387250338
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.17288724679416761
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.08100042975820579
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.0575426944971537
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.19899465185565898
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.254316961351997
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.162801811963855
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.28055776664538923
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.13937853323074623
+ }
+ }
+ }
+ }
static/eval_results/Default/Llama_3_2_11B/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/Mammoth_VL/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.264052880412689,
+ "micro_mean_score": 0.2626894374387823
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.37992668750165337,
+ "micro_mean_score": 0.40120378331900275
+ },
+ "overall_score": 0.27896733083008046
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.30194776127683565
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.2365295791606494
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.2993927028494267
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.3366347826116991
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.2408454736444444
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.37895522991264047
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.48003508771929826
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.27232427744946475
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.24522937191710698
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.11457024299726488
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.18941525254390731
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.1718334741390191
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.28108187023954245
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.3391119999611432
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.36434285930327387
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.36915384448504296
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.15940750469262005
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.2456942956200745
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.21586513216389874
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.29359048024032264
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.2646677074112521
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.34733130661096645
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.3286125236284589
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.16358654572940287
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.25463059203015115
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.2919119209789575
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.3391119999611432
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.20016011839130254
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.2679179451692527
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.23600902063965679
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.15326915093278803
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.20668466311255687
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.33348955971237954
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3759170425350556
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.23894961766260706
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.351703435685048
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.26074348700688493
+ }
+ }
+ }
+ }
static/eval_results/Default/Mammoth_VL/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/MiniCPM_v2.6/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.22955895202146906,
+ "micro_mean_score": 0.22560399396899078
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.41728623355613875,
+ "micro_mean_score": 0.43452278589853827
+ },
+ "overall_score": 0.2537218694467236
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.2604967101191775
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.2500331562865158
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.3003169369011028
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.31808748114668184
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.18281637763548025
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.40732197204308807
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.48798245614035085
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.23723675736151562
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.1968926733821904
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.08735883237069725
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.21195711598986072
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.18639148159043903
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.21578309681746147
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.3527537836840162
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.3096882575625531
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.3176880312524649
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.0755920550038197
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.23506388020592064
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.1781127776443048
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.2551275278138797
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.20833171754655547
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.36473950920880716
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.293386806641223
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.13955971277399848
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.23596215721092323
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.26319603880798287
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.3527537836840162
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.17888270664238365
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.22288558250834017
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.2666989364424082
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.11693267119342445
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.15342045420318667
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.29243044121840894
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3777897246686755
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.25714862989687987
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.33187729423141027
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.16493399805627715
+ }
+ }
+ }
+ }
static/eval_results/Default/MiniCPM_v2.6/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/NVLM/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.21589726765847422,
+ "micro_mean_score": 0.21406043849932396
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.3478114310231307,
+ "micro_mean_score": 0.3947549441100602
+ },
+ "overall_score": 0.23287631838857856
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.21591473223174515
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.27426258729618225
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.284874072963892
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.2134087963800149
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.2525993645909815
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.4029543142569604
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.4317142857142857
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.2442484196551863
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.1424318574406695
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.046798309600525674
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.19655048708297065
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.18621338396242557
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.2922667531642391
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.0
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.3447361496776569
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.29674507895195534
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.09716389574493003
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.19684666506287793
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.2199792859352912
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.25164831125437204
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.2396831363622878
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.3215948035793096
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.1853526865291571
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.0
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.0
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.3352056263801705
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.0
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.038244047619047615
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.2100484481849172
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.15704252277801936
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.06688589450465973
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.2292747206409446
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.2689383226748064
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.18857142857142856
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.23682040748983965
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.3656649917873737
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.26866914106442213
+ }
+ }
+ }
+ }
static/eval_results/Default/NVLM/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/Phi-3.5-vision/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.22995297916629392,
+ "micro_mean_score": 0.22708502951025372
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.3947914647737769,
+ "micro_mean_score": 0.42459157351676696
+ },
+ "overall_score": 0.2511698139474551
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.2550326045763433
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.24395249720074527
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.2858236369733704
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.29876274710122536
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.21972896566746963
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.37513466171380355
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.4713934837092732
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.25475240046465697
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.20386233377001492
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.06657701969095552
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.16556787388989183
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.17989790940001513
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.2671646581690049
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.24920333780186898
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.3057560384411286
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.3341992361416253
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.12884156381685322
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.20494682188374266
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.21180084406324556
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.2609992615064841
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.2149689274645855
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.365192668303297
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.2593652357274648
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.10107709750566891
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.11861055655587921
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.2824151476986241
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.24920333780186898
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.1980440594073205
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.2636292373854696
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.20747122167273002
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.08602953103518936
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.20136893467064246
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.30979039348232706
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3495072422622861
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.25858403958844717
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.3357218088688187
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.21140555087788399
+ }
+ }
+ }
+ }
static/eval_results/Default/Phi-3.5-vision/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/Pixtral_12B/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.31362045151669854,
+ "micro_mean_score": 0.3100986209078182
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.4566234428542061,
+ "micro_mean_score": 0.4870593293207223
+ },
+ "overall_score": 0.33202677713439754
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.34184129499032456
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.37667712211439836
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.37896441862738645
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.37077191302051077
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.2843861774995234
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.4098150360139686
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.533077694235589
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.3372902862054838
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.25372282838901716
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.09524894246403817
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.2972619996610934
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.28304049684103855
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.33523333364720703
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.3988260865341648
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.39117521970978353
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.35583482417594536
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.21897822147396953
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.3436473210057542
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.28979044279399635
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.33530850344530555
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.30160980000905374
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.4166613092238044
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.30796171250186904
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.22871315192743763
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.21669652626580332
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.36087312117067055
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.3988260865341648
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.24616927284658197
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.2900329121369093
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.42652313209316933
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.1209559708312353
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.25678368121442124
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.37605128363484847
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.4576088857728113
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.3464929909487855
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.3858431845580602
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.2549787156825223
+ }
+ }
+ }
+ }
static/eval_results/Default/Pixtral_12B/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/Qwen2_VL_2B/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.20877163406364055,
+ "micro_mean_score": 0.20561526268932287
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.3154302566225611,
+ "micro_mean_score": 0.33856405846947557
+ },
+ "overall_score": 0.22249997162072932
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.22236161923122505
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.23701014663017753
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.25669221785292334
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.26526414975225454
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.17623548305581763
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.31250702198481506
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.4140676691729323
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.20802820480076603
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.17320633068307653
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.06209506566980099
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.190837839372028
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.16287824421269087
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.19640906475019812
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.2520741776922928
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.24883076673424442
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.2877316297453947
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.13398525561847363
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.1624451002757208
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.20960092816529263
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.19986806708136184
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.2201024015934558
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.30248748033122763
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.256631742010999
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.07681405895691609
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.10526691703628158
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.25018977062352593
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.2520741776922928
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.17435940889565366
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.21286783416184518
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.2521972668785968
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.06967138760493456
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.16996250112948405
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.27603334911345223
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.31002436092347696
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.21061929716065056
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.2656728023444808
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.16356158787929762
+ }
+ }
+ }
+ }
static/eval_results/Default/Qwen2_VL_2B/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/Qwen2_VL_72B/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.4542376574527161,
+ "micro_mean_score": 0.4501201906164793
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.5639771804231668,
+ "micro_mean_score": 0.5835339638865004
+ },
+ "overall_score": 0.4683625465479226
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.48669152179713876
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.5291932917937967
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.53654503409075
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.4931554892760308
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.3908023665629473
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.5668846347262286
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.6121127819548872
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.4493794346300551
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.33622171962424363
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.21642754068858566
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.5263730250833892
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.42759570727857965
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.4228561177227288
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.4780253686541936
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.5070774860945021
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.4807292191169126
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.38847545874852984
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.4359156358804688
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.43781407268698613
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.49080138099759946
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.42481004254128113
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.5132810622684265
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.5062248706593999
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.3063303099017385
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.523959576707116
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.4879791577413812
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.4780253686541936
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.34846161336322395
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.44101149919132854
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.5663587858366833
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.3067825586087303
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.4121566368482877
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.5176521211872086
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.5030444649397028
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.45616267568458396
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.5047683071464567
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.3553838743540432
+ }
+ }
+ }
+ }
static/eval_results/Default/Qwen2_VL_72B/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/Qwen2_VL_7B/summary_results.json ADDED
@@ -0,0 +1,251 @@
+ {
+ "model_summary": {
+ "core": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "macro_mean_score": 0.3293449599230247,
+ "micro_mean_score": 0.325331493515679
+ },
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1170,
+ "macro_mean_score": 0.43955105763038577,
+ "micro_mean_score": 0.45508547008546996
+ },
+ "overall_score": 0.34352990319228904
+ },
+ "keyword_stats": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.3506773570484231
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.38363163370919123
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2511,
+ "tasks": [],
+ "average_score": 0.3882785389756705
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2469,
+ "tasks": [],
+ "average_score": 0.38292659892379843
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.2730765188348748
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.4625711182912848
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.5287318295739348
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.32297080808954215
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2439,
+ "tasks": [],
+ "average_score": 0.2561357336105554
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.12651411144309255
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.35229497847636093
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.2881996369284258
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.3162917354476226
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 700,
+ "tasks": [],
+ "average_score": 0.3555910609857979
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.3513518594470202
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.39509504888372243
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.19173322639974366
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.3118818521697947
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.3323478338046426
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.31975345327634014
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.3207400992620562
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1456,
+ "tasks": [],
+ "average_score": 0.39680785337230745
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.38069986029874947
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.21448412698412703
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.34991843422677277
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.36487656334089386
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 700,
+ "tasks": [],
+ "average_score": 0.3555910609857979
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.23950364354876252
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.31886513111201115
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.3972495309304478
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.18098305857595157
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.30887234822244314
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2315,
+ "tasks": [],
+ "average_score": 0.39256038521661607
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.44924313486983725
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.2880278656037017
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.4015531477048036
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.24179792538224956
+ }
+ }
+ }
+ }
static/eval_results/Default/Qwen2_VL_7B/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/all_model_keywords_stats.json DELETED
The diff for this file is too large to render. See raw diff
 
static/eval_results/Default/all_summary.json DELETED
@@ -1,525 +0,0 @@
- {
- "GPT_4o": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.5203440930873326,
- "micro_mean_score": 0.514302640282204
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.5265030595065238,
- "micro_mean_score": 0.5236338521693411
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.6478225794744895,
- "micro_mean_score": 0.665391229578676
- },
- "overall_score": 0.5421184432647768
- },
- "Gemini_1.5_pro_002": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4699992918320008,
- "micro_mean_score": 0.4651116133689296
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4822473962867704,
- "micro_mean_score": 0.4764805563057179
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.5858190649927173,
- "micro_mean_score": 0.6104901117798793
- },
- "overall_score": 0.4955784031499121
- },
- "Gemini_1.5_flash_002": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.41898948981774853,
- "micro_mean_score": 0.4127376993779598
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4189319021967416,
- "micro_mean_score": 0.41567515414375245
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.5691365176285039,
- "micro_mean_score": 0.5987532244196045
- },
- "overall_score": 0.43831534488249924
- },
- "Claude_3.5": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.48800427486796155,
- "micro_mean_score": 0.4814327812005499
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.5040975742801586,
- "micro_mean_score": 0.5002259116666758
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.6373907158949892,
- "micro_mean_score": 0.6569647463456579
- },
- "overall_score": 0.5212541172602853
- },
- "Claude_3.5_new": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4919657684484185,
- "micro_mean_score": 0.4874520567007144
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.5259191914020757,
- "micro_mean_score": 0.5230785894131227
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.6563419761104125,
- "micro_mean_score": 0.6724419604471196
- },
- "overall_score": 0.5427062825031487
- },
- "GPT_4o_mini": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.39854757130003565,
- "micro_mean_score": 0.3936551517403452
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.40767494558789397,
- "micro_mean_score": 0.40431644154143376
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.586537827213665,
- "micro_mean_score": 0.6133276010318144
- },
- "overall_score": 0.43069690064863675
- },
- "Qwen2_VL_72B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.46406654108789214,
- "micro_mean_score": 0.4584702152011697
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.4542376574527161,
- "micro_mean_score": 0.4501201906164793
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.5639771804231668,
- "micro_mean_score": 0.5835339638865004
- },
- "overall_score": 0.4769263263488681
- },
- "Qwen2_VL_7B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3480020832611913,
- "micro_mean_score": 0.3441858958345098
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3293449599230247,
- "micro_mean_score": 0.325331493515679
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1170,
- "macro_mean_score": 0.43955105763038577,
- "micro_mean_score": 0.45508547008546996
- },
- "overall_score": 0.3597856146156421
- },
- "llava_onevision_72B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3199332158220174,
- "micro_mean_score": 0.31770770553892647
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.2974368415462532,
- "micro_mean_score": 0.2956217833156672
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.4599484231632498,
- "micro_mean_score": 0.4850386930352536
- },
- "overall_score": 0.33795497518277007
- },
- "llava_onevision_7B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.22409531510496777,
- "micro_mean_score": 0.22238854298563537
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.21362697219149712,
- "micro_mean_score": 0.21073910058505504
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.33979975321921935,
- "micro_mean_score": 0.36474634565778147
- },
- "overall_score": 0.23898796555531696
- },
- "InternVL2_76B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3502244283768534,
- "micro_mean_score": 0.3456783051732046
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.3562710424410931,
- "micro_mean_score": 0.35129859801162616
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.5192997443033639,
- "micro_mean_score": 0.5421324161650903
- },
- "overall_score": 0.3772549347599992
- },
- "InternVL2_8B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.25956581776451815,
- "micro_mean_score": 0.2546984460483302
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.24090301358258295,
- "micro_mean_score": 0.23819084111520938
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1165,
- "macro_mean_score": 0.3978571701460552,
- "micro_mean_score": 0.4108583690987125
- },
- "overall_score": 0.2773656948037259
- },
- "MiniCPM_v2.6": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.2287645706203155,
- "micro_mean_score": 0.2249087742955901
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.22955895202146906,
- "micro_mean_score": 0.22560399396899078
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.41728623355613875,
- "micro_mean_score": 0.43452278589853827
- },
- "overall_score": 0.2537218694467236
- },
- "Phi-3.5-vision": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.23271251159409778,
- "micro_mean_score": 0.2296262323791101
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.22995297916629392,
- "micro_mean_score": 0.22708502951025372
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.3947914647737769,
- "micro_mean_score": 0.42459157351676696
- },
- "overall_score": 0.25357415903306635
- },
- "Pixtral_12B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.31905695620134694,
- "micro_mean_score": 0.31556607913724777
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.31362045151669854,
- "micro_mean_score": 0.3100986209078182
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.4566234428542061,
- "micro_mean_score": 0.4870593293207223
- },
- "overall_score": 0.33676353369131895
- },
- "Llama_3_2_11B": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.10044261716549671,
- "micro_mean_score": 0.09980638766828835
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.15999641916771298,
- "micro_mean_score": 0.15809331016967038
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.3173342406187366,
- "micro_mean_score": 0.3487962166809973
- },
- "overall_score": 0.1802478219287358
- },
- "Idefics3": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.11118980301103833,
- "micro_mean_score": 0.11201785633274061
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.08956972487602757,
- "micro_mean_score": 0.08982225274252693
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.3210866162255635,
- "micro_mean_score": 0.35649183147033553
- },
- "overall_score": 0.138206224513898
- },
- "Aria": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.30485930718699694,
- "micro_mean_score": 0.3016713629035311
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.289073788209904,
- "micro_mean_score": 0.2859007507765791
- },
- "open": {
- "num_eval_tasks": 65,
- "num_eval_samples": 1163,
- "macro_mean_score": 0.5103725263180767,
- "micro_mean_score": 0.5349957007738607
- },
- "overall_score": 0.3313115037088191
- },
- "NVLM": {
- "core_noncot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.2420528895703979,
- "micro_mean_score": 0.23838419989257642
- },
- "core_cot": {
- "num_eval_tasks": 440,
- "num_eval_samples": 6539,
- "num_not_eval_samples": 0,
- "macro_mean_score": 0.21589726765847422,
429
- "micro_mean_score": 0.21406043849932396
430
- },
431
- "open": {
432
- "num_eval_tasks": 65,
433
- "num_eval_samples": 1163,
434
- "macro_mean_score": 0.3478114310231307,
435
- "micro_mean_score": 0.3947549441100602
436
- },
437
- "overall_score": 0.25566537510391796
438
- },
439
- "InternVL2_2B": {
440
- "core_noncot": {
441
- "num_eval_tasks": 440,
442
- "num_eval_samples": 6539,
443
- "num_not_eval_samples": 0,
444
- "macro_mean_score": 0.09089701489596874,
445
- "micro_mean_score": 0.09036328295381871
446
- },
447
- "core_cot": {
448
- "num_eval_tasks": 440,
449
- "num_eval_samples": 6539,
450
- "num_not_eval_samples": 0,
451
- "macro_mean_score": 0.13141974398938763,
452
- "micro_mean_score": 0.13063500716262516
453
- },
454
- "open": {
455
- "num_eval_tasks": 65,
456
- "num_eval_samples": 1163,
457
- "macro_mean_score": 0.23864417043743646,
458
- "micro_mean_score": 0.24901117798796224
459
- },
460
- "overall_score": 0.14522090778963154
461
- },
462
- "Qwen2_VL_2B": {
463
- "core_noncot": {
464
- "num_eval_tasks": 440,
465
- "num_eval_samples": 6539,
466
- "num_not_eval_samples": 0,
467
- "macro_mean_score": 0.16448220309703876,
468
- "micro_mean_score": 0.1610710186451323
469
- },
470
- "core_cot": {
471
- "num_eval_tasks": 440,
472
- "num_eval_samples": 6539,
473
- "num_not_eval_samples": 0,
474
- "macro_mean_score": 0.20877163406364055,
475
- "micro_mean_score": 0.20561526268932287
476
- },
477
- "open": {
478
- "num_eval_tasks": 65,
479
- "num_eval_samples": 1163,
480
- "macro_mean_score": 0.3154302566225611,
481
- "micro_mean_score": 0.33856405846947557
482
- },
483
- "overall_score": 0.22249997162072932
484
- },
485
- "Aquila_VL_2B": {
486
- "core_noncot": {
487
- "num_eval_tasks": 440,
488
- "num_eval_samples": 6539,
489
- "num_not_eval_samples": 0,
490
- "macro_mean_score": 0.16317824309838627,
491
- "micro_mean_score": 0.16198837245148487
492
- },
493
- "core_cot": {
494
- "num_eval_tasks": 440,
495
- "num_eval_samples": 6539,
496
- "num_not_eval_samples": 0,
497
- "macro_mean_score": 0.159970161379836,
498
- "micro_mean_score": 0.15844711671722148
499
- },
500
- "open": {
501
- "num_eval_tasks": 65,
502
- "num_eval_samples": 1163,
503
- "macro_mean_score": 0.24567572098570653,
504
- "micro_mean_score": 0.2704213241616509
505
- },
506
- "overall_score": 0.17379673035120966
507
- },
508
- "Mammoth_VL": {
509
- "core_noncot": {
510
- "num_eval_tasks": 440,
511
- "num_eval_samples": 6539,
512
- "num_not_eval_samples": 0,
513
- "macro_mean_score": 0.264052880412689,
514
- "micro_mean_score": 0.2626894374387823
515
- },
516
- "core_cot": null,
517
- "open": {
518
- "num_eval_tasks": 65,
519
- "num_eval_samples": 1163,
520
- "macro_mean_score": 0.37992668750165337,
521
- "micro_mean_score": 0.40120378331900275
522
- },
523
- "overall_score": 0.27896733083008046
524
- }
525
- }
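For reference, every deleted entry above has the same shape: per-model `core_noncot`, `core_cot`, and `open` sections plus a stored `overall_score`, and the numbers are consistent with the overall score being a task-count-weighted mean of the better core macro score and the open macro score. The sketch below recomputes it under that inferred rule; this is read off the figures in this diff, not taken from the repo's code, and the function name is illustrative.

```python
def overall_score(entry):
    """Recompute overall_score for one model entry of the deleted summary JSON.

    Assumed aggregation (inferred from the numbers in this diff): the core
    score is the better macro_mean_score of core_cot / core_noncot, and the
    overall score is the mean of the core and open macro scores weighted by
    their num_eval_tasks (440 and 65 in these results).
    """
    # core_cot can be null (see the Mammoth_VL entry), so filter it out.
    core_sections = [s for s in (entry.get("core_cot"), entry.get("core_noncot"))
                     if s is not None]
    core_score = max(s["macro_mean_score"] for s in core_sections)
    n_core = core_sections[0]["num_eval_tasks"]

    open_score = entry["open"]["macro_mean_score"]
    n_open = entry["open"]["num_eval_tasks"]

    return (n_core * core_score + n_open * open_score) / (n_core + n_open)

# Check against the InternVL2_76B entry above:
# (440 * 0.3562710424410931 + 65 * 0.5192997443033639) / 505
# = 0.3772549347599992, matching the stored overall_score.
```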