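The JSON below records MEGA-Bench keyword statistics per model. Each top-level key is a model name, mapping five breakdown dimensions ("skills", "input_format", "output_format", "input_num", "app") to keyword buckets, each with a task count (`count`), a sample count (`num_samples`), a task list (empty in this export), and an `average_score`. As a minimal sketch of how the structure can be consumed, the snippet below loads a local copy of the file (the path and the sample-weighted aggregation are assumptions for illustration, not the official MEGA-Bench scoring):

```python
import json

# Load a local copy of the stats file (the filename is an assumption here).
with open("all_model_keywords_stats.json") as f:
    stats = json.load(f)

# Each model maps dimension names ("skills", "input_format", "output_format",
# "input_num", "app") to keyword buckets carrying count/num_samples/average_score.
for model, dims in stats.items():
    skills = dims["skills"]
    # Sample-weighted mean over the skill buckets. Skills can overlap across
    # tasks, so this is an illustrative aggregate, not the benchmark's
    # official overall score.
    total = sum(v["num_samples"] for v in skills.values())
    weighted = sum(v["average_score"] * v["num_samples"] for v in skills.values())
    print(f"{model}: {weighted / total:.4f} over {total} samples")
```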
{
"NVLM": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.24033557047857043
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.32154059695494047
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.2937052996171993
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.22845955700594492
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.2639741933075709
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.40870864071047447
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.4555238095238095
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.25785191641267197
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.15679681195908274
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.0672259242345112
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.23922823287047076
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.21734036617042948
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.30313485498585124
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.0
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.34726189956094355
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.3264757655296162
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.056894830390305184
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.22868389095927066
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.2788963949121424
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.2787764976961992
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.23349712171444964
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.3215948035793096
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.18487055428231897
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.0
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.0
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.3680809151131777
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.0
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.03838410364145658
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.2325581694709435
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.22773778915303383
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.08048160660797504
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.2390024647851972
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.30211261814126533
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.18857142857142856
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.24908307640275493
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.3724877947012685
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.24529601154794037
}
}
},
"GPT_4o_mini": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.44928744961868194
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.48842488118273475
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.5152626716886682
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.4672966076116977
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.3406008235342885
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5572281917334303
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6902380952380953
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.4189154010048976
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.2943206715105082
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.19422793560945503
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.4700389569079038
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.3624496929166193
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.38946844562183286
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.45508480503584553
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.47569921440672464
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.46468618797917643
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.29410984789062117
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.41174000979649644
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.38893151244736324
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.44244772638735347
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.3629944944697668
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5713834131825314
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.39874839531459466
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.3359977324263039
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.4260710116168476
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.46322170353087255
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.45508480503584553
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.24651576711552803
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.3697506340557095
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.5640948591986592
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.2420320329702607
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.3458483931206892
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.43544861040322835
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.5176671720617656
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.3554299482098288
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.5398829253460956
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.32918280841495845
}
}
},
"Llama_3_2_11B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.1907604552173455
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.14280015951776653
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.1960311445935766
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.22399113135844315
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.13303760019716085
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.323153603297999
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.4260501253132832
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.1770852858056774
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.15366454315378308
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.06563884729522687
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.11886347847341794
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.11489351406848371
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.1693681214060816
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.2123769209846321
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.2520175802062012
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.24806929522702081
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.06418655520777307
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.12349256529641485
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.16374180545556977
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.1576236804437753
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.15014439824913947
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.3003142292328822
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.19270157739425633
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.1463246409674981
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.0732004839476103
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.19579907898674231
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.2123769209846321
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.1351857051327849
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.18586695387250338
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.17288724679416761
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.08100042975820579
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.0575426944971537
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.19853488174071646
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.254316961351997
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.162801811963855
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.28055776664538923
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.13937853323074623
}
}
},
"Claude_3.5_new": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.5690042283891658
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.6220681231036606
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.6077980666415158
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5511434932168607
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.4885536652013625
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.590818684469149
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6569473684210526
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.5486763511384175
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.4315385951907387
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.2909419331017877
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.6048192628845258
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.48924295292319175
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.556418710368288
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4946691340754988
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.5558756390298104
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.542519242638518
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.44210335381541843
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.5187252051932875
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.5071113150600759
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.5387340524651681
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.4824302644151348
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.6242798397166945
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.5782691045270721
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.4630277507828528
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.5914338446093256
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.5636254729390459
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4946691340754988
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.4828123870640382
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.48756464396063437
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.6590137441693218
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.39901670035164916
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.5166853031535193
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.5561634744977417
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.6123769274172342
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.5512015158810595
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.5657956645626817
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.4763267502912362
}
}
},
"InternVL2_8B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.2817247716997634
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.2794121858805306
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2511,
"tasks": [],
"average_score": 0.31918687243853283
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2469,
"tasks": [],
"average_score": 0.325593535916075
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.24118253695139918
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.39684007367798446
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.4700852130325815
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.27052668526005397
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2439,
"tasks": [],
"average_score": 0.23189345356483618
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.08260405712900723
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.2277532691786533
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.2013779290163996
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.2804429603269583
},
"Videos": {
"count": 43,
"num_samples": 700,
"tasks": [],
"average_score": 0.34791358240562653
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.2942163420306113
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.33787327172644077
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.10933317885944857
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.24944408255581693
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.25203287826995174
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.27414636444623874
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.22381302045502052
},
"open_ended_output": {
"count": 80,
"num_samples": 1456,
"tasks": [],
"average_score": 0.3537549824897016
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.30261189962428353
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.15434618291761149
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.19814032315010577
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.30046383040641306
},
"video": {
"count": 43,
"num_samples": 700,
"tasks": [],
"average_score": 0.34791358240562653
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.17725087609332119
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.2532272454839157
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.29096771640715396
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.12166926715781588
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.24700310231619527
},
"Perception": {
"count": 145,
"num_samples": 2315,
"tasks": [],
"average_score": 0.3205471121079154
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.3995660275981844
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.24614711281861912
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.3393895915929317
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.22078333222564453
}
}
},
"llava_onevision_7B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.2524786809911341
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.1902376706945491
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.255069390206439
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.29981286990552625
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.18973491465938852
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.36842322314565323
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.44998746867167916
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.2445135206648208
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.21802943568344288
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.06658775725427067
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.1466163383815089
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.13297395577964055
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.24236719143449742
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.30985943541023103
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.3199731020402028
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.3258716730180874
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.13043163858789789
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.20209776978059824
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.18285692568564196
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.25384794412815426
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.2200472229099345
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.3127341248874411
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.2802999516721972
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.1476473922902494
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.13787962981142515
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.25459683619676365
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.30985943541023103
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.1778991941079372
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.2410111891690358
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.19274192395698486
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.09846926279075068
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.15189414475467605
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.2845922887108415
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.3600079950628582
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.23654776813656775
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.3271805711561501
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.22080546908673507
}
}
},
"llava_onevision_72B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.3615741356043519
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.282401662313336
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.36653344218973427
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.42146038539739283
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.2951434804409883
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.478119286755779
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6005438596491229
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.31663222188988865
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.29633645022129285
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.13872280436872364
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.23294708136735856
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.2126914943750874
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.34566020099204997
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4446001874842145
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.4401364830377099
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.42429297143518147
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.23897262553543516
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.28614732096244
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.25872873777911126
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.370724080249463
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.3065719940769206
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.4293132525502993
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.3986052416087927
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.20730347694633405
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.27911174307216713
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.3481968601113118
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4446001874842145
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.25013213032747944
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.34156793747875674
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.30653989171354723
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.18168666652660437
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.23240790940031927
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.38316803441883945
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.4807891958712894
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.31702495228966576
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.4358874880224115
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.31588468105075895
}
}
},
"Gemini_1.5_pro_002": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.5201947642961418
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.49947304390648534
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.5512750115216515
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5467324805307577
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.425969084163906
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5750369536204262
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6982330827067671
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.513647745999633
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.3845337030093212
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.23899503258223884
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.4592162957187749
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.4292353723689881
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.4869625906903554
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.5028718355967439
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.5584779204331461
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.5500305447809621
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.4292127751495457
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.44896309957892694
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.44137714463131966
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.5146447350354234
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.4688623462674191
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5580414823700747
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.5538255562099124
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.39066515495086923
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.5295721925617263
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.5034399620483027
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.5028718355967439
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.4885398161821004
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.4553778359922855
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.5378983862471568
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.3335324339429373
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.43465181771633377
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.5250631828331306
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.5821004797173627
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.5124355410095621
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.5721991184410764
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.41210885517904977
}
}
},
"MiniCPM_v2.6": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.2604969133146555
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.24828453993935928
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.2987613496312298
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.31808788094038193
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.18281637763548025
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.4073231792632807
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.48798245614035085
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.23723675736151562
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.1968926733821904
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.08735883237069725
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.21153173491931837
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.18639148159043903
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.21578309681746147
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.3527537836840162
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.3096882575625531
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.31628986040092516
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.0755920550038197
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.23302306387939006
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.17775369699584467
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.2551275278138797
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.20833171754655547
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.36473950920880716
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.293386806641223
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.13955971277399848
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.23499726844115643
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.2625611181730622
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.3527537836840162
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.17888270664238365
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.22288678972853282
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.26614948589295767
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.11693267119342445
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.15342045420318667
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.2910511308735813
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.3777897246686755
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.25714862989687987
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.33187792895542906
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.16493399805627715
}
}
},
"GPT_4o": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.5630800473549525
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.6216411634729735
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.616018277142757
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5823184402392676
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.44177544539510955
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.6345709158363462
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6795263157894738
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.5514924675940659
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.39435038953269674
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.22934807257231926
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.608083455060831
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.491325251564869
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.4999089647103332
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.5315979872161023
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.5641404607063637
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.5613635226492386
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.47760591698367955
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.5388690453811203
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.4803884979696412
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.5994159671881645
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.44606605087301393
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.6274371950293718
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.5448877153826162
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.4751133786848073
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.5343350103400748
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.5672657028463585
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.5315979872161023
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.4500928191484624
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.49089043782374137
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.7056027785545881
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.33202130899313653
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.5032849161169843
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.5510350848991218
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.6095778863474799
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.5283797185155754
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.6135855179956459
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.44047720383044436
}
}
},
"Phi-3.5-vision": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.2551037902226636
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.24734930136620975
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.2864612416413776
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.3049602749093698
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.21653804346780042
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.36823084724842464
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.46663157894736845
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.24145330077248778
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.2154692063816354
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.08944481289041872
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.18587661796707747
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.17497379027990792
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.26053460127801603
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.24669318645450836
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.2786226802221388
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.34091066308972107
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.15444746077692828
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.21711219915973207
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.2138304528863496
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.2572371188897671
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.21409351002477045
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.365192668303297
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.25960269434727634
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.12546296296296297
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.14174374624685185
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.2776898347355035
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.24669318645450836
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.20168001345379397
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.2850550871176333
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.22277777000798116
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.08928724806836039
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.219367263034246
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.31585879714366544
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.3945898792928062
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.21925278489551242
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.33264696401038385
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.17575913004138646
}
}
},
"InternVL2_76B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.38191947207402666
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.4103649605406274
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.4341802504488193
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.42654142415639185
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.2975890791763991
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5257357753421337
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.5779473684210527
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.33287081421166276
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.2949505390920417
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.17036496432397477
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.362195416198664
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.31396468806559114
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.3473756113126343
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.395893002855977
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.44982107744035305
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.42686510293379315
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.2868239162778749
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.3603288661353782
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.3465926907358438
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.3943337471922549
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.29244088978470345
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.45822072478616577
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.3879326330400817
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.20309901738473166
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.34490184941501867
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.41372274360003347
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.395893002855977
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.24403942809507134
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.3152784738582855
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.4290949563510903
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.2132321995754061
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.2953329718984368
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.4201902630957567
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.47409276729986083
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.30014798153766264
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.46253164682269177
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.2868813944130515
}
}
},
"Gemini_1.5_flash_002": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.46250942866818673
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.4317914359988347
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.49775198805427967
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5098686082319499
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.34393279682972117
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5594391803821158
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6380250626566416
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.44816564352475535
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.34510790215980036
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.18973764406890803
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.3836737169374586
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.3598139859097534
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.4013870708864889
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4903530871753026
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.5051202896842343
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.5166044655846657
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.3849084036535956
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.3869438864407766
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.3962715194192418
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.44793686445264996
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.3704146726364947
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5448638967636353
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.47829883834573317
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.33669690098261523
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.4300676062024303
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.4427944359714585
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4903530871753026
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.42346517633403413
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.41994719346489817
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.4627701625196691
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.2517485212411566
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.40372378342017806
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.4799408254775632
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.6010361821632402
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.4569546533897065
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.511590428993871
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.33710867194177685
}
}
},
"Pixtral_12B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.34602671066871027
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.3764652079852679
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.38183869685317606
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.3776679463596073
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.2828575553466608
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.4190587833823822
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.5687919799498747
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.32813540763467464
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.2677293131171651
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.10591240329992047
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.30581019415764066
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.28832738144368647
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.3223299098375932
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.409643099998057
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.37450808136321684
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.37068890840142343
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.24009431093278263
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.3071379066920702
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.31782992537086313
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.3639544140938305
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.32073418701669026
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.4166613092238043
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.3008126415966517
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.19743008314436883
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.16370884074367903
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.37086966536142313
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.409643099998057
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.2575699315401612
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.310449170121381
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.4285286292013588
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.13622980866275425
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.2572414987500377
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.388749951743596
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.5020540387409291
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.31301986568151985
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.38094471423409354
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.24222628640267738
}
}
},
"Aria": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.3264829094772722
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.35712138797286674
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.4004806395853317
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.3783082688258977
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.27628131703993153
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.4942870225393938
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.5811228070175439
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.3279996334048362
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.2481896092177717
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.11945216302285933
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.2830308005758272
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.27833423130489043
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.32371820359400666
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.42875359425696014
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.3612041984219992
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.37290568595471846
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.19554976321164697
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.3092653492193887
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.3043751656077328
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.2930015244066511
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.3092167834876797
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.4523860109667709
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.3277812604542708
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.21139455782312927
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.2711617723374526
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.3576735443060994
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.42875359425696014
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.19839956701033565
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.27267126872569447
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.38321397541649777
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.14301905320436192
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.2849545194421855
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.3779947327886569
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.39678729061309725
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.29682445889316517
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.4096377585306089
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.26194160419181234
}
}
},
"Claude_3.5": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.5405089647404562
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.6046357055234819
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.5712627152062051
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5450038475783499
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.4767692987630454
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5756126284078804
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6969774436090224
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.5278843049497918
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.4082144793870471
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.23803578664609892
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.5637906302497772
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.4795267886975966
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.525848282456283
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.508735695828719
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.5699094130430454
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.5096772701625744
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.4429640420975014
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.5066797418318023
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.4926030136534706
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.5278127103234661
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.4490020843308984
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5838224169821388
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.5456152399978661
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.46300075585789874
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.5292494759360522
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.5373019912310933
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.508735695828719
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.4422556748863689
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.49311554035078103
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.6593763006847053
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.3382015835012861
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.5194010220575684
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.532329797132399
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.5808831682303479
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.513474611293123
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.5507075880782885
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.47461998432626556
}
}
},
"Idefics3": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.14507788965553362
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.11641535161320743
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.17255583910766542
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.14745217246476708
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.1331851390883708
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.19221534222332276
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.28640852130325817
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.17906399043310475
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.10192930055370109
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.04211916597550756
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.10126271262360581
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.11407926733108291
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.16225217317782772
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.16181866973635636
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.1839408679813373
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.14933801491626408
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.0395540896656236
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.13979628998424784
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.1062779093260333
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.07053056796593082
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.09790172378722654
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.2987797010800956
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.11588163814170001
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.1008692365835223
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.09308121224497533
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.14757589734485796
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.16181866973635636
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.12217834249866026
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.12276246278377517
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.14743542163139847
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.05354869594691955
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.09065540194572455
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.1463280929280822
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.14564374862578883
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.22748773785486257
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.17647756032677067
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.13168972973651977
}
}
},
"Qwen2_VL_7B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.370836862933556
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.39973692484032347
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2511,
"tasks": [],
"average_score": 0.4012977216731433
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2469,
"tasks": [],
"average_score": 0.410990923097227
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.2818925976996871
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.493608784197707
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.5215889724310777
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.33309401517140946
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2439,
"tasks": [],
"average_score": 0.27564756843599875
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.1473690605854188
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.3814353882556586
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.2896392967775049
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.3223325179806271
},
"Videos": {
"count": 43,
"num_samples": 700,
"tasks": [],
"average_score": 0.4111189310485516
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.34825121621909577
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.4047366473438155
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.262166593895899
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.3403519326516044
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.3420538306638288
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.35162604166912687
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.32665673520415817
},
"open_ended_output": {
"count": 80,
"num_samples": 1456,
"tasks": [],
"average_score": 0.3909745200389741
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.39898011714302023
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.19415154950869234
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.37301502633138073
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.3761693199448087
},
"video": {
"count": 43,
"num_samples": 700,
"tasks": [],
"average_score": 0.4111189310485516
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.26429868057315387
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.33008667137716374
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.42660307298355216
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.2003871750665659
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.3270187644950453
},
"Perception": {
"count": 145,
"num_samples": 2315,
"tasks": [],
"average_score": 0.39864841947520724
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.4245693009859056
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.29880557491654197
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.42766370932167636
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.25562039051316643
}
}
},
"Qwen2_VL_72B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.49774395003470484
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.538829507114716
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.534480883952292
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5092565754998357
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.3776739609562984
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5676174603436022
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.60496992481203
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.4633019068994453
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.35105970797600183
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.2201150812944581
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.5356361790015363
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.4289777675393297
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.42094543671351287
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.49943888306036405
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.507967430369507
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.495761900914191
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.36212605501536715
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.4444770652190341
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.44584364394901616
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.5098505660529429
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.4027115384266939
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5157810622684265
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.5199940976484408
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.3100812547241119
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.5364299983756791
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.4908605783408196
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.49943888306036405
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.36691704884033916
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.45169664275718613
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.5748195752273694
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.31245958897213383
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.4372517645050852
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.5343715685033166
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.4968249101570037
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.4488852456563113
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.5162919233645259
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.31157492395100744
}
}
}
}