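What follows is the raw per-model keyword-statistics JSON: for each model, scores are broken down by "skills", "input_format", "output_format", "input_num", and "app", and each category carries a task "count", a "num_samples" total, a "tasks" list, and an "average_score". As a minimal sketch of how this structure can be consumed, assuming the object below is saved to a file such as model_keyword_stats.json (hypothetical name, not given in the source):

```python
import json

# Load the per-model breakdown shown below and print each model's average
# score per skill. Assumed layout (matching the JSON that follows):
# {model: {dimension: {category: {"count", "num_samples", "tasks", "average_score"}}}}
with open("model_keyword_stats.json", "r", encoding="utf-8") as f:
    stats = json.load(f)

for model, dimensions in stats.items():
    print(model)
    for category, entry in dimensions.get("skills", {}).items():
        print(f"  {category}: {entry['average_score']:.3f} "
              f"({entry['num_samples']} samples across {entry['count']} tasks)")
```

The same loop generalizes to the other breakdown dimensions by swapping "skills" for "input_format", "output_format", "input_num", or "app".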
{ | |
"NVLM": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.24033557047857043 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.32154059695494047 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.2937052996171993 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.22845955700594492 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.2639741933075709 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.40870864071047447 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.4555238095238095 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.25785191641267197 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.15679681195908274 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.0672259242345112 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.23922823287047076 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.21734036617042948 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.30313485498585124 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.0 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.34726189956094355 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.3264757655296162 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.056894830390305184 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.22868389095927066 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.2788963949121424 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.2787764976961992 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.23349712171444964 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.3215948035793096 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.18487055428231897 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.0 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.0 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.3680809151131777 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.0 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.03838410364145658 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.2325581694709435 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.22773778915303383 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.08048160660797504 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.2390024647851972 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.30211261814126533 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.18857142857142856 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.24908307640275493 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.3724877947012685 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.24529601154794037 | |
} | |
} | |
}, | |
"GPT_4o_mini": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.44928744961868194 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.48842488118273475 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.5152626716886682 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.4672966076116977 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.3406008235342885 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.5572281917334303 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.6902380952380953 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.4189154010048976 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.2943206715105082 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.19422793560945503 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.4700389569079038 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.3624496929166193 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.38946844562183286 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.45508480503584553 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.47569921440672464 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.46468618797917643 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.29410984789062117 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.41174000979649644 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.38893151244736324 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.44244772638735347 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.3629944944697668 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.5713834131825314 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.39874839531459466 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.3359977324263039 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.4260710116168476 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.46322170353087255 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.45508480503584553 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.24651576711552803 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.3697506340557095 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.5640948591986592 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.2420320329702607 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.3458483931206892 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.43544861040322835 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.5176671720617656 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.3554299482098288 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.5398829253460956 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.32918280841495845 | |
} | |
} | |
}, | |
"Llama_3_2_11B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.1907604552173455 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.14280015951776653 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.1960311445935766 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.22399113135844315 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.13303760019716085 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.323153603297999 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.4260501253132832 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.1770852858056774 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.15366454315378308 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.06563884729522687 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.11886347847341794 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.11489351406848371 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.1693681214060816 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.2123769209846321 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.2520175802062012 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.24806929522702081 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.06418655520777307 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.12349256529641485 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.16374180545556977 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.1576236804437753 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.15014439824913947 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.3003142292328822 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.19270157739425633 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.1463246409674981 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.0732004839476103 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.19579907898674231 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.2123769209846321 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.1351857051327849 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.18586695387250338 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.17288724679416761 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.08100042975820579 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.0575426944971537 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.19853488174071646 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.254316961351997 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.162801811963855 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.28055776664538923 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.13937853323074623 | |
} | |
} | |
}, | |
"Claude_3.5_new": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.5690042283891658 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.6220681231036606 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.6077980666415158 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.5511434932168607 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.4885536652013625 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.590818684469149 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.6569473684210526 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.5486763511384175 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.4315385951907387 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.2909419331017877 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.6048192628845258 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.48924295292319175 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.556418710368288 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.4946691340754988 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.5558756390298104 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.542519242638518 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.44210335381541843 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.5187252051932875 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.5071113150600759 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.5387340524651681 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.4824302644151348 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.6242798397166945 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.5782691045270721 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.4630277507828528 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.5914338446093256 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.5636254729390459 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.4946691340754988 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.4828123870640382 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.48756464396063437 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.6590137441693218 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.39901670035164916 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.5166853031535193 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.5561634744977417 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.6123769274172342 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.5512015158810595 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.5657956645626817 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.4763267502912362 | |
} | |
} | |
}, | |
"InternVL2_8B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.2817247716997634 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.2794121858805306 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2511, | |
"tasks": [], | |
"average_score": 0.31918687243853283 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2469, | |
"tasks": [], | |
"average_score": 0.325593535916075 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.24118253695139918 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.39684007367798446 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.4700852130325815 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.27052668526005397 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2439, | |
"tasks": [], | |
"average_score": 0.23189345356483618 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.08260405712900723 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.2277532691786533 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.2013779290163996 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.2804429603269583 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 700, | |
"tasks": [], | |
"average_score": 0.34791358240562653 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.2942163420306113 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.33787327172644077 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.10933317885944857 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.24944408255581693 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.25203287826995174 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.27414636444623874 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.22381302045502052 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1456, | |
"tasks": [], | |
"average_score": 0.3537549824897016 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.30261189962428353 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.15434618291761149 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.19814032315010577 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.30046383040641306 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 700, | |
"tasks": [], | |
"average_score": 0.34791358240562653 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.17725087609332119 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.2532272454839157 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.29096771640715396 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.12166926715781588 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.24700310231619527 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2315, | |
"tasks": [], | |
"average_score": 0.3205471121079154 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.3995660275981844 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.24614711281861912 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.3393895915929317 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.22078333222564453 | |
} | |
} | |
}, | |
"llava_onevision_7B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.2524786809911341 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.1902376706945491 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.255069390206439 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.29981286990552625 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.18973491465938852 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.36842322314565323 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.44998746867167916 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.2445135206648208 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.21802943568344288 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.06658775725427067 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.1466163383815089 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.13297395577964055 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.24236719143449742 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.30985943541023103 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.3199731020402028 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.3258716730180874 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.13043163858789789 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.20209776978059824 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.18285692568564196 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.25384794412815426 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.2200472229099345 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.3127341248874411 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.2802999516721972 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.1476473922902494 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.13787962981142515 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.25459683619676365 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.30985943541023103 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.1778991941079372 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.2410111891690358 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.19274192395698486 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.09846926279075068 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.15189414475467605 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.2845922887108415 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.3600079950628582 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.23654776813656775 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.3271805711561501 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.22080546908673507 | |
} | |
} | |
}, | |
"llava_onevision_72B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.3615741356043519 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.282401662313336 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.36653344218973427 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.42146038539739283 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.2951434804409883 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.478119286755779 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.6005438596491229 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.31663222188988865 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.29633645022129285 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.13872280436872364 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.23294708136735856 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.2126914943750874 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.34566020099204997 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.4446001874842145 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.4401364830377099 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.42429297143518147 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.23897262553543516 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.28614732096244 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.25872873777911126 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.370724080249463 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.3065719940769206 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.4293132525502993 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.3986052416087927 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.20730347694633405 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.27911174307216713 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.3481968601113118 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.4446001874842145 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.25013213032747944 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.34156793747875674 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.30653989171354723 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.18168666652660437 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.23240790940031927 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.38316803441883945 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.4807891958712894 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.31702495228966576 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.4358874880224115 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.31588468105075895 | |
} | |
} | |
}, | |
"Gemini_1.5_pro_002": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.5201947642961418 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.49947304390648534 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.5512750115216515 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.5467324805307577 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.425969084163906 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.5750369536204262 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.6982330827067671 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.513647745999633 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.3845337030093212 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.23899503258223884 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.4592162957187749 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.4292353723689881 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.4869625906903554 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.5028718355967439 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.5584779204331461 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.5500305447809621 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.4292127751495457 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.44896309957892694 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.44137714463131966 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.5146447350354234 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.4688623462674191 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.5580414823700747 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.5538255562099124 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.39066515495086923 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.5295721925617263 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.5034399620483027 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.5028718355967439 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.4885398161821004 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.4553778359922855 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.5378983862471568 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.3335324339429373 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.43465181771633377 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.5250631828331306 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.5821004797173627 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.5124355410095621 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.5721991184410764 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.41210885517904977 | |
} | |
} | |
}, | |
"MiniCPM_v2.6": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.2604969133146555 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.24828453993935928 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.2987613496312298 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.31808788094038193 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.18281637763548025 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.4073231792632807 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.48798245614035085 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.23723675736151562 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.1968926733821904 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.08735883237069725 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.21153173491931837 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.18639148159043903 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.21578309681746147 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.3527537836840162 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.3096882575625531 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.31628986040092516 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.0755920550038197 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.23302306387939006 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.17775369699584467 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.2551275278138797 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.20833171754655547 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.36473950920880716 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.293386806641223 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.13955971277399848 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.23499726844115643 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.2625611181730622 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.3527537836840162 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.17888270664238365 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.22288678972853282 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.26614948589295767 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.11693267119342445 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.15342045420318667 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.2910511308735813 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.3777897246686755 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.25714862989687987 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.33187792895542906 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.16493399805627715 | |
} | |
} | |
}, | |
"GPT_4o": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.5630800473549525 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.6216411634729735 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.616018277142757 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.5823184402392676 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.44177544539510955 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.6345709158363462 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.6795263157894738 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.5514924675940659 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.39435038953269674 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.22934807257231926 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.608083455060831 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.491325251564869 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.4999089647103332 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.5315979872161023 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.5641404607063637 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.5613635226492386 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.47760591698367955 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.5388690453811203 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.4803884979696412 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.5994159671881645 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.44606605087301393 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.6274371950293718 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.5448877153826162 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.4751133786848073 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.5343350103400748 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.5672657028463585 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.5315979872161023 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.4500928191484624 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.49089043782374137 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.7056027785545881 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.33202130899313653 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.5032849161169843 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.5510350848991218 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.6095778863474799 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.5283797185155754 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.6135855179956459 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.44047720383044436 | |
} | |
} | |
}, | |
"Phi-3.5-vision": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.2551037902226636 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.24734930136620975 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.2864612416413776 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.3049602749093698 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.21653804346780042 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.36823084724842464 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.46663157894736845 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.24145330077248778 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.2154692063816354 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.08944481289041872 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.18587661796707747 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.17497379027990792 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.26053460127801603 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.24669318645450836 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.2786226802221388 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.34091066308972107 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.15444746077692828 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.21711219915973207 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.2138304528863496 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.2572371188897671 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.21409351002477045 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.365192668303297 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.25960269434727634 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.12546296296296297 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.14174374624685185 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.2776898347355035 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.24669318645450836 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.20168001345379397 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.2850550871176333 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.22277777000798116 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.08928724806836039 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.219367263034246 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.31585879714366544 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.3945898792928062 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.21925278489551242 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.33264696401038385 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.17575913004138646 | |
} | |
} | |
}, | |
"InternVL2_76B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.38191947207402666 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.4103649605406274 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.4341802504488193 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.42654142415639185 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.2975890791763991 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.5257357753421337 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.5779473684210527 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.33287081421166276 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.2949505390920417 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.17036496432397477 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.362195416198664 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.31396468806559114 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.3473756113126343 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.395893002855977 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.44982107744035305 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.42686510293379315 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.2868239162778749 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.3603288661353782 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.3465926907358438 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.3943337471922549 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.29244088978470345 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.45822072478616577 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.3879326330400817 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.20309901738473166 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.34490184941501867 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.41372274360003347 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.395893002855977 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.24403942809507134 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.3152784738582855 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.4290949563510903 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.2132321995754061 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.2953329718984368 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.4201902630957567 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.47409276729986083 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.30014798153766264 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.46253164682269177 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.2868813944130515 | |
} | |
} | |
}, | |
"Gemini_1.5_flash_002": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.46250942866818673 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.4317914359988347 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.49775198805427967 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.5098686082319499 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.34393279682972117 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.5594391803821158 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.6380250626566416 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.44816564352475535 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.34510790215980036 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.18973764406890803 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.3836737169374586 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.3598139859097534 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.4013870708864889 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.4903530871753026 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.5051202896842343 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.5166044655846657 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.3849084036535956 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.3869438864407766 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.3962715194192418 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.44793686445264996 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.3704146726364947 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.5448638967636353 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.47829883834573317 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.33669690098261523 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.4300676062024303 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.4427944359714585 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.4903530871753026 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.42346517633403413 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.41994719346489817 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.4627701625196691 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.2517485212411566 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.40372378342017806 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.4799408254775632 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.6010361821632402 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.4569546533897065 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.511590428993871 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.33710867194177685 | |
} | |
} | |
}, | |
"Pixtral_12B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.34602671066871027 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.3764652079852679 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.38183869685317606 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.3776679463596073 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.2828575553466608 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.4190587833823822 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.5687919799498747 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.32813540763467464 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.2677293131171651 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.10591240329992047 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.30581019415764066 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.28832738144368647 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.3223299098375932 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.409643099998057 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.37450808136321684 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.37068890840142343 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.24009431093278263 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.3071379066920702 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.31782992537086313 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.3639544140938305 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.32073418701669026 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.4166613092238043 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.3008126415966517 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.19743008314436883 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.16370884074367903 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.37086966536142313 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.409643099998057 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.2575699315401612 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.310449170121381 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.4285286292013588 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.13622980866275425 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.2572414987500377 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.388749951743596 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.5020540387409291 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.31301986568151985 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.38094471423409354 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.24222628640267738 | |
} | |
} | |
}, | |
"Aria": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.3264829094772722 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.35712138797286674 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.4004806395853317 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.3783082688258977 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.27628131703993153 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.4942870225393938 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.5811228070175439 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.3279996334048362 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.2481896092177717 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.11945216302285933 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.2830308005758272 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.27833423130489043 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.32371820359400666 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.42875359425696014 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.3612041984219992 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.37290568595471846 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.19554976321164697 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.3092653492193887 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.3043751656077328 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.2930015244066511 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.3092167834876797 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.4523860109667709 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.3277812604542708 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.21139455782312927 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.2711617723374526 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.3576735443060994 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.42875359425696014 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.19839956701033565 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.27267126872569447 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.38321397541649777 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.14301905320436192 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.2849545194421855 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.3779947327886569 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.39678729061309725 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.29682445889316517 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.4096377585306089 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.26194160419181234 | |
} | |
} | |
}, | |
"Claude_3.5": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.5405089647404562 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.6046357055234819 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.5712627152062051 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.5450038475783499 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.4767692987630454 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.5756126284078804 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.6969774436090224 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.5278843049497918 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.4082144793870471 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.23803578664609892 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.5637906302497772 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.4795267886975966 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.525848282456283 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.508735695828719 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.5699094130430454 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.5096772701625744 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.4429640420975014 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.5066797418318023 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.4926030136534706 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.5278127103234661 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.4490020843308984 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.5838224169821388 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.5456152399978661 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.46300075585789874 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.5292494759360522 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.5373019912310933 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.508735695828719 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.4422556748863689 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.49311554035078103 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.6593763006847053 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.3382015835012861 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.5194010220575684 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.532329797132399 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.5808831682303479 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.513474611293123 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.5507075880782885 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.47461998432626556 | |
} | |
} | |
}, | |
"Idefics3": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.14507788965553362 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.11641535161320743 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.17255583910766542 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.14745217246476708 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.1331851390883708 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.19221534222332276 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.28640852130325817 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.17906399043310475 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.10192930055370109 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.04211916597550756 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.10126271262360581 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.11407926733108291 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.16225217317782772 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.16181866973635636 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.1839408679813373 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.14933801491626408 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.0395540896656236 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.13979628998424784 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.1062779093260333 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.07053056796593082 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.09790172378722654 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.2987797010800956 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.11588163814170001 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.1008692365835223 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.09308121224497533 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.14757589734485796 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.16181866973635636 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.12217834249866026 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.12276246278377517 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.14743542163139847 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.05354869594691955 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.09065540194572455 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.1463280929280822 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.14564374862578883 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.22748773785486257 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.17647756032677067 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.13168972973651977 | |
} | |
} | |
}, | |
"Qwen2_VL_7B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.370836862933556 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.39973692484032347 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2511, | |
"tasks": [], | |
"average_score": 0.4012977216731433 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2469, | |
"tasks": [], | |
"average_score": 0.410990923097227 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.2818925976996871 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.493608784197707 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.5215889724310777 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.33309401517140946 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2439, | |
"tasks": [], | |
"average_score": 0.27564756843599875 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.1473690605854188 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.3814353882556586 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.2896392967775049 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.3223325179806271 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 700, | |
"tasks": [], | |
"average_score": 0.4111189310485516 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.34825121621909577 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.4047366473438155 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.262166593895899 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.3403519326516044 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.3420538306638288 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.35162604166912687 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.32665673520415817 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1456, | |
"tasks": [], | |
"average_score": 0.3909745200389741 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.39898011714302023 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.19415154950869234 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.37301502633138073 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.3761693199448087 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 700, | |
"tasks": [], | |
"average_score": 0.4111189310485516 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.26429868057315387 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.33008667137716374 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.42660307298355216 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.2003871750665659 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.3270187644950453 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2315, | |
"tasks": [], | |
"average_score": 0.39864841947520724 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.4245693009859056 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.29880557491654197 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.42766370932167636 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.25562039051316643 | |
} | |
} | |
}, | |
"Qwen2_VL_72B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.49774395003470484 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.538829507114716 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.534480883952292 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.5092565754998357 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.3776739609562984 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.5676174603436022 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.60496992481203 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.4633019068994453 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.35105970797600183 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.2201150812944581 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.5356361790015363 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.4289777675393297 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.42094543671351287 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.49943888306036405 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.507967430369507 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.495761900914191 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.36212605501536715 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.4444770652190341 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.44584364394901616 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.5098505660529429 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.4027115384266939 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.5157810622684265 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.5199940976484408 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.3100812547241119 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.5364299983756791 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.4908605783408196 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.49943888306036405 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.36691704884033916 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.45169664275718613 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.5748195752273694 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.31245958897213383 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.4372517645050852 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.5343715685033166 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.4968249101570037 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.4488852456563113 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.5162919233645259 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.31157492395100744 | |
} | |
} | |
} | |
} |
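
The nested structure above follows one pattern throughout: model name → breakdown dimension ("skills", "input_format", "output_format", "input_num", "app") → category → {count, num_samples, tasks, average_score}. As a minimal sketch of how such a file can be consumed, the snippet below loads the JSON and prints each model's unweighted (macro) average over its skill categories. The filename model_scores.json is hypothetical, standing in for wherever this data is saved locally; it is not part of the original file.

```python
import json
from statistics import mean

# Hypothetical local copy of the JSON shown above.
with open("model_scores.json", "r", encoding="utf-8") as f:
    results = json.load(f)

# Macro average over the "skills" dimension: each skill category contributes
# equally, regardless of its num_samples. A sample-weighted variant could use
# the "num_samples" field of each entry instead.
for model, dims in results.items():
    skills = dims.get("skills", {})
    if not skills:
        continue
    macro = mean(entry["average_score"] for entry in skills.values())
    print(f"{model}: macro skill average = {macro:.4f}")
```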