{ "GPT_4o": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.5630758211022604 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.6216411634729735 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.616018277142757 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.5823101249498799 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.44177544539510955 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.6345458069232931 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6795263157894738 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.5514924675940659 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.39435038953269674 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.22934807257231926 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.608083455060831 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.491325251564869 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.4999089647103332 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.5315979872161023 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5641404607063637 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.5613545677222056 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.47760591698367955 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.5388690453811203 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.48037685656449847 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.5994159671881645 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.44606605087301393 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.6274371950293718 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.5448877153826162 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.4751133786848073 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.5343350103400748 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.5672657028463585 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.5315979872161023 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.4500928191484624 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.4908653289106883 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.7056027785545881 }, "Planning": { "count": 78, "num_samples": 
1239, "tasks": [], "average_score": 0.33202130899313653 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.5032849161169843 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.5510350848991218 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.6095778863474799 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.5283797185155754 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.6135723164021851 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.44047720383044436 } } }, "Gemini_1.5_pro_002": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.5202055934299538 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.5017043129027509 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.5532599716027446 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.546753787203128 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.425969084163906 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5751012914154264 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6982330827067671 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.513647745999633 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.3845337030093212 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.23899503258223884 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.4625032188638111 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.4292353723689881 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.4869625906903554 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.5028718355967439 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5584779204331461 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.55005349042813 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.4292127751495457 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.44896309957892694 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.44418591808616864 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.5146447350354234 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.4688623462674191 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.5580414823700747 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.5538255562099124 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.39066515495086923 }, 
"9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.5370278962809547 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.5034399620483027 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.5028718355967439 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.4885398161821004 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.45544217378728585 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.5421439953094952 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.3335324339429373 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.43465181771633377 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.5250631828331306 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.5821004797173627 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.5124355410095621 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.5722329455291694 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.41210885517904977 } } }, "Gemini_1.5_flash_002": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.46250942866818673 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.4337278553354258 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.49947464681475356 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.5098686082319499 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.34393279682972117 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5594391803821158 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6380250626566416 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.44816564352475535 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.34510790215980036 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.18973764406890803 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.3865262916591035 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.3598139859097534 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.4013870708864889 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4903530871753026 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5051202896842343 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.5166044655846657 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.3849084036535956 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], 
"average_score": 0.3869438864407766 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.39868324168390534 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.44793686445264996 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.3704146726364947 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.5448638967636353 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.47829883834573317 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.33669690098261523 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.43653808057103954 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.4427944359714585 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4903530871753026 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.42346517633403413 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.41994719346489817 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.46645473820179373 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.2517485212411566 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.40372378342017806 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.4799408254775632 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.6010361821632402 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.4569546533897065 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.511590428993871 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.33710867194177685 } } }, "Claude_3.5": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.5405089647404562 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.6082834220752651 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.5745077617490254 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.5450038475783499 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.4767692987630454 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5756126284078804 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6969774436090224 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.5278843049497918 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.4082144793870471 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.23803578664609892 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.5691641481808987 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.4795267886975966 
}, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.525848282456283 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.508735695828719 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5699094130430454 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.5096772701625744 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.4429640420975014 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.5066797418318023 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.4971460788134188 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.5278127103234661 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.4490020843308984 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.5838224169821388 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.5456152399978661 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.46300075585789874 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.5414381873407914 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.5373019912310933 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.508735695828719 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.4422556748863689 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.49311554035078103 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.6663170946790707 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.3382015835012861 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.5194010220575684 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.532329797132399 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.5808831682303479 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.513474611293123 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.5507075880782885 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.47461998432626556 } } }, "Claude_3.5_new": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.5690045172520449 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.6220681231036606 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.6077980666415158 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.5511440615639541 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.4885536652013625 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5908204006544897 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], 
"average_score": 0.6569473684210526 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.5486763511384175 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.4315385951907387 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.2909419331017877 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.6048192628845258 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.48924295292319175 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.556418710368288 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4946691340754988 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.5558756390298104 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.5425198547046186 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.44210335381541843 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.5187252051932875 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.5071121107460066 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.5387340524651681 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.4824302644151348 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.6242798397166945 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.5782691045270721 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.4630277507828528 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.5914338446093256 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.5636254729390459 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4946691340754988 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.4828123870640382 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.48756636014597515 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.6590137441693218 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.39901670035164916 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.5166853031535193 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.5561634744977417 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.6123769274172342 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.5512015158810595 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.565796566886933 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.4763267502912362 } } }, "GPT_4o_mini": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.4492982787524939 }, "Text Recognition (OCR)": { "count": 
137, "num_samples": 2239, "tasks": [], "average_score": 0.49026056071002017 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.5168957112681365 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.46731791428406805 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.3406008235342885 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5572925295284307 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6902380952380953 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.4189154010048976 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.2943206715105082 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.19422793560945503 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.47202628409684394 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.3624496929166193 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.38946844562183286 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.45508480503584553 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.47569921440672464 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.465175334092545 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.29410984789062117 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.41242028190533997 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.3906415365938764 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.44244772638735347 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.3629944944697668 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.5713834131825314 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.39874839531459466 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.3359977324263039 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.4305788513381019 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.46343334374251277 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.45508480503584553 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.24651576711552803 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.36981497185070983 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.5666618234843734 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.2420320329702607 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.3458483931206892 }, "Perception": { "count": 145, 
"num_samples": 2313, "tasks": [], "average_score": 0.43590838051817093 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.5176671720617656 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.3554299482098288 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.5399167524341886 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.32918280841495845 } } }, "Qwen2_VL_72B": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.49787264809826687 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.5439010430283516 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.5392244859385411 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.509277882172206 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.3776739609562984 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5676817981386025 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.60496992481203 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.4633019068994453 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.35105970797600183 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.2201150812944581 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.5402397677488632 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.4289777675393297 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.42094543671351287 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.49943888306036405 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.507967430369507 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.49789939867591104 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.36212605501536715 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.44719815365440824 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.4500902736468407 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.5098505660529429 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.4027115384266939 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.5157810622684265 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.5199940976484408 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.3100812547241119 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.5468722850464449 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 
0.4918205178721877 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.49943888306036405 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.36691704884033916 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.45176098055218655 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.5807658773593334 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.31245958897213383 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.4372517645050852 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.5362106489630868 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.4968249101570037 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.4488852456563113 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.5166939389651373 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.31157492395100744 } } }, "Qwen2_VL_7B": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.3708368629321668 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.40213773918065815 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2511, "tasks": [], "average_score": 0.4034335110538307 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2469, "tasks": [], "average_score": 0.4109909230944937 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.2818925976996871 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.49360878418945336 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.5215889724310777 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.33309401517140946 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2439, "tasks": [], "average_score": 0.27564756843599875 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.1473690605854188 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.3821046882337143 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.2896392967775049 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.3223325179806271 }, "Videos": { "count": 43, "num_samples": 700, "tasks": [], "average_score": 0.4111189310485516 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.34825121621909577 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.40660144920567376 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.262166593895899 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.3430730210869785 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.3426196933687219 }, "exact_text": { "count": 83, "num_samples": 1278, 
"tasks": [], "average_score": 0.35162604166912687 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.32665673520415817 }, "open_ended_output": { "count": 80, "num_samples": 1456, "tasks": [], "average_score": 0.3909745200389741 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.39898011714302023 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.19415154950869234 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.37453319457428763 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.37701588079136955 }, "video": { "count": 43, "num_samples": 700, "tasks": [], "average_score": 0.4111189310485516 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.26429868057315387 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.33008667136891007 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.42746758545520747 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.2003871750665659 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.3270187644950453 }, "Perception": { "count": 145, "num_samples": 2315, "tasks": [], "average_score": 0.40048749993497734 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.4245693009859056 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.29880557491654197 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.4276637093173368 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.25562039051316643 } } }, "llava_onevision_72B": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.3615741356043519 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.2834675874668524 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.3674817002808495 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.42146038539739283 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.2951434804409883 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.478119286755779 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.6005438596491229 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.31663222188988865 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.29633645022129285 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.13872280436872364 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.23380046931752074 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.2126914943750874 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.34566020099204997 }, "Videos": { "count": 43, "num_samples": 698, "tasks": 
[], "average_score": 0.4446001874842145 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.4401364830377099 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.4247591719013819 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.23897262553543516 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.2868275930712835 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.259450238500612 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.370724080249463 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.3065719940769206 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.4293132525502993 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.3986052416087927 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.20730347694633405 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.28104747671521785 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.34840850032295206 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.4446001874842145 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.25013213032747944 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.34156793747875674 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.3076421844825067 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.18168666652660437 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.23240790940031927 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.38362780453378204 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.4807891958712894 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.31702495228966576 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.4358874880224115 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.31588468105075895 } } }, "llava_onevision_7B": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.2524786809911341 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.19077168655703208 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.2555444562659206 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.29981286990552625 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.18973491465938852 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.36842322314565323 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.44998746867167916 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.2445135206648208 }, 
"Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.21802943568344288 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.06658775725427067 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.1466861610319767 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.13297395577964055 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.24236719143449742 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.30985943541023103 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.3199731020402028 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.3263378734842879 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.13043163858789789 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.20277804188944173 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.18291595756285564 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.25384794412815426 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.2200472229099345 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.3127341248874411 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.2802999516721972 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.1476473922902494 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.13803800801858385 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.2548084764084038 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.30985943541023103 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.1778991941079372 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.2410111891690358 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.19283211154717242 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.09846926279075068 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.15189414475467605 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.28505205882578405 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.3600079950628582 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.23654776813656775 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.3271805711561501 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.22080546908673507 } } }, "InternVL2_76B": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.38193012983650343 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.41315219763443384 }, "Language Understanding and Generation": { "count": 154, 
"num_samples": 2509, "tasks": [], "average_score": 0.43665980552577693 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.4265623936500962 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.2975890791763991 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.5257990949897898 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.5779473684210527 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.33287081421166276 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.2949505390920417 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.17036496432397477 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.3634339625985008 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.31396468806559114 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.3473756113126343 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.395893002855977 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.44982107744035305 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.42875248733027654 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.2868239162778749 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.3630499545707523 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.3476691827105281 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.3943337471922549 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.29244088978470345 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.45822072478616577 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.3879326330400817 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.20309901738473166 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.34771123515123364 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.4145693044465943 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.395893002855977 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.24403942809507134 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.3153417935059416 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.4306947454508794 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.2132321995754061 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.2953329718984368 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.42202934355552685 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], 
"average_score": 0.47409276729986083 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.30014798153766264 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.4625649385962016 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.2868813944130515 } } }, "InternVL2_8B": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.2817247716997634 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.280559214034858 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2511, "tasks": [], "average_score": 0.32020728060179815 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2469, "tasks": [], "average_score": 0.325593535916075 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.24118253695139918 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.39684007367798446 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.4700852130325815 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.27052668526005397 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2439, "tasks": [], "average_score": 0.23189345356483618 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.08260405712900723 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.22800928556370195 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.2013779290163996 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.2804429603269583 }, "Videos": { "count": 43, "num_samples": 700, "tasks": [], "average_score": 0.34791358240562653 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.2942163420306113 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.3388056726588417 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.10933317885944857 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.250804626773504 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.2522493284864019 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.27414636444623874 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.22381302045502052 }, "open_ended_output": { "count": 80, "num_samples": 1456, "tasks": [], "average_score": 0.3537549824897016 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.30261189962428353 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.15434618291761149 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.19872104324302098 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.30088711082969344 }, "video": { "count": 43, "num_samples": 700, "tasks": [], "average_score": 0.34791358240562653 }, "4-5 
images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.17725087609332119 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.2532272454839157 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.29129840423784176 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.12166926715781588 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.24700310231619527 }, "Perception": { "count": 145, "num_samples": 2315, "tasks": [], "average_score": 0.3214666523378005 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.3995660275981844 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.24614711281861912 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.3393895915929317 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.22078333222564453 } } }, "MiniCPM_v2.6": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.2604967101191775 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.2500331562865158 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.3003169369011028 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.31808748114668184 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.18281637763548025 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.40732197204308807 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.48798245614035085 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.23723675736151562 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.1968926733821904 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.08735883237069725 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.21195711598986072 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.18639148159043903 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.21578309681746147 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.3527537836840162 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.3096882575625531 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.3176880312524649 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.0755920550038197 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.23506388020592064 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.1781127776443048 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.2551275278138797 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], 
"average_score": 0.20833171754655547 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.36473950920880716 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.293386806641223 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.13955971277399848 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.23596215721092323 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.26319603880798287 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.3527537836840162 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.17888270664238365 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.22288558250834017 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.2666989364424082 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.11693267119342445 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.15342045420318667 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.29243044121840894 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.3777897246686755 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.25714862989687987 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.33187729423141027 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.16493399805627715 } } }, "Phi-3.5-vision": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.2551037902226636 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.2483252111012436 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.28732942108098564 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.3049602749093698 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.21653804346780042 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.36823084724842464 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.46663157894736845 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.24145330077248778 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.2154692063816354 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.08944481289041872 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.1865974025588298 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.17497379027990792 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.26053460127801603 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.24669318645450836 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": 
[], "average_score": 0.2786226802221388 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.3413768635559215 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.15444746077692828 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.2177924712685756 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.21443984349574025 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.2572371188897671 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.21409351002477045 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.365192668303297 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.25960269434727634 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.12546296296296297 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.14337869666229008 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.27790147494714373 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.24669318645450836 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.20168001345379397 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.2850550871176333 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.2237087834389946 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.08928724806836039 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.219367263034246 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.316318567258608 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.3945898792928062 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.21925278489551242 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.33264696401038385 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.17575913004138646 } } }, "Pixtral_12B": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.3460288961410444 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.3777640755922415 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.38299418297106824 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.3776722463473817 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.2828575553466608 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.419071767659191 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.5687919799498747 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.32813540763467464 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.2677293131171651 }, "Planning 
and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.10591240329992047 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.3070067338940785 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.28832738144368647 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.3223299098375932 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.409643099998057 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.37450808136321684 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.37115973962368864 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.24009431093278263 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.3078181788009137 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.3188475653127356 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.3639544140938305 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.32073418701669026 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.4166613092238043 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.3008126415966517 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.19743008314436883 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.16642294307267227 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.37108130557306335 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.409643099998057 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.2575699315401612 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.3104621543981899 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.4300741596942578 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.13622980866275425 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.2572414987500377 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.3892097218585385 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.5020540387409291 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.31301986568151985 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.3809515410188075 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.24222628640267738 } } }, "Llama_3_2_11B": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.1907604552173455 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.14328677752263275 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.19646404502647707 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": 
[], "average_score": 0.22399113135844315 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.13303760019716085 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.323153603297999 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.4260501253132832 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.1770852858056774 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.15366454315378308 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.06563884729522687 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.11886347847341794 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.11489351406848371 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.1693681214060816 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.2123769209846321 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.2520175802062012 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.2485354956932213 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.06418655520777307 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.12417283740525839 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.16374180545556977 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.1576236804437753 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.15014439824913947 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.3003142292328822 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.19270157739425633 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.1463246409674981 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.0732004839476103 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.1960107191983825 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.2123769209846321 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.1351857051327849 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.18586695387250338 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.17288724679416761 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.08100042975820579 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.0575426944971537 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.19899465185565898 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.254316961351997 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.162801811963855 }, "Knowledge": { 
"count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.28055776664538923 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.13937853323074623 } } }, "Idefics3": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.14507788965553362 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.11641535161320743 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.17255583910766542 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.14745217246476708 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.1331851390883708 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.19221534222332276 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.28640852130325817 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.17906399043310475 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.10192930055370109 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.04211916597550756 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.10126271262360581 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.11407926733108291 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.16225217317782772 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.16181866973635636 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.1839408679813373 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.14933801491626408 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.0395540896656236 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.13979628998424784 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.1062779093260333 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.07053056796593082 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.09790172378722654 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.2987797010800956 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.11588163814170001 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.1008692365835223 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.09308121224497533 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.14757589734485796 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.16181866973635636 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.12217834249866026 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": 
[], "average_score": 0.12276246278377517 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.14743542163139847 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.05354869594691955 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.09065540194572455 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.1463280929280822 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.14564374862578883 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.22748773785486257 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.17647756032677067 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.13168972973651977 } } }, "Aria": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.3264829094772722 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.35712138797286674 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.4004806395853317 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.3783082688258977 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.27628131703993153 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.4942870225393938 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.5811228070175439 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.3279996334048362 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.2481896092177717 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.11945216302285933 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.2830308005758272 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.27833423130489043 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.32371820359400666 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.42875359425696014 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.3612041984219992 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.37290568595471846 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.19554976321164697 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.3092653492193887 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.3043751656077328 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.2930015244066511 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.3092167834876797 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.4523860109667709 }, "multiple_choice": 
{ "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.3277812604542708 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.21139455782312927 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.2711617723374526 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.3576735443060994 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.42875359425696014 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.19839956701033565 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.27267126872569447 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.38321397541649777 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.14301905320436192 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.2849545194421855 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.3779947327886569 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.39678729061309725 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.29682445889316517 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.4096377585306089 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.26194160419181234 } } }, "NVLM": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.24033557047857043 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.32154059695494047 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.2937052996171993 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.22845955700594492 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.2639741933075709 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.40870864071047447 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.4555238095238095 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.25785191641267197 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.15679681195908274 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.0672259242345112 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.23922823287047076 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.21734036617042948 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.30313485498585124 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.0 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.34726189956094355 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.3264757655296162 }, "3D Models and Aerial Imagery": { "count": 11, 
"num_samples": 169, "tasks": [], "average_score": 0.056894830390305184 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.22868389095927066 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.2788963949121424 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.2787764976961992 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.23349712171444964 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.3215948035793096 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.18487055428231897 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.0 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.0 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.3680809151131777 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.0 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.03838410364145658 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.2325581694709435 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.22773778915303383 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.08048160660797504 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.2390024647851972 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.30211261814126533 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.18857142857142856 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.24908307640275493 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.3724877947012685 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.24529601154794037 } } }, "InternVL2_2B": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.14491178903291552 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.12126906675624163 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.16912754929321935 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.18542274192083463 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.13923308734553164 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.23992252224543772 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.3420927318295739 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.14807577209152425 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.13036555933925006 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.01727799227799228 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 
0.057021136657850864 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.10504085961245285 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.1625198552182714 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.18999779001767986 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.1487677475708977 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.2011727338536935 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.11886936592818943 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.1131404778887607 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.05739750616837997 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.15465451663650032 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.16044698450090833 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.21429521387724249 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.2128614316540013 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.03658352229780801 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.05757839721254354 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.15225683687839608 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.18999779001767986 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.17677460549936644 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.158165588340436 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.08722661966805 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.04102853815875594 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.11264043251709285 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.17001758160301803 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.3332891958712894 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.1686125516807394 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.21169137106199268 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.10975764217070672 } } }, "Qwen2_VL_2B": { "skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.22236161923122505 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.23701014663017753 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.25669221785292334 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.26526414975225454 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.17623548305581763 }, "Commonsense and Social Reasoning": { "count": 
51, "num_samples": 855, "tasks": [], "average_score": 0.31250702198481506 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.4140676691729323 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.20802820480076603 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.17320633068307653 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.06209506566980099 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.190837839372028 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.16287824421269087 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.19640906475019812 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.2520741776922928 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.24883076673424442 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.2877316297453947 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.13398525561847363 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.1624451002757208 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.20960092816529263 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.19986806708136184 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.2201024015934558 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.30248748033122763 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.256631742010999 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.07681405895691609 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.10526691703628158 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.25018977062352593 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.2520741776922928 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.17435940889565366 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.21286783416184518 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.2521972668785968 }, "Planning": { "count": 78, "num_samples": 1239, "tasks": [], "average_score": 0.06967138760493456 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.16996250112948405 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.27603334911345223 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.31002436092347696 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.21061929716065056 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.2656728023444808 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.16356158787929762 } } }, "Aquila_VL_2B": { 
"skills": { "Object Recognition and Classification": { "count": 303, "num_samples": 4755, "tasks": [], "average_score": 0.18420666660337692 }, "Text Recognition (OCR)": { "count": 137, "num_samples": 2239, "tasks": [], "average_score": 0.12395530240359122 }, "Language Understanding and Generation": { "count": 154, "num_samples": 2509, "tasks": [], "average_score": 0.17924536722051596 }, "Scene and Event Understanding": { "count": 154, "num_samples": 2467, "tasks": [], "average_score": 0.220108610660707 }, "Mathematical and Logical Reasoning": { "count": 109, "num_samples": 1910, "tasks": [], "average_score": 0.1680749869910155 }, "Commonsense and Social Reasoning": { "count": 51, "num_samples": 855, "tasks": [], "average_score": 0.26630477322766793 }, "Ethical and Safety Reasoning": { "count": 15, "num_samples": 245, "tasks": [], "average_score": 0.35152130325814535 }, "Domain-Specific Knowledge and Skills": { "count": 77, "num_samples": 1386, "tasks": [], "average_score": 0.1857154485444521 }, "Spatial and Temporal Reasoning": { "count": 152, "num_samples": 2437, "tasks": [], "average_score": 0.1616397700608881 }, "Planning and Decision Making": { "count": 37, "num_samples": 577, "tasks": [], "average_score": 0.044513236949565 } }, "input_format": { "User Interface Screenshots": { "count": 93, "num_samples": 1517, "tasks": [], "average_score": 0.07480350331940272 }, "Text-Based Images and Documents": { "count": 82, "num_samples": 1294, "tasks": [], "average_score": 0.11444110320621242 }, "Diagrams and Data Visualizations": { "count": 101, "num_samples": 1718, "tasks": [], "average_score": 0.19412275574929044 }, "Videos": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.21367350061199514 }, "Artistic and Creative Content": { "count": 32, "num_samples": 541, "tasks": [], "average_score": 0.19717811128156643 }, "Photographs": { "count": 143, "num_samples": 2248, "tasks": [], "average_score": 0.24620947964695974 }, "3D Models and Aerial Imagery": { "count": 11, "num_samples": 169, "tasks": [], "average_score": 0.10131259529340846 } }, "output_format": { "contextual_formatted_text": { "count": 98, "num_samples": 1514, "tasks": [], "average_score": 0.11925340914357861 }, "structured_output": { "count": 110, "num_samples": 1714, "tasks": [], "average_score": 0.123417109500157 }, "exact_text": { "count": 83, "num_samples": 1278, "tasks": [], "average_score": 0.18474924824567768 }, "numerical_data": { "count": 49, "num_samples": 862, "tasks": [], "average_score": 0.19908864029107046 }, "open_ended_output": { "count": 80, "num_samples": 1454, "tasks": [], "average_score": 0.23278612647548963 }, "multiple_choice": { "count": 85, "num_samples": 1363, "tasks": [], "average_score": 0.22108484223035305 } }, "input_num": { "6-8 images": { "count": 21, "num_samples": 314, "tasks": [], "average_score": 0.11057256235827662 }, "9-image or more": { "count": 41, "num_samples": 623, "tasks": [], "average_score": 0.011631871744697361 }, "1-image": { "count": 315, "num_samples": 5228, "tasks": [], "average_score": 0.18240049845355885 }, "video": { "count": 43, "num_samples": 698, "tasks": [], "average_score": 0.21367350061199514 }, "4-5 images": { "count": 34, "num_samples": 520, "tasks": [], "average_score": 0.1898373110613516 }, "2-3 images": { "count": 51, "num_samples": 802, "tasks": [], "average_score": 0.23274180707905315 } }, "app": { "Information_Extraction": { "count": 72, "num_samples": 1124, "tasks": [], "average_score": 0.09484068019620011 }, "Planning": { "count": 78, "num_samples": 
1239, "tasks": [], "average_score": 0.05864269260897992 }, "Coding": { "count": 31, "num_samples": 474, "tasks": [], "average_score": 0.13323092677931386 }, "Perception": { "count": 145, "num_samples": 2313, "tasks": [], "average_score": 0.20714098741611 }, "Metrics": { "count": 20, "num_samples": 309, "tasks": [], "average_score": 0.2932627505936196 }, "Science": { "count": 29, "num_samples": 574, "tasks": [], "average_score": 0.21075421274487907 }, "Knowledge": { "count": 97, "num_samples": 1605, "tasks": [], "average_score": 0.24110595572817994 }, "Mathematics": { "count": 33, "num_samples": 547, "tasks": [], "average_score": 0.20711160718581811 } } } }