Spaces:
Running
Running
{ | |
"GPT_4o": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.5630758211022604 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.6216411634729735 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.616018277142757 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.5823101249498799 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.44177544539510955 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.6345458069232931 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.6795263157894738 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.5514924675940659 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.39435038953269674 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.22934807257231926 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.608083455060831 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.491325251564869 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.4999089647103332 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.5315979872161023 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.5641404607063637 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.5613545677222056 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.47760591698367955 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.5388690453811203 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.48037685656449847 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.5994159671881645 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.44606605087301393 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.6274371950293718 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.5448877153826162 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.4751133786848073 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.5343350103400748 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.5672657028463585 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.5315979872161023 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.4500928191484624 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.4908653289106883 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.7056027785545881 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.33202130899313653 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.5032849161169843 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.5510350848991218 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.6095778863474799 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.5283797185155754 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.6135723164021851 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.44047720383044436 | |
} | |
} | |
}, | |
"Gemini_1.5_pro_002": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.5202055934299538 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.5017043129027509 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.5532599716027446 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.546753787203128 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.425969084163906 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.5751012914154264 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.6982330827067671 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.513647745999633 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.3845337030093212 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.23899503258223884 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.4625032188638111 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.4292353723689881 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.4869625906903554 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.5028718355967439 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.5584779204331461 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.55005349042813 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.4292127751495457 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.44896309957892694 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.44418591808616864 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.5146447350354234 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.4688623462674191 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.5580414823700747 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.5538255562099124 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.39066515495086923 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.5370278962809547 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.5034399620483027 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.5028718355967439 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.4885398161821004 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.45544217378728585 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.5421439953094952 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.3335324339429373 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.43465181771633377 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.5250631828331306 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.5821004797173627 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.5124355410095621 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.5722329455291694 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.41210885517904977 | |
} | |
} | |
}, | |
"Gemini_1.5_flash_002": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.46250942866818673 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.4337278553354258 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.49947464681475356 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.5098686082319499 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.34393279682972117 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.5594391803821158 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.6380250626566416 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.44816564352475535 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.34510790215980036 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.18973764406890803 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.3865262916591035 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.3598139859097534 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.4013870708864889 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.4903530871753026 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.5051202896842343 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.5166044655846657 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.3849084036535956 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.3869438864407766 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.39868324168390534 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.44793686445264996 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.3704146726364947 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.5448638967636353 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.47829883834573317 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.33669690098261523 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.43653808057103954 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.4427944359714585 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.4903530871753026 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.42346517633403413 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.41994719346489817 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.46645473820179373 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.2517485212411566 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.40372378342017806 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.4799408254775632 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.6010361821632402 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.4569546533897065 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.511590428993871 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.33710867194177685 | |
} | |
} | |
}, | |
"Claude_3.5": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.5405089647404562 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.6082834220752651 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.5745077617490254 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.5450038475783499 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.4767692987630454 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.5756126284078804 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.6969774436090224 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.5278843049497918 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.4082144793870471 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.23803578664609892 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.5691641481808987 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.4795267886975966 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.525848282456283 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.508735695828719 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.5699094130430454 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.5096772701625744 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.4429640420975014 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.5066797418318023 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.4971460788134188 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.5278127103234661 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.4490020843308984 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.5838224169821388 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.5456152399978661 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.46300075585789874 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.5414381873407914 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.5373019912310933 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.508735695828719 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.4422556748863689 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.49311554035078103 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.6663170946790707 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.3382015835012861 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.5194010220575684 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.532329797132399 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.5808831682303479 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.513474611293123 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.5507075880782885 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.47461998432626556 | |
} | |
} | |
}, | |
"Claude_3.5_new": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.5690045172520449 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.6220681231036606 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.6077980666415158 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.5511440615639541 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.4885536652013625 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.5908204006544897 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.6569473684210526 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.5486763511384175 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.4315385951907387 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.2909419331017877 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.6048192628845258 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.48924295292319175 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.556418710368288 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.4946691340754988 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.5558756390298104 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.5425198547046186 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.44210335381541843 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.5187252051932875 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.5071121107460066 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.5387340524651681 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.4824302644151348 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.6242798397166945 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.5782691045270721 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.4630277507828528 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.5914338446093256 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.5636254729390459 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.4946691340754988 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.4828123870640382 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.48756636014597515 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.6590137441693218 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.39901670035164916 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.5166853031535193 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.5561634744977417 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.6123769274172342 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.5512015158810595 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.565796566886933 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.4763267502912362 | |
} | |
} | |
}, | |
"GPT_4o_mini": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.4492982787524939 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.49026056071002017 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.5168957112681365 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.46731791428406805 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.3406008235342885 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.5572925295284307 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.6902380952380953 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.4189154010048976 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.2943206715105082 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.19422793560945503 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.47202628409684394 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.3624496929166193 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.38946844562183286 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.45508480503584553 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.47569921440672464 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.465175334092545 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.29410984789062117 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.41242028190533997 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.3906415365938764 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.44244772638735347 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.3629944944697668 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.5713834131825314 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.39874839531459466 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.3359977324263039 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.4305788513381019 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.46343334374251277 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.45508480503584553 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.24651576711552803 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.36981497185070983 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.5666618234843734 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.2420320329702607 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.3458483931206892 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.43590838051817093 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.5176671720617656 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.3554299482098288 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.5399167524341886 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.32918280841495845 | |
} | |
} | |
}, | |
"Qwen2_VL_72B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.49787264809826687 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.5439010430283516 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.5392244859385411 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.509277882172206 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.3776739609562984 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.5676817981386025 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.60496992481203 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.4633019068994453 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.35105970797600183 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.2201150812944581 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.5402397677488632 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.4289777675393297 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.42094543671351287 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.49943888306036405 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.507967430369507 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.49789939867591104 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.36212605501536715 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.44719815365440824 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.4500902736468407 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.5098505660529429 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.4027115384266939 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.5157810622684265 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.5199940976484408 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.3100812547241119 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.5468722850464449 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.4918205178721877 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.49943888306036405 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.36691704884033916 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.45176098055218655 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.5807658773593334 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.31245958897213383 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.4372517645050852 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.5362106489630868 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.4968249101570037 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.4488852456563113 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.5166939389651373 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.31157492395100744 | |
} | |
} | |
}, | |
"Qwen2_VL_7B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.3708368629321668 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.40213773918065815 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2511, | |
"tasks": [], | |
"average_score": 0.4034335110538307 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2469, | |
"tasks": [], | |
"average_score": 0.4109909230944937 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.2818925976996871 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.49360878418945336 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.5215889724310777 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.33309401517140946 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2439, | |
"tasks": [], | |
"average_score": 0.27564756843599875 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.1473690605854188 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.3821046882337143 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.2896392967775049 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.3223325179806271 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 700, | |
"tasks": [], | |
"average_score": 0.4111189310485516 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.34825121621909577 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.40660144920567376 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.262166593895899 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.3430730210869785 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.3426196933687219 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.35162604166912687 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.32665673520415817 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1456, | |
"tasks": [], | |
"average_score": 0.3909745200389741 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.39898011714302023 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.19415154950869234 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.37453319457428763 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.37701588079136955 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 700, | |
"tasks": [], | |
"average_score": 0.4111189310485516 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.26429868057315387 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.33008667136891007 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.42746758545520747 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.2003871750665659 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.3270187644950453 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2315, | |
"tasks": [], | |
"average_score": 0.40048749993497734 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.4245693009859056 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.29880557491654197 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.4276637093173368 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.25562039051316643 | |
} | |
} | |
}, | |
"llava_onevision_72B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.3615741356043519 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.2834675874668524 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.3674817002808495 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.42146038539739283 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.2951434804409883 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.478119286755779 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.6005438596491229 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.31663222188988865 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.29633645022129285 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.13872280436872364 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.23380046931752074 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.2126914943750874 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.34566020099204997 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.4446001874842145 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.4401364830377099 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.4247591719013819 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.23897262553543516 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.2868275930712835 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.259450238500612 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.370724080249463 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.3065719940769206 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.4293132525502993 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.3986052416087927 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.20730347694633405 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.28104747671521785 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.34840850032295206 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.4446001874842145 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.25013213032747944 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.34156793747875674 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.3076421844825067 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.18168666652660437 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.23240790940031927 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.38362780453378204 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.4807891958712894 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.31702495228966576 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.4358874880224115 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.31588468105075895 | |
} | |
} | |
}, | |
"llava_onevision_7B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.2524786809911341 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.19077168655703208 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.2555444562659206 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.29981286990552625 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.18973491465938852 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.36842322314565323 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.44998746867167916 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.2445135206648208 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.21802943568344288 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.06658775725427067 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.1466861610319767 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.13297395577964055 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.24236719143449742 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.30985943541023103 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.3199731020402028 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.3263378734842879 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.13043163858789789 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.20277804188944173 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.18291595756285564 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.25384794412815426 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.2200472229099345 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.3127341248874411 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.2802999516721972 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.1476473922902494 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.13803800801858385 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.2548084764084038 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.30985943541023103 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.1778991941079372 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.2410111891690358 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.19283211154717242 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.09846926279075068 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.15189414475467605 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.28505205882578405 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.3600079950628582 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.23654776813656775 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.3271805711561501 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.22080546908673507 | |
} | |
} | |
}, | |
"InternVL2_76B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.38193012983650343 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.41315219763443384 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.43665980552577693 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.4265623936500962 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.2975890791763991 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.5257990949897898 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.5779473684210527 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.33287081421166276 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.2949505390920417 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.17036496432397477 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.3634339625985008 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.31396468806559114 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.3473756113126343 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.395893002855977 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.44982107744035305 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.42875248733027654 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.2868239162778749 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.3630499545707523 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.3476691827105281 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.3943337471922549 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.29244088978470345 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.45822072478616577 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.3879326330400817 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.20309901738473166 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.34771123515123364 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.4145693044465943 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.395893002855977 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.24403942809507134 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.3153417935059416 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.4306947454508794 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.2132321995754061 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.2953329718984368 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.42202934355552685 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.47409276729986083 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.30014798153766264 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.4625649385962016 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.2868813944130515 | |
} | |
} | |
}, | |
"InternVL2_8B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.2817247716997634 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.280559214034858 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2511, | |
"tasks": [], | |
"average_score": 0.32020728060179815 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2469, | |
"tasks": [], | |
"average_score": 0.325593535916075 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.24118253695139918 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.39684007367798446 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.4700852130325815 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.27052668526005397 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2439, | |
"tasks": [], | |
"average_score": 0.23189345356483618 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.08260405712900723 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.22800928556370195 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.2013779290163996 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.2804429603269583 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 700, | |
"tasks": [], | |
"average_score": 0.34791358240562653 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.2942163420306113 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.3388056726588417 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.10933317885944857 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.250804626773504 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.2522493284864019 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.27414636444623874 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.22381302045502052 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1456, | |
"tasks": [], | |
"average_score": 0.3537549824897016 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.30261189962428353 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.15434618291761149 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.19872104324302098 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.30088711082969344 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 700, | |
"tasks": [], | |
"average_score": 0.34791358240562653 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.17725087609332119 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.2532272454839157 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.29129840423784176 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.12166926715781588 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.24700310231619527 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2315, | |
"tasks": [], | |
"average_score": 0.3214666523378005 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.3995660275981844 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.24614711281861912 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.3393895915929317 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.22078333222564453 | |
} | |
} | |
}, | |
"MiniCPM_v2.6": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.2604967101191775 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.2500331562865158 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.3003169369011028 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.31808748114668184 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.18281637763548025 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.40732197204308807 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.48798245614035085 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.23723675736151562 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.1968926733821904 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.08735883237069725 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.21195711598986072 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.18639148159043903 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.21578309681746147 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.3527537836840162 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.3096882575625531 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.3176880312524649 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.0755920550038197 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.23506388020592064 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.1781127776443048 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.2551275278138797 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.20833171754655547 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.36473950920880716 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.293386806641223 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.13955971277399848 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.23596215721092323 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.26319603880798287 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.3527537836840162 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.17888270664238365 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.22288558250834017 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.2666989364424082 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.11693267119342445 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.15342045420318667 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.29243044121840894 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.3777897246686755 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.25714862989687987 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.33187729423141027 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.16493399805627715 | |
} | |
} | |
}, | |
"Phi-3.5-vision": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.2551037902226636 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.2483252111012436 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.28732942108098564 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.3049602749093698 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.21653804346780042 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.36823084724842464 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.46663157894736845 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.24145330077248778 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.2154692063816354 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.08944481289041872 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.1865974025588298 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.17497379027990792 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.26053460127801603 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.24669318645450836 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.2786226802221388 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.3413768635559215 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.15444746077692828 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.2177924712685756 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.21443984349574025 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.2572371188897671 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.21409351002477045 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.365192668303297 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.25960269434727634 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.12546296296296297 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.14337869666229008 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.27790147494714373 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.24669318645450836 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.20168001345379397 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.2850550871176333 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.2237087834389946 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.08928724806836039 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.219367263034246 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.316318567258608 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.3945898792928062 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.21925278489551242 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.33264696401038385 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.17575913004138646 | |
} | |
} | |
}, | |
"Pixtral_12B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.3460288961410444 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.3777640755922415 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.38299418297106824 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.3776722463473817 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.2828575553466608 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.419071767659191 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.5687919799498747 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.32813540763467464 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.2677293131171651 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.10591240329992047 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.3070067338940785 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.28832738144368647 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.3223299098375932 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.409643099998057 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.37450808136321684 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.37115973962368864 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.24009431093278263 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.3078181788009137 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.3188475653127356 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.3639544140938305 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.32073418701669026 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.4166613092238043 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.3008126415966517 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.19743008314436883 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.16642294307267227 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.37108130557306335 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.409643099998057 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.2575699315401612 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.3104621543981899 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.4300741596942578 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.13622980866275425 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.2572414987500377 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.3892097218585385 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.5020540387409291 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.31301986568151985 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.3809515410188075 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.24222628640267738 | |
} | |
} | |
}, | |
"Llama_3_2_11B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.1907604552173455 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.14328677752263275 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.19646404502647707 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.22399113135844315 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.13303760019716085 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.323153603297999 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.4260501253132832 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.1770852858056774 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.15366454315378308 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.06563884729522687 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.11886347847341794 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.11489351406848371 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.1693681214060816 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.2123769209846321 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.2520175802062012 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.2485354956932213 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.06418655520777307 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.12417283740525839 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.16374180545556977 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.1576236804437753 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.15014439824913947 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.3003142292328822 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.19270157739425633 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.1463246409674981 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.0732004839476103 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.1960107191983825 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.2123769209846321 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.1351857051327849 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.18586695387250338 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.17288724679416761 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.08100042975820579 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.0575426944971537 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.19899465185565898 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.254316961351997 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.162801811963855 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.28055776664538923 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.13937853323074623 | |
} | |
} | |
}, | |
"Idefics3": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.14507788965553362 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.11641535161320743 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.17255583910766542 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.14745217246476708 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.1331851390883708 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.19221534222332276 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.28640852130325817 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.17906399043310475 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.10192930055370109 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.04211916597550756 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.10126271262360581 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.11407926733108291 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.16225217317782772 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.16181866973635636 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.1839408679813373 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.14933801491626408 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.0395540896656236 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.13979628998424784 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.1062779093260333 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.07053056796593082 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.09790172378722654 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.2987797010800956 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.11588163814170001 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.1008692365835223 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.09308121224497533 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.14757589734485796 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.16181866973635636 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.12217834249866026 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.12276246278377517 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.14743542163139847 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.05354869594691955 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.09065540194572455 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.1463280929280822 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.14564374862578883 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.22748773785486257 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.17647756032677067 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.13168972973651977 | |
} | |
} | |
}, | |
"Aria": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.3264829094772722 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.35712138797286674 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.4004806395853317 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.3783082688258977 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.27628131703993153 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.4942870225393938 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.5811228070175439 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.3279996334048362 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.2481896092177717 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.11945216302285933 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.2830308005758272 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.27833423130489043 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.32371820359400666 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.42875359425696014 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.3612041984219992 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.37290568595471846 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.19554976321164697 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.3092653492193887 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.3043751656077328 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.2930015244066511 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.3092167834876797 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.4523860109667709 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.3277812604542708 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.21139455782312927 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.2711617723374526 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.3576735443060994 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.42875359425696014 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.19839956701033565 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.27267126872569447 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.38321397541649777 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.14301905320436192 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.2849545194421855 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.3779947327886569 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.39678729061309725 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.29682445889316517 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.4096377585306089 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.26194160419181234 | |
} | |
} | |
}, | |
"NVLM": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.24033557047857043 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.32154059695494047 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.2937052996171993 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.22845955700594492 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.2639741933075709 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.40870864071047447 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.4555238095238095 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.25785191641267197 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.15679681195908274 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.0672259242345112 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.23922823287047076 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.21734036617042948 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.30313485498585124 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.0 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.34726189956094355 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.3264757655296162 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.056894830390305184 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.22868389095927066 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.2788963949121424 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.2787764976961992 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.23349712171444964 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.3215948035793096 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.18487055428231897 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.0 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.0 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.3680809151131777 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.0 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.03838410364145658 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.2325581694709435 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.22773778915303383 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.08048160660797504 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.2390024647851972 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.30211261814126533 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.18857142857142856 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.24908307640275493 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.3724877947012685 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.24529601154794037 | |
} | |
} | |
}, | |
"InternVL2_2B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.14491178903291552 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.12126906675624163 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.16912754929321935 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.18542274192083463 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.13923308734553164 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.23992252224543772 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.3420927318295739 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.14807577209152425 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.13036555933925006 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.01727799227799228 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.057021136657850864 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.10504085961245285 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.1625198552182714 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.18999779001767986 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.1487677475708977 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.2011727338536935 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.11886936592818943 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.1131404778887607 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.05739750616837997 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.15465451663650032 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.16044698450090833 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.21429521387724249 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.2128614316540013 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.03658352229780801 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.05757839721254354 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.15225683687839608 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.18999779001767986 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.17677460549936644 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.158165588340436 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.08722661966805 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.04102853815875594 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.11264043251709285 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.17001758160301803 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.3332891958712894 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.1686125516807394 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.21169137106199268 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.10975764217070672 | |
} | |
} | |
}, | |
"Qwen2_VL_2B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.22236161923122505 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.23701014663017753 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.25669221785292334 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.26526414975225454 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.17623548305581763 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.31250702198481506 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.4140676691729323 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.20802820480076603 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.17320633068307653 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.06209506566980099 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.190837839372028 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.16287824421269087 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.19640906475019812 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.2520741776922928 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.24883076673424442 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.2877316297453947 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.13398525561847363 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.1624451002757208 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.20960092816529263 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.19986806708136184 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.2201024015934558 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.30248748033122763 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.256631742010999 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.07681405895691609 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.10526691703628158 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.25018977062352593 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.2520741776922928 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.17435940889565366 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.21286783416184518 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.2521972668785968 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.06967138760493456 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.16996250112948405 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.27603334911345223 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.31002436092347696 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.21061929716065056 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.2656728023444808 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.16356158787929762 | |
} | |
} | |
}, | |
"Aquila_VL_2B": { | |
"skills": { | |
"Object Recognition and Classification": { | |
"count": 303, | |
"num_samples": 4755, | |
"tasks": [], | |
"average_score": 0.18420666660337692 | |
}, | |
"Text Recognition (OCR)": { | |
"count": 137, | |
"num_samples": 2239, | |
"tasks": [], | |
"average_score": 0.12395530240359122 | |
}, | |
"Language Understanding and Generation": { | |
"count": 154, | |
"num_samples": 2509, | |
"tasks": [], | |
"average_score": 0.17924536722051596 | |
}, | |
"Scene and Event Understanding": { | |
"count": 154, | |
"num_samples": 2467, | |
"tasks": [], | |
"average_score": 0.220108610660707 | |
}, | |
"Mathematical and Logical Reasoning": { | |
"count": 109, | |
"num_samples": 1910, | |
"tasks": [], | |
"average_score": 0.1680749869910155 | |
}, | |
"Commonsense and Social Reasoning": { | |
"count": 51, | |
"num_samples": 855, | |
"tasks": [], | |
"average_score": 0.26630477322766793 | |
}, | |
"Ethical and Safety Reasoning": { | |
"count": 15, | |
"num_samples": 245, | |
"tasks": [], | |
"average_score": 0.35152130325814535 | |
}, | |
"Domain-Specific Knowledge and Skills": { | |
"count": 77, | |
"num_samples": 1386, | |
"tasks": [], | |
"average_score": 0.1857154485444521 | |
}, | |
"Spatial and Temporal Reasoning": { | |
"count": 152, | |
"num_samples": 2437, | |
"tasks": [], | |
"average_score": 0.1616397700608881 | |
}, | |
"Planning and Decision Making": { | |
"count": 37, | |
"num_samples": 577, | |
"tasks": [], | |
"average_score": 0.044513236949565 | |
} | |
}, | |
"input_format": { | |
"User Interface Screenshots": { | |
"count": 93, | |
"num_samples": 1517, | |
"tasks": [], | |
"average_score": 0.07480350331940272 | |
}, | |
"Text-Based Images and Documents": { | |
"count": 82, | |
"num_samples": 1294, | |
"tasks": [], | |
"average_score": 0.11444110320621242 | |
}, | |
"Diagrams and Data Visualizations": { | |
"count": 101, | |
"num_samples": 1718, | |
"tasks": [], | |
"average_score": 0.19412275574929044 | |
}, | |
"Videos": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.21367350061199514 | |
}, | |
"Artistic and Creative Content": { | |
"count": 32, | |
"num_samples": 541, | |
"tasks": [], | |
"average_score": 0.19717811128156643 | |
}, | |
"Photographs": { | |
"count": 143, | |
"num_samples": 2248, | |
"tasks": [], | |
"average_score": 0.24620947964695974 | |
}, | |
"3D Models and Aerial Imagery": { | |
"count": 11, | |
"num_samples": 169, | |
"tasks": [], | |
"average_score": 0.10131259529340846 | |
} | |
}, | |
"output_format": { | |
"contextual_formatted_text": { | |
"count": 98, | |
"num_samples": 1514, | |
"tasks": [], | |
"average_score": 0.11925340914357861 | |
}, | |
"structured_output": { | |
"count": 110, | |
"num_samples": 1714, | |
"tasks": [], | |
"average_score": 0.123417109500157 | |
}, | |
"exact_text": { | |
"count": 83, | |
"num_samples": 1278, | |
"tasks": [], | |
"average_score": 0.18474924824567768 | |
}, | |
"numerical_data": { | |
"count": 49, | |
"num_samples": 862, | |
"tasks": [], | |
"average_score": 0.19908864029107046 | |
}, | |
"open_ended_output": { | |
"count": 80, | |
"num_samples": 1454, | |
"tasks": [], | |
"average_score": 0.23278612647548963 | |
}, | |
"multiple_choice": { | |
"count": 85, | |
"num_samples": 1363, | |
"tasks": [], | |
"average_score": 0.22108484223035305 | |
} | |
}, | |
"input_num": { | |
"6-8 images": { | |
"count": 21, | |
"num_samples": 314, | |
"tasks": [], | |
"average_score": 0.11057256235827662 | |
}, | |
"9-image or more": { | |
"count": 41, | |
"num_samples": 623, | |
"tasks": [], | |
"average_score": 0.011631871744697361 | |
}, | |
"1-image": { | |
"count": 315, | |
"num_samples": 5228, | |
"tasks": [], | |
"average_score": 0.18240049845355885 | |
}, | |
"video": { | |
"count": 43, | |
"num_samples": 698, | |
"tasks": [], | |
"average_score": 0.21367350061199514 | |
}, | |
"4-5 images": { | |
"count": 34, | |
"num_samples": 520, | |
"tasks": [], | |
"average_score": 0.1898373110613516 | |
}, | |
"2-3 images": { | |
"count": 51, | |
"num_samples": 802, | |
"tasks": [], | |
"average_score": 0.23274180707905315 | |
} | |
}, | |
"app": { | |
"Information_Extraction": { | |
"count": 72, | |
"num_samples": 1124, | |
"tasks": [], | |
"average_score": 0.09484068019620011 | |
}, | |
"Planning": { | |
"count": 78, | |
"num_samples": 1239, | |
"tasks": [], | |
"average_score": 0.05864269260897992 | |
}, | |
"Coding": { | |
"count": 31, | |
"num_samples": 474, | |
"tasks": [], | |
"average_score": 0.13323092677931386 | |
}, | |
"Perception": { | |
"count": 145, | |
"num_samples": 2313, | |
"tasks": [], | |
"average_score": 0.20714098741611 | |
}, | |
"Metrics": { | |
"count": 20, | |
"num_samples": 309, | |
"tasks": [], | |
"average_score": 0.2932627505936196 | |
}, | |
"Science": { | |
"count": 29, | |
"num_samples": 574, | |
"tasks": [], | |
"average_score": 0.21075421274487907 | |
}, | |
"Knowledge": { | |
"count": 97, | |
"num_samples": 1605, | |
"tasks": [], | |
"average_score": 0.24110595572817994 | |
}, | |
"Mathematics": { | |
"count": 33, | |
"num_samples": 547, | |
"tasks": [], | |
"average_score": 0.20711160718581811 | |
} | |
} | |
} | |
} |
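
The object above maps each model name to per-category entries (count, num_samples, average_score) grouped under skills, input_format, output_format, input_num, and app. Below is a minimal Python sketch, not part of the results file itself, of how one might collapse it into a single sample-weighted score per model; the file name "results.json" is a hypothetical assumption, and because a single task can be tagged with several skills, the weighted figure is only an approximation rather than an official overall score.

# Minimal sketch: sample-weighted aggregate score per model from the JSON above.
# Assumes the JSON object has been saved as "results.json" (hypothetical file name).
import json

with open("results.json", "r", encoding="utf-8") as f:
    results = json.load(f)

for model, dims in results.items():
    skills = dims["skills"]  # each entry carries num_samples and average_score
    total_samples = sum(s["num_samples"] for s in skills.values())
    weighted_sum = sum(s["num_samples"] * s["average_score"] for s in skills.values())
    # Skill categories can overlap, so total_samples may double-count tasks;
    # treat the result as an approximate summary, not an official leaderboard score.
    print(f"{model}: {weighted_sum / total_samples:.4f} over {total_samples} weighted samples")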