diff --git "a/static/eval_results/Default/all_model_keywords_stats.json" "b/static/eval_results/Default/all_model_keywords_stats.json" deleted file mode 100644--- "a/static/eval_results/Default/all_model_keywords_stats.json" +++ /dev/null @@ -1,5384 +0,0 @@ -{ - "GPT_4o": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.5630758211022604 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.6216411634729735 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.616018277142757 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.5823101249498799 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.44177544539510955 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.6345458069232931 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.6795263157894738 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.5514924675940659 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.39435038953269674 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.22934807257231926 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.608083455060831 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.491325251564869 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.4999089647103332 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.5315979872161023 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.5641404607063637 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.5613545677222056 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.47760591698367955 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.5388690453811203 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.48037685656449847 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.5994159671881645 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.44606605087301393 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.6274371950293718 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.5448877153826162 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.4751133786848073 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 
0.5343350103400748 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.5672657028463585 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.5315979872161023 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.4500928191484624 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.4908653289106883 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.7056027785545881 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.33202130899313653 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.5032849161169843 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.5510350848991218 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.6095778863474799 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.5283797185155754 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.6135723164021851 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.44047720383044436 - } - } - }, - "Gemini_1.5_pro_002": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.5202055934299538 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.5017043129027509 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.5532599716027446 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.546753787203128 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.425969084163906 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.5751012914154264 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.6982330827067671 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.513647745999633 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.3845337030093212 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.23899503258223884 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.4625032188638111 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.4292353723689881 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.4869625906903554 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.5028718355967439 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.5584779204331461 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - 
"average_score": 0.55005349042813 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.4292127751495457 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.44896309957892694 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.44418591808616864 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.5146447350354234 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.4688623462674191 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.5580414823700747 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.5538255562099124 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.39066515495086923 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.5370278962809547 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.5034399620483027 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.5028718355967439 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.4885398161821004 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.45544217378728585 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.5421439953094952 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.3335324339429373 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.43465181771633377 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.5250631828331306 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.5821004797173627 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.5124355410095621 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.5722329455291694 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.41210885517904977 - } - } - }, - "Gemini_1.5_flash_002": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.46250942866818673 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.4337278553354258 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.49947464681475356 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.5098686082319499 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.34393279682972117 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.5594391803821158 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.6380250626566416 - }, - 
"Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.44816564352475535 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.34510790215980036 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.18973764406890803 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.3865262916591035 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.3598139859097534 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.4013870708864889 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.4903530871753026 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.5051202896842343 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.5166044655846657 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.3849084036535956 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.3869438864407766 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.39868324168390534 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.44793686445264996 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.3704146726364947 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.5448638967636353 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.47829883834573317 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.33669690098261523 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.43653808057103954 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.4427944359714585 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.4903530871753026 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.42346517633403413 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.41994719346489817 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.46645473820179373 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.2517485212411566 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.40372378342017806 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.4799408254775632 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.6010361821632402 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.4569546533897065 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - 
"average_score": 0.511590428993871 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.33710867194177685 - } - } - }, - "Claude_3.5": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.5405089647404562 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.6082834220752651 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.5745077617490254 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.5450038475783499 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.4767692987630454 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.5756126284078804 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.6969774436090224 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.5278843049497918 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.4082144793870471 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.23803578664609892 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.5691641481808987 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.4795267886975966 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.525848282456283 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.508735695828719 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.5699094130430454 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.5096772701625744 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.4429640420975014 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.5066797418318023 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.4971460788134188 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.5278127103234661 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.4490020843308984 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.5838224169821388 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.5456152399978661 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.46300075585789874 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.5414381873407914 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 
0.5373019912310933 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.508735695828719 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.4422556748863689 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.49311554035078103 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.6663170946790707 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.3382015835012861 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.5194010220575684 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.532329797132399 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.5808831682303479 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.513474611293123 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.5507075880782885 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.47461998432626556 - } - } - }, - "Claude_3.5_new": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.5690045172520449 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.6220681231036606 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.6077980666415158 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.5511440615639541 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.4885536652013625 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.5908204006544897 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.6569473684210526 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.5486763511384175 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.4315385951907387 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.2909419331017877 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.6048192628845258 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.48924295292319175 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.556418710368288 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.4946691340754988 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.5558756390298104 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.5425198547046186 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - 
"average_score": 0.44210335381541843 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.5187252051932875 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.5071121107460066 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.5387340524651681 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.4824302644151348 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.6242798397166945 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.5782691045270721 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.4630277507828528 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.5914338446093256 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.5636254729390459 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.4946691340754988 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.4828123870640382 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.48756636014597515 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.6590137441693218 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.39901670035164916 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.5166853031535193 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.5561634744977417 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.6123769274172342 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.5512015158810595 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.565796566886933 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.4763267502912362 - } - } - }, - "GPT_4o_mini": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.4492982787524939 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.49026056071002017 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.5168957112681365 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.46731791428406805 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.3406008235342885 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.5572925295284307 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.6902380952380953 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.4189154010048976 - }, - 
"Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.2943206715105082 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.19422793560945503 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.47202628409684394 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.3624496929166193 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.38946844562183286 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.45508480503584553 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.47569921440672464 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.465175334092545 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.29410984789062117 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.41242028190533997 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.3906415365938764 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.44244772638735347 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.3629944944697668 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.5713834131825314 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.39874839531459466 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.3359977324263039 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.4305788513381019 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.46343334374251277 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.45508480503584553 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.24651576711552803 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.36981497185070983 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.5666618234843734 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.2420320329702607 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.3458483931206892 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.43590838051817093 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.5176671720617656 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.3554299482098288 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.5399167524341886 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 
0.32918280841495845 - } - } - }, - "Qwen2_VL_72B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.49787264809826687 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.5439010430283516 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.5392244859385411 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.509277882172206 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.3776739609562984 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.5676817981386025 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.60496992481203 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.4633019068994453 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.35105970797600183 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.2201150812944581 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.5402397677488632 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.4289777675393297 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.42094543671351287 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.49943888306036405 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.507967430369507 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.49789939867591104 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.36212605501536715 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.44719815365440824 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.4500902736468407 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.5098505660529429 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.4027115384266939 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.5157810622684265 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.5199940976484408 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.3100812547241119 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.5468722850464449 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.4918205178721877 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.49943888306036405 
- }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.36691704884033916 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.45176098055218655 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.5807658773593334 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.31245958897213383 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.4372517645050852 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.5362106489630868 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.4968249101570037 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.4488852456563113 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.5166939389651373 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.31157492395100744 - } - } - }, - "Qwen2_VL_7B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.3708368629321668 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.40213773918065815 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2511, - "tasks": [], - "average_score": 0.4034335110538307 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2469, - "tasks": [], - "average_score": 0.4109909230944937 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.2818925976996871 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.49360878418945336 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.5215889724310777 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.33309401517140946 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2439, - "tasks": [], - "average_score": 0.27564756843599875 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.1473690605854188 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.3821046882337143 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.2896392967775049 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.3223325179806271 - }, - "Videos": { - "count": 43, - "num_samples": 700, - "tasks": [], - "average_score": 0.4111189310485516 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.34825121621909577 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.40660144920567376 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.262166593895899 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - 
"num_samples": 1514, - "tasks": [], - "average_score": 0.3430730210869785 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.3426196933687219 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.35162604166912687 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.32665673520415817 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1456, - "tasks": [], - "average_score": 0.3909745200389741 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.39898011714302023 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.19415154950869234 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.37453319457428763 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.37701588079136955 - }, - "video": { - "count": 43, - "num_samples": 700, - "tasks": [], - "average_score": 0.4111189310485516 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.26429868057315387 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.33008667136891007 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.42746758545520747 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.2003871750665659 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.3270187644950453 - }, - "Perception": { - "count": 145, - "num_samples": 2315, - "tasks": [], - "average_score": 0.40048749993497734 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.4245693009859056 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.29880557491654197 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.4276637093173368 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.25562039051316643 - } - } - }, - "llava_onevision_72B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.3615741356043519 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.2834675874668524 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.3674817002808495 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.42146038539739283 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.2951434804409883 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.478119286755779 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.6005438596491229 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.31663222188988865 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - 
"average_score": 0.29633645022129285 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.13872280436872364 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.23380046931752074 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.2126914943750874 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.34566020099204997 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.4446001874842145 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.4401364830377099 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.4247591719013819 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.23897262553543516 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.2868275930712835 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.259450238500612 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.370724080249463 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.3065719940769206 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.4293132525502993 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.3986052416087927 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.20730347694633405 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.28104747671521785 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.34840850032295206 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.4446001874842145 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.25013213032747944 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.34156793747875674 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.3076421844825067 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.18168666652660437 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.23240790940031927 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.38362780453378204 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.4807891958712894 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.31702495228966576 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.4358874880224115 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.31588468105075895 - } - } - }, - "llava_onevision_7B": { - "skills": { - "Object Recognition and 
Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.2524786809911341 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.19077168655703208 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.2555444562659206 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.29981286990552625 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.18973491465938852 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.36842322314565323 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.44998746867167916 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.2445135206648208 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.21802943568344288 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.06658775725427067 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.1466861610319767 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.13297395577964055 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.24236719143449742 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.30985943541023103 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.3199731020402028 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.3263378734842879 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.13043163858789789 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.20277804188944173 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.18291595756285564 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.25384794412815426 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.2200472229099345 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.3127341248874411 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.2802999516721972 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.1476473922902494 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.13803800801858385 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.2548084764084038 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.30985943541023103 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - 
"average_score": 0.1778991941079372 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.2410111891690358 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.19283211154717242 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.09846926279075068 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.15189414475467605 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.28505205882578405 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.3600079950628582 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.23654776813656775 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.3271805711561501 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.22080546908673507 - } - } - }, - "InternVL2_76B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.38193012983650343 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.41315219763443384 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.43665980552577693 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.4265623936500962 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.2975890791763991 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.5257990949897898 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.5779473684210527 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.33287081421166276 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.2949505390920417 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.17036496432397477 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.3634339625985008 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.31396468806559114 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.3473756113126343 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.395893002855977 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.44982107744035305 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.42875248733027654 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.2868239162778749 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.3630499545707523 - 
}, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.3476691827105281 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.3943337471922549 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.29244088978470345 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.45822072478616577 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.3879326330400817 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.20309901738473166 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.34771123515123364 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.4145693044465943 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.395893002855977 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.24403942809507134 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.3153417935059416 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.4306947454508794 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.2132321995754061 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.2953329718984368 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.42202934355552685 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.47409276729986083 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.30014798153766264 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.4625649385962016 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.2868813944130515 - } - } - }, - "InternVL2_8B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.2817247716997634 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.280559214034858 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2511, - "tasks": [], - "average_score": 0.32020728060179815 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2469, - "tasks": [], - "average_score": 0.325593535916075 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.24118253695139918 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.39684007367798446 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.4700852130325815 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.27052668526005397 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2439, - "tasks": [], - "average_score": 0.23189345356483618 - }, - "Planning and Decision Making": { - "count": 37, - 
"num_samples": 577, - "tasks": [], - "average_score": 0.08260405712900723 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.22800928556370195 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.2013779290163996 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.2804429603269583 - }, - "Videos": { - "count": 43, - "num_samples": 700, - "tasks": [], - "average_score": 0.34791358240562653 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.2942163420306113 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.3388056726588417 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.10933317885944857 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.250804626773504 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.2522493284864019 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.27414636444623874 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.22381302045502052 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1456, - "tasks": [], - "average_score": 0.3537549824897016 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.30261189962428353 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.15434618291761149 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.19872104324302098 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.30088711082969344 - }, - "video": { - "count": 43, - "num_samples": 700, - "tasks": [], - "average_score": 0.34791358240562653 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.17725087609332119 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.2532272454839157 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.29129840423784176 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.12166926715781588 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.24700310231619527 - }, - "Perception": { - "count": 145, - "num_samples": 2315, - "tasks": [], - "average_score": 0.3214666523378005 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.3995660275981844 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.24614711281861912 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.3393895915929317 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.22078333222564453 - } - } - }, - "MiniCPM_v2.6": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 
0.2604967101191775 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.2500331562865158 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.3003169369011028 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.31808748114668184 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.18281637763548025 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.40732197204308807 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.48798245614035085 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.23723675736151562 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.1968926733821904 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.08735883237069725 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.21195711598986072 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.18639148159043903 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.21578309681746147 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.3527537836840162 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.3096882575625531 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.3176880312524649 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.0755920550038197 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.23506388020592064 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.1781127776443048 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.2551275278138797 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.20833171754655547 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.36473950920880716 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.293386806641223 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.13955971277399848 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.23596215721092323 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.26319603880798287 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.3527537836840162 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.17888270664238365 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, 
- "tasks": [], - "average_score": 0.22288558250834017 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.2666989364424082 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.11693267119342445 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.15342045420318667 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.29243044121840894 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.3777897246686755 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.25714862989687987 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.33187729423141027 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.16493399805627715 - } - } - }, - "Phi-3.5-vision": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.2551037902226636 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.2483252111012436 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.28732942108098564 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.3049602749093698 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.21653804346780042 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.36823084724842464 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.46663157894736845 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.24145330077248778 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.2154692063816354 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.08944481289041872 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.1865974025588298 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.17497379027990792 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.26053460127801603 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.24669318645450836 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.2786226802221388 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.3413768635559215 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.15444746077692828 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.2177924712685756 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - 
"average_score": 0.21443984349574025 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.2572371188897671 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.21409351002477045 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.365192668303297 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.25960269434727634 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.12546296296296297 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.14337869666229008 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.27790147494714373 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.24669318645450836 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.20168001345379397 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.2850550871176333 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.2237087834389946 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.08928724806836039 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.219367263034246 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.316318567258608 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.3945898792928062 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.21925278489551242 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.33264696401038385 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.17575913004138646 - } - } - }, - "Pixtral_12B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.3460288961410444 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.3777640755922415 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.38299418297106824 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.3776722463473817 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.2828575553466608 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.419071767659191 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.5687919799498747 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.32813540763467464 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.2677293131171651 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.10591240329992047 - } - }, - 
"input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.3070067338940785 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.28832738144368647 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.3223299098375932 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.409643099998057 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.37450808136321684 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.37115973962368864 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.24009431093278263 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.3078181788009137 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.3188475653127356 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.3639544140938305 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.32073418701669026 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.4166613092238043 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.3008126415966517 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.19743008314436883 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.16642294307267227 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.37108130557306335 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.409643099998057 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.2575699315401612 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.3104621543981899 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.4300741596942578 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.13622980866275425 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.2572414987500377 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.3892097218585385 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.5020540387409291 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.31301986568151985 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.3809515410188075 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.24222628640267738 - } - } - }, - "Llama_3_2_11B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.1907604552173455 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], 
- "average_score": 0.14328677752263275 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.19646404502647707 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.22399113135844315 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.13303760019716085 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.323153603297999 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.4260501253132832 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.1770852858056774 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.15366454315378308 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.06563884729522687 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.11886347847341794 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.11489351406848371 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.1693681214060816 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.2123769209846321 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.2520175802062012 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.2485354956932213 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.06418655520777307 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.12417283740525839 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.16374180545556977 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.1576236804437753 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.15014439824913947 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.3003142292328822 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.19270157739425633 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.1463246409674981 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.0732004839476103 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.1960107191983825 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.2123769209846321 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.1351857051327849 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.18586695387250338 - } - }, - "app": { - "Information_Extraction": { - 
"count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.17288724679416761 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.08100042975820579 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.0575426944971537 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.19899465185565898 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.254316961351997 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.162801811963855 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.28055776664538923 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.13937853323074623 - } - } - }, - "Idefics3": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.14507788965553362 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.11641535161320743 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.17255583910766542 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.14745217246476708 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.1331851390883708 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.19221534222332276 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.28640852130325817 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.17906399043310475 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.10192930055370109 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.04211916597550756 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.10126271262360581 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.11407926733108291 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.16225217317782772 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.16181866973635636 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.1839408679813373 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.14933801491626408 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.0395540896656236 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.13979628998424784 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.1062779093260333 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - 
"average_score": 0.07053056796593082 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.09790172378722654 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.2987797010800956 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.11588163814170001 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.1008692365835223 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.09308121224497533 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.14757589734485796 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.16181866973635636 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.12217834249866026 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.12276246278377517 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.14743542163139847 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.05354869594691955 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.09065540194572455 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.1463280929280822 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.14564374862578883 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.22748773785486257 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.17647756032677067 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.13168972973651977 - } - } - }, - "Aria": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.3264829094772722 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.35712138797286674 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.4004806395853317 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.3783082688258977 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.27628131703993153 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.4942870225393938 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.5811228070175439 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.3279996334048362 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.2481896092177717 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.11945216302285933 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - 
"average_score": 0.2830308005758272 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.27833423130489043 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.32371820359400666 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.42875359425696014 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.3612041984219992 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.37290568595471846 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.19554976321164697 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.3092653492193887 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.3043751656077328 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.2930015244066511 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.3092167834876797 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.4523860109667709 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.3277812604542708 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.21139455782312927 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.2711617723374526 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.3576735443060994 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.42875359425696014 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.19839956701033565 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.27267126872569447 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.38321397541649777 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.14301905320436192 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.2849545194421855 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.3779947327886569 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.39678729061309725 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.29682445889316517 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.4096377585306089 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.26194160419181234 - } - } - }, - "NVLM": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.24033557047857043 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.32154059695494047 - }, - "Language Understanding and Generation": { - "count": 154, - 
"num_samples": 2509, - "tasks": [], - "average_score": 0.2937052996171993 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.22845955700594492 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.2639741933075709 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.40870864071047447 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.4555238095238095 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.25785191641267197 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.15679681195908274 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.0672259242345112 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.23922823287047076 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.21734036617042948 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.30313485498585124 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.0 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.34726189956094355 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.3264757655296162 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.056894830390305184 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.22868389095927066 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.2788963949121424 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.2787764976961992 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.23349712171444964 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.3215948035793096 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.18487055428231897 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.0 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.0 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.3680809151131777 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.0 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.03838410364145658 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.2325581694709435 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.22773778915303383 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - 
"average_score": 0.08048160660797504 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.2390024647851972 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.30211261814126533 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.18857142857142856 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.24908307640275493 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.3724877947012685 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.24529601154794037 - } - } - }, - "InternVL2_2B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.14491178903291552 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.12126906675624163 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.16912754929321935 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.18542274192083463 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.13923308734553164 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.23992252224543772 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.3420927318295739 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.14807577209152425 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.13036555933925006 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.01727799227799228 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.057021136657850864 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.10504085961245285 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.1625198552182714 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.18999779001767986 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.1487677475708977 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.2011727338536935 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.11886936592818943 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.1131404778887607 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.05739750616837997 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.15465451663650032 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.16044698450090833 - }, - 
"open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.21429521387724249 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.2128614316540013 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.03658352229780801 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.05757839721254354 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.15225683687839608 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.18999779001767986 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.17677460549936644 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.158165588340436 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.08722661966805 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.04102853815875594 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.11264043251709285 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.17001758160301803 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.3332891958712894 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.1686125516807394 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.21169137106199268 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.10975764217070672 - } - } - }, - "Qwen2_VL_2B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.22236161923122505 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.23701014663017753 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.25669221785292334 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.26526414975225454 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.17623548305581763 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.31250702198481506 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.4140676691729323 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.20802820480076603 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.17320633068307653 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.06209506566980099 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.190837839372028 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 
0.16287824421269087 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.19640906475019812 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.2520741776922928 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.24883076673424442 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.2877316297453947 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.13398525561847363 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.1624451002757208 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.20960092816529263 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.19986806708136184 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.2201024015934558 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.30248748033122763 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.256631742010999 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.07681405895691609 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.10526691703628158 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.25018977062352593 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.2520741776922928 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.17435940889565366 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.21286783416184518 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.2521972668785968 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.06967138760493456 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.16996250112948405 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.27603334911345223 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.31002436092347696 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.21061929716065056 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.2656728023444808 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.16356158787929762 - } - } - }, - "Aquila_VL_2B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.18420666660337692 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.12395530240359122 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.17924536722051596 - }, - "Scene and Event Understanding": { - "count": 154, - 
"num_samples": 2467, - "tasks": [], - "average_score": 0.220108610660707 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.1680749869910155 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.26630477322766793 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.35152130325814535 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.1857154485444521 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.1616397700608881 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.044513236949565 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.07480350331940272 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.11444110320621242 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.19412275574929044 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.21367350061199514 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.19717811128156643 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.24620947964695974 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.10131259529340846 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.11925340914357861 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.123417109500157 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.18474924824567768 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.19908864029107046 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.23278612647548963 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.22108484223035305 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.11057256235827662 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.011631871744697361 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.18240049845355885 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.21367350061199514 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.1898373110613516 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.23274180707905315 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.09484068019620011 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.05864269260897992 - }, - "Coding": { - "count": 31, - 
"num_samples": 474, - "tasks": [], - "average_score": 0.13323092677931386 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.20714098741611 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.2932627505936196 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.21075421274487907 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.24110595572817994 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.20711160718581811 - } - } - }, - "Mammoth_VL": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.30194776127683565 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.2365295791606494 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.2993927028494267 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.3366347826116991 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.2408454736444444 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.37895522991264047 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.48003508771929826 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.27232427744946475 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.24522937191710698 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.11457024299726488 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.18941525254390731 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.1718334741390191 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.28108187023954245 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.3391119999611432 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.36434285930327387 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.36915384448504296 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.15940750469262005 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.2456942956200745 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.21586513216389874 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.29359048024032264 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.2646677074112521 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - 
"average_score": 0.34733130661096645 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.3286125236284589 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.16358654572940287 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.25463059203015115 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.2919119209789575 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.3391119999611432 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.20016011839130254 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.2679179451692527 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.23600902063965679 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.15326915093278803 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.20668466311255687 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.33348955971237954 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.3759170425350556 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.23894961766260706 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.351703435685048 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.26074348700688493 - } - } - } -} \ No newline at end of file