{
"GPT_4o": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.5630758211022604
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.6216411634729735
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.616018277142757
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5823101249498799
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.44177544539510955
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.6345458069232931
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6795263157894738
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.5514924675940659
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.39435038953269674
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.22934807257231926
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.608083455060831
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.491325251564869
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.4999089647103332
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.5315979872161023
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.5641404607063637
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.5613545677222056
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.47760591698367955
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.5388690453811203
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.48037685656449847
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.5994159671881645
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.44606605087301393
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.6274371950293718
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.5448877153826162
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.4751133786848073
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.5343350103400748
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.5672657028463585
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.5315979872161023
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.4500928191484624
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.4908653289106883
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.7056027785545881
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.33202130899313653
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.5032849161169843
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.5510350848991218
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.6095778863474799
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.5283797185155754
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.6135723164021851
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.44047720383044436
}
}
},
"Gemini_1.5_pro_002": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.5202055934299538
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.5017043129027509
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.5532599716027446
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.546753787203128
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.425969084163906
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5751012914154264
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6982330827067671
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.513647745999633
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.3845337030093212
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.23899503258223884
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.4625032188638111
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.4292353723689881
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.4869625906903554
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.5028718355967439
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.5584779204331461
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.55005349042813
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.4292127751495457
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.44896309957892694
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.44418591808616864
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.5146447350354234
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.4688623462674191
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5580414823700747
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.5538255562099124
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.39066515495086923
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.5370278962809547
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.5034399620483027
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.5028718355967439
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.4885398161821004
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.45544217378728585
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.5421439953094952
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.3335324339429373
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.43465181771633377
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.5250631828331306
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.5821004797173627
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.5124355410095621
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.5722329455291694
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.41210885517904977
}
}
},
"Gemini_1.5_flash_002": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.46250942866818673
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.4337278553354258
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.49947464681475356
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5098686082319499
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.34393279682972117
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5594391803821158
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6380250626566416
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.44816564352475535
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.34510790215980036
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.18973764406890803
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.3865262916591035
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.3598139859097534
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.4013870708864889
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4903530871753026
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.5051202896842343
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.5166044655846657
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.3849084036535956
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.3869438864407766
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.39868324168390534
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.44793686445264996
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.3704146726364947
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5448638967636353
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.47829883834573317
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.33669690098261523
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.43653808057103954
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.4427944359714585
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4903530871753026
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.42346517633403413
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.41994719346489817
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.46645473820179373
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.2517485212411566
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.40372378342017806
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.4799408254775632
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.6010361821632402
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.4569546533897065
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.511590428993871
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.33710867194177685
}
}
},
"Claude_3.5": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.5405089647404562
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.6082834220752651
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.5745077617490254
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5450038475783499
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.4767692987630454
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5756126284078804
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6969774436090224
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.5278843049497918
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.4082144793870471
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.23803578664609892
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.5691641481808987
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.4795267886975966
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.525848282456283
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.508735695828719
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.5699094130430454
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.5096772701625744
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.4429640420975014
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.5066797418318023
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.4971460788134188
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.5278127103234661
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.4490020843308984
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5838224169821388
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.5456152399978661
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.46300075585789874
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.5414381873407914
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.5373019912310933
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.508735695828719
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.4422556748863689
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.49311554035078103
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.6663170946790707
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.3382015835012861
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.5194010220575684
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.532329797132399
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.5808831682303479
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.513474611293123
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.5507075880782885
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.47461998432626556
}
}
},
"Claude_3.5_new": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.5690045172520449
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.6220681231036606
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.6077980666415158
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.5511440615639541
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.4885536652013625
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5908204006544897
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6569473684210526
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.5486763511384175
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.4315385951907387
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.2909419331017877
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.6048192628845258
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.48924295292319175
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.556418710368288
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4946691340754988
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.5558756390298104
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.5425198547046186
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.44210335381541843
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.5187252051932875
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.5071121107460066
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.5387340524651681
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.4824302644151348
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.6242798397166945
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.5782691045270721
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.4630277507828528
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.5914338446093256
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.5636254729390459
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4946691340754988
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.4828123870640382
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.48756636014597515
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.6590137441693218
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.39901670035164916
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.5166853031535193
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.5561634744977417
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.6123769274172342
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.5512015158810595
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.565796566886933
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.4763267502912362
}
}
},
"GPT_4o_mini": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.4492982787524939
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.49026056071002017
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.5168957112681365
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.46731791428406805
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.3406008235342885
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5572925295284307
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6902380952380953
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.4189154010048976
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.2943206715105082
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.19422793560945503
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.47202628409684394
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.3624496929166193
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.38946844562183286
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.45508480503584553
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.47569921440672464
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.465175334092545
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.29410984789062117
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.41242028190533997
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.3906415365938764
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.44244772638735347
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.3629944944697668
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5713834131825314
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.39874839531459466
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.3359977324263039
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.4305788513381019
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.46343334374251277
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.45508480503584553
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.24651576711552803
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.36981497185070983
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.5666618234843734
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.2420320329702607
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.3458483931206892
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.43590838051817093
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.5176671720617656
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.3554299482098288
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.5399167524341886
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.32918280841495845
}
}
},
"Qwen2_VL_72B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.49787264809826687
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.5439010430283516
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.5392244859385411
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.509277882172206
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.3776739609562984
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5676817981386025
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.60496992481203
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.4633019068994453
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.35105970797600183
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.2201150812944581
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.5402397677488632
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.4289777675393297
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.42094543671351287
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.49943888306036405
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.507967430369507
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.49789939867591104
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.36212605501536715
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.44719815365440824
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.4500902736468407
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.5098505660529429
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.4027115384266939
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.5157810622684265
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.5199940976484408
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.3100812547241119
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.5468722850464449
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.4918205178721877
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.49943888306036405
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.36691704884033916
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.45176098055218655
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.5807658773593334
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.31245958897213383
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.4372517645050852
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.5362106489630868
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.4968249101570037
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.4488852456563113
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.5166939389651373
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.31157492395100744
}
}
},
"Qwen2_VL_7B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.3708368629321668
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.40213773918065815
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2511,
"tasks": [],
"average_score": 0.4034335110538307
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2469,
"tasks": [],
"average_score": 0.4109909230944937
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.2818925976996871
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.49360878418945336
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.5215889724310777
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.33309401517140946
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2439,
"tasks": [],
"average_score": 0.27564756843599875
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.1473690605854188
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.3821046882337143
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.2896392967775049
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.3223325179806271
},
"Videos": {
"count": 43,
"num_samples": 700,
"tasks": [],
"average_score": 0.4111189310485516
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.34825121621909577
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.40660144920567376
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.262166593895899
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.3430730210869785
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.3426196933687219
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.35162604166912687
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.32665673520415817
},
"open_ended_output": {
"count": 80,
"num_samples": 1456,
"tasks": [],
"average_score": 0.3909745200389741
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.39898011714302023
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.19415154950869234
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.37453319457428763
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.37701588079136955
},
"video": {
"count": 43,
"num_samples": 700,
"tasks": [],
"average_score": 0.4111189310485516
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.26429868057315387
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.33008667136891007
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.42746758545520747
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.2003871750665659
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.3270187644950453
},
"Perception": {
"count": 145,
"num_samples": 2315,
"tasks": [],
"average_score": 0.40048749993497734
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.4245693009859056
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.29880557491654197
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.4276637093173368
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.25562039051316643
}
}
},
"llava_onevision_72B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.3615741356043519
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.2834675874668524
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.3674817002808495
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.42146038539739283
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.2951434804409883
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.478119286755779
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.6005438596491229
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.31663222188988865
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.29633645022129285
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.13872280436872364
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.23380046931752074
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.2126914943750874
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.34566020099204997
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4446001874842145
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.4401364830377099
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.4247591719013819
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.23897262553543516
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.2868275930712835
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.259450238500612
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.370724080249463
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.3065719940769206
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.4293132525502993
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.3986052416087927
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.20730347694633405
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.28104747671521785
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.34840850032295206
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.4446001874842145
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.25013213032747944
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.34156793747875674
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.3076421844825067
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.18168666652660437
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.23240790940031927
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.38362780453378204
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.4807891958712894
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.31702495228966576
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.4358874880224115
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.31588468105075895
}
}
},
"llava_onevision_7B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.2524786809911341
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.19077168655703208
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.2555444562659206
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.29981286990552625
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.18973491465938852
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.36842322314565323
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.44998746867167916
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.2445135206648208
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.21802943568344288
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.06658775725427067
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.1466861610319767
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.13297395577964055
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.24236719143449742
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.30985943541023103
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.3199731020402028
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.3263378734842879
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.13043163858789789
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.20277804188944173
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.18291595756285564
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.25384794412815426
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.2200472229099345
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.3127341248874411
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.2802999516721972
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.1476473922902494
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.13803800801858385
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.2548084764084038
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.30985943541023103
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.1778991941079372
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.2410111891690358
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.19283211154717242
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.09846926279075068
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.15189414475467605
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.28505205882578405
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.3600079950628582
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.23654776813656775
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.3271805711561501
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.22080546908673507
}
}
},
"InternVL2_76B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.38193012983650343
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.41315219763443384
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.43665980552577693
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.4265623936500962
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.2975890791763991
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.5257990949897898
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.5779473684210527
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.33287081421166276
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.2949505390920417
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.17036496432397477
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.3634339625985008
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.31396468806559114
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.3473756113126343
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.395893002855977
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.44982107744035305
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.42875248733027654
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.2868239162778749
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.3630499545707523
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.3476691827105281
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.3943337471922549
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.29244088978470345
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.45822072478616577
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.3879326330400817
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.20309901738473166
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.34771123515123364
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.4145693044465943
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.395893002855977
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.24403942809507134
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.3153417935059416
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.4306947454508794
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.2132321995754061
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.2953329718984368
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.42202934355552685
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.47409276729986083
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.30014798153766264
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.4625649385962016
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.2868813944130515
}
}
},
"InternVL2_8B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.2817247716997634
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.280559214034858
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2511,
"tasks": [],
"average_score": 0.32020728060179815
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2469,
"tasks": [],
"average_score": 0.325593535916075
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.24118253695139918
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.39684007367798446
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.4700852130325815
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.27052668526005397
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2439,
"tasks": [],
"average_score": 0.23189345356483618
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.08260405712900723
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.22800928556370195
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.2013779290163996
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.2804429603269583
},
"Videos": {
"count": 43,
"num_samples": 700,
"tasks": [],
"average_score": 0.34791358240562653
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.2942163420306113
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.3388056726588417
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.10933317885944857
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.250804626773504
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.2522493284864019
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.27414636444623874
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.22381302045502052
},
"open_ended_output": {
"count": 80,
"num_samples": 1456,
"tasks": [],
"average_score": 0.3537549824897016
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.30261189962428353
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.15434618291761149
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.19872104324302098
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.30088711082969344
},
"video": {
"count": 43,
"num_samples": 700,
"tasks": [],
"average_score": 0.34791358240562653
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.17725087609332119
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.2532272454839157
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.29129840423784176
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.12166926715781588
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.24700310231619527
},
"Perception": {
"count": 145,
"num_samples": 2315,
"tasks": [],
"average_score": 0.3214666523378005
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.3995660275981844
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.24614711281861912
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.3393895915929317
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.22078333222564453
}
}
},
"MiniCPM_v2.6": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.2604967101191775
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.2500331562865158
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.3003169369011028
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.31808748114668184
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.18281637763548025
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.40732197204308807
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.48798245614035085
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.23723675736151562
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.1968926733821904
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.08735883237069725
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.21195711598986072
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.18639148159043903
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.21578309681746147
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.3527537836840162
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.3096882575625531
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.3176880312524649
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.0755920550038197
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.23506388020592064
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.1781127776443048
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.2551275278138797
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.20833171754655547
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.36473950920880716
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.293386806641223
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.13955971277399848
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.23596215721092323
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.26319603880798287
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.3527537836840162
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.17888270664238365
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.22288558250834017
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.2666989364424082
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.11693267119342445
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.15342045420318667
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.29243044121840894
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.3777897246686755
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.25714862989687987
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.33187729423141027
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.16493399805627715
}
}
},
"Phi-3.5-vision": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.2551037902226636
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.2483252111012436
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.28732942108098564
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.3049602749093698
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.21653804346780042
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.36823084724842464
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.46663157894736845
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.24145330077248778
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.2154692063816354
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.08944481289041872
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.1865974025588298
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.17497379027990792
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.26053460127801603
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.24669318645450836
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.2786226802221388
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.3413768635559215
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.15444746077692828
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.2177924712685756
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.21443984349574025
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.2572371188897671
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.21409351002477045
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.365192668303297
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.25960269434727634
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.12546296296296297
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.14337869666229008
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.27790147494714373
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.24669318645450836
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.20168001345379397
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.2850550871176333
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.2237087834389946
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.08928724806836039
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.219367263034246
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.316318567258608
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.3945898792928062
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.21925278489551242
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.33264696401038385
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.17575913004138646
}
}
},
"Pixtral_12B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.3460288961410444
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.3777640755922415
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.38299418297106824
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.3776722463473817
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.2828575553466608
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.419071767659191
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.5687919799498747
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.32813540763467464
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.2677293131171651
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.10591240329992047
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.3070067338940785
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.28832738144368647
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.3223299098375932
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.409643099998057
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.37450808136321684
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.37115973962368864
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.24009431093278263
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.3078181788009137
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.3188475653127356
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.3639544140938305
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.32073418701669026
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.4166613092238043
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.3008126415966517
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.19743008314436883
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.16642294307267227
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.37108130557306335
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.409643099998057
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.2575699315401612
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.3104621543981899
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.4300741596942578
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.13622980866275425
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.2572414987500377
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.3892097218585385
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.5020540387409291
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.31301986568151985
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.3809515410188075
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.24222628640267738
}
}
},
"Llama_3_2_11B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.1907604552173455
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.14328677752263275
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.19646404502647707
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.22399113135844315
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.13303760019716085
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.323153603297999
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.4260501253132832
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.1770852858056774
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.15366454315378308
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.06563884729522687
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.11886347847341794
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.11489351406848371
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.1693681214060816
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.2123769209846321
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.2520175802062012
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.2485354956932213
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.06418655520777307
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.12417283740525839
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.16374180545556977
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.1576236804437753
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.15014439824913947
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.3003142292328822
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.19270157739425633
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.1463246409674981
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.0732004839476103
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.1960107191983825
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.2123769209846321
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.1351857051327849
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.18586695387250338
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.17288724679416761
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.08100042975820579
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.0575426944971537
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.19899465185565898
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.254316961351997
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.162801811963855
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.28055776664538923
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.13937853323074623
}
}
},
"Idefics3": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.14507788965553362
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.11641535161320743
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.17255583910766542
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.14745217246476708
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.1331851390883708
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.19221534222332276
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.28640852130325817
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.17906399043310475
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.10192930055370109
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.04211916597550756
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.10126271262360581
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.11407926733108291
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.16225217317782772
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.16181866973635636
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.1839408679813373
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.14933801491626408
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.0395540896656236
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.13979628998424784
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.1062779093260333
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.07053056796593082
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.09790172378722654
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.2987797010800956
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.11588163814170001
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.1008692365835223
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.09308121224497533
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.14757589734485796
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.16181866973635636
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.12217834249866026
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.12276246278377517
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.14743542163139847
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.05354869594691955
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.09065540194572455
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.1463280929280822
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.14564374862578883
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.22748773785486257
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.17647756032677067
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.13168972973651977
}
}
},
"Aria": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.3264829094772722
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.35712138797286674
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.4004806395853317
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.3783082688258977
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.27628131703993153
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.4942870225393938
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.5811228070175439
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.3279996334048362
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.2481896092177717
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.11945216302285933
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.2830308005758272
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.27833423130489043
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.32371820359400666
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.42875359425696014
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.3612041984219992
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.37290568595471846
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.19554976321164697
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.3092653492193887
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.3043751656077328
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.2930015244066511
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.3092167834876797
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.4523860109667709
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.3277812604542708
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.21139455782312927
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.2711617723374526
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.3576735443060994
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.42875359425696014
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.19839956701033565
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.27267126872569447
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.38321397541649777
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.14301905320436192
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.2849545194421855
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.3779947327886569
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.39678729061309725
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.29682445889316517
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.4096377585306089
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.26194160419181234
}
}
},
"NVLM": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.24033557047857043
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.32154059695494047
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.2937052996171993
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.22845955700594492
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.2639741933075709
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.40870864071047447
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.4555238095238095
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.25785191641267197
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.15679681195908274
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.0672259242345112
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.23922823287047076
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.21734036617042948
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.30313485498585124
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.0
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.34726189956094355
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.3264757655296162
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.056894830390305184
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.22868389095927066
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.2788963949121424
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.2787764976961992
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.23349712171444964
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.3215948035793096
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.18487055428231897
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.0
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.0
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.3680809151131777
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.0
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.03838410364145658
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.2325581694709435
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.22773778915303383
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.08048160660797504
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.2390024647851972
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.30211261814126533
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.18857142857142856
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.24908307640275493
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.3724877947012685
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.24529601154794037
}
}
},
"InternVL2_2B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.14491178903291552
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.12126906675624163
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.16912754929321935
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.18542274192083463
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.13923308734553164
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.23992252224543772
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.3420927318295739
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.14807577209152425
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.13036555933925006
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.01727799227799228
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.057021136657850864
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.10504085961245285
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.1625198552182714
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.18999779001767986
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.1487677475708977
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.2011727338536935
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.11886936592818943
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.1131404778887607
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.05739750616837997
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.15465451663650032
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.16044698450090833
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.21429521387724249
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.2128614316540013
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.03658352229780801
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.05757839721254354
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.15225683687839608
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.18999779001767986
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.17677460549936644
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.158165588340436
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.08722661966805
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.04102853815875594
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.11264043251709285
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.17001758160301803
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.3332891958712894
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.1686125516807394
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.21169137106199268
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.10975764217070672
}
}
},
"Qwen2_VL_2B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.22236161923122505
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.23701014663017753
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.25669221785292334
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.26526414975225454
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.17623548305581763
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.31250702198481506
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.4140676691729323
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.20802820480076603
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.17320633068307653
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.06209506566980099
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.190837839372028
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.16287824421269087
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.19640906475019812
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.2520741776922928
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.24883076673424442
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.2877316297453947
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.13398525561847363
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.1624451002757208
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.20960092816529263
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.19986806708136184
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.2201024015934558
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.30248748033122763
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.256631742010999
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.07681405895691609
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.10526691703628158
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.25018977062352593
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.2520741776922928
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.17435940889565366
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.21286783416184518
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.2521972668785968
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.06967138760493456
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.16996250112948405
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.27603334911345223
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.31002436092347696
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.21061929716065056
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.2656728023444808
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.16356158787929762
}
}
},
"Aquila_VL_2B": {
"skills": {
"Object Recognition and Classification": {
"count": 303,
"num_samples": 4755,
"tasks": [],
"average_score": 0.18420666660337692
},
"Text Recognition (OCR)": {
"count": 137,
"num_samples": 2239,
"tasks": [],
"average_score": 0.12395530240359122
},
"Language Understanding and Generation": {
"count": 154,
"num_samples": 2509,
"tasks": [],
"average_score": 0.17924536722051596
},
"Scene and Event Understanding": {
"count": 154,
"num_samples": 2467,
"tasks": [],
"average_score": 0.220108610660707
},
"Mathematical and Logical Reasoning": {
"count": 109,
"num_samples": 1910,
"tasks": [],
"average_score": 0.1680749869910155
},
"Commonsense and Social Reasoning": {
"count": 51,
"num_samples": 855,
"tasks": [],
"average_score": 0.26630477322766793
},
"Ethical and Safety Reasoning": {
"count": 15,
"num_samples": 245,
"tasks": [],
"average_score": 0.35152130325814535
},
"Domain-Specific Knowledge and Skills": {
"count": 77,
"num_samples": 1386,
"tasks": [],
"average_score": 0.1857154485444521
},
"Spatial and Temporal Reasoning": {
"count": 152,
"num_samples": 2437,
"tasks": [],
"average_score": 0.1616397700608881
},
"Planning and Decision Making": {
"count": 37,
"num_samples": 577,
"tasks": [],
"average_score": 0.044513236949565
}
},
"input_format": {
"User Interface Screenshots": {
"count": 93,
"num_samples": 1517,
"tasks": [],
"average_score": 0.07480350331940272
},
"Text-Based Images and Documents": {
"count": 82,
"num_samples": 1294,
"tasks": [],
"average_score": 0.11444110320621242
},
"Diagrams and Data Visualizations": {
"count": 101,
"num_samples": 1718,
"tasks": [],
"average_score": 0.19412275574929044
},
"Videos": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.21367350061199514
},
"Artistic and Creative Content": {
"count": 32,
"num_samples": 541,
"tasks": [],
"average_score": 0.19717811128156643
},
"Photographs": {
"count": 143,
"num_samples": 2248,
"tasks": [],
"average_score": 0.24620947964695974
},
"3D Models and Aerial Imagery": {
"count": 11,
"num_samples": 169,
"tasks": [],
"average_score": 0.10131259529340846
}
},
"output_format": {
"contextual_formatted_text": {
"count": 98,
"num_samples": 1514,
"tasks": [],
"average_score": 0.11925340914357861
},
"structured_output": {
"count": 110,
"num_samples": 1714,
"tasks": [],
"average_score": 0.123417109500157
},
"exact_text": {
"count": 83,
"num_samples": 1278,
"tasks": [],
"average_score": 0.18474924824567768
},
"numerical_data": {
"count": 49,
"num_samples": 862,
"tasks": [],
"average_score": 0.19908864029107046
},
"open_ended_output": {
"count": 80,
"num_samples": 1454,
"tasks": [],
"average_score": 0.23278612647548963
},
"multiple_choice": {
"count": 85,
"num_samples": 1363,
"tasks": [],
"average_score": 0.22108484223035305
}
},
"input_num": {
"6-8 images": {
"count": 21,
"num_samples": 314,
"tasks": [],
"average_score": 0.11057256235827662
},
"9-image or more": {
"count": 41,
"num_samples": 623,
"tasks": [],
"average_score": 0.011631871744697361
},
"1-image": {
"count": 315,
"num_samples": 5228,
"tasks": [],
"average_score": 0.18240049845355885
},
"video": {
"count": 43,
"num_samples": 698,
"tasks": [],
"average_score": 0.21367350061199514
},
"4-5 images": {
"count": 34,
"num_samples": 520,
"tasks": [],
"average_score": 0.1898373110613516
},
"2-3 images": {
"count": 51,
"num_samples": 802,
"tasks": [],
"average_score": 0.23274180707905315
}
},
"app": {
"Information_Extraction": {
"count": 72,
"num_samples": 1124,
"tasks": [],
"average_score": 0.09484068019620011
},
"Planning": {
"count": 78,
"num_samples": 1239,
"tasks": [],
"average_score": 0.05864269260897992
},
"Coding": {
"count": 31,
"num_samples": 474,
"tasks": [],
"average_score": 0.13323092677931386
},
"Perception": {
"count": 145,
"num_samples": 2313,
"tasks": [],
"average_score": 0.20714098741611
},
"Metrics": {
"count": 20,
"num_samples": 309,
"tasks": [],
"average_score": 0.2932627505936196
},
"Science": {
"count": 29,
"num_samples": 574,
"tasks": [],
"average_score": 0.21075421274487907
},
"Knowledge": {
"count": 97,
"num_samples": 1605,
"tasks": [],
"average_score": 0.24110595572817994
},
"Mathematics": {
"count": 33,
"num_samples": 547,
"tasks": [],
"average_score": 0.20711160718581811
}
}
}
}