diff --git "a/static/eval_results/Core_SI/all_model_keywords_stats.json" "b/static/eval_results/Core_SI/all_model_keywords_stats.json" new file mode 100644--- /dev/null +++ "b/static/eval_results/Core_SI/all_model_keywords_stats.json" @@ -0,0 +1,4952 @@ +{ + "Aquila_VL_2B": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.23416866746698675 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.06443618301582155 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.2513377943770101 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.19099680045595863 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.39206349206349206 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.2940001351324067 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.07183931981868254 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.3048902830669748 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.10279080594456047 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.24908402989678843 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.22336185704360972 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.3380105010630228 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.09444637122168677 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.10866738911340403 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.20770364903712496 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.17545286840086055 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.1884228017877996 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.43875114784205704 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.23519563962981577 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.15555555555555556 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.13666905933371165 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.4535581601061477 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.20244786476321197 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.6214285714285713 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.23268010607606476 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.197538439362422 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.22671604990393424 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.04052406201684317 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.3313749317545069 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.17863891296838966 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.15590817580226335 + } + } + }, + "Aria": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.3712276577297585 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.32453947235749425 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.364563525052646 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.22745674367681176 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4142857142857143 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.41491324204854596 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.09191879646828911 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.3605432412199329 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.04960831797041802 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.3216854905016703 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.30519297350169805 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.4343994309016504 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.24229546394440749 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.25980402737696456 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.31788827761478883 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.31023926514469913 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.2579833154471113 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.4787572696663607 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.3082165471908181 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.22222222222222224 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.3193930637611741 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.5125098792377363 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.32296445873568824 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.7642857142857142 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.3519759933836403 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.2933246830594777 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.3208911078847184 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.05444727731376161 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.43720680235135884 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.2153170777389514 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.34797879518907177 + } + } + }, + "Claude_3.5": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.601982459650527 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.6730954422754497 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.4902441959564851 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.4760293511799448 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4174603174603175 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.5816376503416976 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.30455064781597585 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.5354477101845522 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.34512576094802216 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.5491299615821561 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.5432583400280819 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.5007954666401646 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.46298701236975 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.559621931609799 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.5202763858774847 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.5108627782344681 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.5112348724553849 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.6164633346451529 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.4712835541311676 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.32222222222222224 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.551879400614103 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.5507139652199553 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.5186882904104265 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.6142857142857143 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.5357523919613602 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.4992470979095547 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.5455539344116468 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.1679041371346854 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.5862330985108403 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.4449245232910409 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.6133373652565922 + } + } + }, + "Claude_3.5_new": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.549453114579165 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.6675228339123135 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.5203487670246788 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.4831956110227109 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.6285714285714286 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.624434009429481 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.3446127005975698 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.5779398170751554 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.3619606028475468 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.5262763464518933 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.5745052612958822 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.5580994890913593 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.45534366261534626 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.5822097189631597 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.5462752278980763 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.5253512331959443 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.5300876558354416 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.6380252743889108 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.5106873710119535 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.5111111111111112 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.560591278825322 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.5867039401572987 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.5312758177535931 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.5571428571428572 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.5901461883499776 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.5119288166310395 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.5780665618037544 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.20949111732140704 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.6000158007956335 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.48461955984560184 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.6175034056671596 + } + } + }, + "GPT_4o": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.5674353074563158 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.7449866017749027 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.573927179058758 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.4387500704123191 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.626984126984127 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.6239356613266711 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.29104981532971697 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.537217258514251 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.3427989299648589 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.5579401936213082 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.5163717756815422 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.5947367553407275 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.4860836138929036 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.6119906537905739 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.5529953662872721 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.5437570369132202 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.6016521462358102 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.6204512659058113 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.4632537848154335 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.588888888888889 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.5336024331857716 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.6320445134575569 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.5438239765705489 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.7071428571428571 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.6277899249430077 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.4640760968604708 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.5809362181268165 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.1805270108227497 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.665877251472041 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.41908034223621293 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.6376827603266417 + } + } + }, + "GPT_4o_mini": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.443485727624383 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.5454125089849866 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.5184849378347831 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.33759329198549914 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4873015873015873 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.5187731762548257 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.19712054228287026 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.4038829487137757 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.1194248916897328 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.4391158991976564 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.3844074585777545 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.5358636141991402 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.3630310056937025 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.4840322431234019 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.4428597096479722 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.38792580072361127 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.42048902061937554 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5559184922821285 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.3777213713726476 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.5500000000000002 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.47435690264120933 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.5783744567621099 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.43053604973662485 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.8428571428571429 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.5269878554658799 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.3626500426533 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.47288058334043664 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.12353036435088108 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.5418846117881347 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.31581509437872207 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.49008961222480013 + } + } + }, + "Gemini_1.5_flash_002": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.4472997532346272 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.4276350622249924 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.47845517820513545 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.33330909636235384 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.5095238095238095 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.530000094475405 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.18249617437154458 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.4440128188436459 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.3555116262572404 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.4593977374231426 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.3949868283785861 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.556499641004488 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.30950715243638166 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.3739545635068347 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.4218846086557438 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.37400486007598105 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.3994332512947306 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5646552101097555 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.377682596312313 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.4388888888888889 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.4369798808291258 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.6031775450454782 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.41717531733177465 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.7857142857142857 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.4844309936962728 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.34646045301796197 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.4582611096104647 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.11218235861431454 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.540048519545808 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.34942814354037294 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.41692722021235956 + } + } + }, + "Gemini_1.5_pro_002": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.46775793650793657 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.5032228681487538 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.5596947945845004 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.4200866579901879 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.48888888888888893 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.5792532588171437 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.2794371119471525 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.5115315344638652 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.3592697030984118 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.5698942838229203 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.49826758771614266 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.5721361134218315 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.39137138784541126 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.44574943991897686 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.49143110382294064 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.4422929828689221 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.4731762319443037 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.6245091608727974 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.48334866543174226 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.4777777777777778 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.49392496558558086 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.6175798311315372 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.4992744432046732 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.7857142857142858 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.5503578518411129 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.441698322015751 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.5194827632845835 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.1658085273639766 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.5911773766863718 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.3911858251350186 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.48929358587759453 + } + } + }, + "Idefics3": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.11274092970521543 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.01436895111355141 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.05696236096777891 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.07952603609985566 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.19999999999999998 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.1495053964440067 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.0384781807459826 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.18779486595276068 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.020659062938075456 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.1115661125131167 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.13230086580589415 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.11996266708437761 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.048897502780851865 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.0228057597307929 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.08941182847569332 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.14297736485125095 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.053829016986911726 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.13726004635095543 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.10744987600153451 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.19444444444444445 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.020330265112257467 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.10407148385670058 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.09930339554952557 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.014285714285714285 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.10780916069162783 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.1246530847322073 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.10574821858751979 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.006666666666666666 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.06960589055393365 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.08524217126909336 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.054111731779166165 + } + } + }, + "InternVL2_2B": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.07919417767106841 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.012603881639063218 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.17097692715812768 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.12084183290294437 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3428571428571428 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.17431616833933153 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.041741486068111455 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.1533391378880101 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.07184873949579831 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.11422643876952225 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.1464322629599411 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.19633237004094392 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.0766182560089734 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.031121661849216026 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.12069001041308772 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.12223256337656033 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.11544832152620972 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.3044601862783681 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.12291071957107838 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.10000000000000002 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.03926707679962924 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.301796494913755 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.07179874628459325 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.41428571428571426 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.1484091935544191 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.13757858462183295 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.11615946858915803 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.012857142857142855 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.18936747660419526 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.12188347297192939 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.0987293040895875 + } + } + }, + "InternVL2_76B": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.4169042617046819 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.45203503822029745 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.4248419603383912 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.3075073077960568 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.5301587301587302 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.5182024112478909 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.1533748520430929 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.3987637383502045 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.30169355252977215 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.43282638648854793 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.3745248011696646 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.4871142261940827 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.31455105169242936 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.38932828375916556 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.39986165680187585 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.3914730115474357 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.4041893680050902 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5389260571078752 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.34950523809271744 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.19999999999999998 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.4005263920643573 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.5696598273176547 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.3522928014906122 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.7 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.4210043056481755 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.33651969189064057 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.4158058468372138 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.09033733364601888 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.5299708150707814 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.32548714710035676 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.45651605400420087 + } + } + }, + "InternVL2_8B": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.34619264372415637 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.2367141923041484 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.31973480587816666 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.23531351908862871 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3253968253968254 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.36204347589787156 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.0876690426244146 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.30852051665961444 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.0503849634147267 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.28741358772611214 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.2683845408567845 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.3914842713520135 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.200010386803836 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.2113402664720869 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.2765061240182559 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.25574229876669874 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.2452385560845516 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.4334863789409244 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.26248166960198344 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.13888888888888887 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.27775784786335034 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.46671099904492686 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.2796065136069559 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.6571428571428571 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.2913674981438704 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.2532144596862197 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.2844365958708302 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.04997623112502865 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.3949129609309493 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.21676083385513684 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.28183434902647947 + } + } + }, + "Llama_3_2_11B": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.11167800453514738 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.2259373314221925 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.2971413316788703 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.12719796356144183 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.22857142857142856 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.25035819950825267 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.08934893521326495 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.19909633390836395 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.14822411270107955 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.24691637232808508 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.17286578923047669 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.3093952296240606 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.11249921833148997 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.1977763930597233 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.207891449607965 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.15137399357321155 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.16571305203663964 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.3762691853600945 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.16301171403498463 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.18333333333333335 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.24213134587284754 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.3843196346044139 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.20989950016321096 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.4357142857142858 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.2193924484034085 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.13593325121178748 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.24187586172942716 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.052164272356217546 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.3291153381826294 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.15368343143034857 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.18558479922921525 + } + } + }, + "MiniCPM_v2.6": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.1890714619181006 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.19295028180165177 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.2874237632024869 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.14556723677922526 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3507936507936508 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.33667469770557334 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.06140103429890864 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.2637176816500125 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.012567281814686655 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.26600737302144806 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.21237708196573646 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.3434348615389641 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.14945199789110994 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.1862927985498053 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.23230765810722806 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.2360053480825224 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.21540007432647457 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.39586776859504136 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.2036075191422558 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.12777777777777777 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.19960274145757392 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.46470984446410135 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.20879672103577454 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.6642857142857144 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.24241871024454942 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.17749280318127247 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.2515552313492125 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.040364725239421285 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.3535352281999884 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.16473079763168127 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.24499669754871362 + } + } + }, + "Molmo_72B": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.25349306389222354 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.4948574349589178 + }, + "Knowledge": { + "count": 54, + "num_samples": 828, + "tasks": [], + "average_score": 0.3803488932935097 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.2954490282663994 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.6714285714285714 + }, + "Perception": { + "count": 73, + "num_samples": 1106, + "tasks": [], + "average_score": 0.4501314837168729 + }, + "Planning": { + "count": 40, + "num_samples": 623, + "tasks": [], + "average_score": 0.1243838738723015 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.37452543696904594 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.18757766329699532 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.35536275913874316 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1258, + "tasks": [], + "average_score": 0.3449361031266481 + }, + "Photographs": { + "count": 72, + "num_samples": 1090, + "tasks": [], + "average_score": 0.44745679758989315 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.23366368937907964 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 928, + "tasks": [], + "average_score": 0.40668864867102084 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4331, + "tasks": [], + "average_score": 0.3648000060938494 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.3040006051089098 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.36443166533642163 + }, + "multiple_choice": { + "count": 33, + "num_samples": 552, + "tasks": [], + "average_score": 0.5421225239407056 + }, + "numerical_data": { + "count": 39, + "num_samples": 679, + "tasks": [], + "average_score": 0.3342330361070466 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.2666666666666668 + }, + "structured_output": { + "count": 70, + "num_samples": 1080, + "tasks": [], + "average_score": 0.3692089646353472 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 308, + "tasks": [], + "average_score": 0.4743720714830556 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.30030823960058817 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 60, + "tasks": [], + "average_score": 0.5142857142857142 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1112, + "tasks": [], + "average_score": 0.38758498245184864 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1469, + "tasks": [], + "average_score": 0.30054068714083426 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.4179136192086295 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 295, + "tasks": [], + "average_score": 0.06535561286776892 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.5549412690785723 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1167, + "tasks": [], + "average_score": 0.28924619567527443 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1488, + "tasks": [], + "average_score": 0.401901741446779 + } + } + }, + "Molmo_7B_D": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.12124433106575963 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.2532202578113846 + }, + "Knowledge": { + "count": 54, + "num_samples": 828, + "tasks": [], + "average_score": 0.23968959822395494 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.1740048655548875 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3619047619047619 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.28659408373421147 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.05370374786207103 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.16930936648981762 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.09043432702433757 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.21600371830878892 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.2072476551410762 + }, + "Photographs": { + "count": 72, + "num_samples": 1090, + "tasks": [], + "average_score": 0.28994325546730004 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.11851171195131012 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.19126906534081203 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4362, + "tasks": [], + "average_score": 0.20980884469925187 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.16547511463023942 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.17907829453546903 + }, + "multiple_choice": { + "count": 33, + "num_samples": 552, + "tasks": [], + "average_score": 0.3169618260527351 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.22086240998395923 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.19444444444444445 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.2200598160693021 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 308, + "tasks": [], + "average_score": 0.3029780373247698 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.1823343958488806 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 60, + "tasks": [], + "average_score": 0.2857142857142857 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1112, + "tasks": [], + "average_score": 0.21471894345249354 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.17170235491904917 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.23690791881784 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.01699865713149676 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.32222325026322907 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.1903222995259856 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.2087480533696987 + } + } + }, + "NVLM": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.3750125050020009 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.3402401105071423 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.3518381988922923 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.2786818799518423 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3047619047619048 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.43083653890768137 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.11644208162732136 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.35800290770215587 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.09447890526012262 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.35722209936606236 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.306670759110608 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.41082237253037907 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.2904940355621133 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.29235169318945975 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.3298987289092603 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.3220928699225652 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.29287492253780084 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5016004197822379 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.28793758479482745 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.14444444444444443 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.3411858782746986 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.5058549907377139 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.29339312128564665 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.6571428571428573 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.3591191655954516 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.2986577481366766 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.3255697086744965 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.057572626740723686 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.4167091954582625 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.2641033524048475 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.3860451654290392 + } + } + }, + "POINTS_7B": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.31870664932639714 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.2068087770328232 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.25991650217591905 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.19633555542091463 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.34761904761904755 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.37504978632490477 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.08085212312197398 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.2768663813212685 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.1499797713556708 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.2874577127032444 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.25256889745414135 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.34652988826051795 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.18660445129701578 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.1948102787931022 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.25511317681632323 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.23260678658479111 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.239982641771955 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.4200183654729108 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.23646374895042882 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.1944444444444445 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.23041771883702364 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.43646018418809185 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.21129352241857768 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.4 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.2753345859057553 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.2237494997186118 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.277860337866304 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.061210172184118075 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.35929175265237484 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.22070936301030905 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.26132652140685175 + } + } + }, + "Phi-3.5-vision": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.3101740696278511 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.16746168583214033 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.2974341707118193 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.19473774010136682 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4142857142857143 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.3679586648204004 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.06791094945069898 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.3162754288318198 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.04423070234557053 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.2551644754909188 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.26227969504938176 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.38504695345209006 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.16022684584297756 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.17345163363650537 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.2561274958722834 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.21643352778754282 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.23002523914604356 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.46076785167694245 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.20335546763980886 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.20555555555555557 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.25413742870032724 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.47288368379388357 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.25307365356071676 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.6428571428571429 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.27590556918981346 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.22499064137767294 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.2690117230770391 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.03827978594637614 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.3881671022150741 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.2056777907758279 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.24589821102196616 + } + } + }, + "Pixtral_12B": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.3631786047752434 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.40454696979466326 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.35812945808602836 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.27839183583970506 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3444444444444444 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.4417682407813345 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.11628057748090181 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.3671156245592336 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.11048396896880823 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.31856430910857636 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.3523947190701712 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.4096154838343029 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.2520839676436745 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.34149524135650317 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.3436942439614409 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.3154592581738141 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.29510067482559116 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5076172985263894 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.3135393276021012 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.25 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.36385587859693563 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.4550597080937638 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.33534904302374446 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.5857142857142857 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.3623285456380049 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.3010808833549744 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.36531612449850126 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.07498375558099864 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.42516945737046213 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.2803352064645702 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.3821628723878419 + } + } + }, + "Qwen2_VL_2B": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.18917150193410698 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.20283025008122135 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.25792999769758446 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.16147424237969243 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.36984126984126986 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.32389533491247047 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.07757607199536717 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.25114159858520757 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.09293971931071163 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.18729615169081051 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.22619068053001246 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.3494210357937378 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.11498664481056947 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.18853887878466394 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.2278790697324484 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.16334308041768983 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.17061075451792151 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.42328479601206864 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.23904036592289388 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.21666666666666667 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.23525613577355753 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.40420075925106874 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.21187230462053547 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.5857142857142857 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.23917557274631937 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.1920835472074626 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.2514970984841806 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.06190044857572624 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.3560250849009433 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.19097312901223737 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.20852012355101548 + } + } + }, + "Qwen2_VL_72B": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.4641064759237028 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.629346044663391 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.468515925715788 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.3355316008767396 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.39365079365079364 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.6000216138624136 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.22828401815412466 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.43109779318425934 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.18309869697155778 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.47402685016214946 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.4280210143260369 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.5145439268296668 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.42667661800594775 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.5304162957314315 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.4730536307784527 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.46340863503678986 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.4608929319719144 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5851458306003763 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.44066773476234555 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.2888888888888889 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.4882698282232416 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.6024216992288907 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.4199287684648145 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.7571428571428572 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.5221626669916383 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.39465370086540064 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.4924413707801089 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.13490955025936763 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.5714161208154724 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.36532364382699006 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.5630364005339461 + } + } + }, + "Qwen2_VL_7B": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.36349122982526344 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.43505672396192463 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.3903343291507636 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.25874500882297563 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3507936507936508 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.46282652711560157 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.11909481878241916 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.3053160227408348 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.167887099917599 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.3393142086555984 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.3165990212923503 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.45313997258463495 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.27275584408561765 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.3579776350901385 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.3538656561495699 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.3328740702343424 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.3068323820854221 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5064978792251521 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.31569247186288174 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.18333333333333335 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.389303565405027 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.5335132027639431 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.3137857132588446 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.6928571428571427 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.3803052907797739 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.2842846280141439 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.377268485705059 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.06257172449572052 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.5000864392006293 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.2747388531252317 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.3937029859209358 + } + } + }, + "llava_onevision_72B": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.3016498265973056 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.2061370321936942 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.395935061646491 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.30843360355217414 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4857142857142857 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.3898572114956503 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.13782860942097938 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.3509486649524244 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.15000864315905132 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.4400683957793645 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.324519320813171 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.44729619280084937 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.1765737289659373 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.20867978327013104 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.31261824262126414 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.2625645850195834 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.3154587757748795 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5216100397918579 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.29549573982348826 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.20555555555555557 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.28400705589012487 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.562053775743707 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.3303197144600285 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.7071428571428571 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.33599293561944077 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.28702982191513 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.34442681429731226 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.07183763151045863 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.45012905165476186 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.2799559435189789 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.24698893836676314 + } + } + }, + "llava_onevision_7B": { + "app": { + "Coding": { + "count": 14, + "num_samples": 219, + "tasks": [], + "average_score": 0.18737911831399226 + }, + "Information_Extraction": { + "count": 39, + "num_samples": 585, + "tasks": [], + "average_score": 0.1138353380467867 + }, + "Knowledge": { + "count": 54, + "num_samples": 843, + "tasks": [], + "average_score": 0.2961238346865711 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.19520567001898761 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.5126984126984127 + }, + "Perception": { + "count": 73, + "num_samples": 1121, + "tasks": [], + "average_score": 0.3390813909897151 + }, + "Planning": { + "count": 40, + "num_samples": 639, + "tasks": [], + "average_score": 0.09623320827602701 + }, + "Science": { + "count": 20, + "num_samples": 428, + "tasks": [], + "average_score": 0.28027045427421365 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.1783310257200802 + }, + "Artistic and Creative Content": { + "count": 17, + "num_samples": 273, + "tasks": [], + "average_score": 0.3293728430484681 + }, + "Diagrams and Data Visualizations": { + "count": 77, + "num_samples": 1273, + "tasks": [], + "average_score": 0.2386719125595502 + }, + "Photographs": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.39420179001028427 + }, + "Text-Based Images and Documents": { + "count": 48, + "num_samples": 752, + "tasks": [], + "average_score": 0.09192407874399001 + }, + "User Interface Screenshots": { + "count": 57, + "num_samples": 944, + "tasks": [], + "average_score": 0.13205078771812961 + } + }, + "input_num": { + "1-image": { + "count": 273, + "num_samples": 4377, + "tasks": [], + "average_score": 0.23683339637631795 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 62, + "num_samples": 960, + "tasks": [], + "average_score": 0.19282449186006576 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.2490174433570946 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.43553281735099914 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.22047389017098817 + }, + "open_ended_output": { + "count": 12, + "num_samples": 180, + "tasks": [], + "average_score": 0.1888888888888889 + }, + "structured_output": { + "count": 70, + "num_samples": 1096, + "tasks": [], + "average_score": 0.18955247268768835 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 20, + "num_samples": 323, + "tasks": [], + "average_score": 0.46845569806870113 + }, + "Domain-Specific Knowledge and Skills": { + "count": 39, + "num_samples": 736, + "tasks": [], + "average_score": 0.23753715388335023 + }, + "Ethical and Safety Reasoning": { + "count": 5, + "num_samples": 75, + "tasks": [], + "average_score": 0.5357142857142856 + }, + "Language Understanding and Generation": { + "count": 74, + "num_samples": 1127, + "tasks": [], + "average_score": 0.2531191596438906 + }, + "Mathematical and Logical Reasoning": { + "count": 85, + "num_samples": 1484, + "tasks": [], + "average_score": 0.19409364599495751 + }, + "Object Recognition and Classification": { + "count": 157, + "num_samples": 2422, + "tasks": [], + "average_score": 0.27815212199492234 + }, + "Planning and Decision Making": { + "count": 20, + "num_samples": 311, + "tasks": [], + "average_score": 0.06387490099846409 + }, + "Scene and Event Understanding": { + "count": 47, + "num_samples": 704, + "tasks": [], + "average_score": 0.40022615806851775 + }, + "Spatial and Temporal Reasoning": { + "count": 74, + "num_samples": 1183, + "tasks": [], + "average_score": 0.20739353991494894 + }, + "Text Recognition (OCR)": { + "count": 92, + "num_samples": 1503, + "tasks": [], + "average_score": 0.17048468484282922 + } + } + } +} \ No newline at end of file