{ "GPT_4o": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.5203440930873326, "micro_mean_score": 0.514302640282204 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.5265030595065238, "micro_mean_score": 0.5236338521693411 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.6478225794744895, "micro_mean_score": 0.665391229578676 }, "overall_score": 0.5421184432647768 }, "Gemini_1.5_pro_002": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.4699992918320008, "micro_mean_score": 0.4651116133689296 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.4822473962867704, "micro_mean_score": 0.4764805563057179 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5858190649927173, "micro_mean_score": 0.6104901117798793 }, "overall_score": 0.4955784031499121 }, "Gemini_1.5_flash_002": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.41898948981774853, "micro_mean_score": 0.4127376993779598 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.4189319021967416, "micro_mean_score": 0.41567515414375245 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5691365176285039, "micro_mean_score": 0.5987532244196045 }, "overall_score": 0.43831534488249924 }, "Claude_3.5": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.48800427486796155, "micro_mean_score": 0.4814327812005499 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.5040975742801586, "micro_mean_score": 0.5002259116666758 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.6373907158949892, "micro_mean_score": 0.6569647463456579 }, "overall_score": 0.5212541172602853 }, "Claude_3.5_new": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.4919657684484185, "micro_mean_score": 0.4874520567007144 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.5259191914020757, "micro_mean_score": 0.5230785894131227 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.6563419761104125, "micro_mean_score": 0.6724419604471196 }, "overall_score": 0.5427062825031487 }, "GPT_4o_mini": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.39854757130003565, "micro_mean_score": 0.3936551517403452 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.40767494558789397, "micro_mean_score": 0.40431644154143376 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.586537827213665, "micro_mean_score": 0.6133276010318144 }, "overall_score": 0.43069690064863675 }, "Qwen2_VL_72B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.46406654108789214, "micro_mean_score": 0.4584702152011697 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, 
"num_not_eval_samples": 0, "macro_mean_score": 0.4542376574527161, "micro_mean_score": 0.4501201906164793 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5639771804231668, "micro_mean_score": 0.5835339638865004 }, "overall_score": 0.4769263263488681 }, "Qwen2_VL_7B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.3480020832611913, "micro_mean_score": 0.3441858958345098 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.3293449599230247, "micro_mean_score": 0.325331493515679 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1170, "macro_mean_score": 0.43955105763038577, "micro_mean_score": 0.45508547008546996 }, "overall_score": 0.3597856146156421 }, "llava_onevision_72B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.3199332158220174, "micro_mean_score": 0.31770770553892647 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.2974368415462532, "micro_mean_score": 0.2956217833156672 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.4599484231632498, "micro_mean_score": 0.4850386930352536 }, "overall_score": 0.33795497518277007 }, "llava_onevision_7B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.22409531510496777, "micro_mean_score": 0.22238854298563537 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.21362697219149712, "micro_mean_score": 0.21073910058505504 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.33979975321921935, "micro_mean_score": 0.36474634565778147 }, "overall_score": 0.23898796555531696 }, "InternVL2_76B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.3502244283768534, "micro_mean_score": 0.3456783051732046 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.3562710424410931, "micro_mean_score": 0.35129859801162616 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5192997443033639, "micro_mean_score": 0.5421324161650903 }, "overall_score": 0.3772549347599992 }, "InternVL2_8B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.25956581776451815, "micro_mean_score": 0.2546984460483302 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.24090301358258295, "micro_mean_score": 0.23819084111520938 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1165, "macro_mean_score": 0.3978571701460552, "micro_mean_score": 0.4108583690987125 }, "overall_score": 0.2773656948037259 }, "MiniCPM_v2.6": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.2287645706203155, "micro_mean_score": 0.2249087742955901 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.22955895202146906, "micro_mean_score": 0.22560399396899078 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.41728623355613875, "micro_mean_score": 0.43452278589853827 }, 
"overall_score": 0.2537218694467236 }, "Phi-3.5-vision": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.23271251159409778, "micro_mean_score": 0.2296262323791101 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.22995297916629392, "micro_mean_score": 0.22708502951025372 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.3947914647737769, "micro_mean_score": 0.42459157351676696 }, "overall_score": 0.25357415903306635 }, "Pixtral_12B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.31905695620134694, "micro_mean_score": 0.31556607913724777 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.31362045151669854, "micro_mean_score": 0.3100986209078182 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.4566234428542061, "micro_mean_score": 0.4870593293207223 }, "overall_score": 0.33676353369131895 }, "Llama_3_2_11B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.10044261716549671, "micro_mean_score": 0.09980638766828835 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.15999641916771298, "micro_mean_score": 0.15809331016967038 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.3173342406187366, "micro_mean_score": 0.3487962166809973 }, "overall_score": 0.1802478219287358 }, "Idefics3": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.11118980301103833, "micro_mean_score": 0.11201785633274061 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.08956972487602757, "micro_mean_score": 0.08982225274252693 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.3210866162255635, "micro_mean_score": 0.35649183147033553 }, "overall_score": 0.138206224513898 }, "Aria": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.30485930718699694, "micro_mean_score": 0.3016713629035311 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.289073788209904, "micro_mean_score": 0.2859007507765791 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.5103725263180767, "micro_mean_score": 0.5349957007738607 }, "overall_score": 0.3313115037088191 }, "NVLM": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.2420528895703979, "micro_mean_score": 0.23838419989257642 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.21589726765847422, "micro_mean_score": 0.21406043849932396 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.3478114310231307, "micro_mean_score": 0.3947549441100602 }, "overall_score": 0.25566537510391796 }, "InternVL2_2B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.09089701489596874, "micro_mean_score": 0.09036328295381871 }, "core_cot": { "num_eval_tasks": 440, 
"num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.13141974398938763, "micro_mean_score": 0.13063500716262516 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.23864417043743646, "micro_mean_score": 0.24901117798796224 }, "overall_score": 0.14522090778963154 }, "Qwen2_VL_2B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.16448220309703876, "micro_mean_score": 0.1610710186451323 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.20877163406364055, "micro_mean_score": 0.20561526268932287 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.3154302566225611, "micro_mean_score": 0.33856405846947557 }, "overall_score": 0.22249997162072932 }, "Aquila_VL_2B": { "core_noncot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.16317824309838627, "micro_mean_score": 0.16198837245148487 }, "core_cot": { "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0, "macro_mean_score": 0.159970161379836, "micro_mean_score": 0.15844711671722148 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, "macro_mean_score": 0.24567572098570653, "micro_mean_score": 0.2704213241616509 }, "overall_score": 0.17379673035120966 } }