{
  "GPT_4o": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.5203470034386184,
      "micro_mean_score": 0.514305381949725
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.5265059698578094,
      "micro_mean_score": 0.5236365938368621
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 2448,
      "macro_mean_score": 0.6478225794744895,
      "micro_mean_score": 0.665391229578676
    },
    "overall_score": 0.542120979016392
  },
  "Gemini_1.5_pro_002": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.46887846869580546,
      "micro_mean_score": 0.46403536258864253
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.48154520292305814,
      "micro_mean_score": 0.47581906202211677
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 2448,
      "macro_mean_score": 0.5858190649927173,
      "micro_mean_score": 0.6104901117798793
    },
    "overall_score": 0.49496659111024205
  },
  "Gemini_1.5_flash_002": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.4183865592515826,
      "micro_mean_score": 0.41216971462683855
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.4183865592515826,
      "micro_mean_score": 0.41216971462683855
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 2168,
      "macro_mean_score": 0.5691365176285039,
      "micro_mean_score": 0.5987532244196045
    },
    "overall_score": 0.4377900192406913
  },
  "Claude_3.5": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.4863241841253708,
      "micro_mean_score": 0.4798092874490549
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.5029618079901714,
      "micro_mean_score": 0.4991559743144323
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 2288,
      "macro_mean_score": 0.6373907158949892,
      "micro_mean_score": 0.6569647463456579
    },
    "overall_score": 0.5202645387105935
  },
  "Claude_3.5_new": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.4919657684484185,
      "micro_mean_score": 0.4874520567007144
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.525918992480593,
      "micro_mean_score": 0.5230784020211157
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 1224,
      "macro_mean_score": 0.6563419761104125,
      "micro_mean_score": 0.6724419604471196
    },
    "overall_score": 0.5427061091854214
  },
  "GPT_4o_mini": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.3974259652331149,
      "micro_mean_score": 0.392578163407945
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.4070959243997505,
      "micro_mean_score": 0.40376078514357017
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 1224,
      "macro_mean_score": 0.586537827213665,
      "micro_mean_score": 0.6133276010318144
    },
    "overall_score": 0.43019240694015537
  },
  "Qwen2_VL_72B": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.4623988230573754,
      "micro_mean_score": 0.4568583770401895
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.45284699372478177,
      "micro_mean_score": 0.4487693487093462
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 2448,
      "macro_mean_score": 0.5639771804231668,
      "micro_mean_score": 0.5835339638865004
    },
    "overall_score": 0.4754732650945565
  },
  "Qwen2_VL_7B": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.34725455697890745,
      "micro_mean_score": 0.34344091516995323
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.3284357723853296,
      "micro_mean_score": 0.32443422147119677
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1170,
      "num_total_samples": 2452,
      "macro_mean_score": 0.43955105763038577,
      "micro_mean_score": 0.45508547008546996
    },
    "overall_score": 0.35913430458751355
  },
  "llava_onevision_72B": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.31960132549012704,
      "micro_mean_score": 0.3173848563095166
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.29725827011768174,
      "micro_mean_score": 0.2954433666362564
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 1224,
      "macro_mean_score": 0.4599484231632498,
      "micro_mean_score": 0.4850386930352536
    },
    "overall_score": 0.33766580340844976
  },
  "llava_onevision_7B": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.2239290419841492,
      "micro_mean_score": 0.22222171180488767
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.21347545703998197,
      "micro_mean_score": 0.210586172002703
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 2448,
      "macro_mean_score": 0.33979975321921935,
      "micro_mean_score": 0.36474634565778147
    },
    "overall_score": 0.23884309392529685
  },
  "InternVL2_76B": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.34977582844066846,
      "micro_mean_score": 0.3452353155814884
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.35539585884136143,
      "micro_mean_score": 0.35043335903915124
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 1224,
      "macro_mean_score": 0.5192997443033639,
      "micro_mean_score": 0.5421324161650903
    },
    "overall_score": 0.37649239855429245
  },
  "InternVL2_8B": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.25920867490737526,
      "micro_mean_score": 0.2543416126895087
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.24055897165959364,
      "micro_mean_score": 0.23784634936127952
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1165,
      "num_total_samples": 2452,
      "macro_mean_score": 0.3978571701460552,
      "micro_mean_score": 0.4108583690987125
    },
    "overall_score": 0.2770545208291856
  },
  "MiniCPM_v2.6": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.22838207666977445,
      "micro_mean_score": 0.22452805919103805
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.22901463640480854,
      "micro_mean_score": 0.2250606411323753
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 2448,
      "macro_mean_score": 0.41728623355613875,
      "micro_mean_score": 0.43452278589853827
    },
    "overall_score": 0.25324761425596987
  },
  "Phi-3.5-vision": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.23240864879023493,
      "micro_mean_score": 0.22932978620408923
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.2295097914016776,
      "micro_mean_score": 0.2266573336398296
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 2428,
      "macro_mean_score": 0.3947914647737769,
      "micro_mean_score": 0.42459157351676696
    },
    "overall_score": 0.2533094072831661
  },
  "Pixtral_12B": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.3186510310643637,
      "micro_mean_score": 0.3151734861550665
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.3132232487306254,
      "micro_mean_score": 0.30971424472967524
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 1224,
      "macro_mean_score": 0.4566234428542061,
      "micro_mean_score": 0.4870593293207223
    },
    "overall_score": 0.3364098563442444
  },
  "Llama_3_2_11B": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.10044261716549671,
      "micro_mean_score": 0.09980638766828835
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.15984490401619783,
      "micro_mean_score": 0.15794038158731832
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 1224,
      "macro_mean_score": 0.3173342406187366,
      "micro_mean_score": 0.3487962166809973
    },
    "overall_score": 0.1801158087274157
  },
  "Idefics3": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.11118980301103833,
      "micro_mean_score": 0.11201785633274061
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.08956972487602757,
      "micro_mean_score": 0.08982225274252693
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 2448,
      "macro_mean_score": 0.3210866162255635,
      "micro_mean_score": 0.35649183147033553
    },
    "overall_score": 0.138206224513898
  },
  "Aria": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.30485930718699694,
      "micro_mean_score": 0.3016713629035311
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.289073788209904,
      "micro_mean_score": 0.2859007507765791
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 1224,
      "macro_mean_score": 0.5103725263180767,
      "micro_mean_score": 0.5349957007738607
    },
    "overall_score": 0.3313115037088191
  },
  "NVLM": {
    "core_noncot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.2420528895703979,
      "micro_mean_score": 0.23838419989257642
    },
    "core_cot": {
      "num_eval_tasks": 440,
      "num_eval_samples": 6539,
      "num_not_eval_samples": 0,
      "num_total_samples": 6961,
      "macro_mean_score": 0.21589726765847422,
      "micro_mean_score": 0.21406043849932396
    },
    "open": {
      "num_eval_tasks": 65,
      "num_eval_samples": 1163,
      "num_total_samples": 1224,
      "macro_mean_score": 0.3478114310231307,
      "micro_mean_score": 0.3947549441100602
    },
    "overall_score": 0.25566537510391796
  }
}