{
"GPT_4o": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.5203440930873326,
"micro_mean_score": 0.514302640282204
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.5265030595065238,
"micro_mean_score": 0.5236338521693411
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.6478225794744895,
"micro_mean_score": 0.665391229578676
},
"overall_score": 0.5421184432647768
},
"Gemini_1.5_pro_002": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.4699992918320008,
"micro_mean_score": 0.4651116133689296
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.4822473962867704,
"micro_mean_score": 0.4764805563057179
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.5858190649927173,
"micro_mean_score": 0.6104901117798793
},
"overall_score": 0.4955784031499121
},
"Gemini_1.5_flash_002": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.41898948981774853,
"micro_mean_score": 0.4127376993779598
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.4189319021967416,
"micro_mean_score": 0.41567515414375245
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.5691365176285039,
"micro_mean_score": 0.5987532244196045
},
"overall_score": 0.43831534488249924
},
"Claude_3.5": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.48800427486796155,
"micro_mean_score": 0.4814327812005499
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.5040975742801586,
"micro_mean_score": 0.5002259116666758
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.6373907158949892,
"micro_mean_score": 0.6569647463456579
},
"overall_score": 0.5212541172602853
},
"Claude_3.5_new": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.4919657684484185,
"micro_mean_score": 0.4874520567007144
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.5259191914020757,
"micro_mean_score": 0.5230785894131227
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.6563419761104125,
"micro_mean_score": 0.6724419604471196
},
"overall_score": 0.5427062825031487
},
"GPT_4o_mini": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.39854757130003565,
"micro_mean_score": 0.3936551517403452
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.40767494558789397,
"micro_mean_score": 0.40431644154143376
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.586537827213665,
"micro_mean_score": 0.6133276010318144
},
"overall_score": 0.43069690064863675
},
"Qwen2_VL_72B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.46406654108789214,
"micro_mean_score": 0.4584702152011697
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.4542376574527161,
"micro_mean_score": 0.4501201906164793
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.5639771804231668,
"micro_mean_score": 0.5835339638865004
},
"overall_score": 0.4769263263488681
},
"Qwen2_VL_7B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.3480020832611913,
"micro_mean_score": 0.3441858958345098
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.3293449599230247,
"micro_mean_score": 0.325331493515679
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1170,
"macro_mean_score": 0.43955105763038577,
"micro_mean_score": 0.45508547008546996
},
"overall_score": 0.3597856146156421
},
"llava_onevision_72B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.3199332158220174,
"micro_mean_score": 0.31770770553892647
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.2974368415462532,
"micro_mean_score": 0.2956217833156672
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.4599484231632498,
"micro_mean_score": 0.4850386930352536
},
"overall_score": 0.33795497518277007
},
"llava_onevision_7B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.22409531510496777,
"micro_mean_score": 0.22238854298563537
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.21362697219149712,
"micro_mean_score": 0.21073910058505504
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.33979975321921935,
"micro_mean_score": 0.36474634565778147
},
"overall_score": 0.23898796555531696
},
"InternVL2_76B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.3502244283768534,
"micro_mean_score": 0.3456783051732046
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.3562710424410931,
"micro_mean_score": 0.35129859801162616
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.5192997443033639,
"micro_mean_score": 0.5421324161650903
},
"overall_score": 0.3772549347599992
},
"InternVL2_8B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.25956581776451815,
"micro_mean_score": 0.2546984460483302
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.24090301358258295,
"micro_mean_score": 0.23819084111520938
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1165,
"macro_mean_score": 0.3978571701460552,
"micro_mean_score": 0.4108583690987125
},
"overall_score": 0.2773656948037259
},
"MiniCPM_v2.6": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.2287645706203155,
"micro_mean_score": 0.2249087742955901
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.22955895202146906,
"micro_mean_score": 0.22560399396899078
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.41728623355613875,
"micro_mean_score": 0.43452278589853827
},
"overall_score": 0.2537218694467236
},
"Phi-3.5-vision": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.23271251159409778,
"micro_mean_score": 0.2296262323791101
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.22995297916629392,
"micro_mean_score": 0.22708502951025372
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.3947914647737769,
"micro_mean_score": 0.42459157351676696
},
"overall_score": 0.25357415903306635
},
"Pixtral_12B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.31905695620134694,
"micro_mean_score": 0.31556607913724777
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.31362045151669854,
"micro_mean_score": 0.3100986209078182
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.4566234428542061,
"micro_mean_score": 0.4870593293207223
},
"overall_score": 0.33676353369131895
},
"Llama_3_2_11B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.10044261716549671,
"micro_mean_score": 0.09980638766828835
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.15999641916771298,
"micro_mean_score": 0.15809331016967038
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.3173342406187366,
"micro_mean_score": 0.3487962166809973
},
"overall_score": 0.1802478219287358
},
"Idefics3": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.11118980301103833,
"micro_mean_score": 0.11201785633274061
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.08956972487602757,
"micro_mean_score": 0.08982225274252693
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.3210866162255635,
"micro_mean_score": 0.35649183147033553
},
"overall_score": 0.138206224513898
},
"Aria": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.30485930718699694,
"micro_mean_score": 0.3016713629035311
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.289073788209904,
"micro_mean_score": 0.2859007507765791
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.5103725263180767,
"micro_mean_score": 0.5349957007738607
},
"overall_score": 0.3313115037088191
},
"NVLM": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.2420528895703979,
"micro_mean_score": 0.23838419989257642
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.21589726765847422,
"micro_mean_score": 0.21406043849932396
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.3478114310231307,
"micro_mean_score": 0.3947549441100602
},
"overall_score": 0.25566537510391796
},
"InternVL2_2B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.09089701489596874,
"micro_mean_score": 0.09036328295381871
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.13141974398938763,
"micro_mean_score": 0.13063500716262516
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.23864417043743646,
"micro_mean_score": 0.24901117798796224
},
"overall_score": 0.14522090778963154
},
"Qwen2_VL_2B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.16448220309703876,
"micro_mean_score": 0.1610710186451323
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.20877163406364055,
"micro_mean_score": 0.20561526268932287
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.3154302566225611,
"micro_mean_score": 0.33856405846947557
},
"overall_score": 0.22249997162072932
},
"Aquila_VL_2B": {
"core_noncot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.16317824309838627,
"micro_mean_score": 0.16198837245148487
},
"core_cot": {
"num_eval_tasks": 440,
"num_eval_samples": 6539,
"num_not_eval_samples": 0,
"macro_mean_score": 0.159970161379836,
"micro_mean_score": 0.15844711671722148
},
"open": {
"num_eval_tasks": 65,
"num_eval_samples": 1163,
"macro_mean_score": 0.24567572098570653,
"micro_mean_score": 0.2704213241616509
},
"overall_score": 0.17379673035120966
}
}