import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository
import numpy as np
# Load the JSON data
with open("./static/eval_results/all_model_keywords_stats.json", "r") as f:
MODEL_DATA = json.load(f)
with open("./static/eval_results/all_summary.json", "r") as f:
SUMMARY_DATA = json.load(f)
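# Expected layout (inferred from the accesses below, not from a schema):
#   MODEL_DATA[model][dimension][keyword]["average_score"]
#   SUMMARY_DATA[model]["overall_score"]
#   SUMMARY_DATA[model]["core_noncot" / "core_cot" / "open"]["macro_mean_score"]
# Scores appear to be fractions that get scaled to percentages in get_df().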
# Define model name mapping
MODEL_NAME_MAP = {
    "Claude_3.5_new": "Claude-3.5-Sonnet (1022)",
    "GPT_4o": "GPT-4o (0513)",
    "Claude_3.5": "Claude-3.5-Sonnet (0622)",
    "Gemini_1.5_pro_002": "Gemini-1.5-Pro-002",
    "InternVL2_76B": "InternVL2-Llama3-76B",
    "Qwen2_VL_72B": "Qwen2-VL-72B",
    "llava_onevision_72B": "Llava-OneVision-72B",
    "NVLM": "NVLM-72B",
    "GPT_4o_mini": "GPT-4o mini",
    "Gemini_1.5_flash_002": "Gemini-1.5-Flash-002",
    "Pixtral_12B": "Pixtral 12B",
    "Aria": "Aria-MoE-25B",
    "Qwen2_VL_7B": "Qwen2-VL-7B",
    "InternVL2_8B": "InternVL2-8B",
    "llava_onevision_7B": "Llava-OneVision-7B",
    "Llama_3_2_11B": "Llama-3.2-11B",
    "Phi-3.5-vision": "Phi-3.5-Vision",
    "MiniCPM_v2.6": "MiniCPM-V2.6",
    "Idefics3": "Idefics3-8B-Llama3",
}
# Custom name mapping for dimensions and keywords
DIMENSION_NAME_MAP = {
    "skills": "Skills",
    "input_format": "Input Format",
    "output_format": "Output Format",
    "input_num": "Visual Input Number",
    "app": "Application",
}
KEYWORD_NAME_MAP = {
    # Skills
    "Object Recognition and Classification": "Object Recognition",
    "Text Recognition (OCR)": "OCR",
    "Language Understanding and Generation": "Language",
    "Scene and Event Understanding": "Scene/Event",
    "Mathematical and Logical Reasoning": "Math/Logic",
    "Commonsense and Social Reasoning": "Commonsense",
    "Ethical and Safety Reasoning": "Ethics/Safety",
    "Domain-Specific Knowledge and Skills": "Domain-Specific",
    "Spatial and Temporal Reasoning": "Spatial/Temporal",
    "Planning and Decision Making": "Planning/Decision",
    # Input Format
    "User Interface Screenshots": "UI related",
    "Text-Based Images and Documents": "Documents",
    "Diagrams and Data Visualizations": "Infographics",
    "Videos": "Videos",
    "Artistic and Creative Content": "Arts/Creative",
    "Photographs": "Photographs",
    "3D Models and Aerial Imagery": "3D related",
    # Application
    "Information_Extraction": "Info Extraction",
    "Planning": "Planning",
    "Coding": "Coding",
    "Perception": "Perception",
    "Metrics": "Metrics",
    "Science": "Science",
    "Knowledge": "Knowledge",
    "Mathematics": "Math",
    # Output Format
    "contextual_formatted_text": "Contextual",
    "structured_output": "Structured",
    "exact_text": "Exact",
    "numerical_data": "Numerical",
    "open_ended_output": "Open-ended",
    "multiple_choice": "MC",
    # Visual Input Number
    "6-8 images": "6-8 imgs",
    "1-image": "1 img",
    "2-3 images": "2-3 imgs",
    "4-5 images": "4-5 imgs",
    "9-image or more": "9+ imgs",
    "video": "Video",
}
# Extract super groups (dimensions) and their keywords
SUPER_GROUPS = {
    DIMENSION_NAME_MAP[dim]: [KEYWORD_NAME_MAP.get(k, k) for k in MODEL_DATA[next(iter(MODEL_DATA))][dim].keys()]
    for dim in MODEL_DATA[next(iter(MODEL_DATA))]
}
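# Resulting shape, e.g. {"Skills": ["Object Recognition", "OCR", ...], "Input Format": [...], ...}
# with display names on both levels.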
def get_original_dimension(mapped_dimension):
    return next(k for k, v in DIMENSION_NAME_MAP.items() if v == mapped_dimension)
def get_original_keyword(mapped_keyword):
    return next((k for k, v in KEYWORD_NAME_MAP.items() if v == mapped_keyword), mapped_keyword)
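# Example round-trips through the maps above:
#   get_original_dimension("Skills") -> "skills"
#   get_original_keyword("OCR") -> "Text Recognition (OCR)"
# Keywords without a display mapping fall back to themselves.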
# Define model groups
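# "Flagship Models" is the union of the proprietary and open-source flagship groups,
# and "Efficiency Models" is the union of the two efficiency groups.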
MODEL_GROUPS = {
    "All": list(MODEL_DATA.keys()),
    "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM'],
    "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
    "Proprietary Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002'],
    "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini'],
    "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM'],
    "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
}
def get_display_model_name(model_name):
    return MODEL_NAME_MAP.get(model_name, model_name)
def get_df(selected_super_group, selected_model_group):
    """Build a DataFrame of per-model scores for the selected dimension and model group."""
    original_dimension = get_original_dimension(selected_super_group)
    data = []
    for model in MODEL_GROUPS[selected_model_group]:
        model_data = MODEL_DATA[model]
        summary = SUMMARY_DATA[model]
        core_noncot_score = summary["core_noncot"]["macro_mean_score"]
        core_cot_score = summary["core_cot"]["macro_mean_score"]
        row = {
            "Models": get_display_model_name(model),  # Use the mapped name
            "Overall": round(summary["overall_score"] * 100, 2),
            "Core(w/o CoT)": round(core_noncot_score * 100, 2),
            "Core(w/ CoT)": round(core_cot_score * 100, 2),
            "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2),
        }
        # One column per keyword of the selected dimension; None if the model has no score for it.
        for keyword in SUPER_GROUPS[selected_super_group]:
            original_keyword = get_original_keyword(keyword)
            if original_dimension in model_data and original_keyword in model_data[original_dimension]:
                row[keyword] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
            else:
                row[keyword] = None
        data.append(row)
    df = pd.DataFrame(data)
    df = df.sort_values(by="Overall", ascending=False)
    return df
def get_leaderboard_data(selected_super_group, selected_model_group):
    """Return (headers, rows) for the leaderboard table, aggregate columns first."""
    df = get_df(selected_super_group, selected_model_group)
    headers = ["Models", "Overall", "Core(w/o CoT)", "Core(w/ CoT)", "Open-ended"] + SUPER_GROUPS[selected_super_group]
    data = df[headers].values.tolist()
    return headers, data
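# --- Illustrative usage sketch (not part of the original app) ---
# A minimal example of how the pieces above could be wired into a Gradio UI,
# rebuilding the table from get_df() whenever a selector changes. build_demo()
# is a hypothetical helper; the real Space may lay out its interface differently.
def build_demo():
    with gr.Blocks() as demo:
        default_dim = next(iter(SUPER_GROUPS))  # e.g. "Skills"
        dimension = gr.Dropdown(choices=list(SUPER_GROUPS.keys()), value=default_dim, label="Keyword dimension")
        model_group = gr.Dropdown(choices=list(MODEL_GROUPS.keys()), value="All", label="Model group")
        table = gr.Dataframe(value=get_df(default_dim, "All"))
        # Rebuild the table whenever either selector changes.
        dimension.change(get_df, [dimension, model_group], table)
        model_group.change(get_df, [dimension, model_group], table)
    return demo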