MEGA-Bench / utils.py
cccjc's picture
Add task count into table column name
8c04f42
raw
history blame
11.8 kB
import pandas as pd
import json
from typing import Dict, Any, Tuple
# The lookup tables below are module-level constants so that the loader
# classes and the helper functions can all share them.

# Internal model identifier -> name shown in the "Models" column.
MODEL_NAME_MAP = {
    "Claude_3.5_new": "Claude-3.5-Sonnet (1022)",
    "GPT_4o": "GPT-4o (0513)",
    "Claude_3.5": "Claude-3.5-Sonnet (0622)",
    "Gemini_1.5_pro_002": "Gemini-1.5-Pro-002",
    "InternVL2_76B": "InternVL2-Llama3-76B",
    "Qwen2_VL_72B": "Qwen2-VL-72B",
    "llava_onevision_72B": "Llava-OneVision-72B",
    "NVLM": "NVLM-72B",
    "GPT_4o_mini": "GPT-4o mini",
    "Gemini_1.5_flash_002": "Gemini-1.5-Flash-002",
    "Pixtral_12B": "Pixtral 12B",
    "Aria": "Aria-MoE-25B",
    "Qwen2_VL_7B": "Qwen2-VL-7B",
    "InternVL2_8B": "InternVL2-8B",
    "llava_onevision_7B": "Llava-OneVision-7B",
    "Llama_3_2_11B": "Llama-3.2-11B",
    "Phi-3.5-vision": "Phi-3.5-Vision",
    "MiniCPM_v2.6": "MiniCPM-V2.6",
    "Idefics3": "Idefics3-8B-Llama3",
    "Aquila_VL_2B": "Aquila-VL-2B-llava-qwen",
    "POINTS_7B": "POINTS-Qwen2.5-7B",
    "Qwen2_VL_2B": "Qwen2-VL-2B",
    "InternVL2_2B": "InternVL2-2B",
    "Molmo_7B_D": "Molmo-7B-D-0924",
    "Molmo_72B": "Molmo-72B-0924",
}
# Dimension key as it appears in the per-model statistics -> display name
# used as the key of the loaders' SUPER_GROUPS.
DIMENSION_NAME_MAP = {
    "skills": "Skills",
    "input_format": "Input Format",
    "output_format": "Output Format",
    "input_num": "Visual Input Number",
    "app": "Application",
}
# Raw keyword (as stored in the statistics files) -> short label used when
# building leaderboard column names. Keywords missing from this table are
# displayed unchanged (lookups use ``KEYWORD_NAME_MAP.get(keyword, keyword)``).
KEYWORD_NAME_MAP = {
    # Skills
    "Object Recognition and Classification": "Object Recognition",
    "Text Recognition (OCR)": "OCR",
    "Language Understanding and Generation": "Language",
    "Scene and Event Understanding": "Scene/Event",
    "Mathematical and Logical Reasoning": "Math/Logic",
    "Commonsense and Social Reasoning": "Commonsense",
    "Ethical and Safety Reasoning": "Ethics/Safety",
    "Domain-Specific Knowledge and Skills": "Domain-Specific",
    "Spatial and Temporal Reasoning": "Spatial/Temporal",
    "Planning and Decision Making": "Planning/Decision",
    # Input Format
    "User Interface Screenshots": "UI related",
    "Text-Based Images and Documents": "Documents",
    "Diagrams and Data Visualizations": "Infographics",
    "Videos": "Videos",
    "Artistic and Creative Content": "Arts/Creative",
    "Photographs": "Photographs",
    "3D Models and Aerial Imagery": "3D related",
    # Application
    "Information_Extraction": "Info Extraction",
    "Planning": "Planning",
    "Coding": "Coding",
    "Perception": "Perception",
    "Metrics": "Metrics",
    "Science": "Science",
    "Knowledge": "Knowledge",
    "Mathematics": "Math",
    # Output format
    # The label below was previously misspelled "Contexual".
    "contextual_formatted_text": "Contextual",
    "structured_output": "Structured",
    "exact_text": "Exact",
    "numerical_data": "Numerical",
    "open_ended_output": "Open-ended",
    "multiple_choice": "MC",
    # Visual input number
    "6-8 images": "6-8 imgs",
    "1-image": "1 img",
    "2-3 images": "2-3 imgs",
    "4-5 images": "4-5 imgs",
    "9-image or more": "9+ imgs",
    "video": "Video",
}
class BaseDataLoader:
    """Common leaderboard-loading logic shared by the concrete loaders.

    Subclasses provide the data (``_load_model_data`` / ``_load_summary_data``)
    and the table layout (``get_df`` / ``get_leaderboard_data``).

    Attributes populated by ``__init__``:
        MODEL_DATA: value returned by ``_load_model_data``; indexed as
            ``MODEL_DATA[model][dimension][keyword]["count"]``.
        SUMMARY_DATA: value returned by ``_load_summary_data``.
        SUPER_GROUPS: display dimension name -> ordered list of keyword
            column labels of the form ``"<label>(<task count>)"``.
        MODEL_GROUPS: group name -> model identifiers present in MODEL_DATA.
        keyword_display_map: keyword column label -> original keyword.
    """

    # Define the base MODEL_GROUPS structure. The "All" entry is replaced at
    # runtime by the sorted list of models actually found in MODEL_DATA.
    # NOTE(review): the key "Proprietary Flagship models" uses a lowercase
    # "models"; it is kept as-is because the keys are runtime strings.
    BASE_MODEL_GROUPS = {
        "All": list(MODEL_NAME_MAP.keys()),
        "Flagship Models": [
            'Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002',
            'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM',
            'Molmo_72B',
        ],
        "Efficiency Models": [
            'Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B',
            'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6',
            'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D',
            "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B",
        ],
        "Proprietary Flagship models": [
            'Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002',
        ],
        "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini'],
        "Open-source Flagship Models": [
            'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM',
            "Molmo_72B",
        ],
        "Open-source Efficiency Models": [
            'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B',
            'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B',
            'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B",
            "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B",
        ],
    }

    def __init__(self):
        """Load the data and derive the keyword and model groupings.

        Raises:
            ValueError: if the loaded model data is empty.
        """
        self.MODEL_DATA = self._load_model_data()
        self.SUMMARY_DATA = self._load_summary_data()
        self.SUPER_GROUPS = self._initialize_super_groups()
        self.MODEL_GROUPS = self._initialize_model_groups()

    def _initialize_super_groups(self):
        """Build the keyword column labels for every dimension.

        The dimensions, keywords and task counts are read from the first
        model in ``MODEL_DATA``. As a side effect this (re)creates
        ``self.keyword_display_map``.

        Returns:
            Dict mapping display dimension name to its keyword column labels,
            ordered Application, Skills, Output Format, Input Format,
            Visual Input Number (dimensions absent from the data are omitted).

        Raises:
            ValueError: if ``MODEL_DATA`` is empty.
            KeyError: if a dimension is missing from ``DIMENSION_NAME_MAP``.
        """
        if not self.MODEL_DATA:
            # Previously an empty result set leaked a bare StopIteration from
            # next(iter(...)); fail with an explicit, descriptive error instead.
            raise ValueError("No model data loaded; cannot build keyword groups")

        # Get a sample model to access the structure
        sample_model = next(iter(self.MODEL_DATA))
        sample_stats = self.MODEL_DATA[sample_model]

        groups = {}
        self.keyword_display_map = {}  # display label -> original keyword

        for dim, keywords in sample_stats.items():
            dim_name = DIMENSION_NAME_MAP[dim]

            # (display_name, count, keyword) tuples so they can be sorted
            keyword_info = []
            for keyword, stats in keywords.items():
                task_count = stats["count"]
                original_name = KEYWORD_NAME_MAP.get(keyword, keyword)
                display_name = f"{original_name}({task_count})"
                keyword_info.append((display_name, task_count, keyword))

            # Sort by count (descending) and then by display name (for ties)
            keyword_info.sort(key=lambda x: (-x[1], x[0]))

            groups[dim_name] = [info[0] for info in keyword_info]
            for display_name, _, keyword in keyword_info:
                self.keyword_display_map[display_name] = keyword

        # Sort based on predefined order
        order = ["Application", "Skills", "Output Format", "Input Format", "Visual Input Number"]
        return {k: groups[k] for k in order if k in groups}

    def _initialize_model_groups(self) -> Dict[str, list]:
        """Restrict ``BASE_MODEL_GROUPS`` to the models present in ``MODEL_DATA``.

        "All" becomes the alphabetically sorted list of loaded models; every
        other group keeps its predefined order and is dropped if none of its
        models were loaded.
        """
        available_models = set(self.MODEL_DATA.keys())

        filtered_groups = {}
        for group_name, models in self.BASE_MODEL_GROUPS.items():
            if group_name == "All":
                filtered_groups[group_name] = sorted(available_models)
            else:
                filtered_models = [model for model in models if model in available_models]
                if filtered_models:  # Only include group if it has models
                    filtered_groups[group_name] = filtered_models

        return filtered_groups

    def _load_model_data(self) -> Dict[str, Any]:
        """Return the per-model keyword statistics; must be overridden."""
        raise NotImplementedError("Subclasses must implement _load_model_data")

    def _load_summary_data(self) -> Dict[str, Any]:
        """Return the per-model summary scores; must be overridden."""
        raise NotImplementedError("Subclasses must implement _load_summary_data")

    def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
        """Return the leaderboard table as a DataFrame; must be overridden."""
        raise NotImplementedError("Subclasses must implement get_df")

    def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
        """Return ``(headers, rows)`` for the leaderboard; must be overridden."""
        raise NotImplementedError("Subclasses must implement get_leaderboard_data")
class DefaultDataLoader(BaseDataLoader):
    """Leaderboard loader for the files under ``./static/eval_results/Default``.

    The table has the fixed columns "Models", "Overall", "Core(w/o CoT)",
    "Core(w/ CoT)" and "Open-ended", followed by one column per keyword of
    the selected dimension. Scores are multiplied by 100 and rounded to two
    decimals. Construction is inherited from ``BaseDataLoader``.
    """

    # Fixed columns that precede the per-keyword columns.
    _SCORE_HEADERS = ["Models", "Overall", "Core(w/o CoT)", "Core(w/ CoT)", "Open-ended"]

    def _load_model_data(self) -> Dict[str, Any]:
        """Read the per-model keyword statistics JSON file."""
        # Decode explicitly as UTF-8 rather than the locale's default encoding.
        with open("./static/eval_results/Default/all_model_keywords_stats.json", "r", encoding="utf-8") as f:
            return json.load(f)

    def _load_summary_data(self) -> Dict[str, Any]:
        """Read the per-model summary JSON file."""
        with open("./static/eval_results/Default/all_summary.json", "r", encoding="utf-8") as f:
            return json.load(f)

    def _headers(self, selected_super_group: str) -> list:
        """Return the fixed score columns followed by the group's keyword columns."""
        return self._SCORE_HEADERS + self.SUPER_GROUPS[selected_super_group]

    def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
        """Build the leaderboard table, sorted by "Overall" descending.

        Args:
            selected_super_group: display dimension name (a key of ``SUPER_GROUPS``).
            selected_model_group: model group name (a key of ``MODEL_GROUPS``).

        Returns:
            One row per model in the group. A keyword column is ``None`` for
            a model whose data lacks that dimension or keyword.
        """
        original_dimension = get_original_dimension(selected_super_group)
        headers = self._headers(selected_super_group)
        keyword_columns = self.SUPER_GROUPS[selected_super_group]

        data = []
        for model in self.MODEL_GROUPS[selected_model_group]:
            model_data = self.MODEL_DATA[model]
            summary = self.SUMMARY_DATA[model]
            core_noncot_score = summary["core_noncot"]["macro_mean_score"]
            core_cot_score = summary["core_cot"]["macro_mean_score"]
            row = {
                "Models": get_display_model_name(model),
                "Overall": round(summary["overall_score"] * 100, 2),
                "Core(w/o CoT)": round(core_noncot_score * 100, 2),
                "Core(w/ CoT)": round(core_cot_score * 100, 2),
                "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2),
            }
            dimension_stats = model_data.get(original_dimension, {})
            for display_name in keyword_columns:
                original_keyword = self.keyword_display_map[display_name]
                if original_keyword in dimension_stats:
                    row[display_name] = round(dimension_stats[original_keyword]["average_score"] * 100, 2)
                else:
                    row[display_name] = None
            data.append(row)

        # Passing the columns explicitly keeps the frame well-formed when
        # there are no rows; otherwise sorting by "Overall" raises KeyError.
        df = pd.DataFrame(data, columns=headers)
        return df.sort_values(by="Overall", ascending=False)

    def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
        """Return ``(headers, rows)`` where rows are the table values as lists."""
        df = self.get_df(selected_super_group, selected_model_group)
        headers = self._headers(selected_super_group)
        data = df[headers].values.tolist()
        return headers, data
class CoreSingleDataLoader(BaseDataLoader):
    """Leaderboard loader for the files under ``./static/eval_results/Core_SI``.

    The table has the fixed columns "Models" and "Core SI", followed by one
    column per keyword of the selected dimension. Scores are multiplied by
    100 and rounded to two decimals. Construction is inherited from
    ``BaseDataLoader``.
    """

    # Fixed columns that precede the per-keyword columns.
    _SCORE_HEADERS = ["Models", "Core SI"]

    def _load_model_data(self) -> Dict[str, Any]:
        """Read the per-model keyword statistics JSON file."""
        # Decode explicitly as UTF-8 rather than the locale's default encoding.
        with open("./static/eval_results/Core_SI/all_model_keywords_stats.json", "r", encoding="utf-8") as f:
            return json.load(f)

    def _load_summary_data(self) -> Dict[str, Any]:
        """Read the per-model summary JSON file."""
        with open("./static/eval_results/Core_SI/all_summary.json", "r", encoding="utf-8") as f:
            return json.load(f)

    def _headers(self, selected_super_group: str) -> list:
        """Return the fixed score columns followed by the group's keyword columns."""
        return self._SCORE_HEADERS + self.SUPER_GROUPS[selected_super_group]

    def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
        """Build the leaderboard table, sorted by "Core SI" descending.

        Args:
            selected_super_group: display dimension name (a key of ``SUPER_GROUPS``).
            selected_model_group: model group name (a key of ``MODEL_GROUPS``).

        Returns:
            One row per model in the group. A keyword column is ``None`` for
            a model whose data lacks that dimension or keyword.
        """
        original_dimension = get_original_dimension(selected_super_group)
        headers = self._headers(selected_super_group)
        keyword_columns = self.SUPER_GROUPS[selected_super_group]

        data = []
        for model in self.MODEL_GROUPS[selected_model_group]:
            model_data = self.MODEL_DATA[model]
            summary = self.SUMMARY_DATA[model]
            core_si_score = summary["macro_mean_score"]
            row = {
                "Models": get_display_model_name(model),
                "Core SI": round(core_si_score * 100, 2),
            }
            dimension_stats = model_data.get(original_dimension, {})
            for display_name in keyword_columns:
                original_keyword = self.keyword_display_map[display_name]
                if original_keyword in dimension_stats:
                    row[display_name] = round(dimension_stats[original_keyword]["average_score"] * 100, 2)
                else:
                    row[display_name] = None
            data.append(row)

        # Passing the columns explicitly keeps the frame well-formed when
        # there are no rows; otherwise sorting by "Core SI" raises KeyError.
        df = pd.DataFrame(data, columns=headers)
        return df.sort_values(by="Core SI", ascending=False)

    def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
        """Return ``(headers, rows)`` where rows are the table values as lists."""
        df = self.get_df(selected_super_group, selected_model_group)
        headers = self._headers(selected_super_group)
        data = df[headers].values.tolist()
        return headers, data
# Helper functions translating between display names and raw keys
def get_original_dimension(mapped_dimension):
    """Return the raw dimension key whose display name is *mapped_dimension*.

    Performs a reverse lookup in ``DIMENSION_NAME_MAP`` and returns the first
    key whose value equals *mapped_dimension*.

    Raises:
        ValueError: if no dimension has that display name. (Previously a bare
            ``StopIteration`` escaped from ``next()``, which is silently
            swallowed or converted to ``RuntimeError`` when it propagates
            through iterator or generator code.)
    """
    for original, display in DIMENSION_NAME_MAP.items():
        if display == mapped_dimension:
            return original
    raise ValueError(f"Unknown dimension display name: {mapped_dimension!r}")
def get_original_keyword(mapped_keyword):
    """Return the raw keyword whose short label is *mapped_keyword*.

    Reverse lookup in ``KEYWORD_NAME_MAP``; the first matching key wins.
    When no label matches, *mapped_keyword* itself is returned.
    """
    for original, label in KEYWORD_NAME_MAP.items():
        if label == mapped_keyword:
            return original
    return mapped_keyword
def get_display_model_name(model_name):
    """Return the display name for *model_name*, or *model_name* itself if unmapped."""
    if model_name in MODEL_NAME_MAP:
        return MODEL_NAME_MAP[model_name]
    return model_name