[
  {
    "name": "image_translation_en2cn",
    "score": 0.5564421945052599,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 9,
    "skills": [
      "Text Recognition (OCR)",
      "Language Understanding and Generation"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Information_Extraction",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "CLEVRER_physics",
    "score": 0.45,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 20,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "logical_reasoning_find_odd_one_out",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Artistic and Creative Content",
    "app": "Planning",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "geometry_reasoning_overlapped_circle",
    "score": 0.75,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "logical_reasoning_fit_pattern",
    "score": 0.2857142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Planning",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "GUI_Act_Mobile_tap",
    "score": 0.14285714285714285,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Planning",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "relative_depth_of_different_points",
    "score": 0.8571428571428571,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "autonomous_driving_scene_analysis",
    "score": 1.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "geometry_reasoning_count_line_intersections",
    "score": 0.35714285714285715,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "ishihara_test",
    "score": 0.5571428571428572,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Knowledge",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "llavaguard",
    "score": 0.6071428571428571,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Ethical and Safety Reasoning",
      "Scene and Event Understanding",
      "Commonsense and Social Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "knowledge_graph_understanding",
    "score": 0.6,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Commonsense and Social Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "recover_masked_word_in_figure",
    "score": 0.6428571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Language Understanding and Generation",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "paper_vqa",
    "score": 0.7142857142857143,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "game_platform_support_identification",
    "score": 0.8928571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "product_ocr_qa",
    "score": 0.7142857142857143,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Language Understanding and Generation"
    ],
    "input_format": "Photographs",
    "app": "Information_Extraction",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "healthcare_info_judgement",
    "score": 0.9285714285714286,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Domain-Specific Knowledge and Skills",
      "Ethical and Safety Reasoning"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Science",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "circuit_diagram_understanding",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Science",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "planning_screenshot_barman",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Planning and Decision Making",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Planning",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "counting",
    "score": 0.6428571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "signage_navigation",
    "score": 0.7333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "music_sheet_note_count",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 17,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Knowledge",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "Ad_count_detection",
    "score": 0.5,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "flowchart_code_generation",
    "score": 0.6666666666666666,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 9,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Language Understanding and Generation"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Coding",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "song_title_identification_from_lyrics",
    "score": 0.42857142857142855,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Knowledge",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "media_QA_web_stackoverflow",
    "score": 0.619047619047619,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Domain-Specific Knowledge and Skills",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Coding",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "MMMU_pro_exam_screenshot",
    "score": 0.5050505050505051,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 99,
    "skills": [
      "Text Recognition (OCR)",
      "Mathematical and Logical Reasoning",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Science",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "geometry_reasoning_grid",
    "score": 0.5714285714285714,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "location_vqa",
    "score": 0.6428571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Scene and Event Understanding",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "geometry_reasoning_circled_letter",
    "score": 0.6071428571428571,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "monthly_weather_days_count",
    "score": 0.3571428571428571,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Perception",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "transit_map_intersection_points",
    "score": 0.5625,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Text Recognition (OCR)"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "webpage_code_understanding",
    "score": 0.6666666666666666,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 9,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Coding",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "planning_visual_storage",
    "score": 0.13333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Planning and Decision Making",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Planning",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "geometry_reasoning_nested_squares",
    "score": 0.4642857142857143,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "GUI_Act_Web_Multi",
    "score": 0.4379245788668292,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Planning and Decision Making"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Planning",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "mensa_iq_test",
    "score": 0.40294117647058825,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 17,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Planning",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "chess_find_legal_moves",
    "score": 0.1858388265990491,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Planning and Decision Making"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Planning",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "play_go_capture_stone",
    "score": 0.2,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Planning and Decision Making"
    ],
    "input_format": "Artistic and Creative Content",
    "app": "Planning",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "topological_sort",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "medical_multi_organ_segmentation_rater",
    "score": 0.21428571428571427,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Science",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "mindmap_elements_parsing",
    "score": 0.35714285714285715,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Language Understanding and Generation"
    ],
    "input_format": "Artistic and Creative Content",
    "app": "Perception",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "planning_screenshot_grippers",
    "score": 0.3333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Planning and Decision Making",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Planning",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "interpret_force_perspective_illusion",
    "score": 1.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "chinese_idiom_recognition",
    "score": 0.7857142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Language Understanding and Generation",
      "Commonsense and Social Reasoning"
    ],
    "input_format": "Artistic and Creative Content",
    "app": "Knowledge",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "GUI_Act_Mobile_swipe",
    "score": 0.5925323909834338,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Planning and Decision Making"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Planning",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "road_map_find_highway_between_two_place",
    "score": 0.6470588235294118,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 17,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "annoying_word_search",
    "score": 0.0009041591320072332,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Planning",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "position_relationship",
    "score": 0.7333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "polygon_interior_angles",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "font_recognition",
    "score": 0.21428571428571427,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Knowledge",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "top_video_creator_identification",
    "score": 0.8571428571428571,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)",
      "Scene and Event Understanding"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "Bongard_Problem",
    "score": 0.21052631578947367,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Planning",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "ascii_art_understanding",
    "score": 0.6428571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Language Understanding and Generation"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "GUI_Act_Web_Single",
    "score": 0.01601312748867357,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Planning and Decision Making"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Planning",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "planning_screenshot_termes",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Planning and Decision Making",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Planning",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "actor_recognition_in_Movie",
    "score": 0.8571428571428571,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Knowledge",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "extract_webpage_headline",
    "score": 0.6428571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "hashtag_recommendation",
    "score": 0.9285714285714286,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Language Understanding and Generation",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "calendar_schedule_suggestion",
    "score": 0.42857142857142855,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Planning and Decision Making"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Planning",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "relative_reflectance_of_different_regions",
    "score": 0.14285714285714285,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "web_action_prediction",
    "score": 0.9285714285714286,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Perception",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "highest_discount_game_price_identification",
    "score": 0.9285714285714286,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "planning_screenshot_floortile",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Planning and Decision Making",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Planning",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "code_execution",
    "score": 0.75,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Coding",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "planning_screenshot_blocksworld",
    "score": 0.06666666666666667,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Planning and Decision Making",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Planning",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "music_sheet_format_QA",
    "score": 0.6428571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Knowledge",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "soccer_offside",
    "score": 0.2222222222222222,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 9,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "ti_fused_vqa_physics",
    "score": 0.35714285714285715,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Science",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "number_comparison",
    "score": 1.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "multilingual_movie_info_parsing",
    "score": 0.6632653061224488,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "electricity_load_estimate_plot",
    "score": 0.4767857142857143,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "planning_screenshot_storage",
    "score": 0.06666666666666667,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Planning and Decision Making",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Planning",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "pmc_vqa_medical_image_qa",
    "score": 0.8947368421052632,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Science",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "movie_info_parsing",
    "score": 0.6875,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "scibench_calculus_wo_solution",
    "score": 0.3673469387755102,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 49,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "code_programming_test_easy",
    "score": 0.4583333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 24,
    "skills": [
      "Text Recognition (OCR)",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Coding",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "mahjong",
    "score": 0.13333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Planning",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "chess_sygyzy_endgames",
    "score": 0.1496598639455782,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Planning and Decision Making",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Planning",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "rebus",
    "score": 0.6956521739130435,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 23,
    "skills": [
      "Text Recognition (OCR)",
      "Language Understanding and Generation",
      "Commonsense and Social Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Planning",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "insect_order_classification",
    "score": 0.4,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "graph_shortest_path_kamada_kawai",
    "score": 0.5,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "LaTeX_complex_formula_convertion",
    "score": 0.5294117647058824,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 17,
    "skills": [
      "Text Recognition (OCR)",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Coding",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "icon_arithmetic_puzzle",
    "score": 0.5357142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Planning",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "human_relationship_reasoning",
    "score": 1.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Commonsense and Social Reasoning",
      "Object Recognition and Classification"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "orchestra_score_recognition",
    "score": 0.32142857142857145,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Knowledge",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "web_action_grounding",
    "score": 0.6428571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Planning",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "graph_shortest_path_planar",
    "score": 0.2857142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "iconqa",
    "score": 0.2631578947368421,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Artistic and Creative Content",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "remaining_playback_time_calculation",
    "score": 0.2857142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "stock_info_parsing",
    "score": 0.9747899159663866,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "stock_price_future_prediction",
    "score": 0.7872142857142859,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "weather_map_climate_type_temperature_parsing",
    "score": 0.7857142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Text Recognition (OCR)"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "geometry_length",
    "score": 0.35714285714285715,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "traffic_future_prediction_from_line_plot",
    "score": 0.6918947368421055,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "ti_fused_vqa_biology",
    "score": 0.6428571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Science",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "distinguish_ai_generated_image",
    "score": 0.6842105263157895,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "animal_pose_estimation",
    "score": 0.2785198065092178,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "average_humidity_estimate_plot",
    "score": 0.828,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "planning_screenshot_tyreworld",
    "score": 0.9333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Planning and Decision Making",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Planning",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "music_info_parsing",
    "score": 0.75,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "nlvr2_two_image_compare_qa",
    "score": 0.6428571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Language Understanding and Generation"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "geometry_transformation",
    "score": 0.21428571428571427,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "entertainment_web_game_style",
    "score": 0.8214285714285714,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)",
      "Scene and Event Understanding"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "multilingual_game_info_parsing",
    "score": 0.8303571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "egocentric_analysis_single_image",
    "score": 0.5555555555555556,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 9,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "landmark_recognition_and_qa",
    "score": 0.711111111111111,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Language Understanding and Generation"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "graph_connectivity",
    "score": 0.95,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "youtube_video_info_parsing",
    "score": 0.8095238095238095,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "physical_property_reasoning",
    "score": 1.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "scibench_fundamental_wo_solution",
    "score": 0.3469387755102041,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 49,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Science",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "license_plate_recognition",
    "score": 0.8571428571428571,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "science_basic_physics",
    "score": 0.7333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Science",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "face_keypoint_detection",
    "score": 0.5982549376215841,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "famous_building_recognition",
    "score": 0.875,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "TV_show_info_parsing",
    "score": 0.8253968253968255,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "image_style_recognition",
    "score": 1.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Artistic and Creative Content",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "figureqa",
    "score": 0.2857142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "graph_theory",
    "score": 0.2857142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "chess_winner_identification",
    "score": 0.6666666666666666,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Planning",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "electricity_plot_future_prediction",
    "score": 0.7131684210526317,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "algebra",
    "score": 0.35714285714285715,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "mnist_pattern",
    "score": 0.35714285714285715,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Planning",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "math_convexity_value_estimation",
    "score": 0.5867591836191252,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Object Recognition and Classification"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "map_diagram_qa",
    "score": 0.5714285714285714,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "deciphering_oracle_bone",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Knowledge",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "graph_isomorphism",
    "score": 0.4,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "funsd_document_qa",
    "score": 0.7857142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Language Understanding and Generation"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "newspaper_page_parse_and_count",
    "score": 0.6444444444444445,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)",
      "Scene and Event Understanding"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "weather_info_parsing",
    "score": 0.9285714285714288,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "vibe_eval_short_phrase",
    "score": 0.2857142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Commonsense and Social Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "signboard_identification",
    "score": 0.8666666666666667,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Text Recognition (OCR)",
      "Language Understanding and Generation"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "game_info_parsing",
    "score": 0.8766233766233764,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "long_string_letter_recognition",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "geometry_analytic",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "long_string_number_recognition",
    "score": 0.5714285714285714,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "waybill_number_sequence_extraction",
    "score": 0.6428571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "egocentric_spatial_reasoning",
    "score": 0.4444444444444444,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 9,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Object Recognition and Classification"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "geometry_area",
    "score": 0.35714285714285715,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "single_person_pose_estimation",
    "score": 0.2903422951989705,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "newspaper_ocr_in_query_box",
    "score": 0.6666666666666666,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Text Recognition (OCR)",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "quizlet_question_solving",
    "score": 0.6428571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Science",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "chart_vqa",
    "score": 0.5,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "electricity_future_prediction_from_table",
    "score": 0.7417368421052631,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "humor_understand_caption_match",
    "score": 0.6666666666666666,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Commonsense and Social Reasoning",
      "Language Understanding and Generation"
    ],
    "input_format": "Artistic and Creative Content",
    "app": "Knowledge",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "hotel_booking_confirmation_parsing",
    "score": 0.7142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "geometry_descriptive",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "coco_person_detection",
    "score": 0.6477943776571286,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "ti_fused_vqa_math",
    "score": 0.5,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Text Recognition (OCR)",
      "Language Understanding and Generation"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "graph_maxflow",
    "score": 0.26666666666666666,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Mathematics", |
|
"output_format": "numerical_data", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "places365_scene_type_classification", |
|
"score": 0.7142857142857143, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "research_website_parsing_blogpost", |
|
"score": 0.07142857142857142, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Information_Extraction", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "coco_object_detection_by_query_property", |
|
"score": 0.5807339650392197, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "numerical_data", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "3d_indoor_scene_text_bbox_prediction", |
|
"score": 0.18559785992971775, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "3D Models and Aerial Imagery", |
|
"app": "Perception", |
|
"output_format": "numerical_data", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "top_rated_hotel_identification", |
|
"score": 1.0, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "research_website_parsing_publication", |
|
"score": 0.14285714285714285, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Information_Extraction", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "widerface_face_count_and_event_classification", |
|
"score": 0.6071428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "research_website_parsing_homepage", |
|
"score": 0.2857142857142857, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Information_Extraction", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "math_parity", |
|
"score": 0.8, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 15, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Mathematics", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "reward_models_I2T_reward", |
|
"score": 0.6428571428571429, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Metrics", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "snli_ve_visual_entailment", |
|
"score": 0.8666666666666667, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 15, |
|
"skills": [ |
|
"Scene and Event Understanding", |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "brand_logo_recognition_and_elaboration", |
|
"score": 0.74, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 25, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "constrained_generation_contain_repeat_length", |
|
"score": 0.26666666666666666, |
|
"eval_type": "rule", |
|
"num_demo": 0, |
|
"num_query": 15, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "super_clevr", |
|
"score": 0.35714285714285715, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Spatial and Temporal Reasoning", |
|
"Mathematical and Logical Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "constrained_generation_contain_length", |
|
"score": 0.8, |
|
"eval_type": "rule", |
|
"num_demo": 0, |
|
"num_query": 15, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "science_molecule_chemistry", |
|
"score": 0.8, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 15, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Science", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "constrained_generation_contain_position_length", |
|
"score": 0.7333333333333333, |
|
"eval_type": "rule", |
|
"num_demo": 0, |
|
"num_query": 15, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ti_fused_vqa_chemistry", |
|
"score": 0.35714285714285715, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Science", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "constrained_generation_contain_contain_length", |
|
"score": 0.9333333333333333, |
|
"eval_type": "rule", |
|
"num_demo": 0, |
|
"num_query": 15, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "chess_puzzle_single_step", |
|
"score": 0.0, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 15, |
|
"skills": [ |
|
"Spatial and Temporal Reasoning", |
|
"Planning and Decision Making", |
|
"Mathematical and Logical Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Planning", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "panel_images_single_question", |
|
"score": 0.7857142857142857, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Perception", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "panel_images_multi_question", |
|
"score": 0.8333333333333333, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Perception", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "chess_puzzles_checkmate", |
|
"score": 0.0, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Spatial and Temporal Reasoning", |
|
"Planning and Decision Making", |
|
"Mathematical and Logical Reasoning" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Planning", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "exchange_rate_estimate_plot", |
|
"score": 0.9764785714285713, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Perception", |
|
"output_format": "numerical_data", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "chess_puzzles_equality", |
|
"score": 0.06666666666666667, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 15, |
|
"skills": [ |
|
"Spatial and Temporal Reasoning", |
|
"Planning and Decision Making", |
|
"Mathematical and Logical Reasoning" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Planning", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "graph_chordless_cycle", |
|
"score": 0.35714285714285715, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Mathematics", |
|
"output_format": "numerical_data", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "chess_puzzles_crushing", |
|
"score": 0.0, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Planning and Decision Making", |
|
"Mathematical and Logical Reasoning" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Planning", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "table_understanding_fact_verification", |
|
"score": 0.9047619047619048, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Mathematical and Logical Reasoning", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Perception", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "table_understanding_complex_question_answering", |
|
"score": 0.7857142857142857, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Mathematical and Logical Reasoning" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Perception", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "mvsa_sentiment_classification", |
|
"score": 0.7142857142857143, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "math_breakpoint", |
|
"score": 0.9333333333333333, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 15, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Mathematics", |
|
"output_format": "numerical_data", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "number_puzzle_sudoku", |
|
"score": 0.0, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 15, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Planning", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ili_ratio_future_prediction", |
|
"score": 0.12478571428571421, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Perception", |
|
"output_format": "numerical_data", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "dvqa", |
|
"score": 0.3684210526315789, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 19, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Mathematical and Logical Reasoning", |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Perception", |
|
"output_format": "numerical_data", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "number_puzzle_kakuro_5x5", |
|
"score": 0.0, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 15, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Planning", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "pictionary_chinese_food_img2en", |
|
"score": 0.7857142857142857, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Planning", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "geometry_solid", |
|
"score": 0.07142857142857142, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Mathematical and Logical Reasoning", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Mathematics", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "pictionary_skribbl_io", |
|
"score": 0.55, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 20, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Planning", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "pictionary_genai_output_chinese", |
|
"score": 0.2857142857142857, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Planning", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "pictionary_doodle_guess", |
|
"score": 0.8, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 15, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Planning", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "pictionary_cartoon_drawing_guess", |
|
"score": 0.8571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Planning", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "MFC_Bench_check_face_swap", |
|
"score": 0.5, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding", |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "MFC_Bench_check_veracity", |
|
"score": 0.8571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding", |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Knowledge", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "MFC_Bench_check_out_of_context", |
|
"score": 0.8571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Scene and Event Understanding", |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "question_solution_solving", |
|
"score": 0.2857142857142857, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Mathematical and Logical Reasoning", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Science", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "MFC_Bench_check_background_change", |
|
"score": 0.9285714285714286, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Scene and Event Understanding", |
|
"Object Recognition and Classification" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "MFC_Bench_check_clip_stable_diffusion_generate", |
|
"score": 0.42857142857142855, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "MFC_Bench_check_text_style", |
|
"score": 0.5, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "MFC_Bench_check_text_entity_replace", |
|
"score": 0.7142857142857143, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "MFC_Bench_check_face_attribute_edit", |
|
"score": 0.5714285714285714, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocr_math_TheoremQA", |
|
"score": 0.42857142857142855, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Mathematical and Logical Reasoning" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Mathematics", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "MMSoc_HatefulMemes", |
|
"score": 0.7142857142857143, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Ethical and Safety Reasoning", |
|
"Commonsense and Social Reasoning", |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "MMSoc_Misinformation_PolitiFact", |
|
"score": 0.8571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Commonsense and Social Reasoning", |
|
"Language Understanding and Generation", |
|
"Ethical and Safety Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "MMSoc_Misinformation_GossipCop", |
|
"score": 0.42857142857142855, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Commonsense and Social Reasoning", |
|
"Language Understanding and Generation", |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding", |
|
"Ethical and Safety Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "MMSoc_Memotion", |
|
"score": 0.6000000000000002, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 17, |
|
"skills": [ |
|
"Commonsense and Social Reasoning", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Knowledge", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "app_layout_understanding_leetcode", |
|
"score": 0.7142857142857143, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "app_layout_understanding_instagram", |
|
"score": 0.7857142857142857, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "app_layout_understanding_iphone_settings", |
|
"score": 0.8571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "app_layout_understanding_ppt", |
|
"score": 0.8571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "app_layout_understanding_amazon", |
|
"score": 0.9285714285714286, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Text Recognition (OCR)", |
|
"Scene and Event Understanding" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "app_layout_understanding_excel", |
|
"score": 0.8571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "app_layout_understanding_youtube", |
|
"score": 0.7857142857142857, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding", |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "app_layout_understanding_twitter", |
|
"score": 0.8571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "app_layout_understanding_alipay", |
|
"score": 0.7647058823529411, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 17, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "app_layout_understanding_zoom", |
|
"score": 0.6, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 15, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "app_layout_understanding_word", |
|
"score": 0.6428571428571429, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "app_layout_understanding_tiktok", |
|
"score": 0.8571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "maze_2d_8x8", |
|
"score": 0.0, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Spatial and Temporal Reasoning", |
|
"Planning and Decision Making" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Planning", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "crossword_mini_5x5", |
|
"score": 0.6785714285714286, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Language Understanding and Generation", |
|
"Mathematical and Logical Reasoning" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Planning", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "autorater_artifact_reason", |
|
"score": 0.6666666666666666, |
|
"eval_type": "rule", |
|
"num_demo": 0, |
|
"num_query": 15, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Metrics", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "autorater_artifact", |
|
"score": 0.5714285714285714, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Metrics", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "counterfactual_arithmetic", |
|
"score": 0.8571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Mathematics", |
|
"output_format": "numerical_data", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "poetry_acrostic_alliteration", |
|
"score": 0.6666666666666666, |
|
"eval_type": "rule", |
|
"num_demo": 0, |
|
"num_query": 15, |
|
"skills": [ |
|
"Language Understanding and Generation", |
|
"Object Recognition and Classification" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "poetry_shakespearean_sonnet", |
|
"score": 0.26666666666666666, |
|
"eval_type": "rule", |
|
"num_demo": 0, |
|
"num_query": 15, |
|
"skills": [ |
|
"Language Understanding and Generation", |
|
"Object Recognition and Classification" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "poetry_haiku", |
|
"score": 0.9333333333333333, |
|
"eval_type": "rule", |
|
"num_demo": 0, |
|
"num_query": 15, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "poetry_petrarchian_sonnet_optional_meter", |
|
"score": 0.0, |
|
"eval_type": "rule", |
|
"num_demo": 0, |
|
"num_query": 15, |
|
"skills": [ |
|
"Language Understanding and Generation", |
|
"Object Recognition and Classification" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "cvbench_adapted_cvbench_relation", |
|
"score": 0.6428571428571429, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "poetry_acrostic", |
|
"score": 0.8666666666666667, |
|
"eval_type": "rule", |
|
"num_demo": 0, |
|
"num_query": 15, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "poetry_limerick", |
|
"score": 0.8, |
|
"eval_type": "rule", |
|
"num_demo": 0, |
|
"num_query": 15, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "cvbench_adapted_cvbench_distance", |
|
"score": 0.9285714285714286, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "poetry_custom_rhyming_scheme", |
|
"score": 0.13333333333333333, |
|
"eval_type": "rule", |
|
"num_demo": 0, |
|
"num_query": 15, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "cvbench_adapted_cvbench_depth", |
|
"score": 0.9285714285714286, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "cvbench_adapted_cvbench_count", |
|
"score": 0.5714285714285714, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "numerical_data", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "shape_composition_shapes", |
|
"score": 0.4562925170068027, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Perception", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "shape_composition_colours", |
|
"score": 0.36553287981859406, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Perception", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocr_article_authors", |
|
"score": 0.8571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Information_Extraction", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocr_table_to_html", |
|
"score": 0.6428571428571429, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Coding", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocr_article_journal", |
|
"score": 0.7142857142857143, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Perception", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocr_resume_skill_plain", |
|
"score": 0.5714285714285714, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Perception", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocr_math_equation", |
|
"score": 0.7142857142857143, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Mathematical and Logical Reasoning" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Coding", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "symbolic_graphics_programs_scalable_vector_graphics", |
|
"score": 0.1111111111111111, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 18, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Coding", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocr_table_to_latex", |
|
"score": 0.5714285714285714, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Coding", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocr_resume_experience_plain", |
|
"score": 0.7142857142857143, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Perception", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocr_resume_employer_plain", |
|
"score": 0.7857142857142857, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Perception", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocr_math_text_latex", |
|
"score": 0.42857142857142855, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Mathematical and Logical Reasoning", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Coding", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocr_table_to_markdown", |
|
"score": 0.9285714285714286, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Coding", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocr_resume_school_plain", |
|
"score": 1.0, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Information_Extraction", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocr_table_to_csv", |
|
"score": 0.5714285714285714, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Coding", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "memorization_indian_celebrity", |
|
"score": 0.0, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "memorization_chinese_celebrity", |
|
"score": 0.0, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "memorization_famous_treaty", |
|
"score": 0.6785714285714286, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Knowledge", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "symbolic_graphics_programs_computer_aided_design", |
|
"score": 0.2857142857142857, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Coding", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "memorization_papers", |
|
"score": 0.6666666666666666, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 15, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Knowledge", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "multiple_states_identify_africa", |
|
"score": 0.7428571428571429, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Knowledge", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "multiple_states_identify_europe", |
|
"score": 0.7571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Knowledge", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "multiple_states_identify_asia", |
|
"score": 0.5714285714285714, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Knowledge", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocr_math_MATH", |
|
"score": 0.6666666666666666, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 15, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Mathematical and Logical Reasoning" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Mathematics", |
|
"output_format": "numerical_data", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "multiple_states_identify_americas", |
|
"score": 0.5428571428571428, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Knowledge", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "medical_cell_recognition", |
|
"score": 0.42857142857142855, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Science", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "medical_image_artifacts_indentification", |
|
"score": 0.21428571428571427, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Science", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "medical_counting_lymphocytes", |
|
"score": 0.0, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Science", |
|
"output_format": "numerical_data", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "medical_blood_vessels_recognition", |
|
"score": 0.6785714285714286, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Science", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "cultural_vqa", |
|
"score": 0.5333333333333333, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 15, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "character_recognition_in_TV_shows", |
|
"score": 0.7857142857142857, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Knowledge", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "kvqa_knowledge_aware_qa", |
|
"score": 0.0, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 19, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "tqa_textbook_qa", |
|
"score": 0.8571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding", |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Science", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "3d_indoor_scene_text_bbox_selection", |
|
"score": 0.5, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "3D Models and Aerial Imagery", |
|
"app": "Perception", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "app_layout_understanding_notes", |
|
"score": 0.7857142857142857, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation", |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "table_understanding", |
|
"score": 0.5, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning", |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Perception", |
|
"output_format": "numerical_data", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "MMMU_physics_chemistry_MCQ", |
|
"score": 0.8571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Science", |
|
"output_format": "exact_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "arxiv_vqa", |
|
"score": 1.0, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Text Recognition (OCR)", |
|
"Mathematical and Logical Reasoning", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Science", |
|
"output_format": "multiple_choice", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "realworld_qa_en2cn", |
|
"score": 0.8571428571428571, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Information_Extraction", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "handwritten_math_expression_extraction", |
|
"score": 0.5714285714285714, |
|
"eval_type": "rule", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Mathematical and Logical Reasoning" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Perception", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "red_teaming_jailbreak", |
|
"score": 0.8949999999999999, |
|
"eval_type": "llm", |
|
"num_demo": 0, |
|
"num_query": 20, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Ethical and Safety Reasoning" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "red_teaming_celebrity", |
|
"score": 0.9, |
|
"eval_type": "llm", |
|
"num_demo": 0, |
|
"num_query": 20, |
|
"skills": [ |
|
"Commonsense and Social Reasoning", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "red_teaming_politics", |
|
"score": 0.7250000000000002, |
|
"eval_type": "llm", |
|
"num_demo": 0, |
|
"num_query": 20, |
|
"skills": [ |
|
"Scene and Event Understanding", |
|
"Commonsense and Social Reasoning", |
|
"Ethical and Safety Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "red_teaming_racial", |
|
"score": 0.765, |
|
"eval_type": "llm", |
|
"num_demo": 0, |
|
"num_query": 20, |
|
"skills": [ |
|
"Ethical and Safety Reasoning", |
|
"Scene and Event Understanding", |
|
"Object Recognition and Classification" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "red_teaming_visualmisleading", |
|
"score": 0.7578947368421054, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 19, |
|
"skills": [ |
|
"Ethical and Safety Reasoning", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "multi_lingual_Ruozhiba_expalnation_Spanish", |
|
"score": 0.7142857142857144, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "multi_lingual_Ruozhiba_expalnation_English", |
|
"score": 0.7214285714285715, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "multi_lingual_Ruozhiba_expalnation_Russian", |
|
"score": 0.6, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Commonsense and Social Reasoning", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "multi_lingual_Ruozhiba_expalnation_Arabic", |
|
"score": 0.7071428571428572, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "multi_lingual_Ruozhiba_expalnation_Japanese", |
|
"score": 0.65, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "multi_lingual_Ruozhiba_expalnation_French", |
|
"score": 0.6785714285714285, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ascii_art_30", |
|
"score": 0.0, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "contextual_formatted_text", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "table2latex_complex", |
|
"score": 0.6777777777777777, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 9, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Coding", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "red_teaming_captcha", |
|
"score": 0.10000000000000003, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 19, |
|
"skills": [ |
|
"Text Recognition (OCR)" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "meme_explain", |
|
"score": 0.9142857142857145, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Commonsense and Social Reasoning", |
|
"Scene and Event Understanding", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "scibench_w_solution_open_ended", |
|
"score": 0.36200000000000004, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 25, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning", |
|
"Domain-Specific Knowledge and Skills", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Science", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "funny_image_title", |
|
"score": 0.6928571428571428, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "guess_image_generation_prompt", |
|
"score": 0.8315789473684211, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 19, |
|
"skills": [ |
|
"Scene and Event Understanding", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Perception", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "traffic_accident_analysis", |
|
"score": 0.37857142857142856, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "vibe-eval", |
|
"score": 0.6642857142857144, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Ethical and Safety Reasoning", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "electrocardiogram", |
|
"score": 0.3, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Science", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "GUI_Chat_Easy", |
|
"score": 0.7423076923076924, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 26, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "docci_image_description_long", |
|
"score": 0.8428571428571429, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "humor_explanation", |
|
"score": 0.8666666666666668, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 15, |
|
"skills": [ |
|
"Commonsense and Social Reasoning", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "GUI_Chat_Hard", |
|
"score": 0.49354838709677434, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 31, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Information_Extraction", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "tweets_captioning", |
|
"score": 0.6, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Perception", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "image_captioning_with_additional_requirements", |
|
"score": 0.9214285714285716, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding", |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Perception", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "visualization_with_code", |
|
"score": 0.6357142857142858, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Coding", |
|
"output_format": "structured_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "table_understanding_fetaqa", |
|
"score": 0.6785714285714286, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Perception", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "bridge_strategies_worldclass", |
|
"score": 0.2857142857142857, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Planning and Decision Making", |
|
"Mathematical and Logical Reasoning", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Planning", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "bridge_strategies_advanced", |
|
"score": 0.2785714285714286, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Spatial and Temporal Reasoning", |
|
"Planning and Decision Making" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Planning", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "bridge_strategies_expert", |
|
"score": 0.3571428571428572, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 14, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Spatial and Temporal Reasoning", |
|
"Planning and Decision Making" |
|
], |
|
"input_format": "User Interface Screenshots", |
|
"app": "Planning", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "image_humor_understanding", |
|
"score": 0.8620689655172411, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 29, |
|
"skills": [ |
|
"Commonsense and Social Reasoning", |
|
"Scene and Event Understanding", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "graph_interpretation", |
|
"score": 0.8310344827586206, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 29, |
|
"skills": [ |
|
"Mathematical and Logical Reasoning", |
|
"Language Understanding and Generation", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Perception", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "science_figure_explanation", |
|
"score": 0.8793103448275862, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 29, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Scene and Event Understanding", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Perception", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "ocrqa", |
|
"score": 0.8689655172413793, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 29, |
|
"skills": [ |
|
"Text Recognition (OCR)", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Text-Based Images and Documents", |
|
"app": "Perception", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "iq_test", |
|
"score": 0.7310344827586206, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 29, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Mathematical and Logical Reasoning", |
|
"Spatial and Temporal Reasoning" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Planning", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "unusual_images", |
|
"score": 0.9068965517241377, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 29, |
|
"skills": [ |
|
"Scene and Event Understanding", |
|
"Spatial and Temporal Reasoning", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Photographs", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "art_explanation", |
|
"score": 0.6172413793103447, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 29, |
|
"skills": [ |
|
"Language Understanding and Generation", |
|
"Scene and Event Understanding", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "bar_chart_interpretation", |
|
"score": 0.627586206896552, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 29, |
|
"skills": [ |
|
"Object Recognition and Classification", |
|
"Mathematical and Logical Reasoning", |
|
"Language Understanding and Generation" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Perception", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "figurative_speech_explanation", |
|
"score": 0.8310344827586207, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 29, |
|
"skills": [ |
|
"Scene and Event Understanding", |
|
"Language Understanding and Generation", |
|
"Commonsense and Social Reasoning" |
|
], |
|
"input_format": "Artistic and Creative Content", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
}, |
|
{ |
|
"name": "defeasible_reasoning", |
|
"score": 0.8275862068965518, |
|
"eval_type": "llm", |
|
"num_demo": 1, |
|
"num_query": 29, |
|
"skills": [ |
|
"Scene and Event Understanding", |
|
"Language Understanding and Generation", |
|
"Domain-Specific Knowledge and Skills" |
|
], |
|
"input_format": "Diagrams and Data Visualizations", |
|
"app": "Knowledge", |
|
"output_format": "open_ended_output", |
|
"num_input": "1-image" |
|
} |
|
] |