[
  {
    "name": "graph_maxflow",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "arc_agi",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Planning",
    "output_format": "exact_text",
    "num_input": "4-5 images"
  },
  {
    "name": "long_string_letter_recognition",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "action_prediction",
    "score": 0.5714285714285714,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Scene and Event Understanding"
    ],
    "input_format": "Videos",
    "app": "Perception",
    "output_format": "multiple_choice",
    "num_input": "video"
  },
  {
    "name": "3d_indoor_scene_text_bbox_selection",
    "score": 0.14285714285714285,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "3D Models and Aerial Imagery",
    "app": "Perception",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "photo_sharing_image_retrieval",
    "score": 0.14285714285714285,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Language Understanding and Generation"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "multiple_choice",
    "num_input": "9-image or more"
  },
  {
    "name": "emotion_recognition",
    "score": 0.2857142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Commonsense and Social Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "exact_text",
    "num_input": "6-8 images"
  },
  {
    "name": "code_error_line_identification",
    "score": 0.21428571428571427,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Domain-Specific Knowledge and Skills",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Coding",
    "output_format": "multiple_choice",
    "num_input": "2-3 images"
  },
  {
    "name": "hotel_booking_confirmation_parsing",
    "score": 0.3857142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "brand_logo_recognition_and_elaboration",
    "score": 0.48,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 25,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "3d_indoor_scene_text_bbox_prediction",
    "score": 0.04632755935026561,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "3D Models and Aerial Imagery",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "table_understanding",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Text Recognition (OCR)"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "newspaper_page_parse_and_count",
    "score": 0.3111111111111111,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)",
      "Scene and Event Understanding"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "coco_person_detection",
    "score": 0.08040063592083609,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "graph_hamiltonian_path",
    "score": 0.38244047619047616,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "structured_output",
    "num_input": "6-8 images"
  },
  {
    "name": "weather_info_retrieval",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "contextual_formatted_text",
    "num_input": "9-image or more"
  },
  {
    "name": "cheapest_flight_identification",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "contextual_formatted_text",
    "num_input": "6-8 images"
  },
  {
    "name": "game_info_parsing",
    "score": 0.6688311688311688,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "mvsa_sentiment_classification",
    "score": 0.6428571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Language Understanding and Generation",
      "Commonsense and Social Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "logo2k_same_type_logo_retrieval",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Knowledge",
    "output_format": "structured_output",
    "num_input": "6-8 images"
  },
  {
    "name": "coco_object_detection_by_query_property",
    "score": 0.08458208458208459,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "perception_test_object_shuffle_video",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Scene and Event Understanding"
    ],
    "input_format": "Videos",
    "app": "Planning",
    "output_format": "multiple_choice",
    "num_input": "video"
  },
  {
    "name": "landmark_recognition_and_qa",
    "score": 0.17777777777777778,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Language Understanding and Generation"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "tqa_textbook_qa",
    "score": 0.6428571428571429,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Text Recognition (OCR)"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Science",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "electricity_load_estimate_plot",
    "score": 0.05114285714285715,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "pmc_vqa_medical_image_qa",
    "score": 0.42105263157894735,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Science",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "graph_shortest_path_planar",
    "score": 0.21428571428571427,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "code_output_result",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Mathematical and Logical Reasoning",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Coding",
    "output_format": "exact_text",
    "num_input": "4-5 images"
  },
  {
    "name": "kvqa_knowledge_aware_qa",
    "score": 0.3684210526315789,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Object Recognition and Classification",
      "Language Understanding and Generation",
      "Commonsense and Social Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "video_content_reasoning",
    "score": 0.1111111111111111,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 9,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Videos",
    "app": "Information_Extraction",
    "output_format": "contextual_formatted_text",
    "num_input": "video"
  },
  {
    "name": "stock_price_future_prediction",
    "score": 0.21028571428571435,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "iconqa",
    "score": 0.10526315789473684,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Artistic and Creative Content",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "ti_fused_vqa_biology",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Science",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "code_programming_test_easy",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 24,
    "skills": [
      "Text Recognition (OCR)",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Coding",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "visualdial_visual_dialog_image_guessing",
    "score": 0.2,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Language Understanding and Generation"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "multiple_choice",
    "num_input": "9-image or more"
  },
  {
    "name": "license_plate_recognition",
    "score": 0.2857142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "math_convexity_value_estimation",
    "score": 0.18817731556471845,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Object Recognition and Classification"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "geometry_analytic",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "famous_building_recognition",
    "score": 0.4375,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "geometry_transformation",
    "score": 0.2857142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "human_relationship_reasoning",
    "score": 0.375,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Commonsense and Social Reasoning",
      "Object Recognition and Classification"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "video_to_camera_trajectory_retrieval",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Scene and Event Understanding"
    ],
    "input_format": "Videos",
    "app": "Perception",
    "output_format": "multiple_choice",
    "num_input": "video"
  },
  {
    "name": "algebra",
    "score": 0.21428571428571427,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "nextqa_mc",
    "score": 0.9473684210526315,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Videos",
    "app": "Information_Extraction",
    "output_format": "multiple_choice",
    "num_input": "video"
  },
  {
    "name": "mnist_pattern",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Planning",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "face_identity_matching",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "4-5 images"
  },
  {
    "name": "single_person_pose_estimation",
    "score": 0.1174011354666419,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "movie_info_retrieval",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "contextual_formatted_text",
    "num_input": "9-image or more"
  },
  {
    "name": "funsd_document_qa",
    "score": 0.5714285714285714,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Language Understanding and Generation"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "perception_test_video_character_order",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Videos",
    "app": "Planning",
    "output_format": "contextual_formatted_text",
    "num_input": "video"
  },
  {
    "name": "movie_info_parsing",
    "score": 0.375,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "clevrer_video_moving_object_property_recognition",
    "score": 0.125,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Videos",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "video"
  },
  {
    "name": "science_basic_physics",
    "score": 0.4666666666666667,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Science",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "av_vehicle_multiview_counting",
    "score": 0.13333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "6-8 images"
  },
  {
    "name": "graph_theory",
    "score": 0.14285714285714285,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "ili_ratio_future_prediction",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "nlvr2_two_image_compare_qa",
    "score": 0.5714285714285714,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Language Understanding and Generation"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "multi_load_type_prediction_from_plot",
    "score": 0.4523809523809523,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "multiple_choice",
    "num_input": "6-8 images"
  },
  {
    "name": "chart_vqa",
    "score": 0.14285714285714285,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "stock_info_retrieval",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "contextual_formatted_text",
    "num_input": "9-image or more"
  },
  {
    "name": "newspaper_ocr_in_query_box",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Text Recognition (OCR)",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "quizlet_question_solving",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Science",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "math_breakpoint",
    "score": 0.5333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "coco_ood_global_image_retrieval_by_query_property",
    "score": 0.06547619047619047,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Artistic and Creative Content",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "9-image or more"
  },
  {
    "name": "animal_pose_estimation",
    "score": 0.013860848714248784,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "long_string_number_recognition",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "perception_test_video_action_count",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Scene and Event Understanding",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Videos",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "video"
  },
  {
    "name": "IAM_line_ocr_and_locate",
    "score": 0.10210634994992598,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "structured_output",
    "num_input": "2-3 images"
  },
  {
    "name": "landmark_check_two_images",
    "score": 0.04444444444444444,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Artistic and Creative Content",
    "app": "Knowledge",
    "output_format": "structured_output",
    "num_input": "2-3 images"
  },
  {
    "name": "av_view_identification",
    "score": 0.13333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "6-8 images"
  },
  {
    "name": "traffic_future_prediction_from_line_plot",
    "score": 0.513578947368421,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "music_info_retrieval",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "contextual_formatted_text",
    "num_input": "9-image or more"
  },
  {
    "name": "clevrer_moving_direction_video",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Scene and Event Understanding"
    ],
    "input_format": "Videos",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "video"
  },
  {
    "name": "face_keypoint_detection",
    "score": 0.4592092436351974,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "geometry_descriptive",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "muma_theory_of_mind_social_goal",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Scene and Event Understanding",
      "Commonsense and Social Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Videos",
    "app": "Knowledge",
    "output_format": "contextual_formatted_text",
    "num_input": "video"
  },
  {
    "name": "places365_similar_scene_retrieval",
    "score": 0.21428571428571427,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Scene and Event Understanding",
      "Object Recognition and Classification"
    ],
    "input_format": "Photographs",
    "app": "Information_Extraction",
    "output_format": "multiple_choice",
    "num_input": "9-image or more"
  },
  {
    "name": "graph_hamiltonian_cycle",
    "score": 0.14047619047619048,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "structured_output",
    "num_input": "6-8 images"
  },
  {
    "name": "science_molecule_chemistry",
    "score": 0.5333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Science",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "multilingual_game_info_parsing",
    "score": 0.29464285714285715,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "graph_chordless_cycle",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "muma_theory_of_mind_belief_of_goal",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Scene and Event Understanding",
      "Commonsense and Social Reasoning",
      "Language Understanding and Generation"
    ],
    "input_format": "Videos",
    "app": "Knowledge",
    "output_format": "contextual_formatted_text",
    "num_input": "video"
  },
  {
    "name": "clevrer_video_moving_object_count",
    "score": 0.3333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 21,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Videos",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "video"
  },
  {
    "name": "star_object_interaction_video",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Videos",
    "app": "Information_Extraction",
    "output_format": "contextual_formatted_text",
    "num_input": "video"
  },
  {
    "name": "exchange_rate_estimate_plot",
    "score": 0.7091142857142856,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "vizwiz_quality_accessment_for_blind",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Metrics",
    "output_format": "contextual_formatted_text",
    "num_input": "6-8 images"
  },
  {
    "name": "av_human_multiview_counting",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "6-8 images"
  },
  {
    "name": "humor_understand_caption_match",
    "score": 0.4,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Commonsense and Social Reasoning",
      "Language Understanding and Generation"
    ],
    "input_format": "Artistic and Creative Content",
    "app": "Knowledge",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "vlnqa_egocentric_navigation_video",
    "score": 0.25,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Scene and Event Understanding",
      "Language Understanding and Generation"
    ],
    "input_format": "Videos",
    "app": "Planning",
    "output_format": "contextual_formatted_text",
    "num_input": "video"
  },
  {
    "name": "sta_action_localization_video",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Scene and Event Understanding",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Videos",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "video"
  },
  {
    "name": "question_solution_solving",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Mathematical and Logical Reasoning",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Science",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "places365_scene_type_classification",
    "score": 0.5,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "chess_winner_identification",
    "score": 0.5333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Planning",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "geometry_solid",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "egocentric_spatial_reasoning",
    "score": 0.1111111111111111,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 9,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Object Recognition and Classification"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "electricity_future_prediction_from_table",
    "score": 0.5842105263157894,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "chess_puzzle_single_step",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Planning and Decision Making",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Planning",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "top_rated_hotel_identification",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "vibe_eval_short_phrase",
    "score": 0.14285714285714285,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Commonsense and Social Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "graph_shortest_path_kamada_kawai",
    "score": 0.14285714285714285,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "ti_fused_vqa_chemistry",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Science",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "insect_order_classification",
    "score": 0.06666666666666667,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Photographs",
    "app": "Knowledge",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "ti_fused_vqa_physics",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Science",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "dvqa",
    "score": 0.42105263157894735,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning",
      "Text Recognition (OCR)"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "electricity_plot_future_prediction",
    "score": 0.35777368421052635,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "super_clevr",
    "score": 0.5714285714285714,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "clevrer_object_existence_video",
    "score": 0.6875,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Videos",
    "app": "Perception",
    "output_format": "multiple_choice",
    "num_input": "video"
  },
  {
    "name": "figureqa",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "average_humidity_estimate_plot",
    "score": 0.05999999999999999,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "clevr_arithmetic",
    "score": 0.10526315789473684,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 19,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "2-3 images"
  },
  {
    "name": "math_parity",
    "score": 0.13333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "code_programming_extremely_hard",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 16,
    "skills": [
      "Text Recognition (OCR)",
      "Mathematical and Logical Reasoning",
      "Planning and Decision Making"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Coding",
    "output_format": "structured_output",
    "num_input": "4-5 images"
  },
  {
    "name": "widerface_face_count_and_event_classification",
    "score": 0.42857142857142855,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "code_programming_test_hard",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Mathematical and Logical Reasoning",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Coding",
    "output_format": "structured_output",
    "num_input": "4-5 images"
  },
  {
    "name": "graph_isomorphism",
    "score": 0.5333333333333333,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "code_programming_test_advanced",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 18,
    "skills": [
      "Text Recognition (OCR)",
      "Mathematical and Logical Reasoning",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Coding",
    "output_format": "structured_output",
    "num_input": "2-3 images"
  },
  {
    "name": "snli_ve_visual_entailment",
    "score": 0.8666666666666667,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Scene and Event Understanding",
      "Language Understanding and Generation",
      "Commonsense and Social Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "stock_info_parsing",
    "score": 0.7226890756302522,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "egocentric_analysis_single_image",
    "score": 0.5555555555555556,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 9,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "3d_fragments_understanding",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "3D Models and Aerial Imagery",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "2-3 images"
  },
  {
    "name": "physical_property_reasoning",
    "score": 0.21428571428571427,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "cam_traj_to_video_selection",
    "score": 0.42857142857142855,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Scene and Event Understanding"
    ],
    "input_format": "Videos",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "video"
  },
  {
    "name": "image_style_recognition",
    "score": 0.4,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Artistic and Creative Content",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "geometry_length",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "multilingual_movie_info_parsing",
    "score": 0.2959183673469387,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "deciphering_oracle_bone",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Knowledge",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "action_sequence",
    "score": 0.5714285714285714,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Scene and Event Understanding",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "9-image or more"
  },
  {
    "name": "handwritten_math_expression_extraction",
    "score": 0.42857142857142855,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "youtube_video_info_parsing",
    "score": 0.46428571428571425,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "geometry_area",
    "score": 0.14285714285714285,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "numerical_data",
    "num_input": "1-image"
  },
  {
    "name": "map_diagram_qa",
    "score": 0.35714285714285715,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "weather_info_parsing",
    "score": 0.5714285714285714,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "graph_connectivity",
    "score": 0.11666666666666667,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "ti_fused_vqa_math",
    "score": 0.21428571428571427,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Mathematical and Logical Reasoning",
      "Text Recognition (OCR)",
      "Language Understanding and Generation"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "multiple_choice",
    "num_input": "1-image"
  },
  {
    "name": "game_info_retrieval",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Text Recognition (OCR)",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "contextual_formatted_text",
    "num_input": "9-image or more"
  },
  {
    "name": "av_multicamera_tracking_predict_bbox",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "numerical_data",
    "num_input": "9-image or more"
  },
  {
    "name": "signboard_identification",
    "score": 0.6,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 15,
    "skills": [
      "Text Recognition (OCR)",
      "Language Understanding and Generation"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "TV_show_info_parsing",
    "score": 0.40476190476190477,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification",
      "Language Understanding and Generation"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "waybill_number_sequence_extraction",
    "score": 0.2857142857142857,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Perception",
    "output_format": "contextual_formatted_text",
    "num_input": "1-image"
  },
  {
    "name": "music_info_parsing",
    "score": 0.20535714285714285,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Text Recognition (OCR)",
      "Object Recognition and Classification"
    ],
    "input_format": "User Interface Screenshots",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "video_segments_reordering",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Spatial and Temporal Reasoning",
      "Scene and Event Understanding"
    ],
    "input_format": "Videos",
    "app": "Perception",
    "output_format": "structured_output",
    "num_input": "video"
  },
  {
    "name": "medical_cell_recognition",
    "score": 0.14285714285714285,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Photographs",
    "app": "Science",
    "output_format": "exact_text",
    "num_input": "1-image"
  },
  {
    "name": "weather_map_climate_type_temperature_parsing",
    "score": 0.25,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Text Recognition (OCR)"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Information_Extraction",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "recipe_image_ordering",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Scene and Event Understanding",
      "Spatial and Temporal Reasoning",
      "Language Understanding and Generation"
    ],
    "input_format": "Photographs",
    "app": "Planning",
    "output_format": "multiple_choice",
    "num_input": "6-8 images"
  },
  {
    "name": "music_sheet_sentiment",
    "score": 0.21428571428571427,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Commonsense and Social Reasoning"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Knowledge",
    "output_format": "exact_text",
    "num_input": "4-5 images"
  },
  {
    "name": "waldo",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 18,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Artistic and Creative Content",
    "app": "Perception",
    "output_format": "structured_output",
    "num_input": "2-3 images"
  },
  {
    "name": "google_streetview_direction_understanding",
    "score": 0.5714285714285714,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Scene and Event Understanding",
      "Spatial and Temporal Reasoning"
    ],
    "input_format": "Photographs",
    "app": "Perception",
    "output_format": "exact_text",
    "num_input": "2-3 images"
  },
  {
    "name": "medical_abdomen_MRI_organ_recognition",
    "score": 0.07142857142857142,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Text-Based Images and Documents",
    "app": "Science",
    "output_format": "contextual_formatted_text",
    "num_input": "4-5 images"
  },
  {
    "name": "geometry_reasoning_overlapped_circle",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Spatial and Temporal Reasoning",
      "Mathematical and Logical Reasoning"
    ],
    "input_format": "Diagrams and Data Visualizations",
    "app": "Mathematics",
    "output_format": "structured_output",
    "num_input": "1-image"
  },
  {
    "name": "medical_parasite_detection",
    "score": 0.0,
    "eval_type": "rule",
    "num_demo": 1,
    "num_query": 14,
    "skills": [
      "Object Recognition and Classification",
      "Domain-Specific Knowledge and Skills"
    ],
    "input_format": "Photographs",
    "app": "Science",
    "output_format": "structured_output",
    "num_input": "6-8 images"
  },
{ | |
"name": "medical_blood_vessels_recognition", | |
"score": 0.03571428571428571, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Science", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "photoshop_operation", | |
"score": 0.10714285714285714, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "vln_identify_robot", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Planning", | |
"output_format": "exact_text", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "logical_reasoning_find_odd_one_out", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Mathematical and Logical Reasoning", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "MMMU_physics_chemistry_MCQ", | |
"score": 0.2857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Mathematical and Logical Reasoning", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Science", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "visual_correspondance_in_two_images", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "MMMU_pro_exam_screenshot", | |
"score": 0.12121212121212122, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 99, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Mathematical and Logical Reasoning", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Science", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "vln_tegulu_next_step", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "planning_screenshot_termes", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Planning and Decision Making", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "play_go_capture_stone", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Spatial and Temporal Reasoning", | |
"Planning and Decision Making" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Planning", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "distinguish_ai_generated_image", | |
"score": 0.5263157894736842, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 19, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "code_match_problem", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "exact_text", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "multiview_reasoning_camera_moving", | |
"score": 0.35714285714285715, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Spatial and Temporal Reasoning", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "exact_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "paper_review_rating", | |
"score": 0.764197764824463, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Metrics", | |
"output_format": "numerical_data", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "highest_discount_game_price_identification", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "planning_screenshot_blocksworld", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Planning and Decision Making", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "dish_ingredient_match", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "TV_show_retrieval_by_character", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "contextual_formatted_text", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "interpret_force_perspective_illusion", | |
"score": 0.13333333333333333, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "knowledge_graph_understanding", | |
"score": 0.06666666666666667, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Perception", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "scibench_calculus_wo_solution", | |
"score": 0.02040816326530612, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 49, | |
"skills": [ | |
"Mathematical and Logical Reasoning", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Mathematics", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "Ad_count_detection", | |
"score": 0.21428571428571427, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Perception", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "google_streetview_circle_reasoning", | |
"score": 0.06666666666666667, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Spatial and Temporal Reasoning", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "music_sheet_format_QA", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Knowledge", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "logical_reasoning_2D_views_of_3D_shapes", | |
"score": 0.2857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Spatial and Temporal Reasoning", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "video_eval_visual_pref", | |
"score": 0.5625, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 16, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Videos", | |
"app": "Metrics", | |
"output_format": "multiple_choice", | |
"num_input": "video" | |
}, | |
{ | |
"name": "code_solution_compare", | |
"score": 0.21428571428571427, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Mathematical and Logical Reasoning", | |
"Language Understanding and Generation", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Coding", | |
"output_format": "exact_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "soccer_offside", | |
"score": 0.3333333333333333, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 9, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "signage_navigation", | |
"score": 0.6, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "position_relationship", | |
"score": 0.5333333333333333, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Spatial and Temporal Reasoning", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Perception", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "song_title_identification_from_lyrics", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Knowledge", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "vln_identify_location", | |
"score": 0.030303030303030307, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "worldle", | |
"score": 0.049999999999999996, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "structured_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "booking_web_recommendation", | |
"score": 0.2792517006802721, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "contextual_formatted_text", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "web_action_grounding", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "geometry_reasoning_nested_squares", | |
"score": 0.25, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Mathematics", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "location_vqa", | |
"score": 0.2857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ascii_art_understanding", | |
"score": 0.42857142857142855, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "CLEVRER_physics", | |
"score": 0.3, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 20, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "planning_visual_grippers", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Planning and Decision Making", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "web_action_prediction", | |
"score": 0.2857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "number_comparison", | |
"score": 0.5, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Mathematics", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "logical_reasoning_fit_pattern", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Mathematical and Logical Reasoning", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "code_translation_hard", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Domain-Specific Knowledge and Skills", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "structured_output", | |
"num_input": "6-8 images" | |
}, | |
{ | |
"name": "paper_review_acceptance", | |
"score": 0.5333333333333333, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Metrics", | |
"output_format": "exact_text", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "webpage_code_understanding", | |
"score": 0.5555555555555556, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 9, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Coding", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "rocks_samples_compare", | |
"score": 0.5, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "contextual_formatted_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "product_ocr_qa", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "planning_visual_barman", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Planning and Decision Making", | |
"Spatial and Temporal Reasoning", | |
"Object Recognition and Classification" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "video_action_recognition", | |
"score": 0.10714285714285714, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Videos", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "video" | |
}, | |
{ | |
"name": "semantic_matching_of_two_images", | |
"score": 0.2857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "planning_visual_floortile", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Planning and Decision Making", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "polygon_interior_angles", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Mathematical and Logical Reasoning", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Mathematics", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "knowledge_sign_recognition", | |
"score": 0.1111111111111111, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 9, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "6-8 images" | |
}, | |
{ | |
"name": "extract_webpage_headline", | |
"score": 0.21428571428571427, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Perception", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "chinese_idiom_recognition", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Knowledge", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "planning_screenshot_tyreworld", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Planning and Decision Making", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "medical_keywords_based_retrieval_non_radiology", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Photographs", | |
"app": "Science", | |
"output_format": "exact_text", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "move_pos_to_pos_hanoi_4_pole", | |
"score": 0.0503968253968254, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Spatial and Temporal Reasoning", | |
"Planning and Decision Making", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "2d_image_jigsaw_puzzle_easy", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "relative_depth_of_different_points", | |
"score": 0.2857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Spatial and Temporal Reasoning", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "calendar_schedule_suggestion", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Spatial and Temporal Reasoning", | |
"Planning and Decision Making" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "medical_image_artifacts_indentification", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Photographs", | |
"app": "Science", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "video_intent_recognition", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Videos", | |
"app": "Knowledge", | |
"output_format": "contextual_formatted_text", | |
"num_input": "video" | |
}, | |
{ | |
"name": "GUI_Act_Web_Multi", | |
"score": 0.10714285714285714, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Planning and Decision Making" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "music_sheet_name", | |
"score": 0.06666666666666667, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Knowledge", | |
"output_format": "exact_text", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "code_add_tag", | |
"score": 0.26666666666666666, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "contextual_formatted_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "annoying_word_search", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Planning", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "google_streetview_line_reasoning", | |
"score": 0.2, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "scibench_fundamental_wo_solution", | |
"score": 0.02040816326530612, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 49, | |
"skills": [ | |
"Mathematical and Logical Reasoning", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Science", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "road_map_find_highway_between_two_place", | |
"score": 0.29411764705882354, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 17, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Perception", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "code_retrieval", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "exact_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "chess_sygyzy_endgames", | |
"score": 0.05714285714285715, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Planning and Decision Making", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "video_camera_motion_description", | |
"score": 0.21428571428571427, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Videos", | |
"app": "Perception", | |
"output_format": "exact_text", | |
"num_input": "video" | |
}, | |
{ | |
"name": "media_homepage_profile", | |
"score": 0.0071428571428571435, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "structured_output", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "TRANCE_physics_reasoning_basic", | |
"score": 0.23529411764705882, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 17, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "3D Models and Aerial Imagery", | |
"app": "Perception", | |
"output_format": "exact_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "TRANCE_physics_reasoning_view", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "3D Models and Aerial Imagery", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "orchestra_score_recognition", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Knowledge", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "cultural_vqa", | |
"score": 0.26666666666666666, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "medical_retrieval_given_surgeon_activity", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Videos", | |
"app": "Science", | |
"output_format": "multiple_choice", | |
"num_input": "video" | |
}, | |
{ | |
"name": "monthly_weather_days_count", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "remaining_playback_time_calculation", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "geometry_reasoning_count_line_intersections", | |
"score": 0.2857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Mathematical and Logical Reasoning", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Mathematics", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "Movie_retrieval_by_actor", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "contextual_formatted_text", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "google_streetview_line_sorting", | |
"score": 0.2, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "GUI_Act_Web_Single", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Planning and Decision Making" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "Bongard_Problem", | |
"score": 0.8157894736842105, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 19, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "medical_polyp_segmentation_single_object_rater", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Science", | |
"output_format": "structured_output", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "video_eval_factual_pref", | |
"score": 0.35714285714285715, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Commonsense and Social Reasoning", | |
"Ethical and Safety Reasoning" | |
], | |
"input_format": "Videos", | |
"app": "Metrics", | |
"output_format": "multiple_choice", | |
"num_input": "video" | |
}, | |
{ | |
"name": "game_platform_support_identification", | |
"score": 0.03571428571428571, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Object Recognition and Classification" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "mahjong", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Planning", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "geometry_reasoning_grid", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Spatial and Temporal Reasoning", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Mathematics", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ishihara_test", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Knowledge", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "paper_vqa", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Perception", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "transit_map_intersection_points", | |
"score": 0.017857142857142856, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "code_translation_advanced", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Domain-Specific Knowledge and Skills", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "structured_output", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "Forensic_Detection_of_different_images", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "top_video_creator_identification", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Scene and Event Understanding" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "realworld_qa_en2cn", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Information_Extraction", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "geometry_reasoning_circled_letter", | |
"score": 0.17857142857142858, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "logical_reasoning_2d_folding", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Spatial and Temporal Reasoning", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "rebus", | |
"score": 0.043478260869565216, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 23, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Planning", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "code_translation_Python", | |
"score": 0.0625, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 16, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Domain-Specific Knowledge and Skills", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "structured_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "recover_masked_word_in_figure", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Perception", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "medical_content_based_retrieval_radiology", | |
"score": 0.21428571428571427, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Science", | |
"output_format": "multiple_choice", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "sign_language", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Videos", | |
"app": "Knowledge", | |
"output_format": "contextual_formatted_text", | |
"num_input": "video" | |
}, | |
{ | |
"name": "planning_screenshot_grippers", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Planning and Decision Making", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "music_sheet_author", | |
"score": 0.1875, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 16, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Knowledge", | |
"output_format": "exact_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "relative_reflectance_of_different_regions", | |
"score": 0.5, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "planning_screenshot_floortile", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Planning and Decision Making", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "font_recognition", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Knowledge", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "TRANCE_physics_reasoning_event", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "3D Models and Aerial Imagery", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "media_recommend_solutions_stackoverflow", | |
"score": 0.35714285714285715, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Coding", | |
"output_format": "multiple_choice", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "topological_sort", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Mathematical and Logical Reasoning", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Mathematics", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "entertainment_web_game_style", | |
"score": 0.7857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Scene and Event Understanding" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "planning_visual_termes", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Planning and Decision Making", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "comic_page_ordering", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Planning", | |
"output_format": "contextual_formatted_text", | |
"num_input": "6-8 images" | |
}, | |
{ | |
"name": "planning_visual_storage", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Planning and Decision Making", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "counting_multi_image", | |
"score": 0.35714285714285715, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "numerical_data", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "code_translation_easy", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "structured_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "mensa_iq_test", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 17, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "image_translation_en2cn", | |
"score": 0.1615633519949754, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 9, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Information_Extraction", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "GUI_Act_Mobile_tap", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "actor_recognition_in_Movie", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Knowledge", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "planning_screenshot_barman", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Planning and Decision Making", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "code_execution", | |
"score": 0.0625, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 16, | |
"skills": [ | |
"Mathematical and Logical Reasoning", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "medical_abdomen_endscopy_organ_recognition", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Photographs", | |
"app": "Science", | |
"output_format": "contextual_formatted_text", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "hashtag_recommendation", | |
"score": 0.7857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "multilingual_news_qa", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "Photographs", | |
"app": "Information_Extraction", | |
"output_format": "contextual_formatted_text", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "LaTeX_complex_formula_convertion", | |
"score": 0.11764705882352941, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 17, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "vln_hindi_next_step", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Planning", | |
"output_format": "contextual_formatted_text", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "planning_visual_blocksworld", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Planning and Decision Making", | |
"Spatial and Temporal Reasoning", | |
"Object Recognition and Classification" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "video_grounding_spatial", | |
"score": 0.21428571428571427, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Videos", | |
"app": "Perception", | |
"output_format": "contextual_formatted_text", | |
"num_input": "video" | |
}, | |
{ | |
"name": "music_sheet_note_count", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 17, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Knowledge", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "counting", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Perception", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "GUI_Act_Mobile_swipe", | |
"score": 0.24841274279293252, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Spatial and Temporal Reasoning", | |
"Planning and Decision Making" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "icon_arithmetic_puzzle", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "google_streetview_circle_sorting", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "ancient_map_understanding", | |
"score": 0.42857142857142855, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Knowledge", | |
"output_format": "exact_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "llavaguard", | |
"score": 0.39285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Ethical and Safety Reasoning", | |
"Scene and Event Understanding", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "circuit_diagram_understanding", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Mathematical and Logical Reasoning", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Science", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "media_QA_web_stackoverflow", | |
"score": 0.3095238095238095, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Domain-Specific Knowledge and Skills", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Coding", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "painting_QA", | |
"score": 0.35714285714285715, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Knowledge", | |
"output_format": "exact_text", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "character_recognition_in_TV_shows", | |
"score": 0.2857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Knowledge", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "video_eval_dynamic_pref", | |
"score": 0.375, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 16, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Videos", | |
"app": "Metrics", | |
"output_format": "multiple_choice", | |
"num_input": "video" | |
}, | |
{ | |
"name": "booking_web_rating", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Scene and Event Understanding", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "structured_output", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "functionality_matching_in_different_objects", | |
"score": 0.2857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "rocks_samples_identify", | |
"score": 0.06666666666666667, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "contextual_formatted_text", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "chess_find_legal_moves", | |
"score": 0.033620994446927885, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning", | |
"Planning and Decision Making" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "mindmap_elements_parsing", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "flowchart_code_generation", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 9, | |
"skills": [ | |
"Mathematical and Logical Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Coding", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "pokemon_3D_recognition", | |
"score": 0.06666666666666667, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "3D Models and Aerial Imagery", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "6-8 images" | |
}, | |
{ | |
"name": "video_grounding_temporal", | |
"score": 0.06666666666666667, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Videos", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "video" | |
}, | |
{ | |
"name": "medical_counting_lymphocytes", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Science", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "vln_english_next_step", | |
"score": 0.06666666666666667, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Planning", | |
"output_format": "contextual_formatted_text", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "code_visualization_output_understanding", | |
"score": 0.1, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 10, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Coding", | |
"output_format": "multiple_choice", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "planning_screenshot_storage", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Planning and Decision Making", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "autonomous_driving_scene_analysis", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "arxiv_vqa", | |
"score": 0.35714285714285715, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Mathematical and Logical Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Science", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "healthcare_info_judgement", | |
"score": 0.8571428571428571, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Domain-Specific Knowledge and Skills", | |
"Ethical and Safety Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Science", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "medical_multi_organ_segmentation_rater", | |
"score": 0.35714285714285715, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Science", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "multiple_states_identify_asia", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Knowledge", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "multiple_states_identify_europe", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Knowledge", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "multiple_states_identify_africa", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Knowledge", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "multiple_states_identify_americas", | |
"score": 0.028571428571428574, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Knowledge", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "symbolic_graphics_programs_computer_aided_design", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Mathematical and Logical Reasoning", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "symbolic_graphics_programs_scalable_vector_graphics", | |
"score": 0.16666666666666666, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 18, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "cvbench_adapted_cvbench_depth", | |
"score": 0.6428571428571429, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "cvbench_adapted_cvbench_relation", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "cvbench_adapted_cvbench_count", | |
"score": 0.7142857142857143, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "cvbench_adapted_cvbench_distance", | |
"score": 0.6428571428571429, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "geographic_remote_sensing_land_cover", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "3D Models and Aerial Imagery", | |
"app": "Perception", | |
"output_format": "contextual_formatted_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "visual_prediction_rater_semantic_segmentation", | |
"score": 0.0625, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 16, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "visual_prediction_rater_surface_normal_estimation", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "visual_prediction_rater_novel_view_synthesis", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "3D Models and Aerial Imagery", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "visual_prediction_rater_openable_part_segmentation", | |
"score": 0.21428571428571427, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "visual_prediction_rater_depth_estimation", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "visual_prediction_rater_3d_assembled_quality_understanding", | |
"score": 0.2857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "3D Models and Aerial Imagery", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "visual_prediction_rater_panoptic_segmentation", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "visual_prediction_rater_plane_segmentation", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "ocr_math_TheoremQA", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Mathematics", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ocr_math_MATH", | |
"score": 0.06666666666666667, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Mathematics", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "research_website_parsing_blogpost", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Information_Extraction", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "research_website_parsing_homepage", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Information_Extraction", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "research_website_parsing_publication", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Information_Extraction", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "reward_models_T2I_reward", | |
"score": 0.5, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Metrics", | |
"output_format": "exact_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "reward_models_I2T_reward", | |
"score": 0.35714285714285715, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Metrics", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "poetry_custom_rhyming_scheme", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "poetry_haiku", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "poetry_limerick", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "poetry_acrostic", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "poetry_acrostic_alliteration", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Object Recognition and Classification" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "poetry_shakespearean_sonnet", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Object Recognition and Classification" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "poetry_petrarchian_sonnet_optional_meter", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Object Recognition and Classification" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "maze_2d_8x8", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Spatial and Temporal Reasoning", | |
"Planning and Decision Making" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ocr_resume_school_plain", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Information_Extraction", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ocr_math_text_latex", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Mathematical and Logical Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ocr_table_to_csv", | |
"score": 0.21428571428571427, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ocr_math_equation", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ocr_article_journal", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Perception", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ocr_resume_skill_plain", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Perception", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ocr_table_to_latex", | |
"score": 0.21428571428571427, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Coding", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ocr_table_to_html", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ocr_table_to_markdown", | |
"score": 0.21428571428571427, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Coding", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ocr_resume_employer_plain", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Perception", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ocr_resume_experience_plain", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Perception", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ocr_article_authors", | |
"score": 0.39285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Information_Extraction", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "panel_images_multi_question", | |
"score": 0.40476190476190477, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "panel_images_single_question", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ball_cup_swap_3", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "app_layout_understanding_excel", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "app_layout_understanding_zoom", | |
"score": 0.06666666666666667, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "app_layout_understanding_ppt", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "app_layout_understanding_twitter", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "app_layout_understanding_youtube", | |
"score": 0.2857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "app_layout_understanding_alipay", | |
"score": 0.058823529411764705, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 17, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "app_layout_understanding_instagram", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "app_layout_understanding_notes", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "app_layout_understanding_word", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "app_layout_understanding_leetcode", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "app_layout_understanding_amazon", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Scene and Event Understanding" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "app_layout_understanding_iphone_settings", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "app_layout_understanding_tiktok", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "crossword_mini_5x5", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "chess_puzzles_equality", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Spatial and Temporal Reasoning", | |
"Planning and Decision Making", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "chess_puzzles_crushing", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Planning and Decision Making", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "chess_puzzles_checkmate", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Spatial and Temporal Reasoning", | |
"Planning and Decision Making", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "constrained_generation_contain_contain_images", | |
"score": 0.26666666666666666, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "constrained_generation_contain_contain_length", | |
"score": 0.6, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "constrained_generation_multi_contain_repeat_position_only_length", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "constrained_generation_multi_contain_position_only", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "constrained_generation_multi_contain_repeat", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Object Recognition and Classification" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "constrained_generation_contain_position_images", | |
"score": 0.13333333333333333, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "constrained_generation_xor_images", | |
"score": 0.5333333333333333, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "constrained_generation_contain_position_length", | |
"score": 0.3333333333333333, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "constrained_generation_contain_repeat_length", | |
"score": 0.26666666666666666, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "constrained_generation_contain_length", | |
"score": 0.26666666666666666, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "autorater_unmask", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Metrics", | |
"output_format": "exact_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "autorater_mask", | |
"score": 0.6428571428571429, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Metrics", | |
"output_format": "exact_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "autorater_control", | |
"score": 0.6428571428571429, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Metrics", | |
"output_format": "exact_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "autorater_aesthetics", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Metrics", | |
"output_format": "exact_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "autorater_subject", | |
"score": 0.35714285714285715, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Metrics", | |
"output_format": "exact_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "autorater_semantics", | |
"score": 0.35714285714285715, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Metrics", | |
"output_format": "exact_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "autorater_motion_guided_editing", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Metrics", | |
"output_format": "multiple_choice", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "autorater_artifact_reason", | |
"score": 0.6, | |
"eval_type": "rule", | |
"num_demo": 0, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Metrics", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "autorater_artifact", | |
"score": 0.2857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Metrics", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "autorater_3d_model_texturing", | |
"score": 0.6428571428571429, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "3D Models and Aerial Imagery", | |
"app": "Metrics", | |
"output_format": "contextual_formatted_text", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "shape_composition_colours", | |
"score": 0.1304421768707483, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "shape_composition_shapes", | |
"score": 0.10374149659863945, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Perception", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "video_motion_matching_3D_real", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Videos", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "video" | |
}, | |
{ | |
"name": "video_motion_matching_real_3D", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Videos", | |
"app": "Perception", | |
"output_format": "multiple_choice", | |
"num_input": "video" | |
}, | |
{ | |
"name": "memorization_chinese_celebrity", | |
"score": 0.21428571428571427, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "memorization_famous_treaty", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Knowledge", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "memorization_indian_celebrity", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "memorization_papers", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Knowledge", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "MFC_Bench_check_face_swap", | |
"score": 0.5, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "MFC_Bench_check_clip_stable_diffusion_generate", | |
"score": 0.5714285714285714, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "MFC_Bench_check_background_change", | |
"score": 0.5, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Object Recognition and Classification" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "MFC_Bench_check_face_attribute_edit", | |
"score": 0.5, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "MFC_Bench_check_veracity", | |
"score": 0.7857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "MFC_Bench_check_text_entity_replace", | |
"score": 0.5, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "MFC_Bench_check_text_style", | |
"score": 0.5714285714285714, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "MFC_Bench_check_out_of_context", | |
"score": 0.5, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "pictionary_genai_output_chinese", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Planning", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "pictionary_cartoon_drawing_guess", | |
"score": 0.2857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Planning", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "pictionary_skribbl_io", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 20, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Planning", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "pictionary_doodle_guess", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Planning", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "pictionary_chinese_food_img2en", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Planning", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "counterfactual_arithmetic", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Mathematics", | |
"output_format": "numerical_data", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "table_understanding_fact_verification", | |
"score": 0.41666666666666663, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Mathematical and Logical Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Perception", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "table_understanding_complex_question_answering", | |
"score": 0.21428571428571427, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Mathematical and Logical Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Perception", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "number_puzzle_kakuro_5x5", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Mathematical and Logical Reasoning", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "exact_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "number_puzzle_sudoku", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Mathematical and Logical Reasoning", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "MMSoc_HatefulMemes", | |
"score": 0.7857142857142857, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Ethical and Safety Reasoning", | |
"Commonsense and Social Reasoning", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "MMSoc_Misinformation_PolitiFact", | |
"score": 0.5714285714285714, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Commonsense and Social Reasoning", | |
"Language Understanding and Generation", | |
"Ethical and Safety Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "MMSoc_Misinformation_GossipCop", | |
"score": 0.35714285714285715, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Commonsense and Social Reasoning", | |
"Language Understanding and Generation", | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Ethical and Safety Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "multiple_choice", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "MMSoc_Memotion", | |
"score": 0.22352941176470592, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 17, | |
"skills": [ | |
"Commonsense and Social Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Knowledge", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "app_interactive_operations_instagram", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "app_interactive_operations_leetcode", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Planning and Decision Making" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "app_interactive_operations_alipay", | |
"score": 0.11764705882352941, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 17, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Planning and Decision Making" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "app_interactive_operations_iphone_settings", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Planning and Decision Making" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "app_interactive_operations_zoom", | |
"score": 0.06666666666666667, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Planning and Decision Making" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "app_interactive_operations_ppt", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "app_interactive_operations_notes", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Planning and Decision Making" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "app_interactive_operations_excel", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "app_interactive_operations_youtube", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "app_interactive_operations_amazon", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "app_interactive_operations_twitter", | |
"score": 0.14285714285714285, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "app_interactive_operations_tiktok", | |
"score": 0.07142857142857142, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "app_interactive_operations_word", | |
"score": 0.0, | |
"eval_type": "rule", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Planning and Decision Making" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "multiple_choice", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "ascii_art_30", | |
"score": 0.21428571428571427, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "contextual_formatted_text", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "generated_video_artifacts", | |
"score": 0.11250000000000002, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 16, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Videos", | |
"app": "Metrics", | |
"output_format": "open_ended_output", | |
"num_input": "video" | |
}, | |
{ | |
"name": "electrocardiogram", | |
"score": 0.2571428571428572, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Science", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "video_detail_description", | |
"score": 0.2210526315789474, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 19, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Videos", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "video" | |
}, | |
{ | |
"name": "doc_vqa", | |
"score": 0.6437499999999999, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 16, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "video2notes", | |
"score": 0.05, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Language Understanding and Generation", | |
"Object Recognition and Classification" | |
], | |
"input_format": "Videos", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "video" | |
}, | |
{ | |
"name": "guess_image_generation_prompt", | |
"score": 0.6, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 19, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "wikihow_complex_task_completion", | |
"score": 0.08888888888888888, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 9, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Planning", | |
"output_format": "open_ended_output", | |
"num_input": "9-image or more" | |
}, | |
{ | |
"name": "video_qa", | |
"score": 0.5285714285714286, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Videos", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "video" | |
}, | |
{ | |
"name": "sceneqa_scene_transition_video", | |
"score": 0.3071428571428571, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Videos", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "video" | |
}, | |
{ | |
"name": "table2latex_complex", | |
"score": 0.3222222222222222, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 9, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Coding", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "defeasible_reasoning", | |
"score": 0.3620689655172413, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 29, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Language Understanding and Generation", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "meme_explain", | |
"score": 0.14285714285714288, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Commonsense and Social Reasoning", | |
"Scene and Event Understanding", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "funny_image_title", | |
"score": 0.39285714285714285, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "vibe-eval", | |
"score": 0.35, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Ethical and Safety Reasoning", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "image_captioning_with_additional_requirements", | |
"score": 0.4214285714285714, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "video_content_follow_up", | |
"score": 0.03571428571428571, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Language Understanding and Generation", | |
"Planning and Decision Making" | |
], | |
"input_format": "Videos", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "video" | |
}, | |
{ | |
"name": "tweets_captioning", | |
"score": 0.2285714285714286, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "visualization_with_code", | |
"score": 0.028571428571428574, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Mathematical and Logical Reasoning", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Coding", | |
"output_format": "structured_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "funqa_unexpected_action_humor_video", | |
"score": 0.21333333333333337, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Videos", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "video" | |
}, | |
{ | |
"name": "unusual_images", | |
"score": 0.26896551724137924, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 29, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "GUI_Chat_Easy", | |
"score": 0.6346153846153848, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 26, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "bar_chart_interpretation", | |
"score": 0.17241379310344832, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 29, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Mathematical and Logical Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "iq_test", | |
"score": 0.07931034482758621, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 29, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Mathematical and Logical Reasoning", | |
"Spatial and Temporal Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "figurative_speech_explanation", | |
"score": 0.3, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 29, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "video_short_title", | |
"score": 0.4285714285714285, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Scene and Event Understanding" | |
], | |
"input_format": "Videos", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "video" | |
}, | |
{ | |
"name": "funqa_unexpected_action_creative_video", | |
"score": 0.21333333333333335, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Videos", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "video" | |
}, | |
{ | |
"name": "docci_image_description_long", | |
"score": 0.6285714285714287, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "traffic_accident_analysis", | |
"score": 0.10714285714285716, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "scibench_w_solution_open_ended", | |
"score": 0.07600000000000001, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 25, | |
"skills": [ | |
"Mathematical and Logical Reasoning", | |
"Domain-Specific Knowledge and Skills", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Science", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "ocrqa", | |
"score": 0.41379310344827597, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 29, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "paper_review_writing", | |
"score": 0.22, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Metrics", | |
"output_format": "open_ended_output", | |
"num_input": "4-5 images" | |
}, | |
{ | |
"name": "nextqa_oe", | |
"score": 0.24736842105263163, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 19, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Videos", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "video" | |
}, | |
{ | |
"name": "image_humor_understanding", | |
"score": 0.38965517241379305, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 29, | |
"skills": [ | |
"Commonsense and Social Reasoning", | |
"Scene and Event Understanding", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "graph_interpretation", | |
"score": 0.20344827586206896, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 29, | |
"skills": [ | |
"Mathematical and Logical Reasoning", | |
"Language Understanding and Generation", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "activitynetqa", | |
"score": 0.4526315789473684, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 19, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Object Recognition and Classification" | |
], | |
"input_format": "Videos", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "video" | |
}, | |
{ | |
"name": "humor_explanation", | |
"score": 0.28, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Commonsense and Social Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "funqa_unexpected_action_magic_video", | |
"score": 0.3133333333333334, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 15, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Spatial and Temporal Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Videos", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "video" | |
}, | |
{ | |
"name": "GUI_Chat_Hard", | |
"score": 0.48064516129032264, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 31, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "video_summary", | |
"score": 0.1642857142857143, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Videos", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "video" | |
}, | |
{ | |
"name": "science_figure_explanation", | |
"score": 0.12068965517241381, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 29, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Scene and Event Understanding", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "art_explanation", | |
"score": 0.18620689655172415, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 29, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Scene and Event Understanding", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "multi_lingual_manual_explanation_scooter_Spanish", | |
"score": 0.042857142857142864, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation", | |
"Scene and Event Understanding", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "6-8 images" | |
}, | |
{ | |
"name": "multi_lingual_manual_explanation_scooter_Chinese", | |
"score": 0.08571428571428572, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning", | |
"Ethical and Safety Reasoning", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "6-8 images" | |
}, | |
{ | |
"name": "multi_lingual_manual_explanation_scooter_Russian", | |
"score": 0.0, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation", | |
"Ethical and Safety Reasoning" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "6-8 images" | |
}, | |
{ | |
"name": "multi_lingual_manual_explanation_scooter_Arabic", | |
"score": 0.03571428571428571, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Language Understanding and Generation", | |
"Ethical and Safety Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "6-8 images" | |
}, | |
{ | |
"name": "multi_lingual_manual_explanation_scooter_French", | |
"score": 0.014285714285714287, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation", | |
"Ethical and Safety Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Information_Extraction", | |
"output_format": "open_ended_output", | |
"num_input": "6-8 images" | |
}, | |
{ | |
"name": "multi_lingual_Ruozhiba_expalnation_Russian", | |
"score": 0.021428571428571432, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Commonsense and Social Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "multi_lingual_Ruozhiba_expalnation_English", | |
"score": 0.06428571428571428, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "multi_lingual_Ruozhiba_expalnation_Arabic", | |
"score": 0.028571428571428574, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "multi_lingual_Ruozhiba_expalnation_French", | |
"score": 0.09999999999999999, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "multi_lingual_Ruozhiba_expalnation_Spanish", | |
"score": 0.014285714285714287, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "multi_lingual_Ruozhiba_expalnation_Japanese", | |
"score": 0.021428571428571432, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Language Understanding and Generation", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "table_understanding_fetaqa", | |
"score": 0.44285714285714295, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Perception", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "red_teaming_visual_order_B", | |
"score": 0.6631578947368422, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 19, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "red_teaming_captcha", | |
"score": 0.11052631578947371, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 19, | |
"skills": [ | |
"Text Recognition (OCR)" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "red_teaming_jailbreak", | |
"score": 0.51, | |
"eval_type": "llm", | |
"num_demo": 0, | |
"num_query": 20, | |
"skills": [ | |
"Text Recognition (OCR)", | |
"Ethical and Safety Reasoning" | |
], | |
"input_format": "Text-Based Images and Documents", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "red_teaming_racial", | |
"score": 0.73, | |
"eval_type": "llm", | |
"num_demo": 0, | |
"num_query": 20, | |
"skills": [ | |
"Ethical and Safety Reasoning", | |
"Scene and Event Understanding", | |
"Object Recognition and Classification" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "red_teaming_visual_order_A", | |
"score": 0.7157894736842106, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 19, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "2-3 images" | |
}, | |
{ | |
"name": "red_teaming_celebrity", | |
"score": 0.7, | |
"eval_type": "llm", | |
"num_demo": 0, | |
"num_query": 20, | |
"skills": [ | |
"Commonsense and Social Reasoning", | |
"Language Understanding and Generation" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "red_teaming_politics", | |
"score": 0.615, | |
"eval_type": "llm", | |
"num_demo": 0, | |
"num_query": 20, | |
"skills": [ | |
"Scene and Event Understanding", | |
"Commonsense and Social Reasoning", | |
"Ethical and Safety Reasoning" | |
], | |
"input_format": "Photographs", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "red_teaming_visualmisleading", | |
"score": 0.7105263157894738, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 19, | |
"skills": [ | |
"Ethical and Safety Reasoning", | |
"Commonsense and Social Reasoning" | |
], | |
"input_format": "Artistic and Creative Content", | |
"app": "Knowledge", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "bridge_strategies_advanced", | |
"score": 0.11428571428571431, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning", | |
"Planning and Decision Making" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "bridge_strategies_worldclass", | |
"score": 0.021428571428571432, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Planning and Decision Making", | |
"Mathematical and Logical Reasoning", | |
"Domain-Specific Knowledge and Skills" | |
], | |
"input_format": "Diagrams and Data Visualizations", | |
"app": "Planning", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
}, | |
{ | |
"name": "bridge_strategies_expert", | |
"score": 0.13571428571428573, | |
"eval_type": "llm", | |
"num_demo": 1, | |
"num_query": 14, | |
"skills": [ | |
"Object Recognition and Classification", | |
"Spatial and Temporal Reasoning", | |
"Planning and Decision Making" | |
], | |
"input_format": "User Interface Screenshots", | |
"app": "Planning", | |
"output_format": "open_ended_output", | |
"num_input": "1-image" | |
} | |
] |
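
Every record in the array above follows the same flat schema: name, score (a per-task mean in [0, 1]), eval_type ("rule" or "llm"), num_demo, num_query, skills, input_format, app, output_format, and num_input. As a minimal sketch of how this dump can be consumed, the Python snippet below loads the array and reports macro- and micro-averaged scores plus per-group breakdowns; the filename "scores.json" is an assumption, since the dump's on-disk name is not shown here.

import json
from collections import defaultdict

# Assumed filename: the JSON array above saved verbatim as "scores.json".
with open("scores.json", encoding="utf-8") as f:
    tasks = json.load(f)

# Macro average: every task counts equally, regardless of size.
macro = sum(t["score"] for t in tasks) / len(tasks)

# Micro average: weight each task by its number of query examples.
total_queries = sum(t["num_query"] for t in tasks)
micro = sum(t["score"] * t["num_query"] for t in tasks) / total_queries

print(f"tasks: {len(tasks)}  macro: {macro:.4f}  micro: {micro:.4f}")

# Unweighted mean score per application category and per eval_type.
groups = defaultdict(list)
for t in tasks:
    groups[("app", t["app"])].append(t["score"])
    groups[("eval_type", t["eval_type"])].append(t["score"])

for (kind, key), scores in sorted(groups.items()):
    mean = sum(scores) / len(scores)
    print(f"{kind:>9}  {key:<24} {mean:.4f}  ({len(scores)} tasks)")

The macro/micro distinction matters here because num_query varies from 9 to 31 across tasks, so the two averages can diverge noticeably.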