diff --git "a/static/eval_results/Default/Qwen2_VL_7B/task_results.json" "b/static/eval_results/Default/Qwen2_VL_7B/task_results.json" new file mode 100644--- /dev/null +++ "b/static/eval_results/Default/Qwen2_VL_7B/task_results.json" @@ -0,0 +1,7756 @@ +[ + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.3953838788800739, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "science_molecule_chemistry", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signboard_identification", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.026547987444069228, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "funsd_document_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "graph_connectivity", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.6224489795918366, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_transformation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "math_breakpoint", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.5370954442383014, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "av_human_multiview_counting", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "emotion_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.64, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.20833333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.37716272271985773, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.12041002821230626, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.8947368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "quizlet_question_solving", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.7728401943343881, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_maxflow", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.8766233766233765, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.33035714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "animal_pose_estimation", + "score": 0.19897365891504984, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.6093333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "face_keypoint_detection", + "score": 0.7543233207204482, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "long_string_number_recognition", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cheapest_flight_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_theory", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.6444444444444443, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "action_sequence", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "game_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "av_view_identification", + "score": 0.12222222222222223, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.018518518518518517, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_solid", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.9747899159663866, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7726315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.746031746031746, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.6253043759566965, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "landmark_check_two_images", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "figureqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.5065263157894736, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.6086428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.4523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.6285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "code_error_line_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "math_parity", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.78125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.7410714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.9722222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.8421052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_chordless_cycle", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.47619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "geometry_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9400428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.45385714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "single_person_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.6399368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.3928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "iconqa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.7232142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.761904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 0.5982142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.4666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.08888888888888888, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.023809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "flowchart_code_generation", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "recipe_image_ordering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "healthcare_info_judgement", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "Ad_count_detection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "video_grounding_temporal", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.06598639455782314, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "web_action_grounding", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.35939538929969556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "extract_webpage_headline", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "worldle", + "score": 0.2838251878691227, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_camera_motion_description", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.032670025538873985, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.8869047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_sentiment", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_visual_pref", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "image_translation_en2cn", + "score": 0.2756576749919229, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.45, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_retrieval", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.7894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_recommendation", + "score": 0.36434240362811793, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "photoshop_operation", + "score": 0.17261904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "llavaguard", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.13337585034013605, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "vln_hindi_next_step", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_name", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_translation_easy", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ancient_map_understanding", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "paper_review_acceptance", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "orchestra_score_recognition", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "counting", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.6904761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "soccer_offside", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.1845238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.25252525252525254, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "mensa_iq_test", + "score": 0.39166666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.8928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "web_action_prediction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.11383541415414918, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_identify_robot", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "multilingual_news_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0503968253968254, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting_multi_image", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "signage_navigation", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.2261904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "monthly_weather_days_count", + "score": 0.3095238095238096, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ascii_art_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.03296776632380673, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0066666666666666775, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.35, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.7142674593015049, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "number_comparison", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.35294117647058826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "waldo", + "score": 0.000811738675187593, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "product_ocr_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.3714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.08484848484848485, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "rebus", + "score": 0.08695652173913043, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 1.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "crossword_mini_5x5", + "score": 0.07857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6352941176470588, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ocr_math_MATH", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.6904761904761906, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.2787414965986395, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.2790532879818594, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_mask", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_semantics", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "poetry_haiku", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.05, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "docci_image_description_long", + "score": 0.6785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.4310344827586208, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.37999999999999995, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "graph_interpretation", + "score": 0.5620689655172414, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7947368421052632, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.5793103448275861, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.506896551724138, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.4368421052631579, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "humor_explanation", + "score": 0.2666666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video2notes", + "score": 0.43571428571428567, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "activitynetqa", + "score": 0.445, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "figurative_speech_explanation", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.6857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.7142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "art_explanation", + "score": 0.3724137931034482, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.4206896551724137, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.5499999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.33333333333333326, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "defeasible_reasoning", + "score": 0.5517241379310345, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.19285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.7250000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "funny_image_title", + "score": 0.6642857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6500000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.3068965517241378, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.12999999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.3333333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_summary", + "score": 0.5499999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "visualization_with_code", + "score": 0.29285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.6931034482758621, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.28, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.15333333333333335, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.4548387096774194, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.3714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.5571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "generated_video_artifacts", + "score": 0.31249999999999994, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "table2latex_complex", + "score": 0.5333333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.6357142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.23750000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "nextqa_oe", + "score": 0.2899999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_content_follow_up", + "score": 0.4428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "red_teaming_celebrity", + "score": 0.7449999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.865, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.7449999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.8850000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.6399999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.255, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.2642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.35000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.41428571428571426, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.4071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.35000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.22142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.1357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.22857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.1642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.2214285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.1, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.6071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.24285714285714288, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.2071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.13571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file