Spaces:
Running
Running
Update doc info and table style
Browse files
- constants.py +4 -4
- static/css/style.css +11 -0
- utils.py +7 -7
constants.py
CHANGED
@@ -28,10 +28,10 @@ We aim to provide cost-effective and accurate evaluation for multimodal models,
|
|
28 |
|
29 |
## Results & Takeaways from Evaluating Top Models
|
30 |
|
31 |
-
-
|
32 |
-
-
|
33 |
-
- Chain-of-Thought (CoT) improves proprietary models but has limited impact on open-source models
|
34 |
-
-
|
35 |
- Many open-source models face challenges in adhering to output format instructions
|
36 |
|
37 |
## 🎯 Interactive Visualization
|
|
|
28 |
|
29 |
## Results & Takeaways from Evaluating Top Models
|
30 |
|
31 |
+
- GPT-4o (0513) and Claude 3.5 Sonnet (1022) lead the benchmark. Claude 3.5 Sonnet (1022) improves over Claude 3.5 Sonnet (0622) obviously in planning tasks (application dimension) and UI/Infographics inputs (input format dimension).
|
32 |
+
- Qwen2-VL stands out among open-source models, and its flagship model gets close to some proprietary flagship models
|
33 |
+
- Chain-of-Thought (CoT) prompting improves proprietary models but has limited impact on open-source models
|
34 |
+
- Gemini 1.5 Flash performs the best among all the evaluated efficiency models, but struggles with UI and document tasks
|
35 |
- Many open-source models face challenges in adhering to output format instructions
|
36 |
|
37 |
## 🎯 Interactive Visualization
|
static/css/style.css
CHANGED
@@ -45,3 +45,14 @@
|
|
45 |
margin-top: 10px;
|
46 |
color: var(--text-color);
|
47 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
margin-top: 10px;
|
46 |
color: var(--text-color);
|
47 |
}
|
48 |
+
|
49 |
+
.custom-dataframe td:first-child {
|
50 |
+
min-width: 220px !important; /* Adjust minimum width for model names */
|
51 |
+
white-space: nowrap !important; /* Prevent text wrapping */
|
52 |
+
}
|
53 |
+
|
54 |
+
.custom-dataframe a {
|
55 |
+
text-decoration: none;
|
56 |
+
color: #2196F3;
|
57 |
+
white-space: nowrap !important;
|
58 |
+
}
|
utils.py
CHANGED
@@ -241,10 +241,10 @@ class DefaultDataLoader(BaseDataLoader):
|
|
241 |
# Define headers with task counts
|
242 |
column_headers = {
|
243 |
"Models": "Models",
|
244 |
-
"Overall": f"Overall
|
245 |
-
"Core w/o CoT": f"Core
|
246 |
-
"Core w/ CoT": f"Core
|
247 |
-
"Open-ended": f"Open-ended
|
248 |
}
|
249 |
|
250 |
# Rename the columns in DataFrame to match headers
|
@@ -317,9 +317,9 @@ class SingleImageDataLoader(BaseDataLoader):
|
|
317 |
# Define headers with task counts
|
318 |
column_headers = {
|
319 |
"Models": "Models",
|
320 |
-
"Overall": f"Overall
|
321 |
-
"Core": f"Core
|
322 |
-
"Open-ended": f"Open-ended
|
323 |
}
|
324 |
|
325 |
# Rename the columns in DataFrame to match headers
|
|
|
241 |
# Define headers with task counts
|
242 |
column_headers = {
|
243 |
"Models": "Models",
|
244 |
+
"Overall": f"Overall({total_tasks})",
|
245 |
+
"Core w/o CoT": f"Core w/o CoT({total_core_tasks})",
|
246 |
+
"Core w/ CoT": f"Core w/ CoT({total_core_tasks})",
|
247 |
+
"Open-ended": f"Open-ended({total_open_tasks})"
|
248 |
}
|
249 |
|
250 |
# Rename the columns in DataFrame to match headers
|
|
|
317 |
# Define headers with task counts
|
318 |
column_headers = {
|
319 |
"Models": "Models",
|
320 |
+
"Overall": f"Overall({total_tasks})",
|
321 |
+
"Core": f"Core({total_core_tasks})",
|
322 |
+
"Open-ended": f"Open-ended({total_open_tasks})"
|
323 |
}
|
324 |
|
325 |
# Rename the columns in DataFrame to match headers
|