cccjc committed on
Commit
44b6d4e
·
1 Parent(s): 4301eca

Update doc info and table style

Browse files
Files changed (3) hide show
  1. constants.py +4 -4
  2. static/css/style.css +11 -0
  3. utils.py +7 -7
constants.py CHANGED
@@ -28,10 +28,10 @@ We aim to provide cost-effective and accurate evaluation for multimodal models,
28
 
29
  ## 📊🔍 Results & Takeaways from Evaluating Top Models
30
 
31
- - GPT4o leads the benchmark, outperforming others by 3.5% over Claude3.5
32
- - Qwen2VL stands out among open-source models, nearing flagship-level performance
33
- - Chain-of-Thought (CoT) improves proprietary models but has limited impact on open-source models
34
- - Efficiency models like Gemini 1.5 Flash perform well but struggle with UI and document tasks
35
  - Many open-source models face challenges in adhering to output format instructions
36
 
37
  ## 🎯 Interactive Visualization
 
28
 
29
  ## 📊🔍 Results & Takeaways from Evaluating Top Models
30
 
31
+ - GPT-4o (0513) and Claude 3.5 Sonnet (1022) lead the benchmark. Claude 3.5 Sonnet (1022) improves over Claude 3.5 Sonnet (0622) obviously in planning tasks (application dimension) and UI/Infographics inputs (input format dimension).
32
+ - Qwen2-VL stands out among open-source models, and its flagship model gets close to some proprietary flagship models
33
+ - Chain-of-Thought (CoT) prompting improves proprietary models but has limited impact on open-source models
34
+ - Gemini 1.5 Flash performs the best among all the evaluated efficiency models, but struggles with UI and document tasks
35
  - Many open-source models face challenges in adhering to output format instructions
36
 
37
  ## 🎯 Interactive Visualization
static/css/style.css CHANGED
@@ -45,3 +45,14 @@
45
  margin-top: 10px;
46
  color: var(--text-color);
47
  }
 
 
 
 
 
 
 
 
 
 
 
 
45
  margin-top: 10px;
46
  color: var(--text-color);
47
  }
48
+
49
+ .custom-dataframe td:first-child {
50
+ min-width: 220px !important; /* Adjust minimum width for model names */
51
+ white-space: nowrap !important; /* Prevent text wrapping */
52
+ }
53
+
54
+ .custom-dataframe a {
55
+ text-decoration: none;
56
+ color: #2196F3;
57
+ white-space: nowrap !important;
58
+ }
utils.py CHANGED
@@ -241,10 +241,10 @@ class DefaultDataLoader(BaseDataLoader):
241
  # Define headers with task counts
242
  column_headers = {
243
  "Models": "Models",
244
- "Overall": f"Overall ({total_tasks})",
245
- "Core w/o CoT": f"Core(w/o CoT) ({total_core_tasks})",
246
- "Core w/ CoT": f"Core(w/ CoT) ({total_core_tasks})",
247
- "Open-ended": f"Open-ended ({total_open_tasks})"
248
  }
249
 
250
  # Rename the columns in DataFrame to match headers
@@ -317,9 +317,9 @@ class SingleImageDataLoader(BaseDataLoader):
317
  # Define headers with task counts
318
  column_headers = {
319
  "Models": "Models",
320
- "Overall": f"Overall ({total_tasks})",
321
- "Core": f"Core ({total_core_tasks})",
322
- "Open-ended": f"Open-ended ({total_open_tasks})"
323
  }
324
 
325
  # Rename the columns in DataFrame to match headers
 
241
  # Define headers with task counts
242
  column_headers = {
243
  "Models": "Models",
244
+ "Overall": f"Overall({total_tasks})",
245
+ "Core w/o CoT": f"Core w/o CoT({total_core_tasks})",
246
+ "Core w/ CoT": f"Core w/ CoT({total_core_tasks})",
247
+ "Open-ended": f"Open-ended({total_open_tasks})"
248
  }
249
 
250
  # Rename the columns in DataFrame to match headers
 
317
  # Define headers with task counts
318
  column_headers = {
319
  "Models": "Models",
320
+ "Overall": f"Overall({total_tasks})",
321
+ "Core": f"Core({total_core_tasks})",
322
+ "Open-ended": f"Open-ended({total_open_tasks})"
323
  }
324
 
325
  # Rename the columns in DataFrame to match headers