cccjc committed
Commit ba3cd85 · 1 Parent(s): 44b6d4e

add some new model results
static/eval_results/Default/all_model_keywords_stats.json CHANGED
@@ -5146,5 +5146,239 @@
  "average_score": 0.20711160718581811
  }
  }
+ },
+ "Mammoth_VL": {
+ "skills": {
+ "Object Recognition and Classification": {
+ "count": 303,
+ "num_samples": 4755,
+ "tasks": [],
+ "average_score": 0.30194776127683565
+ },
+ "Text Recognition (OCR)": {
+ "count": 137,
+ "num_samples": 2239,
+ "tasks": [],
+ "average_score": 0.2365295791606494
+ },
+ "Language Understanding and Generation": {
+ "count": 154,
+ "num_samples": 2509,
+ "tasks": [],
+ "average_score": 0.2993927028494267
+ },
+ "Scene and Event Understanding": {
+ "count": 154,
+ "num_samples": 2467,
+ "tasks": [],
+ "average_score": 0.3366347826116991
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 109,
+ "num_samples": 1910,
+ "tasks": [],
+ "average_score": 0.2408454736444444
+ },
+ "Commonsense and Social Reasoning": {
+ "count": 51,
+ "num_samples": 855,
+ "tasks": [],
+ "average_score": 0.37895522991264047
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 15,
+ "num_samples": 245,
+ "tasks": [],
+ "average_score": 0.48003508771929826
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 77,
+ "num_samples": 1386,
+ "tasks": [],
+ "average_score": 0.27232427744946475
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 152,
+ "num_samples": 2437,
+ "tasks": [],
+ "average_score": 0.24522937191710698
+ },
+ "Planning and Decision Making": {
+ "count": 37,
+ "num_samples": 577,
+ "tasks": [],
+ "average_score": 0.11457024299726488
+ }
+ },
+ "input_format": {
+ "User Interface Screenshots": {
+ "count": 93,
+ "num_samples": 1517,
+ "tasks": [],
+ "average_score": 0.18941525254390731
+ },
+ "Text-Based Images and Documents": {
+ "count": 82,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.1718334741390191
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 101,
+ "num_samples": 1718,
+ "tasks": [],
+ "average_score": 0.28108187023954245
+ },
+ "Videos": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.3391119999611432
+ },
+ "Artistic and Creative Content": {
+ "count": 32,
+ "num_samples": 541,
+ "tasks": [],
+ "average_score": 0.36434285930327387
+ },
+ "Photographs": {
+ "count": 143,
+ "num_samples": 2248,
+ "tasks": [],
+ "average_score": 0.36915384448504296
+ },
+ "3D Models and Aerial Imagery": {
+ "count": 11,
+ "num_samples": 169,
+ "tasks": [],
+ "average_score": 0.15940750469262005
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 98,
+ "num_samples": 1514,
+ "tasks": [],
+ "average_score": 0.2456942956200745
+ },
+ "structured_output": {
+ "count": 110,
+ "num_samples": 1714,
+ "tasks": [],
+ "average_score": 0.21586513216389874
+ },
+ "exact_text": {
+ "count": 83,
+ "num_samples": 1278,
+ "tasks": [],
+ "average_score": 0.29359048024032264
+ },
+ "numerical_data": {
+ "count": 49,
+ "num_samples": 862,
+ "tasks": [],
+ "average_score": 0.2646677074112521
+ },
+ "open_ended_output": {
+ "count": 80,
+ "num_samples": 1454,
+ "tasks": [],
+ "average_score": 0.34733130661096645
+ },
+ "multiple_choice": {
+ "count": 85,
+ "num_samples": 1363,
+ "tasks": [],
+ "average_score": 0.3286125236284589
+ }
+ },
+ "input_num": {
+ "6-8 images": {
+ "count": 21,
+ "num_samples": 314,
+ "tasks": [],
+ "average_score": 0.16358654572940287
+ },
+ "9-image or more": {
+ "count": 41,
+ "num_samples": 623,
+ "tasks": [],
+ "average_score": 0.25463059203015115
+ },
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.2919119209789575
+ },
+ "video": {
+ "count": 43,
+ "num_samples": 698,
+ "tasks": [],
+ "average_score": 0.3391119999611432
+ },
+ "4-5 images": {
+ "count": 34,
+ "num_samples": 520,
+ "tasks": [],
+ "average_score": 0.20016011839130254
+ },
+ "2-3 images": {
+ "count": 51,
+ "num_samples": 802,
+ "tasks": [],
+ "average_score": 0.2679179451692527
+ }
+ },
+ "app": {
+ "Information_Extraction": {
+ "count": 72,
+ "num_samples": 1124,
+ "tasks": [],
+ "average_score": 0.23600902063965679
+ },
+ "Planning": {
+ "count": 78,
+ "num_samples": 1239,
+ "tasks": [],
+ "average_score": 0.15326915093278803
+ },
+ "Coding": {
+ "count": 31,
+ "num_samples": 474,
+ "tasks": [],
+ "average_score": 0.20668466311255687
+ },
+ "Perception": {
+ "count": 145,
+ "num_samples": 2313,
+ "tasks": [],
+ "average_score": 0.33348955971237954
+ },
+ "Metrics": {
+ "count": 20,
+ "num_samples": 309,
+ "tasks": [],
+ "average_score": 0.3759170425350556
+ },
+ "Science": {
+ "count": 29,
+ "num_samples": 574,
+ "tasks": [],
+ "average_score": 0.23894961766260706
+ },
+ "Knowledge": {
+ "count": 97,
+ "num_samples": 1605,
+ "tasks": [],
+ "average_score": 0.351703435685048
+ },
+ "Mathematics": {
+ "count": 33,
+ "num_samples": 547,
+ "tasks": [],
+ "average_score": 0.26074348700688493
+ }
+ }
  }
  }
static/eval_results/Default/all_summary.json CHANGED
@@ -504,5 +504,22 @@
  "micro_mean_score": 0.2704213241616509
  },
  "overall_score": 0.17379673035120966
+ },
+ "Mammoth_VL": {
+ "core_noncot": {
+ "num_eval_tasks": 440,
+ "num_eval_samples": 6539,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.264052880412689,
+ "micro_mean_score": 0.2626894374387823
+ },
+ "core_cot": null,
+ "open": {
+ "num_eval_tasks": 65,
+ "num_eval_samples": 1163,
+ "macro_mean_score": 0.37992668750165337,
+ "micro_mean_score": 0.40120378331900275
+ },
+ "overall_score": 0.27896733083008046
  }
  }
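Note: the new summary entries are consistent with overall_score being the task-count-weighted average of the core (non-CoT) and open-ended macro scores. A minimal Python sketch that checks this against the Mammoth_VL numbers above; the weighting rule is inferred from the data in this commit, not taken from the repository code:

# Illustrative check only; the weighted-average rule is an assumption
# inferred from the Mammoth_VL entry added above.
core = {"num_eval_tasks": 440, "macro_mean_score": 0.264052880412689}
open_ended = {"num_eval_tasks": 65, "macro_mean_score": 0.37992668750165337}
reported_overall = 0.27896733083008046

total_tasks = core["num_eval_tasks"] + open_ended["num_eval_tasks"]
weighted = (core["num_eval_tasks"] * core["macro_mean_score"]
            + open_ended["num_eval_tasks"] * open_ended["macro_mean_score"]) / total_tasks

assert abs(weighted - reported_overall) < 1e-9  # both are ~0.2789673308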
static/eval_results/SI/all_model_keywords_stats.json CHANGED
@@ -1025,7 +1025,7 @@
  "count": 82,
  "num_samples": 1321,
  "tasks": [],
- "average_score": 0.5428885848330401
+ "average_score": 0.5437015929631214
  },
  "Planning": {
  "count": 44,
@@ -1063,7 +1063,7 @@
  "count": 83,
  "num_samples": 1315,
  "tasks": [],
- "average_score": 0.5561844967140265
+ "average_score": 0.5569877095654321
  },
  "Text-Based Images and Documents": {
  "count": 53,
@@ -1083,7 +1083,7 @@
  "count": 315,
  "num_samples": 5228,
  "tasks": [],
- "average_score": 0.46322170353087255
+ "average_score": 0.46343334374251277
  }
  },
  "output_format": {
@@ -1091,7 +1091,7 @@
  "count": 63,
  "num_samples": 975,
  "tasks": [],
- "average_score": 0.381768248331173
+ "average_score": 0.38282644938937405
  },
  "exact_text": {
  "count": 57,
@@ -1147,7 +1147,7 @@
  "count": 102,
  "num_samples": 1713,
  "tasks": [],
- "average_score": 0.5451973412590135
+ "average_score": 0.5458509360302554
  },
  "Mathematical and Logical Reasoning": {
  "count": 91,
@@ -1183,7 +1183,7 @@
  "count": 101,
  "num_samples": 1687,
  "tasks": [],
- "average_score": 0.5024587375994013
+ "average_score": 0.503118803606002
  }
  }
  },
@@ -1193,49 +1193,49 @@
  "count": 16,
  "num_samples": 244,
  "tasks": [],
- "average_score": 0.47501823646125113
+ "average_score": 0.47487599206349207
  },
  "Information_Extraction": {
  "count": 41,
  "num_samples": 644,
  "tasks": [],
- "average_score": 0.4328505884518674
+ "average_score": 0.45245079667466714
  },
  "Knowledge": {
  "count": 77,
  "num_samples": 1294,
  "tasks": [],
- "average_score": 0.5102257466534984
+ "average_score": 0.5086518140501541
  },
  "Mathematics": {
  "count": 30,
  "num_samples": 497,
  "tasks": [],
- "average_score": 0.33330909636235384
+ "average_score": 0.3853815223607656
  },
  "Metrics": {
  "count": 3,
  "num_samples": 45,
  "tasks": [],
- "average_score": 0.5095238095238095
+ "average_score": 0.4380952380952381
  },
  "Perception": {
  "count": 82,
  "num_samples": 1321,
  "tasks": [],
- "average_score": 0.5507427313044685
+ "average_score": 0.5468998820129136
  },
  "Planning": {
  "count": 44,
  "num_samples": 714,
  "tasks": [],
- "average_score": 0.19508720733284174
+ "average_score": 0.21148887498941377
  },
  "Science": {
  "count": 22,
  "num_samples": 469,
  "tasks": [],
- "average_score": 0.4351415236240936
+ "average_score": 0.48499051643275837
  }
  },
  "input_format": {
@@ -1243,37 +1243,37 @@
  "count": 2,
  "num_samples": 30,
  "tasks": [],
- "average_score": 0.3555116262572404
+ "average_score": 0.3348446026637953
  },
  "Artistic and Creative Content": {
  "count": 22,
  "num_samples": 389,
  "tasks": [],
- "average_score": 0.5404112582997231
+ "average_score": 0.5535202379362348
  },
  "Diagrams and Data Visualizations": {
  "count": 88,
  "num_samples": 1524,
  "tasks": [],
- "average_score": 0.42366990116355135
+ "average_score": 0.46724590271207767
  },
  "Photographs": {
  "count": 83,
  "num_samples": 1315,
  "tasks": [],
- "average_score": 0.5726107634234434
+ "average_score": 0.5613400178213946
  },
  "Text-Based Images and Documents": {
  "count": 53,
  "num_samples": 847,
  "tasks": [],
- "average_score": 0.33154206029123856
+ "average_score": 0.33052002642818507
  },
  "User Interface Screenshots": {
  "count": 67,
  "num_samples": 1123,
  "tasks": [],
- "average_score": 0.3656537691630919
+ "average_score": 0.3722082840493195
  }
  },
  "input_num": {
@@ -1281,7 +1281,7 @@
  "count": 315,
  "num_samples": 5228,
  "tasks": [],
- "average_score": 0.4427944359714585
+ "average_score": 0.45400479933257654
  }
  },
  "output_format": {
@@ -1289,37 +1289,37 @@
  "count": 63,
  "num_samples": 975,
  "tasks": [],
- "average_score": 0.3680682749954099
+ "average_score": 0.3691249729531883
  },
  "exact_text": {
  "count": 57,
  "num_samples": 880,
  "tasks": [],
- "average_score": 0.3994332512947306
+ "average_score": 0.42013434507914493
  },
  "multiple_choice": {
  "count": 33,
  "num_samples": 567,
  "tasks": [],
- "average_score": 0.5646552101097555
+ "average_score": 0.5905636451090996
  },
  "numerical_data": {
  "count": 39,
  "num_samples": 694,
  "tasks": [],
- "average_score": 0.377682596312313
+ "average_score": 0.43247267273235235
  },
  "open_ended_output": {
  "count": 51,
  "num_samples": 991,
  "tasks": [],
- "average_score": 0.5536141293443697
+ "average_score": 0.5470781816319514
  },
  "structured_output": {
  "count": 72,
  "num_samples": 1121,
  "tasks": [],
- "average_score": 0.4434262068907506
+ "average_score": 0.43823554216399857
  }
  },
  "skills": {
@@ -1327,61 +1327,61 @@
  "count": 38,
  "num_samples": 654,
  "tasks": [],
- "average_score": 0.5949207694245245
+ "average_score": 0.5955368143490581
  },
  "Domain-Specific Knowledge and Skills": {
  "count": 46,
  "num_samples": 897,
  "tasks": [],
- "average_score": 0.4385603970138852
+ "average_score": 0.4655431430975485
  },
  "Ethical and Safety Reasoning": {
  "count": 10,
  "num_samples": 170,
  "tasks": [],
- "average_score": 0.7734661654135339
+ "average_score": 0.7948947368421052
  },
  "Language Understanding and Generation": {
  "count": 102,
  "num_samples": 1713,
  "tasks": [],
- "average_score": 0.5163987806731475
+ "average_score": 0.5122400421391089
  },
  "Mathematical and Logical Reasoning": {
  "count": 91,
  "num_samples": 1630,
  "tasks": [],
- "average_score": 0.35922563291424964
+ "average_score": 0.4086167264646781
  },
  "Object Recognition and Classification": {
  "count": 172,
  "num_samples": 2714,
  "tasks": [],
- "average_score": 0.46460120838976576
+ "average_score": 0.47630441828533016
  },
  "Planning and Decision Making": {
  "count": 23,
  "num_samples": 356,
  "tasks": [],
- "average_score": 0.12518962860872068
+ "average_score": 0.09741974015331743
  },
  "Scene and Event Understanding": {
  "count": 60,
  "num_samples": 1004,
  "tasks": [],
- "average_score": 0.5961096083948861
+ "average_score": 0.5920539115535787
  },
  "Spatial and Temporal Reasoning": {
  "count": 78,
  "num_samples": 1273,
  "tasks": [],
- "average_score": 0.3571876703463106
+ "average_score": 0.3559690476405975
  },
  "Text Recognition (OCR)": {
  "count": 101,
  "num_samples": 1687,
  "tasks": [],
- "average_score": 0.4329456546880451
+ "average_score": 0.4474763430506795
  }
  }
  },
@@ -3365,6 +3365,204 @@
  }
  }
  },
+ "POINTS_15_7B": {
+ "app": {
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.31641062675070025
+ },
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.3095789895735217
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.35705988992418164
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.24128406446063128
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.48095238095238096
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.4420532221275683
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.1277481304284383
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.32551503611448934
+ }
+ },
+ "input_format": {
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.15572486552610215
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.37330010041194067
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.30991539183635347
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.4276343385855984
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.24722440389191766
+ },
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.27713077639707523
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.32686003793394974
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.3101162129247054
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.2614010338203017
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.4855568673750491
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.28761899055063767
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.37619796536407
+ },
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.3069044183161335
+ }
+ },
+ "skills": {
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.45980379926019677
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.30711751050032277
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.6173496240601504
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.35317851821169477
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.28961632718794406
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.3333459246264911
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.08369131166291023
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.43105364189963935
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.26796963300870397
+ },
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.3443899066327916
+ }
+ }
+ },
  "POINTS_7B": {
  "app": {
  "Coding": {
@@ -4553,6 +4751,204 @@
  }
  }
  },
+ "SmolVLM": {
+ "app": {
+ "Coding": {
+ "count": 16,
+ "num_samples": 244,
+ "tasks": [],
+ "average_score": 0.05390625
+ },
+ "Information_Extraction": {
+ "count": 41,
+ "num_samples": 644,
+ "tasks": [],
+ "average_score": 0.03906165844850793
+ },
+ "Knowledge": {
+ "count": 77,
+ "num_samples": 1294,
+ "tasks": [],
+ "average_score": 0.09639506190200878
+ },
+ "Mathematics": {
+ "count": 30,
+ "num_samples": 497,
+ "tasks": [],
+ "average_score": 0.06728619034079576
+ },
+ "Metrics": {
+ "count": 3,
+ "num_samples": 45,
+ "tasks": [],
+ "average_score": 0.2222222222222222
+ },
+ "Perception": {
+ "count": 82,
+ "num_samples": 1321,
+ "tasks": [],
+ "average_score": 0.1606753925138995
+ },
+ "Planning": {
+ "count": 44,
+ "num_samples": 714,
+ "tasks": [],
+ "average_score": 0.03272316763696074
+ },
+ "Science": {
+ "count": 22,
+ "num_samples": 469,
+ "tasks": [],
+ "average_score": 0.13950042461525716
+ }
+ },
+ "input_format": {
+ "3D Models and Aerial Imagery": {
+ "count": 2,
+ "num_samples": 30,
+ "tasks": [],
+ "average_score": 0.10013149786398344
+ },
+ "Artistic and Creative Content": {
+ "count": 22,
+ "num_samples": 389,
+ "tasks": [],
+ "average_score": 0.143657576543239
+ },
+ "Diagrams and Data Visualizations": {
+ "count": 88,
+ "num_samples": 1524,
+ "tasks": [],
+ "average_score": 0.0979843882877799
+ },
+ "Photographs": {
+ "count": 83,
+ "num_samples": 1315,
+ "tasks": [],
+ "average_score": 0.1383108182448921
+ },
+ "Text-Based Images and Documents": {
+ "count": 53,
+ "num_samples": 847,
+ "tasks": [],
+ "average_score": 0.09044016512537822
+ },
+ "User Interface Screenshots": {
+ "count": 67,
+ "num_samples": 1123,
+ "tasks": [],
+ "average_score": 0.029842216842698305
+ }
+ },
+ "input_num": {
+ "1-image": {
+ "count": 315,
+ "num_samples": 5228,
+ "tasks": [],
+ "average_score": 0.09605051124900241
+ }
+ },
+ "output_format": {
+ "contextual_formatted_text": {
+ "count": 63,
+ "num_samples": 975,
+ "tasks": [],
+ "average_score": 0.12682789970863723
+ },
+ "exact_text": {
+ "count": 57,
+ "num_samples": 880,
+ "tasks": [],
+ "average_score": 0.05128016118728194
+ },
+ "multiple_choice": {
+ "count": 33,
+ "num_samples": 567,
+ "tasks": [],
+ "average_score": 0.10496742314924135
+ },
+ "numerical_data": {
+ "count": 39,
+ "num_samples": 694,
+ "tasks": [],
+ "average_score": 0.09999979828107199
+ },
+ "open_ended_output": {
+ "count": 51,
+ "num_samples": 991,
+ "tasks": [],
+ "average_score": 0.21315705831839693
+ },
+ "structured_output": {
+ "count": 72,
+ "num_samples": 1121,
+ "tasks": [],
+ "average_score": 0.015386904208215372
+ }
+ },
+ "skills": {
+ "Commonsense and Social Reasoning": {
+ "count": 38,
+ "num_samples": 654,
+ "tasks": [],
+ "average_score": 0.1293055688222371
+ },
+ "Domain-Specific Knowledge and Skills": {
+ "count": 46,
+ "num_samples": 897,
+ "tasks": [],
+ "average_score": 0.077851045512787
+ },
+ "Ethical and Safety Reasoning": {
+ "count": 10,
+ "num_samples": 170,
+ "tasks": [],
+ "average_score": 0.2222067669172932
+ },
+ "Language Understanding and Generation": {
+ "count": 102,
+ "num_samples": 1713,
+ "tasks": [],
+ "average_score": 0.12889143083611815
+ },
+ "Mathematical and Logical Reasoning": {
+ "count": 91,
+ "num_samples": 1630,
+ "tasks": [],
+ "average_score": 0.0865768026006882
+ },
+ "Object Recognition and Classification": {
+ "count": 172,
+ "num_samples": 2714,
+ "tasks": [],
+ "average_score": 0.10501451629704919
+ },
+ "Planning and Decision Making": {
+ "count": 23,
+ "num_samples": 356,
+ "tasks": [],
+ "average_score": 0.008178053830227744
+ },
+ "Scene and Event Understanding": {
+ "count": 60,
+ "num_samples": 1004,
+ "tasks": [],
+ "average_score": 0.12403047579230878
+ },
+ "Spatial and Temporal Reasoning": {
+ "count": 78,
+ "num_samples": 1273,
+ "tasks": [],
+ "average_score": 0.061765081348496016
+ },
+ "Text Recognition (OCR)": {
+ "count": 101,
+ "num_samples": 1687,
+ "tasks": [],
+ "average_score": 0.08610257462374318
+ }
+ }
+ },
  "llava_onevision_72B": {
  "app": {
  "Coding": {
static/eval_results/SI/all_summary.json CHANGED
@@ -93,8 +93,8 @@
  "num_eval_tasks": 273,
  "num_eval_samples": 4116,
  "num_not_eval_samples": 0,
- "macro_mean_score": 0.44285970964797233,
- "micro_mean_score": 0.43756073858114675
+ "macro_mean_score": 0.4431039098921726,
+ "micro_mean_score": 0.43780369290573373
  },
  "open": {
  "num_eval_tasks": 42,
@@ -103,15 +103,15 @@
  "macro_mean_score": 0.595574663769726,
  "micro_mean_score": 0.6334563345633456
  },
- "overall_score": 0.46322170353087283
+ "overall_score": 0.46343334374251305
  },
  "Gemini_1.5_flash_002": {
  "core": {
  "num_eval_tasks": 273,
  "num_eval_samples": 4116,
  "num_not_eval_samples": 0,
- "macro_mean_score": 0.42188460865574384,
- "micro_mean_score": 0.413508260447036
+ "macro_mean_score": 0.43481964330318734,
+ "micro_mean_score": 0.4297862001943635
  },
  "open": {
  "num_eval_tasks": 42,
@@ -120,7 +120,7 @@
  "macro_mean_score": 0.5787083135236054,
  "micro_mean_score": 0.6186961869618696
  },
- "overall_score": 0.44279443597145873
+ "overall_score": 0.4540047993325765
  },
  "Gemini_1.5_pro_002": {
  "core": {
@@ -261,8 +261,8 @@
  "macro_mean_score": 0.36480000609384927,
  "micro_mean_score": 0.36205779758110807,
  "missing_tasks": [
- "MMSoc_Misinformation_PolitiFact",
  "table_understanding",
+ "MMSoc_Misinformation_PolitiFact",
  "planning_screenshot_termes"
  ]
  },
@@ -316,6 +316,25 @@
  },
  "overall_score": 0.34550356262982296
  },
+ "POINTS_15_7B": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.31355970638319003,
+ "micro_mean_score": 0.30728203432446294,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.41331219301389166,
+ "micro_mean_score": 0.42749077490774917,
+ "missing_tasks": []
+ },
+ "overall_score": 0.32686003793395024
+ },
  "POINTS_7B": {
  "core": {
  "num_eval_tasks": 273,
@@ -430,6 +449,25 @@
  },
  "overall_score": 0.3669159632302898
  },
+ "SmolVLM": {
+ "core": {
+ "num_eval_tasks": 273,
+ "num_eval_samples": 4116,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.07348385181460795,
+ "micro_mean_score": 0.0732694668402814,
+ "missing_tasks": []
+ },
+ "open": {
+ "num_eval_tasks": 42,
+ "num_eval_samples": 813,
+ "num_not_eval_samples": 0,
+ "macro_mean_score": 0.2427337975725658,
+ "micro_mean_score": 0.2504920049200492,
+ "missing_tasks": []
+ },
+ "overall_score": 0.09605051124900234
+ },
  "llava_onevision_72B": {
  "core": {
  "num_eval_tasks": 273,
utils.py CHANGED
@@ -29,6 +29,9 @@ MODEL_NAME_MAP = {
     "InternVL2_2B": "InternVL2-2B",
     "Molmo_7B_D": "Molmo-7B-D-0924",
     "Molmo_72B": "Molmo-72B-0924",
+    "Mammoth_VL": "Mammoth-VL-8B",
+    "SmolVLM": "SmolVLM-1.7B",
+    "POINTS_15_7B": "POINTS-1.5-8B",
 }
 
 DIMENSION_NAME_MAP = {
@@ -108,7 +111,11 @@ MODEL_URLS = {
     "Aquila_VL_2B": "https://huggingface.co/BAAI/Aquila-VL-2B-llava-qwen",
     "POINTS_7B": "https://huggingface.co/WePOINTS/POINTS-Qwen-2-5-7B-Chat",
     "Qwen2_VL_2B": "https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct",
-    "InternVL2_2B": "https://huggingface.co/OpenGVLab/InternVL2-2B"
+    "InternVL2_2B": "https://huggingface.co/OpenGVLab/InternVL2-2B",
+    "POINTS_7B": "https://huggingface.co/WePOINTS/POINTS-Qwen-2-5-7B-Chat",
+    "POINTS_15_7B": "https://huggingface.co/WePOINTS/POINTS-1-5-Qwen-2-5-7B-Chat",
+    "SmolVLM": "https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct",
+    "Mammoth_VL": "https://huggingface.co/MAmmoTH-VL/MAmmoTH-VL-8B",
 }
 
 class BaseDataLoader:
@@ -208,14 +215,20 @@ class DefaultDataLoader(BaseDataLoader):
         for model in self.MODEL_GROUPS[selected_model_group]:
             model_data = self.MODEL_DATA[model]
             summary = self.SUMMARY_DATA[model]
-            core_noncot_score = summary["core_noncot"]["macro_mean_score"]
-            core_cot_score = summary["core_cot"]["macro_mean_score"]
+            if summary["core_noncot"]:
+                core_noncot_score = summary["core_noncot"]["macro_mean_score"]
+            else:
+                core_noncot_score = '-'
+            if summary["core_cot"]:
+                core_cot_score = summary["core_cot"]["macro_mean_score"]
+            else:
+                core_cot_score = '-'
             row = {
                 "Models": get_display_model_name(model, as_link=True),
                 "Overall": round(summary["overall_score"] * 100, 2),
-                "Core w/o CoT": round(core_noncot_score * 100, 2),
-                "Core w/ CoT": round(core_cot_score * 100, 2),
-                "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
+                "Core w/o CoT": round(core_noncot_score * 100, 2) if core_noncot_score != '-' else '-',
+                "Core w/ CoT": round(core_cot_score * 100, 2) if core_cot_score != '-' else '-',
+                "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2) if summary["open"] else '-'
            }
             for display_name in self.SUPER_GROUPS[selected_super_group]:
                 original_keyword = self.keyword_display_map[display_name]
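For context, the utils.py change above guards against summary sections that are null (Mammoth_VL, for example, has "core_cot": null in the Default all_summary.json added by this commit). A minimal standalone sketch of the same behaviour; format_row is an illustrative helper, not a function defined in utils.py:

# Sketch of the null-safe score formatting, applied to the Mammoth_VL summary
# from static/eval_results/Default/all_summary.json. Hypothetical helper only.
def format_row(summary: dict) -> dict:
    def pct(section):
        # Render a macro score as a percentage, or '-' when the section is null.
        return round(section["macro_mean_score"] * 100, 2) if section else '-'
    return {
        "Overall": round(summary["overall_score"] * 100, 2),
        "Core w/o CoT": pct(summary["core_noncot"]),
        "Core w/ CoT": pct(summary["core_cot"]),
        "Open-ended": pct(summary["open"]),
    }

mammoth_vl_summary = {
    "core_noncot": {"macro_mean_score": 0.264052880412689},
    "core_cot": None,
    "open": {"macro_mean_score": 0.37992668750165337},
    "overall_score": 0.27896733083008046,
}
print(format_row(mammoth_vl_summary))
# {'Overall': 27.9, 'Core w/o CoT': 26.41, 'Core w/ CoT': '-', 'Open-ended': 37.99}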