luisrguerra committed on
Commit 54a8724 · verified · 1 Parent(s): 51b171d

Update index.html

Files changed (1)
  1. index.html +66 -24
index.html CHANGED
@@ -29,9 +29,15 @@
 
  <body>
  <div><canvas id="radarChart" height="750"></canvas></div>
- <div><canvas id="mmluChart" height="200"></canvas></div>
- <div><canvas id="gsm8kChart" height="200"></canvas></div>
- <div><canvas id="arenaeloChart" height="200"></canvas></div>
+ <div><canvas id="mmluChart" height="150"></canvas></div>
+ <div><canvas id="gsm8kChart" height="150"></canvas></div>
+ <div><canvas id="arenaeloChart" height="150"></canvas></div>
+ <div><canvas id="nothallucinationChart" height="150"></canvas></div>
+ <div><canvas id="truthfulqaChart" height="150"></canvas></div>
+ <div><canvas id="hellaSwagChart" height="150"></canvas></div>
+ <div><canvas id="winograndeChart" height="150"></canvas></div>
+ <div><canvas id="arcChart" height="150"></canvas></div>
+ <div><canvas id="mtbenchChart" height="150"></canvas></div>
  <p>The MMLU (Massive Multitask Language Understanding) test is a benchmark that measures language understanding and performance on 57 tasks.</p>
  <p>MT-Bench: Benchmark test with questions prepared by the Chatbot Arena team. Uses GPT-4 to evaluate responses.</p>
  <p>GSM8K is a dataset of 8.5K high quality linguistically diverse grade school math word problems created by human problem writers. A bright middle school student should be able to solve every problem.</p>
@@ -165,7 +171,7 @@
  name: 'gpt-4-0125-preview (turbo)',
  mmlu: null,
  mtbench: null,
- arenaelo:1253,
+ arenaelo:1249,
  gsm8k: null,
  winogrande: null,
  truthfulqa: null,
@@ -180,8 +186,8 @@
  name: 'gpt-4-1106-preview (turbo)',
  mmlu: null,
  mtbench: 9.32,
- arenaelo:1254,
- gsm8k: null,
+ arenaelo:1252,
+ gsm8k: 95.3,
  winogrande: 81.8,
  truthfulqa: 75.7,
  hellaswag:92.7,
@@ -210,7 +216,7 @@
  name: 'gpt-4-0314',
  mmlu: 86.4,
  mtbench: 8.96,
- arenaelo:1190,
+ arenaelo:1185,
  gsm8k: 92,
  winogrande: 87.5,
  truthfulqa: 59,
@@ -225,7 +231,7 @@
  name: 'gpt-3.5-turbo-0613',
  mmlu: null,
  mtbench: 8.39,
- arenaelo:1116,
+ arenaelo:1115,
  gsm8k: null,
  winogrande: 55.3,
  truthfulqa: 61.4,
@@ -240,7 +246,7 @@
  name: 'gpt-3.5-turbo-0301',
  mmlu: 70,
  mtbench: 7.94,
- arenaelo:1104,
+ arenaelo:1103,
  gsm8k: 57.1,
  winogrande: 81.6,
  truthfulqa: 47,
@@ -255,7 +261,7 @@
  name: 'gpt-3.5-turbo-1106',
  mmlu: null,
  mtbench: 8.32,
- arenaelo:1072,
+ arenaelo:1069,
  gsm8k: null,
  winogrande: 54,
  truthfulqa: 60.7,
@@ -274,9 +280,9 @@
  gsm8k: 95.0,
  winogrande: null,
  truthfulqa: null,
- hellaswag:null,
+ hellaswag:95.4,
  arc:96.4,
- nothallucination: null,
+ nothallucination: 92.6,
  parameters: null,
  organization: 'Anthropic',
  license: 'Proprietary',
@@ -285,13 +291,13 @@
  name: 'Claude 3 Sonnet',
  mmlu: 79.0,
  mtbench: null,
- arenaelo:1190,
+ arenaelo:1200,
  gsm8k: 92.3,
  winogrande: null,
  truthfulqa: null,
  hellaswag:null,
  arc:89.0,
- nothallucination: null,
+ nothallucination: 94,
  parameters: null,
  organization: 'Anthropic',
  license: 'Proprietary',
@@ -300,13 +306,13 @@
  name: 'Claude 3 Haiku',
  mmlu: 75.2,
  mtbench: null,
- arenaelo:null,
+ arenaelo:1177,
  gsm8k: 88.9,
  winogrande: null,
  truthfulqa: null,
  hellaswag:null,
  arc:85.9,
- nothallucination: null,
+ nothallucination: 92.4,
  parameters: null,
  organization: 'Anthropic',
  license: 'Proprietary',
@@ -315,7 +321,7 @@
  name: 'Claude 2.1',
  mmlu: null,
  mtbench: 8.18,
- arenaelo:1119,
+ arenaelo:1116,
  gsm8k: 88,
  winogrande: null,
  truthfulqa: null,
@@ -330,7 +336,7 @@
  name: 'Claude 2.0',
  mmlu: 78.5,
  mtbench: 8.06,
- arenaelo:1131,
+ arenaelo:1127,
  gsm8k: 71.2,
  winogrande: null,
  truthfulqa: 69,
@@ -345,7 +351,7 @@
  name: 'Claude 1.0',
  mmlu: 77,
  mtbench: 7.9,
- arenaelo:1149,
+ arenaelo:1146,
  gsm8k: null,
  winogrande: null,
  truthfulqa: null,
@@ -360,7 +366,7 @@
  name: 'Claude Instant 1',
  mmlu: 73.4,
  mtbench: 7.85,
- arenaelo:1109,
+ arenaelo:1105,
  gsm8k: 86.7,
  winogrande: null,
  truthfulqa: null,
@@ -401,11 +407,26 @@
  organization: 'Google',
  license: 'Proprietary',
  },
+ {
+ name: 'Gemini Pro Online',
+ mmlu: null,
+ mtbench: null,
+ arenaelo:1204,
+ gsm8k: null,
+ winogrande: null,
+ truthfulqa: null,
+ hellaswag:null,
+ arc:null,
+ nothallucination: null,
+ parameters: null,
+ organization: 'Google',
+ license: 'Proprietary',
+ },
  {
  name: 'Gemini Pro',
  mmlu: 71.8,
  mtbench: null,
- arenaelo:1114,
+ arenaelo:1111,
  gsm8k: 77.9,
  winogrande: null,
  truthfulqa: null,
@@ -416,11 +437,26 @@
  organization: 'Google',
  license: 'Proprietary',
  },
+ {
+ name: 'Mistral Large',
+ mmlu: 81.2,
+ mtbench: null,
+ arenaelo:1157,
+ gsm8k: null,
+ winogrande: 86.7,
+ truthfulqa: 50.5,
+ hellaswag:89.2,
+ arc:94.2,
+ nothallucination: null,
+ parameters: null,
+ organization: 'Mistral',
+ license: 'Proprietary',
+ },
  {
  name: 'Mistral Medium',
  mmlu: 75.3,
  mtbench: 8.61,
- arenaelo:1150,
+ arenaelo:1146,
  gsm8k: null,
  winogrande: null,
  truthfulqa: null,
@@ -435,7 +471,7 @@
  name: 'Mixtral 8x7B Instruct',
  mmlu: 70.6,
  mtbench: 8.3,
- arenaelo:1123,
+ arenaelo:1114,
  gsm8k: 58.4,
  winogrande: 81.2,
  truthfulqa: 46.7,
@@ -480,7 +516,7 @@
  name: 'Yi 34B',
  mmlu: 73.5,
  mtbench: null,
- arenaelo:1111,
+ arenaelo:1100,
  gsm8k: 50.64,
  winogrande: 83.03,
  truthfulqa: 56.23,
@@ -668,6 +704,12 @@
  updateChart('mmluChart','mmlu');
  updateChart('gsm8kChart','gsm8k');
  updateChart('arenaeloChart','arenaelo');
+ updateChart('nothallucinationChart','nothallucination');
+ updateChart('truthfulqaChart','truthfulqa');
+ updateChart('hellaSwagChart','hellaswag');
+ updateChart('winograndeChart','winogrande');
+ updateChart('arcChart','arc');
+ updateChart('mtbenchChart','mtbench');
 
  </script>
  </body>
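
Note: the updateChart helper itself is defined elsewhere in index.html and is not part of this diff. The sketch below is a minimal, hypothetical version of what such a helper could look like, assuming the page uses Chart.js (v3+) and the global models array shown in the hunks above; the filtering, sorting, and chart options here are illustrative assumptions, not the file's actual implementation.

// Hypothetical sketch only; assumes Chart.js v3+ and the global `models` array.
// Renders one horizontal bar chart per benchmark key, skipping models
// that have no score for that key.
function updateChart(canvasId, metricKey) {
  // Keep only models with a reported value for this metric, highest first.
  const scored = models
    .filter((m) => m[metricKey] !== null && m[metricKey] !== undefined)
    .sort((a, b) => b[metricKey] - a[metricKey]);

  new Chart(document.getElementById(canvasId), {
    type: 'bar',
    data: {
      labels: scored.map((m) => m.name),
      datasets: [{
        label: metricKey,
        data: scored.map((m) => m[metricKey]),
      }],
    },
    options: {
      indexAxis: 'y', // horizontal bars, one row per model
      plugins: { legend: { display: false } },
    },
  });
}

// Called once per benchmark, matching the canvas ids added in this commit, e.g.:
updateChart('nothallucinationChart', 'nothallucination');

Filtering out null scores before building each dataset keeps the per-metric charts from showing empty bars for models that have no published result on that benchmark, which matters here since many entries (e.g. the Claude 3 models) report only a subset of the metrics.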