gardari committed on
Commit
83443db
·
verified ·
1 Parent(s): 3835c7d

Update score.py

Browse files
Files changed (1) hide show
  1. score.py +24 -12
score.py CHANGED
@@ -6,52 +6,64 @@ import re
6
  # Model scores
7
  BENCHMARK_SCORES = {
8
  "icelandic-winogrande": {
9
- "Claude 3.5 Sonnet": 90.4,
 
10
  "GPT-4o": 85.4,
11
- "GPT-4-turbo": 85.8,
12
  "Hermes 3 Llama 3.1 405B fp8": 70.6,
13
  "Claude 2.1": 55.1,
14
  "GPT-3.5-turbo": 52.0,
 
15
  },
16
  "grammatical-error-detection": {
17
- "Claude 3.5 Sonnet": 70.0,
 
18
  "GPT-4o": 68.0,
19
- "GPT-4-turbo": 60.5,
20
  "Hermes 3 Llama 3.1 405B fp8": 53.5,
21
  "Claude 2.1": 52.5,
22
  "GPT-3.5-turbo": 52.0,
 
23
  },
24
  "icelandic-inflection-all": {
25
- "Claude 3.5 Sonnet": 89.2,
 
26
  "GPT-4o": 87.8,
27
- "GPT-4-turbo": 76.6,
28
  "Hermes 3 Llama 3.1 405B fp8": 61.8,
29
  "Claude 2.1": 55.2,
30
  "GPT-3.5-turbo": 39.1,
 
31
  },
32
  "icelandic-belebele": {
33
- "Claude 3.5 Sonnet": 92.0,
 
34
  "GPT-4o": 90.4,
35
- "GPT-4-turbo": 89.3,
36
  "Hermes 3 Llama 3.1 405B fp8": 86.1,
37
  "Claude 2.1": 42.1,
38
  "GPT-3.5-turbo": 59.2,
 
39
  },
40
  "icelandic-arc-challenge": {
41
- "Claude 3.5 Sonnet": 89.6,
 
42
  "GPT-4o": 90.4,
43
- "GPT-4-turbo": 88.7,
44
  "Hermes 3 Llama 3.1 405B fp8": 72.0,
45
  "Claude 2.1": 59.9,
46
  "GPT-3.5-turbo": 49.5,
 
47
  },
48
  "icelandic-wiki-qa": {
49
- "Claude 3.5 Sonnet": 44.7,
 
50
  "GPT-4o": 38.0,
51
- "GPT-4-turbo": 31.0,
52
  "Hermes 3 Llama 3.1 405B fp8": 33.8,
53
  "Claude 2.1": 21.1,
54
  "GPT-3.5-turbo": 15.0,
 
55
  },
56
  }
57
 
 
6
  # Model scores
7
  BENCHMARK_SCORES = {
8
  "icelandic-winogrande": {
9
+ "o1-preview": 92.8,
10
+ "Claude 3.5 Sonnet": 91.3,
11
  "GPT-4o": 85.4,
12
+ # "GPT-4-turbo": 85.8,
13
  "Hermes 3 Llama 3.1 405B fp8": 70.6,
14
  "Claude 2.1": 55.1,
15
  "GPT-3.5-turbo": 52.0,
16
+ "Deepseek V3": 75.9,
17
  },
18
  "grammatical-error-detection": {
19
+ "o1-preview": 74.5,
20
+ "Claude 3.5 Sonnet": 72.5,
21
  "GPT-4o": 68.0,
22
+ # "GPT-4-turbo": 60.5,
23
  "Hermes 3 Llama 3.1 405B fp8": 53.5,
24
  "Claude 2.1": 52.5,
25
  "GPT-3.5-turbo": 52.0,
26
+ "Deepseek V3": 57.0,
27
  },
28
  "icelandic-inflection-all": {
29
+ "o1-preview": 84.4,
30
+ "Claude 3.5 Sonnet": 88.8,
31
  "GPT-4o": 87.8,
32
+ # "GPT-4-turbo": 76.6,
33
  "Hermes 3 Llama 3.1 405B fp8": 61.8,
34
  "Claude 2.1": 55.2,
35
  "GPT-3.5-turbo": 39.1,
36
+ "Deepseek V3": 77.3,
37
  },
38
  "icelandic-belebele": {
39
+ "o1-preview": 92.2,
40
+ "Claude 3.5 Sonnet": 92.2,
41
  "GPT-4o": 90.4,
42
+ # "GPT-4-turbo": 89.3,
43
  "Hermes 3 Llama 3.1 405B fp8": 86.1,
44
  "Claude 2.1": 42.1,
45
  "GPT-3.5-turbo": 59.2,
46
+ "Deepseek V3": 87.9,
47
  },
48
  "icelandic-arc-challenge": {
49
+ "o1-preview": 93.4,
50
+ "Claude 3.5 Sonnet": 91.3,
51
  "GPT-4o": 90.4,
52
+ # "GPT-4-turbo": 88.7,
53
  "Hermes 3 Llama 3.1 405B fp8": 72.0,
54
  "Claude 2.1": 59.9,
55
  "GPT-3.5-turbo": 49.5,
56
+ "Deepseek V3": 79.7,
57
  },
58
  "icelandic-wiki-qa": {
59
+ "o1-preview": 44.5,
60
+ "Claude 3.5 Sonnet": 45.2,
61
  "GPT-4o": 38.0,
62
+ # "GPT-4-turbo": 31.0,
63
  "Hermes 3 Llama 3.1 405B fp8": 33.8,
64
  "Claude 2.1": 21.1,
65
  "GPT-3.5-turbo": 15.0,
66
+ "Deepseek V3": 27.2,
67
  },
68
  }
69