Update score.py
Browse files
score.py
CHANGED
@@ -6,52 +6,64 @@ import re
|
|
6 |
# Model scores
|
7 |
BENCHMARK_SCORES = {
|
8 |
"icelandic-winogrande": {
|
9 |
-
"
|
|
|
10 |
"GPT-4o": 85.4,
|
11 |
-
"GPT-4-turbo": 85.8,
|
12 |
"Hermes 3 Llama 3.1 405B fp8": 70.6,
|
13 |
"Claude 2.1": 55.1,
|
14 |
"GPT-3.5-turbo": 52.0,
|
|
|
15 |
},
|
16 |
"grammatical-error-detection": {
|
17 |
-
"
|
|
|
18 |
"GPT-4o": 68.0,
|
19 |
-
"GPT-4-turbo": 60.5,
|
20 |
"Hermes 3 Llama 3.1 405B fp8": 53.5,
|
21 |
"Claude 2.1": 52.5,
|
22 |
"GPT-3.5-turbo": 52.0,
|
|
|
23 |
},
|
24 |
"icelandic-inflection-all": {
|
25 |
-
"
|
|
|
26 |
"GPT-4o": 87.8,
|
27 |
-
"GPT-4-turbo": 76.6,
|
28 |
"Hermes 3 Llama 3.1 405B fp8": 61.8,
|
29 |
"Claude 2.1": 55.2,
|
30 |
"GPT-3.5-turbo": 39.1,
|
|
|
31 |
},
|
32 |
"icelandic-belebele": {
|
33 |
-
"
|
|
|
34 |
"GPT-4o": 90.4,
|
35 |
-
"GPT-4-turbo": 89.3,
|
36 |
"Hermes 3 Llama 3.1 405B fp8": 86.1,
|
37 |
"Claude 2.1": 42.1,
|
38 |
"GPT-3.5-turbo": 59.2,
|
|
|
39 |
},
|
40 |
"icelandic-arc-challenge": {
|
41 |
-
"
|
|
|
42 |
"GPT-4o": 90.4,
|
43 |
-
"GPT-4-turbo": 88.7,
|
44 |
"Hermes 3 Llama 3.1 405B fp8": 72.0,
|
45 |
"Claude 2.1": 59.9,
|
46 |
"GPT-3.5-turbo": 49.5,
|
|
|
47 |
},
|
48 |
"icelandic-wiki-qa": {
|
49 |
-
"
|
|
|
50 |
"GPT-4o": 38.0,
|
51 |
-
"GPT-4-turbo": 31.0,
|
52 |
"Hermes 3 Llama 3.1 405B fp8": 33.8,
|
53 |
"Claude 2.1": 21.1,
|
54 |
"GPT-3.5-turbo": 15.0,
|
|
|
55 |
},
|
56 |
}
|
57 |
|
|
|
6 |
# Model scores
|
7 |
BENCHMARK_SCORES = {
|
8 |
"icelandic-winogrande": {
|
9 |
+
"o1-preview": 92.8,
|
10 |
+
"Claude 3.5 Sonnet": 91.3,
|
11 |
"GPT-4o": 85.4,
|
12 |
+
# "GPT-4-turbo": 85.8,
|
13 |
"Hermes 3 Llama 3.1 405B fp8": 70.6,
|
14 |
"Claude 2.1": 55.1,
|
15 |
"GPT-3.5-turbo": 52.0,
|
16 |
+
"Deepseek V3": 75.9,
|
17 |
},
|
18 |
"grammatical-error-detection": {
|
19 |
+
"o1-preview": 74.5,
|
20 |
+
"Claude 3.5 Sonnet": 72.5,
|
21 |
"GPT-4o": 68.0,
|
22 |
+
# "GPT-4-turbo": 60.5,
|
23 |
"Hermes 3 Llama 3.1 405B fp8": 53.5,
|
24 |
"Claude 2.1": 52.5,
|
25 |
"GPT-3.5-turbo": 52.0,
|
26 |
+
"Deepseek V3": 57.0,
|
27 |
},
|
28 |
"icelandic-inflection-all": {
|
29 |
+
"o1-preview": 84.4,
|
30 |
+
"Claude 3.5 Sonnet": 88.8,
|
31 |
"GPT-4o": 87.8,
|
32 |
+
# "GPT-4-turbo": 76.6,
|
33 |
"Hermes 3 Llama 3.1 405B fp8": 61.8,
|
34 |
"Claude 2.1": 55.2,
|
35 |
"GPT-3.5-turbo": 39.1,
|
36 |
+
"Deepseek V3": 77.3,
|
37 |
},
|
38 |
"icelandic-belebele": {
|
39 |
+
"o1-preview": 92.2,
|
40 |
+
"Claude 3.5 Sonnet": 92.2,
|
41 |
"GPT-4o": 90.4,
|
42 |
+
# "GPT-4-turbo": 89.3,
|
43 |
"Hermes 3 Llama 3.1 405B fp8": 86.1,
|
44 |
"Claude 2.1": 42.1,
|
45 |
"GPT-3.5-turbo": 59.2,
|
46 |
+
"Deepseek V3": 87.9,
|
47 |
},
|
48 |
"icelandic-arc-challenge": {
|
49 |
+
"o1-preview": 93.4,
|
50 |
+
"Claude 3.5 Sonnet": 91.3,
|
51 |
"GPT-4o": 90.4,
|
52 |
+
# "GPT-4-turbo": 88.7,
|
53 |
"Hermes 3 Llama 3.1 405B fp8": 72.0,
|
54 |
"Claude 2.1": 59.9,
|
55 |
"GPT-3.5-turbo": 49.5,
|
56 |
+
"Deepseek V3": 79.7,
|
57 |
},
|
58 |
"icelandic-wiki-qa": {
|
59 |
+
"o1-preview": 44.5,
|
60 |
+
"Claude 3.5 Sonnet": 45.2,
|
61 |
"GPT-4o": 38.0,
|
62 |
+
# "GPT-4-turbo": 31.0,
|
63 |
"Hermes 3 Llama 3.1 405B fp8": 33.8,
|
64 |
"Claude 2.1": 21.1,
|
65 |
"GPT-3.5-turbo": 15.0,
|
66 |
+
"Deepseek V3": 27.2,
|
67 |
},
|
68 |
}
|
69 |
|