File size: 3,496 Bytes
04e4741
 
 
 
1656d75
04e4741
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from datasets import load_dataset


# Constant values

# Names of the JSON files for the two leaderboards
# (dialect confusion and multi-dialects).
MULTI_DIALECTS_LEADERBOARD_FILE = "darija_leaderboard_multi_dialects.json"
DIALECT_CONFUSION_LEADERBOARD_FILE = "darija_leaderboard_dialect_confusion.json"

# Repository-style identifiers. DATA_PATH is the value handed to
# load_dataset() further down in this module; LEADERBOARD_PATH is presumably
# where the leaderboard files are stored (verify against callers).
DATA_PATH = "atlasia/Arabic-LID-Leaderboard"
LEADERBOARD_PATH = "atlasia/Open-Arabic-Dialect-Identification-Leaderboard"

# Names of the classification metrics, in the order they are listed here.
# The final entry, "n_test_samples", is a sample count rather than a score.
metrics = [
    "f1_score",
    "precision",
    "recall",
    "false_positive_rate",
    "false_negative_rate",
    "weighted_f1_score",
    "macro_f1_score",
    "micro_f1_score",
    "balanced_accuracy",
    "matthews_correlation",
    "specificity",
    "negative_predictive_value",
    "n_test_samples",
]

# Maps a language code (keys look like "<iso code>_Arab") to the display label
# used for it. Labels are mostly country or region names, but not always
# (e.g. "MSA", "Levantine"). Several codes share one label.
language_mapping_dict = {
    "ace_Arab": "Acehnese",
    "acm_Arab": "Mesopotamia",  # Gilit Mesopotamian
    "aeb_Arab": "Tunisia",
    "ajp_Arab": "Levantine",  # South Levantine
    "apc_Arab": "Levantine",
    "arb_Arab": "MSA",
    "arq_Arab": "Algeria",
    "ars_Arab": "Saudi",  # Najdi, primarily Saudi Arabian
    "ary_Arab": "Morocco",
    "arz_Arab": "Egypt",
    "ayp_Arab": "Mesopotamia",  # North Mesopotamian
    "azb_Arab": "Azerbaijan",  # South Azerbaijani
    "bcc_Arab": "Balochistan",  # Southern Balochi
    "bjn_Arab": "Indonesia",  # Banjar
    "brh_Arab": "Pakistan",  # Brahui
    "ckb_Arab": "Kurdistan",  # Central Kurdish, mainly in Iraq
    "fuv_Arab": "Nigeria",  # Hausa States Fulfulde
    "glk_Arab": "Iran",  # Gilaki
    "hac_Arab": "Iran",  # Gurani
    "kas_Arab": "Kashmir",
    "knc_Arab": "Nigeria",  # Central Kanuri
    "lki_Arab": "Iran",  # Laki
    "lrc_Arab": "Iran",  # Northern Luri
    "min_Arab": "Indonesia",  # Minangkabau
    "mzn_Arab": "Iran",  # Mazanderani
    "ota_Arab": "Turkey",  # Ottoman Turkish
    "pbt_Arab": "Afghanistan",  # Southern Pashto
    "pnb_Arab": "Pakistan",  # Western Panjabi
    "sdh_Arab": "Iraq",  # Southern Kurdish
    "shu_Arab": "Chad",  # Chadian Arabic
    "skr_Arab": "Pakistan",  # Saraiki
    "snd_Arab": "Pakistan",  # Sindhi
    "sus_Arab": "Guinea",  # Susu
    "tuk_Arab": "Turkmenistan",  # Turkmen
    "uig_Arab": "Uighur (China)",  # Uighur
    "urd_Arab": "Pakistan",  # Urdu
    "uzs_Arab": "Uzbekistan",  # Southern Uzbek
    "zsm_Arab": "Malaysia",  # Standard Malay
}

# Default values
is_binary = False
target_label = "Morocco"

# Metrics displayed by default in the multilingual leaderboard
default_metrics = ["f1_score", "false_positive_rate"]

# Languages displayed by default in the one-vs-all leaderboard.
# Only "MSA" is enabled; "Egypt", "Algeria", "Tunisia" and "Levantine" are
# currently left out.
default_languages = ["MSA"]

# Load the evaluation dataset (its "test" split); this runs at import time.
eval_dataset = load_dataset(DATA_PATH, split="test")

# Supported dialects: the distinct values returned by unique("dialect").
all_target_languages = [*eval_dataset.unique("dialect")]

# The same entries followed by the extra 'All' option (a new list).
supported_dialects = [*all_target_languages, "All"]

# Everything except 'All'. This name is bound to the same list object as
# all_target_languages (it is not a copy).
languages_to_display_one_vs_all = all_target_languages