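# NOTE(review): the six lines below appear to be web file-viewer residue, not
# program text; they are kept verbatim as comments so the module still parses.
# BounharAbdelaziz's picture
# saving through HfApi
# 1656d75
# raw
# history blame
# 3.5 kB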
from datasets import load_dataset
# ---------------------------------------------------------------------------
# Constant values
# ---------------------------------------------------------------------------

# "<namespace>/<name>"-style identifiers. DATA_PATH is the one handed to
# `load_dataset` further down in this module; LEADERBOARD_PATH is presumably
# the repository that hosts the leaderboard itself -- TODO confirm with callers.
DATA_PATH: str = "atlasia/Arabic-LID-Leaderboard"
LEADERBOARD_PATH: str = "atlasia/Open-Arabic-Dialect-Identification-Leaderboard"

# JSON file names, one per leaderboard flavour (multi-dialect / dialect-confusion).
MULTI_DIALECTS_LEADERBOARD_FILE: str = "darija_leaderboard_multi_dialects.json"
DIALECT_CONFUSION_LEADERBOARD_FILE: str = "darija_leaderboard_dialect_confusion.json"
# Names of the classification metrics known to this module.
# The order of this list is significant to anything that iterates over it,
# so new entries should be appended rather than inserted.
metrics = [
    # core scores
    "f1_score",
    "precision",
    "recall",
    # error rates
    "false_positive_rate",
    "false_negative_rate",
    # averaged F1 variants
    "weighted_f1_score",
    "macro_f1_score",
    "micro_f1_score",
    # remaining scores
    "balanced_accuracy",
    "matthews_correlation",
    "specificity",
    "negative_predictive_value",
    # not a score: judging by its name, the number of test samples
    "n_test_samples",
]
# Maps a language code (keys look like "<ISO 639-3 code>_Arab") to the label
# used for it in this module. The labels are mostly country names, but not
# exclusively: some are regions ("Levantine", "Mesopotamia", "Kurdistan"),
# one is a variety name ("MSA") and one is a language name ("Acehnese").
# Several codes deliberately share one label (e.g. both Levantine codes).
# Insertion order is preserved by dict and kept identical to the original.
language_mapping_dict = {
    "ace_Arab": "Acehnese",
    "acm_Arab": "Mesopotamia",      # Gilit Mesopotamian
    "aeb_Arab": "Tunisia",
    "ajp_Arab": "Levantine",        # South Levantine
    "apc_Arab": "Levantine",
    "arb_Arab": "MSA",
    "arq_Arab": "Algeria",
    "ars_Arab": "Saudi",            # Najdi, primarily Saudi Arabian
    "ary_Arab": "Morocco",
    "arz_Arab": "Egypt",
    "ayp_Arab": "Mesopotamia",      # North Mesopotamian
    "azb_Arab": "Azerbaijan",       # South Azerbaijani
    "bcc_Arab": "Balochistan",      # Southern Balochi
    "bjn_Arab": "Indonesia",        # Banjar
    "brh_Arab": "Pakistan",         # Brahui
    "ckb_Arab": "Kurdistan",        # Central Kurdish, mainly in Iraq
    "fuv_Arab": "Nigeria",          # Hausa States Fulfulde
    "glk_Arab": "Iran",             # Gilaki
    "hac_Arab": "Iran",             # Gurani
    "kas_Arab": "Kashmir",
    "knc_Arab": "Nigeria",          # Central Kanuri
    "lki_Arab": "Iran",             # Laki
    "lrc_Arab": "Iran",             # Northern Luri
    "min_Arab": "Indonesia",        # Minangkabau
    "mzn_Arab": "Iran",             # Mazanderani
    "ota_Arab": "Turkey",           # Ottoman Turkish
    "pbt_Arab": "Afghanistan",      # Southern Pashto
    "pnb_Arab": "Pakistan",         # Western Panjabi
    "sdh_Arab": "Iraq",             # Southern Kurdish
    "shu_Arab": "Chad",             # Chadian Arabic
    "skr_Arab": "Pakistan",         # Saraiki
    "snd_Arab": "Pakistan",         # Sindhi
    "sus_Arab": "Guinea",           # Susu
    "tuk_Arab": "Turkmenistan",     # Turkmen
    "uig_Arab": "Uighur (China)",   # Uighur
    "urd_Arab": "Pakistan",         # Urdu
    "uzs_Arab": "Uzbekistan",       # Southern Uzbek
    "zsm_Arab": "Malaysia",         # Standard Malay
}
# ---------------------------------------------------------------------------
# Default values
# ---------------------------------------------------------------------------
is_binary = False
target_label = "Morocco"

# Metrics displayed by default in the multilingual leaderboard.
default_metrics = ["f1_score", "false_positive_rate"]

# Languages displayed by default in the one-vs-all leaderboard.
# Only 'MSA' is active; 'Egypt', 'Algeria', 'Tunisia' and 'Levantine' were
# listed as further candidates but are currently disabled.
default_languages = ["MSA"]
# Load the 'test' split of the evaluation dataset. This runs at import time,
# so importing this module triggers the `load_dataset` call.
eval_dataset = load_dataset(DATA_PATH, split='test')

# Supported dialects: the distinct values found in the "dialect" column.
all_target_languages = list(eval_dataset.unique("dialect"))

# The same entries plus an extra 'All' choice. `+` builds a new list, so
# 'All' is never added to all_target_languages itself.
supported_dialects = all_target_languages + ['All']

# Everything except 'All'. This is an independent copy rather than a second
# name for all_target_languages: plain assignment would make both names refer
# to one list object, so an in-place change (append/remove/sort) made through
# either name would silently alter the other.
languages_to_display_one_vs_all = list(all_target_languages)