|
from datasets import load_dataset |
|
|
|
|
|
|
|
# Hugging Face Hub repo that stores the leaderboard result files.
LEADERBOARD_PATH = 'atlasia/Open-Arabic-Dialect-Identification-Leaderboard'

# Hugging Face Hub dataset used as the shared evaluation set.
DATA_PATH = 'atlasia/Arabic-LID-Leaderboard'

# JSON file holding one-vs-all (dialect confusion) leaderboard entries.
DIALECT_CONFUSION_LEADERBOARD_FILE = 'darija_leaderboard_dialect_confusion.json'

# JSON file holding the multi-dialect (multiclass) leaderboard entries.
MULTI_DIALECTS_LEADERBOARD_FILE = 'darija_leaderboard_multi_dialects.json'
|
|
|
|
|
# Full set of metric column names the leaderboard can display.
# Order matters: it is the display order used downstream.
metrics = [
    # per-class scores
    'f1_score', 'precision', 'recall',
    # per-class error rates
    'false_positive_rate', 'false_negative_rate',
    # aggregated F1 variants
    'weighted_f1_score', 'macro_f1_score', 'micro_f1_score',
    # other aggregate measures
    'balanced_accuracy', 'matthews_correlation',
    'specificity', 'negative_predictive_value',
    # bookkeeping column (sample count, not a score)
    'n_test_samples',
]
|
|
|
|
|
language_mapping_dict = { |
|
'ace_Arab': 'Acehnese', |
|
'acm_Arab': 'Mesopotamia', |
|
'aeb_Arab': 'Tunisia', |
|
'ajp_Arab': 'Levantine', |
|
'apc_Arab': 'Levantine', |
|
'arb_Arab': 'MSA', |
|
'arq_Arab': 'Algeria', |
|
'ars_Arab': 'Saudi', |
|
'ary_Arab': 'Morocco', |
|
'arz_Arab': 'Egypt', |
|
'ayp_Arab': 'Mesopotamia', |
|
'azb_Arab': 'Azerbaijan', |
|
'bcc_Arab': 'Balochistan', |
|
'bjn_Arab': 'Indonesia', |
|
'brh_Arab': 'Pakistan', |
|
'ckb_Arab': 'Kurdistan', |
|
'fuv_Arab': 'Nigeria', |
|
'glk_Arab': 'Iran', |
|
'hac_Arab': 'Iran', |
|
'kas_Arab': 'Kashmir', |
|
'knc_Arab': 'Nigeria', |
|
'lki_Arab': 'Iran', |
|
'lrc_Arab': 'Iran', |
|
'min_Arab': 'Indonesia', |
|
'mzn_Arab': 'Iran', |
|
'ota_Arab': 'Turkey', |
|
'pbt_Arab': 'Afghanistan', |
|
'pnb_Arab': 'Pakistan', |
|
'sdh_Arab': 'Iraq', |
|
'shu_Arab': 'Chad', |
|
'skr_Arab': 'Pakistan', |
|
'snd_Arab': 'Pakistan', |
|
'sus_Arab': 'Guinea', |
|
'tuk_Arab': 'Turkmenistan', |
|
'uig_Arab': 'Uighur (China)', |
|
'urd_Arab': 'Pakistan', |
|
'uzs_Arab': 'Uzbekistan', |
|
'zsm_Arab': 'Malaysia' |
|
} |
|
|
|
|
|
# Positive class used for the default one-vs-all evaluation view.
target_label = "Morocco"
# When False the evaluation is treated as multiclass rather than
# binary (target vs. everything else).
is_binary = False
|
|
|
|
|
# Metric columns pre-selected when the leaderboard first loads.
default_metrics = ['f1_score', 'false_positive_rate']
|
|
|
|
|
# Language labels pre-selected when the leaderboard first loads.
default_languages = ['MSA']
|
|
|
|
|
# Load the shared evaluation split from the Hugging Face Hub at import time.
# NOTE(review): this performs network I/O (or a cache hit) on module import.
eval_dataset = load_dataset(DATA_PATH, split='test')




# Distinct dialect labels present in the test split; the display order is
# whatever `Dataset.unique` yields — presumably first-occurrence order, verify
# against the `datasets` version in use.
all_target_languages = list(eval_dataset.unique("dialect"))
# Dropdown options for the multi-dialect view: every dialect plus 'All'.
supported_dialects = all_target_languages + ['All']
# NOTE(review): this aliases the same list object as `all_target_languages`
# (no copy) — mutating one mutates the other.
languages_to_display_one_vs_all = all_target_languages
|
|