# Shared constants and defaults for the Arabic dialect-identification
# leaderboard.
#
# NOTE: this module downloads the evaluation dataset when it is imported
# (see the `load_dataset` call near the bottom).
from datasets import load_dataset

# --- Repository / file locations ---------------------------------------
LEADERBOARD_PATH = "atlasia/Open-Arabic-Dialect-Identification-Leaderboard"
DATA_PATH = "atlasia/Arabic-LID-Leaderboard"
DIALECT_CONFUSION_LEADERBOARD_FILE = "darija_leaderboard_dialect_confusion.json"
MULTI_DIALECTS_LEADERBOARD_FILE = "darija_leaderboard_multi_dialects.json"

# --- Classification metrics ---------------------------------------------
# Names of the classification metrics; the last entry is the test-sample
# count rather than a score.
metrics = [
    "f1_score",
    "precision",
    "recall",
    "false_positive_rate",
    "false_negative_rate",
    "weighted_f1_score",
    "macro_f1_score",
    "micro_f1_score",
    "balanced_accuracy",
    "matthews_correlation",
    "specificity",
    "negative_predictive_value",
    "n_test_samples",
]

# --- Language-code mapping ----------------------------------------------
# Maps each `<iso-code>_Arab` label to a display name. The values are a mix
# of country names, regions and variety names (e.g. "MSA", "Levantine"),
# and several codes deliberately share the same display name.
language_mapping_dict = {
    "ace_Arab": "Acehnese",
    "acm_Arab": "Mesopotamia",  # Gilit Mesopotamian
    "aeb_Arab": "Tunisia",
    "ajp_Arab": "Levantine",  # South Levantine
    "apc_Arab": "Levantine",
    "arb_Arab": "MSA",
    "arq_Arab": "Algeria",
    "ars_Arab": "Saudi",  # Najdi, primarily Saudi Arabian
    "ary_Arab": "Morocco",
    "arz_Arab": "Egypt",
    "ayp_Arab": "Mesopotamia",  # North Mesopotamian
    "azb_Arab": "Azerbaijan",  # South Azerbaijani
    "bcc_Arab": "Balochistan",  # Southern Balochi
    "bjn_Arab": "Indonesia",  # Banjar
    "brh_Arab": "Pakistan",  # Brahui
    "ckb_Arab": "Kurdistan",  # Central Kurdish, mainly in Iraq
    "fuv_Arab": "Nigeria",  # Hausa States Fulfulde
    "glk_Arab": "Iran",  # Gilaki
    "hac_Arab": "Iran",  # Gurani
    "kas_Arab": "Kashmir",
    "knc_Arab": "Nigeria",  # Central Kanuri
    "lki_Arab": "Iran",  # Laki
    "lrc_Arab": "Iran",  # Northern Luri
    "min_Arab": "Indonesia",  # Minangkabau
    "mzn_Arab": "Iran",  # Mazanderani
    "ota_Arab": "Turkey",  # Ottoman Turkish
    "pbt_Arab": "Afghanistan",  # Southern Pashto
    "pnb_Arab": "Pakistan",  # Western Panjabi
    "sdh_Arab": "Iraq",  # Southern Kurdish
    "shu_Arab": "Chad",  # Chadian Arabic
    "skr_Arab": "Pakistan",  # Saraiki
    "snd_Arab": "Pakistan",  # Sindhi
    "sus_Arab": "Guinea",  # Susu
    "tuk_Arab": "Turkmenistan",  # Turkmen
    "uig_Arab": "Uighur (China)",  # Uighur
    "urd_Arab": "Pakistan",  # Urdu
    "uzs_Arab": "Uzbekistan",  # Southern Uzbek
    "zsm_Arab": "Malaysia",  # Standard Malay
}

# --- Defaults -----------------------------------------------------------
target_label = "Morocco"
is_binary = False

# Metrics displayed by default in the multilingual leaderboard.
default_metrics = [
    "f1_score",
    "false_positive_rate",
]

# Languages displayed by default in the one-vs-all leaderboard.
# Other candidates currently left disabled: Egypt, Algeria, Tunisia,
# Levantine.
default_languages = [
    "MSA",
]

# --- Evaluation data ----------------------------------------------------
# Fetch the test split of the evaluation dataset (runs at import time).
eval_dataset = load_dataset(DATA_PATH, split="test")

# Distinct values of the dataset's "dialect" column.
all_target_languages = list(eval_dataset.unique("dialect"))

# Every dialect plus the aggregate "All" entry (a new list object).
supported_dialects = [*all_target_languages, "All"]

# One-vs-all view shows every dialect except "All"; this name refers to the
# same list object as `all_target_languages`.
languages_to_display_one_vs_all = all_target_languages