theo committed on
Commit
827b7ef
1 Parent(s): 1cc3978

validate against langcodes via textarea, better error display

Browse files
Files changed (2) hide show
  1. language_set.json +0 -478
  2. tagging_app.py +56 -48
language_set.json DELETED
@@ -1,478 +0,0 @@
1
- {
2
- "code": "Programming language code",
3
- "aa": "Afar",
4
- "ab": "Abkhazian",
5
- "ace": "Achinese",
6
- "ach": "Acoli",
7
- "ada": "Adangme",
8
- "ady": "Adyghe, Adygei",
9
- "ae": "Avestan",
10
- "af": "Afrikaans",
11
- "afa": "Afro-Asiatic languages",
12
- "afh": "Afrihili",
13
- "ain": "Ainu (Japan)",
14
- "ak": "Akan",
15
- "akk": "Akkadian",
16
- "ale": "Aleut",
17
- "alg": "Algonquian languages",
18
- "alt": "Southern Altai",
19
- "am": "Amharic",
20
- "an": "Aragonese",
21
- "ang": "Old English (ca. 450-1100)",
22
- "apa": "Apache languages",
23
- "ar": "Arabic",
24
- "arc": "Official Aramaic (700-300 BCE), Imperial Aramaic (700-300 BCE)",
25
- "arn": "Mapudungun, Mapuche",
26
- "arp": "Arapaho",
27
- "art": "Artificial languages",
28
- "arw": "Arawak",
29
- "as": "Assamese",
30
- "ast": "Asturian, Asturleonese, Bable, Leonese",
31
- "ath": "Athapascan languages",
32
- "aus": "Australian languages",
33
- "av": "Avaric",
34
- "awa": "Awadhi",
35
- "ay": "Aymara",
36
- "az": "Azerbaijani",
37
- "ba": "Bashkir",
38
- "bad": "Banda languages",
39
- "bai": "Bamileke languages",
40
- "bal": "Baluchi",
41
- "ban": "Balinese",
42
- "bas": "Basa (Cameroon)",
43
- "bat": "Baltic languages",
44
- "be": "Belarusian",
45
- "bej": "Beja, Bedawiyet",
46
- "bem": "Bemba (Zambia)",
47
- "ber": "Berber languages",
48
- "bg": "Bulgarian",
49
- "bh": "Bihari languages",
50
- "bho": "Bhojpuri",
51
- "bi": "Bislama",
52
- "bik": "Bikol",
53
- "bin": "Bini, Edo",
54
- "bla": "Siksika",
55
- "bm": "Bambara",
56
- "bn": "Bengali, Bangla",
57
- "bnt": "Bantu languages",
58
- "bo": "Tibetan",
59
- "br": "Breton",
60
- "bra": "Braj",
61
- "bs": "Bosnian",
62
- "btk": "Batak languages",
63
- "bua": "Buriat",
64
- "bug": "Buginese",
65
- "byn": "Bilin, Blin",
66
- "ca": "Catalan, Valencian",
67
- "cad": "Caddo",
68
- "cai": "Central American Indian languages",
69
- "car": "Galibi Carib",
70
- "cau": "Caucasian languages",
71
- "ce": "Chechen",
72
- "ceb": "Cebuano",
73
- "cel": "Celtic languages",
74
- "ch": "Chamorro",
75
- "chb": "Chibcha",
76
- "chg": "Chagatai",
77
- "chk": "Chuukese",
78
- "chm": "Mari (Russia)",
79
- "chn": "Chinook jargon",
80
- "cho": "Choctaw",
81
- "chp": "Chipewyan, Dene Suline",
82
- "chr": "Cherokee",
83
- "chy": "Cheyenne",
84
- "cmc": "Chamic languages",
85
- "co": "Corsican",
86
- "cop": "Coptic",
87
- "cpe": "English-based creoles and pidgins",
88
- "cpf": "French-based creoles and pidgins",
89
- "cpp": "Portuguese-based creoles and pidgins",
90
- "cr": "Cree",
91
- "crh": "Crimean Tatar, Crimean Turkish",
92
- "crp": "Creoles and pidgins",
93
- "cs": "Czech",
94
- "csb": "Kashubian",
95
- "cu": "Church Slavic, Church Slavonic, Old Bulgarian, Old Church Slavonic, Old Slavonic",
96
- "cus": "Cushitic languages",
97
- "cv": "Chuvash",
98
- "cy": "Welsh",
99
- "da": "Danish",
100
- "dak": "Dakota",
101
- "dar": "Dargwa",
102
- "day": "Land Dayak languages",
103
- "de": "German",
104
- "del": "Delaware",
105
- "den": "Slave (Athapascan)",
106
- "dgr": "Dogrib, T\u0142\u0131\u0328ch\u01eb",
107
- "din": "Dinka",
108
- "doi": "Dogri (macrolanguage)",
109
- "dra": "Dravidian languages",
110
- "dsb": "Lower Sorbian",
111
- "dua": "Duala",
112
- "dum": "Middle Dutch (ca. 1050-1350)",
113
- "dv": "Dhivehi, Divehi, Maldivian",
114
- "dyu": "Dyula",
115
- "dz": "Dzongkha",
116
- "ee": "Ewe",
117
- "efi": "Efik",
118
- "egy": "Egyptian (Ancient)",
119
- "eka": "Ekajuk",
120
- "el": "Modern Greek (1453-)",
121
- "elx": "Elamite",
122
- "en": "English",
123
- "enm": "Middle English (1100-1500)",
124
- "eo": "Esperanto",
125
- "es": "Spanish, Castilian",
126
- "et": "Estonian",
127
- "eu": "Basque",
128
- "ewo": "Ewondo",
129
- "fa": "Persian",
130
- "fan": "Fang (Equatorial Guinea)",
131
- "fat": "Fanti",
132
- "ff": "Fulah",
133
- "fi": "Finnish",
134
- "fil": "Filipino, Pilipino",
135
- "fiu": "Finno-Ugrian languages",
136
- "fj": "Fijian",
137
- "fo": "Faroese",
138
- "fon": "Fon",
139
- "fr": "French",
140
- "frm": "Middle French (ca. 1400-1600)",
141
- "fro": "Old French (842-ca. 1400)",
142
- "fur": "Friulian",
143
- "fy": "Western Frisian",
144
- "ga": "Irish",
145
- "gaa": "Ga",
146
- "gay": "Gayo",
147
- "gba": "Gbaya (Central African Republic)",
148
- "gd": "Scottish Gaelic, Gaelic",
149
- "gem": "Germanic languages",
150
- "gez": "Geez",
151
- "gil": "Gilbertese",
152
- "gl": "Galician",
153
- "gmh": "Middle High German (ca. 1050-1500)",
154
- "gn": "Guarani",
155
- "goh": "Old High German (ca. 750-1050)",
156
- "gon": "Gondi",
157
- "gor": "Gorontalo",
158
- "got": "Gothic",
159
- "grb": "Grebo",
160
- "grc": "Ancient Greek (to 1453)",
161
- "gu": "Gujarati",
162
- "gv": "Manx",
163
- "gwi": "Gwich\u02bcin",
164
- "ha": "Hausa",
165
- "hai": "Haida",
166
- "haw": "Hawaiian",
167
- "he": "Hebrew",
168
- "hi": "Hindi",
169
- "hil": "Hiligaynon",
170
- "him": "Himachali languages, Western Pahari languages",
171
- "hit": "Hittite",
172
- "hmn": "Hmong, Mong",
173
- "ho": "Hiri Motu",
174
- "hr": "Croatian",
175
- "hsb": "Upper Sorbian",
176
- "ht": "Haitian, Haitian Creole",
177
- "hu": "Hungarian",
178
- "hup": "Hupa",
179
- "hy": "Armenian",
180
- "hz": "Herero",
181
- "ia": "Interlingua (International Auxiliary Language Association)",
182
- "iba": "Iban",
183
- "id": "Indonesian",
184
- "ie": "Interlingue, Occidental",
185
- "ig": "Igbo",
186
- "ii": "Sichuan Yi, Nuosu",
187
- "ijo": "Ijo languages",
188
- "ik": "Inupiaq",
189
- "ilo": "Iloko",
190
- "inc": "Indic languages",
191
- "ine": "Indo-European languages",
192
- "inh": "Ingush",
193
- "io": "Ido",
194
- "ira": "Iranian languages",
195
- "iro": "Iroquoian languages",
196
- "is": "Icelandic",
197
- "it": "Italian",
198
- "iu": "Inuktitut",
199
- "ja": "Japanese",
200
- "jbo": "Lojban",
201
- "jpr": "Judeo-Persian",
202
- "jrb": "Judeo-Arabic",
203
- "jv": "Javanese",
204
- "ka": "Georgian",
205
- "kaa": "Kara-Kalpak, Karakalpak",
206
- "kab": "Kabyle",
207
- "kac": "Kachin, Jingpho",
208
- "kam": "Kamba (Kenya)",
209
- "kar": "Karen languages",
210
- "kaw": "Kawi",
211
- "kbd": "Kabardian",
212
- "kg": "Kongo",
213
- "kha": "Khasi",
214
- "khi": "Khoisan languages",
215
- "kho": "Khotanese, Sakan",
216
- "ki": "Kikuyu, Gikuyu",
217
- "kj": "Kuanyama, Kwanyama",
218
- "kk": "Kazakh",
219
- "kl": "Kalaallisut, Greenlandic",
220
- "km": "Khmer, Central Khmer",
221
- "kmb": "Kimbundu",
222
- "kn": "Kannada",
223
- "ko": "Korean",
224
- "kok": "Konkani (macrolanguage)",
225
- "kos": "Kosraean",
226
- "kpe": "Kpelle",
227
- "kr": "Kanuri",
228
- "krc": "Karachay-Balkar",
229
- "kro": "Kru languages",
230
- "kru": "Kurukh",
231
- "ks": "Kashmiri",
232
- "ku": "Kurdish",
233
- "kum": "Kumyk",
234
- "kut": "Kutenai",
235
- "kv": "Komi",
236
- "kw": "Cornish",
237
- "ky": "Kirghiz, Kyrgyz",
238
- "la": "Latin",
239
- "lad": "Ladino",
240
- "lah": "Lahnda",
241
- "lam": "Lamba",
242
- "lb": "Luxembourgish, Letzeburgesch",
243
- "lez": "Lezghian",
244
- "lg": "Ganda, Luganda",
245
- "li": "Limburgan, Limburger, Limburgish",
246
- "ln": "Lingala",
247
- "lo": "Lao",
248
- "lol": "Mongo",
249
- "loz": "Lozi",
250
- "lt": "Lithuanian",
251
- "lu": "Luba-Katanga",
252
- "lua": "Luba-Lulua",
253
- "lui": "Luiseno",
254
- "lun": "Lunda",
255
- "luo": "Luo (Kenya and Tanzania), Dholuo",
256
- "lus": "Lushai",
257
- "lv": "Latvian",
258
- "mad": "Madurese",
259
- "mag": "Magahi",
260
- "mai": "Maithili",
261
- "mak": "Makasar",
262
- "man": "Mandingo, Manding",
263
- "map": "Austronesian languages",
264
- "mas": "Masai",
265
- "mdf": "Moksha",
266
- "mdr": "Mandar",
267
- "men": "Mende (Sierra Leone)",
268
- "mg": "Malagasy",
269
- "mga": "Middle Irish (900-1200)",
270
- "mh": "Marshallese",
271
- "mi": "Maori",
272
- "mic": "Mi'kmaq, Micmac",
273
- "min": "Minangkabau",
274
- "mis": "Uncoded languages",
275
- "mk": "Macedonian",
276
- "mkh": "Mon-Khmer languages",
277
- "ml": "Malayalam",
278
- "mn": "Mongolian",
279
- "mnc": "Manchu",
280
- "mni": "Manipuri",
281
- "mno": "Manobo languages",
282
- "moh": "Mohawk",
283
- "mos": "Mossi",
284
- "mr": "Marathi",
285
- "ms": "Malay (macrolanguage)",
286
- "mt": "Maltese",
287
- "mul": "Multiple languages",
288
- "mun": "Munda languages",
289
- "mus": "Creek",
290
- "mwl": "Mirandese",
291
- "mwr": "Marwari",
292
- "my": "Burmese",
293
- "myn": "Mayan languages",
294
- "myv": "Erzya",
295
- "na": "Nauru",
296
- "nah": "Nahuatl languages",
297
- "nai": "North American Indian languages",
298
- "nap": "Neapolitan",
299
- "nb": "Norwegian Bokm\u00e5l",
300
- "nd": "North Ndebele",
301
- "nds": "Low German, Low Saxon",
302
- "ne": "Nepali (macrolanguage)",
303
- "new": "Newari, Nepal Bhasa",
304
- "ng": "Ndonga",
305
- "nia": "Nias",
306
- "nic": "Niger-Kordofanian languages",
307
- "niu": "Niuean",
308
- "nl": "Dutch, Flemish",
309
- "nn": "Norwegian Nynorsk",
310
- "no": "Norwegian",
311
- "nog": "Nogai",
312
- "non": "Old Norse",
313
- "nr": "South Ndebele",
314
- "nso": "Pedi, Northern Sotho, Sepedi",
315
- "nub": "Nubian languages",
316
- "nv": "Navajo, Navaho",
317
- "nwc": "Classical Newari, Classical Nepal Bhasa, Old Newari",
318
- "ny": "Nyanja, Chewa, Chichewa",
319
- "nym": "Nyamwezi",
320
- "nyn": "Nyankole",
321
- "nyo": "Nyoro",
322
- "nzi": "Nzima",
323
- "oc": "Occitan (post 1500)",
324
- "oj": "Ojibwa",
325
- "om": "Oromo",
326
- "or": "Oriya (macrolanguage), Odia (macrolanguage)",
327
- "os": "Ossetian, Ossetic",
328
- "osa": "Osage",
329
- "ota": "Ottoman Turkish (1500-1928)",
330
- "oto": "Otomian languages",
331
- "pa": "Panjabi, Punjabi",
332
- "paa": "Papuan languages",
333
- "pag": "Pangasinan",
334
- "pal": "Pahlavi",
335
- "pam": "Pampanga, Kapampangan",
336
- "pap": "Papiamento",
337
- "pau": "Palauan",
338
- "peo": "Old Persian (ca. 600-400 B.C.)",
339
- "phi": "Philippine languages",
340
- "phn": "Phoenician",
341
- "pi": "Pali",
342
- "pl": "Polish",
343
- "pon": "Pohnpeian",
344
- "pra": "Prakrit languages",
345
- "pro": "Old Proven\u00e7al (to 1500), Old Occitan (to 1500)",
346
- "ps": "Pushto, Pashto",
347
- "pt": "Portuguese",
348
- "qu": "Quechua",
349
- "raj": "Rajasthani",
350
- "rap": "Rapanui",
351
- "rar": "Rarotongan, Cook Islands Maori",
352
- "rm": "Romansh",
353
- "rn": "Rundi",
354
- "ro": "Romanian, Moldavian, Moldovan",
355
- "roa": "Romance languages",
356
- "rom": "Romany",
357
- "ru": "Russian",
358
- "rup": "Macedo-Romanian, Aromanian, Arumanian",
359
- "rw": "Kinyarwanda",
360
- "sa": "Sanskrit",
361
- "sad": "Sandawe",
362
- "sah": "Yakut",
363
- "sai": "South American Indian languages",
364
- "sal": "Salishan languages",
365
- "sam": "Samaritan Aramaic",
366
- "sas": "Sasak",
367
- "sat": "Santali",
368
- "sc": "Sardinian",
369
- "scn": "Sicilian",
370
- "sco": "Scots",
371
- "sd": "Sindhi",
372
- "se": "Northern Sami",
373
- "sel": "Selkup",
374
- "sem": "Semitic languages",
375
- "sg": "Sango",
376
- "sga": "Old Irish (to 900)",
377
- "sgn": "Sign languages",
378
- "sh": "Serbo-Croatian",
379
- "shn": "Shan",
380
- "si": "Sinhala, Sinhalese",
381
- "sid": "Sidamo",
382
- "sio": "Siouan languages",
383
- "sit": "Sino-Tibetan languages",
384
- "sk": "Slovak",
385
- "sl": "Slovenian",
386
- "sla": "Slavic languages",
387
- "sm": "Samoan",
388
- "sma": "Southern Sami",
389
- "smi": "Sami languages",
390
- "smj": "Lule Sami",
391
- "smn": "Inari Sami",
392
- "sms": "Skolt Sami",
393
- "sn": "Shona",
394
- "snk": "Soninke",
395
- "so": "Somali",
396
- "sog": "Sogdian",
397
- "son": "Songhai languages",
398
- "sq": "Albanian",
399
- "sr": "Serbian",
400
- "srn": "Sranan Tongo",
401
- "srr": "Serer",
402
- "ss": "Swati",
403
- "ssa": "Nilo-Saharan languages",
404
- "st": "Southern Sotho",
405
- "su": "Sundanese",
406
- "suk": "Sukuma",
407
- "sus": "Susu",
408
- "sux": "Sumerian",
409
- "sv": "Swedish",
410
- "sw": "Swahili (macrolanguage)",
411
- "syr": "Syriac",
412
- "ta": "Tamil",
413
- "tai": "Tai languages",
414
- "te": "Telugu",
415
- "tem": "Timne",
416
- "ter": "Tereno",
417
- "tet": "Tetum",
418
- "tg": "Tajik",
419
- "th": "Thai",
420
- "ti": "Tigrinya",
421
- "tig": "Tigre",
422
- "tiv": "Tiv",
423
- "tk": "Turkmen",
424
- "tkl": "Tokelau",
425
- "tl": "Tagalog",
426
- "tlh": "Klingon, tlhIngan Hol",
427
- "tli": "Tlingit",
428
- "tmh": "Tamashek",
429
- "tn": "Tswana",
430
- "to": "Tonga (Tonga Islands)",
431
- "tog": "Tonga (Nyasa)",
432
- "tpi": "Tok Pisin",
433
- "tr": "Turkish",
434
- "ts": "Tsonga",
435
- "tsi": "Tsimshian",
436
- "tt": "Tatar",
437
- "tum": "Tumbuka",
438
- "tup": "Tupi languages",
439
- "tut": "Altaic languages",
440
- "tvl": "Tuvalu",
441
- "tw": "Twi",
442
- "ty": "Tahitian",
443
- "tyv": "Tuvinian",
444
- "udm": "Udmurt",
445
- "ug": "Uighur, Uyghur",
446
- "uga": "Ugaritic",
447
- "uk": "Ukrainian",
448
- "umb": "Umbundu",
449
- "und": "Undetermined",
450
- "ur": "Urdu",
451
- "uz": "Uzbek",
452
- "vai": "Vai",
453
- "ve": "Venda",
454
- "vi": "Vietnamese",
455
- "vo": "Volap\u00fck",
456
- "vot": "Votic",
457
- "wa": "Walloon",
458
- "wak": "Wakashan languages",
459
- "wal": "Wolaytta, Wolaitta",
460
- "war": "Waray (Philippines)",
461
- "was": "Washo",
462
- "wen": "Sorbian languages",
463
- "wo": "Wolof",
464
- "xal": "Kalmyk, Oirat",
465
- "xh": "Xhosa",
466
- "yao": "Yao",
467
- "yap": "Yapese",
468
- "yi": "Yiddish",
469
- "yo": "Yoruba",
470
- "ypk": "Yupik languages",
471
- "za": "Zhuang, Chuang",
472
- "zap": "Zapotec",
473
- "zen": "Zenaga",
474
- "zh": "Chinese",
475
- "znd": "Zande languages",
476
- "zu": "Zulu",
477
- "zun": "Zuni"
478
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tagging_app.py CHANGED
@@ -2,9 +2,10 @@ import json
2
  from pathlib import Path
3
  from typing import Callable, Dict, List, Tuple
4
 
 
5
  import streamlit as st
6
  import yaml
7
- from datasets.utils.metadata_validator import DatasetMetadata
8
 
9
  st.set_page_config(
10
  page_title="HF Dataset Tagging App",
@@ -13,9 +14,20 @@ st.set_page_config(
13
  initial_sidebar_state="auto",
14
  )
15
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  task_set = json.load(open("task_set.json"))
17
  license_set = json.load(open("license_set.json"))
18
- language_set_restricted = json.load(open("language_set.json"))
19
 
20
  multilinguality_set = {
21
  "monolingual": "contains a single language",
@@ -74,30 +86,20 @@ def multiselect(
74
  format_func: Callable = str,
75
  ):
76
  valid_values, invalid_values = split_known(values, valid_set)
77
- w.markdown(
78
- """
79
- #### {title}
80
- {errors}
81
- """.format(
82
- title=title, errors="" if len(invalid_values) == 0 else f"_Found invalid values:_ `{invalid_values}`"
83
- )
84
- )
85
  return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func)
86
 
87
 
88
- def validate_dict(state_dict: Dict) -> str:
89
  try:
90
  DatasetMetadata(**state_dict)
91
- valid = "✔️ This is a valid tagset! 🤗"
92
  except Exception as e:
93
- valid = f"""
94
- 🙁 This is an invalid tagset, here are the errors in it:
95
- ```
96
- {e}
97
- ```
98
- You're _very_ welcome to fix these issues and submit a new PR on [`datasets`](https://github.com/huggingface/datasets/)
99
- """
100
- return valid
101
 
102
 
103
  def new_state():
@@ -131,15 +133,6 @@ st.sidebar.markdown(
131
 
132
  This app aims to make it easier to add structured tags to the datasets present in the library.
133
 
134
- Each configuration requires its own tasks, as these often correspond to distinct sub-tasks. However, we provide the opportunity
135
- to pre-load the tag sets from another dataset or configuration to avoid too much redundancy.
136
-
137
- The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
138
-
139
- ### Preloading an existing tag set
140
-
141
- You can load an existing tag set to get started if you want.
142
- Beware that clicking pre-load will overwrite the current state!
143
  """
144
  )
145
 
@@ -163,19 +156,23 @@ if leftbtn.button("pre-load"):
163
  initial_state = existing_tag_sets[preloaded_id]
164
  state = initial_state or new_state()
165
  st.experimental_set_query_params(preload_dataset=preloaded_id)
166
- if rightbtn.button("flush state"):
167
- state = new_state()
168
- initial_state = None
169
- preloaded_id = None
170
- st.experimental_set_query_params()
 
171
 
172
  if preloaded_id is not None and initial_state is not None:
173
- valid = validate_dict(initial_state)
174
  st.sidebar.markdown(
175
  f"""
176
  ---
177
  The current base tagset is [`{preloaded_id}`](https://huggingface.co/datasets/{preloaded_id})
178
- {valid}
 
 
 
 
179
  Here is the matching yaml block:
180
 
181
  ```yaml
@@ -235,15 +232,23 @@ if "other" in state["multilinguality"]:
235
  st.write(f"Registering other-{other_multilinguality} multilinguality")
236
  state["multilinguality"][state["multilinguality"].index("other")] = f"other-{other_multilinguality}"
237
 
238
- state["languages"] = multiselect(
239
- leftcol,
240
- "Languages",
241
- "What languages are represented in the dataset?",
242
- values=state["languages"],
243
- valid_set=list(language_set_restricted.keys()),
244
- format_func=lambda m: f"{m} : {language_set_restricted[m]}",
 
 
 
 
 
 
 
 
245
  )
246
-
247
 
248
  leftcol.markdown("### Dataset creators")
249
  state["language_creators"] = multiselect(
@@ -329,12 +334,16 @@ state["size_categories"] = [
329
  ## Show results
330
  ########################
331
 
332
- valid = validate_dict(state)
333
  rightcol.markdown(
334
  f"""
335
  ### Finalized tag set
336
 
337
- {valid}
 
 
 
 
 
338
 
339
  ```yaml
340
  {yaml.dump(state)}
@@ -349,5 +358,4 @@ This is a standalone tool, it is useful to check for errors on an existing tagse
349
  yamlblock = rightcol.text_area("Input your yaml here")
350
  if yamlblock.strip() != "":
351
  inputdict = yaml.safe_load(yamlblock)
352
- valid = validate_dict(inputdict)
353
- rightcol.markdown(valid)
 
2
  from pathlib import Path
3
  from typing import Callable, Dict, List, Tuple
4
 
5
+ import langcodes as lc
6
  import streamlit as st
7
  import yaml
8
+ from datasets.utils.metadata import DatasetMetadata
9
 
10
  st.set_page_config(
11
  page_title="HF Dataset Tagging App",
 
14
  initial_sidebar_state="auto",
15
  )
16
 
17
+ # XXX: restyling errors as streamlit does not respect whitespaces on `st.error` and doesn't scroll horizontally, which
18
+ # generally makes things easier when reading error reports
19
+ st.markdown(
20
+ """
21
+ <style>
22
+ div[role=alert] { overflow-x: scroll}
23
+ div.stAlert p { white-space: pre }
24
+ </style>
25
+ """,
26
+ unsafe_allow_html=True,
27
+ )
28
+
29
  task_set = json.load(open("task_set.json"))
30
  license_set = json.load(open("license_set.json"))
 
31
 
32
  multilinguality_set = {
33
  "monolingual": "contains a single language",
 
86
  format_func: Callable = str,
87
  ):
88
  valid_values, invalid_values = split_known(values, valid_set)
89
+ w.markdown(f"#### {title}")
90
+ if len(invalid_values) > 0:
91
+ w.markdown("Found the following invalid values:")
92
+ w.error(invalid_values)
 
 
 
 
93
  return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func)
94
 
95
 
96
def validate_dict(w: st.delta_generator.DeltaGenerator, state_dict: Dict):
    """Validate a tag set and report the outcome inside the container ``w``.

    The tag set is checked by building a ``DatasetMetadata`` from the entries
    of ``state_dict`` passed as keyword arguments. Nothing is returned: the
    verdict is written straight to ``w``.

    Args:
        w: Streamlit element whose ``markdown`` and ``error`` methods are used
            to display the result (e.g. the sidebar or a column).
        state_dict: Mapping of tag category to values, unpacked into
            ``DatasetMetadata``.
    """
    try:
        DatasetMetadata(**state_dict)
        w.markdown(" This is a valid tagset! 🤗")
    except Exception as err:
        # Any failure is shown to the user instead of interrupting the app;
        # the exception itself goes to `w.error` so its message is displayed.
        w.markdown("❌ This is an invalid tagset, here are the errors in it:")
        w.error(err)
 
 
 
 
 
 
103
 
104
 
105
  def new_state():
 
133
 
134
  This app aims to make it easier to add structured tags to the datasets present in the library.
135
 
 
 
 
 
 
 
 
 
 
136
  """
137
  )
138
 
 
156
  initial_state = existing_tag_sets[preloaded_id]
157
  state = initial_state or new_state()
158
  st.experimental_set_query_params(preload_dataset=preloaded_id)
159
+ if sum(len(v) if v is not None else 0 for v in state.values()) > 0:
160
+ if rightbtn.button("flush state"):
161
+ state = new_state()
162
+ initial_state = None
163
+ preloaded_id = None
164
+ st.experimental_set_query_params()
165
 
166
  if preloaded_id is not None and initial_state is not None:
 
167
  st.sidebar.markdown(
168
  f"""
169
  ---
170
  The current base tagset is [`{preloaded_id}`](https://huggingface.co/datasets/{preloaded_id})
171
+ """
172
+ )
173
+ validate_dict(st.sidebar, initial_state)
174
+ st.sidebar.markdown(
175
+ f"""
176
  Here is the matching yaml block:
177
 
178
  ```yaml
 
232
  st.write(f"Registering other-{other_multilinguality} multilinguality")
233
  state["multilinguality"][state["multilinguality"].index("other")] = f"other-{other_multilinguality}"
234
 
235
# Languages are entered as free text and checked with `langcodes`: a tag is
# kept as valid when `lc.get` accepts it and reported as invalid when it raises.
valid_values, invalid_values = [], []
for langtag in state["languages"]:
    try:
        lc.get(langtag)
    except Exception:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit and other BaseException-based
        # signals are not swallowed and misreported as an invalid tag.
        invalid_values.append(langtag)
    else:
        valid_values.append(langtag)

leftcol.markdown("#### Languages")
if invalid_values:
    leftcol.markdown("Found the following invalid values:")
    leftcol.error(invalid_values)

# Only the tags that passed validation are pre-filled in the text area.
langtags = leftcol.text_area(
    "What languages are represented in the dataset? expected format is BCP47 tags separated for ';' e.g. 'en-US;fr-FR'",
    value=";".join(valid_values),
)
# `"".split(";")` yields `[""]`, and a trailing ';' or spaces around a tag
# would likewise produce empty / padded entries. Strip each piece and drop
# blanks so an empty text area gives an empty list of languages.
state["languages"] = [tag.strip() for tag in langtags.split(";") if tag.strip()]
252
 
253
  leftcol.markdown("### Dataset creators")
254
  state["language_creators"] = multiselect(
 
334
  ## Show results
335
  ########################
336
 
 
337
  rightcol.markdown(
338
  f"""
339
  ### Finalized tag set
340
 
341
+ """
342
+ )
343
+ validate_dict(rightcol, state)
344
+
345
+ rightcol.markdown(
346
+ f"""
347
 
348
  ```yaml
349
  {yaml.dump(state)}
 
358
  yamlblock = rightcol.text_area("Input your yaml here")
359
  if yamlblock.strip() != "":
360
  inputdict = yaml.safe_load(yamlblock)
361
+ validate_dict(rightcol, inputdict)