Spaces:
Running
Running
theo
commited on
Commit
•
827b7ef
1
Parent(s):
1cc3978
validate against langcodes via textarea, better error display
Browse files- language_set.json +0 -478
- tagging_app.py +56 -48
language_set.json
DELETED
@@ -1,478 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"code": "Programming language code",
|
3 |
-
"aa": "Afar",
|
4 |
-
"ab": "Abkhazian",
|
5 |
-
"ace": "Achinese",
|
6 |
-
"ach": "Acoli",
|
7 |
-
"ada": "Adangme",
|
8 |
-
"ady": "Adyghe, Adygei",
|
9 |
-
"ae": "Avestan",
|
10 |
-
"af": "Afrikaans",
|
11 |
-
"afa": "Afro-Asiatic languages",
|
12 |
-
"afh": "Afrihili",
|
13 |
-
"ain": "Ainu (Japan)",
|
14 |
-
"ak": "Akan",
|
15 |
-
"akk": "Akkadian",
|
16 |
-
"ale": "Aleut",
|
17 |
-
"alg": "Algonquian languages",
|
18 |
-
"alt": "Southern Altai",
|
19 |
-
"am": "Amharic",
|
20 |
-
"an": "Aragonese",
|
21 |
-
"ang": "Old English (ca. 450-1100)",
|
22 |
-
"apa": "Apache languages",
|
23 |
-
"ar": "Arabic",
|
24 |
-
"arc": "Official Aramaic (700-300 BCE), Imperial Aramaic (700-300 BCE)",
|
25 |
-
"arn": "Mapudungun, Mapuche",
|
26 |
-
"arp": "Arapaho",
|
27 |
-
"art": "Artificial languages",
|
28 |
-
"arw": "Arawak",
|
29 |
-
"as": "Assamese",
|
30 |
-
"ast": "Asturian, Asturleonese, Bable, Leonese",
|
31 |
-
"ath": "Athapascan languages",
|
32 |
-
"aus": "Australian languages",
|
33 |
-
"av": "Avaric",
|
34 |
-
"awa": "Awadhi",
|
35 |
-
"ay": "Aymara",
|
36 |
-
"az": "Azerbaijani",
|
37 |
-
"ba": "Bashkir",
|
38 |
-
"bad": "Banda languages",
|
39 |
-
"bai": "Bamileke languages",
|
40 |
-
"bal": "Baluchi",
|
41 |
-
"ban": "Balinese",
|
42 |
-
"bas": "Basa (Cameroon)",
|
43 |
-
"bat": "Baltic languages",
|
44 |
-
"be": "Belarusian",
|
45 |
-
"bej": "Beja, Bedawiyet",
|
46 |
-
"bem": "Bemba (Zambia)",
|
47 |
-
"ber": "Berber languages",
|
48 |
-
"bg": "Bulgarian",
|
49 |
-
"bh": "Bihari languages",
|
50 |
-
"bho": "Bhojpuri",
|
51 |
-
"bi": "Bislama",
|
52 |
-
"bik": "Bikol",
|
53 |
-
"bin": "Bini, Edo",
|
54 |
-
"bla": "Siksika",
|
55 |
-
"bm": "Bambara",
|
56 |
-
"bn": "Bengali, Bangla",
|
57 |
-
"bnt": "Bantu languages",
|
58 |
-
"bo": "Tibetan",
|
59 |
-
"br": "Breton",
|
60 |
-
"bra": "Braj",
|
61 |
-
"bs": "Bosnian",
|
62 |
-
"btk": "Batak languages",
|
63 |
-
"bua": "Buriat",
|
64 |
-
"bug": "Buginese",
|
65 |
-
"byn": "Bilin, Blin",
|
66 |
-
"ca": "Catalan, Valencian",
|
67 |
-
"cad": "Caddo",
|
68 |
-
"cai": "Central American Indian languages",
|
69 |
-
"car": "Galibi Carib",
|
70 |
-
"cau": "Caucasian languages",
|
71 |
-
"ce": "Chechen",
|
72 |
-
"ceb": "Cebuano",
|
73 |
-
"cel": "Celtic languages",
|
74 |
-
"ch": "Chamorro",
|
75 |
-
"chb": "Chibcha",
|
76 |
-
"chg": "Chagatai",
|
77 |
-
"chk": "Chuukese",
|
78 |
-
"chm": "Mari (Russia)",
|
79 |
-
"chn": "Chinook jargon",
|
80 |
-
"cho": "Choctaw",
|
81 |
-
"chp": "Chipewyan, Dene Suline",
|
82 |
-
"chr": "Cherokee",
|
83 |
-
"chy": "Cheyenne",
|
84 |
-
"cmc": "Chamic languages",
|
85 |
-
"co": "Corsican",
|
86 |
-
"cop": "Coptic",
|
87 |
-
"cpe": "English-based creoles and pidgins",
|
88 |
-
"cpf": "French-based creoles and pidgins",
|
89 |
-
"cpp": "Portuguese-based creoles and pidgins",
|
90 |
-
"cr": "Cree",
|
91 |
-
"crh": "Crimean Tatar, Crimean Turkish",
|
92 |
-
"crp": "Creoles and pidgins",
|
93 |
-
"cs": "Czech",
|
94 |
-
"csb": "Kashubian",
|
95 |
-
"cu": "Church Slavic, Church Slavonic, Old Bulgarian, Old Church Slavonic, Old Slavonic",
|
96 |
-
"cus": "Cushitic languages",
|
97 |
-
"cv": "Chuvash",
|
98 |
-
"cy": "Welsh",
|
99 |
-
"da": "Danish",
|
100 |
-
"dak": "Dakota",
|
101 |
-
"dar": "Dargwa",
|
102 |
-
"day": "Land Dayak languages",
|
103 |
-
"de": "German",
|
104 |
-
"del": "Delaware",
|
105 |
-
"den": "Slave (Athapascan)",
|
106 |
-
"dgr": "Dogrib, T\u0142\u0131\u0328ch\u01eb",
|
107 |
-
"din": "Dinka",
|
108 |
-
"doi": "Dogri (macrolanguage)",
|
109 |
-
"dra": "Dravidian languages",
|
110 |
-
"dsb": "Lower Sorbian",
|
111 |
-
"dua": "Duala",
|
112 |
-
"dum": "Middle Dutch (ca. 1050-1350)",
|
113 |
-
"dv": "Dhivehi, Divehi, Maldivian",
|
114 |
-
"dyu": "Dyula",
|
115 |
-
"dz": "Dzongkha",
|
116 |
-
"ee": "Ewe",
|
117 |
-
"efi": "Efik",
|
118 |
-
"egy": "Egyptian (Ancient)",
|
119 |
-
"eka": "Ekajuk",
|
120 |
-
"el": "Modern Greek (1453-)",
|
121 |
-
"elx": "Elamite",
|
122 |
-
"en": "English",
|
123 |
-
"enm": "Middle English (1100-1500)",
|
124 |
-
"eo": "Esperanto",
|
125 |
-
"es": "Spanish, Castilian",
|
126 |
-
"et": "Estonian",
|
127 |
-
"eu": "Basque",
|
128 |
-
"ewo": "Ewondo",
|
129 |
-
"fa": "Persian",
|
130 |
-
"fan": "Fang (Equatorial Guinea)",
|
131 |
-
"fat": "Fanti",
|
132 |
-
"ff": "Fulah",
|
133 |
-
"fi": "Finnish",
|
134 |
-
"fil": "Filipino, Pilipino",
|
135 |
-
"fiu": "Finno-Ugrian languages",
|
136 |
-
"fj": "Fijian",
|
137 |
-
"fo": "Faroese",
|
138 |
-
"fon": "Fon",
|
139 |
-
"fr": "French",
|
140 |
-
"frm": "Middle French (ca. 1400-1600)",
|
141 |
-
"fro": "Old French (842-ca. 1400)",
|
142 |
-
"fur": "Friulian",
|
143 |
-
"fy": "Western Frisian",
|
144 |
-
"ga": "Irish",
|
145 |
-
"gaa": "Ga",
|
146 |
-
"gay": "Gayo",
|
147 |
-
"gba": "Gbaya (Central African Republic)",
|
148 |
-
"gd": "Scottish Gaelic, Gaelic",
|
149 |
-
"gem": "Germanic languages",
|
150 |
-
"gez": "Geez",
|
151 |
-
"gil": "Gilbertese",
|
152 |
-
"gl": "Galician",
|
153 |
-
"gmh": "Middle High German (ca. 1050-1500)",
|
154 |
-
"gn": "Guarani",
|
155 |
-
"goh": "Old High German (ca. 750-1050)",
|
156 |
-
"gon": "Gondi",
|
157 |
-
"gor": "Gorontalo",
|
158 |
-
"got": "Gothic",
|
159 |
-
"grb": "Grebo",
|
160 |
-
"grc": "Ancient Greek (to 1453)",
|
161 |
-
"gu": "Gujarati",
|
162 |
-
"gv": "Manx",
|
163 |
-
"gwi": "Gwich\u02bcin",
|
164 |
-
"ha": "Hausa",
|
165 |
-
"hai": "Haida",
|
166 |
-
"haw": "Hawaiian",
|
167 |
-
"he": "Hebrew",
|
168 |
-
"hi": "Hindi",
|
169 |
-
"hil": "Hiligaynon",
|
170 |
-
"him": "Himachali languages, Western Pahari languages",
|
171 |
-
"hit": "Hittite",
|
172 |
-
"hmn": "Hmong, Mong",
|
173 |
-
"ho": "Hiri Motu",
|
174 |
-
"hr": "Croatian",
|
175 |
-
"hsb": "Upper Sorbian",
|
176 |
-
"ht": "Haitian, Haitian Creole",
|
177 |
-
"hu": "Hungarian",
|
178 |
-
"hup": "Hupa",
|
179 |
-
"hy": "Armenian",
|
180 |
-
"hz": "Herero",
|
181 |
-
"ia": "Interlingua (International Auxiliary Language Association)",
|
182 |
-
"iba": "Iban",
|
183 |
-
"id": "Indonesian",
|
184 |
-
"ie": "Interlingue, Occidental",
|
185 |
-
"ig": "Igbo",
|
186 |
-
"ii": "Sichuan Yi, Nuosu",
|
187 |
-
"ijo": "Ijo languages",
|
188 |
-
"ik": "Inupiaq",
|
189 |
-
"ilo": "Iloko",
|
190 |
-
"inc": "Indic languages",
|
191 |
-
"ine": "Indo-European languages",
|
192 |
-
"inh": "Ingush",
|
193 |
-
"io": "Ido",
|
194 |
-
"ira": "Iranian languages",
|
195 |
-
"iro": "Iroquoian languages",
|
196 |
-
"is": "Icelandic",
|
197 |
-
"it": "Italian",
|
198 |
-
"iu": "Inuktitut",
|
199 |
-
"ja": "Japanese",
|
200 |
-
"jbo": "Lojban",
|
201 |
-
"jpr": "Judeo-Persian",
|
202 |
-
"jrb": "Judeo-Arabic",
|
203 |
-
"jv": "Javanese",
|
204 |
-
"ka": "Georgian",
|
205 |
-
"kaa": "Kara-Kalpak, Karakalpak",
|
206 |
-
"kab": "Kabyle",
|
207 |
-
"kac": "Kachin, Jingpho",
|
208 |
-
"kam": "Kamba (Kenya)",
|
209 |
-
"kar": "Karen languages",
|
210 |
-
"kaw": "Kawi",
|
211 |
-
"kbd": "Kabardian",
|
212 |
-
"kg": "Kongo",
|
213 |
-
"kha": "Khasi",
|
214 |
-
"khi": "Khoisan languages",
|
215 |
-
"kho": "Khotanese, Sakan",
|
216 |
-
"ki": "Kikuyu, Gikuyu",
|
217 |
-
"kj": "Kuanyama, Kwanyama",
|
218 |
-
"kk": "Kazakh",
|
219 |
-
"kl": "Kalaallisut, Greenlandic",
|
220 |
-
"km": "Khmer, Central Khmer",
|
221 |
-
"kmb": "Kimbundu",
|
222 |
-
"kn": "Kannada",
|
223 |
-
"ko": "Korean",
|
224 |
-
"kok": "Konkani (macrolanguage)",
|
225 |
-
"kos": "Kosraean",
|
226 |
-
"kpe": "Kpelle",
|
227 |
-
"kr": "Kanuri",
|
228 |
-
"krc": "Karachay-Balkar",
|
229 |
-
"kro": "Kru languages",
|
230 |
-
"kru": "Kurukh",
|
231 |
-
"ks": "Kashmiri",
|
232 |
-
"ku": "Kurdish",
|
233 |
-
"kum": "Kumyk",
|
234 |
-
"kut": "Kutenai",
|
235 |
-
"kv": "Komi",
|
236 |
-
"kw": "Cornish",
|
237 |
-
"ky": "Kirghiz, Kyrgyz",
|
238 |
-
"la": "Latin",
|
239 |
-
"lad": "Ladino",
|
240 |
-
"lah": "Lahnda",
|
241 |
-
"lam": "Lamba",
|
242 |
-
"lb": "Luxembourgish, Letzeburgesch",
|
243 |
-
"lez": "Lezghian",
|
244 |
-
"lg": "Ganda, Luganda",
|
245 |
-
"li": "Limburgan, Limburger, Limburgish",
|
246 |
-
"ln": "Lingala",
|
247 |
-
"lo": "Lao",
|
248 |
-
"lol": "Mongo",
|
249 |
-
"loz": "Lozi",
|
250 |
-
"lt": "Lithuanian",
|
251 |
-
"lu": "Luba-Katanga",
|
252 |
-
"lua": "Luba-Lulua",
|
253 |
-
"lui": "Luiseno",
|
254 |
-
"lun": "Lunda",
|
255 |
-
"luo": "Luo (Kenya and Tanzania), Dholuo",
|
256 |
-
"lus": "Lushai",
|
257 |
-
"lv": "Latvian",
|
258 |
-
"mad": "Madurese",
|
259 |
-
"mag": "Magahi",
|
260 |
-
"mai": "Maithili",
|
261 |
-
"mak": "Makasar",
|
262 |
-
"man": "Mandingo, Manding",
|
263 |
-
"map": "Austronesian languages",
|
264 |
-
"mas": "Masai",
|
265 |
-
"mdf": "Moksha",
|
266 |
-
"mdr": "Mandar",
|
267 |
-
"men": "Mende (Sierra Leone)",
|
268 |
-
"mg": "Malagasy",
|
269 |
-
"mga": "Middle Irish (900-1200)",
|
270 |
-
"mh": "Marshallese",
|
271 |
-
"mi": "Maori",
|
272 |
-
"mic": "Mi'kmaq, Micmac",
|
273 |
-
"min": "Minangkabau",
|
274 |
-
"mis": "Uncoded languages",
|
275 |
-
"mk": "Macedonian",
|
276 |
-
"mkh": "Mon-Khmer languages",
|
277 |
-
"ml": "Malayalam",
|
278 |
-
"mn": "Mongolian",
|
279 |
-
"mnc": "Manchu",
|
280 |
-
"mni": "Manipuri",
|
281 |
-
"mno": "Manobo languages",
|
282 |
-
"moh": "Mohawk",
|
283 |
-
"mos": "Mossi",
|
284 |
-
"mr": "Marathi",
|
285 |
-
"ms": "Malay (macrolanguage)",
|
286 |
-
"mt": "Maltese",
|
287 |
-
"mul": "Multiple languages",
|
288 |
-
"mun": "Munda languages",
|
289 |
-
"mus": "Creek",
|
290 |
-
"mwl": "Mirandese",
|
291 |
-
"mwr": "Marwari",
|
292 |
-
"my": "Burmese",
|
293 |
-
"myn": "Mayan languages",
|
294 |
-
"myv": "Erzya",
|
295 |
-
"na": "Nauru",
|
296 |
-
"nah": "Nahuatl languages",
|
297 |
-
"nai": "North American Indian languages",
|
298 |
-
"nap": "Neapolitan",
|
299 |
-
"nb": "Norwegian Bokm\u00e5l",
|
300 |
-
"nd": "North Ndebele",
|
301 |
-
"nds": "Low German, Low Saxon",
|
302 |
-
"ne": "Nepali (macrolanguage)",
|
303 |
-
"new": "Newari, Nepal Bhasa",
|
304 |
-
"ng": "Ndonga",
|
305 |
-
"nia": "Nias",
|
306 |
-
"nic": "Niger-Kordofanian languages",
|
307 |
-
"niu": "Niuean",
|
308 |
-
"nl": "Dutch, Flemish",
|
309 |
-
"nn": "Norwegian Nynorsk",
|
310 |
-
"no": "Norwegian",
|
311 |
-
"nog": "Nogai",
|
312 |
-
"non": "Old Norse",
|
313 |
-
"nr": "South Ndebele",
|
314 |
-
"nso": "Pedi, Northern Sotho, Sepedi",
|
315 |
-
"nub": "Nubian languages",
|
316 |
-
"nv": "Navajo, Navaho",
|
317 |
-
"nwc": "Classical Newari, Classical Nepal Bhasa, Old Newari",
|
318 |
-
"ny": "Nyanja, Chewa, Chichewa",
|
319 |
-
"nym": "Nyamwezi",
|
320 |
-
"nyn": "Nyankole",
|
321 |
-
"nyo": "Nyoro",
|
322 |
-
"nzi": "Nzima",
|
323 |
-
"oc": "Occitan (post 1500)",
|
324 |
-
"oj": "Ojibwa",
|
325 |
-
"om": "Oromo",
|
326 |
-
"or": "Oriya (macrolanguage), Odia (macrolanguage)",
|
327 |
-
"os": "Ossetian, Ossetic",
|
328 |
-
"osa": "Osage",
|
329 |
-
"ota": "Ottoman Turkish (1500-1928)",
|
330 |
-
"oto": "Otomian languages",
|
331 |
-
"pa": "Panjabi, Punjabi",
|
332 |
-
"paa": "Papuan languages",
|
333 |
-
"pag": "Pangasinan",
|
334 |
-
"pal": "Pahlavi",
|
335 |
-
"pam": "Pampanga, Kapampangan",
|
336 |
-
"pap": "Papiamento",
|
337 |
-
"pau": "Palauan",
|
338 |
-
"peo": "Old Persian (ca. 600-400 B.C.)",
|
339 |
-
"phi": "Philippine languages",
|
340 |
-
"phn": "Phoenician",
|
341 |
-
"pi": "Pali",
|
342 |
-
"pl": "Polish",
|
343 |
-
"pon": "Pohnpeian",
|
344 |
-
"pra": "Prakrit languages",
|
345 |
-
"pro": "Old Proven\u00e7al (to 1500), Old Occitan (to 1500)",
|
346 |
-
"ps": "Pushto, Pashto",
|
347 |
-
"pt": "Portuguese",
|
348 |
-
"qu": "Quechua",
|
349 |
-
"raj": "Rajasthani",
|
350 |
-
"rap": "Rapanui",
|
351 |
-
"rar": "Rarotongan, Cook Islands Maori",
|
352 |
-
"rm": "Romansh",
|
353 |
-
"rn": "Rundi",
|
354 |
-
"ro": "Romanian, Moldavian, Moldovan",
|
355 |
-
"roa": "Romance languages",
|
356 |
-
"rom": "Romany",
|
357 |
-
"ru": "Russian",
|
358 |
-
"rup": "Macedo-Romanian, Aromanian, Arumanian",
|
359 |
-
"rw": "Kinyarwanda",
|
360 |
-
"sa": "Sanskrit",
|
361 |
-
"sad": "Sandawe",
|
362 |
-
"sah": "Yakut",
|
363 |
-
"sai": "South American Indian languages",
|
364 |
-
"sal": "Salishan languages",
|
365 |
-
"sam": "Samaritan Aramaic",
|
366 |
-
"sas": "Sasak",
|
367 |
-
"sat": "Santali",
|
368 |
-
"sc": "Sardinian",
|
369 |
-
"scn": "Sicilian",
|
370 |
-
"sco": "Scots",
|
371 |
-
"sd": "Sindhi",
|
372 |
-
"se": "Northern Sami",
|
373 |
-
"sel": "Selkup",
|
374 |
-
"sem": "Semitic languages",
|
375 |
-
"sg": "Sango",
|
376 |
-
"sga": "Old Irish (to 900)",
|
377 |
-
"sgn": "Sign languages",
|
378 |
-
"sh": "Serbo-Croatian",
|
379 |
-
"shn": "Shan",
|
380 |
-
"si": "Sinhala, Sinhalese",
|
381 |
-
"sid": "Sidamo",
|
382 |
-
"sio": "Siouan languages",
|
383 |
-
"sit": "Sino-Tibetan languages",
|
384 |
-
"sk": "Slovak",
|
385 |
-
"sl": "Slovenian",
|
386 |
-
"sla": "Slavic languages",
|
387 |
-
"sm": "Samoan",
|
388 |
-
"sma": "Southern Sami",
|
389 |
-
"smi": "Sami languages",
|
390 |
-
"smj": "Lule Sami",
|
391 |
-
"smn": "Inari Sami",
|
392 |
-
"sms": "Skolt Sami",
|
393 |
-
"sn": "Shona",
|
394 |
-
"snk": "Soninke",
|
395 |
-
"so": "Somali",
|
396 |
-
"sog": "Sogdian",
|
397 |
-
"son": "Songhai languages",
|
398 |
-
"sq": "Albanian",
|
399 |
-
"sr": "Serbian",
|
400 |
-
"srn": "Sranan Tongo",
|
401 |
-
"srr": "Serer",
|
402 |
-
"ss": "Swati",
|
403 |
-
"ssa": "Nilo-Saharan languages",
|
404 |
-
"st": "Southern Sotho",
|
405 |
-
"su": "Sundanese",
|
406 |
-
"suk": "Sukuma",
|
407 |
-
"sus": "Susu",
|
408 |
-
"sux": "Sumerian",
|
409 |
-
"sv": "Swedish",
|
410 |
-
"sw": "Swahili (macrolanguage)",
|
411 |
-
"syr": "Syriac",
|
412 |
-
"ta": "Tamil",
|
413 |
-
"tai": "Tai languages",
|
414 |
-
"te": "Telugu",
|
415 |
-
"tem": "Timne",
|
416 |
-
"ter": "Tereno",
|
417 |
-
"tet": "Tetum",
|
418 |
-
"tg": "Tajik",
|
419 |
-
"th": "Thai",
|
420 |
-
"ti": "Tigrinya",
|
421 |
-
"tig": "Tigre",
|
422 |
-
"tiv": "Tiv",
|
423 |
-
"tk": "Turkmen",
|
424 |
-
"tkl": "Tokelau",
|
425 |
-
"tl": "Tagalog",
|
426 |
-
"tlh": "Klingon, tlhIngan Hol",
|
427 |
-
"tli": "Tlingit",
|
428 |
-
"tmh": "Tamashek",
|
429 |
-
"tn": "Tswana",
|
430 |
-
"to": "Tonga (Tonga Islands)",
|
431 |
-
"tog": "Tonga (Nyasa)",
|
432 |
-
"tpi": "Tok Pisin",
|
433 |
-
"tr": "Turkish",
|
434 |
-
"ts": "Tsonga",
|
435 |
-
"tsi": "Tsimshian",
|
436 |
-
"tt": "Tatar",
|
437 |
-
"tum": "Tumbuka",
|
438 |
-
"tup": "Tupi languages",
|
439 |
-
"tut": "Altaic languages",
|
440 |
-
"tvl": "Tuvalu",
|
441 |
-
"tw": "Twi",
|
442 |
-
"ty": "Tahitian",
|
443 |
-
"tyv": "Tuvinian",
|
444 |
-
"udm": "Udmurt",
|
445 |
-
"ug": "Uighur, Uyghur",
|
446 |
-
"uga": "Ugaritic",
|
447 |
-
"uk": "Ukrainian",
|
448 |
-
"umb": "Umbundu",
|
449 |
-
"und": "Undetermined",
|
450 |
-
"ur": "Urdu",
|
451 |
-
"uz": "Uzbek",
|
452 |
-
"vai": "Vai",
|
453 |
-
"ve": "Venda",
|
454 |
-
"vi": "Vietnamese",
|
455 |
-
"vo": "Volap\u00fck",
|
456 |
-
"vot": "Votic",
|
457 |
-
"wa": "Walloon",
|
458 |
-
"wak": "Wakashan languages",
|
459 |
-
"wal": "Wolaytta, Wolaitta",
|
460 |
-
"war": "Waray (Philippines)",
|
461 |
-
"was": "Washo",
|
462 |
-
"wen": "Sorbian languages",
|
463 |
-
"wo": "Wolof",
|
464 |
-
"xal": "Kalmyk, Oirat",
|
465 |
-
"xh": "Xhosa",
|
466 |
-
"yao": "Yao",
|
467 |
-
"yap": "Yapese",
|
468 |
-
"yi": "Yiddish",
|
469 |
-
"yo": "Yoruba",
|
470 |
-
"ypk": "Yupik languages",
|
471 |
-
"za": "Zhuang, Chuang",
|
472 |
-
"zap": "Zapotec",
|
473 |
-
"zen": "Zenaga",
|
474 |
-
"zh": "Chinese",
|
475 |
-
"znd": "Zande languages",
|
476 |
-
"zu": "Zulu",
|
477 |
-
"zun": "Zuni"
|
478 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tagging_app.py
CHANGED
@@ -2,9 +2,10 @@ import json
|
|
2 |
from pathlib import Path
|
3 |
from typing import Callable, Dict, List, Tuple
|
4 |
|
|
|
5 |
import streamlit as st
|
6 |
import yaml
|
7 |
-
from datasets.utils.
|
8 |
|
9 |
st.set_page_config(
|
10 |
page_title="HF Dataset Tagging App",
|
@@ -13,9 +14,20 @@ st.set_page_config(
|
|
13 |
initial_sidebar_state="auto",
|
14 |
)
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
task_set = json.load(open("task_set.json"))
|
17 |
license_set = json.load(open("license_set.json"))
|
18 |
-
language_set_restricted = json.load(open("language_set.json"))
|
19 |
|
20 |
multilinguality_set = {
|
21 |
"monolingual": "contains a single language",
|
@@ -74,30 +86,20 @@ def multiselect(
|
|
74 |
format_func: Callable = str,
|
75 |
):
|
76 |
valid_values, invalid_values = split_known(values, valid_set)
|
77 |
-
w.markdown(
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
""".format(
|
82 |
-
title=title, errors="" if len(invalid_values) == 0 else f"_Found invalid values:_ `{invalid_values}`"
|
83 |
-
)
|
84 |
-
)
|
85 |
return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func)
|
86 |
|
87 |
|
88 |
-
def validate_dict(state_dict: Dict)
|
89 |
try:
|
90 |
DatasetMetadata(**state_dict)
|
91 |
-
|
92 |
except Exception as e:
|
93 |
-
|
94 |
-
|
95 |
-
```
|
96 |
-
{e}
|
97 |
-
```
|
98 |
-
You're _very_ welcome to fix these issues and submit a new PR on [`datasets`](https://github.com/huggingface/datasets/)
|
99 |
-
"""
|
100 |
-
return valid
|
101 |
|
102 |
|
103 |
def new_state():
|
@@ -131,15 +133,6 @@ st.sidebar.markdown(
|
|
131 |
|
132 |
This app aims to make it easier to add structured tags to the datasets present in the library.
|
133 |
|
134 |
-
Each configuration requires its own tasks, as these often correspond to distinct sub-tasks. However, we provide the opportunity
|
135 |
-
to pre-load the tag sets from another dataset or configuration to avoid too much redundancy.
|
136 |
-
|
137 |
-
The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
|
138 |
-
|
139 |
-
### Preloading an existing tag set
|
140 |
-
|
141 |
-
You can load an existing tag set to get started if you want.
|
142 |
-
Beware that clicking pre-load will overwrite the current state!
|
143 |
"""
|
144 |
)
|
145 |
|
@@ -163,19 +156,23 @@ if leftbtn.button("pre-load"):
|
|
163 |
initial_state = existing_tag_sets[preloaded_id]
|
164 |
state = initial_state or new_state()
|
165 |
st.experimental_set_query_params(preload_dataset=preloaded_id)
|
166 |
-
if
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
|
|
171 |
|
172 |
if preloaded_id is not None and initial_state is not None:
|
173 |
-
valid = validate_dict(initial_state)
|
174 |
st.sidebar.markdown(
|
175 |
f"""
|
176 |
---
|
177 |
The current base tagset is [`{preloaded_id}`](https://huggingface.co/datasets/{preloaded_id})
|
178 |
-
|
|
|
|
|
|
|
|
|
179 |
Here is the matching yaml block:
|
180 |
|
181 |
```yaml
|
@@ -235,15 +232,23 @@ if "other" in state["multilinguality"]:
|
|
235 |
st.write(f"Registering other-{other_multilinguality} multilinguality")
|
236 |
state["multilinguality"][state["multilinguality"].index("other")] = f"other-{other_multilinguality}"
|
237 |
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
)
|
246 |
-
|
247 |
|
248 |
leftcol.markdown("### Dataset creators")
|
249 |
state["language_creators"] = multiselect(
|
@@ -329,12 +334,16 @@ state["size_categories"] = [
|
|
329 |
## Show results
|
330 |
########################
|
331 |
|
332 |
-
valid = validate_dict(state)
|
333 |
rightcol.markdown(
|
334 |
f"""
|
335 |
### Finalized tag set
|
336 |
|
337 |
-
|
|
|
|
|
|
|
|
|
|
|
338 |
|
339 |
```yaml
|
340 |
{yaml.dump(state)}
|
@@ -349,5 +358,4 @@ This is a standalone tool, it is useful to check for errors on an existing tagse
|
|
349 |
yamlblock = rightcol.text_area("Input your yaml here")
|
350 |
if yamlblock.strip() != "":
|
351 |
inputdict = yaml.safe_load(yamlblock)
|
352 |
-
|
353 |
-
rightcol.markdown(valid)
|
|
|
2 |
from pathlib import Path
|
3 |
from typing import Callable, Dict, List, Tuple
|
4 |
|
5 |
+
import langcodes as lc
|
6 |
import streamlit as st
|
7 |
import yaml
|
8 |
+
from datasets.utils.metadata import DatasetMetadata
|
9 |
|
10 |
st.set_page_config(
|
11 |
page_title="HF Dataset Tagging App",
|
|
|
14 |
initial_sidebar_state="auto",
|
15 |
)
|
16 |
|
17 |
+
# XXX: restyling errors as streamlit does not respect whitespaces on `st.error` and doesn't scroll horizontally, which
|
18 |
+
# generally makes things easier when reading error reports
|
19 |
+
st.markdown(
|
20 |
+
"""
|
21 |
+
<style>
|
22 |
+
div[role=alert] { overflow-x: scroll}
|
23 |
+
div.stAlert p { white-space: pre }
|
24 |
+
</style>
|
25 |
+
""",
|
26 |
+
unsafe_allow_html=True,
|
27 |
+
)
|
28 |
+
|
29 |
task_set = json.load(open("task_set.json"))
|
30 |
license_set = json.load(open("license_set.json"))
|
|
|
31 |
|
32 |
multilinguality_set = {
|
33 |
"monolingual": "contains a single language",
|
|
|
86 |
format_func: Callable = str,
|
87 |
):
|
88 |
valid_values, invalid_values = split_known(values, valid_set)
|
89 |
+
w.markdown(f"#### {title}")
|
90 |
+
if len(invalid_values) > 0:
|
91 |
+
w.markdown("Found the following invalid values:")
|
92 |
+
w.error(invalid_values)
|
|
|
|
|
|
|
|
|
93 |
return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func)
|
94 |
|
95 |
|
96 |
+
def validate_dict(w: st.delta_generator.DeltaGenerator, state_dict: Dict):
|
97 |
try:
|
98 |
DatasetMetadata(**state_dict)
|
99 |
+
w.markdown("✅ This is a valid tagset! 🤗")
|
100 |
except Exception as e:
|
101 |
+
w.markdown("❌ This is an invalid tagset, here are the errors in it:")
|
102 |
+
w.error(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
|
105 |
def new_state():
|
|
|
133 |
|
134 |
This app aims to make it easier to add structured tags to the datasets present in the library.
|
135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
"""
|
137 |
)
|
138 |
|
|
|
156 |
initial_state = existing_tag_sets[preloaded_id]
|
157 |
state = initial_state or new_state()
|
158 |
st.experimental_set_query_params(preload_dataset=preloaded_id)
|
159 |
+
if sum(len(v) if v is not None else 0 for v in state.values()) > 0:
|
160 |
+
if rightbtn.button("flush state"):
|
161 |
+
state = new_state()
|
162 |
+
initial_state = None
|
163 |
+
preloaded_id = None
|
164 |
+
st.experimental_set_query_params()
|
165 |
|
166 |
if preloaded_id is not None and initial_state is not None:
|
|
|
167 |
st.sidebar.markdown(
|
168 |
f"""
|
169 |
---
|
170 |
The current base tagset is [`{preloaded_id}`](https://huggingface.co/datasets/{preloaded_id})
|
171 |
+
"""
|
172 |
+
)
|
173 |
+
validate_dict(st.sidebar, initial_state)
|
174 |
+
st.sidebar.markdown(
|
175 |
+
f"""
|
176 |
Here is the matching yaml block:
|
177 |
|
178 |
```yaml
|
|
|
232 |
st.write(f"Registering other-{other_multilinguality} multilinguality")
|
233 |
state["multilinguality"][state["multilinguality"].index("other")] = f"other-{other_multilinguality}"
|
234 |
|
235 |
+
valid_values, invalid_values = list(), list()
|
236 |
+
for langtag in state["languages"]:
|
237 |
+
try:
|
238 |
+
lc.get(langtag)
|
239 |
+
valid_values.append(langtag)
|
240 |
+
except:
|
241 |
+
invalid_values.append(langtag)
|
242 |
+
leftcol.markdown("#### Languages")
|
243 |
+
if len(invalid_values) > 0:
|
244 |
+
leftcol.markdown("Found the following invalid values:")
|
245 |
+
leftcol.error(invalid_values)
|
246 |
+
|
247 |
+
langtags = leftcol.text_area(
|
248 |
+
"What languages are represented in the dataset? expected format is BCP47 tags separated for ';' e.g. 'en-US;fr-FR'",
|
249 |
+
value=";".join(valid_values),
|
250 |
)
|
251 |
+
state["languages"] = langtags.split(";")
|
252 |
|
253 |
leftcol.markdown("### Dataset creators")
|
254 |
state["language_creators"] = multiselect(
|
|
|
334 |
## Show results
|
335 |
########################
|
336 |
|
|
|
337 |
rightcol.markdown(
|
338 |
f"""
|
339 |
### Finalized tag set
|
340 |
|
341 |
+
"""
|
342 |
+
)
|
343 |
+
validate_dict(rightcol, state)
|
344 |
+
|
345 |
+
rightcol.markdown(
|
346 |
+
f"""
|
347 |
|
348 |
```yaml
|
349 |
{yaml.dump(state)}
|
|
|
358 |
yamlblock = rightcol.text_area("Input your yaml here")
|
359 |
if yamlblock.strip() != "":
|
360 |
inputdict = yaml.safe_load(yamlblock)
|
361 |
+
validate_dict(rightcol, inputdict)
|
|