Spaces:
Running
Running
yacine
commited on
Commit
•
cca065e
1
Parent(s):
be293db
expanded tag sets
Browse files- language_set.json +478 -0
- language_set_full.json +0 -0
- license_set.json +452 -0
- tag_set.json +0 -1
- tagging_app.py +19 -144
- task_set.json +84 -0
language_set.json
ADDED
@@ -0,0 +1,478 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"aa": "Afar",
|
3 |
+
"ab": "Abkhazian",
|
4 |
+
"ace": "Achinese",
|
5 |
+
"ach": "Acoli",
|
6 |
+
"ada": "Adangme",
|
7 |
+
"ady": "Adyghe, Adygei",
|
8 |
+
"ae": "Avestan",
|
9 |
+
"af": "Afrikaans",
|
10 |
+
"afa": "Afro-Asiatic languages",
|
11 |
+
"afh": "Afrihili",
|
12 |
+
"ain": "Ainu (Japan)",
|
13 |
+
"ak": "Akan",
|
14 |
+
"akk": "Akkadian",
|
15 |
+
"ale": "Aleut",
|
16 |
+
"alg": "Algonquian languages",
|
17 |
+
"alt": "Southern Altai",
|
18 |
+
"am": "Amharic",
|
19 |
+
"an": "Aragonese",
|
20 |
+
"ang": "Old English (ca. 450-1100)",
|
21 |
+
"apa": "Apache languages",
|
22 |
+
"ar": "Arabic",
|
23 |
+
"arc": "Official Aramaic (700-300 BCE), Imperial Aramaic (700-300 BCE)",
|
24 |
+
"arn": "Mapudungun, Mapuche",
|
25 |
+
"arp": "Arapaho",
|
26 |
+
"art": "Artificial languages",
|
27 |
+
"arw": "Arawak",
|
28 |
+
"as": "Assamese",
|
29 |
+
"ast": "Asturian, Asturleonese, Bable, Leonese",
|
30 |
+
"ath": "Athapascan languages",
|
31 |
+
"aus": "Australian languages",
|
32 |
+
"av": "Avaric",
|
33 |
+
"awa": "Awadhi",
|
34 |
+
"ay": "Aymara",
|
35 |
+
"az": "Azerbaijani",
|
36 |
+
"ba": "Bashkir",
|
37 |
+
"bad": "Banda languages",
|
38 |
+
"bai": "Bamileke languages",
|
39 |
+
"bal": "Baluchi",
|
40 |
+
"ban": "Balinese",
|
41 |
+
"bas": "Basa (Cameroon)",
|
42 |
+
"bat": "Baltic languages",
|
43 |
+
"be": "Belarusian",
|
44 |
+
"bej": "Beja, Bedawiyet",
|
45 |
+
"bem": "Bemba (Zambia)",
|
46 |
+
"ber": "Berber languages",
|
47 |
+
"bg": "Bulgarian",
|
48 |
+
"bh": "Bihari languages",
|
49 |
+
"bho": "Bhojpuri",
|
50 |
+
"bi": "Bislama",
|
51 |
+
"bik": "Bikol",
|
52 |
+
"bin": "Bini, Edo",
|
53 |
+
"bla": "Siksika",
|
54 |
+
"bm": "Bambara",
|
55 |
+
"bn": "Bengali, Bangla",
|
56 |
+
"bnt": "Bantu languages",
|
57 |
+
"bo": "Tibetan",
|
58 |
+
"br": "Breton",
|
59 |
+
"bra": "Braj",
|
60 |
+
"bs": "Bosnian",
|
61 |
+
"btk": "Batak languages",
|
62 |
+
"bua": "Buriat",
|
63 |
+
"bug": "Buginese",
|
64 |
+
"byn": "Bilin, Blin",
|
65 |
+
"ca": "Catalan, Valencian",
|
66 |
+
"cad": "Caddo",
|
67 |
+
"cai": "Central American Indian languages",
|
68 |
+
"car": "Galibi Carib",
|
69 |
+
"cau": "Caucasian languages",
|
70 |
+
"ce": "Chechen",
|
71 |
+
"ceb": "Cebuano",
|
72 |
+
"cel": "Celtic languages",
|
73 |
+
"ch": "Chamorro",
|
74 |
+
"chb": "Chibcha",
|
75 |
+
"chg": "Chagatai",
|
76 |
+
"chk": "Chuukese",
|
77 |
+
"chm": "Mari (Russia)",
|
78 |
+
"chn": "Chinook jargon",
|
79 |
+
"cho": "Choctaw",
|
80 |
+
"chp": "Chipewyan, Dene Suline",
|
81 |
+
"chr": "Cherokee",
|
82 |
+
"chy": "Cheyenne",
|
83 |
+
"cmc": "Chamic languages",
|
84 |
+
"co": "Corsican",
|
85 |
+
"cop": "Coptic",
|
86 |
+
"cpe": "English-based creoles and pidgins",
|
87 |
+
"cpf": "French-based creoles and pidgins",
|
88 |
+
"cpp": "Portuguese-based creoles and pidgins",
|
89 |
+
"cr": "Cree",
|
90 |
+
"crh": "Crimean Tatar, Crimean Turkish",
|
91 |
+
"crp": "Creoles and pidgins",
|
92 |
+
"cs": "Czech",
|
93 |
+
"csb": "Kashubian",
|
94 |
+
"cu": "Church Slavic, Church Slavonic, Old Bulgarian, Old Church Slavonic, Old Slavonic",
|
95 |
+
"cus": "Cushitic languages",
|
96 |
+
"cv": "Chuvash",
|
97 |
+
"cy": "Welsh",
|
98 |
+
"da": "Danish",
|
99 |
+
"dak": "Dakota",
|
100 |
+
"dar": "Dargwa",
|
101 |
+
"day": "Land Dayak languages",
|
102 |
+
"de": "German",
|
103 |
+
"del": "Delaware",
|
104 |
+
"den": "Slave (Athapascan)",
|
105 |
+
"dgr": "Dogrib, T\u0142\u0131\u0328ch\u01eb",
|
106 |
+
"din": "Dinka",
|
107 |
+
"doi": "Dogri (macrolanguage)",
|
108 |
+
"dra": "Dravidian languages",
|
109 |
+
"dsb": "Lower Sorbian",
|
110 |
+
"dua": "Duala",
|
111 |
+
"dum": "Middle Dutch (ca. 1050-1350)",
|
112 |
+
"dv": "Dhivehi, Divehi, Maldivian",
|
113 |
+
"dyu": "Dyula",
|
114 |
+
"dz": "Dzongkha",
|
115 |
+
"ee": "Ewe",
|
116 |
+
"efi": "Efik",
|
117 |
+
"egy": "Egyptian (Ancient)",
|
118 |
+
"eka": "Ekajuk",
|
119 |
+
"el": "Modern Greek (1453-)",
|
120 |
+
"elx": "Elamite",
|
121 |
+
"en": "English",
|
122 |
+
"enm": "Middle English (1100-1500)",
|
123 |
+
"eo": "Esperanto",
|
124 |
+
"es": "Spanish, Castilian",
|
125 |
+
"et": "Estonian",
|
126 |
+
"eu": "Basque",
|
127 |
+
"ewo": "Ewondo",
|
128 |
+
"fa": "Persian",
|
129 |
+
"fan": "Fang (Equatorial Guinea)",
|
130 |
+
"fat": "Fanti",
|
131 |
+
"ff": "Fulah",
|
132 |
+
"fi": "Finnish",
|
133 |
+
"fil": "Filipino, Pilipino",
|
134 |
+
"fiu": "Finno-Ugrian languages",
|
135 |
+
"fj": "Fijian",
|
136 |
+
"fo": "Faroese",
|
137 |
+
"fon": "Fon",
|
138 |
+
"fr": "French",
|
139 |
+
"frm": "Middle French (ca. 1400-1600)",
|
140 |
+
"fro": "Old French (842-ca. 1400)",
|
141 |
+
"fur": "Friulian",
|
142 |
+
"fy": "Western Frisian",
|
143 |
+
"ga": "Irish",
|
144 |
+
"gaa": "Ga",
|
145 |
+
"gay": "Gayo",
|
146 |
+
"gba": "Gbaya (Central African Republic)",
|
147 |
+
"gd": "Scottish Gaelic, Gaelic",
|
148 |
+
"gem": "Germanic languages",
|
149 |
+
"gez": "Geez",
|
150 |
+
"gil": "Gilbertese",
|
151 |
+
"gl": "Galician",
|
152 |
+
"gmh": "Middle High German (ca. 1050-1500)",
|
153 |
+
"gn": "Guarani",
|
154 |
+
"goh": "Old High German (ca. 750-1050)",
|
155 |
+
"gon": "Gondi",
|
156 |
+
"gor": "Gorontalo",
|
157 |
+
"got": "Gothic",
|
158 |
+
"grb": "Grebo",
|
159 |
+
"grc": "Ancient Greek (to 1453)",
|
160 |
+
"gu": "Gujarati",
|
161 |
+
"gv": "Manx",
|
162 |
+
"gwi": "Gwich\u02bcin",
|
163 |
+
"ha": "Hausa",
|
164 |
+
"hai": "Haida",
|
165 |
+
"haw": "Hawaiian",
|
166 |
+
"he": "Hebrew",
|
167 |
+
"hi": "Hindi",
|
168 |
+
"hil": "Hiligaynon",
|
169 |
+
"him": "Himachali languages, Western Pahari languages",
|
170 |
+
"hit": "Hittite",
|
171 |
+
"hmn": "Hmong, Mong",
|
172 |
+
"ho": "Hiri Motu",
|
173 |
+
"hr": "Croatian",
|
174 |
+
"hsb": "Upper Sorbian",
|
175 |
+
"ht": "Haitian, Haitian Creole",
|
176 |
+
"hu": "Hungarian",
|
177 |
+
"hup": "Hupa",
|
178 |
+
"hy": "Armenian",
|
179 |
+
"hz": "Herero",
|
180 |
+
"ia": "Interlingua (International Auxiliary Language Association)",
|
181 |
+
"iba": "Iban",
|
182 |
+
"id": "Indonesian",
|
183 |
+
"ie": "Interlingue, Occidental",
|
184 |
+
"ig": "Igbo",
|
185 |
+
"ii": "Sichuan Yi, Nuosu",
|
186 |
+
"ijo": "Ijo languages",
|
187 |
+
"ik": "Inupiaq",
|
188 |
+
"ilo": "Iloko",
|
189 |
+
"inc": "Indic languages",
|
190 |
+
"ine": "Indo-European languages",
|
191 |
+
"inh": "Ingush",
|
192 |
+
"io": "Ido",
|
193 |
+
"ira": "Iranian languages",
|
194 |
+
"iro": "Iroquoian languages",
|
195 |
+
"is": "Icelandic",
|
196 |
+
"it": "Italian",
|
197 |
+
"iu": "Inuktitut",
|
198 |
+
"ja": "Japanese",
|
199 |
+
"jbo": "Lojban",
|
200 |
+
"jpr": "Judeo-Persian",
|
201 |
+
"jrb": "Judeo-Arabic",
|
202 |
+
"jv": "Javanese",
|
203 |
+
"ka": "Georgian",
|
204 |
+
"kaa": "Kara-Kalpak, Karakalpak",
|
205 |
+
"kab": "Kabyle",
|
206 |
+
"kac": "Kachin, Jingpho",
|
207 |
+
"kam": "Kamba (Kenya)",
|
208 |
+
"kar": "Karen languages",
|
209 |
+
"kaw": "Kawi",
|
210 |
+
"kbd": "Kabardian",
|
211 |
+
"kg": "Kongo",
|
212 |
+
"kha": "Khasi",
|
213 |
+
"khi": "Khoisan languages",
|
214 |
+
"kho": "Khotanese, Sakan",
|
215 |
+
"ki": "Kikuyu, Gikuyu",
|
216 |
+
"kj": "Kuanyama, Kwanyama",
|
217 |
+
"kk": "Kazakh",
|
218 |
+
"kl": "Kalaallisut, Greenlandic",
|
219 |
+
"km": "Khmer, Central Khmer",
|
220 |
+
"kmb": "Kimbundu",
|
221 |
+
"kn": "Kannada",
|
222 |
+
"ko": "Korean",
|
223 |
+
"kok": "Konkani (macrolanguage)",
|
224 |
+
"kos": "Kosraean",
|
225 |
+
"kpe": "Kpelle",
|
226 |
+
"kr": "Kanuri",
|
227 |
+
"krc": "Karachay-Balkar",
|
228 |
+
"kro": "Kru languages",
|
229 |
+
"kru": "Kurukh",
|
230 |
+
"ks": "Kashmiri",
|
231 |
+
"ku": "Kurdish",
|
232 |
+
"kum": "Kumyk",
|
233 |
+
"kut": "Kutenai",
|
234 |
+
"kv": "Komi",
|
235 |
+
"kw": "Cornish",
|
236 |
+
"ky": "Kirghiz, Kyrgyz",
|
237 |
+
"la": "Latin",
|
238 |
+
"lad": "Ladino",
|
239 |
+
"lah": "Lahnda",
|
240 |
+
"lam": "Lamba",
|
241 |
+
"lb": "Luxembourgish, Letzeburgesch",
|
242 |
+
"lez": "Lezghian",
|
243 |
+
"lg": "Ganda, Luganda",
|
244 |
+
"li": "Limburgan, Limburger, Limburgish",
|
245 |
+
"ln": "Lingala",
|
246 |
+
"lo": "Lao",
|
247 |
+
"lol": "Mongo",
|
248 |
+
"loz": "Lozi",
|
249 |
+
"lt": "Lithuanian",
|
250 |
+
"lu": "Luba-Katanga",
|
251 |
+
"lua": "Luba-Lulua",
|
252 |
+
"lui": "Luiseno",
|
253 |
+
"lun": "Lunda",
|
254 |
+
"luo": "Luo (Kenya and Tanzania), Dholuo",
|
255 |
+
"lus": "Lushai",
|
256 |
+
"lv": "Latvian",
|
257 |
+
"mad": "Madurese",
|
258 |
+
"mag": "Magahi",
|
259 |
+
"mai": "Maithili",
|
260 |
+
"mak": "Makasar",
|
261 |
+
"man": "Mandingo, Manding",
|
262 |
+
"map": "Austronesian languages",
|
263 |
+
"mas": "Masai",
|
264 |
+
"mdf": "Moksha",
|
265 |
+
"mdr": "Mandar",
|
266 |
+
"men": "Mende (Sierra Leone)",
|
267 |
+
"mg": "Malagasy",
|
268 |
+
"mga": "Middle Irish (900-1200)",
|
269 |
+
"mh": "Marshallese",
|
270 |
+
"mi": "Maori",
|
271 |
+
"mic": "Mi'kmaq, Micmac",
|
272 |
+
"min": "Minangkabau",
|
273 |
+
"mis": "Uncoded languages",
|
274 |
+
"mk": "Macedonian",
|
275 |
+
"mkh": "Mon-Khmer languages",
|
276 |
+
"ml": "Malayalam",
|
277 |
+
"mn": "Mongolian",
|
278 |
+
"mnc": "Manchu",
|
279 |
+
"mni": "Manipuri",
|
280 |
+
"mno": "Manobo languages",
|
281 |
+
"moh": "Mohawk",
|
282 |
+
"mos": "Mossi",
|
283 |
+
"mr": "Marathi",
|
284 |
+
"ms": "Malay (macrolanguage)",
|
285 |
+
"mt": "Maltese",
|
286 |
+
"mul": "Multiple languages",
|
287 |
+
"mun": "Munda languages",
|
288 |
+
"mus": "Creek",
|
289 |
+
"mwl": "Mirandese",
|
290 |
+
"mwr": "Marwari",
|
291 |
+
"my": "Burmese",
|
292 |
+
"myn": "Mayan languages",
|
293 |
+
"myv": "Erzya",
|
294 |
+
"na": "Nauru",
|
295 |
+
"nah": "Nahuatl languages",
|
296 |
+
"nai": "North American Indian languages",
|
297 |
+
"nap": "Neapolitan",
|
298 |
+
"nb": "Norwegian Bokm\u00e5l",
|
299 |
+
"nd": "North Ndebele",
|
300 |
+
"nds": "Low German, Low Saxon",
|
301 |
+
"ne": "Nepali (macrolanguage)",
|
302 |
+
"new": "Newari, Nepal Bhasa",
|
303 |
+
"ng": "Ndonga",
|
304 |
+
"nia": "Nias",
|
305 |
+
"nic": "Niger-Kordofanian languages",
|
306 |
+
"niu": "Niuean",
|
307 |
+
"nl": "Dutch, Flemish",
|
308 |
+
"nn": "Norwegian Nynorsk",
|
309 |
+
"no": "Norwegian",
|
310 |
+
"nog": "Nogai",
|
311 |
+
"non": "Old Norse",
|
312 |
+
"nr": "South Ndebele",
|
313 |
+
"nso": "Pedi, Northern Sotho, Sepedi",
|
314 |
+
"nub": "Nubian languages",
|
315 |
+
"nv": "Navajo, Navaho",
|
316 |
+
"nwc": "Classical Newari, Classical Nepal Bhasa, Old Newari",
|
317 |
+
"ny": "Nyanja, Chewa, Chichewa",
|
318 |
+
"nym": "Nyamwezi",
|
319 |
+
"nyn": "Nyankole",
|
320 |
+
"nyo": "Nyoro",
|
321 |
+
"nzi": "Nzima",
|
322 |
+
"oc": "Occitan (post 1500)",
|
323 |
+
"oj": "Ojibwa",
|
324 |
+
"om": "Oromo",
|
325 |
+
"or": "Oriya (macrolanguage), Odia (macrolanguage)",
|
326 |
+
"os": "Ossetian, Ossetic",
|
327 |
+
"osa": "Osage",
|
328 |
+
"ota": "Ottoman Turkish (1500-1928)",
|
329 |
+
"oto": "Otomian languages",
|
330 |
+
"pa": "Panjabi, Punjabi",
|
331 |
+
"paa": "Papuan languages",
|
332 |
+
"pag": "Pangasinan",
|
333 |
+
"pal": "Pahlavi",
|
334 |
+
"pam": "Pampanga, Kapampangan",
|
335 |
+
"pap": "Papiamento",
|
336 |
+
"pau": "Palauan",
|
337 |
+
"peo": "Old Persian (ca. 600-400 B.C.)",
|
338 |
+
"phi": "Philippine languages",
|
339 |
+
"phn": "Phoenician",
|
340 |
+
"pi": "Pali",
|
341 |
+
"pl": "Polish",
|
342 |
+
"pon": "Pohnpeian",
|
343 |
+
"pra": "Prakrit languages",
|
344 |
+
"pro": "Old Proven\u00e7al (to 1500), Old Occitan (to 1500)",
|
345 |
+
"ps": "Pushto, Pashto",
|
346 |
+
"pt": "Portuguese",
|
347 |
+
"qaa..qtz": "Private use",
|
348 |
+
"qu": "Quechua",
|
349 |
+
"raj": "Rajasthani",
|
350 |
+
"rap": "Rapanui",
|
351 |
+
"rar": "Rarotongan, Cook Islands Maori",
|
352 |
+
"rm": "Romansh",
|
353 |
+
"rn": "Rundi",
|
354 |
+
"ro": "Romanian, Moldavian, Moldovan",
|
355 |
+
"roa": "Romance languages",
|
356 |
+
"rom": "Romany",
|
357 |
+
"ru": "Russian",
|
358 |
+
"rup": "Macedo-Romanian, Aromanian, Arumanian",
|
359 |
+
"rw": "Kinyarwanda",
|
360 |
+
"sa": "Sanskrit",
|
361 |
+
"sad": "Sandawe",
|
362 |
+
"sah": "Yakut",
|
363 |
+
"sai": "South American Indian languages",
|
364 |
+
"sal": "Salishan languages",
|
365 |
+
"sam": "Samaritan Aramaic",
|
366 |
+
"sas": "Sasak",
|
367 |
+
"sat": "Santali",
|
368 |
+
"sc": "Sardinian",
|
369 |
+
"scn": "Sicilian",
|
370 |
+
"sco": "Scots",
|
371 |
+
"sd": "Sindhi",
|
372 |
+
"se": "Northern Sami",
|
373 |
+
"sel": "Selkup",
|
374 |
+
"sem": "Semitic languages",
|
375 |
+
"sg": "Sango",
|
376 |
+
"sga": "Old Irish (to 900)",
|
377 |
+
"sgn": "Sign languages",
|
378 |
+
"sh": "Serbo-Croatian",
|
379 |
+
"shn": "Shan",
|
380 |
+
"si": "Sinhala, Sinhalese",
|
381 |
+
"sid": "Sidamo",
|
382 |
+
"sio": "Siouan languages",
|
383 |
+
"sit": "Sino-Tibetan languages",
|
384 |
+
"sk": "Slovak",
|
385 |
+
"sl": "Slovenian",
|
386 |
+
"sla": "Slavic languages",
|
387 |
+
"sm": "Samoan",
|
388 |
+
"sma": "Southern Sami",
|
389 |
+
"smi": "Sami languages",
|
390 |
+
"smj": "Lule Sami",
|
391 |
+
"smn": "Inari Sami",
|
392 |
+
"sms": "Skolt Sami",
|
393 |
+
"sn": "Shona",
|
394 |
+
"snk": "Soninke",
|
395 |
+
"so": "Somali",
|
396 |
+
"sog": "Sogdian",
|
397 |
+
"son": "Songhai languages",
|
398 |
+
"sq": "Albanian",
|
399 |
+
"sr": "Serbian",
|
400 |
+
"srn": "Sranan Tongo",
|
401 |
+
"srr": "Serer",
|
402 |
+
"ss": "Swati",
|
403 |
+
"ssa": "Nilo-Saharan languages",
|
404 |
+
"st": "Southern Sotho",
|
405 |
+
"su": "Sundanese",
|
406 |
+
"suk": "Sukuma",
|
407 |
+
"sus": "Susu",
|
408 |
+
"sux": "Sumerian",
|
409 |
+
"sv": "Swedish",
|
410 |
+
"sw": "Swahili (macrolanguage)",
|
411 |
+
"syr": "Syriac",
|
412 |
+
"ta": "Tamil",
|
413 |
+
"tai": "Tai languages",
|
414 |
+
"te": "Telugu",
|
415 |
+
"tem": "Timne",
|
416 |
+
"ter": "Tereno",
|
417 |
+
"tet": "Tetum",
|
418 |
+
"tg": "Tajik",
|
419 |
+
"th": "Thai",
|
420 |
+
"ti": "Tigrinya",
|
421 |
+
"tig": "Tigre",
|
422 |
+
"tiv": "Tiv",
|
423 |
+
"tk": "Turkmen",
|
424 |
+
"tkl": "Tokelau",
|
425 |
+
"tl": "Tagalog",
|
426 |
+
"tlh": "Klingon, tlhIngan Hol",
|
427 |
+
"tli": "Tlingit",
|
428 |
+
"tmh": "Tamashek",
|
429 |
+
"tn": "Tswana",
|
430 |
+
"to": "Tonga (Tonga Islands)",
|
431 |
+
"tog": "Tonga (Nyasa)",
|
432 |
+
"tpi": "Tok Pisin",
|
433 |
+
"tr": "Turkish",
|
434 |
+
"ts": "Tsonga",
|
435 |
+
"tsi": "Tsimshian",
|
436 |
+
"tt": "Tatar",
|
437 |
+
"tum": "Tumbuka",
|
438 |
+
"tup": "Tupi languages",
|
439 |
+
"tut": "Altaic languages",
|
440 |
+
"tvl": "Tuvalu",
|
441 |
+
"tw": "Twi",
|
442 |
+
"ty": "Tahitian",
|
443 |
+
"tyv": "Tuvinian",
|
444 |
+
"udm": "Udmurt",
|
445 |
+
"ug": "Uighur, Uyghur",
|
446 |
+
"uga": "Ugaritic",
|
447 |
+
"uk": "Ukrainian",
|
448 |
+
"umb": "Umbundu",
|
449 |
+
"und": "Undetermined",
|
450 |
+
"ur": "Urdu",
|
451 |
+
"uz": "Uzbek",
|
452 |
+
"vai": "Vai",
|
453 |
+
"ve": "Venda",
|
454 |
+
"vi": "Vietnamese",
|
455 |
+
"vo": "Volap\u00fck",
|
456 |
+
"vot": "Votic",
|
457 |
+
"wa": "Walloon",
|
458 |
+
"wak": "Wakashan languages",
|
459 |
+
"wal": "Wolaytta, Wolaitta",
|
460 |
+
"war": "Waray (Philippines)",
|
461 |
+
"was": "Washo",
|
462 |
+
"wen": "Sorbian languages",
|
463 |
+
"wo": "Wolof",
|
464 |
+
"xal": "Kalmyk, Oirat",
|
465 |
+
"xh": "Xhosa",
|
466 |
+
"yao": "Yao",
|
467 |
+
"yap": "Yapese",
|
468 |
+
"yi": "Yiddish",
|
469 |
+
"yo": "Yoruba",
|
470 |
+
"ypk": "Yupik languages",
|
471 |
+
"za": "Zhuang, Chuang",
|
472 |
+
"zap": "Zapotec",
|
473 |
+
"zen": "Zenaga",
|
474 |
+
"zh": "Chinese",
|
475 |
+
"znd": "Zande languages",
|
476 |
+
"zu": "Zulu",
|
477 |
+
"zun": "Zuni"
|
478 |
+
}
|
language_set_full.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
license_set.json
ADDED
@@ -0,0 +1,452 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"other": "Other license",
|
3 |
+
"unknown": "License information unavailable",
|
4 |
+
"0bsd": "BSD Zero Clause License",
|
5 |
+
"aal": "Attribution Assurance License",
|
6 |
+
"abstyles": "Abstyles License",
|
7 |
+
"adobe-2006": "Adobe Systems Incorporated Source Code License Agreement",
|
8 |
+
"adobe-glyph": "Adobe Glyph List License",
|
9 |
+
"adsl": "Amazon Digital Services License",
|
10 |
+
"afl-1.1": "Academic Free License v1.1",
|
11 |
+
"afl-1.2": "Academic Free License v1.2",
|
12 |
+
"afl-2.0": "Academic Free License v2.0",
|
13 |
+
"afl-2.1": "Academic Free License v2.1",
|
14 |
+
"afl-3.0": "Academic Free License v3.0",
|
15 |
+
"afmparse": "Afmparse License",
|
16 |
+
"agpl-1.0": "Affero General Public License v1.0",
|
17 |
+
"agpl-1.0-only": "Affero General Public License v1.0 only",
|
18 |
+
"agpl-1.0-or-later": "Affero General Public License v1.0 or later",
|
19 |
+
"agpl-3.0": "GNU Affero General Public License v3.0",
|
20 |
+
"agpl-3.0-only": "GNU Affero General Public License v3.0 only",
|
21 |
+
"agpl-3.0-or-later": "GNU Affero General Public License v3.0 or later",
|
22 |
+
"aladdin": "Aladdin Free Public License",
|
23 |
+
"amdplpa": "AMD's plpa_map.c License",
|
24 |
+
"aml": "Apple MIT License",
|
25 |
+
"ampas": "Academy of Motion Picture Arts and Sciences BSD",
|
26 |
+
"antlr-pd": "ANTLR Software Rights Notice",
|
27 |
+
"antlr-pd-fallback": "ANTLR Software Rights Notice with license fallback",
|
28 |
+
"apache-1.0": "Apache License 1.0",
|
29 |
+
"apache-1.1": "Apache License 1.1",
|
30 |
+
"apache-2.0": "Apache License 2.0",
|
31 |
+
"apafml": "Adobe Postscript AFM License",
|
32 |
+
"apl-1.0": "Adaptive Public License 1.0",
|
33 |
+
"apsl-1.0": "Apple Public Source License 1.0",
|
34 |
+
"apsl-1.1": "Apple Public Source License 1.1",
|
35 |
+
"apsl-1.2": "Apple Public Source License 1.2",
|
36 |
+
"apsl-2.0": "Apple Public Source License 2.0",
|
37 |
+
"artistic-1.0": "Artistic License 1.0",
|
38 |
+
"artistic-1.0-cl8": "Artistic License 1.0 w/clause 8",
|
39 |
+
"artistic-1.0-perl": "Artistic License 1.0 (Perl)",
|
40 |
+
"artistic-2.0": "Artistic License 2.0",
|
41 |
+
"bahyph": "Bahyph License",
|
42 |
+
"barr": "Barr License",
|
43 |
+
"beerware": "Beerware License",
|
44 |
+
"bittorrent-1.0": "BitTorrent Open Source License v1.0",
|
45 |
+
"bittorrent-1.1": "BitTorrent Open Source License v1.1",
|
46 |
+
"blessing": "SQLite Blessing",
|
47 |
+
"blueoak-1.0.0": "Blue Oak Model License 1.0.0",
|
48 |
+
"borceux": "Borceux license",
|
49 |
+
"bsd-1-clause": "BSD 1-Clause License",
|
50 |
+
"bsd-2-clause": "BSD 2-Clause \"Simplified\" License",
|
51 |
+
"bsd-2-clause-freebsd": "BSD 2-Clause FreeBSD License",
|
52 |
+
"bsd-2-clause-netbsd": "BSD 2-Clause NetBSD License",
|
53 |
+
"bsd-2-clause-patent": "BSD-2-Clause Plus Patent License",
|
54 |
+
"bsd-2-clause-views": "BSD 2-Clause with views sentence",
|
55 |
+
"bsd-3-clause": "BSD 3-Clause \"New\" or \"Revised\" License",
|
56 |
+
"bsd-3-clause-attribution": "BSD with attribution",
|
57 |
+
"bsd-3-clause-clear": "BSD 3-Clause Clear License",
|
58 |
+
"bsd-3-clause-lbnl": "Lawrence Berkeley National Labs BSD variant license",
|
59 |
+
"bsd-3-clause-no-nuclear-license": "BSD 3-Clause No Nuclear License",
|
60 |
+
"bsd-3-clause-no-nuclear-license-2014": "BSD 3-Clause No Nuclear License 2014",
|
61 |
+
"bsd-3-clause-no-nuclear-warranty": "BSD 3-Clause No Nuclear Warranty",
|
62 |
+
"bsd-3-clause-open-mpi": "BSD 3-Clause Open MPI variant",
|
63 |
+
"bsd-4-clause": "BSD 4-Clause \"Original\" or \"Old\" License",
|
64 |
+
"bsd-4-clause-uc": "BSD-4-Clause (University of California-Specific)",
|
65 |
+
"bsd-protection": "BSD Protection License",
|
66 |
+
"bsd-source-code": "BSD Source Code Attribution",
|
67 |
+
"bsl-1.0": "Boost Software License 1.0",
|
68 |
+
"busl-1.1": "Business Source License 1.1",
|
69 |
+
"bzip2-1.0.5": "bzip2 and libbzip2 License v1.0.5",
|
70 |
+
"bzip2-1.0.6": "bzip2 and libbzip2 License v1.0.6",
|
71 |
+
"cal-1.0": "Cryptographic Autonomy License 1.0",
|
72 |
+
"cal-1.0-combined-work-exception": "Cryptographic Autonomy License 1.0 (Combined Work Exception)",
|
73 |
+
"caldera": "Caldera License",
|
74 |
+
"catosl-1.1": "Computer Associates Trusted Open Source License 1.1",
|
75 |
+
"cc-by-1.0": "Creative Commons Attribution 1.0 Generic",
|
76 |
+
"cc-by-2.0": "Creative Commons Attribution 2.0 Generic",
|
77 |
+
"cc-by-2.5": "Creative Commons Attribution 2.5 Generic",
|
78 |
+
"cc-by-3.0": "Creative Commons Attribution 3.0 Unported",
|
79 |
+
"cc-by-3.0-at": "Creative Commons Attribution 3.0 Austria",
|
80 |
+
"cc-by-3.0-us": "Creative Commons Attribution 3.0 United States",
|
81 |
+
"cc-by-4.0": "Creative Commons Attribution 4.0 International",
|
82 |
+
"cc-by-nc-1.0": "Creative Commons Attribution Non Commercial 1.0 Generic",
|
83 |
+
"cc-by-nc-2.0": "Creative Commons Attribution Non Commercial 2.0 Generic",
|
84 |
+
"cc-by-nc-2.5": "Creative Commons Attribution Non Commercial 2.5 Generic",
|
85 |
+
"cc-by-nc-3.0": "Creative Commons Attribution Non Commercial 3.0 Unported",
|
86 |
+
"cc-by-nc-4.0": "Creative Commons Attribution Non Commercial 4.0 International",
|
87 |
+
"cc-by-nc-nd-1.0": "Creative Commons Attribution Non Commercial No Derivatives 1.0 Generic",
|
88 |
+
"cc-by-nc-nd-2.0": "Creative Commons Attribution Non Commercial No Derivatives 2.0 Generic",
|
89 |
+
"cc-by-nc-nd-2.5": "Creative Commons Attribution Non Commercial No Derivatives 2.5 Generic",
|
90 |
+
"cc-by-nc-nd-3.0": "Creative Commons Attribution Non Commercial No Derivatives 3.0 Unported",
|
91 |
+
"cc-by-nc-nd-3.0-igo": "Creative Commons Attribution Non Commercial No Derivatives 3.0 IGO",
|
92 |
+
"cc-by-nc-nd-4.0": "Creative Commons Attribution Non Commercial No Derivatives 4.0 International",
|
93 |
+
"cc-by-nc-sa-1.0": "Creative Commons Attribution Non Commercial Share Alike 1.0 Generic",
|
94 |
+
"cc-by-nc-sa-2.0": "Creative Commons Attribution Non Commercial Share Alike 2.0 Generic",
|
95 |
+
"cc-by-nc-sa-2.5": "Creative Commons Attribution Non Commercial Share Alike 2.5 Generic",
|
96 |
+
"cc-by-nc-sa-3.0": "Creative Commons Attribution Non Commercial Share Alike 3.0 Unported",
|
97 |
+
"cc-by-nc-sa-4.0": "Creative Commons Attribution Non Commercial Share Alike 4.0 International",
|
98 |
+
"cc-by-nd-1.0": "Creative Commons Attribution No Derivatives 1.0 Generic",
|
99 |
+
"cc-by-nd-2.0": "Creative Commons Attribution No Derivatives 2.0 Generic",
|
100 |
+
"cc-by-nd-2.5": "Creative Commons Attribution No Derivatives 2.5 Generic",
|
101 |
+
"cc-by-nd-3.0": "Creative Commons Attribution No Derivatives 3.0 Unported",
|
102 |
+
"cc-by-nd-4.0": "Creative Commons Attribution No Derivatives 4.0 International",
|
103 |
+
"cc-by-sa-1.0": "Creative Commons Attribution Share Alike 1.0 Generic",
|
104 |
+
"cc-by-sa-2.0": "Creative Commons Attribution Share Alike 2.0 Generic",
|
105 |
+
"cc-by-sa-2.0-uk": "Creative Commons Attribution Share Alike 2.0 England and Wales",
|
106 |
+
"cc-by-sa-2.5": "Creative Commons Attribution Share Alike 2.5 Generic",
|
107 |
+
"cc-by-sa-3.0": "Creative Commons Attribution Share Alike 3.0 Unported",
|
108 |
+
"cc-by-sa-3.0-at": "Creative Commons Attribution-Share Alike 3.0 Austria",
|
109 |
+
"cc-by-sa-4.0": "Creative Commons Attribution Share Alike 4.0 International",
|
110 |
+
"cc-pddc": "Creative Commons Public Domain Dedication and Certification",
|
111 |
+
"cc0-1.0": "Creative Commons Zero v1.0 Universal",
|
112 |
+
"cddl-1.0": "Common Development and Distribution License 1.0",
|
113 |
+
"cddl-1.1": "Common Development and Distribution License 1.1",
|
114 |
+
"cdla-permissive-1.0": "Community Data License Agreement Permissive 1.0",
|
115 |
+
"cdla-sharing-1.0": "Community Data License Agreement Sharing 1.0",
|
116 |
+
"cecill-1.0": "CeCILL Free Software License Agreement v1.0",
|
117 |
+
"cecill-1.1": "CeCILL Free Software License Agreement v1.1",
|
118 |
+
"cecill-2.0": "CeCILL Free Software License Agreement v2.0",
|
119 |
+
"cecill-2.1": "CeCILL Free Software License Agreement v2.1",
|
120 |
+
"cecill-b": "CeCILL-B Free Software License Agreement",
|
121 |
+
"cecill-c": "CeCILL-C Free Software License Agreement",
|
122 |
+
"cern-ohl-1.1": "CERN Open Hardware Licence v1.1",
|
123 |
+
"cern-ohl-1.2": "CERN Open Hardware Licence v1.2",
|
124 |
+
"cern-ohl-p-2.0": "CERN Open Hardware Licence Version 2 - Permissive",
|
125 |
+
"cern-ohl-s-2.0": "CERN Open Hardware Licence Version 2 - Strongly Reciprocal",
|
126 |
+
"cern-ohl-w-2.0": "CERN Open Hardware Licence Version 2 - Weakly Reciprocal",
|
127 |
+
"clartistic": "Clarified Artistic License",
|
128 |
+
"cnri-jython": "CNRI Jython License",
|
129 |
+
"cnri-python": "CNRI Python License",
|
130 |
+
"cnri-python-gpl-compatible": "CNRI Python Open Source GPL Compatible License Agreement",
|
131 |
+
"condor-1.1": "Condor Public License v1.1",
|
132 |
+
"copyleft-next-0.3.0": "copyleft-next 0.3.0",
|
133 |
+
"copyleft-next-0.3.1": "copyleft-next 0.3.1",
|
134 |
+
"cpal-1.0": "Common Public Attribution License 1.0",
|
135 |
+
"cpl-1.0": "Common Public License 1.0",
|
136 |
+
"cpol-1.02": "Code Project Open License 1.02",
|
137 |
+
"crossword": "Crossword License",
|
138 |
+
"crystalstacker": "CrystalStacker License",
|
139 |
+
"cua-opl-1.0": "CUA Office Public License v1.0",
|
140 |
+
"cube": "Cube License",
|
141 |
+
"curl": "curl License",
|
142 |
+
"d-fsl-1.0": "Deutsche Freie Software Lizenz",
|
143 |
+
"diffmark": "diffmark license",
|
144 |
+
"doc": "DOC License",
|
145 |
+
"dotseqn": "Dotseqn License",
|
146 |
+
"dsdp": "DSDP License",
|
147 |
+
"dvipdfm": "dvipdfm License",
|
148 |
+
"ecl-1.0": "Educational Community License v1.0",
|
149 |
+
"ecl-2.0": "Educational Community License v2.0",
|
150 |
+
"ecos-2.0": "eCos license version 2.0",
|
151 |
+
"efl-1.0": "Eiffel Forum License v1.0",
|
152 |
+
"efl-2.0": "Eiffel Forum License v2.0",
|
153 |
+
"egenix": "eGenix.com Public License 1.1.0",
|
154 |
+
"entessa": "Entessa Public License v1.0",
|
155 |
+
"epics": "EPICS Open License",
|
156 |
+
"epl-1.0": "Eclipse Public License 1.0",
|
157 |
+
"epl-2.0": "Eclipse Public License 2.0",
|
158 |
+
"erlpl-1.1": "Erlang Public License v1.1",
|
159 |
+
"etalab-2.0": "Etalab Open License 2.0",
|
160 |
+
"eudatagrid": "EU DataGrid Software License",
|
161 |
+
"eupl-1.0": "European Union Public License 1.0",
|
162 |
+
"eupl-1.1": "European Union Public License 1.1",
|
163 |
+
"eupl-1.2": "European Union Public License 1.2",
|
164 |
+
"eurosym": "Eurosym License",
|
165 |
+
"fair": "Fair License",
|
166 |
+
"frameworx-1.0": "Frameworx Open License 1.0",
|
167 |
+
"freeimage": "FreeImage Public License v1.0",
|
168 |
+
"fsfap": "FSF All Permissive License",
|
169 |
+
"fsful": "FSF Unlimited License",
|
170 |
+
"fsfullr": "FSF Unlimited License (with License Retention)",
|
171 |
+
"ftl": "Freetype Project License",
|
172 |
+
"gfdl-1.1": "GNU Free Documentation License v1.1",
|
173 |
+
"gfdl-1.1-invariants-only": "GNU Free Documentation License v1.1 only - invariants",
|
174 |
+
"gfdl-1.1-invariants-or-later": "GNU Free Documentation License v1.1 or later - invariants",
|
175 |
+
"gfdl-1.1-no-invariants-only": "GNU Free Documentation License v1.1 only - no invariants",
|
176 |
+
"gfdl-1.1-no-invariants-or-later": "GNU Free Documentation License v1.1 or later - no invariants",
|
177 |
+
"gfdl-1.1-only": "GNU Free Documentation License v1.1 only",
|
178 |
+
"gfdl-1.1-or-later": "GNU Free Documentation License v1.1 or later",
|
179 |
+
"gfdl-1.2": "GNU Free Documentation License v1.2",
|
180 |
+
"gfdl-1.2-invariants-only": "GNU Free Documentation License v1.2 only - invariants",
|
181 |
+
"gfdl-1.2-invariants-or-later": "GNU Free Documentation License v1.2 or later - invariants",
|
182 |
+
"gfdl-1.2-no-invariants-only": "GNU Free Documentation License v1.2 only - no invariants",
|
183 |
+
"gfdl-1.2-no-invariants-or-later": "GNU Free Documentation License v1.2 or later - no invariants",
|
184 |
+
"gfdl-1.2-only": "GNU Free Documentation License v1.2 only",
|
185 |
+
"gfdl-1.2-or-later": "GNU Free Documentation License v1.2 or later",
|
186 |
+
"gfdl-1.3": "GNU Free Documentation License v1.3",
|
187 |
+
"gfdl-1.3-invariants-only": "GNU Free Documentation License v1.3 only - invariants",
|
188 |
+
"gfdl-1.3-invariants-or-later": "GNU Free Documentation License v1.3 or later - invariants",
|
189 |
+
"gfdl-1.3-no-invariants-only": "GNU Free Documentation License v1.3 only - no invariants",
|
190 |
+
"gfdl-1.3-no-invariants-or-later": "GNU Free Documentation License v1.3 or later - no invariants",
|
191 |
+
"gfdl-1.3-only": "GNU Free Documentation License v1.3 only",
|
192 |
+
"gfdl-1.3-or-later": "GNU Free Documentation License v1.3 or later",
|
193 |
+
"giftware": "Giftware License",
|
194 |
+
"gl2ps": "GL2PS License",
|
195 |
+
"glide": "3dfx Glide License",
|
196 |
+
"glulxe": "Glulxe License",
|
197 |
+
"glwtpl": "Good Luck With That Public License",
|
198 |
+
"gnuplot": "gnuplot License",
|
199 |
+
"gpl-1.0": "GNU General Public License v1.0 only",
|
200 |
+
"gpl-1.0+": "GNU General Public License v1.0 or later",
|
201 |
+
"gpl-1.0-only": "GNU General Public License v1.0 only",
|
202 |
+
"gpl-1.0-or-later": "GNU General Public License v1.0 or later",
|
203 |
+
"gpl-2.0": "GNU General Public License v2.0 only",
|
204 |
+
"gpl-2.0+": "GNU General Public License v2.0 or later",
|
205 |
+
"gpl-2.0-only": "GNU General Public License v2.0 only",
|
206 |
+
"gpl-2.0-or-later": "GNU General Public License v2.0 or later",
|
207 |
+
"gpl-2.0-with-autoconf-exception": "GNU General Public License v2.0 w/Autoconf exception",
|
208 |
+
"gpl-2.0-with-bison-exception": "GNU General Public License v2.0 w/Bison exception",
|
209 |
+
"gpl-2.0-with-classpath-exception": "GNU General Public License v2.0 w/Classpath exception",
|
210 |
+
"gpl-2.0-with-font-exception": "GNU General Public License v2.0 w/Font exception",
|
211 |
+
"gpl-2.0-with-gcc-exception": "GNU General Public License v2.0 w/GCC Runtime Library exception",
|
212 |
+
"gpl-3.0": "GNU General Public License v3.0 only",
|
213 |
+
"gpl-3.0+": "GNU General Public License v3.0 or later",
|
214 |
+
"gpl-3.0-only": "GNU General Public License v3.0 only",
|
215 |
+
"gpl-3.0-or-later": "GNU General Public License v3.0 or later",
|
216 |
+
"gpl-3.0-with-autoconf-exception": "GNU General Public License v3.0 w/Autoconf exception",
|
217 |
+
"gpl-3.0-with-gcc-exception": "GNU General Public License v3.0 w/GCC Runtime Library exception",
|
218 |
+
"gsoap-1.3b": "gSOAP Public License v1.3b",
|
219 |
+
"haskellreport": "Haskell Language Report License",
|
220 |
+
"hippocratic-2.1": "Hippocratic License 2.1",
|
221 |
+
"hpnd": "Historical Permission Notice and Disclaimer",
|
222 |
+
"hpnd-sell-variant": "Historical Permission Notice and Disclaimer - sell variant",
|
223 |
+
"htmltidy": "HTML Tidy License",
|
224 |
+
"ibm-pibs": "IBM PowerPC Initialization and Boot Software",
|
225 |
+
"icu": "ICU License",
|
226 |
+
"ijg": "Independent JPEG Group License",
|
227 |
+
"imagemagick": "ImageMagick License",
|
228 |
+
"imatix": "iMatix Standard Function Library Agreement",
|
229 |
+
"imlib2": "Imlib2 License",
|
230 |
+
"info-zip": "Info-ZIP License",
|
231 |
+
"intel": "Intel Open Source License",
|
232 |
+
"intel-acpi": "Intel ACPI Software License Agreement",
|
233 |
+
"interbase-1.0": "Interbase Public License v1.0",
|
234 |
+
"ipa": "IPA Font License",
|
235 |
+
"ipl-1.0": "IBM Public License v1.0",
|
236 |
+
"isc": "ISC License",
|
237 |
+
"jasper-2.0": "JasPer License",
|
238 |
+
"jpnic": "Japan Network Information Center License",
|
239 |
+
"json": "JSON License",
|
240 |
+
"lal-1.2": "Licence Art Libre 1.2",
|
241 |
+
"lal-1.3": "Licence Art Libre 1.3",
|
242 |
+
"latex2e": "Latex2e License",
|
243 |
+
"leptonica": "Leptonica License",
|
244 |
+
"lgpl-2.0": "GNU Library General Public License v2 only",
|
245 |
+
"lgpl-2.0+": "GNU Library General Public License v2 or later",
|
246 |
+
"lgpl-2.0-only": "GNU Library General Public License v2 only",
|
247 |
+
"lgpl-2.0-or-later": "GNU Library General Public License v2 or later",
|
248 |
+
"lgpl-2.1": "GNU Lesser General Public License v2.1 only",
|
249 |
+
"lgpl-2.1+": "GNU Lesser General Public License v2.1 or later",
|
250 |
+
"lgpl-2.1-only": "GNU Lesser General Public License v2.1 only",
|
251 |
+
"lgpl-2.1-or-later": "GNU Lesser General Public License v2.1 or later",
|
252 |
+
"lgpl-3.0": "GNU Lesser General Public License v3.0 only",
|
253 |
+
"lgpl-3.0+": "GNU Lesser General Public License v3.0 or later",
|
254 |
+
"lgpl-3.0-only": "GNU Lesser General Public License v3.0 only",
|
255 |
+
"lgpl-3.0-or-later": "GNU Lesser General Public License v3.0 or later",
|
256 |
+
"lgpllr": "Lesser General Public License For Linguistic Resources",
|
257 |
+
"libpng": "libpng License",
|
258 |
+
"libpng-2.0": "PNG Reference Library version 2",
|
259 |
+
"libselinux-1.0": "libselinux public domain notice",
|
260 |
+
"libtiff": "libtiff License",
|
261 |
+
"liliq-p-1.1": "Licence Libre du Qu\u00e9bec \u2013 Permissive version 1.1",
|
262 |
+
"liliq-r-1.1": "Licence Libre du Qu\u00e9bec \u2013 R\u00e9ciprocit\u00e9 version 1.1",
|
263 |
+
"liliq-rplus-1.1": "Licence Libre du Qu\u00e9bec \u2013 R\u00e9ciprocit\u00e9 forte version 1.1",
|
264 |
+
"linux-openib": "Linux Kernel Variant of OpenIB.org license",
|
265 |
+
"lpl-1.0": "Lucent Public License Version 1.0",
|
266 |
+
"lpl-1.02": "Lucent Public License v1.02",
|
267 |
+
"lppl-1.0": "LaTeX Project Public License v1.0",
|
268 |
+
"lppl-1.1": "LaTeX Project Public License v1.1",
|
269 |
+
"lppl-1.2": "LaTeX Project Public License v1.2",
|
270 |
+
"lppl-1.3a": "LaTeX Project Public License v1.3a",
|
271 |
+
"lppl-1.3c": "LaTeX Project Public License v1.3c",
|
272 |
+
"makeindex": "MakeIndex License",
|
273 |
+
"miros": "The MirOS Licence",
|
274 |
+
"mit": "MIT License",
|
275 |
+
"mit-0": "MIT No Attribution",
|
276 |
+
"mit-advertising": "Enlightenment License (e16)",
|
277 |
+
"mit-cmu": "CMU License",
|
278 |
+
"mit-enna": "enna License",
|
279 |
+
"mit-feh": "feh License",
|
280 |
+
"mit-open-group": "MIT Open Group variant",
|
281 |
+
"mitnfa": "MIT +no-false-attribs license",
|
282 |
+
"motosoto": "Motosoto License",
|
283 |
+
"mpich2": "mpich2 License",
|
284 |
+
"mpl-1.0": "Mozilla Public License 1.0",
|
285 |
+
"mpl-1.1": "Mozilla Public License 1.1",
|
286 |
+
"mpl-2.0": "Mozilla Public License 2.0",
|
287 |
+
"mpl-2.0-no-copyleft-exception": "Mozilla Public License 2.0 (no copyleft exception)",
|
288 |
+
"ms-pl": "Microsoft Public License",
|
289 |
+
"ms-rl": "Microsoft Reciprocal License",
|
290 |
+
"mtll": "Matrix Template Library License",
|
291 |
+
"mulanpsl-1.0": "Mulan Permissive Software License, Version 1",
|
292 |
+
"mulanpsl-2.0": "Mulan Permissive Software License, Version 2",
|
293 |
+
"multics": "Multics License",
|
294 |
+
"mup": "Mup License",
|
295 |
+
"nasa-1.3": "NASA Open Source Agreement 1.3",
|
296 |
+
"naumen": "Naumen Public License",
|
297 |
+
"nbpl-1.0": "Net Boolean Public License v1",
|
298 |
+
"ncgl-uk-2.0": "Non-Commercial Government Licence",
|
299 |
+
"ncsa": "University of Illinois/NCSA Open Source License",
|
300 |
+
"net-snmp": "Net-SNMP License",
|
301 |
+
"netcdf": "NetCDF license",
|
302 |
+
"newsletr": "Newsletr License",
|
303 |
+
"ngpl": "Nethack General Public License",
|
304 |
+
"nist-pd": "NIST Public Domain Notice",
|
305 |
+
"nist-pd-fallback": "NIST Public Domain Notice with license fallback",
|
306 |
+
"nlod-1.0": "Norwegian Licence for Open Government Data",
|
307 |
+
"nlpl": "No Limit Public License",
|
308 |
+
"nokia": "Nokia Open Source License",
|
309 |
+
"nosl": "Netizen Open Source License",
|
310 |
+
"noweb": "Noweb License",
|
311 |
+
"npl-1.0": "Netscape Public License v1.0",
|
312 |
+
"npl-1.1": "Netscape Public License v1.1",
|
313 |
+
"nposl-3.0": "Non-Profit Open Software License 3.0",
|
314 |
+
"nrl": "NRL License",
|
315 |
+
"ntp": "NTP License",
|
316 |
+
"ntp-0": "NTP No Attribution",
|
317 |
+
"nunit": "Nunit License",
|
318 |
+
"o-uda-1.0": "Open Use of Data Agreement v1.0",
|
319 |
+
"occt-pl": "Open CASCADE Technology Public License",
|
320 |
+
"oclc-2.0": "OCLC Research Public License 2.0",
|
321 |
+
"odbl-1.0": "ODC Open Database License v1.0",
|
322 |
+
"odc-by-1.0": "Open Data Commons Attribution License v1.0",
|
323 |
+
"ofl-1.0": "SIL Open Font License 1.0",
|
324 |
+
"ofl-1.0-no-rfn": "SIL Open Font License 1.0 with no Reserved Font Name",
|
325 |
+
"ofl-1.0-rfn": "SIL Open Font License 1.0 with Reserved Font Name",
|
326 |
+
"ofl-1.1": "SIL Open Font License 1.1",
|
327 |
+
"ofl-1.1-no-rfn": "SIL Open Font License 1.1 with no Reserved Font Name",
|
328 |
+
"ofl-1.1-rfn": "SIL Open Font License 1.1 with Reserved Font Name",
|
329 |
+
"ogc-1.0": "OGC Software License, Version 1.0",
|
330 |
+
"ogl-canada-2.0": "Open Government Licence - Canada",
|
331 |
+
"ogl-uk-1.0": "Open Government Licence v1.0",
|
332 |
+
"ogl-uk-2.0": "Open Government Licence v2.0",
|
333 |
+
"ogl-uk-3.0": "Open Government Licence v3.0",
|
334 |
+
"ogtsl": "Open Group Test Suite License",
|
335 |
+
"oldap-1.1": "Open LDAP Public License v1.1",
|
336 |
+
"oldap-1.2": "Open LDAP Public License v1.2",
|
337 |
+
"oldap-1.3": "Open LDAP Public License v1.3",
|
338 |
+
"oldap-1.4": "Open LDAP Public License v1.4",
|
339 |
+
"oldap-2.0": "Open LDAP Public License v2.0 (or possibly 2.0A and 2.0B)",
|
340 |
+
"oldap-2.0.1": "Open LDAP Public License v2.0.1",
|
341 |
+
"oldap-2.1": "Open LDAP Public License v2.1",
|
342 |
+
"oldap-2.2": "Open LDAP Public License v2.2",
|
343 |
+
"oldap-2.2.1": "Open LDAP Public License v2.2.1",
|
344 |
+
"oldap-2.2.2": "Open LDAP Public License 2.2.2",
|
345 |
+
"oldap-2.3": "Open LDAP Public License v2.3",
|
346 |
+
"oldap-2.4": "Open LDAP Public License v2.4",
|
347 |
+
"oldap-2.5": "Open LDAP Public License v2.5",
|
348 |
+
"oldap-2.6": "Open LDAP Public License v2.6",
|
349 |
+
"oldap-2.7": "Open LDAP Public License v2.7",
|
350 |
+
"oldap-2.8": "Open LDAP Public License v2.8",
|
351 |
+
"oml": "Open Market License",
|
352 |
+
"openssl": "OpenSSL License",
|
353 |
+
"opl-1.0": "Open Public License v1.0",
|
354 |
+
"oset-pl-2.1": "OSET Public License version 2.1",
|
355 |
+
"osl-1.0": "Open Software License 1.0",
|
356 |
+
"osl-1.1": "Open Software License 1.1",
|
357 |
+
"osl-2.0": "Open Software License 2.0",
|
358 |
+
"osl-2.1": "Open Software License 2.1",
|
359 |
+
"osl-3.0": "Open Software License 3.0",
|
360 |
+
"parity-6.0.0": "The Parity Public License 6.0.0",
|
361 |
+
"parity-7.0.0": "The Parity Public License 7.0.0",
|
362 |
+
"pddl-1.0": "ODC Public Domain Dedication & License 1.0",
|
363 |
+
"php-3.0": "PHP License v3.0",
|
364 |
+
"php-3.01": "PHP License v3.01",
|
365 |
+
"plexus": "Plexus Classworlds License",
|
366 |
+
"polyform-noncommercial-1.0.0": "PolyForm Noncommercial License 1.0.0",
|
367 |
+
"polyform-small-business-1.0.0": "PolyForm Small Business License 1.0.0",
|
368 |
+
"postgresql": "PostgreSQL License",
|
369 |
+
"psf-2.0": "Python Software Foundation License 2.0",
|
370 |
+
"psfrag": "psfrag License",
|
371 |
+
"psutils": "psutils License",
|
372 |
+
"python-2.0": "Python License 2.0",
|
373 |
+
"qhull": "Qhull License",
|
374 |
+
"qpl-1.0": "Q Public License 1.0",
|
375 |
+
"rdisc": "Rdisc License",
|
376 |
+
"rhecos-1.1": "Red Hat eCos Public License v1.1",
|
377 |
+
"rpl-1.1": "Reciprocal Public License 1.1",
|
378 |
+
"rpl-1.5": "Reciprocal Public License 1.5",
|
379 |
+
"rpsl-1.0": "RealNetworks Public Source License v1.0",
|
380 |
+
"rsa-md": "RSA Message-Digest License",
|
381 |
+
"rscpl": "Ricoh Source Code Public License",
|
382 |
+
"ruby": "Ruby License",
|
383 |
+
"sax-pd": "Sax Public Domain Notice",
|
384 |
+
"saxpath": "Saxpath License",
|
385 |
+
"scea": "SCEA Shared Source License",
|
386 |
+
"sendmail": "Sendmail License",
|
387 |
+
"sendmail-8.23": "Sendmail License 8.23",
|
388 |
+
"sgi-b-1.0": "SGI Free Software License B v1.0",
|
389 |
+
"sgi-b-1.1": "SGI Free Software License B v1.1",
|
390 |
+
"sgi-b-2.0": "SGI Free Software License B v2.0",
|
391 |
+
"shl-0.5": "Solderpad Hardware License v0.5",
|
392 |
+
"shl-0.51": "Solderpad Hardware License, Version 0.51",
|
393 |
+
"simpl-2.0": "Simple Public License 2.0",
|
394 |
+
"sissl": "Sun Industry Standards Source License v1.1",
|
395 |
+
"sissl-1.2": "Sun Industry Standards Source License v1.2",
|
396 |
+
"sleepycat": "Sleepycat License",
|
397 |
+
"smlnj": "Standard ML of New Jersey License",
|
398 |
+
"smppl": "Secure Messaging Protocol Public License",
|
399 |
+
"snia": "SNIA Public License 1.1",
|
400 |
+
"spencer-86": "Spencer License 86",
|
401 |
+
"spencer-94": "Spencer License 94",
|
402 |
+
"spencer-99": "Spencer License 99",
|
403 |
+
"spl-1.0": "Sun Public License v1.0",
|
404 |
+
"ssh-openssh": "SSH OpenSSH license",
|
405 |
+
"ssh-short": "SSH short notice",
|
406 |
+
"sspl-1.0": "Server Side Public License, v 1",
|
407 |
+
"standardml-nj": "Standard ML of New Jersey License",
|
408 |
+
"sugarcrm-1.1.3": "SugarCRM Public License v1.1.3",
|
409 |
+
"swl": "Scheme Widget Library (SWL) Software License Agreement",
|
410 |
+
"tapr-ohl-1.0": "TAPR Open Hardware License v1.0",
|
411 |
+
"tcl": "TCL/TK License",
|
412 |
+
"tcp-wrappers": "TCP Wrappers License",
|
413 |
+
"tmate": "TMate Open Source License",
|
414 |
+
"torque-1.1": "TORQUE v2.5+ Software License v1.1",
|
415 |
+
"tosl": "Trusster Open Source License",
|
416 |
+
"tu-berlin-1.0": "Technische Universitaet Berlin License 1.0",
|
417 |
+
"tu-berlin-2.0": "Technische Universitaet Berlin License 2.0",
|
418 |
+
"ucl-1.0": "Upstream Compatibility License v1.0",
|
419 |
+
"unicode-dfs-2015": "Unicode License Agreement - Data Files and Software (2015)",
|
420 |
+
"unicode-dfs-2016": "Unicode License Agreement - Data Files and Software (2016)",
|
421 |
+
"unicode-tou": "Unicode Terms of Use",
|
422 |
+
"unlicense": "The Unlicense",
|
423 |
+
"upl-1.0": "Universal Permissive License v1.0",
|
424 |
+
"vim": "Vim License",
|
425 |
+
"vostrom": "VOSTROM Public License for Open Source",
|
426 |
+
"vsl-1.0": "Vovida Software License v1.0",
|
427 |
+
"w3c": "W3C Software Notice and License (2002-12-31)",
|
428 |
+
"w3c-19980720": "W3C Software Notice and License (1998-07-20)",
|
429 |
+
"w3c-20150513": "W3C Software Notice and Document License (2015-05-13)",
|
430 |
+
"watcom-1.0": "Sybase Open Watcom Public License 1.0",
|
431 |
+
"wsuipa": "Wsuipa License",
|
432 |
+
"wtfpl": "Do What The F*ck You Want To Public License",
|
433 |
+
"wxwindows": "wxWindows Library License",
|
434 |
+
"x11": "X11 License",
|
435 |
+
"xerox": "Xerox License",
|
436 |
+
"xfree86-1.1": "XFree86 License 1.1",
|
437 |
+
"xinetd": "xinetd License",
|
438 |
+
"xnet": "X.Net License",
|
439 |
+
"xpp": "XPP License",
|
440 |
+
"xskat": "XSkat License",
|
441 |
+
"ypl-1.0": "Yahoo! Public License v1.0",
|
442 |
+
"ypl-1.1": "Yahoo! Public License v1.1",
|
443 |
+
"zed": "Zed License",
|
444 |
+
"zend-2.0": "Zend License v2.0",
|
445 |
+
"zimbra-1.3": "Zimbra Public License v1.3",
|
446 |
+
"zimbra-1.4": "Zimbra Public License v1.4",
|
447 |
+
"zlib": "zlib License",
|
448 |
+
"zlib-acknowledgement": "zlib/libpng License with Acknowledgement",
|
449 |
+
"zpl-1.1": "Zope Public License 1.1",
|
450 |
+
"zpl-2.0": "Zope Public License 2.0",
|
451 |
+
"zpl-2.1": "Zope Public License 2.1"
|
452 |
+
}
|
tag_set.json
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
{"task_structure": {"Txt2Class": "text to classification task", "Txt2Class.Bi": "text to binary classification task", "Txt2Class.Multi.Sing": "text to multiple classes single label", "Txt2Class.Multi.Multi": "text to multiple classes multiple labels", "Strct2Txt": "structured information to text task", "Txt2Strct": "text to structured information task", "Txt2Txt": "text to text task", "Txt": "just text", "Oth": "other"}, "purpose": {"NLI": "natural language inference", "SentA": "sentiment analysis", "MT": "machine translation", "Summ.ext": "extractive summarization", "Summ.abs": "abstractive summarization", "QA.abs": "abstractive question answering", "QA.ext": "extractive question answering", "QA.open": "open domain question answering", "QA.closed": "closed domain question answering", "QA.open.abs": "open domain abstractive question answering", "QA.closed.abs": "closed domain abstractive question answering", "QA.open.ext": "open domain extractive question answering", "QA.closed.ext": "closed domain extractive question answering", "Dialog": "dialogue or multi-turn text", "LM": "language modeling", "NER": "named entity recognition", "Pars": "parsing", "TxtSimp": "text simplification", "Coref": "coreference resolution", "FactChk": "fact checking", "EntLink": "entity linking", "SSplitFus": "sentence splitting/fusion", "SlotFillClz": "slot filling / Cloze test", "InfoRet": "information retrieval", "IntentClass": "intent classification", "SemSim": "semantic similarity", "Oth": "other"}, "language_producers": {"crwdsrc_l": "data produced by crowdsource workers", "machgen_l": "machine-generated data", "found_l": "found data", "Oth": "other"}, "annotation": {"crwdsrc_a": "annotation produced by crowdsource workers", "machgen_a": "machine-generated annotation", "exp_a": "expert annotation", "no_a": "no annotation", "Oth": "other"}, "license": {"afl-3.0": "Academic Free License", "apache-2.0": "Apache license 2.0", "artistic-2.0": "Artistic license 2.0", "bsl-1.0": "Boost 
Software License 1.0", "bsd-2-clause": "BSD 2-clause \"Simplified\" license", "bsd-3-clause": "BSD 3-clause \"New\" or \"Revised\" license", "bsd-3-clause-clear": "BSD 3-clause Clear license", "cc": "Creative Commons license family", "cc0-1.0": "Creative Commons Zero v1.0 Universal", "cc-by-4.0": "Creative Commons Attribution 4.0", "cc-by-sa-4.0": "Creative Commons Attribution Share Alike 4.0", "wtfpl": "Do What The F*ck You Want To Public License", "ecl-2.0": "Educational Community License v2.0", "epl-1.0": "Eclipse Public License 1.0", "epl-2.0": "Eclipse Public License 2.0", "eupl-1.1": "European Union Public License 1.1", "agpl-3.0": "GNU Affero General Public License v3.0", "gpl": "GNU General Public License family", "gpl-2.0": "GNU General Public License v2.0", "gpl-3.0": "GNU General Public License v3.0", "lgpl": "GNU Lesser General Public License family", "lgpl-2.1": "GNU Lesser General Public License v2.1", "lgpl-3.0": "GNU Lesser General Public License v3.0", "isc": "ISC", "lppl-1.3c": "LaTeX Project Public License v1.3c", "ms-pl": "Microsoft Public License", "mit": "MIT", "mpl-2.0": "Mozilla Public License 2.0", "osl-3.0": "Open Software License 3.0", "postgresql": "PostgreSQL License", "ofl-1.1": "SIL Open Font License 1.1", "ncsa": "University of Illinois/NCSA Open Source License", "unlicense": "The Unlicense", "zlib": "zLib License", "Oth": "other"}, "language": {"cardinality": {"1ling": "monolingual; only one language in the dataset", "trsl": "translation; parallel language use", "multiling": "multilingual; more than one language being used within or across datasets over different content", "Oth": "other"}, "BCP-47": {"en": "English, dialect unknown", "es": "Spanish, dialect unknown", "fr": "French, dialect unknown", "sv": "Swedish, dialect unknown", "fi": "Finnish, dialect unknown", "de": "German, dialect unknown", "ru": "Russian, dialect unknown", "uk": "Ukranian, dialect unknown", "it": "Italian, dialect unknown", "eo": "Esperanto, dialect 
unknown", "ar": "Arabic, dialect unknown", "tr": "Turkish, dialect unknown", "bg": "Bulgarian, dialect unknown", "pl": "Polish, dialect unknown", "nl": "Dutch, dialect unknown", "id": "Indonesian, dialect unknown", "zh": "Chinese, dialect unknown", "af": "Afrikaans, dialect unknown", "ca": "Catalan, dialect unknown", "cs": "Czech, dialect unknown", "pt": "Portuguese, dialect unknown", "no": "Norwegian, dialect unknown", "he": "Hebrew, dialect unknown", "da": "Danish, dialect unknown", "is": "Icelandic, dialect unknown", "hu": "Hungarian, dialect unknown", "ro": "Romanian, dialect unknown", "ms": "Malay, dialect unknown", "ja": "Japanese, dialect unknown", "hi": "Hindi, dialect unknown", "sl": "Slovene, dialect unknown", "lt": "Lithuanian, dialect unknown", "ht": "Haitian, dialect unknown", "vi": "Vietnamese, dialect unknown", "et": "Estonian, dialect unknown", "el": "Greek, dialect unknown", "hr": "Croatian, dialect unknown", "mt": "Maltese, dialect unknown", "ts": "Tsonga, dialect unknown", "mk": "Macedonian, dialect unknown", "ln": "Lingala, dialect unknown", "ig": "Igbo, dialect unknown", "ee": "Ewe, dialect unknown", "xh": "Xhosa, dialect unknown", "sn": "Shona, dialect unknown", "rw": "Kinyarwanda, dialect unknown", "ny": "Chichewa, dialect unknown", "lv": "Latvian, dialect unknown", "lg": "Ganda, dialect unknown", "ko": "Korean, dialect unknown", "gl": "Galician, dialect unknown", "sg": "Sango, dialect unknown", "yo": "Yoruba, dialect unknown", "ur": "Urdu, dialect unknown", "rn": "Kirundi, dialect unknown", "mr": "Marathi, dialect unknown", "bn": "Bengali, dialect unknown", "nso": "Pedi, dialect unknown", "ty": "Tahitian, dialect unknown", "to": "Tonga, dialect unknown", "gu": "Gujarati, dialect unknown", "eu": "Basque, dialect unknown", "niu": "Niuean, dialect unknown", "guw": "Gun, dialect unknown", "gaa": "Ga, dialect unknown", "crs": "Seselwa Creole French, dialect unknown", "bcl": "Central Bikol, dialect unknown", "tn": "Tswana, dialect unknown", "sm": 
"Samoan, dialect unknown", "si": "Sinhala, dialect unknown", "nn": "Norwegian Nynorsk, dialect unknown", "nb": "Norwegian Bokm\u00e5l, dialect unknown", "fj": "Fijian, dialect unknown", "be": "Belarusian, dialect unknown", "pon": "Pohnpeian, dialect unknown", "pis": "Pijin, dialect unknown", "pap": "Papiamento, dialect unknown", "pag": "Pangasinan, dialect unknown", "lua": "Luba-Lulua, dialect unknown", "iso": "Isoko, dialect unknown", "ilo": "Iloko, dialect unknown", "gil": "Gilbertese, dialect unknown", "efi": "Efik, dialect unknown", "bzs": "Brazilian Sign Language, dialect unknown", "yi": "Yiddish, dialect unknown", "wa": "Walloon, dialect unknown", "sq": "Albanian, dialect unknown", "or": "Oriya, dialect unknown", "mh": "Marshallese, dialect unknown", "lb": "Luxembourgish, dialect unknown", "ha": "Hausa, dialect unknown", "fy": "Western Frisian, dialect unknown", "fo": "Faroese, dialect unknown", "as": "Assamese, dialect unknown", "tvl": "Tuvalua, dialect unknown", "tll": "Tetela, dialect unknown", "swc": "Congo Swahili, dialect unknown", "lus": "Lushai, dialect unknown", "loz": "Lozi, dialect unknown", "ceb": "Cebuano, dialect unknown", "ti": "Tigrinya, dialect unknown", "st": "Southern Sotho, dialect unknown", "rm": "Romansh, dialect unknown", "oc": "Occitan, dialect unknown", "kg": "Kongo, dialect unknown", "ga": "Irish, dialect unknown", "co": "Corsican, dialect unknown", "an": "Aragonese, dialect unknown", "war": "Waray, dialect unknown", "lue": "Luvale, dialect unknown", "hil": "Hiligaynon, dialect unknown", "bem": "Bemba, dialect unknown", "ase": "American Sign Language, dialect unknown", "zu": "Zulu, dialect unknown", "tw": "Twi, dialect unknown", "tl": "Tagalog, dialect unknown", "sk": "Slovak, dialect unknown", "lu": "Luba-Katanga, dialect unknown", "hy": "Armenian, dialect unknown", "gv": "Manx, dialect unknown", "cy": "Welsh, dialect unknown", "bi": "Bislama, dialect unknown", "am": "Amharic, dialect unknown", "srn": "Sranan Tongo, dialect 
unknown", "toi": "Tonga (Zambia), dialect unknown", "kqn": "Kaonde, dialect unknown", "se": "Northern Sami, dialect unknown", "ps": "Pashto, dialect unknown", "os": "Ossetian, dialect unknown", "zne": "Zande (individual language), dialect unknown", "wls": "Wallisian, dialect unknown", "tpi": "Tok Pisin, dialect unknown", "tiv": "Tiv, dialect unknown", "run": "Rundi, dialect unknown", "so": "Somali, dialect unknown", "kw": "Cornish, dialect unknown", "ho": "Hiri Motu, dialect unknown", "gd": "Scottish Gaelic, dialect unknown", "br": "Breton, dialect unknown", "tum": "Tumbuka, dialect unknown", "yap": "Yapese, dialect unknown", "rnd": "Ruund, dialect unknown", "mfe": "Morisyen, dialect unknown", "kwy": "San Salvador Kongo, dialect unknown", "chk": "Chuukese, dialect unknown", "ber": "Berber languages, dialect unknown", "wo": "Wolof, dialect unknown", "ve": "Venda, dialect unknown", "th": "Thai, dialect unknown", "sc": "Sardinian, dialect unknown", "ml": "Malayalam, dialect unknown", "mg": "Malagasy, dialect unknown", "km": "Khmer, dialect unknown", "ka": "Georgian, dialect unknown", "mos": "Mossi, dialect unknown", "ta": "Tamil, dialect unknown", "mn": "Mongolian, dialect unknown", "kn": "Kannada, dialect unknown", "az": "Azerbaijani, dialect unknown", "roa": "Romance languages, dialect unknown", "yue": "Yue Chinese, dialect unknown", "tt": "Tatar, dialect unknown", "tk": "Turkmen, dialect unknown", "te": "Telugu, dialect unknown", "na": "Nauru, dialect unknown", "mi": "M\u0101ori, dialect unknown", "cv": "Chuvash, dialect unknown", "ba": "Bashkir, dialect unknown", "cel": "Celtic languages, dialect unknown", "umb": "Umbundu, dialect unknown", "sa": "Sanskrit, dialect unknown", "my": "Burmese, dialect unknown", "lo": "Lao, dialect unknown", "kl": "Kalaallisut, dialect unknown", "io": "Ido, dialect unknown", "ce": "Chechen, dialect unknown", "ab": "Abkhaz, dialect unknown", "fse": "Finnish Sign Language, dialect unknown", "zai": "Isthmus Zapotec, dialect unknown", 
"tzo": "Tzotzil, dialect unknown", "prl": "Peruvian Sign Language, dialect unknown", "mfs": "Mexican Sign Language, dialect unknown", "nyk": "Nyaneka, dialect unknown", "luo": "Luo, dialect unknown", "lun": "Lunda, dialect unknown", "kwn": "Kwangali, dialect unknown", "csn": "Colombian Sign Language, dialect unknown", "csg": "Chilean Sign Language, dialect unknown", "aed": "Argentine Sign Language, dialect unknown", "sw": "Swahili, dialect unknown", "su": "Sundanese, dialect unknown", "ss": "Swati, dialect unknown", "om": "Oromo, dialect unknown", "nv": "Navajo, dialect unknown", "ng": "Ndonga, dialect unknown", "ne": "Nepali, dialect unknown", "kj": "Kwanyama, dialect unknown", "jv": "Javanese, dialect unknown", "gn": "Guaran\u00ed, dialect unknown", "fa": "Persian, dialect unknown", "ch": "Chamorro, dialect unknown", "bo": "Tibetan Standard, dialect unknown", "wal": "Wolaitta, dialect unknown", "vsl": "Venezuelan Sign Language, dialect unknown", "ssp": "Spanish Sign Language, dialect unknown", "kab": "Kabyle, dialect unknown", "yua": "Yucateco, dialect unknown", "tdt": "Tetun Dili, dialect unknown", "pa": "Punjabi, dialect unknown", "nr": "Southern Ndebele, dialect unknown", "kk": "Kazakh, dialect unknown", "dv": "Divehi, dialect unknown", "Oth": "other"}}}
|
|
|
|
tagging_app.py
CHANGED
@@ -16,97 +16,17 @@ st.beta_set_page_config(
|
|
16 |
initial_sidebar_state="auto",
|
17 |
)
|
18 |
|
19 |
-
task_set =
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
"machine-translation",
|
24 |
-
"sentence-splitting-fusion",
|
25 |
-
"summarization",
|
26 |
-
"table-to-text",
|
27 |
-
"text-simplification",
|
28 |
-
"explanation-generation",
|
29 |
-
"other",
|
30 |
-
],
|
31 |
-
},
|
32 |
-
"question-answering": {
|
33 |
-
"description": "question answering tasks",
|
34 |
-
"options": [
|
35 |
-
"open-domain-qa",
|
36 |
-
"closed-domain-qa",
|
37 |
-
"multiple-choice-qa",
|
38 |
-
"extractive-qa",
|
39 |
-
"abstractive-qa",
|
40 |
-
"other",
|
41 |
-
],
|
42 |
-
},
|
43 |
-
"sequence-modeling": {
|
44 |
-
"description": "such as language modeling or dialogue",
|
45 |
-
"options": [
|
46 |
-
"dialogue-modeling",
|
47 |
-
"language-modeling",
|
48 |
-
"other-multi-turn",
|
49 |
-
"slot-filling",
|
50 |
-
"other",
|
51 |
-
],
|
52 |
-
},
|
53 |
-
"structure-prediction": {
|
54 |
-
"description": "predicting structural properties of the text, such as syntax",
|
55 |
-
"options": [
|
56 |
-
"coreference-resolution",
|
57 |
-
"named-entity-recognition",
|
58 |
-
"parsing",
|
59 |
-
"other",
|
60 |
-
],
|
61 |
-
},
|
62 |
-
"text-classification": {
|
63 |
-
"description": "predicting a class index or boolean value",
|
64 |
-
"options": [
|
65 |
-
"acceptability-classification",
|
66 |
-
"entity-linking-classification",
|
67 |
-
"fact-checking",
|
68 |
-
"intent-classification",
|
69 |
-
"multi-class-classification",
|
70 |
-
"multi-label-classification",
|
71 |
-
"natural-language-inference",
|
72 |
-
"semantic-similarity-classification",
|
73 |
-
"sentiment-classification",
|
74 |
-
"topic-classification",
|
75 |
-
"other",
|
76 |
-
],
|
77 |
-
},
|
78 |
-
"text-retrieval": {
|
79 |
-
"description": "information or text retrieval tasks",
|
80 |
-
"options": [
|
81 |
-
"document-retrieval",
|
82 |
-
"utterance-retrieval",
|
83 |
-
"entity-linking-retrieval",
|
84 |
-
"fact-checking-retrieval",
|
85 |
-
"other",
|
86 |
-
],
|
87 |
-
},
|
88 |
-
"text-scoring": {
|
89 |
-
"description": "text scoring tasks, predicting a real valued score for some text",
|
90 |
-
"options": [
|
91 |
-
"semantic-similarity-scoring",
|
92 |
-
"sentiment-scoring",
|
93 |
-
"other",
|
94 |
-
],
|
95 |
-
},
|
96 |
-
"other": {
|
97 |
-
"description": "other task family not mentioned here",
|
98 |
-
"options": [
|
99 |
-
"other",
|
100 |
-
],
|
101 |
-
},
|
102 |
-
}
|
103 |
|
104 |
multilinguality_set = {
|
105 |
"monolingual": "contains a single language",
|
106 |
"multilingual": "contains multiple languages",
|
107 |
"translation": "contains translated or aligned text",
|
108 |
"other": "other type of language distribution",
|
109 |
-
}
|
110 |
|
111 |
creator_set = {
|
112 |
"language": [
|
@@ -126,51 +46,7 @@ creator_set = {
|
|
126 |
],
|
127 |
}
|
128 |
|
129 |
-
license_set = {
|
130 |
-
'afl-3.0': 'Academic Free License',
|
131 |
-
'apache-2.0': 'Apache license 2.0',
|
132 |
-
'artistic-2.0': 'Artistic license 2.0',
|
133 |
-
'bsl-1.0': 'Boost Software License 1.0',
|
134 |
-
'bsd-2-clause': 'BSD 2-clause "Simplified" license',
|
135 |
-
'bsd-3-clause': 'BSD 3-clause "New" or "Revised" license',
|
136 |
-
'bsd-3-clause-clear': 'BSD 3-clause Clear license',
|
137 |
-
'cc': 'Creative Commons license family',
|
138 |
-
'cc0-1.0': 'Creative Commons Zero v1.0 Universal',
|
139 |
-
'cc-by-sa-3.0': 'Creative Commons Attribution Share Alike 3.0',
|
140 |
-
'cc-by-4.0': 'Creative Commons Attribution 4.0',
|
141 |
-
'cc-by-nc-4.0': 'Creative Commons Attribution Non Commercial 4.0',
|
142 |
-
'cc-by-nc-sa-4.0': 'Creative Commons Attribution Non Commercial Share Alike 4.0',
|
143 |
-
'cc-by-sa-4.0': 'Creative Commons Attribution Share Alike 4.0',
|
144 |
-
'wtfpl': 'Do What The F*ck You Want To Public License',
|
145 |
-
'ecl-2.0': 'Educational Community License v2.0',
|
146 |
-
'epl-1.0': 'Eclipse Public License 1.0',
|
147 |
-
'epl-2.0': 'Eclipse Public License 2.0',
|
148 |
-
'eupl-1.1': 'European Union Public License 1.1',
|
149 |
-
'agpl-3.0': 'GNU Affero General Public License v3.0',
|
150 |
-
'gpl': 'GNU General Public License family',
|
151 |
-
'gpl-2.0': 'GNU General Public License v2.0',
|
152 |
-
'gpl-3.0': 'GNU General Public License v3.0',
|
153 |
-
'lgpl': 'GNU Lesser General Public License family',
|
154 |
-
'lgpl-2.1': 'GNU Lesser General Public License v2.1',
|
155 |
-
'lgpl-3.0': 'GNU Lesser General Public License v3.0',
|
156 |
-
'isc': 'ISC',
|
157 |
-
'lppl-1.3c': 'LaTeX Project Public License v1.3c',
|
158 |
-
'ms-pl': 'Microsoft Public License',
|
159 |
-
'mit': 'MIT',
|
160 |
-
'mpl-2.0': 'Mozilla Public License 2.0',
|
161 |
-
'osl-3.0': 'Open Software License 3.0',
|
162 |
-
'postgresql': 'PostgreSQL License',
|
163 |
-
'ofl-1.1': 'SIL Open Font License 1.1',
|
164 |
-
'ncsa': 'University of Illinois/NCSA Open Source License',
|
165 |
-
'unlicense': 'The Unlicense',
|
166 |
-
'zlib': 'zLib License',
|
167 |
-
'other': 'other license',
|
168 |
-
'unknown': 'could not find license information',
|
169 |
-
}
|
170 |
|
171 |
-
tag_set = json.load(open('tag_set.json'))
|
172 |
-
language_set = dict([(k, v.replace(', dialect unknown', ''))
|
173 |
-
for k, v in tag_set['language']["BCP-47"].items()])
|
174 |
|
175 |
########################
|
176 |
## Helper functions
|
@@ -205,7 +81,7 @@ def filter_features(feature_dict):
|
|
205 |
return {
|
206 |
"feature_type": feature_dict["_type"],
|
207 |
"dtype": "string",
|
208 |
-
"languages": feature_dict["languages"],
|
209 |
}
|
210 |
else:
|
211 |
return dict([(k, filter_features(v)) for k, v in feature_dict.items()])
|
@@ -271,12 +147,12 @@ st.sidebar.markdown(
|
|
271 |
)
|
272 |
|
273 |
app_desc = """
|
274 |
-
### Dataset Tagger
|
275 |
|
276 |
-
This app aims to make it easier to add structured tags to the datasets present in the library.
|
277 |
|
278 |
Each configuration requires its own tasks, as these often correspond to distinct sub-tasks. However, we provide the opportunity
|
279 |
-
to pre-load the tag sets from another dataset or configuration to avoid too much redundancy.
|
280 |
|
281 |
The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
|
282 |
"""
|
@@ -390,13 +266,13 @@ with c2.beta_expander("- Choose tag set to pre-load"):
|
|
390 |
|
391 |
pre_loaded["languages"] = list(set(pre_loaded["languages"] + find_languages(features)))
|
392 |
if config_infos["license"] in license_set:
|
393 |
-
pre_loaded["licenses"] = list(set(pre_loaded["licenses"] + [config_infos["license"]]))
|
394 |
|
395 |
##########
|
396 |
# Modify or add new tags
|
397 |
##########
|
398 |
c2.markdown("#### Editing the tag set")
|
399 |
-
c2.markdown("> *Expand the following boxes to edit the tag set. For each of the questions, choose all that apply, at least one option:*")
|
400 |
|
401 |
with c2.beta_expander("- Supported tasks"):
|
402 |
task_categories = st.multiselect(
|
@@ -414,13 +290,13 @@ with c2.beta_expander("- Supported tasks"):
|
|
414 |
)
|
415 |
if "other" in task_specs:
|
416 |
other_task = st.text_input(
|
417 |
-
"You selected 'other' task. Please enter a short hyphen-separated description for the task:",
|
418 |
value='my-task-description',
|
419 |
)
|
420 |
st.write(f"Registering {tg}-other-{other_task} task")
|
421 |
task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
|
422 |
task_specifics += task_specs
|
423 |
-
|
424 |
with c2.beta_expander("- Languages"):
|
425 |
multilinguality = st.multiselect(
|
426 |
"Does the dataset contain more than one language?",
|
@@ -430,7 +306,7 @@ with c2.beta_expander("- Languages"):
|
|
430 |
)
|
431 |
if "other" in multilinguality:
|
432 |
other_multilinguality = st.text_input(
|
433 |
-
"You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
|
434 |
value='my-multilinguality',
|
435 |
)
|
436 |
st.write(f"Registering other-{other_multilinguality} multilinguality")
|
@@ -461,7 +337,7 @@ with c2.beta_expander("- Dataset creators"):
|
|
461 |
)
|
462 |
if "other" in licenses:
|
463 |
other_license = st.text_input(
|
464 |
-
"You selected 'other' type of license. Please enter a short hyphen-separated description:",
|
465 |
value='my-license',
|
466 |
)
|
467 |
st.write(f"Registering other-{other_license} license")
|
@@ -487,13 +363,13 @@ with c2.beta_expander("- Dataset creators"):
|
|
487 |
)
|
488 |
if "other" in extended_sources:
|
489 |
other_extended_sources = st.text_input(
|
490 |
-
"You selected 'other' dataset. Please enter a short hyphen-separated description:",
|
491 |
value='my-dataset',
|
492 |
)
|
493 |
st.write(f"Registering other-{other_extended_sources} dataset")
|
494 |
extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
|
495 |
source_datasets += [f"extended|{src}" for src in extended_sources]
|
496 |
-
|
497 |
num_examples = (
|
498 |
sum([dct.get('num_examples', 0) for spl, dct in config_infos['splits'].items()])
|
499 |
if config_infos.get('splits', None) is not None
|
@@ -511,7 +387,7 @@ elif num_examples < 1000000:
|
|
511 |
size_cat = "100K<n<1M"
|
512 |
else:
|
513 |
size_cat = "n>1M"
|
514 |
-
|
515 |
res = {
|
516 |
"task_categories": task_categories,
|
517 |
"task_ids": task_specifics,
|
@@ -535,7 +411,7 @@ if c3.button("Done? Save to File!"):
|
|
535 |
if not os.path.isdir(pjoin('saved_tags', dataset_id, config_id)):
|
536 |
_ = os.mkdir(pjoin('saved_tags', dataset_id, config_id))
|
537 |
json.dump(res, open(pjoin('saved_tags', dataset_id, config_id, 'tags.json'), 'w'))
|
538 |
-
|
539 |
with c3.beta_expander("Show JSON output"):
|
540 |
st.write(res)
|
541 |
|
@@ -546,4 +422,3 @@ c3.markdown("--- ")
|
|
546 |
|
547 |
with c3.beta_expander("----> show full task set <----", expanded=True):
|
548 |
st.write(task_set)
|
549 |
-
|
|
|
16 |
initial_sidebar_state="auto",
|
17 |
)
|
18 |
|
19 |
+
task_set = json.load(open("task_set.json"))
|
20 |
+
license_set = json.load(open("license_set.json"))
|
21 |
+
language_set = json.load(open("language_set.json"))
|
22 |
+
language_set_full = json.load(open("language_set_full.json"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
multilinguality_set = {
|
25 |
"monolingual": "contains a single language",
|
26 |
"multilingual": "contains multiple languages",
|
27 |
"translation": "contains translated or aligned text",
|
28 |
"other": "other type of language distribution",
|
29 |
+
}
|
30 |
|
31 |
creator_set = {
|
32 |
"language": [
|
|
|
46 |
],
|
47 |
}
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
|
|
|
|
|
|
50 |
|
51 |
########################
|
52 |
## Helper functions
|
|
|
81 |
return {
|
82 |
"feature_type": feature_dict["_type"],
|
83 |
"dtype": "string",
|
84 |
+
"languages": feature_dict["languages"],
|
85 |
}
|
86 |
else:
|
87 |
return dict([(k, filter_features(v)) for k, v in feature_dict.items()])
|
|
|
147 |
)
|
148 |
|
149 |
app_desc = """
|
150 |
+
### Dataset Tagger
|
151 |
|
152 |
+
This app aims to make it easier to add structured tags to the datasets present in the library.
|
153 |
|
154 |
Each configuration requires its own tasks, as these often correspond to distinct sub-tasks. However, we provide the opportunity
|
155 |
+
to pre-load the tag sets from another dataset or configuration to avoid too much redundancy.
|
156 |
|
157 |
The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
|
158 |
"""
|
|
|
266 |
|
267 |
pre_loaded["languages"] = list(set(pre_loaded["languages"] + find_languages(features)))
|
268 |
if config_infos["license"] in license_set:
|
269 |
+
pre_loaded["licenses"] = list(set(pre_loaded["licenses"] + [config_infos["license"]]))
|
270 |
|
271 |
##########
|
272 |
# Modify or add new tags
|
273 |
##########
|
274 |
c2.markdown("#### Editing the tag set")
|
275 |
+
c2.markdown("> *Expand the following boxes to edit the tag set. For each of the questions, choose all that apply, at least one option:*")
|
276 |
|
277 |
with c2.beta_expander("- Supported tasks"):
|
278 |
task_categories = st.multiselect(
|
|
|
290 |
)
|
291 |
if "other" in task_specs:
|
292 |
other_task = st.text_input(
|
293 |
+
"You selected 'other' task. Please enter a short hyphen-separated description for the task:",
|
294 |
value='my-task-description',
|
295 |
)
|
296 |
st.write(f"Registering {tg}-other-{other_task} task")
|
297 |
task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
|
298 |
task_specifics += task_specs
|
299 |
+
|
300 |
with c2.beta_expander("- Languages"):
|
301 |
multilinguality = st.multiselect(
|
302 |
"Does the dataset contain more than one language?",
|
|
|
306 |
)
|
307 |
if "other" in multilinguality:
|
308 |
other_multilinguality = st.text_input(
|
309 |
+
"You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
|
310 |
value='my-multilinguality',
|
311 |
)
|
312 |
st.write(f"Registering other-{other_multilinguality} multilinguality")
|
|
|
337 |
)
|
338 |
if "other" in licenses:
|
339 |
other_license = st.text_input(
|
340 |
+
"You selected 'other' type of license. Please enter a short hyphen-separated description:",
|
341 |
value='my-license',
|
342 |
)
|
343 |
st.write(f"Registering other-{other_license} license")
|
|
|
363 |
)
|
364 |
if "other" in extended_sources:
|
365 |
other_extended_sources = st.text_input(
|
366 |
+
"You selected 'other' dataset. Please enter a short hyphen-separated description:",
|
367 |
value='my-dataset',
|
368 |
)
|
369 |
st.write(f"Registering other-{other_extended_sources} dataset")
|
370 |
extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
|
371 |
source_datasets += [f"extended|{src}" for src in extended_sources]
|
372 |
+
|
373 |
num_examples = (
|
374 |
sum([dct.get('num_examples', 0) for spl, dct in config_infos['splits'].items()])
|
375 |
if config_infos.get('splits', None) is not None
|
|
|
387 |
size_cat = "100K<n<1M"
|
388 |
else:
|
389 |
size_cat = "n>1M"
|
390 |
+
|
391 |
res = {
|
392 |
"task_categories": task_categories,
|
393 |
"task_ids": task_specifics,
|
|
|
411 |
if not os.path.isdir(pjoin('saved_tags', dataset_id, config_id)):
|
412 |
_ = os.mkdir(pjoin('saved_tags', dataset_id, config_id))
|
413 |
json.dump(res, open(pjoin('saved_tags', dataset_id, config_id, 'tags.json'), 'w'))
|
414 |
+
|
415 |
with c3.beta_expander("Show JSON output"):
|
416 |
st.write(res)
|
417 |
|
|
|
422 |
|
423 |
with c3.beta_expander("----> show full task set <----", expanded=True):
|
424 |
st.write(task_set)
|
|
task_set.json
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"conditional-text-generation": {
|
3 |
+
"description": "data-to-text and text transduction tasks such as translation or summarization",
|
4 |
+
"options": [
|
5 |
+
"machine-translation",
|
6 |
+
"sentence-splitting-fusion",
|
7 |
+
"summarization",
|
8 |
+
"table-to-text",
|
9 |
+
"text-simplification",
|
10 |
+
"explanation-generation",
|
11 |
+
"other"
|
12 |
+
]
|
13 |
+
},
|
14 |
+
"question-answering": {
|
15 |
+
"description": "question answering tasks",
|
16 |
+
"options": [
|
17 |
+
"open-domain-qa",
|
18 |
+
"closed-domain-qa",
|
19 |
+
"multiple-choice-qa",
|
20 |
+
"extractive-qa",
|
21 |
+
"abstractive-qa",
|
22 |
+
"other"
|
23 |
+
]
|
24 |
+
},
|
25 |
+
"sequence-modeling": {
|
26 |
+
"description": "such as language modeling or dialogue",
|
27 |
+
"options": [
|
28 |
+
"dialogue-modeling",
|
29 |
+
"language-modeling",
|
30 |
+
"other-multi-turn",
|
31 |
+
"slot-filling",
|
32 |
+
"other"
|
33 |
+
]
|
34 |
+
},
|
35 |
+
"structure-prediction": {
|
36 |
+
"description": "predicting structural properties of the text, such as syntax",
|
37 |
+
"options": [
|
38 |
+
"coreference-resolution",
|
39 |
+
"named-entity-recognition",
|
40 |
+
"parsing",
|
41 |
+
"other"
|
42 |
+
]
|
43 |
+
},
|
44 |
+
"text-classification": {
|
45 |
+
"description": "predicting a class index or boolean value",
|
46 |
+
"options": [
|
47 |
+
"acceptability-classification",
|
48 |
+
"entity-linking-classification",
|
49 |
+
"fact-checking",
|
50 |
+
"intent-classification",
|
51 |
+
"multi-class-classification",
|
52 |
+
"multi-label-classification",
|
53 |
+
"natural-language-inference",
|
54 |
+
"semantic-similarity-classification",
|
55 |
+
"sentiment-classification",
|
56 |
+
"topic-classification",
|
57 |
+
"other"
|
58 |
+
]
|
59 |
+
},
|
60 |
+
"text-retrieval": {
|
61 |
+
"description": "information or text retrieval tasks",
|
62 |
+
"options": [
|
63 |
+
"document-retrieval",
|
64 |
+
"utterance-retrieval",
|
65 |
+
"entity-linking-retrieval",
|
66 |
+
"fact-checking-retrieval",
|
67 |
+
"other"
|
68 |
+
]
|
69 |
+
},
|
70 |
+
"text-scoring": {
|
71 |
+
"description": "text scoring tasks, predicting a real valued score for some text",
|
72 |
+
"options": [
|
73 |
+
"semantic-similarity-scoring",
|
74 |
+
"sentiment-scoring",
|
75 |
+
"other"
|
76 |
+
]
|
77 |
+
},
|
78 |
+
"other": {
|
79 |
+
"description": "other task family not mentioned here",
|
80 |
+
"options": [
|
81 |
+
"other"
|
82 |
+
]
|
83 |
+
}
|
84 |
+
}
|