theo committed on
Commit
9ec7c08
1 Parent(s): d71e67c

leverage infos file, fix prerun script

Browse files
Files changed (5) hide show
  1. apputils.py +15 -0
  2. build_metadata_file.py +7 -4
  3. license_set.json +0 -452
  4. tagging_app.py +78 -30
  5. task_set.json +0 -86
apputils.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+
4
+ def new_state() -> Dict[str, List]:
5
+ return {
6
+ "task_categories": [],
7
+ "task_ids": [],
8
+ "multilinguality": [],
9
+ "languages": [],
10
+ "language_creators": [],
11
+ "annotations_creators": [],
12
+ "source_datasets": [],
13
+ "size_categories": [],
14
+ "licenses": [],
15
+ }
build_metadata_file.py CHANGED
@@ -12,6 +12,8 @@ from typing import Dict
12
 
13
  import yaml
14
 
 
 
15
 
16
  def metadata_from_readme(f: Path) -> Dict:
17
  with f.open() as fi:
@@ -25,10 +27,10 @@ def metadata_from_readme(f: Path) -> Dict:
25
  def load_ds_datas():
26
  drepo = Path("datasets")
27
  if drepo.exists() and drepo.is_dir():
28
- check_call(["git", "pull"], cwd=str((Path.cwd() / "datasets").absolute()))
29
  else:
30
  check_call(["git", "clone", "https://github.com/huggingface/datasets.git"])
31
- head_sha = check_output(["git", "rev-parse", "HEAD"])
32
 
33
  datasets_md = dict()
34
 
@@ -38,6 +40,8 @@ def load_ds_datas():
38
  metadata = metadata_from_readme(ddir / "README.md")
39
  except:
40
  metadata = None
 
 
41
 
42
  try:
43
  with (ddir / "dataset_infos.json").open() as fi:
@@ -45,8 +49,7 @@ def load_ds_datas():
45
  except:
46
  infos = None
47
 
48
- if metadata is not None and len(metadata) > 0:
49
- datasets_md[ddir.name] = dict(metadata=metadata, infos=infos)
50
  return head_sha.decode().strip(), datasets_md
51
 
52
 
 
12
 
13
  import yaml
14
 
15
+ from apputils import new_state
16
+
17
 
18
  def metadata_from_readme(f: Path) -> Dict:
19
  with f.open() as fi:
 
27
  def load_ds_datas():
28
  drepo = Path("datasets")
29
  if drepo.exists() and drepo.is_dir():
30
+ check_call(["git", "pull"], cwd=drepo)
31
  else:
32
  check_call(["git", "clone", "https://github.com/huggingface/datasets.git"])
33
+ head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=drepo)
34
 
35
  datasets_md = dict()
36
 
 
40
  metadata = metadata_from_readme(ddir / "README.md")
41
  except:
42
  metadata = None
43
+ if metadata is None or len(metadata) == 0:
44
+ metadata = new_state()
45
 
46
  try:
47
  with (ddir / "dataset_infos.json").open() as fi:
 
49
  except:
50
  infos = None
51
 
52
+ datasets_md[ddir.name] = dict(metadata=metadata, infos=infos)
 
53
  return head_sha.decode().strip(), datasets_md
54
 
55
 
license_set.json DELETED
@@ -1,452 +0,0 @@
1
- {
2
- "other": "Other license",
3
- "unknown": "License information unavailable",
4
- "0bsd": "BSD Zero Clause License",
5
- "aal": "Attribution Assurance License",
6
- "abstyles": "Abstyles License",
7
- "adobe-2006": "Adobe Systems Incorporated Source Code License Agreement",
8
- "adobe-glyph": "Adobe Glyph List License",
9
- "adsl": "Amazon Digital Services License",
10
- "afl-1.1": "Academic Free License v1.1",
11
- "afl-1.2": "Academic Free License v1.2",
12
- "afl-2.0": "Academic Free License v2.0",
13
- "afl-2.1": "Academic Free License v2.1",
14
- "afl-3.0": "Academic Free License v3.0",
15
- "afmparse": "Afmparse License",
16
- "agpl-1.0": "Affero General Public License v1.0",
17
- "agpl-1.0-only": "Affero General Public License v1.0 only",
18
- "agpl-1.0-or-later": "Affero General Public License v1.0 or later",
19
- "agpl-3.0": "GNU Affero General Public License v3.0",
20
- "agpl-3.0-only": "GNU Affero General Public License v3.0 only",
21
- "agpl-3.0-or-later": "GNU Affero General Public License v3.0 or later",
22
- "aladdin": "Aladdin Free Public License",
23
- "amdplpa": "AMD's plpa_map.c License",
24
- "aml": "Apple MIT License",
25
- "ampas": "Academy of Motion Picture Arts and Sciences BSD",
26
- "antlr-pd": "ANTLR Software Rights Notice",
27
- "antlr-pd-fallback": "ANTLR Software Rights Notice with license fallback",
28
- "apache-1.0": "Apache License 1.0",
29
- "apache-1.1": "Apache License 1.1",
30
- "apache-2.0": "Apache License 2.0",
31
- "apafml": "Adobe Postscript AFM License",
32
- "apl-1.0": "Adaptive Public License 1.0",
33
- "apsl-1.0": "Apple Public Source License 1.0",
34
- "apsl-1.1": "Apple Public Source License 1.1",
35
- "apsl-1.2": "Apple Public Source License 1.2",
36
- "apsl-2.0": "Apple Public Source License 2.0",
37
- "artistic-1.0": "Artistic License 1.0",
38
- "artistic-1.0-cl8": "Artistic License 1.0 w/clause 8",
39
- "artistic-1.0-perl": "Artistic License 1.0 (Perl)",
40
- "artistic-2.0": "Artistic License 2.0",
41
- "bahyph": "Bahyph License",
42
- "barr": "Barr License",
43
- "beerware": "Beerware License",
44
- "bittorrent-1.0": "BitTorrent Open Source License v1.0",
45
- "bittorrent-1.1": "BitTorrent Open Source License v1.1",
46
- "blessing": "SQLite Blessing",
47
- "blueoak-1.0.0": "Blue Oak Model License 1.0.0",
48
- "borceux": "Borceux license",
49
- "bsd-1-clause": "BSD 1-Clause License",
50
- "bsd-2-clause": "BSD 2-Clause \"Simplified\" License",
51
- "bsd-2-clause-freebsd": "BSD 2-Clause FreeBSD License",
52
- "bsd-2-clause-netbsd": "BSD 2-Clause NetBSD License",
53
- "bsd-2-clause-patent": "BSD-2-Clause Plus Patent License",
54
- "bsd-2-clause-views": "BSD 2-Clause with views sentence",
55
- "bsd-3-clause": "BSD 3-Clause \"New\" or \"Revised\" License",
56
- "bsd-3-clause-attribution": "BSD with attribution",
57
- "bsd-3-clause-clear": "BSD 3-Clause Clear License",
58
- "bsd-3-clause-lbnl": "Lawrence Berkeley National Labs BSD variant license",
59
- "bsd-3-clause-no-nuclear-license": "BSD 3-Clause No Nuclear License",
60
- "bsd-3-clause-no-nuclear-license-2014": "BSD 3-Clause No Nuclear License 2014",
61
- "bsd-3-clause-no-nuclear-warranty": "BSD 3-Clause No Nuclear Warranty",
62
- "bsd-3-clause-open-mpi": "BSD 3-Clause Open MPI variant",
63
- "bsd-4-clause": "BSD 4-Clause \"Original\" or \"Old\" License",
64
- "bsd-4-clause-uc": "BSD-4-Clause (University of California-Specific)",
65
- "bsd-protection": "BSD Protection License",
66
- "bsd-source-code": "BSD Source Code Attribution",
67
- "bsl-1.0": "Boost Software License 1.0",
68
- "busl-1.1": "Business Source License 1.1",
69
- "bzip2-1.0.5": "bzip2 and libbzip2 License v1.0.5",
70
- "bzip2-1.0.6": "bzip2 and libbzip2 License v1.0.6",
71
- "cal-1.0": "Cryptographic Autonomy License 1.0",
72
- "cal-1.0-combined-work-exception": "Cryptographic Autonomy License 1.0 (Combined Work Exception)",
73
- "caldera": "Caldera License",
74
- "catosl-1.1": "Computer Associates Trusted Open Source License 1.1",
75
- "cc-by-1.0": "Creative Commons Attribution 1.0 Generic",
76
- "cc-by-2.0": "Creative Commons Attribution 2.0 Generic",
77
- "cc-by-2.5": "Creative Commons Attribution 2.5 Generic",
78
- "cc-by-3.0": "Creative Commons Attribution 3.0 Unported",
79
- "cc-by-3.0-at": "Creative Commons Attribution 3.0 Austria",
80
- "cc-by-3.0-us": "Creative Commons Attribution 3.0 United States",
81
- "cc-by-4.0": "Creative Commons Attribution 4.0 International",
82
- "cc-by-nc-1.0": "Creative Commons Attribution Non Commercial 1.0 Generic",
83
- "cc-by-nc-2.0": "Creative Commons Attribution Non Commercial 2.0 Generic",
84
- "cc-by-nc-2.5": "Creative Commons Attribution Non Commercial 2.5 Generic",
85
- "cc-by-nc-3.0": "Creative Commons Attribution Non Commercial 3.0 Unported",
86
- "cc-by-nc-4.0": "Creative Commons Attribution Non Commercial 4.0 International",
87
- "cc-by-nc-nd-1.0": "Creative Commons Attribution Non Commercial No Derivatives 1.0 Generic",
88
- "cc-by-nc-nd-2.0": "Creative Commons Attribution Non Commercial No Derivatives 2.0 Generic",
89
- "cc-by-nc-nd-2.5": "Creative Commons Attribution Non Commercial No Derivatives 2.5 Generic",
90
- "cc-by-nc-nd-3.0": "Creative Commons Attribution Non Commercial No Derivatives 3.0 Unported",
91
- "cc-by-nc-nd-3.0-igo": "Creative Commons Attribution Non Commercial No Derivatives 3.0 IGO",
92
- "cc-by-nc-nd-4.0": "Creative Commons Attribution Non Commercial No Derivatives 4.0 International",
93
- "cc-by-nc-sa-1.0": "Creative Commons Attribution Non Commercial Share Alike 1.0 Generic",
94
- "cc-by-nc-sa-2.0": "Creative Commons Attribution Non Commercial Share Alike 2.0 Generic",
95
- "cc-by-nc-sa-2.5": "Creative Commons Attribution Non Commercial Share Alike 2.5 Generic",
96
- "cc-by-nc-sa-3.0": "Creative Commons Attribution Non Commercial Share Alike 3.0 Unported",
97
- "cc-by-nc-sa-4.0": "Creative Commons Attribution Non Commercial Share Alike 4.0 International",
98
- "cc-by-nd-1.0": "Creative Commons Attribution No Derivatives 1.0 Generic",
99
- "cc-by-nd-2.0": "Creative Commons Attribution No Derivatives 2.0 Generic",
100
- "cc-by-nd-2.5": "Creative Commons Attribution No Derivatives 2.5 Generic",
101
- "cc-by-nd-3.0": "Creative Commons Attribution No Derivatives 3.0 Unported",
102
- "cc-by-nd-4.0": "Creative Commons Attribution No Derivatives 4.0 International",
103
- "cc-by-sa-1.0": "Creative Commons Attribution Share Alike 1.0 Generic",
104
- "cc-by-sa-2.0": "Creative Commons Attribution Share Alike 2.0 Generic",
105
- "cc-by-sa-2.0-uk": "Creative Commons Attribution Share Alike 2.0 England and Wales",
106
- "cc-by-sa-2.5": "Creative Commons Attribution Share Alike 2.5 Generic",
107
- "cc-by-sa-3.0": "Creative Commons Attribution Share Alike 3.0 Unported",
108
- "cc-by-sa-3.0-at": "Creative Commons Attribution-Share Alike 3.0 Austria",
109
- "cc-by-sa-4.0": "Creative Commons Attribution Share Alike 4.0 International",
110
- "cc-pddc": "Creative Commons Public Domain Dedication and Certification",
111
- "cc0-1.0": "Creative Commons Zero v1.0 Universal",
112
- "cddl-1.0": "Common Development and Distribution License 1.0",
113
- "cddl-1.1": "Common Development and Distribution License 1.1",
114
- "cdla-permissive-1.0": "Community Data License Agreement Permissive 1.0",
115
- "cdla-sharing-1.0": "Community Data License Agreement Sharing 1.0",
116
- "cecill-1.0": "CeCILL Free Software License Agreement v1.0",
117
- "cecill-1.1": "CeCILL Free Software License Agreement v1.1",
118
- "cecill-2.0": "CeCILL Free Software License Agreement v2.0",
119
- "cecill-2.1": "CeCILL Free Software License Agreement v2.1",
120
- "cecill-b": "CeCILL-B Free Software License Agreement",
121
- "cecill-c": "CeCILL-C Free Software License Agreement",
122
- "cern-ohl-1.1": "CERN Open Hardware Licence v1.1",
123
- "cern-ohl-1.2": "CERN Open Hardware Licence v1.2",
124
- "cern-ohl-p-2.0": "CERN Open Hardware Licence Version 2 - Permissive",
125
- "cern-ohl-s-2.0": "CERN Open Hardware Licence Version 2 - Strongly Reciprocal",
126
- "cern-ohl-w-2.0": "CERN Open Hardware Licence Version 2 - Weakly Reciprocal",
127
- "clartistic": "Clarified Artistic License",
128
- "cnri-jython": "CNRI Jython License",
129
- "cnri-python": "CNRI Python License",
130
- "cnri-python-gpl-compatible": "CNRI Python Open Source GPL Compatible License Agreement",
131
- "condor-1.1": "Condor Public License v1.1",
132
- "copyleft-next-0.3.0": "copyleft-next 0.3.0",
133
- "copyleft-next-0.3.1": "copyleft-next 0.3.1",
134
- "cpal-1.0": "Common Public Attribution License 1.0",
135
- "cpl-1.0": "Common Public License 1.0",
136
- "cpol-1.02": "Code Project Open License 1.02",
137
- "crossword": "Crossword License",
138
- "crystalstacker": "CrystalStacker License",
139
- "cua-opl-1.0": "CUA Office Public License v1.0",
140
- "cube": "Cube License",
141
- "curl": "curl License",
142
- "d-fsl-1.0": "Deutsche Freie Software Lizenz",
143
- "diffmark": "diffmark license",
144
- "doc": "DOC License",
145
- "dotseqn": "Dotseqn License",
146
- "dsdp": "DSDP License",
147
- "dvipdfm": "dvipdfm License",
148
- "ecl-1.0": "Educational Community License v1.0",
149
- "ecl-2.0": "Educational Community License v2.0",
150
- "ecos-2.0": "eCos license version 2.0",
151
- "efl-1.0": "Eiffel Forum License v1.0",
152
- "efl-2.0": "Eiffel Forum License v2.0",
153
- "egenix": "eGenix.com Public License 1.1.0",
154
- "entessa": "Entessa Public License v1.0",
155
- "epics": "EPICS Open License",
156
- "epl-1.0": "Eclipse Public License 1.0",
157
- "epl-2.0": "Eclipse Public License 2.0",
158
- "erlpl-1.1": "Erlang Public License v1.1",
159
- "etalab-2.0": "Etalab Open License 2.0",
160
- "eudatagrid": "EU DataGrid Software License",
161
- "eupl-1.0": "European Union Public License 1.0",
162
- "eupl-1.1": "European Union Public License 1.1",
163
- "eupl-1.2": "European Union Public License 1.2",
164
- "eurosym": "Eurosym License",
165
- "fair": "Fair License",
166
- "frameworx-1.0": "Frameworx Open License 1.0",
167
- "freeimage": "FreeImage Public License v1.0",
168
- "fsfap": "FSF All Permissive License",
169
- "fsful": "FSF Unlimited License",
170
- "fsfullr": "FSF Unlimited License (with License Retention)",
171
- "ftl": "Freetype Project License",
172
- "gfdl-1.1": "GNU Free Documentation License v1.1",
173
- "gfdl-1.1-invariants-only": "GNU Free Documentation License v1.1 only - invariants",
174
- "gfdl-1.1-invariants-or-later": "GNU Free Documentation License v1.1 or later - invariants",
175
- "gfdl-1.1-no-invariants-only": "GNU Free Documentation License v1.1 only - no invariants",
176
- "gfdl-1.1-no-invariants-or-later": "GNU Free Documentation License v1.1 or later - no invariants",
177
- "gfdl-1.1-only": "GNU Free Documentation License v1.1 only",
178
- "gfdl-1.1-or-later": "GNU Free Documentation License v1.1 or later",
179
- "gfdl-1.2": "GNU Free Documentation License v1.2",
180
- "gfdl-1.2-invariants-only": "GNU Free Documentation License v1.2 only - invariants",
181
- "gfdl-1.2-invariants-or-later": "GNU Free Documentation License v1.2 or later - invariants",
182
- "gfdl-1.2-no-invariants-only": "GNU Free Documentation License v1.2 only - no invariants",
183
- "gfdl-1.2-no-invariants-or-later": "GNU Free Documentation License v1.2 or later - no invariants",
184
- "gfdl-1.2-only": "GNU Free Documentation License v1.2 only",
185
- "gfdl-1.2-or-later": "GNU Free Documentation License v1.2 or later",
186
- "gfdl-1.3": "GNU Free Documentation License v1.3",
187
- "gfdl-1.3-invariants-only": "GNU Free Documentation License v1.3 only - invariants",
188
- "gfdl-1.3-invariants-or-later": "GNU Free Documentation License v1.3 or later - invariants",
189
- "gfdl-1.3-no-invariants-only": "GNU Free Documentation License v1.3 only - no invariants",
190
- "gfdl-1.3-no-invariants-or-later": "GNU Free Documentation License v1.3 or later - no invariants",
191
- "gfdl-1.3-only": "GNU Free Documentation License v1.3 only",
192
- "gfdl-1.3-or-later": "GNU Free Documentation License v1.3 or later",
193
- "giftware": "Giftware License",
194
- "gl2ps": "GL2PS License",
195
- "glide": "3dfx Glide License",
196
- "glulxe": "Glulxe License",
197
- "glwtpl": "Good Luck With That Public License",
198
- "gnuplot": "gnuplot License",
199
- "gpl-1.0": "GNU General Public License v1.0 only",
200
- "gpl-1.0+": "GNU General Public License v1.0 or later",
201
- "gpl-1.0-only": "GNU General Public License v1.0 only",
202
- "gpl-1.0-or-later": "GNU General Public License v1.0 or later",
203
- "gpl-2.0": "GNU General Public License v2.0 only",
204
- "gpl-2.0+": "GNU General Public License v2.0 or later",
205
- "gpl-2.0-only": "GNU General Public License v2.0 only",
206
- "gpl-2.0-or-later": "GNU General Public License v2.0 or later",
207
- "gpl-2.0-with-autoconf-exception": "GNU General Public License v2.0 w/Autoconf exception",
208
- "gpl-2.0-with-bison-exception": "GNU General Public License v2.0 w/Bison exception",
209
- "gpl-2.0-with-classpath-exception": "GNU General Public License v2.0 w/Classpath exception",
210
- "gpl-2.0-with-font-exception": "GNU General Public License v2.0 w/Font exception",
211
- "gpl-2.0-with-gcc-exception": "GNU General Public License v2.0 w/GCC Runtime Library exception",
212
- "gpl-3.0": "GNU General Public License v3.0 only",
213
- "gpl-3.0+": "GNU General Public License v3.0 or later",
214
- "gpl-3.0-only": "GNU General Public License v3.0 only",
215
- "gpl-3.0-or-later": "GNU General Public License v3.0 or later",
216
- "gpl-3.0-with-autoconf-exception": "GNU General Public License v3.0 w/Autoconf exception",
217
- "gpl-3.0-with-gcc-exception": "GNU General Public License v3.0 w/GCC Runtime Library exception",
218
- "gsoap-1.3b": "gSOAP Public License v1.3b",
219
- "haskellreport": "Haskell Language Report License",
220
- "hippocratic-2.1": "Hippocratic License 2.1",
221
- "hpnd": "Historical Permission Notice and Disclaimer",
222
- "hpnd-sell-variant": "Historical Permission Notice and Disclaimer - sell variant",
223
- "htmltidy": "HTML Tidy License",
224
- "ibm-pibs": "IBM PowerPC Initialization and Boot Software",
225
- "icu": "ICU License",
226
- "ijg": "Independent JPEG Group License",
227
- "imagemagick": "ImageMagick License",
228
- "imatix": "iMatix Standard Function Library Agreement",
229
- "imlib2": "Imlib2 License",
230
- "info-zip": "Info-ZIP License",
231
- "intel": "Intel Open Source License",
232
- "intel-acpi": "Intel ACPI Software License Agreement",
233
- "interbase-1.0": "Interbase Public License v1.0",
234
- "ipa": "IPA Font License",
235
- "ipl-1.0": "IBM Public License v1.0",
236
- "isc": "ISC License",
237
- "jasper-2.0": "JasPer License",
238
- "jpnic": "Japan Network Information Center License",
239
- "json": "JSON License",
240
- "lal-1.2": "Licence Art Libre 1.2",
241
- "lal-1.3": "Licence Art Libre 1.3",
242
- "latex2e": "Latex2e License",
243
- "leptonica": "Leptonica License",
244
- "lgpl-2.0": "GNU Library General Public License v2 only",
245
- "lgpl-2.0+": "GNU Library General Public License v2 or later",
246
- "lgpl-2.0-only": "GNU Library General Public License v2 only",
247
- "lgpl-2.0-or-later": "GNU Library General Public License v2 or later",
248
- "lgpl-2.1": "GNU Lesser General Public License v2.1 only",
249
- "lgpl-2.1+": "GNU Library General Public License v2.1 or later",
250
- "lgpl-2.1-only": "GNU Lesser General Public License v2.1 only",
251
- "lgpl-2.1-or-later": "GNU Lesser General Public License v2.1 or later",
252
- "lgpl-3.0": "GNU Lesser General Public License v3.0 only",
253
- "lgpl-3.0+": "GNU Lesser General Public License v3.0 or later",
254
- "lgpl-3.0-only": "GNU Lesser General Public License v3.0 only",
255
- "lgpl-3.0-or-later": "GNU Lesser General Public License v3.0 or later",
256
- "lgpllr": "Lesser General Public License For Linguistic Resources",
257
- "libpng": "libpng License",
258
- "libpng-2.0": "PNG Reference Library version 2",
259
- "libselinux-1.0": "libselinux public domain notice",
260
- "libtiff": "libtiff License",
261
- "liliq-p-1.1": "Licence Libre du Qu\u00e9bec \u2013 Permissive version 1.1",
262
- "liliq-r-1.1": "Licence Libre du Qu\u00e9bec \u2013 R\u00e9ciprocit\u00e9 version 1.1",
263
- "liliq-rplus-1.1": "Licence Libre du Qu\u00e9bec \u2013 R\u00e9ciprocit\u00e9 forte version 1.1",
264
- "linux-openib": "Linux Kernel Variant of OpenIB.org license",
265
- "lpl-1.0": "Lucent Public License Version 1.0",
266
- "lpl-1.02": "Lucent Public License v1.02",
267
- "lppl-1.0": "LaTeX Project Public License v1.0",
268
- "lppl-1.1": "LaTeX Project Public License v1.1",
269
- "lppl-1.2": "LaTeX Project Public License v1.2",
270
- "lppl-1.3a": "LaTeX Project Public License v1.3a",
271
- "lppl-1.3c": "LaTeX Project Public License v1.3c",
272
- "makeindex": "MakeIndex License",
273
- "miros": "The MirOS Licence",
274
- "mit": "MIT License",
275
- "mit-0": "MIT No Attribution",
276
- "mit-advertising": "Enlightenment License (e16)",
277
- "mit-cmu": "CMU License",
278
- "mit-enna": "enna License",
279
- "mit-feh": "feh License",
280
- "mit-open-group": "MIT Open Group variant",
281
- "mitnfa": "MIT +no-false-attribs license",
282
- "motosoto": "Motosoto License",
283
- "mpich2": "mpich2 License",
284
- "mpl-1.0": "Mozilla Public License 1.0",
285
- "mpl-1.1": "Mozilla Public License 1.1",
286
- "mpl-2.0": "Mozilla Public License 2.0",
287
- "mpl-2.0-no-copyleft-exception": "Mozilla Public License 2.0 (no copyleft exception)",
288
- "ms-pl": "Microsoft Public License",
289
- "ms-rl": "Microsoft Reciprocal License",
290
- "mtll": "Matrix Template Library License",
291
- "mulanpsl-1.0": "Mulan Permissive Software License, Version 1",
292
- "mulanpsl-2.0": "Mulan Permissive Software License, Version 2",
293
- "multics": "Multics License",
294
- "mup": "Mup License",
295
- "nasa-1.3": "NASA Open Source Agreement 1.3",
296
- "naumen": "Naumen Public License",
297
- "nbpl-1.0": "Net Boolean Public License v1",
298
- "ncgl-uk-2.0": "Non-Commercial Government Licence",
299
- "ncsa": "University of Illinois/NCSA Open Source License",
300
- "net-snmp": "Net-SNMP License",
301
- "netcdf": "NetCDF license",
302
- "newsletr": "Newsletr License",
303
- "ngpl": "Nethack General Public License",
304
- "nist-pd": "NIST Public Domain Notice",
305
- "nist-pd-fallback": "NIST Public Domain Notice with license fallback",
306
- "nlod-1.0": "Norwegian Licence for Open Government Data",
307
- "nlpl": "No Limit Public License",
308
- "nokia": "Nokia Open Source License",
309
- "nosl": "Netizen Open Source License",
310
- "noweb": "Noweb License",
311
- "npl-1.0": "Netscape Public License v1.0",
312
- "npl-1.1": "Netscape Public License v1.1",
313
- "nposl-3.0": "Non-Profit Open Software License 3.0",
314
- "nrl": "NRL License",
315
- "ntp": "NTP License",
316
- "ntp-0": "NTP No Attribution",
317
- "nunit": "Nunit License",
318
- "o-uda-1.0": "Open Use of Data Agreement v1.0",
319
- "occt-pl": "Open CASCADE Technology Public License",
320
- "oclc-2.0": "OCLC Research Public License 2.0",
321
- "odbl-1.0": "ODC Open Database License v1.0",
322
- "odc-by-1.0": "Open Data Commons Attribution License v1.0",
323
- "ofl-1.0": "SIL Open Font License 1.0",
324
- "ofl-1.0-no-rfn": "SIL Open Font License 1.0 with no Reserved Font Name",
325
- "ofl-1.0-rfn": "SIL Open Font License 1.0 with Reserved Font Name",
326
- "ofl-1.1": "SIL Open Font License 1.1",
327
- "ofl-1.1-no-rfn": "SIL Open Font License 1.1 with no Reserved Font Name",
328
- "ofl-1.1-rfn": "SIL Open Font License 1.1 with Reserved Font Name",
329
- "ogc-1.0": "OGC Software License, Version 1.0",
330
- "ogl-canada-2.0": "Open Government Licence - Canada",
331
- "ogl-uk-1.0": "Open Government Licence v1.0",
332
- "ogl-uk-2.0": "Open Government Licence v2.0",
333
- "ogl-uk-3.0": "Open Government Licence v3.0",
334
- "ogtsl": "Open Group Test Suite License",
335
- "oldap-1.1": "Open LDAP Public License v1.1",
336
- "oldap-1.2": "Open LDAP Public License v1.2",
337
- "oldap-1.3": "Open LDAP Public License v1.3",
338
- "oldap-1.4": "Open LDAP Public License v1.4",
339
- "oldap-2.0": "Open LDAP Public License v2.0 (or possibly 2.0A and 2.0B)",
340
- "oldap-2.0.1": "Open LDAP Public License v2.0.1",
341
- "oldap-2.1": "Open LDAP Public License v2.1",
342
- "oldap-2.2": "Open LDAP Public License v2.2",
343
- "oldap-2.2.1": "Open LDAP Public License v2.2.1",
344
- "oldap-2.2.2": "Open LDAP Public License 2.2.2",
345
- "oldap-2.3": "Open LDAP Public License v2.3",
346
- "oldap-2.4": "Open LDAP Public License v2.4",
347
- "oldap-2.5": "Open LDAP Public License v2.5",
348
- "oldap-2.6": "Open LDAP Public License v2.6",
349
- "oldap-2.7": "Open LDAP Public License v2.7",
350
- "oldap-2.8": "Open LDAP Public License v2.8",
351
- "oml": "Open Market License",
352
- "openssl": "OpenSSL License",
353
- "opl-1.0": "Open Public License v1.0",
354
- "oset-pl-2.1": "OSET Public License version 2.1",
355
- "osl-1.0": "Open Software License 1.0",
356
- "osl-1.1": "Open Software License 1.1",
357
- "osl-2.0": "Open Software License 2.0",
358
- "osl-2.1": "Open Software License 2.1",
359
- "osl-3.0": "Open Software License 3.0",
360
- "parity-6.0.0": "The Parity Public License 6.0.0",
361
- "parity-7.0.0": "The Parity Public License 7.0.0",
362
- "pddl-1.0": "ODC Public Domain Dedication & License 1.0",
363
- "php-3.0": "PHP License v3.0",
364
- "php-3.01": "PHP License v3.01",
365
- "plexus": "Plexus Classworlds License",
366
- "polyform-noncommercial-1.0.0": "PolyForm Noncommercial License 1.0.0",
367
- "polyform-small-business-1.0.0": "PolyForm Small Business License 1.0.0",
368
- "postgresql": "PostgreSQL License",
369
- "psf-2.0": "Python Software Foundation License 2.0",
370
- "psfrag": "psfrag License",
371
- "psutils": "psutils License",
372
- "python-2.0": "Python License 2.0",
373
- "qhull": "Qhull License",
374
- "qpl-1.0": "Q Public License 1.0",
375
- "rdisc": "Rdisc License",
376
- "rhecos-1.1": "Red Hat eCos Public License v1.1",
377
- "rpl-1.1": "Reciprocal Public License 1.1",
378
- "rpl-1.5": "Reciprocal Public License 1.5",
379
- "rpsl-1.0": "RealNetworks Public Source License v1.0",
380
- "rsa-md": "RSA Message-Digest License",
381
- "rscpl": "Ricoh Source Code Public License",
382
- "ruby": "Ruby License",
383
- "sax-pd": "Sax Public Domain Notice",
384
- "saxpath": "Saxpath License",
385
- "scea": "SCEA Shared Source License",
386
- "sendmail": "Sendmail License",
387
- "sendmail-8.23": "Sendmail License 8.23",
388
- "sgi-b-1.0": "SGI Free Software License B v1.0",
389
- "sgi-b-1.1": "SGI Free Software License B v1.1",
390
- "sgi-b-2.0": "SGI Free Software License B v2.0",
391
- "shl-0.5": "Solderpad Hardware License v0.5",
392
- "shl-0.51": "Solderpad Hardware License, Version 0.51",
393
- "simpl-2.0": "Simple Public License 2.0",
394
- "sissl": "Sun Industry Standards Source License v1.1",
395
- "sissl-1.2": "Sun Industry Standards Source License v1.2",
396
- "sleepycat": "Sleepycat License",
397
- "smlnj": "Standard ML of New Jersey License",
398
- "smppl": "Secure Messaging Protocol Public License",
399
- "snia": "SNIA Public License 1.1",
400
- "spencer-86": "Spencer License 86",
401
- "spencer-94": "Spencer License 94",
402
- "spencer-99": "Spencer License 99",
403
- "spl-1.0": "Sun Public License v1.0",
404
- "ssh-openssh": "SSH OpenSSH license",
405
- "ssh-short": "SSH short notice",
406
- "sspl-1.0": "Server Side Public License, v 1",
407
- "standardml-nj": "Standard ML of New Jersey License",
408
- "sugarcrm-1.1.3": "SugarCRM Public License v1.1.3",
409
- "swl": "Scheme Widget Library (SWL) Software License Agreement",
410
- "tapr-ohl-1.0": "TAPR Open Hardware License v1.0",
411
- "tcl": "TCL/TK License",
412
- "tcp-wrappers": "TCP Wrappers License",
413
- "tmate": "TMate Open Source License",
414
- "torque-1.1": "TORQUE v2.5+ Software License v1.1",
415
- "tosl": "Trusster Open Source License",
416
- "tu-berlin-1.0": "Technische Universitaet Berlin License 1.0",
417
- "tu-berlin-2.0": "Technische Universitaet Berlin License 2.0",
418
- "ucl-1.0": "Upstream Compatibility License v1.0",
419
- "unicode-dfs-2015": "Unicode License Agreement - Data Files and Software (2015)",
420
- "unicode-dfs-2016": "Unicode License Agreement - Data Files and Software (2016)",
421
- "unicode-tou": "Unicode Terms of Use",
422
- "unlicense": "The Unlicense",
423
- "upl-1.0": "Universal Permissive License v1.0",
424
- "vim": "Vim License",
425
- "vostrom": "VOSTROM Public License for Open Source",
426
- "vsl-1.0": "Vovida Software License v1.0",
427
- "w3c": "W3C Software Notice and License (2002-12-31)",
428
- "w3c-19980720": "W3C Software Notice and License (1998-07-20)",
429
- "w3c-20150513": "W3C Software Notice and Document License (2015-05-13)",
430
- "watcom-1.0": "Sybase Open Watcom Public License 1.0",
431
- "wsuipa": "Wsuipa License",
432
- "wtfpl": "Do What The F*ck You Want To Public License",
433
- "wxwindows": "wxWindows Library License",
434
- "x11": "X11 License",
435
- "xerox": "Xerox License",
436
- "xfree86-1.1": "XFree86 License 1.1",
437
- "xinetd": "xinetd License",
438
- "xnet": "X.Net License",
439
- "xpp": "XPP License",
440
- "xskat": "XSkat License",
441
- "ypl-1.0": "Yahoo! Public License v1.0",
442
- "ypl-1.1": "Yahoo! Public License v1.1",
443
- "zed": "Zed License",
444
- "zend-2.0": "Zend License v2.0",
445
- "zimbra-1.3": "Zimbra Public License v1.3",
446
- "zimbra-1.4": "Zimbra Public License v1.4",
447
- "zlib": "zlib License",
448
- "zlib-acknowledgement": "zlib/libpng License with Acknowledgement",
449
- "zpl-1.1": "Zope Public License 1.1",
450
- "zpl-2.0": "Zope Public License 2.0",
451
- "zpl-2.1": "Zope Public License 2.1"
452
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tagging_app.py CHANGED
@@ -1,4 +1,5 @@
1
  import json
 
2
  from pathlib import Path
3
  from typing import Callable, Dict, List, Tuple
4
 
@@ -14,6 +15,8 @@ from datasets.utils.metadata import (
14
  known_task_ids,
15
  )
16
 
 
 
17
  st.set_page_config(
18
  page_title="HF Dataset Tagging App",
19
  page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
@@ -38,7 +41,7 @@ st.markdown(
38
  ########################
39
 
40
 
41
- def load_ds_datas():
42
  metada_exports = sorted(
43
  [f for f in Path.cwd().iterdir() if f.name.startswith("metadata_")],
44
  key=lambda f: f.lstat().st_mtime,
@@ -47,6 +50,7 @@ def load_ds_datas():
47
  if len(metada_exports) == 0:
48
  raise ValueError("need to run ./build_metada_file.py at least once")
49
  with metada_exports[0].open() as fi:
 
50
  return json.load(fi)
51
 
52
 
@@ -81,18 +85,32 @@ def validate_dict(w: st.delta_generator.DeltaGenerator, state_dict: Dict):
81
  w.error(e)
82
 
83
 
84
- def new_state() -> Dict[str, List]:
85
- return {
86
- "task_categories": [],
87
- "task_ids": [],
88
- "multilinguality": [],
89
- "languages": [],
90
- "language_creators": [],
91
- "annotations_creators": [],
92
- "source_datasets": [],
93
- "size_categories": [],
94
- "licenses": [],
95
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
 
98
  def is_state_empty(state: Dict[str, List]) -> bool:
@@ -101,8 +119,9 @@ def is_state_empty(state: Dict[str, List]) -> bool:
101
 
102
  state = new_state()
103
  datasets_md = load_ds_datas()
104
- existing_tag_sets = {name: mds["metadata"] for name, mds in datasets_md.items()}
105
- all_dataset_ids = list(existing_tag_sets.keys())
 
106
 
107
 
108
  ########################
@@ -124,19 +143,26 @@ queryparams = st.experimental_get_query_params()
124
  preload = queryparams.get("preload_dataset", list())
125
  preloaded_id = None
126
  initial_state = None
127
- did_index = 0
128
- if len(preload) == 1 and preload[0] in all_dataset_ids:
 
 
129
  preloaded_id, *_ = preload
130
- initial_state = existing_tag_sets.get(preloaded_id)
 
 
131
  state = initial_state or new_state()
132
- did_index = all_dataset_ids.index(preloaded_id)
133
 
134
  preloaded_id = st.sidebar.selectbox(
135
- label="Choose dataset to load tag set from", options=all_dataset_ids, index=did_index
136
  )
 
137
  leftbtn, rightbtn = st.sidebar.beta_columns(2)
138
  if leftbtn.button("pre-load"):
139
- initial_state = existing_tag_sets[preloaded_id]
 
 
140
  state = initial_state or new_state()
141
  st.experimental_set_query_params(preload_dataset=preloaded_id)
142
  if not is_state_empty(state):
@@ -168,6 +194,9 @@ Here is the matching yaml block:
168
  leftcol, _, rightcol = st.beta_columns([12, 1, 12])
169
 
170
 
 
 
 
171
  leftcol.markdown("### Supported tasks")
172
  state["task_categories"] = multiselect(
173
  leftcol,
@@ -197,6 +226,9 @@ for task_category in state["task_categories"]:
197
  state["task_ids"] = task_specifics
198
 
199
 
 
 
 
200
  leftcol.markdown("### Languages")
201
  state["multilinguality"] = multiselect(
202
  leftcol,
@@ -233,6 +265,10 @@ langtags = leftcol.text_area(
233
  )
234
  state["languages"] = langtags.split(";")
235
 
 
 
 
 
236
  leftcol.markdown("### Dataset creators")
237
  state["language_creators"] = multiselect(
238
  leftcol,
@@ -250,6 +286,9 @@ state["annotations_creators"] = multiselect(
250
  )
251
 
252
 
 
 
 
253
  state["licenses"] = multiselect(
254
  leftcol,
255
  "Licenses",
@@ -266,7 +305,10 @@ if "other" in state["licenses"]:
266
  st.write(f"Registering other-{other_license} license")
267
  state["licenses"][state["licenses"].index("other")] = f"other-{other_license}"
268
 
269
- # link to supported datasets
 
 
 
270
  pre_select_ext_a = []
271
  if "original" in state["source_datasets"]:
272
  pre_select_ext_a += ["original"]
@@ -288,7 +330,7 @@ if "extended" in state["extended"]:
288
  "Linked datasets",
289
  "Which other datasets does this one use data from?",
290
  values=pre_select_ext_b,
291
- valid_set=all_dataset_ids + ["other"],
292
  )
293
  if "other" in extended_sources:
294
  other_extended_sources = leftcol.text_input(
@@ -299,17 +341,23 @@ if "extended" in state["extended"]:
299
  extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
300
  state["source_datasets"] += [f"extended|{src}" for src in extended_sources]
301
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
  current_size_cats = state.get("size_categories") or ["unknown"]
303
  ok, nonok = split_known(current_size_cats, known_size_categories)
304
  if len(nonok) > 0:
305
  leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
306
- state["size_categories"] = [
307
- leftcol.selectbox(
308
- "What is the size category of the dataset?",
309
- options=known_size_categories,
310
- index=known_size_categories.index(ok[0]) if len(ok) > 0 else 0,
311
- )
312
- ]
313
 
314
 
315
  ########################
 
1
  import json
2
+ import logging
3
  from pathlib import Path
4
  from typing import Callable, Dict, List, Tuple
5
 
 
15
  known_task_ids,
16
  )
17
 
18
+ from apputils import new_state
19
+
20
  st.set_page_config(
21
  page_title="HF Dataset Tagging App",
22
  page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
 
41
  ########################
42
 
43
 
44
+ def load_ds_datas() -> Dict[str, Dict[str, Dict]]:
45
  metada_exports = sorted(
46
  [f for f in Path.cwd().iterdir() if f.name.startswith("metadata_")],
47
  key=lambda f: f.lstat().st_mtime,
 
50
  if len(metada_exports) == 0:
51
  raise ValueError("need to run ./build_metada_file.py at least once")
52
  with metada_exports[0].open() as fi:
53
+ logging.info(f"loaded {metada_exports[0]}")
54
  return json.load(fi)
55
 
56
 
 
85
  w.error(e)
86
 
87
 
88
def map_num_examples_to_size_categories(n: int) -> str:
    """Map a number of examples to its size-category tag.

    Args:
        n: Total number of examples. Zero or a negative value means the
            count is not known.

    Returns:
        ``"unknown"`` when ``n <= 0``; otherwise the tag of the power-of-ten
        bucket containing ``n``. Lower bounds are inclusive and upper bounds
        exclusive (``1000`` maps to ``"1K<n<10K"``), and anything from
        ``10**12`` upwards maps to ``"n>1T"``.
    """
    from bisect import bisect_right

    if n <= 0:
        return "unknown"

    # Exclusive upper bound of every bucket except the last, open-ended one.
    upper_bounds = [10 ** exponent for exponent in range(3, 13)]
    size_cats = [
        "n<1K",
        "1K<n<10K",
        "10K<n<100K",
        "100K<n<1M",
        "1M<n<10M",
        "10M<n<100M",
        "100M<n<1B",
        "1B<n<10B",
        "10B<n<100B",
        "100B<n<1T",
        "n>1T",
    ]
    # bisect_right counts the bounds that are <= n, which is exactly the index
    # of the first bucket whose exclusive upper bound is still above n.
    return size_cats[bisect_right(upper_bounds, n)]
114
 
115
 
116
  def is_state_empty(state: Dict[str, List]) -> bool:
 
119
 
120
  state = new_state()
121
  datasets_md = load_ds_datas()
122
+ dataset_ids = list(datasets_md.keys())
123
+ dataset_id_to_metadata = {name: mds["metadata"] for name, mds in datasets_md.items()}
124
+ dataset_id_to_infos = {name: mds["infos"] for name, mds in datasets_md.items()}
125
 
126
 
127
  ########################
 
143
# Dataset requested through the `preload_dataset` URL query parameter, if any.
preload = queryparams.get("preload_dataset", list())
preloaded_id = None
initial_state = None
initial_infos, initial_info_cfg = None, None
dataset_selector_index = 0

if len(preload) == 1 and preload[0] in dataset_ids:
    preloaded_id, *_ = preload
    initial_state = dataset_id_to_metadata.get(preloaded_id)
    # Infos may be missing (None) or empty; treat both as "no infos" so the
    # config lookup below cannot fail.
    initial_infos = dataset_id_to_infos.get(preloaded_id) or None
    initial_info_cfg = next(iter(initial_infos)) if initial_infos is not None else None  # pick first available config
    state = initial_state or new_state()
    dataset_selector_index = dataset_ids.index(preloaded_id)

preloaded_id = st.sidebar.selectbox(
    label="Choose dataset to load tag set from", options=dataset_ids, index=dataset_selector_index
)

leftbtn, rightbtn = st.sidebar.beta_columns(2)
if leftbtn.button("pre-load"):
    initial_state = dataset_id_to_metadata[preloaded_id]
    # Same guard as the query-parameter path: a dataset without usable infos
    # must not crash the pre-load button.
    initial_infos = dataset_id_to_infos[preloaded_id] or None
    initial_info_cfg = next(iter(initial_infos)) if initial_infos is not None else None  # pick first available config
    state = initial_state or new_state()
    st.experimental_set_query_params(preload_dataset=preloaded_id)
168
  if not is_state_empty(state):
 
194
  leftcol, _, rightcol = st.beta_columns([12, 1, 12])
195
 
196
 
197
+ #
198
+ # TASKS
199
+ #
200
  leftcol.markdown("### Supported tasks")
201
  state["task_categories"] = multiselect(
202
  leftcol,
 
226
  state["task_ids"] = task_specifics
227
 
228
 
229
+ #
230
+ # LANGUAGES
231
+ #
232
  leftcol.markdown("### Languages")
233
  state["multilinguality"] = multiselect(
234
  leftcol,
 
265
  )
266
  state["languages"] = langtags.split(";")
267
 
268
+
269
+ #
270
+ # DATASET CREATORS & ORIGINS
271
+ #
272
  leftcol.markdown("### Dataset creators")
273
  state["language_creators"] = multiselect(
274
  leftcol,
 
286
  )
287
 
288
 
289
+ #
290
+ # LICENSES
291
+ #
292
  state["licenses"] = multiselect(
293
  leftcol,
294
  "Licenses",
 
305
  st.write(f"Registering other-{other_license} license")
306
  state["licenses"][state["licenses"].index("other")] = f"other-{other_license}"
307
 
308
+
309
+ #
310
+ # LINK TO SUPPORTED DATASETS
311
+ #
312
  pre_select_ext_a = []
313
  if "original" in state["source_datasets"]:
314
  pre_select_ext_a += ["original"]
 
330
  "Linked datasets",
331
  "Which other datasets does this one use data from?",
332
  values=pre_select_ext_b,
333
+ valid_set=dataset_ids + ["other"],
334
  )
335
  if "other" in extended_sources:
336
  other_extended_sources = leftcol.text_input(
 
341
  extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
342
  state["source_datasets"] += [f"extended|{src}" for src in extended_sources]
343
 
344
+
345
+ #
346
+ # SIZE CATEGORY
347
+ #
348
leftcol.markdown("### Size category")
# Resolve the splits of the selected config once; None means no dataset infos
# are available, while a missing or null "splits" entry counts as no splits.
initial_splits = (initial_infos[initial_info_cfg].get("splits") or dict()) if initial_infos is not None else None
logging.info(initial_splits if initial_splits is not None else 0)
# -1 marks "no infos available" and is mapped to the "unknown" category below.
initial_num_examples = (
    sum(dct.get("num_examples", 0) for dct in initial_splits.values()) if initial_splits is not None else -1
)
initial_size_cats = map_num_examples_to_size_categories(initial_num_examples)
leftcol.markdown(f"Computed size category from automatically generated dataset info to: `{initial_size_cats}`")
current_size_cats = state.get("size_categories") or ["unknown"]
ok, nonok = split_known(current_size_cats, known_size_categories)
if len(nonok) > 0:
    leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
 
 
 
 
 
 
 
361
 
362
 
363
  ########################
task_set.json DELETED
@@ -1,86 +0,0 @@
1
- {
2
- "conditional-text-generation": {
3
- "description": "data-to-text and text transduction tasks such as translation or summarization",
4
- "options": [
5
- "machine-translation",
6
- "sentence-splitting-fusion",
7
- "summarization",
8
- "table-to-text",
9
- "text-simplification",
10
- "explanation-generation",
11
- "other-stuctured-to-text",
12
- "other"
13
- ]
14
- },
15
- "question-answering": {
16
- "description": "question answering tasks",
17
- "options": [
18
- "open-domain-qa",
19
- "closed-domain-qa",
20
- "multiple-choice-qa",
21
- "extractive-qa",
22
- "abstractive-qa",
23
- "other"
24
- ]
25
- },
26
- "sequence-modeling": {
27
- "description": "such as language modeling or dialogue",
28
- "options": [
29
- "dialogue-modeling",
30
- "language-modeling",
31
- "other-multi-turn",
32
- "slot-filling",
33
- "other"
34
- ]
35
- },
36
- "structure-prediction": {
37
- "description": "predicting structural properties of the text, such as syntax",
38
- "options": [
39
- "coreference-resolution",
40
- "named-entity-recognition",
41
- "part-of-speech-tagging",
42
- "parsing",
43
- "other"
44
- ]
45
- },
46
- "text-classification": {
47
- "description": "predicting a class index or boolean value",
48
- "options": [
49
- "acceptability-classification",
50
- "entity-linking-classification",
51
- "fact-checking",
52
- "intent-classification",
53
- "multi-class-classification",
54
- "multi-label-classification",
55
- "natural-language-inference",
56
- "semantic-similarity-classification",
57
- "sentiment-classification",
58
- "topic-classification",
59
- "other"
60
- ]
61
- },
62
- "text-retrieval": {
63
- "description": "information or text retrieval tasks",
64
- "options": [
65
- "document-retrieval",
66
- "utterance-retrieval",
67
- "entity-linking-retrieval",
68
- "fact-checking-retrieval",
69
- "other"
70
- ]
71
- },
72
- "text-scoring": {
73
- "description": "text scoring tasks, predicting a real valued score for some text",
74
- "options": [
75
- "semantic-similarity-scoring",
76
- "sentiment-scoring",
77
- "other"
78
- ]
79
- },
80
- "other": {
81
- "description": "other task family not mentioned here",
82
- "options": [
83
- "other"
84
- ]
85
- }
86
- }