Spaces:
Running
Running
theo
commited on
Commit
•
9ec7c08
1
Parent(s):
d71e67c
leverage infos file, fix prerun script
Browse files- apputils.py +15 -0
- build_metadata_file.py +7 -4
- license_set.json +0 -452
- tagging_app.py +78 -30
- task_set.json +0 -86
apputils.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, List
|
2 |
+
|
3 |
+
|
4 |
+
def new_state() -> Dict[str, List]:
|
5 |
+
return {
|
6 |
+
"task_categories": [],
|
7 |
+
"task_ids": [],
|
8 |
+
"multilinguality": [],
|
9 |
+
"languages": [],
|
10 |
+
"language_creators": [],
|
11 |
+
"annotations_creators": [],
|
12 |
+
"source_datasets": [],
|
13 |
+
"size_categories": [],
|
14 |
+
"licenses": [],
|
15 |
+
}
|
build_metadata_file.py
CHANGED
@@ -12,6 +12,8 @@ from typing import Dict
|
|
12 |
|
13 |
import yaml
|
14 |
|
|
|
|
|
15 |
|
16 |
def metadata_from_readme(f: Path) -> Dict:
|
17 |
with f.open() as fi:
|
@@ -25,10 +27,10 @@ def metadata_from_readme(f: Path) -> Dict:
|
|
25 |
def load_ds_datas():
|
26 |
drepo = Path("datasets")
|
27 |
if drepo.exists() and drepo.is_dir():
|
28 |
-
check_call(["git", "pull"], cwd=
|
29 |
else:
|
30 |
check_call(["git", "clone", "https://github.com/huggingface/datasets.git"])
|
31 |
-
head_sha = check_output(["git", "rev-parse", "HEAD"])
|
32 |
|
33 |
datasets_md = dict()
|
34 |
|
@@ -38,6 +40,8 @@ def load_ds_datas():
|
|
38 |
metadata = metadata_from_readme(ddir / "README.md")
|
39 |
except:
|
40 |
metadata = None
|
|
|
|
|
41 |
|
42 |
try:
|
43 |
with (ddir / "dataset_infos.json").open() as fi:
|
@@ -45,8 +49,7 @@ def load_ds_datas():
|
|
45 |
except:
|
46 |
infos = None
|
47 |
|
48 |
-
|
49 |
-
datasets_md[ddir.name] = dict(metadata=metadata, infos=infos)
|
50 |
return head_sha.decode().strip(), datasets_md
|
51 |
|
52 |
|
|
|
12 |
|
13 |
import yaml
|
14 |
|
15 |
+
from apputils import new_state
|
16 |
+
|
17 |
|
18 |
def metadata_from_readme(f: Path) -> Dict:
|
19 |
with f.open() as fi:
|
|
|
27 |
def load_ds_datas():
|
28 |
drepo = Path("datasets")
|
29 |
if drepo.exists() and drepo.is_dir():
|
30 |
+
check_call(["git", "pull"], cwd=drepo)
|
31 |
else:
|
32 |
check_call(["git", "clone", "https://github.com/huggingface/datasets.git"])
|
33 |
+
head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=drepo)
|
34 |
|
35 |
datasets_md = dict()
|
36 |
|
|
|
40 |
metadata = metadata_from_readme(ddir / "README.md")
|
41 |
except:
|
42 |
metadata = None
|
43 |
+
if metadata is None or len(metadata) == 0:
|
44 |
+
metadata = new_state()
|
45 |
|
46 |
try:
|
47 |
with (ddir / "dataset_infos.json").open() as fi:
|
|
|
49 |
except:
|
50 |
infos = None
|
51 |
|
52 |
+
datasets_md[ddir.name] = dict(metadata=metadata, infos=infos)
|
|
|
53 |
return head_sha.decode().strip(), datasets_md
|
54 |
|
55 |
|
license_set.json
DELETED
@@ -1,452 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"other": "Other license",
|
3 |
-
"unknown": "License information unavailable",
|
4 |
-
"0bsd": "BSD Zero Clause License",
|
5 |
-
"aal": "Attribution Assurance License",
|
6 |
-
"abstyles": "Abstyles License",
|
7 |
-
"adobe-2006": "Adobe Systems Incorporated Source Code License Agreement",
|
8 |
-
"adobe-glyph": "Adobe Glyph List License",
|
9 |
-
"adsl": "Amazon Digital Services License",
|
10 |
-
"afl-1.1": "Academic Free License v1.1",
|
11 |
-
"afl-1.2": "Academic Free License v1.2",
|
12 |
-
"afl-2.0": "Academic Free License v2.0",
|
13 |
-
"afl-2.1": "Academic Free License v2.1",
|
14 |
-
"afl-3.0": "Academic Free License v3.0",
|
15 |
-
"afmparse": "Afmparse License",
|
16 |
-
"agpl-1.0": "Affero General Public License v1.0",
|
17 |
-
"agpl-1.0-only": "Affero General Public License v1.0 only",
|
18 |
-
"agpl-1.0-or-later": "Affero General Public License v1.0 or later",
|
19 |
-
"agpl-3.0": "GNU Affero General Public License v3.0",
|
20 |
-
"agpl-3.0-only": "GNU Affero General Public License v3.0 only",
|
21 |
-
"agpl-3.0-or-later": "GNU Affero General Public License v3.0 or later",
|
22 |
-
"aladdin": "Aladdin Free Public License",
|
23 |
-
"amdplpa": "AMD's plpa_map.c License",
|
24 |
-
"aml": "Apple MIT License",
|
25 |
-
"ampas": "Academy of Motion Picture Arts and Sciences BSD",
|
26 |
-
"antlr-pd": "ANTLR Software Rights Notice",
|
27 |
-
"antlr-pd-fallback": "ANTLR Software Rights Notice with license fallback",
|
28 |
-
"apache-1.0": "Apache License 1.0",
|
29 |
-
"apache-1.1": "Apache License 1.1",
|
30 |
-
"apache-2.0": "Apache License 2.0",
|
31 |
-
"apafml": "Adobe Postscript AFM License",
|
32 |
-
"apl-1.0": "Adaptive Public License 1.0",
|
33 |
-
"apsl-1.0": "Apple Public Source License 1.0",
|
34 |
-
"apsl-1.1": "Apple Public Source License 1.1",
|
35 |
-
"apsl-1.2": "Apple Public Source License 1.2",
|
36 |
-
"apsl-2.0": "Apple Public Source License 2.0",
|
37 |
-
"artistic-1.0": "Artistic License 1.0",
|
38 |
-
"artistic-1.0-cl8": "Artistic License 1.0 w/clause 8",
|
39 |
-
"artistic-1.0-perl": "Artistic License 1.0 (Perl)",
|
40 |
-
"artistic-2.0": "Artistic License 2.0",
|
41 |
-
"bahyph": "Bahyph License",
|
42 |
-
"barr": "Barr License",
|
43 |
-
"beerware": "Beerware License",
|
44 |
-
"bittorrent-1.0": "BitTorrent Open Source License v1.0",
|
45 |
-
"bittorrent-1.1": "BitTorrent Open Source License v1.1",
|
46 |
-
"blessing": "SQLite Blessing",
|
47 |
-
"blueoak-1.0.0": "Blue Oak Model License 1.0.0",
|
48 |
-
"borceux": "Borceux license",
|
49 |
-
"bsd-1-clause": "BSD 1-Clause License",
|
50 |
-
"bsd-2-clause": "BSD 2-Clause \"Simplified\" License",
|
51 |
-
"bsd-2-clause-freebsd": "BSD 2-Clause FreeBSD License",
|
52 |
-
"bsd-2-clause-netbsd": "BSD 2-Clause NetBSD License",
|
53 |
-
"bsd-2-clause-patent": "BSD-2-Clause Plus Patent License",
|
54 |
-
"bsd-2-clause-views": "BSD 2-Clause with views sentence",
|
55 |
-
"bsd-3-clause": "BSD 3-Clause \"New\" or \"Revised\" License",
|
56 |
-
"bsd-3-clause-attribution": "BSD with attribution",
|
57 |
-
"bsd-3-clause-clear": "BSD 3-Clause Clear License",
|
58 |
-
"bsd-3-clause-lbnl": "Lawrence Berkeley National Labs BSD variant license",
|
59 |
-
"bsd-3-clause-no-nuclear-license": "BSD 3-Clause No Nuclear License",
|
60 |
-
"bsd-3-clause-no-nuclear-license-2014": "BSD 3-Clause No Nuclear License 2014",
|
61 |
-
"bsd-3-clause-no-nuclear-warranty": "BSD 3-Clause No Nuclear Warranty",
|
62 |
-
"bsd-3-clause-open-mpi": "BSD 3-Clause Open MPI variant",
|
63 |
-
"bsd-4-clause": "BSD 4-Clause \"Original\" or \"Old\" License",
|
64 |
-
"bsd-4-clause-uc": "BSD-4-Clause (University of California-Specific)",
|
65 |
-
"bsd-protection": "BSD Protection License",
|
66 |
-
"bsd-source-code": "BSD Source Code Attribution",
|
67 |
-
"bsl-1.0": "Boost Software License 1.0",
|
68 |
-
"busl-1.1": "Business Source License 1.1",
|
69 |
-
"bzip2-1.0.5": "bzip2 and libbzip2 License v1.0.5",
|
70 |
-
"bzip2-1.0.6": "bzip2 and libbzip2 License v1.0.6",
|
71 |
-
"cal-1.0": "Cryptographic Autonomy License 1.0",
|
72 |
-
"cal-1.0-combined-work-exception": "Cryptographic Autonomy License 1.0 (Combined Work Exception)",
|
73 |
-
"caldera": "Caldera License",
|
74 |
-
"catosl-1.1": "Computer Associates Trusted Open Source License 1.1",
|
75 |
-
"cc-by-1.0": "Creative Commons Attribution 1.0 Generic",
|
76 |
-
"cc-by-2.0": "Creative Commons Attribution 2.0 Generic",
|
77 |
-
"cc-by-2.5": "Creative Commons Attribution 2.5 Generic",
|
78 |
-
"cc-by-3.0": "Creative Commons Attribution 3.0 Unported",
|
79 |
-
"cc-by-3.0-at": "Creative Commons Attribution 3.0 Austria",
|
80 |
-
"cc-by-3.0-us": "Creative Commons Attribution 3.0 United States",
|
81 |
-
"cc-by-4.0": "Creative Commons Attribution 4.0 International",
|
82 |
-
"cc-by-nc-1.0": "Creative Commons Attribution Non Commercial 1.0 Generic",
|
83 |
-
"cc-by-nc-2.0": "Creative Commons Attribution Non Commercial 2.0 Generic",
|
84 |
-
"cc-by-nc-2.5": "Creative Commons Attribution Non Commercial 2.5 Generic",
|
85 |
-
"cc-by-nc-3.0": "Creative Commons Attribution Non Commercial 3.0 Unported",
|
86 |
-
"cc-by-nc-4.0": "Creative Commons Attribution Non Commercial 4.0 International",
|
87 |
-
"cc-by-nc-nd-1.0": "Creative Commons Attribution Non Commercial No Derivatives 1.0 Generic",
|
88 |
-
"cc-by-nc-nd-2.0": "Creative Commons Attribution Non Commercial No Derivatives 2.0 Generic",
|
89 |
-
"cc-by-nc-nd-2.5": "Creative Commons Attribution Non Commercial No Derivatives 2.5 Generic",
|
90 |
-
"cc-by-nc-nd-3.0": "Creative Commons Attribution Non Commercial No Derivatives 3.0 Unported",
|
91 |
-
"cc-by-nc-nd-3.0-igo": "Creative Commons Attribution Non Commercial No Derivatives 3.0 IGO",
|
92 |
-
"cc-by-nc-nd-4.0": "Creative Commons Attribution Non Commercial No Derivatives 4.0 International",
|
93 |
-
"cc-by-nc-sa-1.0": "Creative Commons Attribution Non Commercial Share Alike 1.0 Generic",
|
94 |
-
"cc-by-nc-sa-2.0": "Creative Commons Attribution Non Commercial Share Alike 2.0 Generic",
|
95 |
-
"cc-by-nc-sa-2.5": "Creative Commons Attribution Non Commercial Share Alike 2.5 Generic",
|
96 |
-
"cc-by-nc-sa-3.0": "Creative Commons Attribution Non Commercial Share Alike 3.0 Unported",
|
97 |
-
"cc-by-nc-sa-4.0": "Creative Commons Attribution Non Commercial Share Alike 4.0 International",
|
98 |
-
"cc-by-nd-1.0": "Creative Commons Attribution No Derivatives 1.0 Generic",
|
99 |
-
"cc-by-nd-2.0": "Creative Commons Attribution No Derivatives 2.0 Generic",
|
100 |
-
"cc-by-nd-2.5": "Creative Commons Attribution No Derivatives 2.5 Generic",
|
101 |
-
"cc-by-nd-3.0": "Creative Commons Attribution No Derivatives 3.0 Unported",
|
102 |
-
"cc-by-nd-4.0": "Creative Commons Attribution No Derivatives 4.0 International",
|
103 |
-
"cc-by-sa-1.0": "Creative Commons Attribution Share Alike 1.0 Generic",
|
104 |
-
"cc-by-sa-2.0": "Creative Commons Attribution Share Alike 2.0 Generic",
|
105 |
-
"cc-by-sa-2.0-uk": "Creative Commons Attribution Share Alike 2.0 England and Wales",
|
106 |
-
"cc-by-sa-2.5": "Creative Commons Attribution Share Alike 2.5 Generic",
|
107 |
-
"cc-by-sa-3.0": "Creative Commons Attribution Share Alike 3.0 Unported",
|
108 |
-
"cc-by-sa-3.0-at": "Creative Commons Attribution-Share Alike 3.0 Austria",
|
109 |
-
"cc-by-sa-4.0": "Creative Commons Attribution Share Alike 4.0 International",
|
110 |
-
"cc-pddc": "Creative Commons Public Domain Dedication and Certification",
|
111 |
-
"cc0-1.0": "Creative Commons Zero v1.0 Universal",
|
112 |
-
"cddl-1.0": "Common Development and Distribution License 1.0",
|
113 |
-
"cddl-1.1": "Common Development and Distribution License 1.1",
|
114 |
-
"cdla-permissive-1.0": "Community Data License Agreement Permissive 1.0",
|
115 |
-
"cdla-sharing-1.0": "Community Data License Agreement Sharing 1.0",
|
116 |
-
"cecill-1.0": "CeCILL Free Software License Agreement v1.0",
|
117 |
-
"cecill-1.1": "CeCILL Free Software License Agreement v1.1",
|
118 |
-
"cecill-2.0": "CeCILL Free Software License Agreement v2.0",
|
119 |
-
"cecill-2.1": "CeCILL Free Software License Agreement v2.1",
|
120 |
-
"cecill-b": "CeCILL-B Free Software License Agreement",
|
121 |
-
"cecill-c": "CeCILL-C Free Software License Agreement",
|
122 |
-
"cern-ohl-1.1": "CERN Open Hardware Licence v1.1",
|
123 |
-
"cern-ohl-1.2": "CERN Open Hardware Licence v1.2",
|
124 |
-
"cern-ohl-p-2.0": "CERN Open Hardware Licence Version 2 - Permissive",
|
125 |
-
"cern-ohl-s-2.0": "CERN Open Hardware Licence Version 2 - Strongly Reciprocal",
|
126 |
-
"cern-ohl-w-2.0": "CERN Open Hardware Licence Version 2 - Weakly Reciprocal",
|
127 |
-
"clartistic": "Clarified Artistic License",
|
128 |
-
"cnri-jython": "CNRI Jython License",
|
129 |
-
"cnri-python": "CNRI Python License",
|
130 |
-
"cnri-python-gpl-compatible": "CNRI Python Open Source GPL Compatible License Agreement",
|
131 |
-
"condor-1.1": "Condor Public License v1.1",
|
132 |
-
"copyleft-next-0.3.0": "copyleft-next 0.3.0",
|
133 |
-
"copyleft-next-0.3.1": "copyleft-next 0.3.1",
|
134 |
-
"cpal-1.0": "Common Public Attribution License 1.0",
|
135 |
-
"cpl-1.0": "Common Public License 1.0",
|
136 |
-
"cpol-1.02": "Code Project Open License 1.02",
|
137 |
-
"crossword": "Crossword License",
|
138 |
-
"crystalstacker": "CrystalStacker License",
|
139 |
-
"cua-opl-1.0": "CUA Office Public License v1.0",
|
140 |
-
"cube": "Cube License",
|
141 |
-
"curl": "curl License",
|
142 |
-
"d-fsl-1.0": "Deutsche Freie Software Lizenz",
|
143 |
-
"diffmark": "diffmark license",
|
144 |
-
"doc": "DOC License",
|
145 |
-
"dotseqn": "Dotseqn License",
|
146 |
-
"dsdp": "DSDP License",
|
147 |
-
"dvipdfm": "dvipdfm License",
|
148 |
-
"ecl-1.0": "Educational Community License v1.0",
|
149 |
-
"ecl-2.0": "Educational Community License v2.0",
|
150 |
-
"ecos-2.0": "eCos license version 2.0",
|
151 |
-
"efl-1.0": "Eiffel Forum License v1.0",
|
152 |
-
"efl-2.0": "Eiffel Forum License v2.0",
|
153 |
-
"egenix": "eGenix.com Public License 1.1.0",
|
154 |
-
"entessa": "Entessa Public License v1.0",
|
155 |
-
"epics": "EPICS Open License",
|
156 |
-
"epl-1.0": "Eclipse Public License 1.0",
|
157 |
-
"epl-2.0": "Eclipse Public License 2.0",
|
158 |
-
"erlpl-1.1": "Erlang Public License v1.1",
|
159 |
-
"etalab-2.0": "Etalab Open License 2.0",
|
160 |
-
"eudatagrid": "EU DataGrid Software License",
|
161 |
-
"eupl-1.0": "European Union Public License 1.0",
|
162 |
-
"eupl-1.1": "European Union Public License 1.1",
|
163 |
-
"eupl-1.2": "European Union Public License 1.2",
|
164 |
-
"eurosym": "Eurosym License",
|
165 |
-
"fair": "Fair License",
|
166 |
-
"frameworx-1.0": "Frameworx Open License 1.0",
|
167 |
-
"freeimage": "FreeImage Public License v1.0",
|
168 |
-
"fsfap": "FSF All Permissive License",
|
169 |
-
"fsful": "FSF Unlimited License",
|
170 |
-
"fsfullr": "FSF Unlimited License (with License Retention)",
|
171 |
-
"ftl": "Freetype Project License",
|
172 |
-
"gfdl-1.1": "GNU Free Documentation License v1.1",
|
173 |
-
"gfdl-1.1-invariants-only": "GNU Free Documentation License v1.1 only - invariants",
|
174 |
-
"gfdl-1.1-invariants-or-later": "GNU Free Documentation License v1.1 or later - invariants",
|
175 |
-
"gfdl-1.1-no-invariants-only": "GNU Free Documentation License v1.1 only - no invariants",
|
176 |
-
"gfdl-1.1-no-invariants-or-later": "GNU Free Documentation License v1.1 or later - no invariants",
|
177 |
-
"gfdl-1.1-only": "GNU Free Documentation License v1.1 only",
|
178 |
-
"gfdl-1.1-or-later": "GNU Free Documentation License v1.1 or later",
|
179 |
-
"gfdl-1.2": "GNU Free Documentation License v1.2",
|
180 |
-
"gfdl-1.2-invariants-only": "GNU Free Documentation License v1.2 only - invariants",
|
181 |
-
"gfdl-1.2-invariants-or-later": "GNU Free Documentation License v1.2 or later - invariants",
|
182 |
-
"gfdl-1.2-no-invariants-only": "GNU Free Documentation License v1.2 only - no invariants",
|
183 |
-
"gfdl-1.2-no-invariants-or-later": "GNU Free Documentation License v1.2 or later - no invariants",
|
184 |
-
"gfdl-1.2-only": "GNU Free Documentation License v1.2 only",
|
185 |
-
"gfdl-1.2-or-later": "GNU Free Documentation License v1.2 or later",
|
186 |
-
"gfdl-1.3": "GNU Free Documentation License v1.3",
|
187 |
-
"gfdl-1.3-invariants-only": "GNU Free Documentation License v1.3 only - invariants",
|
188 |
-
"gfdl-1.3-invariants-or-later": "GNU Free Documentation License v1.3 or later - invariants",
|
189 |
-
"gfdl-1.3-no-invariants-only": "GNU Free Documentation License v1.3 only - no invariants",
|
190 |
-
"gfdl-1.3-no-invariants-or-later": "GNU Free Documentation License v1.3 or later - no invariants",
|
191 |
-
"gfdl-1.3-only": "GNU Free Documentation License v1.3 only",
|
192 |
-
"gfdl-1.3-or-later": "GNU Free Documentation License v1.3 or later",
|
193 |
-
"giftware": "Giftware License",
|
194 |
-
"gl2ps": "GL2PS License",
|
195 |
-
"glide": "3dfx Glide License",
|
196 |
-
"glulxe": "Glulxe License",
|
197 |
-
"glwtpl": "Good Luck With That Public License",
|
198 |
-
"gnuplot": "gnuplot License",
|
199 |
-
"gpl-1.0": "GNU General Public License v1.0 only",
|
200 |
-
"gpl-1.0+": "GNU General Public License v1.0 or later",
|
201 |
-
"gpl-1.0-only": "GNU General Public License v1.0 only",
|
202 |
-
"gpl-1.0-or-later": "GNU General Public License v1.0 or later",
|
203 |
-
"gpl-2.0": "GNU General Public License v2.0 only",
|
204 |
-
"gpl-2.0+": "GNU General Public License v2.0 or later",
|
205 |
-
"gpl-2.0-only": "GNU General Public License v2.0 only",
|
206 |
-
"gpl-2.0-or-later": "GNU General Public License v2.0 or later",
|
207 |
-
"gpl-2.0-with-autoconf-exception": "GNU General Public License v2.0 w/Autoconf exception",
|
208 |
-
"gpl-2.0-with-bison-exception": "GNU General Public License v2.0 w/Bison exception",
|
209 |
-
"gpl-2.0-with-classpath-exception": "GNU General Public License v2.0 w/Classpath exception",
|
210 |
-
"gpl-2.0-with-font-exception": "GNU General Public License v2.0 w/Font exception",
|
211 |
-
"gpl-2.0-with-gcc-exception": "GNU General Public License v2.0 w/GCC Runtime Library exception",
|
212 |
-
"gpl-3.0": "GNU General Public License v3.0 only",
|
213 |
-
"gpl-3.0+": "GNU General Public License v3.0 or later",
|
214 |
-
"gpl-3.0-only": "GNU General Public License v3.0 only",
|
215 |
-
"gpl-3.0-or-later": "GNU General Public License v3.0 or later",
|
216 |
-
"gpl-3.0-with-autoconf-exception": "GNU General Public License v3.0 w/Autoconf exception",
|
217 |
-
"gpl-3.0-with-gcc-exception": "GNU General Public License v3.0 w/GCC Runtime Library exception",
|
218 |
-
"gsoap-1.3b": "gSOAP Public License v1.3b",
|
219 |
-
"haskellreport": "Haskell Language Report License",
|
220 |
-
"hippocratic-2.1": "Hippocratic License 2.1",
|
221 |
-
"hpnd": "Historical Permission Notice and Disclaimer",
|
222 |
-
"hpnd-sell-variant": "Historical Permission Notice and Disclaimer - sell variant",
|
223 |
-
"htmltidy": "HTML Tidy License",
|
224 |
-
"ibm-pibs": "IBM PowerPC Initialization and Boot Software",
|
225 |
-
"icu": "ICU License",
|
226 |
-
"ijg": "Independent JPEG Group License",
|
227 |
-
"imagemagick": "ImageMagick License",
|
228 |
-
"imatix": "iMatix Standard Function Library Agreement",
|
229 |
-
"imlib2": "Imlib2 License",
|
230 |
-
"info-zip": "Info-ZIP License",
|
231 |
-
"intel": "Intel Open Source License",
|
232 |
-
"intel-acpi": "Intel ACPI Software License Agreement",
|
233 |
-
"interbase-1.0": "Interbase Public License v1.0",
|
234 |
-
"ipa": "IPA Font License",
|
235 |
-
"ipl-1.0": "IBM Public License v1.0",
|
236 |
-
"isc": "ISC License",
|
237 |
-
"jasper-2.0": "JasPer License",
|
238 |
-
"jpnic": "Japan Network Information Center License",
|
239 |
-
"json": "JSON License",
|
240 |
-
"lal-1.2": "Licence Art Libre 1.2",
|
241 |
-
"lal-1.3": "Licence Art Libre 1.3",
|
242 |
-
"latex2e": "Latex2e License",
|
243 |
-
"leptonica": "Leptonica License",
|
244 |
-
"lgpl-2.0": "GNU Library General Public License v2 only",
|
245 |
-
"lgpl-2.0+": "GNU Library General Public License v2 or later",
|
246 |
-
"lgpl-2.0-only": "GNU Library General Public License v2 only",
|
247 |
-
"lgpl-2.0-or-later": "GNU Library General Public License v2 or later",
|
248 |
-
"lgpl-2.1": "GNU Lesser General Public License v2.1 only",
|
249 |
-
"lgpl-2.1+": "GNU Library General Public License v2.1 or later",
|
250 |
-
"lgpl-2.1-only": "GNU Lesser General Public License v2.1 only",
|
251 |
-
"lgpl-2.1-or-later": "GNU Lesser General Public License v2.1 or later",
|
252 |
-
"lgpl-3.0": "GNU Lesser General Public License v3.0 only",
|
253 |
-
"lgpl-3.0+": "GNU Lesser General Public License v3.0 or later",
|
254 |
-
"lgpl-3.0-only": "GNU Lesser General Public License v3.0 only",
|
255 |
-
"lgpl-3.0-or-later": "GNU Lesser General Public License v3.0 or later",
|
256 |
-
"lgpllr": "Lesser General Public License For Linguistic Resources",
|
257 |
-
"libpng": "libpng License",
|
258 |
-
"libpng-2.0": "PNG Reference Library version 2",
|
259 |
-
"libselinux-1.0": "libselinux public domain notice",
|
260 |
-
"libtiff": "libtiff License",
|
261 |
-
"liliq-p-1.1": "Licence Libre du Qu\u00e9bec \u2013 Permissive version 1.1",
|
262 |
-
"liliq-r-1.1": "Licence Libre du Qu\u00e9bec \u2013 R\u00e9ciprocit\u00e9 version 1.1",
|
263 |
-
"liliq-rplus-1.1": "Licence Libre du Qu\u00e9bec \u2013 R\u00e9ciprocit\u00e9 forte version 1.1",
|
264 |
-
"linux-openib": "Linux Kernel Variant of OpenIB.org license",
|
265 |
-
"lpl-1.0": "Lucent Public License Version 1.0",
|
266 |
-
"lpl-1.02": "Lucent Public License v1.02",
|
267 |
-
"lppl-1.0": "LaTeX Project Public License v1.0",
|
268 |
-
"lppl-1.1": "LaTeX Project Public License v1.1",
|
269 |
-
"lppl-1.2": "LaTeX Project Public License v1.2",
|
270 |
-
"lppl-1.3a": "LaTeX Project Public License v1.3a",
|
271 |
-
"lppl-1.3c": "LaTeX Project Public License v1.3c",
|
272 |
-
"makeindex": "MakeIndex License",
|
273 |
-
"miros": "The MirOS Licence",
|
274 |
-
"mit": "MIT License",
|
275 |
-
"mit-0": "MIT No Attribution",
|
276 |
-
"mit-advertising": "Enlightenment License (e16)",
|
277 |
-
"mit-cmu": "CMU License",
|
278 |
-
"mit-enna": "enna License",
|
279 |
-
"mit-feh": "feh License",
|
280 |
-
"mit-open-group": "MIT Open Group variant",
|
281 |
-
"mitnfa": "MIT +no-false-attribs license",
|
282 |
-
"motosoto": "Motosoto License",
|
283 |
-
"mpich2": "mpich2 License",
|
284 |
-
"mpl-1.0": "Mozilla Public License 1.0",
|
285 |
-
"mpl-1.1": "Mozilla Public License 1.1",
|
286 |
-
"mpl-2.0": "Mozilla Public License 2.0",
|
287 |
-
"mpl-2.0-no-copyleft-exception": "Mozilla Public License 2.0 (no copyleft exception)",
|
288 |
-
"ms-pl": "Microsoft Public License",
|
289 |
-
"ms-rl": "Microsoft Reciprocal License",
|
290 |
-
"mtll": "Matrix Template Library License",
|
291 |
-
"mulanpsl-1.0": "Mulan Permissive Software License, Version 1",
|
292 |
-
"mulanpsl-2.0": "Mulan Permissive Software License, Version 2",
|
293 |
-
"multics": "Multics License",
|
294 |
-
"mup": "Mup License",
|
295 |
-
"nasa-1.3": "NASA Open Source Agreement 1.3",
|
296 |
-
"naumen": "Naumen Public License",
|
297 |
-
"nbpl-1.0": "Net Boolean Public License v1",
|
298 |
-
"ncgl-uk-2.0": "Non-Commercial Government Licence",
|
299 |
-
"ncsa": "University of Illinois/NCSA Open Source License",
|
300 |
-
"net-snmp": "Net-SNMP License",
|
301 |
-
"netcdf": "NetCDF license",
|
302 |
-
"newsletr": "Newsletr License",
|
303 |
-
"ngpl": "Nethack General Public License",
|
304 |
-
"nist-pd": "NIST Public Domain Notice",
|
305 |
-
"nist-pd-fallback": "NIST Public Domain Notice with license fallback",
|
306 |
-
"nlod-1.0": "Norwegian Licence for Open Government Data",
|
307 |
-
"nlpl": "No Limit Public License",
|
308 |
-
"nokia": "Nokia Open Source License",
|
309 |
-
"nosl": "Netizen Open Source License",
|
310 |
-
"noweb": "Noweb License",
|
311 |
-
"npl-1.0": "Netscape Public License v1.0",
|
312 |
-
"npl-1.1": "Netscape Public License v1.1",
|
313 |
-
"nposl-3.0": "Non-Profit Open Software License 3.0",
|
314 |
-
"nrl": "NRL License",
|
315 |
-
"ntp": "NTP License",
|
316 |
-
"ntp-0": "NTP No Attribution",
|
317 |
-
"nunit": "Nunit License",
|
318 |
-
"o-uda-1.0": "Open Use of Data Agreement v1.0",
|
319 |
-
"occt-pl": "Open CASCADE Technology Public License",
|
320 |
-
"oclc-2.0": "OCLC Research Public License 2.0",
|
321 |
-
"odbl-1.0": "ODC Open Database License v1.0",
|
322 |
-
"odc-by-1.0": "Open Data Commons Attribution License v1.0",
|
323 |
-
"ofl-1.0": "SIL Open Font License 1.0",
|
324 |
-
"ofl-1.0-no-rfn": "SIL Open Font License 1.0 with no Reserved Font Name",
|
325 |
-
"ofl-1.0-rfn": "SIL Open Font License 1.0 with Reserved Font Name",
|
326 |
-
"ofl-1.1": "SIL Open Font License 1.1",
|
327 |
-
"ofl-1.1-no-rfn": "SIL Open Font License 1.1 with no Reserved Font Name",
|
328 |
-
"ofl-1.1-rfn": "SIL Open Font License 1.1 with Reserved Font Name",
|
329 |
-
"ogc-1.0": "OGC Software License, Version 1.0",
|
330 |
-
"ogl-canada-2.0": "Open Government Licence - Canada",
|
331 |
-
"ogl-uk-1.0": "Open Government Licence v1.0",
|
332 |
-
"ogl-uk-2.0": "Open Government Licence v2.0",
|
333 |
-
"ogl-uk-3.0": "Open Government Licence v3.0",
|
334 |
-
"ogtsl": "Open Group Test Suite License",
|
335 |
-
"oldap-1.1": "Open LDAP Public License v1.1",
|
336 |
-
"oldap-1.2": "Open LDAP Public License v1.2",
|
337 |
-
"oldap-1.3": "Open LDAP Public License v1.3",
|
338 |
-
"oldap-1.4": "Open LDAP Public License v1.4",
|
339 |
-
"oldap-2.0": "Open LDAP Public License v2.0 (or possibly 2.0A and 2.0B)",
|
340 |
-
"oldap-2.0.1": "Open LDAP Public License v2.0.1",
|
341 |
-
"oldap-2.1": "Open LDAP Public License v2.1",
|
342 |
-
"oldap-2.2": "Open LDAP Public License v2.2",
|
343 |
-
"oldap-2.2.1": "Open LDAP Public License v2.2.1",
|
344 |
-
"oldap-2.2.2": "Open LDAP Public License 2.2.2",
|
345 |
-
"oldap-2.3": "Open LDAP Public License v2.3",
|
346 |
-
"oldap-2.4": "Open LDAP Public License v2.4",
|
347 |
-
"oldap-2.5": "Open LDAP Public License v2.5",
|
348 |
-
"oldap-2.6": "Open LDAP Public License v2.6",
|
349 |
-
"oldap-2.7": "Open LDAP Public License v2.7",
|
350 |
-
"oldap-2.8": "Open LDAP Public License v2.8",
|
351 |
-
"oml": "Open Market License",
|
352 |
-
"openssl": "OpenSSL License",
|
353 |
-
"opl-1.0": "Open Public License v1.0",
|
354 |
-
"oset-pl-2.1": "OSET Public License version 2.1",
|
355 |
-
"osl-1.0": "Open Software License 1.0",
|
356 |
-
"osl-1.1": "Open Software License 1.1",
|
357 |
-
"osl-2.0": "Open Software License 2.0",
|
358 |
-
"osl-2.1": "Open Software License 2.1",
|
359 |
-
"osl-3.0": "Open Software License 3.0",
|
360 |
-
"parity-6.0.0": "The Parity Public License 6.0.0",
|
361 |
-
"parity-7.0.0": "The Parity Public License 7.0.0",
|
362 |
-
"pddl-1.0": "ODC Public Domain Dedication & License 1.0",
|
363 |
-
"php-3.0": "PHP License v3.0",
|
364 |
-
"php-3.01": "PHP License v3.01",
|
365 |
-
"plexus": "Plexus Classworlds License",
|
366 |
-
"polyform-noncommercial-1.0.0": "PolyForm Noncommercial License 1.0.0",
|
367 |
-
"polyform-small-business-1.0.0": "PolyForm Small Business License 1.0.0",
|
368 |
-
"postgresql": "PostgreSQL License",
|
369 |
-
"psf-2.0": "Python Software Foundation License 2.0",
|
370 |
-
"psfrag": "psfrag License",
|
371 |
-
"psutils": "psutils License",
|
372 |
-
"python-2.0": "Python License 2.0",
|
373 |
-
"qhull": "Qhull License",
|
374 |
-
"qpl-1.0": "Q Public License 1.0",
|
375 |
-
"rdisc": "Rdisc License",
|
376 |
-
"rhecos-1.1": "Red Hat eCos Public License v1.1",
|
377 |
-
"rpl-1.1": "Reciprocal Public License 1.1",
|
378 |
-
"rpl-1.5": "Reciprocal Public License 1.5",
|
379 |
-
"rpsl-1.0": "RealNetworks Public Source License v1.0",
|
380 |
-
"rsa-md": "RSA Message-Digest License",
|
381 |
-
"rscpl": "Ricoh Source Code Public License",
|
382 |
-
"ruby": "Ruby License",
|
383 |
-
"sax-pd": "Sax Public Domain Notice",
|
384 |
-
"saxpath": "Saxpath License",
|
385 |
-
"scea": "SCEA Shared Source License",
|
386 |
-
"sendmail": "Sendmail License",
|
387 |
-
"sendmail-8.23": "Sendmail License 8.23",
|
388 |
-
"sgi-b-1.0": "SGI Free Software License B v1.0",
|
389 |
-
"sgi-b-1.1": "SGI Free Software License B v1.1",
|
390 |
-
"sgi-b-2.0": "SGI Free Software License B v2.0",
|
391 |
-
"shl-0.5": "Solderpad Hardware License v0.5",
|
392 |
-
"shl-0.51": "Solderpad Hardware License, Version 0.51",
|
393 |
-
"simpl-2.0": "Simple Public License 2.0",
|
394 |
-
"sissl": "Sun Industry Standards Source License v1.1",
|
395 |
-
"sissl-1.2": "Sun Industry Standards Source License v1.2",
|
396 |
-
"sleepycat": "Sleepycat License",
|
397 |
-
"smlnj": "Standard ML of New Jersey License",
|
398 |
-
"smppl": "Secure Messaging Protocol Public License",
|
399 |
-
"snia": "SNIA Public License 1.1",
|
400 |
-
"spencer-86": "Spencer License 86",
|
401 |
-
"spencer-94": "Spencer License 94",
|
402 |
-
"spencer-99": "Spencer License 99",
|
403 |
-
"spl-1.0": "Sun Public License v1.0",
|
404 |
-
"ssh-openssh": "SSH OpenSSH license",
|
405 |
-
"ssh-short": "SSH short notice",
|
406 |
-
"sspl-1.0": "Server Side Public License, v 1",
|
407 |
-
"standardml-nj": "Standard ML of New Jersey License",
|
408 |
-
"sugarcrm-1.1.3": "SugarCRM Public License v1.1.3",
|
409 |
-
"swl": "Scheme Widget Library (SWL) Software License Agreement",
|
410 |
-
"tapr-ohl-1.0": "TAPR Open Hardware License v1.0",
|
411 |
-
"tcl": "TCL/TK License",
|
412 |
-
"tcp-wrappers": "TCP Wrappers License",
|
413 |
-
"tmate": "TMate Open Source License",
|
414 |
-
"torque-1.1": "TORQUE v2.5+ Software License v1.1",
|
415 |
-
"tosl": "Trusster Open Source License",
|
416 |
-
"tu-berlin-1.0": "Technische Universitaet Berlin License 1.0",
|
417 |
-
"tu-berlin-2.0": "Technische Universitaet Berlin License 2.0",
|
418 |
-
"ucl-1.0": "Upstream Compatibility License v1.0",
|
419 |
-
"unicode-dfs-2015": "Unicode License Agreement - Data Files and Software (2015)",
|
420 |
-
"unicode-dfs-2016": "Unicode License Agreement - Data Files and Software (2016)",
|
421 |
-
"unicode-tou": "Unicode Terms of Use",
|
422 |
-
"unlicense": "The Unlicense",
|
423 |
-
"upl-1.0": "Universal Permissive License v1.0",
|
424 |
-
"vim": "Vim License",
|
425 |
-
"vostrom": "VOSTROM Public License for Open Source",
|
426 |
-
"vsl-1.0": "Vovida Software License v1.0",
|
427 |
-
"w3c": "W3C Software Notice and License (2002-12-31)",
|
428 |
-
"w3c-19980720": "W3C Software Notice and License (1998-07-20)",
|
429 |
-
"w3c-20150513": "W3C Software Notice and Document License (2015-05-13)",
|
430 |
-
"watcom-1.0": "Sybase Open Watcom Public License 1.0",
|
431 |
-
"wsuipa": "Wsuipa License",
|
432 |
-
"wtfpl": "Do What The F*ck You Want To Public License",
|
433 |
-
"wxwindows": "wxWindows Library License",
|
434 |
-
"x11": "X11 License",
|
435 |
-
"xerox": "Xerox License",
|
436 |
-
"xfree86-1.1": "XFree86 License 1.1",
|
437 |
-
"xinetd": "xinetd License",
|
438 |
-
"xnet": "X.Net License",
|
439 |
-
"xpp": "XPP License",
|
440 |
-
"xskat": "XSkat License",
|
441 |
-
"ypl-1.0": "Yahoo! Public License v1.0",
|
442 |
-
"ypl-1.1": "Yahoo! Public License v1.1",
|
443 |
-
"zed": "Zed License",
|
444 |
-
"zend-2.0": "Zend License v2.0",
|
445 |
-
"zimbra-1.3": "Zimbra Public License v1.3",
|
446 |
-
"zimbra-1.4": "Zimbra Public License v1.4",
|
447 |
-
"zlib": "zlib License",
|
448 |
-
"zlib-acknowledgement": "zlib/libpng License with Acknowledgement",
|
449 |
-
"zpl-1.1": "Zope Public License 1.1",
|
450 |
-
"zpl-2.0": "Zope Public License 2.0",
|
451 |
-
"zpl-2.1": "Zope Public License 2.1"
|
452 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tagging_app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import json
|
|
|
2 |
from pathlib import Path
|
3 |
from typing import Callable, Dict, List, Tuple
|
4 |
|
@@ -14,6 +15,8 @@ from datasets.utils.metadata import (
|
|
14 |
known_task_ids,
|
15 |
)
|
16 |
|
|
|
|
|
17 |
st.set_page_config(
|
18 |
page_title="HF Dataset Tagging App",
|
19 |
page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
|
@@ -38,7 +41,7 @@ st.markdown(
|
|
38 |
########################
|
39 |
|
40 |
|
41 |
-
def load_ds_datas():
|
42 |
metada_exports = sorted(
|
43 |
[f for f in Path.cwd().iterdir() if f.name.startswith("metadata_")],
|
44 |
key=lambda f: f.lstat().st_mtime,
|
@@ -47,6 +50,7 @@ def load_ds_datas():
|
|
47 |
if len(metada_exports) == 0:
|
48 |
raise ValueError("need to run ./build_metada_file.py at least once")
|
49 |
with metada_exports[0].open() as fi:
|
|
|
50 |
return json.load(fi)
|
51 |
|
52 |
|
@@ -81,18 +85,32 @@ def validate_dict(w: st.delta_generator.DeltaGenerator, state_dict: Dict):
|
|
81 |
w.error(e)
|
82 |
|
83 |
|
84 |
-
def
|
85 |
-
|
86 |
-
"
|
87 |
-
|
88 |
-
"
|
89 |
-
|
90 |
-
"
|
91 |
-
|
92 |
-
"
|
93 |
-
|
94 |
-
"
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
|
98 |
def is_state_empty(state: Dict[str, List]) -> bool:
|
@@ -101,8 +119,9 @@ def is_state_empty(state: Dict[str, List]) -> bool:
|
|
101 |
|
102 |
state = new_state()
|
103 |
datasets_md = load_ds_datas()
|
104 |
-
|
105 |
-
|
|
|
106 |
|
107 |
|
108 |
########################
|
@@ -124,19 +143,26 @@ queryparams = st.experimental_get_query_params()
|
|
124 |
preload = queryparams.get("preload_dataset", list())
|
125 |
preloaded_id = None
|
126 |
initial_state = None
|
127 |
-
|
128 |
-
|
|
|
|
|
129 |
preloaded_id, *_ = preload
|
130 |
-
initial_state =
|
|
|
|
|
131 |
state = initial_state or new_state()
|
132 |
-
|
133 |
|
134 |
preloaded_id = st.sidebar.selectbox(
|
135 |
-
label="Choose dataset to load tag set from", options=
|
136 |
)
|
|
|
137 |
leftbtn, rightbtn = st.sidebar.beta_columns(2)
|
138 |
if leftbtn.button("pre-load"):
|
139 |
-
initial_state =
|
|
|
|
|
140 |
state = initial_state or new_state()
|
141 |
st.experimental_set_query_params(preload_dataset=preloaded_id)
|
142 |
if not is_state_empty(state):
|
@@ -168,6 +194,9 @@ Here is the matching yaml block:
|
|
168 |
leftcol, _, rightcol = st.beta_columns([12, 1, 12])
|
169 |
|
170 |
|
|
|
|
|
|
|
171 |
leftcol.markdown("### Supported tasks")
|
172 |
state["task_categories"] = multiselect(
|
173 |
leftcol,
|
@@ -197,6 +226,9 @@ for task_category in state["task_categories"]:
|
|
197 |
state["task_ids"] = task_specifics
|
198 |
|
199 |
|
|
|
|
|
|
|
200 |
leftcol.markdown("### Languages")
|
201 |
state["multilinguality"] = multiselect(
|
202 |
leftcol,
|
@@ -233,6 +265,10 @@ langtags = leftcol.text_area(
|
|
233 |
)
|
234 |
state["languages"] = langtags.split(";")
|
235 |
|
|
|
|
|
|
|
|
|
236 |
leftcol.markdown("### Dataset creators")
|
237 |
state["language_creators"] = multiselect(
|
238 |
leftcol,
|
@@ -250,6 +286,9 @@ state["annotations_creators"] = multiselect(
|
|
250 |
)
|
251 |
|
252 |
|
|
|
|
|
|
|
253 |
state["licenses"] = multiselect(
|
254 |
leftcol,
|
255 |
"Licenses",
|
@@ -266,7 +305,10 @@ if "other" in state["licenses"]:
|
|
266 |
st.write(f"Registering other-{other_license} license")
|
267 |
state["licenses"][state["licenses"].index("other")] = f"other-{other_license}"
|
268 |
|
269 |
-
|
|
|
|
|
|
|
270 |
pre_select_ext_a = []
|
271 |
if "original" in state["source_datasets"]:
|
272 |
pre_select_ext_a += ["original"]
|
@@ -288,7 +330,7 @@ if "extended" in state["extended"]:
|
|
288 |
"Linked datasets",
|
289 |
"Which other datasets does this one use data from?",
|
290 |
values=pre_select_ext_b,
|
291 |
-
valid_set=
|
292 |
)
|
293 |
if "other" in extended_sources:
|
294 |
other_extended_sources = leftcol.text_input(
|
@@ -299,17 +341,23 @@ if "extended" in state["extended"]:
|
|
299 |
extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
|
300 |
state["source_datasets"] += [f"extended|{src}" for src in extended_sources]
|
301 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
302 |
current_size_cats = state.get("size_categories") or ["unknown"]
|
303 |
ok, nonok = split_known(current_size_cats, known_size_categories)
|
304 |
if len(nonok) > 0:
|
305 |
leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
|
306 |
-
state["size_categories"] = [
|
307 |
-
leftcol.selectbox(
|
308 |
-
"What is the size category of the dataset?",
|
309 |
-
options=known_size_categories,
|
310 |
-
index=known_size_categories.index(ok[0]) if len(ok) > 0 else 0,
|
311 |
-
)
|
312 |
-
]
|
313 |
|
314 |
|
315 |
########################
|
|
|
1 |
import json
|
2 |
+
import logging
|
3 |
from pathlib import Path
|
4 |
from typing import Callable, Dict, List, Tuple
|
5 |
|
|
|
15 |
known_task_ids,
|
16 |
)
|
17 |
|
18 |
+
from apputils import new_state
|
19 |
+
|
20 |
st.set_page_config(
|
21 |
page_title="HF Dataset Tagging App",
|
22 |
page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
|
|
|
41 |
########################
|
42 |
|
43 |
|
44 |
+
def load_ds_datas() -> Dict[str, Dict[str, Dict]]:
|
45 |
metada_exports = sorted(
|
46 |
[f for f in Path.cwd().iterdir() if f.name.startswith("metadata_")],
|
47 |
key=lambda f: f.lstat().st_mtime,
|
|
|
50 |
if len(metada_exports) == 0:
|
51 |
raise ValueError("need to run ./build_metada_file.py at least once")
|
52 |
with metada_exports[0].open() as fi:
|
53 |
+
logging.info(f"loaded {metada_exports[0]}")
|
54 |
return json.load(fi)
|
55 |
|
56 |
|
|
|
85 |
w.error(e)
|
86 |
|
87 |
|
88 |
+
def map_num_examples_to_size_categories(n: int) -> str:
|
89 |
+
if n <= 0:
|
90 |
+
size_cat = "unknown"
|
91 |
+
elif n < 1000:
|
92 |
+
size_cat = "n<1K"
|
93 |
+
elif n < 10000:
|
94 |
+
size_cat = "1K<n<10K"
|
95 |
+
elif n < 100000:
|
96 |
+
size_cat = "10K<n<100K"
|
97 |
+
elif n < 1000000:
|
98 |
+
size_cat = "100K<n<1M"
|
99 |
+
elif n < 10000000:
|
100 |
+
size_cat = "1M<n<10M"
|
101 |
+
elif n < 100000000:
|
102 |
+
size_cat = "10M<n<100M"
|
103 |
+
elif n < 1000000000:
|
104 |
+
size_cat = "100M<n<1B"
|
105 |
+
elif n < 10000000000:
|
106 |
+
size_cat = "1B<n<10B"
|
107 |
+
elif n < 100000000000:
|
108 |
+
size_cat = "10B<n<100B"
|
109 |
+
elif n < 1000000000000:
|
110 |
+
size_cat = "100B<n<1T"
|
111 |
+
else:
|
112 |
+
size_cat = "n>1T"
|
113 |
+
return size_cat
|
114 |
|
115 |
|
116 |
def is_state_empty(state: Dict[str, List]) -> bool:
|
|
|
119 |
|
120 |
state = new_state()
|
121 |
datasets_md = load_ds_datas()
|
122 |
+
dataset_ids = list(datasets_md.keys())
|
123 |
+
dataset_id_to_metadata = {name: mds["metadata"] for name, mds in datasets_md.items()}
|
124 |
+
dataset_id_to_infos = {name: mds["infos"] for name, mds in datasets_md.items()}
|
125 |
|
126 |
|
127 |
########################
|
|
|
143 |
preload = queryparams.get("preload_dataset", list())
|
144 |
preloaded_id = None
|
145 |
initial_state = None
|
146 |
+
initial_infos, initial_info_cfg = None, None
|
147 |
+
dataset_selector_index = 0
|
148 |
+
|
149 |
+
if len(preload) == 1 and preload[0] in dataset_ids:
|
150 |
preloaded_id, *_ = preload
|
151 |
+
initial_state = dataset_id_to_metadata.get(preloaded_id)
|
152 |
+
initial_infos = dataset_id_to_infos.get(preloaded_id)
|
153 |
+
initial_info_cfg = next(iter(initial_infos)) if initial_infos is not None else None # pick first available config
|
154 |
state = initial_state or new_state()
|
155 |
+
dataset_selector_index = dataset_ids.index(preloaded_id)
|
156 |
|
157 |
preloaded_id = st.sidebar.selectbox(
|
158 |
+
label="Choose dataset to load tag set from", options=dataset_ids, index=dataset_selector_index
|
159 |
)
|
160 |
+
|
161 |
leftbtn, rightbtn = st.sidebar.beta_columns(2)
|
162 |
if leftbtn.button("pre-load"):
|
163 |
+
initial_state = dataset_id_to_metadata[preloaded_id]
|
164 |
+
initial_infos = dataset_id_to_infos[preloaded_id]
|
165 |
+
initial_info_cfg = next(iter(initial_infos)) # pick first available config
|
166 |
state = initial_state or new_state()
|
167 |
st.experimental_set_query_params(preload_dataset=preloaded_id)
|
168 |
if not is_state_empty(state):
|
|
|
194 |
leftcol, _, rightcol = st.beta_columns([12, 1, 12])
|
195 |
|
196 |
|
197 |
+
#
|
198 |
+
# TASKS
|
199 |
+
#
|
200 |
leftcol.markdown("### Supported tasks")
|
201 |
state["task_categories"] = multiselect(
|
202 |
leftcol,
|
|
|
226 |
state["task_ids"] = task_specifics
|
227 |
|
228 |
|
229 |
+
#
|
230 |
+
# LANGUAGES
|
231 |
+
#
|
232 |
leftcol.markdown("### Languages")
|
233 |
state["multilinguality"] = multiselect(
|
234 |
leftcol,
|
|
|
265 |
)
|
266 |
state["languages"] = langtags.split(";")
|
267 |
|
268 |
+
|
269 |
+
#
|
270 |
+
# DATASET CREATORS & ORIGINS
|
271 |
+
#
|
272 |
leftcol.markdown("### Dataset creators")
|
273 |
state["language_creators"] = multiselect(
|
274 |
leftcol,
|
|
|
286 |
)
|
287 |
|
288 |
|
289 |
+
#
|
290 |
+
# LICENSES
|
291 |
+
#
|
292 |
state["licenses"] = multiselect(
|
293 |
leftcol,
|
294 |
"Licenses",
|
|
|
305 |
st.write(f"Registering other-{other_license} license")
|
306 |
state["licenses"][state["licenses"].index("other")] = f"other-{other_license}"
|
307 |
|
308 |
+
|
309 |
+
#
|
310 |
+
# LINK TO SUPPORTED DATASETS
|
311 |
+
#
|
312 |
pre_select_ext_a = []
|
313 |
if "original" in state["source_datasets"]:
|
314 |
pre_select_ext_a += ["original"]
|
|
|
330 |
"Linked datasets",
|
331 |
"Which other datasets does this one use data from?",
|
332 |
values=pre_select_ext_b,
|
333 |
+
valid_set=dataset_ids + ["other"],
|
334 |
)
|
335 |
if "other" in extended_sources:
|
336 |
other_extended_sources = leftcol.text_input(
|
|
|
341 |
extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
|
342 |
state["source_datasets"] += [f"extended|{src}" for src in extended_sources]
|
343 |
|
344 |
+
|
345 |
+
#
|
346 |
+
# SIZE CATEGORY
|
347 |
+
#
|
348 |
+
leftcol.markdown("### Size category")
|
349 |
+
logging.info(initial_infos[initial_info_cfg]["splits"] if initial_infos is not None else 0)
|
350 |
+
initial_num_examples = (
|
351 |
+
sum([dct.get("num_examples", 0) for _split, dct in initial_infos[initial_info_cfg].get("splits", dict()).items()])
|
352 |
+
if initial_infos is not None
|
353 |
+
else -1
|
354 |
+
)
|
355 |
+
initial_size_cats = map_num_examples_to_size_categories(initial_num_examples)
|
356 |
+
leftcol.markdown(f"Computed size category from automatically generated dataset info to: `{initial_size_cats}`")
|
357 |
current_size_cats = state.get("size_categories") or ["unknown"]
|
358 |
ok, nonok = split_known(current_size_cats, known_size_categories)
|
359 |
if len(nonok) > 0:
|
360 |
leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
361 |
|
362 |
|
363 |
########################
|
task_set.json
DELETED
@@ -1,86 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"conditional-text-generation": {
|
3 |
-
"description": "data-to-text and text transduction tasks such as translation or summarization",
|
4 |
-
"options": [
|
5 |
-
"machine-translation",
|
6 |
-
"sentence-splitting-fusion",
|
7 |
-
"summarization",
|
8 |
-
"table-to-text",
|
9 |
-
"text-simplification",
|
10 |
-
"explanation-generation",
|
11 |
-
"other-stuctured-to-text",
|
12 |
-
"other"
|
13 |
-
]
|
14 |
-
},
|
15 |
-
"question-answering": {
|
16 |
-
"description": "question answering tasks",
|
17 |
-
"options": [
|
18 |
-
"open-domain-qa",
|
19 |
-
"closed-domain-qa",
|
20 |
-
"multiple-choice-qa",
|
21 |
-
"extractive-qa",
|
22 |
-
"abstractive-qa",
|
23 |
-
"other"
|
24 |
-
]
|
25 |
-
},
|
26 |
-
"sequence-modeling": {
|
27 |
-
"description": "such as language modeling or dialogue",
|
28 |
-
"options": [
|
29 |
-
"dialogue-modeling",
|
30 |
-
"language-modeling",
|
31 |
-
"other-multi-turn",
|
32 |
-
"slot-filling",
|
33 |
-
"other"
|
34 |
-
]
|
35 |
-
},
|
36 |
-
"structure-prediction": {
|
37 |
-
"description": "predicting structural properties of the text, such as syntax",
|
38 |
-
"options": [
|
39 |
-
"coreference-resolution",
|
40 |
-
"named-entity-recognition",
|
41 |
-
"part-of-speech-tagging",
|
42 |
-
"parsing",
|
43 |
-
"other"
|
44 |
-
]
|
45 |
-
},
|
46 |
-
"text-classification": {
|
47 |
-
"description": "predicting a class index or boolean value",
|
48 |
-
"options": [
|
49 |
-
"acceptability-classification",
|
50 |
-
"entity-linking-classification",
|
51 |
-
"fact-checking",
|
52 |
-
"intent-classification",
|
53 |
-
"multi-class-classification",
|
54 |
-
"multi-label-classification",
|
55 |
-
"natural-language-inference",
|
56 |
-
"semantic-similarity-classification",
|
57 |
-
"sentiment-classification",
|
58 |
-
"topic-classification",
|
59 |
-
"other"
|
60 |
-
]
|
61 |
-
},
|
62 |
-
"text-retrieval": {
|
63 |
-
"description": "information or text retrieval tasks",
|
64 |
-
"options": [
|
65 |
-
"document-retrieval",
|
66 |
-
"utterance-retrieval",
|
67 |
-
"entity-linking-retrieval",
|
68 |
-
"fact-checking-retrieval",
|
69 |
-
"other"
|
70 |
-
]
|
71 |
-
},
|
72 |
-
"text-scoring": {
|
73 |
-
"description": "text scoring tasks, predicting a real valued score for some text",
|
74 |
-
"options": [
|
75 |
-
"semantic-similarity-scoring",
|
76 |
-
"sentiment-scoring",
|
77 |
-
"other"
|
78 |
-
]
|
79 |
-
},
|
80 |
-
"other": {
|
81 |
-
"description": "other task family not mentioned here",
|
82 |
-
"options": [
|
83 |
-
"other"
|
84 |
-
]
|
85 |
-
}
|
86 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|