Spaces:
Runtime error
Runtime error
Yacine Jernite
committed on
Commit
·
9994065
1
Parent(s):
3578aa2
can download
Browse files
- app.py +37 -12
- datacards/curation.py +128 -18
- datacards/gem.py +51 -2
- datacards/overview.py +1 -1
- datacards/results.py +1 -1
app.py
CHANGED
@@ -1,3 +1,6 @@
|
|
|
|
|
|
|
|
1 |
from datetime import datetime
|
2 |
|
3 |
import datacards
|
@@ -60,8 +63,8 @@ def main():
|
|
60 |
pages = {
|
61 |
"Dataset at a Glance": glance_page,
|
62 |
"Section: Dataset Overview": overview_page,
|
63 |
-
"Section: Dataset in GEM": gem_page,
|
64 |
"Section: Dataset Curation": curation_page,
|
|
|
65 |
"Section: Previous Results": results_page,
|
66 |
"Section: Considerations for Using Data": considerations_page,
|
67 |
"Section: Broader Social Context": context_page,
|
@@ -78,30 +81,52 @@ def main():
|
|
78 |
def glance_page():
|
79 |
with st.expander("Dataset at a Glance", expanded=True):
|
80 |
dataset_summary = ""
|
81 |
-
dataset_summary += f"- **Dataset Website**: {st.session_state.save_state.get('overview_where_website', '')}\n"
|
82 |
-
dataset_summary += f"- **Dataset Contact**: {st.session_state.save_state.get('overview_where_contact-name', '')}\n"
|
83 |
-
dataset_summary += f"- **Dataset License**: {st.session_state.save_state.get('overview_languages_license', '')}\n"
|
84 |
-
dataset_summary += f"- **Multilingual Dataset**: {st.session_state.save_state.get('overview_languages_is-multilingual', '')}\n"
|
85 |
-
dataset_summary += f"- **Dataset Languages**: {
|
86 |
-
dataset_summary += f"- **Dataset Supported Task**: {st.session_state.save_state.get('overview_languages_task', '
|
87 |
-
dataset_summary += f"- **Communicative Goal**: {st.session_state.save_state.get('
|
88 |
-
dataset_summary += f"- **Language Data Origin**: {st.session_state.save_state.get('
|
89 |
-
dataset_summary += f"- **Annotation Data Origin**: {st.session_state.save_state.get('
|
90 |
-
dataset_summary += f"- **Likelihood of PII**: {st.session_state.save_state.get('
|
91 |
st.markdown(dataset_summary + "---\n")
|
92 |
num_fields = sum([len(dct) for k in st.session_state.get("card_dict", {}) for dct in st.session_state.card_dict.get(k, {}).values()])
|
93 |
st.markdown(f"You have currently filled out **{num_fields} of {_N_FIELDS} required fields** in the data card.")
|
94 |
left_col, right_col = st.columns(2)
|
95 |
with left_col:
|
96 |
overview_summary()
|
97 |
-
gem_summary()
|
98 |
curation_summary()
|
|
|
99 |
with right_col:
|
100 |
results_summary()
|
101 |
considerations_summary()
|
102 |
context_summary()
|
103 |
|
104 |
def review_page():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
st.write(st.session_state.get("card_dict", {}))
|
106 |
# TODO add buttons to save and download
|
107 |
|
|
|
1 |
+
import json
|
2 |
+
import re
|
3 |
+
|
4 |
from datetime import datetime
|
5 |
|
6 |
import datacards
|
|
|
63 |
pages = {
|
64 |
"Dataset at a Glance": glance_page,
|
65 |
"Section: Dataset Overview": overview_page,
|
|
|
66 |
"Section: Dataset Curation": curation_page,
|
67 |
+
"Section: Dataset in GEM": gem_page,
|
68 |
"Section: Previous Results": results_page,
|
69 |
"Section: Considerations for Using Data": considerations_page,
|
70 |
"Section: Broader Social Context": context_page,
|
|
|
81 |
def glance_page():
|
82 |
with st.expander("Dataset at a Glance", expanded=True):
|
83 |
dataset_summary = ""
|
84 |
+
dataset_summary += f"- **Dataset Website**: {st.session_state.save_state.get('overview_where_website', '*Go to `Section: Dataset Overview` to fill in*')}\n"
|
85 |
+
dataset_summary += f"- **Dataset Contact**: {st.session_state.save_state.get('overview_where_contact-name', '*Go to `Section: Dataset Overview` to fill in*')}\n"
|
86 |
+
dataset_summary += f"- **Dataset License**: {st.session_state.save_state.get('overview_languages_license', '*Go to `Section: Dataset Overview` to fill in*')}\n"
|
87 |
+
dataset_summary += f"- **Multilingual Dataset**: {st.session_state.save_state.get('overview_languages_is-multilingual', '*Go to `Section: Dataset Overview` to fill in*')}\n"
|
88 |
+
dataset_summary += f"- **Dataset Languages**: {st.session_state.save_state.get('overview_languages_language-names', '*Go to `Section: Dataset Overview` to fill in*')}\n"
|
89 |
+
dataset_summary += f"- **Dataset Supported Task**: {st.session_state.save_state.get('overview_languages_task', '*Go to `Section: Dataset Overview` to fill in*')}\n"
|
90 |
+
dataset_summary += f"- **Communicative Goal**: {st.session_state.save_state.get('overview_languages_communicative', '*Go to `Section: Dataset Overview` to fill in*')}\n"
|
91 |
+
dataset_summary += f"- **Language Data Origin**: {st.session_state.save_state.get('curation_language_obtained', '*Go to `Section: Dataset Curation` to fill in*')}\n"
|
92 |
+
dataset_summary += f"- **Annotation Data Origin**: {st.session_state.save_state.get('curation_annotations_obtained', '*Go to `Section: Dataset Curation` to fill in*')}\n"
|
93 |
+
dataset_summary += f"- **Likelihood of PII**: {st.session_state.save_state.get('curation_pii_has-pii', '*Go to `Section: Dataset Curation` to fill in*')}\n"
|
94 |
st.markdown(dataset_summary + "---\n")
|
95 |
num_fields = sum([len(dct) for k in st.session_state.get("card_dict", {}) for dct in st.session_state.card_dict.get(k, {}).values()])
|
96 |
st.markdown(f"You have currently filled out **{num_fields} of {_N_FIELDS} required fields** in the data card.")
|
97 |
left_col, right_col = st.columns(2)
|
98 |
with left_col:
|
99 |
overview_summary()
|
|
|
100 |
curation_summary()
|
101 |
+
gem_summary()
|
102 |
with right_col:
|
103 |
results_summary()
|
104 |
considerations_summary()
|
105 |
context_summary()
|
106 |
|
107 |
def review_page():
|
108 |
+
dataset_name = st.text_input(
|
109 |
+
label="Enter dataset name here",
|
110 |
+
)
|
111 |
+
if dataset_name != "":
|
112 |
+
friendly_name = re.sub(
|
113 |
+
r"[^\w\s]", " ", dataset_name.lower()
|
114 |
+
).strip().replace(" ", "_")
|
115 |
+
current_date = datetime.now().strftime(
|
116 |
+
"%m/%d/%Y, %H:%M:%S"
|
117 |
+
)
|
118 |
+
friendly_date = re.sub(
|
119 |
+
r"[^\w\s]", "_", current_date
|
120 |
+
).replace(" ", "_").replace("__", "_").replace("-", "")
|
121 |
+
dataset_file_name = f"{friendly_name}-{friendly_date}.json"
|
122 |
+
st.download_button(
|
123 |
+
label=f"Download the Dataset Card below as {dataset_file_name}",
|
124 |
+
data=json.dumps(st.session_state.get("card_dict", {}), indent=2),
|
125 |
+
file_name=dataset_file_name,
|
126 |
+
)
|
127 |
+
else:
|
128 |
+
st.markdown("Enter dataset name above to save!")
|
129 |
+
st.markdown("---\n")
|
130 |
st.write(st.session_state.get("card_dict", {}))
|
131 |
# TODO add buttons to save and download
|
132 |
|
datacards/curation.py
CHANGED
@@ -13,10 +13,9 @@ from .streamlit_utils import (
|
|
13 |
N_FIELDS_ORIGINAL = 4
|
14 |
N_FIELDS_LANGUAGE = 12
|
15 |
N_FIELDS_ANNOTATIONS = 10
|
16 |
-
N_FIELDS_CONSENT =
|
17 |
-
N_FIELDS_PII =
|
18 |
-
N_FIELDS_MAINTENANCE =
|
19 |
-
N_FIELDS_GEM = 0
|
20 |
|
21 |
N_FIELDS = (
|
22 |
N_FIELDS_ORIGINAL
|
@@ -25,15 +24,9 @@ N_FIELDS = (
|
|
25 |
+ N_FIELDS_CONSENT
|
26 |
+ N_FIELDS_PII
|
27 |
+ N_FIELDS_MAINTENANCE
|
28 |
-
+ N_FIELDS_GEM
|
29 |
)
|
30 |
|
31 |
|
32 |
-
"""
|
33 |
-
What was the selection criteria? [Describe the process for selecting instances to include in the dataset, including any tools used.]
|
34 |
-
"""
|
35 |
-
|
36 |
-
|
37 |
def curation_page():
|
38 |
st.session_state.card_dict["curation"] = st.session_state.card_dict.get(
|
39 |
"curation", {}
|
@@ -64,6 +57,7 @@ def curation_page():
|
|
64 |
key_list=key_pref + ["aggregated-sources"],
|
65 |
help="Otherwise, type N/A",
|
66 |
)
|
|
|
67 |
with st.expander("Language Data", expanded=False):
|
68 |
key_pref = ["curation", "language"]
|
69 |
st.session_state.card_dict["curation"]["language"] = st.session_state.card_dict[
|
@@ -222,27 +216,144 @@ def curation_page():
|
|
222 |
help="Describe how quality was ensured in the data curation process.",
|
223 |
)
|
224 |
|
225 |
-
|
226 |
with st.expander("Consent", expanded=False):
|
227 |
key_pref = ["curation", "consent"]
|
228 |
st.session_state.card_dict["curation"]["consent"] = st.session_state.card_dict[
|
229 |
"curation"
|
230 |
].get("consent", {})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
with st.expander("Private Identifying Information (PII)", expanded=False):
|
232 |
key_pref = ["curation", "pii"]
|
233 |
st.session_state.card_dict["curation"]["pii"] = st.session_state.card_dict[
|
234 |
"curation"
|
235 |
].get("pii", {})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
with st.expander("Maintenance", expanded=False):
|
237 |
key_pref = ["curation", "maintenance"]
|
238 |
st.session_state.card_dict["curation"][
|
239 |
"maintenance"
|
240 |
] = st.session_state.card_dict["curation"].get("maintenance", {})
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
"
|
245 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
246 |
|
247 |
|
248 |
def curation_summary():
|
@@ -254,7 +365,7 @@ def curation_summary():
|
|
254 |
):
|
255 |
completion_markdown = ""
|
256 |
completion_markdown += (
|
257 |
-
f"- **Overall
|
258 |
)
|
259 |
completion_markdown += f"- **Sub-section - Original Curation:**\n - {len(st.session_state.card_dict.get('curation', {}).get('original', {}))} of {N_FIELDS_ORIGINAL} fields\n"
|
260 |
completion_markdown += f"- **Sub-section - Language Data:**\n - {len(st.session_state.card_dict.get('curation', {}).get('language', {}))} of {N_FIELDS_LANGUAGE} fields\n"
|
@@ -262,5 +373,4 @@ def curation_summary():
|
|
262 |
completion_markdown += f"- **Sub-section - Consent:**\n - {len(st.session_state.card_dict.get('curation', {}).get('consent', {}))} of {N_FIELDS_CONSENT} fields\n"
|
263 |
completion_markdown += f"- **Sub-section - PII:**\n - {len(st.session_state.card_dict.get('curation', {}).get('pii', {}))} of {N_FIELDS_PII} fields\n"
|
264 |
completion_markdown += f"- **Sub-section - Maintenance:**\n - {len(st.session_state.card_dict.get('curation', {}).get('maintenance', {}))} of {N_FIELDS_MAINTENANCE} fields\n"
|
265 |
-
completion_markdown += f"- **Sub-section - GEM Curation:**\n - {len(st.session_state.card_dict.get('curation', {}).get('gem', {}))} of {N_FIELDS_GEM} fields\n"
|
266 |
st.markdown(completion_markdown)
|
|
|
13 |
N_FIELDS_ORIGINAL = 4
|
14 |
N_FIELDS_LANGUAGE = 12
|
15 |
N_FIELDS_ANNOTATIONS = 10
|
16 |
+
N_FIELDS_CONSENT = 4
|
17 |
+
N_FIELDS_PII = 7
|
18 |
+
N_FIELDS_MAINTENANCE = 6
|
|
|
19 |
|
20 |
N_FIELDS = (
|
21 |
N_FIELDS_ORIGINAL
|
|
|
24 |
+ N_FIELDS_CONSENT
|
25 |
+ N_FIELDS_PII
|
26 |
+ N_FIELDS_MAINTENANCE
|
|
|
27 |
)
|
28 |
|
29 |
|
|
|
|
|
|
|
|
|
|
|
30 |
def curation_page():
|
31 |
st.session_state.card_dict["curation"] = st.session_state.card_dict.get(
|
32 |
"curation", {}
|
|
|
57 |
key_list=key_pref + ["aggregated-sources"],
|
58 |
help="Otherwise, type N/A",
|
59 |
)
|
60 |
+
|
61 |
with st.expander("Language Data", expanded=False):
|
62 |
key_pref = ["curation", "language"]
|
63 |
st.session_state.card_dict["curation"]["language"] = st.session_state.card_dict[
|
|
|
216 |
help="Describe how quality was ensured in the data curation process.",
|
217 |
)
|
218 |
|
|
|
219 |
with st.expander("Consent", expanded=False):
|
220 |
key_pref = ["curation", "consent"]
|
221 |
st.session_state.card_dict["curation"]["consent"] = st.session_state.card_dict[
|
222 |
"curation"
|
223 |
].get("consent", {})
|
224 |
+
make_radio(
|
225 |
+
label="Was there a consent policy involved when gathering the data?",
|
226 |
+
options=["no", "yes"],
|
227 |
+
key_list=key_pref+["has-consent"],
|
228 |
+
)
|
229 |
+
if st.session_state.card_dict["curation"]["consent"]["has-consent"] == "yes":
|
230 |
+
make_text_area(
|
231 |
+
label="What was the consent policy?",
|
232 |
+
key_list=key_pref+["consent-policy"],
|
233 |
+
help="If available, provide the text that data creators were shown, else, describe the process.",
|
234 |
+
)
|
235 |
+
make_text_area(
|
236 |
+
label="What other downstream uses of the data did the original data creators and the data curators consent to?",
|
237 |
+
key_list=key_pref+["consent-other"],
|
238 |
+
)
|
239 |
+
st.session_state.card_dict["curation"]["consent"]["no-consent-justification"] = "N/A"
|
240 |
+
else:
|
241 |
+
st.session_state.card_dict["curation"]["consent"]["consent-policy"] = "N/A"
|
242 |
+
st.session_state.card_dict["curation"]["consent"]["consent-other"] = "N/A"
|
243 |
+
make_text_area(
|
244 |
+
label="If not, what is the justification for reusing the data? ",
|
245 |
+
key_list=key_pref+["no-consent-justification"],
|
246 |
+
help="Why would be a justification the data without consent of the data creators in this case?",
|
247 |
+
)
|
248 |
+
|
249 |
with st.expander("Private Identifying Information (PII)", expanded=False):
|
250 |
key_pref = ["curation", "pii"]
|
251 |
st.session_state.card_dict["curation"]["pii"] = st.session_state.card_dict[
|
252 |
"curation"
|
253 |
].get("pii", {})
|
254 |
+
make_radio(
|
255 |
+
label="Does the source language data likely contain Personal Identifying Information about the data creators or subjects?",
|
256 |
+
options=["yes/very likely", "likely", "unlikely", "no PII"],
|
257 |
+
key_list=key_pref+["has-pii"],
|
258 |
+
help="most datasets have some form of PII: names, addresses, emails, account names, personal beliefs, gender, etc. - select `no PII` only if sure",
|
259 |
+
)
|
260 |
+
if st.session_state.card_dict["curation"]["pii"]["has-pii"] == "no PII":
|
261 |
+
make_text_area(
|
262 |
+
label="Provide a justification for selecting `no PII` above.",
|
263 |
+
key_list=key_pref+["no-pii-justification"],
|
264 |
+
help="for example, if the text is about general knowledge without references to the author or to any persons.",
|
265 |
+
)
|
266 |
+
st.session_state.card_dict["curation"]["pii"]["pii-categories"] = []
|
267 |
+
st.session_state.card_dict["curation"]["pii"]["is-pii-identified"] = "N/A"
|
268 |
+
st.session_state.card_dict["curation"]["pii"]["pii-identified-method"] = "N/A"
|
269 |
+
st.session_state.card_dict["curation"]["pii"]["is-pii-replaced"] = "N/A"
|
270 |
+
st.session_state.card_dict["curation"]["pii"]["pii-replaced-method"] = "N/A"
|
271 |
+
else:
|
272 |
+
st.session_state.card_dict["curation"]["pii"]["no-pii-justification"] = "N/A"
|
273 |
+
pii_help_text = """
|
274 |
+
- Personally identifying general information includes names, physical and email addresses, website accounts with names or handles, dates (birth, death, etc.), full-face photographs and comparable images, URLS, and biometric identifiers (fingerprints, voice, etc.).
|
275 |
+
- Personally identifying numbers include information such as telephone numbers, fax numbers, vehicle and device identifiers and serial numbers, social security numbers and equivalent, IP addresses, medical record numbers, health plan beneficiary numbers, account numbers, certificate/license numbers, and any other uniquely identifying numbers.
|
276 |
+
- Sensitive information includes descriptions of racial or ethnic origin, political opinions, religious or philosophical beliefs, trade-union membership, genetic data, health-related data, and data concerning a person's sex life or sexual orientation.
|
277 |
+
"""
|
278 |
+
make_multiselect(
|
279 |
+
label="What categories of PII are present or suspected in the data?",
|
280 |
+
options=["generic PII", "numeric PII", "sensitive information"],
|
281 |
+
key_list=key_pref+["pii-categories"],
|
282 |
+
help=pii_help_text,
|
283 |
+
)
|
284 |
+
make_radio(
|
285 |
+
label="Did the curators use any automatic/manual method to identify PII in the dataset?",
|
286 |
+
options=["no identification", "manual identification", "automatic identification", "mixed method"],
|
287 |
+
key_list=key_pref+["is-pii-identified"],
|
288 |
+
)
|
289 |
+
if st.session_state.card_dict["curation"]["pii"]["is-pii-identified"] == "no identification":
|
290 |
+
st.session_state.card_dict["curation"]["pii"]["pii-identified-method"] = "N/A"
|
291 |
+
st.session_state.card_dict["curation"]["pii"]["is-pii-replaced"] = "N/A"
|
292 |
+
st.session_state.card_dict["curation"]["pii"]["pii-replaced-method"] = "N/A"
|
293 |
+
else:
|
294 |
+
make_text_area(
|
295 |
+
label="Describe the method used to identify PII in the dataset",
|
296 |
+
key_list=key_pref+["pii-identified-method"],
|
297 |
+
)
|
298 |
+
make_radio(
|
299 |
+
label="Was the PII pseudonymized/handled somehow?",
|
300 |
+
options=["no", "yes"],
|
301 |
+
key_list=key_pref+["is-pii-replaced"],
|
302 |
+
)
|
303 |
+
if st.session_state.card_dict["curation"]["pii"]["is-pii-replaced"] == "yes":
|
304 |
+
make_text_area(
|
305 |
+
label="Describe the methods that were used to process the PII",
|
306 |
+
key_list=key_pref+["pii-replaced-method"],
|
307 |
+
)
|
308 |
+
else:
|
309 |
+
st.session_state.card_dict["curation"]["pii"]["pii-replaced-method"] = "N/A"
|
310 |
+
|
311 |
with st.expander("Maintenance", expanded=False):
|
312 |
key_pref = ["curation", "maintenance"]
|
313 |
st.session_state.card_dict["curation"][
|
314 |
"maintenance"
|
315 |
] = st.session_state.card_dict["curation"].get("maintenance", {})
|
316 |
+
make_radio(
|
317 |
+
label="Does the original dataset have a maintenance plan?",
|
318 |
+
options=["no", "yes"],
|
319 |
+
key_list=key_pref+["has-maintenance"],
|
320 |
+
help="this can include planned update or a commitment to removing content on request",
|
321 |
+
)
|
322 |
+
if st.session_state.card_dict["curation"]["maintenance"]["has-maintenance"] == "yes":
|
323 |
+
make_text_area(
|
324 |
+
label="Describe the original dataset's maintenance plan.",
|
325 |
+
key_list=key_pref+["description"],
|
326 |
+
)
|
327 |
+
make_text_area(
|
328 |
+
label="Provide contact information of a person responsible for the dataset maintenance",
|
329 |
+
key_list=key_pref+["contact"],
|
330 |
+
)
|
331 |
+
make_radio(
|
332 |
+
label="Does the maintenance plan include a contestation mechanism allowing individuals to request removal fo content?",
|
333 |
+
options=["no mechanism", "form submission", "contact maintainer", "other"],
|
334 |
+
key_list=key_pref+["contestation-mechanism"],
|
335 |
+
)
|
336 |
+
if st.session_state.card_dict["curation"]["maintenance"]["contestation-mechanism"] == "no mechanism":
|
337 |
+
st.session_state.card_dict["curation"]["maintenance"]["contestation-link"] = "N/A"
|
338 |
+
st.session_state.card_dict["curation"]["maintenance"]["contestation-description"] = "N/A"
|
339 |
+
elif st.session_state.card_dict["curation"]["maintenance"]["contestation-mechanism"] == "other":
|
340 |
+
st.session_state.card_dict["curation"]["maintenance"]["contestation-link"] = "N/A"
|
341 |
+
make_text_area(
|
342 |
+
label="Describe the contestation mechanism",
|
343 |
+
key_list=key_pref+["contestation-description"],
|
344 |
+
)
|
345 |
+
else:
|
346 |
+
make_text_input(
|
347 |
+
label="Provide the form link or contact information",
|
348 |
+
key_list=key_pref+["contestation-link"],
|
349 |
+
)
|
350 |
+
st.session_state.card_dict["curation"]["maintenance"]["contestation-description"] = "N/A"
|
351 |
+
else:
|
352 |
+
st.session_state.card_dict["curation"]["maintenance"]["description"] = "N/A"
|
353 |
+
st.session_state.card_dict["curation"]["maintenance"]["contact"] = "N/A"
|
354 |
+
st.session_state.card_dict["curation"]["maintenance"]["contestation-mechanism"] = "N/A"
|
355 |
+
st.session_state.card_dict["curation"]["maintenance"]["contestation-link"] = "N/A"
|
356 |
+
st.session_state.card_dict["curation"]["maintenance"]["contestation-description"] = "N/A"
|
357 |
|
358 |
|
359 |
def curation_summary():
|
|
|
365 |
):
|
366 |
completion_markdown = ""
|
367 |
completion_markdown += (
|
368 |
+
f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"
|
369 |
)
|
370 |
completion_markdown += f"- **Sub-section - Original Curation:**\n - {len(st.session_state.card_dict.get('curation', {}).get('original', {}))} of {N_FIELDS_ORIGINAL} fields\n"
|
371 |
completion_markdown += f"- **Sub-section - Language Data:**\n - {len(st.session_state.card_dict.get('curation', {}).get('language', {}))} of {N_FIELDS_LANGUAGE} fields\n"
|
|
|
373 |
completion_markdown += f"- **Sub-section - Consent:**\n - {len(st.session_state.card_dict.get('curation', {}).get('consent', {}))} of {N_FIELDS_CONSENT} fields\n"
|
374 |
completion_markdown += f"- **Sub-section - PII:**\n - {len(st.session_state.card_dict.get('curation', {}).get('pii', {}))} of {N_FIELDS_PII} fields\n"
|
375 |
completion_markdown += f"- **Sub-section - Maintenance:**\n - {len(st.session_state.card_dict.get('curation', {}).get('maintenance', {}))} of {N_FIELDS_MAINTENANCE} fields\n"
|
|
|
376 |
st.markdown(completion_markdown)
|
datacards/gem.py
CHANGED
@@ -3,14 +3,16 @@ import streamlit as st
|
|
3 |
from .streamlit_utils import make_text_input
|
4 |
|
5 |
from .streamlit_utils import (
|
|
|
6 |
make_text_area,
|
7 |
make_radio,
|
8 |
)
|
9 |
|
10 |
N_FIELDS_RATIONALE = 5
|
|
|
11 |
N_FIELDS_STARTING = 2
|
12 |
|
13 |
-
N_FIELDS = N_FIELDS_RATIONALE + N_FIELDS_STARTING
|
14 |
|
15 |
|
16 |
def gem_page():
|
@@ -47,6 +49,51 @@ def gem_page():
|
|
47 |
key_list=key_pref + ["model-ability"],
|
48 |
help="What kind of abilities should a model exhibit that performs well on the task of this dataset (e.g., reasoning capability, morphological inflection)?",
|
49 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
with st.expander("Getting Started", expanded=False):
|
51 |
key_pref = ["gem", "starting"]
|
52 |
st.session_state.card_dict["gem"]["starting"] = st.session_state.card_dict[
|
@@ -64,6 +111,7 @@ def gem_page():
|
|
64 |
)
|
65 |
|
66 |
|
|
|
67 |
def gem_summary():
|
68 |
total_filled = sum(
|
69 |
[len(dct) for dct in st.session_state.card_dict.get("gem", {}).values()]
|
@@ -73,8 +121,9 @@ def gem_summary():
|
|
73 |
):
|
74 |
completion_markdown = ""
|
75 |
completion_markdown += (
|
76 |
-
f"- **Overall
|
77 |
)
|
78 |
completion_markdown += f"- **Sub-section - Rationale:**\n - {len(st.session_state.card_dict.get('gem', {}).get('rationale', {}))} of {N_FIELDS_RATIONALE} fields\n"
|
|
|
79 |
completion_markdown += f"- **Sub-section - Getting Started:**\n - {len(st.session_state.card_dict.get('gem', {}).get('starting', {}))} of {N_FIELDS_STARTING} fields\n"
|
80 |
st.markdown(completion_markdown)
|
|
|
3 |
from .streamlit_utils import make_text_input
|
4 |
|
5 |
from .streamlit_utils import (
|
6 |
+
make_multiselect,
|
7 |
make_text_area,
|
8 |
make_radio,
|
9 |
)
|
10 |
|
11 |
N_FIELDS_RATIONALE = 5
|
12 |
+
N_FIELDS_CURATION = 6
|
13 |
N_FIELDS_STARTING = 2
|
14 |
|
15 |
+
N_FIELDS = N_FIELDS_RATIONALE + N_FIELDS_CURATION + N_FIELDS_STARTING
|
16 |
|
17 |
|
18 |
def gem_page():
|
|
|
49 |
key_list=key_pref + ["model-ability"],
|
50 |
help="What kind of abilities should a model exhibit that performs well on the task of this dataset (e.g., reasoning capability, morphological inflection)?",
|
51 |
)
|
52 |
+
|
53 |
+
with st.expander("GEM Additional Curation", expanded=False):
|
54 |
+
key_pref = ["gem", "curation"]
|
55 |
+
st.session_state.card_dict["gem"]["curation"] = st.session_state.card_dict[
|
56 |
+
"gem"
|
57 |
+
].get("curation", {})
|
58 |
+
make_radio(
|
59 |
+
label="Has the GEM version of the dataset been modified in any way (data, processing, splits) from the original curated data?",
|
60 |
+
options=["no", "yes"],
|
61 |
+
key_list=key_pref+["has-additional-curation"],
|
62 |
+
)
|
63 |
+
if st.session_state.card_dict["gem"]["curation"]["has-additional-curation"] == "yes":
|
64 |
+
make_multiselect(
|
65 |
+
label="What changes have been made to he original dataset?",
|
66 |
+
options=["data points added", "data points removed", "data points modified", "annotations added", "other"],
|
67 |
+
key_list=key_pref+["modification-types"],
|
68 |
+
)
|
69 |
+
make_text_area(
|
70 |
+
label="For each of these changes, described them in more details and provided the intended purpose of the modification",
|
71 |
+
key_list=key_pref+["modification-description"],
|
72 |
+
)
|
73 |
+
make_radio(
|
74 |
+
label="Does GEM provide additional splits to the dataset?",
|
75 |
+
options=["no", "yes"],
|
76 |
+
key_list=key_pref+["has-additional-splits"],
|
77 |
+
)
|
78 |
+
if st.session_state.card_dict["gem"]["curation"]["has-additional-splits"] == "yes":
|
79 |
+
make_text_area(
|
80 |
+
label="Describe how the new splits were created",
|
81 |
+
key_list=key_pref+["additional-splits-description"],
|
82 |
+
)
|
83 |
+
make_text_area(
|
84 |
+
label="What aspects of the model's generation capacities were the splits created to test?",
|
85 |
+
key_list=key_pref+["additional-splits-capacicites"],
|
86 |
+
)
|
87 |
+
else:
|
88 |
+
st.session_state.card_dict["gem"]["curation"]["additional-splits-description"] = "N/A"
|
89 |
+
st.session_state.card_dict["gem"]["curation"]["additional-splits-capacicites"] = "N/A"
|
90 |
+
else:
|
91 |
+
st.session_state.card_dict["gem"]["curation"]["modification-types"] = []
|
92 |
+
st.session_state.card_dict["gem"]["curation"]["modification-description"] = "N/A"
|
93 |
+
st.session_state.card_dict["gem"]["curation"]["has-additional-splits"] = "no"
|
94 |
+
st.session_state.card_dict["gem"]["curation"]["additional-splits-description"] = "N/A"
|
95 |
+
st.session_state.card_dict["gem"]["curation"]["additional-splits-capacicites"] = "N/A"
|
96 |
+
|
97 |
with st.expander("Getting Started", expanded=False):
|
98 |
key_pref = ["gem", "starting"]
|
99 |
st.session_state.card_dict["gem"]["starting"] = st.session_state.card_dict[
|
|
|
111 |
)
|
112 |
|
113 |
|
114 |
+
|
115 |
def gem_summary():
|
116 |
total_filled = sum(
|
117 |
[len(dct) for dct in st.session_state.card_dict.get("gem", {}).values()]
|
|
|
121 |
):
|
122 |
completion_markdown = ""
|
123 |
completion_markdown += (
|
124 |
+
f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"
|
125 |
)
|
126 |
completion_markdown += f"- **Sub-section - Rationale:**\n - {len(st.session_state.card_dict.get('gem', {}).get('rationale', {}))} of {N_FIELDS_RATIONALE} fields\n"
|
127 |
+
completion_markdown += f"- **Sub-section - GEM Additional Curation:**\n - {len(st.session_state.card_dict.get('gem', {}).get('curation', {}))} of {N_FIELDS_CURATION} fields\n"
|
128 |
completion_markdown += f"- **Sub-section - Getting Started:**\n - {len(st.session_state.card_dict.get('gem', {}).get('starting', {}))} of {N_FIELDS_STARTING} fields\n"
|
129 |
st.markdown(completion_markdown)
|
datacards/overview.py
CHANGED
@@ -222,7 +222,7 @@ def overview_summary():
|
|
222 |
):
|
223 |
completion_markdown = ""
|
224 |
completion_markdown += (
|
225 |
-
f"- **Overall
|
226 |
)
|
227 |
completion_markdown += f"- **Sub-section - Where to find:**\n - {len(st.session_state.card_dict.get('overview', {}).get('where', {}))} of {N_FIELDS_WHERE} fields\n"
|
228 |
completion_markdown += f"- **Sub-section - Languages and Intended Use:**\n - {len(st.session_state.card_dict.get('overview', {}).get('languages', {}))} of {N_FIELDS_LANGUAGES} fields\n"
|
|
|
222 |
):
|
223 |
completion_markdown = ""
|
224 |
completion_markdown += (
|
225 |
+
f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"
|
226 |
)
|
227 |
completion_markdown += f"- **Sub-section - Where to find:**\n - {len(st.session_state.card_dict.get('overview', {}).get('where', {}))} of {N_FIELDS_WHERE} fields\n"
|
228 |
completion_markdown += f"- **Sub-section - Languages and Intended Use:**\n - {len(st.session_state.card_dict.get('overview', {}).get('languages', {}))} of {N_FIELDS_LANGUAGES} fields\n"
|
datacards/results.py
CHANGED
@@ -82,7 +82,7 @@ def results_summary():
|
|
82 |
):
|
83 |
completion_markdown = ""
|
84 |
completion_markdown += (
|
85 |
-
f"- **Overall
|
86 |
)
|
87 |
completion_markdown += f"- **Sub-section - Previous Results:**\n - {len(st.session_state.card_dict.get('results', {}).get('results', {}))} of {N_FIELDS} fields\n"
|
88 |
st.markdown(completion_markdown)
|
|
|
82 |
):
|
83 |
completion_markdown = ""
|
84 |
completion_markdown += (
|
85 |
+
f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"
|
86 |
)
|
87 |
completion_markdown += f"- **Sub-section - Previous Results:**\n - {len(st.session_state.card_dict.get('results', {}).get('results', {}))} of {N_FIELDS} fields\n"
|
88 |
st.markdown(completion_markdown)
|