Spaces:

huggingface
/

datasets-tagging

Running

Quentin Lhoest committed on Apr 5, 2022

Commit

9242f47

•

1 Parent(s): 40a1ebe

update task taxonomy

- the `datasets` version is installed from the PR branch on GitHub for now
- pre-loaded tags come from the PR that updates all the datasets
- the size category must now be specified by the user

Files changed (4) hide show

build_metadata_file.py +3 -1
metadata_8418b1d4ebac7c59372c5a55556522584891ba9c.json +0 -3
requirements.txt +1 -1
tagging_app.py +24 -13

build_metadata_file.py CHANGED Viewed

@@ -14,6 +14,8 @@ import yaml
 from apputils import new_state
 def metadata_from_readme(f: Path) -> Dict:
     with f.open() as fi:
@@ -29,7 +31,7 @@ def load_ds_datas():
     if drepo.exists() and drepo.is_dir():
         check_call(["git", "pull"], cwd=drepo)
     else:
-        check_call(["git", "clone", "https://github.com/huggingface/datasets.git"])
     head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=drepo)
     datasets_md = dict()

 from apputils import new_state
+DATASETS_BRANCH = "tasks-alignment-with-models"
 def metadata_from_readme(f: Path) -> Dict:
     with f.open() as fi:
     if drepo.exists() and drepo.is_dir():
         check_call(["git", "pull"], cwd=drepo)
     else:
+        check_call(["git", "clone", "-b", DATASETS_BRANCH, "https://github.com/huggingface/datasets.git"])
     head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=drepo)
     datasets_md = dict()

metadata_8418b1d4ebac7c59372c5a55556522584891ba9c.json DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a72566a87cb959e17e04840367969ef3a5966db12f82039ca6faea9b87da54d9
-size 29912341

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
 pyyaml
-datasets==1.9.0
 streamlit>=0.88.0
 langcodes[data]

 pyyaml
 streamlit>=0.88.0
 langcodes[data]
+git+https://github.com/huggingface/datasets.git@update-task-list

tagging_app.py CHANGED Viewed

@@ -73,20 +73,22 @@ def multiselect(
     if len(invalid_values) > 0:
         w.markdown("Found the following invalid values:")
         w.error(invalid_values)
-    return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func)
 def validate_dict(w: st.delta_generator.DeltaGenerator, state_dict: Dict):
     try:
         DatasetMetadata(**state_dict)
         w.markdown("✅ This is a valid tagset! 🤗")
     except Exception as e:
         w.markdown("❌ This is an invalid tagset, here are the errors in it:")
         w.error(e)
-def map_num_examples_to_size_categories(n: int) -> str:
-    if n <= 0:
         size_cat = "unknown"
     elif n < 1000:
         size_cat = "n<1K"
@@ -212,8 +214,7 @@ state["task_categories"] = multiselect(
     "Task category",
     "What categories of task does the dataset support?",
     values=state["task_categories"],
-    valid_set=list(known_task_ids.keys()),
-    format_func=lambda tg: f"{tg}: {known_task_ids[tg]['description']}",
 )
 task_specifics = []
 for task_category in state["task_categories"]:
@@ -221,8 +222,8 @@ for task_category in state["task_categories"]:
         leftcol,
         f"Specific _{task_category}_ tasks",
         f"What specific tasks does the dataset support?",
-        values=[ts for ts in (state["task_ids"] or []) if ts in known_task_ids[task_category]["options"]],
-        valid_set=known_task_ids[task_category]["options"],
     )
     if "other" in specs:
         other_task = leftcol.text_input(
@@ -355,14 +356,24 @@ initial_num_examples = (
     if initial_infos is not None
     else -1
 )
-initial_size_cats = map_num_examples_to_size_categories(initial_num_examples)
-leftcol.markdown(f"Computed size category from automatically generated dataset info to: `{initial_size_cats}`")
-current_size_cats = state.get("size_categories") or ["unknown"]
-ok, nonok = split_known(current_size_cats, known_size_categories)
 if len(nonok) > 0:
     leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
-else:
-    state["size_categories"] = [initial_size_cats]
 ########################

     if len(invalid_values) > 0:
         w.markdown("Found the following invalid values:")
         w.error(invalid_values)
+    return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func, key=title)
 def validate_dict(w: st.delta_generator.DeltaGenerator, state_dict: Dict):
     try:
         DatasetMetadata(**state_dict)
+        if not state_dict.get("pretty_name"):
+            raise ValueError("Please specify a non-empty Dataset name.")
         w.markdown("✅ This is a valid tagset! 🤗")
     except Exception as e:
         w.markdown("❌ This is an invalid tagset, here are the errors in it:")
         w.error(e)
+def map_num_examples_to_size_category(n: int) -> str:
+    if n < 0:
         size_cat = "unknown"
     elif n < 1000:
         size_cat = "n<1K"
     "Task category",
     "What categories of task does the dataset support?",
     values=state["task_categories"],
+    valid_set=sorted(list(known_task_ids.keys())),
 )
 task_specifics = []
 for task_category in state["task_categories"]:
         leftcol,
         f"Specific _{task_category}_ tasks",
         f"What specific tasks does the dataset support?",
+        values=[ts for ts in (state["task_ids"] or []) if ts in known_task_ids[task_category].get("subtasks", [])],
+        valid_set=known_task_ids[task_category].get("subtasks", []),
     )
     if "other" in specs:
         other_task = leftcol.text_input(
     if initial_infos is not None
     else -1
 )
+if initial_num_examples >= 0:
+    initial_size_categories = [map_num_examples_to_size_category(initial_num_examples)]
+else:
+    initial_size_categories = []
+current_size_cats = multiselect(
+    leftcol,
+    f"Size category",
+    f"How many samples are there in the dataset?",
+    values=initial_size_categories,
+    valid_set=known_size_categories,
+)
+if initial_size_categories:
+    leftcol.markdown(f"Computed size category from automatically generated dataset info to: `{initial_size_categories}`")
+prev_size_cats = state.get("size_categories") or []
+ok, nonok = split_known(prev_size_cats, known_size_categories)
 if len(nonok) > 0:
     leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
+state["size_categories"] = current_size_cats
 ########################