Spaces:
Running
Running
Yacine Jernite
committed on
Commit
•
b468d19
1
Parent(s):
183ff95
streamline
Browse files
- tagging_app.py +25 -48
tagging_app.py
CHANGED
@@ -120,7 +120,6 @@ def filter_features(features, name="", is_sequence=False):
|
|
120 |
desc += pre_desc
|
121 |
return filtered, desc
|
122 |
|
123 |
-
|
124 |
@st.cache
|
125 |
def find_languages(feature_dict):
|
126 |
if type(feature_dict) in [dict, datasets.features.Features]:
|
@@ -198,50 +197,30 @@ all_dataset_infos = {} if not load_remote_datasets else load_all_dataset_infos(a
|
|
198 |
st.sidebar.markdown(app_desc)
|
199 |
|
200 |
# option to only select from datasets that still need to be annotated
|
201 |
-
only_missing = st.sidebar.checkbox("Show only un-annotated configs")
|
202 |
-
|
203 |
-
if only_missing:
|
204 |
-
dataset_choose_list = ["local dataset"] + [did for did, c_dict in all_dataset_infos.items()
|
205 |
-
if not all([cid in existing_tag_sets.get(did, {}) for cid in c_dict])]
|
206 |
-
else:
|
207 |
-
dataset_choose_list = ["local dataset"] + list(all_dataset_infos.keys())
|
208 |
-
|
209 |
-
dataset_id = st.sidebar.selectbox(
|
210 |
-
label="Choose dataset to tag",
|
211 |
-
options=dataset_choose_list,
|
212 |
-
index=0,
|
213 |
-
)
|
214 |
-
|
215 |
all_info_dicts = {}
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
dataset_id = list(dataset_infos.values())[0]["builder_name"]
|
226 |
-
else:
|
227 |
-
dataset_id = "tmp_dir"
|
228 |
-
all_info_dicts = {
|
229 |
-
"default":{
|
230 |
-
'description': "",
|
231 |
-
'features': {},
|
232 |
-
'homepage': "",
|
233 |
-
'license': "",
|
234 |
-
'splits': {},
|
235 |
-
}
|
236 |
-
}
|
237 |
else:
|
238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
239 |
|
240 |
-
|
241 |
-
|
242 |
-
if not cid in existing_tag_sets.get(dataset_id, {})]
|
243 |
-
else:
|
244 |
-
config_choose_list = list(all_info_dicts.keys())
|
245 |
|
246 |
config_id = st.sidebar.selectbox(
|
247 |
label="Choose configuration",
|
@@ -257,9 +236,7 @@ c1, _, c2, _, c3 = st.beta_columns([8, 1, 12, 1, 12])
|
|
257 |
########################
|
258 |
|
259 |
data_desc = f"### Dataset: {dataset_id} | Configuration: {config_id}" + "\n"
|
260 |
-
data_desc += f"[Homepage]({config_infos['homepage']})"
|
261 |
-
data_desc += f"[Data script](https://github.com/huggingface/datasets/blob/master/datasets/{dataset_id}/{dataset_id}.py)" + " | "
|
262 |
-
data_desc += f"[View examples](https://huggingface.co/nlp/viewer/?dataset={dataset_id}&config={config_id})"
|
263 |
c1.markdown(data_desc)
|
264 |
|
265 |
with c1.beta_expander("Dataset description:", expanded=True):
|
@@ -329,7 +306,7 @@ if config_infos["license"] in license_set:
|
|
329 |
c2.markdown("#### Editing the tag set")
|
330 |
c2.markdown("> *Expand the following boxes to edit the tag set. For each of the questions, choose all that apply, at least one option:*")
|
331 |
|
332 |
-
with c2.beta_expander("- Supported tasks"):
|
333 |
task_categories = st.multiselect(
|
334 |
"What categories of task does the dataset support?",
|
335 |
options=list(task_set.keys()),
|
@@ -352,7 +329,7 @@ with c2.beta_expander("- Supported tasks"):
|
|
352 |
task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
|
353 |
task_specifics += task_specs
|
354 |
|
355 |
-
with c2.beta_expander("- Languages"):
|
356 |
multilinguality = st.multiselect(
|
357 |
"Does the dataset contain more than one language?",
|
358 |
options=list(multilinguality_set.keys()),
|
@@ -373,7 +350,7 @@ with c2.beta_expander("- Languages"):
|
|
373 |
format_func= lambda m: f"{m} : {language_set[m]}",
|
374 |
)
|
375 |
|
376 |
-
with c2.beta_expander("- Dataset creators"):
|
377 |
language_creators = st.multiselect(
|
378 |
"Where does the text in the dataset come from?",
|
379 |
options=creator_set["language"],
|
|
|
120 |
desc += pre_desc
|
121 |
return filtered, desc
|
122 |
|
|
|
123 |
@st.cache
|
124 |
def find_languages(feature_dict):
|
125 |
if type(feature_dict) in [dict, datasets.features.Features]:
|
|
|
197 |
st.sidebar.markdown(app_desc)
|
198 |
|
199 |
# option to only select from datasets that still need to be annotated
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
all_info_dicts = {}
|
201 |
+
path_to_info = st.sidebar.text_input("Please enter the path to the folder where the dataset_infos.json file was generated", "/path/to/dataset/")
|
202 |
+
if path_to_info not in ["/path/to/dataset/", ""]:
|
203 |
+
dataset_infos = json.load(open(pjoin(path_to_info, "dataset_infos.json")))
|
204 |
+
confs = dataset_infos.keys()
|
205 |
+
all_info_dicts = {}
|
206 |
+
for conf, info in dataset_infos.items():
|
207 |
+
conf_info_dict = dict([(k, info[k]) for k in keep_keys])
|
208 |
+
all_info_dicts[conf] = conf_info_dict
|
209 |
+
dataset_id = list(dataset_infos.values())[0]["builder_name"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
210 |
else:
|
211 |
+
dataset_id = "tmp_dir"
|
212 |
+
all_info_dicts = {
|
213 |
+
"default":{
|
214 |
+
'description': "",
|
215 |
+
'features': {},
|
216 |
+
'homepage': "",
|
217 |
+
'license': "",
|
218 |
+
'splits': {},
|
219 |
+
}
|
220 |
+
}
|
221 |
|
222 |
+
|
223 |
+
config_choose_list = list(all_info_dicts.keys())
|
|
|
|
|
|
|
224 |
|
225 |
config_id = st.sidebar.selectbox(
|
226 |
label="Choose configuration",
|
|
|
236 |
########################
|
237 |
|
238 |
data_desc = f"### Dataset: {dataset_id} | Configuration: {config_id}" + "\n"
|
239 |
+
data_desc += f"[Homepage]({config_infos['homepage']})"
|
|
|
|
|
240 |
c1.markdown(data_desc)
|
241 |
|
242 |
with c1.beta_expander("Dataset description:", expanded=True):
|
|
|
306 |
c2.markdown("#### Editing the tag set")
|
307 |
c2.markdown("> *Expand the following boxes to edit the tag set. For each of the questions, choose all that apply, at least one option:*")
|
308 |
|
309 |
+
with c2.beta_expander("- Supported tasks", expanded=True):
|
310 |
task_categories = st.multiselect(
|
311 |
"What categories of task does the dataset support?",
|
312 |
options=list(task_set.keys()),
|
|
|
329 |
task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
|
330 |
task_specifics += task_specs
|
331 |
|
332 |
+
with c2.beta_expander("- Languages", expanded=True):
|
333 |
multilinguality = st.multiselect(
|
334 |
"Does the dataset contain more than one language?",
|
335 |
options=list(multilinguality_set.keys()),
|
|
|
350 |
format_func= lambda m: f"{m} : {language_set[m]}",
|
351 |
)
|
352 |
|
353 |
+
with c2.beta_expander("- Dataset creators", expanded=True):
|
354 |
language_creators = st.multiselect(
|
355 |
"Where does the text in the dataset come from?",
|
356 |
options=creator_set["language"],
|