Yacine Jernite committed on
Commit
b468d19
1 Parent(s): 183ff95

streamline

Browse files
Files changed (1) hide show
  1. tagging_app.py +25 -48
tagging_app.py CHANGED
@@ -120,7 +120,6 @@ def filter_features(features, name="", is_sequence=False):
120
  desc += pre_desc
121
  return filtered, desc
122
 
123
-
124
  @st.cache
125
  def find_languages(feature_dict):
126
  if type(feature_dict) in [dict, datasets.features.Features]:
@@ -198,50 +197,30 @@ all_dataset_infos = {} if not load_remote_datasets else load_all_dataset_infos(a
198
  st.sidebar.markdown(app_desc)
199
 
200
  # option to only select from datasets that still need to be annotated
201
- only_missing = st.sidebar.checkbox("Show only un-annotated configs")
202
-
203
- if only_missing:
204
- dataset_choose_list = ["local dataset"] + [did for did, c_dict in all_dataset_infos.items()
205
- if not all([cid in existing_tag_sets.get(did, {}) for cid in c_dict])]
206
- else:
207
- dataset_choose_list = ["local dataset"] + list(all_dataset_infos.keys())
208
-
209
- dataset_id = st.sidebar.selectbox(
210
- label="Choose dataset to tag",
211
- options=dataset_choose_list,
212
- index=0,
213
- )
214
-
215
  all_info_dicts = {}
216
- if dataset_id == "local dataset":
217
- path_to_info = st.sidebar.text_input("Please enter the path to the folder where the dataset_infos.json file was generated", "/path/to/dataset/")
218
- if path_to_info not in ["/path/to/dataset/", ""]:
219
- dataset_infos = json.load(open(pjoin(path_to_info, "dataset_infos.json")))
220
- confs = dataset_infos.keys()
221
- all_info_dicts = {}
222
- for conf, info in dataset_infos.items():
223
- conf_info_dict = dict([(k, info[k]) for k in keep_keys])
224
- all_info_dicts[conf] = conf_info_dict
225
- dataset_id = list(dataset_infos.values())[0]["builder_name"]
226
- else:
227
- dataset_id = "tmp_dir"
228
- all_info_dicts = {
229
- "default":{
230
- 'description': "",
231
- 'features': {},
232
- 'homepage': "",
233
- 'license': "",
234
- 'splits': {},
235
- }
236
- }
237
  else:
238
- all_info_dicts = all_dataset_infos[dataset_id]
 
 
 
 
 
 
 
 
 
239
 
240
- if only_missing:
241
- config_choose_list = [cid for cid in all_info_dicts
242
- if not cid in existing_tag_sets.get(dataset_id, {})]
243
- else:
244
- config_choose_list = list(all_info_dicts.keys())
245
 
246
  config_id = st.sidebar.selectbox(
247
  label="Choose configuration",
@@ -257,9 +236,7 @@ c1, _, c2, _, c3 = st.beta_columns([8, 1, 12, 1, 12])
257
  ########################
258
 
259
  data_desc = f"### Dataset: {dataset_id} | Configuration: {config_id}" + "\n"
260
- data_desc += f"[Homepage]({config_infos['homepage']})" + " | "
261
- data_desc += f"[Data script](https://github.com/huggingface/datasets/blob/master/datasets/{dataset_id}/{dataset_id}.py)" + " | "
262
- data_desc += f"[View examples](https://huggingface.co/nlp/viewer/?dataset={dataset_id}&config={config_id})"
263
  c1.markdown(data_desc)
264
 
265
  with c1.beta_expander("Dataset description:", expanded=True):
@@ -329,7 +306,7 @@ if config_infos["license"] in license_set:
329
  c2.markdown("#### Editing the tag set")
330
  c2.markdown("> *Expand the following boxes to edit the tag set. For each of the questions, choose all that apply, at least one option:*")
331
 
332
- with c2.beta_expander("- Supported tasks"):
333
  task_categories = st.multiselect(
334
  "What categories of task does the dataset support?",
335
  options=list(task_set.keys()),
@@ -352,7 +329,7 @@ with c2.beta_expander("- Supported tasks"):
352
  task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
353
  task_specifics += task_specs
354
 
355
- with c2.beta_expander("- Languages"):
356
  multilinguality = st.multiselect(
357
  "Does the dataset contain more than one language?",
358
  options=list(multilinguality_set.keys()),
@@ -373,7 +350,7 @@ with c2.beta_expander("- Languages"):
373
  format_func= lambda m: f"{m} : {language_set[m]}",
374
  )
375
 
376
- with c2.beta_expander("- Dataset creators"):
377
  language_creators = st.multiselect(
378
  "Where does the text in the dataset come from?",
379
  options=creator_set["language"],
 
120
  desc += pre_desc
121
  return filtered, desc
122
 
 
123
  @st.cache
124
  def find_languages(feature_dict):
125
  if type(feature_dict) in [dict, datasets.features.Features]:
 
197
  st.sidebar.markdown(app_desc)
198
 
199
  # option to only select from datasets that still need to be annotated
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  all_info_dicts = {}
201
+ path_to_info = st.sidebar.text_input("Please enter the path to the folder where the dataset_infos.json file was generated", "/path/to/dataset/")
202
+ if path_to_info not in ["/path/to/dataset/", ""]:
203
+ dataset_infos = json.load(open(pjoin(path_to_info, "dataset_infos.json")))
204
+ confs = dataset_infos.keys()
205
+ all_info_dicts = {}
206
+ for conf, info in dataset_infos.items():
207
+ conf_info_dict = dict([(k, info[k]) for k in keep_keys])
208
+ all_info_dicts[conf] = conf_info_dict
209
+ dataset_id = list(dataset_infos.values())[0]["builder_name"]
 
 
 
 
 
 
 
 
 
 
 
 
210
  else:
211
+ dataset_id = "tmp_dir"
212
+ all_info_dicts = {
213
+ "default":{
214
+ 'description': "",
215
+ 'features': {},
216
+ 'homepage': "",
217
+ 'license': "",
218
+ 'splits': {},
219
+ }
220
+ }
221
 
222
+
223
+ config_choose_list = list(all_info_dicts.keys())
 
 
 
224
 
225
  config_id = st.sidebar.selectbox(
226
  label="Choose configuration",
 
236
  ########################
237
 
238
  data_desc = f"### Dataset: {dataset_id} | Configuration: {config_id}" + "\n"
239
+ data_desc += f"[Homepage]({config_infos['homepage']})"
 
 
240
  c1.markdown(data_desc)
241
 
242
  with c1.beta_expander("Dataset description:", expanded=True):
 
306
  c2.markdown("#### Editing the tag set")
307
  c2.markdown("> *Expand the following boxes to edit the tag set. For each of the questions, choose all that apply, at least one option:*")
308
 
309
+ with c2.beta_expander("- Supported tasks", expanded=True):
310
  task_categories = st.multiselect(
311
  "What categories of task does the dataset support?",
312
  options=list(task_set.keys()),
 
329
  task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
330
  task_specifics += task_specs
331
 
332
+ with c2.beta_expander("- Languages", expanded=True):
333
  multilinguality = st.multiselect(
334
  "Does the dataset contain more than one language?",
335
  options=list(multilinguality_set.keys()),
 
350
  format_func= lambda m: f"{m} : {language_set[m]}",
351
  )
352
 
353
+ with c2.beta_expander("- Dataset creators", expanded=True):
354
  language_creators = st.multiselect(
355
  "Where does the text in the dataset come from?",
356
  options=creator_set["language"],