davidberenstein1957 HF staff committed on
Commit
fc100c2
1 Parent(s): b445efe

feat: Add functionality to add records to existing dataset

Browse files
src/distilabel_dataset_generator/apps/sft.py CHANGED
@@ -370,6 +370,7 @@ def push_to_argilla(
370
  def validate_argilla_dataset_name(
371
  dataset_name: str,
372
  final_dataset: pd.DataFrame,
 
373
  oauth_token: Union[OAuthToken, None] = None,
374
  progress=gr.Progress(),
375
  ) -> str:
@@ -379,7 +380,7 @@ def validate_argilla_dataset_name(
379
  if dataset_name is None or dataset_name == "":
380
  raise gr.Error("Dataset name is required")
381
  dataset = client.datasets(name=dataset_name, workspace=hf_user)
382
- if dataset:
383
  raise gr.Error(f"Dataset {dataset_name} already exists")
384
  return final_dataset
385
 
@@ -515,6 +516,13 @@ with gr.Blocks(
515
  placeholder="dataset_name",
516
  value="my-distiset",
517
  )
 
 
 
 
 
 
 
518
 
519
  with gr.Row(variant="panel"):
520
  btn_generate_full_dataset_copy = gr.Button(
@@ -648,7 +656,7 @@ with gr.Blocks(
648
 
649
  btn_generate_and_push_to_argilla.click(
650
  fn=validate_argilla_dataset_name,
651
- inputs=[dataset_name, final_dataset],
652
  outputs=[final_dataset],
653
  show_progress=True,
654
  ).success(
@@ -718,7 +726,7 @@ with gr.Blocks(
718
  outputs=[success_message],
719
  ).success(
720
  fn=validate_argilla_dataset_name,
721
- inputs=[dataset_name, final_dataset],
722
  outputs=[final_dataset],
723
  show_progress=True,
724
  ).success(
 
370
  def validate_argilla_dataset_name(
371
  dataset_name: str,
372
  final_dataset: pd.DataFrame,
373
+ add_to_existing_dataset: bool,
374
  oauth_token: Union[OAuthToken, None] = None,
375
  progress=gr.Progress(),
376
  ) -> str:
 
380
  if dataset_name is None or dataset_name == "":
381
  raise gr.Error("Dataset name is required")
382
  dataset = client.datasets(name=dataset_name, workspace=hf_user)
383
+ if dataset and not add_to_existing_dataset:
384
  raise gr.Error(f"Dataset {dataset_name} already exists")
385
  return final_dataset
386
 
 
516
  placeholder="dataset_name",
517
  value="my-distiset",
518
  )
519
+ add_to_existing_dataset = gr.Checkbox(
520
+ label="Allow adding records to existing dataset",
521
+ info="When selected, you do need to ensure the number of turns in the conversation is the same as the number of turns in the existing dataset.",
522
+ value=False,
523
+ interactive=True,
524
+ scale=0.5,
525
+ )
526
 
527
  with gr.Row(variant="panel"):
528
  btn_generate_full_dataset_copy = gr.Button(
 
656
 
657
  btn_generate_and_push_to_argilla.click(
658
  fn=validate_argilla_dataset_name,
659
+ inputs=[dataset_name, final_dataset, add_to_existing_dataset],
660
  outputs=[final_dataset],
661
  show_progress=True,
662
  ).success(
 
726
  outputs=[success_message],
727
  ).success(
728
  fn=validate_argilla_dataset_name,
729
+ inputs=[dataset_name, final_dataset, add_to_existing_dataset],
730
  outputs=[final_dataset],
731
  show_progress=True,
732
  ).success(