Commit
•
fc100c2
1
Parent(s):
b445efe
feat: Add functionality to add records to existing dataset
Browse files
src/distilabel_dataset_generator/apps/sft.py
CHANGED
@@ -370,6 +370,7 @@ def push_to_argilla(
|
|
370 |
def validate_argilla_dataset_name(
|
371 |
dataset_name: str,
|
372 |
final_dataset: pd.DataFrame,
|
|
|
373 |
oauth_token: Union[OAuthToken, None] = None,
|
374 |
progress=gr.Progress(),
|
375 |
) -> str:
|
@@ -379,7 +380,7 @@ def validate_argilla_dataset_name(
|
|
379 |
if dataset_name is None or dataset_name == "":
|
380 |
raise gr.Error("Dataset name is required")
|
381 |
dataset = client.datasets(name=dataset_name, workspace=hf_user)
|
382 |
-
if dataset:
|
383 |
raise gr.Error(f"Dataset {dataset_name} already exists")
|
384 |
return final_dataset
|
385 |
|
@@ -515,6 +516,13 @@ with gr.Blocks(
|
|
515 |
placeholder="dataset_name",
|
516 |
value="my-distiset",
|
517 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
518 |
|
519 |
with gr.Row(variant="panel"):
|
520 |
btn_generate_full_dataset_copy = gr.Button(
|
@@ -648,7 +656,7 @@ with gr.Blocks(
|
|
648 |
|
649 |
btn_generate_and_push_to_argilla.click(
|
650 |
fn=validate_argilla_dataset_name,
|
651 |
-
inputs=[dataset_name, final_dataset],
|
652 |
outputs=[final_dataset],
|
653 |
show_progress=True,
|
654 |
).success(
|
@@ -718,7 +726,7 @@ with gr.Blocks(
|
|
718 |
outputs=[success_message],
|
719 |
).success(
|
720 |
fn=validate_argilla_dataset_name,
|
721 |
-
inputs=[dataset_name, final_dataset],
|
722 |
outputs=[final_dataset],
|
723 |
show_progress=True,
|
724 |
).success(
|
|
|
370 |
def validate_argilla_dataset_name(
|
371 |
dataset_name: str,
|
372 |
final_dataset: pd.DataFrame,
|
373 |
+
add_to_existing_dataset: bool,
|
374 |
oauth_token: Union[OAuthToken, None] = None,
|
375 |
progress=gr.Progress(),
|
376 |
) -> str:
|
|
|
380 |
if dataset_name is None or dataset_name == "":
|
381 |
raise gr.Error("Dataset name is required")
|
382 |
dataset = client.datasets(name=dataset_name, workspace=hf_user)
|
383 |
+
if dataset and not add_to_existing_dataset:
|
384 |
raise gr.Error(f"Dataset {dataset_name} already exists")
|
385 |
return final_dataset
|
386 |
|
|
|
516 |
placeholder="dataset_name",
|
517 |
value="my-distiset",
|
518 |
)
|
519 |
+
add_to_existing_dataset = gr.Checkbox(
|
520 |
+
label="Allow adding records to existing dataset",
|
521 |
+
info="When selected, you do need to ensure the number of turns in the conversation is the same as the number of turns in the existing dataset.",
|
522 |
+
value=False,
|
523 |
+
interactive=True,
|
524 |
+
scale=0.5,
|
525 |
+
)
|
526 |
|
527 |
with gr.Row(variant="panel"):
|
528 |
btn_generate_full_dataset_copy = gr.Button(
|
|
|
656 |
|
657 |
btn_generate_and_push_to_argilla.click(
|
658 |
fn=validate_argilla_dataset_name,
|
659 |
+
inputs=[dataset_name, final_dataset, add_to_existing_dataset],
|
660 |
outputs=[final_dataset],
|
661 |
show_progress=True,
|
662 |
).success(
|
|
|
726 |
outputs=[success_message],
|
727 |
).success(
|
728 |
fn=validate_argilla_dataset_name,
|
729 |
+
inputs=[dataset_name, final_dataset, add_to_existing_dataset],
|
730 |
outputs=[final_dataset],
|
731 |
show_progress=True,
|
732 |
).success(
|