Spaces:

argilla
/

synthetic-data-generator

Running

App Files Files Community

davidberenstein1957 HF staff committed on Oct 17, 2024

Commit

54d4d8d

1 Parent(s): 2e2beb7

refactor: re-usable gradio component

Browse files

Files changed (5) hide show

app.py +2 -2
src/distilabel_dataset_generator/apps/base.py +498 -0
src/distilabel_dataset_generator/apps/sft.py +211 -460
src/distilabel_dataset_generator/pipelines/base.py +12 -0
src/distilabel_dataset_generator/pipelines/sft.py +5 -111

app.py CHANGED Viewed

@@ -26,8 +26,8 @@ css = """
 """
 demo = gr.TabbedInterface(
-    [sft_app, textcat_app, faq_app],
-    ["Supervised Fine-Tuning", "Text Classification", "FAQ"],
     css=css,
     title="""
     <style>

 """
 demo = gr.TabbedInterface(
+    [textcat_app, sft_app, faq_app],
+    ["Text Classification", "Supervised Fine-Tuning", "FAQ"],
     css=css,
     title="""
     <style>

src/distilabel_dataset_generator/apps/base.py ADDED Viewed

	@@ -0,0 +1,498 @@

+import io
+import uuid
+from typing import Any, Callable, List, Optional, Tuple, Union
+import argilla as rg
+import gradio as gr
+import pandas as pd
+from datasets import Dataset
+from distilabel.distiset import Distiset
+from gradio import OAuthToken
+from huggingface_hub import HfApi, upload_file
+from src.distilabel_dataset_generator.utils import (
+    _LOGGED_OUT_CSS,
+    get_argilla_client,
+    list_orgs,
+)
+def swap_visibilty(oauth_token: Optional[OAuthToken] = None):
+    """Return a Gradio update that selects the CSS class of the main UI.
+
+    A truthy ``oauth_token`` yields the ``main_ui_logged_in`` class; a missing
+    token yields ``main_ui_logged_out``.
+    """
+    css_class = "main_ui_logged_in" if oauth_token else "main_ui_logged_out"
+    return gr.update(elem_classes=[css_class])
+def get_main_ui(
+    default_dataset_descriptions: List[str],
+    default_system_prompts: List[str],
+    default_datasets: List[pd.DataFrame],
+    fn_generate_system_prompt: Callable,
+    fn_generate_dataset: Callable,
+):
+    """Build the shared "iterate on a sample, then generate and push" Blocks app.
+
+    Args:
+        default_dataset_descriptions: Example descriptions; the first one is the
+            initial value of the description box.
+        default_system_prompts: System prompts looked up by the sample-dataset
+            shortcut below; the first one is the initial prompt.
+        default_datasets: Pre-built sample dataframes, indexed in parallel with
+            ``default_system_prompts``.
+        fn_generate_system_prompt: Callback wired to the "generate system
+            prompt" button; receives the dataset description.
+        fn_generate_dataset: Callback used to generate a one-row sample; called
+            with ``num_turns=1, num_rows=1, is_sample=True`` and a progress
+            tracker.
+
+    Returns:
+        A tuple with the ``gr.Blocks`` app, the ``main_ui`` column, the empty
+        ``custom_input_ui`` row, and every component created by the helper
+        builders, in the order listed in the ``return`` statement.
+    """
+
+    def fn_generate_sample_dataset(system_prompt, progress=gr.Progress()):
+        # Shortcut: an unmodified default prompt maps to its pre-built sample
+        # dataset, so nothing has to be generated.
+        if system_prompt in default_system_prompts:
+            index = default_system_prompts.index(system_prompt)
+            if index < len(default_datasets):
+                return default_datasets[index]
+        result = fn_generate_dataset(
+            system_prompt, num_turns=1, num_rows=1, progress=progress, is_sample=True
+        )
+        return result
+    with gr.Blocks(
+        title="🧬 Synthetic Data Generator",
+        head="🧬 Synthetic Data Generator",
+        css=_LOGGED_OUT_CSS,
+    ) as app:
+        with gr.Row():
+            gr.Markdown(
+                "Want to run this locally or with other LLMs? Take a look at the FAQ tab. distilabel Synthetic Data Generator is free, we use the authentication token to push the dataset to the Hugging Face Hub and not for data generation."
+            )
+        # The two empty columns sit on either side of the login button.
+        with gr.Row():
+            gr.Column()
+            get_login_button()
+            gr.Column()
+        gr.Markdown("## Iterate on a sample dataset")
+        with gr.Column() as main_ui:
+            (
+                dataset_description,
+                examples,
+                btn_generate_system_prompt,
+                system_prompt,
+                sample_dataset,
+                btn_generate_sample_dataset,
+            ) = get_iterate_on_sample_dataset_ui(
+                default_dataset_descriptions=default_dataset_descriptions,
+                default_system_prompts=default_system_prompts,
+                default_datasets=default_datasets,
+            )
+            gr.Markdown("## Generate full dataset")
+            gr.Markdown(
+                "Once you're satisfied with the sample, generate a larger dataset and push it to Argilla or the Hugging Face Hub."
+            )
+            # Empty row returned to the caller as ``custom_input_ui``.
+            # NOTE(review): presumably callers add task-specific inputs here - confirm.
+            with gr.Row(variant="panel") as custom_input_ui:
+                pass
+            (
+                dataset_name,
+                add_to_existing_dataset,
+                btn_generate_full_dataset_copy,
+                btn_generate_and_push_to_argilla,
+                btn_push_to_argilla,
+                org_name,
+                repo_name,
+                private,
+                btn_generate_full_dataset,
+                btn_generate_and_push_to_hub,
+                btn_push_to_hub,
+                final_dataset,
+                success_message,
+            ) = get_push_to_hub_ui(default_datasets)
+        # Mirror every change of the sample table into the final dataset table.
+        sample_dataset.change(
+            fn=lambda x: x,
+            inputs=[sample_dataset],
+            outputs=[final_dataset],
+        )
+        # Generating a system prompt also refreshes the sample dataset.
+        btn_generate_system_prompt.click(
+            fn=fn_generate_system_prompt,
+            inputs=[dataset_description],
+            outputs=[system_prompt],
+            show_progress=True,
+        ).then(
+            fn=fn_generate_sample_dataset,
+            inputs=[system_prompt],
+            outputs=[sample_dataset],
+            show_progress=True,
+        )
+        btn_generate_sample_dataset.click(
+            fn=fn_generate_sample_dataset,
+            inputs=[system_prompt],
+            outputs=[sample_dataset],
+            show_progress=True,
+        )
+        # On page load: set the logged-in/out CSS class and fill the org dropdown.
+        app.load(fn=swap_visibilty, outputs=main_ui)
+        app.load(get_org_dropdown, outputs=[org_name])
+    return (
+        app,
+        main_ui,
+        custom_input_ui,
+        dataset_description,
+        examples,
+        btn_generate_system_prompt,
+        system_prompt,
+        sample_dataset,
+        btn_generate_sample_dataset,
+        dataset_name,
+        add_to_existing_dataset,
+        btn_generate_full_dataset_copy,
+        btn_generate_and_push_to_argilla,
+        btn_push_to_argilla,
+        org_name,
+        repo_name,
+        private,
+        btn_generate_full_dataset,
+        btn_generate_and_push_to_hub,
+        btn_push_to_hub,
+        final_dataset,
+        success_message,
+    )
+def validate_argilla_user_workspace_dataset(
+    dataset_name: str,
+    final_dataset: pd.DataFrame,
+    add_to_existing_dataset: bool,
+    oauth_token: Union[OAuthToken, None] = None,
+    progress=gr.Progress(),
+) -> pd.DataFrame:
+    """Check that a dataset can be pushed to Argilla for the signed-in user.
+
+    Ensures an Argilla user and a workspace named after the Hugging Face
+    account exist (creating them when missing) and that the target dataset
+    does not already exist unless appending is allowed.
+
+    Args:
+        dataset_name: Name of the Argilla dataset to create or extend.
+        final_dataset: Dataframe passed through unchanged so this function can
+            be chained between Gradio events.
+        add_to_existing_dataset: Allow reusing a dataset that already exists.
+        oauth_token: Hugging Face OAuth token of the signed-in user.
+        progress: Gradio progress tracker.
+
+    Returns:
+        ``final_dataset``, untouched.
+
+    Raises:
+        gr.Error: If the dataset name is empty, the user is not signed in,
+            Argilla is not configured, or the dataset exists and
+            ``add_to_existing_dataset`` is false.
+    """
+    progress(0, desc="Validating dataset configuration")
+    # Run the cheap local checks before any network round-trip.
+    if not dataset_name:
+        raise gr.Error("Dataset name is required")
+    if oauth_token is None:
+        raise gr.Error("Please sign in with Hugging Face to push the dataset.")
+    client = get_argilla_client()
+    if client is None:
+        raise gr.Error(
+            "Please add `ARGILLA_API_URL` and `ARGILLA_API_KEY` to use Argilla or export the dataset to the Hugging Face Hub."
+        )
+    hf_user = HfApi().whoami(token=oauth_token.token)["name"]
+    # Create user if it doesn't exist
+    rg_user = client.users(username=hf_user)
+    if rg_user is None:
+        rg_user = client.users.add(
+            rg.User(username=hf_user, role="admin", password=str(uuid.uuid4()))
+        )
+    # Create workspace if it doesn't exist
+    workspace = client.workspaces(name=hf_user)
+    if workspace is None:
+        workspace = client.workspaces.add(rg.Workspace(name=hf_user))
+        workspace.add_user(hf_user)
+    # Check if dataset exists
+    dataset = client.datasets(name=dataset_name, workspace=hf_user)
+    if dataset and not add_to_existing_dataset:
+        raise gr.Error(f"Dataset {dataset_name} already exists")
+    return final_dataset
+def get_login_button():
+    """Create, activate and return the "Sign in with Hugging Face" button.
+
+    Returns:
+        The ``gr.LoginButton`` component itself, so callers can reference it.
+    """
+    login_button = gr.LoginButton(
+        value="Sign in with Hugging Face!", size="lg", scale=2
+    )
+    # NOTE(review): ``activate()`` is called for its side effect only. The
+    # original returned its result, which appears to be ``None`` in Gradio 4.x
+    # (confirm against the pinned Gradio version).
+    login_button.activate()
+    return login_button
+def get_org_dropdown(oauth_token: OAuthToken = None):
+    """Build the organization dropdown from the orgs visible to ``oauth_token``.
+
+    The first organization returned by ``list_orgs`` is preselected; when the
+    list is empty nothing is selected. Custom values are allowed.
+    """
+    org_choices = list_orgs(oauth_token)
+    preselected = org_choices[0] if org_choices else None
+    return gr.Dropdown(
+        label="Organization",
+        choices=org_choices,
+        value=preselected,
+        allow_custom_value=True,
+    )
+def get_push_to_hub_ui(default_datasets):
+    """Lay out the Argilla tab, the Hub tab, the result table and the status row.
+
+    Returns:
+        A flat tuple: the five Argilla-tab components, the six Hub-tab
+        components, the final dataset table, then the success message.
+    """
+    with gr.Column():
+        # Creation order is the on-screen order: tabs first, then the outputs.
+        argilla_components = get_argilla_tab()
+        hub_components = get_hf_tab()
+        final_dataset = get_final_dataset_row(default_datasets)
+        success_message = get_success_message_row()
+    return (
+        *argilla_components,
+        *hub_components,
+        final_dataset,
+        success_message,
+    )
+def get_iterate_on_sample_dataset_ui(
+    default_dataset_descriptions: List[str],
+    default_system_prompts: List[str],
+    default_datasets: List[pd.DataFrame],
+):
+    """Build the widgets used to iterate on a system prompt and a sample dataset.
+
+    The first entry of each ``default_*`` list is the initial value of the
+    matching widget, so every list must be non-empty.
+
+    Returns:
+        ``(dataset_description, examples, btn_generate_system_prompt,
+        system_prompt, sample_dataset, btn_generate_sample_dataset)``.
+    """
+    with gr.Column():
+        dataset_description = gr.TextArea(
+            label="Give a precise description of the assistant or tool. Don't describe the dataset",
+            value=default_dataset_descriptions[0],
+            lines=2,
+        )
+        # One clickable example per default description; each fills the box above.
+        examples = gr.Examples(
+            elem_id="system_prompt_examples",
+            examples=[[example] for example in default_dataset_descriptions],
+            inputs=[dataset_description],
+        )
+        # Empty columns on both sides of the button act as spacers.
+        with gr.Row():
+            gr.Column(scale=1)
+            btn_generate_system_prompt = gr.Button(
+                value="Generate system prompt and sample dataset"
+            )
+            gr.Column(scale=1)
+        system_prompt = gr.TextArea(
+            label="System prompt for dataset generation. You can tune it and regenerate the sample",
+            value=default_system_prompts[0],
+            lines=5,
+        )
+        with gr.Row():
+            sample_dataset = gr.Dataframe(
+                value=default_datasets[0],
+                label="Sample dataset. Prompts and completions truncated to 256 tokens.",
+                interactive=False,
+                wrap=True,
+            )
+        with gr.Row():
+            gr.Column(scale=1)
+            btn_generate_sample_dataset = gr.Button(
+                value="Generate sample dataset",
+            )
+            gr.Column(scale=1)
+    return (
+        dataset_description,
+        examples,
+        btn_generate_system_prompt,
+        system_prompt,
+        sample_dataset,
+        btn_generate_sample_dataset,
+    )
+def get_pipeline_code_ui(pipeline_code: str) -> gr.Code:
+    """Render the "run locally" section and return its code viewer.
+
+    Args:
+        pipeline_code: Python source shown inside the collapsed accordion.
+
+    Returns:
+        The ``gr.Code`` component holding ``pipeline_code``.
+    """
+    gr.Markdown("## Or run this pipeline locally with distilabel")
+    gr.Markdown(
+        "You can run this pipeline locally with distilabel. For more information, please refer to the [distilabel documentation](https://distilabel.argilla.io/) or go to the FAQ tab at the top of the page for more information."
+    )
+    # Collapsed by default so the script does not dominate the page.
+    with gr.Accordion("Run this pipeline using distilabel", open=False):
+        code_viewer = gr.Code(
+            value=pipeline_code,
+            language="python",
+            label="Distilabel Pipeline Code",
+        )
+    return code_viewer
+def get_argilla_tab() -> Tuple[Any, ...]:
+    """Build the "Argilla" tab and return its input and button components.
+
+    The components are always created so the returned tuple is valid even
+    when no Argilla client is configured. In that case the rows are hidden and
+    a setup hint is shown instead. Previously the unconfigured path reached the
+    ``return`` with none of the names bound and raised ``UnboundLocalError``.
+
+    Returns:
+        ``(dataset_name, add_to_existing_dataset,
+        btn_generate_full_dataset_copy, btn_generate_and_push_to_argilla,
+        btn_push_to_argilla)``.
+    """
+    argilla_available = get_argilla_client() is not None
+    with gr.Tab(label="Argilla"):
+        if not argilla_available:
+            gr.Markdown(
+                "Please add `ARGILLA_API_URL` and `ARGILLA_API_KEY` to use Argilla or export the dataset to the Hugging Face Hub."
+            )
+        with gr.Row(variant="panel", visible=argilla_available):
+            dataset_name = gr.Textbox(
+                label="Dataset name",
+                placeholder="dataset_name",
+                value="my-distiset",
+            )
+            # NOTE(review): Gradio documents ``scale`` as an integer; 0.5 is
+            # kept as-is to preserve the current layout - confirm.
+            add_to_existing_dataset = gr.Checkbox(
+                label="Allow adding records to existing dataset",
+                info="When selected, you do need to ensure the number of turns in the conversation is the same as the number of turns in the existing dataset.",
+                value=False,
+                interactive=True,
+                scale=0.5,
+            )
+        with gr.Row(variant="panel", visible=argilla_available):
+            btn_generate_full_dataset_copy = gr.Button(
+                value="Generate", variant="primary", scale=2
+            )
+            btn_generate_and_push_to_argilla = gr.Button(
+                value="Generate and Push to Argilla",
+                variant="primary",
+                scale=2,
+            )
+            btn_push_to_argilla = gr.Button(
+                value="Push to Argilla", variant="primary", scale=2
+            )
+    return (
+        dataset_name,
+        add_to_existing_dataset,
+        btn_generate_full_dataset_copy,
+        btn_generate_and_push_to_argilla,
+        btn_push_to_argilla,
+    )
+def get_hf_tab() -> Tuple[Any]:
+    """Build the "Hugging Face Hub" tab and return its inputs and buttons.
+
+    Returns:
+        ``(org_name, repo_name, private, btn_generate_full_dataset,
+        btn_generate_and_push_to_hub, btn_push_to_hub)``.
+    """
+    with gr.Tab(label="Hugging Face Hub"):
+        # First row: where the dataset goes and whether it is private.
+        with gr.Row(variant="panel"):
+            org_name = get_org_dropdown()
+            repo_name = gr.Textbox(
+                label="Repo name", placeholder="dataset_name", value="my-distiset"
+            )
+            private = gr.Checkbox(
+                label="Private dataset", value=True, interactive=True, scale=0.5
+            )
+        # Second row: the three actions, all styled as primary buttons.
+        with gr.Row(variant="panel"):
+            btn_generate_full_dataset = gr.Button(
+                value="Generate",
+                variant="primary",
+                scale=2,
+            )
+            btn_generate_and_push_to_hub = gr.Button(
+                value="Generate and Push to Hub",
+                variant="primary",
+                scale=2,
+            )
+            btn_push_to_hub = gr.Button(
+                value="Push to Hub",
+                variant="primary",
+                scale=2,
+            )
+    hub_components = (
+        org_name,
+        repo_name,
+        private,
+        btn_generate_full_dataset,
+        btn_generate_and_push_to_hub,
+        btn_push_to_hub,
+    )
+    return hub_components
+def push_pipeline_code_to_hub(
+    pipeline_code: str,
+    org_name: str,
+    repo_name: str,
+    oauth_token: Union[OAuthToken, None] = None,
+    progress=gr.Progress(),
+):
+    """Upload the pipeline script as ``pipeline.py`` to the dataset repository.
+
+    Args:
+        pipeline_code: Source of the pipeline script, uploaded UTF-8 encoded.
+        org_name: Organization or user that owns the repository.
+        repo_name: Repository name.
+        oauth_token: Hugging Face OAuth token used for the upload.
+        progress: Gradio progress tracker.
+    """
+    target_repo = _check_push_to_hub(org_name, repo_name)
+    progress(0.1, desc="Uploading pipeline code")
+    script_bytes = pipeline_code.encode("utf-8")
+    # Upload from memory so no temporary file is written to disk.
+    with io.BytesIO(script_bytes) as script_buffer:
+        upload_file(
+            path_or_fileobj=script_buffer,
+            path_in_repo="pipeline.py",
+            repo_id=target_repo,
+            repo_type="dataset",
+            token=oauth_token.token,
+            commit_message="Include pipeline script",
+            create_pr=False,
+        )
+    progress(1.0, desc="Pipeline code uploaded")
+def push_dataset_to_hub(
+    dataframe: pd.DataFrame,
+    private: bool = True,
+    org_name: str = None,
+    repo_name: str = None,
+    oauth_token: Union[OAuthToken, None] = None,
+    progress=gr.Progress(),
+) -> pd.DataFrame:
+    """Push ``dataframe`` to the Hub as the ``default`` config of a Distiset.
+
+    Args:
+        dataframe: Data to publish.
+        private: Whether the repository is created as private.
+        org_name: Organization or user that owns the repository.
+        repo_name: Repository name.
+        oauth_token: Hugging Face OAuth token used for the push.
+        progress: Gradio progress tracker.
+
+    Returns:
+        The same ``dataframe`` that was passed in.
+    """
+    progress(0.1, desc="Setting up dataset")
+    target_repo = _check_push_to_hub(org_name, repo_name)
+    hub_dataset = Dataset.from_pandas(dataframe)
+    distiset = Distiset({"default": hub_dataset})
+    progress(0.2, desc="Pushing dataset to hub")
+    # The pipeline script is not bundled with this push (include_script=False).
+    distiset.push_to_hub(
+        repo_id=target_repo,
+        private=private,
+        include_script=False,
+        token=oauth_token.token,
+        create_pr=False,
+    )
+    progress(1.0, desc="Dataset pushed to hub")
+    return dataframe
+def _check_push_to_hub(org_name, repo_name):
+    """Validate the push target and return it as ``"<org_name>/<repo_name>"``.
+
+    Both parts are required. Previously a ``None`` org or repo returned
+    ``repo_id=None`` without any error, and that value was then handed to the
+    upload calls. Missing and empty values now fail early with the same
+    user-facing message.
+
+    Raises:
+        gr.Error: If ``org_name`` or ``repo_name`` is ``None`` or empty.
+    """
+    if not org_name or not repo_name:
+        raise gr.Error(
+            "Please provide a `repo_name` and `org_name` to push the dataset to."
+        )
+    return f"{org_name}/{repo_name}"
+def get_final_dataset_row(default_datasets) -> gr.Dataframe:
+    """Create the read-only "Generated dataset" table in a row of its own.
+
+    The table starts out showing the first entry of ``default_datasets``.
+    """
+    initial_value = default_datasets[0]
+    with gr.Row():
+        return gr.Dataframe(
+            value=initial_value,
+            label="Generated dataset",
+            interactive=False,
+            wrap=True,
+        )
+def get_success_message_row() -> gr.Markdown:
+    """Create the initially hidden Markdown slot used for success messages."""
+    with gr.Row():
+        return gr.Markdown(visible=False)
+def show_success_message_argilla() -> gr.Markdown:
+    """Return a visible success banner linking to the configured Argilla URL."""
+    argilla_api_url = get_argilla_client().api_url
+    banner_html = f"""
+        <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
+            <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
+            <p style="margin-top: 0.5em;">
+                Your dataset is now available at:
+                <a href="{argilla_api_url}" target="_blank" style="color: #1565c0; text-decoration: none;">
+                    {argilla_api_url}
+                </a>
+                <br>Unfamiliar with Argilla? Here are some docs to help you get started:
+                <br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
+                <br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>
+            </p>
+        </div>
+        """
+    return gr.Markdown(value=banner_html, visible=True)
+def show_success_message_hub(org_name, repo_name) -> gr.Markdown:
+    """Return a visible success banner linking to the pushed Hub dataset.
+
+    Args:
+        org_name: Organization or user that owns the dataset repository.
+        repo_name: Repository name.
+    """
+    banner_html = f"""
+        <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
+            <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
+            <p style="margin-top: 0.5em;">
+                The generated dataset is in the right format for fine-tuning with TRL, AutoTrain or other frameworks.
+                Your dataset is now available at:
+                <a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" style="color: #1565c0; text-decoration: none;">
+                    https://huggingface.co/datasets/{org_name}/{repo_name}
+                </a>
+            </p>
+        </div>
+        """
+    return gr.Markdown(value=banner_html, visible=True)
+def hide_success_message() -> gr.Markdown:
+    """Return a hidden Markdown component, used to clear the success banner."""
+    hidden_banner = gr.Markdown(visible=False)
+    return hidden_banner

src/distilabel_dataset_generator/apps/sft.py CHANGED Viewed

@@ -1,6 +1,4 @@
 import ast
-import io
-import uuid
 from typing import Dict, List, Union
 import argilla as rg
@@ -8,17 +6,29 @@ import gradio as gr
 import pandas as pd
 from datasets import Dataset
 from distilabel.distiset import Distiset
-from distilabel.steps.tasks.text_generation import TextGeneration
-from gradio.oauth import OAuthToken
-from huggingface_hub import upload_file
-from huggingface_hub.hf_api import HfApi
 from src.distilabel_dataset_generator.pipelines.embeddings import (
     get_embeddings,
     get_sentence_embedding_dimensions,
 )
 from src.distilabel_dataset_generator.pipelines.sft import (
-    DEFAULT_BATCH_SIZE,
     DEFAULT_DATASET_DESCRIPTIONS,
     DEFAULT_DATASETS,
     DEFAULT_SYSTEM_PROMPTS,
@@ -28,222 +38,45 @@ from src.distilabel_dataset_generator.pipelines.sft import (
     get_prompt_generator,
     get_response_generator,
 )
-from src.distilabel_dataset_generator.utils import (
-    get_argilla_client,
-    get_base_app,
-    get_org_dropdown,
-    swap_visibilty,
-)
-def convert_to_list_of_dicts(messages: str) -> List[Dict[str, str]]:
-    return ast.literal_eval(
-        messages.replace("'user'}", "'user'},")
-        .replace("'system'}", "'system'},")
-        .replace("'assistant'}", "'assistant'},")
-    )
-def generate_system_prompt(dataset_description, progress=gr.Progress()):
-    progress(0.0, desc="Generating system prompt")
-    if dataset_description in DEFAULT_DATASET_DESCRIPTIONS:
-        index = DEFAULT_DATASET_DESCRIPTIONS.index(dataset_description)
-        if index < len(DEFAULT_SYSTEM_PROMPTS):
-            return DEFAULT_SYSTEM_PROMPTS[index]
-    progress(0.3, desc="Initializing text generation")
-    generate_description: TextGeneration = get_prompt_generator()
-    progress(0.7, desc="Generating system prompt")
-    result = next(
-        generate_description.process(
-            [
-                {
-                    "system_prompt": PROMPT_CREATION_PROMPT,
-                    "instruction": dataset_description,
-                }
-            ]
         )
-    )[0]["generation"]
-    progress(1.0, desc="System prompt generated")
-    return result
-def generate_sample_dataset(system_prompt, progress=gr.Progress()):
-    if system_prompt in DEFAULT_SYSTEM_PROMPTS:
-        index = DEFAULT_SYSTEM_PROMPTS.index(system_prompt)
-        if index < len(DEFAULT_DATASETS):
-            return DEFAULT_DATASETS[index]
-    result = generate_dataset(
-        system_prompt, num_turns=1, num_rows=1, progress=progress, is_sample=True
-    )
-    return result
-def _check_push_to_hub(org_name, repo_name):
-    repo_id = (
-        f"{org_name}/{repo_name}"
-        if repo_name is not None and org_name is not None
-        else None
-    )
-    if repo_id is not None:
-        if not all([repo_id, org_name, repo_name]):
-            raise gr.Error(
-                "Please provide a `repo_name` and `org_name` to push the dataset to."
-            )
-    return repo_id
-def generate_dataset(
-    system_prompt: str,
-    num_turns: int = 1,
-    num_rows: int = 5,
-    is_sample: bool = False,
-    progress=gr.Progress(),
-) -> pd.DataFrame:
-    progress(0.0, desc="(1/2) Generating instructions")
-    magpie_generator = get_magpie_generator(
-        num_turns, num_rows, system_prompt, is_sample
-    )
-    response_generator = get_response_generator(num_turns, system_prompt, is_sample)
-    total_steps: int = num_rows * 2
-    batch_size = DEFAULT_BATCH_SIZE
-    # create instructions
-    n_processed = 0
-    magpie_results = []
-    while n_processed < num_rows:
-        progress(
-            0.5 * n_processed / num_rows,
-            total=total_steps,
-            desc="(1/2) Generating instructions",
         )
-        remaining_rows = num_rows - n_processed
-        batch_size = min(batch_size, remaining_rows)
-        inputs = [{"system_prompt": system_prompt} for _ in range(batch_size)]
-        batch = list(magpie_generator.process(inputs=inputs))
-        magpie_results.extend(batch[0])
-        n_processed += batch_size
-    progress(0.5, desc="(1/2) Generating instructions")
-    # generate responses
-    n_processed = 0
-    response_results = []
-    if num_turns == 1:
-        while n_processed < num_rows:
-            progress(
-                0.5 + 0.5 * n_processed / num_rows,
-                total=total_steps,
-                desc="(2/2) Generating responses",
-            )
-            batch = magpie_results[n_processed : n_processed + batch_size]
-            responses = list(response_generator.process(inputs=batch))
-            response_results.extend(responses[0])
-            n_processed += batch_size
-        for result in response_results:
-            result["prompt"] = result["instruction"]
-            result["completion"] = result["generation"]
-            result["system_prompt"] = system_prompt
-    else:
-        for result in magpie_results:
-            result["conversation"].insert(
-                0, {"role": "system", "content": system_prompt}
-            )
-            result["messages"] = result["conversation"]
-        while n_processed < num_rows:
-            progress(
-                0.5 + 0.5 * n_processed / num_rows,
-                total=total_steps,
-                desc="(2/2) Generating responses",
-            )
-            batch = magpie_results[n_processed : n_processed + batch_size]
-            responses = list(response_generator.process(inputs=batch))
-            response_results.extend(responses[0])
-            n_processed += batch_size
-        for result in response_results:
-            result["messages"].append(
-                {"role": "assistant", "content": result["generation"]}
-            )
-    progress(
-        1,
-        total=total_steps,
-        desc="(2/2) Generating responses",
-    )
-    # create distiset
-    distiset_results = []
-    for result in response_results:
-        record = {}
-        for relevant_keys in [
-            "messages",
-            "prompt",
-            "completion",
-            "model_name",
-            "system_prompt",
-        ]:
-            if relevant_keys in result:
-                record[relevant_keys] = result[relevant_keys]
-        distiset_results.append(record)
-    distiset = Distiset(
-        {
-            "default": Dataset.from_list(distiset_results),
-        }
-    )
-    # If not pushing to hub generate the dataset directly
-    distiset = distiset["default"]
-    if num_turns == 1:
-        outputs = distiset.to_pandas()[["system_prompt", "prompt", "completion"]]
-    else:
-        outputs = distiset.to_pandas()[["messages"]]
-    dataframe = pd.DataFrame(outputs)
-    progress(1.0, desc="Dataset generation completed")
     return dataframe
-def push_to_hub(
     dataframe: pd.DataFrame,
     private: bool = True,
     org_name: str = None,
     repo_name: str = None,
-    oauth_token: Union[OAuthToken, None] = None,
     progress=gr.Progress(),
-) -> pd.DataFrame:
     original_dataframe = dataframe.copy(deep=True)
-    if "messages" in dataframe.columns:
-        dataframe["messages"] = dataframe["messages"].apply(
-            lambda x: convert_to_list_of_dicts(x) if isinstance(x, str) else x
-        )
-    progress(0.1, desc="Setting up dataset")
-    repo_id = _check_push_to_hub(org_name, repo_name)
-    distiset = Distiset(
-        {
-            "default": Dataset.from_pandas(dataframe),
-        }
-    )
-    progress(0.2, desc="Pushing dataset to hub")
-    distiset.push_to_hub(
-        repo_id=repo_id,
-        private=private,
-        include_script=False,
-        token=oauth_token.token,
-        create_pr=False,
-    )
-    progress(1.0, desc="Dataset pushed to hub")
     return original_dataframe
-def push_to_argilla(
     dataframe: pd.DataFrame,
     dataset_name: str,
-    oauth_token: Union[OAuthToken, None] = None,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
     original_dataframe = dataframe.copy(deep=True)
-    if "messages" in dataframe.columns:
-        dataframe["messages"] = dataframe["messages"].apply(
-            lambda x: convert_to_list_of_dicts(x) if isinstance(x, str) else x
-        )
     try:
         progress(0.1, desc="Setting up user and workspace")
         client = get_argilla_client()
@@ -363,273 +196,193 @@ def push_to_argilla(
     return original_dataframe
-def validate_argilla_dataset_name(
-    dataset_name: str,
-    final_dataset: pd.DataFrame,
-    add_to_existing_dataset: bool,
-    oauth_token: Union[OAuthToken, None] = None,
-    progress=gr.Progress(),
-) -> str:
-    progress(0, desc="Validating dataset configuration")
-    hf_user = HfApi().whoami(token=oauth_token.token)["name"]
-    client = get_argilla_client()
-    if dataset_name is None or dataset_name == "":
-        raise gr.Error("Dataset name is required")
-    # Create user if it doesn't exist
-    rg_user = client.users(username=hf_user)
-    if rg_user is None:
-        rg_user = client.users.add(
-            rg.User(username=hf_user, role="admin", password=str(uuid.uuid4()))
-        )
-    # Create workspace if it doesn't exist
-    workspace = client.workspaces(name=hf_user)
-    if workspace is None:
-        workspace = client.workspaces.add(rg.Workspace(name=hf_user))
-        workspace.add_user(rg_user)
-    # Check if dataset exists
-    dataset = client.datasets(name=dataset_name, workspace=hf_user)
-    if dataset and not add_to_existing_dataset:
-        raise gr.Error(f"Dataset {dataset_name} already exists")
-    return final_dataset
-def upload_pipeline_code(
-    pipeline_code,
-    org_name,
-    repo_name,
-    oauth_token: Union[OAuthToken, None] = None,
-    progress=gr.Progress(),
-):
-    repo_id = _check_push_to_hub(org_name, repo_name)
-    progress(0.1, desc="Uploading pipeline code")
-    with io.BytesIO(pipeline_code.encode("utf-8")) as f:
-        upload_file(
-            path_or_fileobj=f,
-            path_in_repo="pipeline.py",
-            repo_id=repo_id,
-            repo_type="dataset",
-            token=oauth_token.token,
-            commit_message="Include pipeline script",
-            create_pr=False,
         )
-    progress(1.0, desc="Pipeline code uploaded")
-with get_base_app() as app:
-    gr.Markdown("## Iterate on a sample dataset")
-    with gr.Column() as main_ui:
-        dataset_description = gr.TextArea(
-            label="Give a precise description of the assistant or tool. Don't describe the dataset",
-            value=DEFAULT_DATASET_DESCRIPTIONS[0],
-            lines=2,
-        )
-        examples = gr.Examples(
-            elem_id="system_prompt_examples",
-            examples=[[example] for example in DEFAULT_DATASET_DESCRIPTIONS],
-            inputs=[dataset_description],
-        )
-        with gr.Row():
-            gr.Column(scale=1)
-            btn_generate_system_prompt = gr.Button(
-                value="Generate system prompt and sample dataset"
-            )
-            gr.Column(scale=1)
-        system_prompt = gr.TextArea(
-            label="System prompt for dataset generation. You can tune it and regenerate the sample",
-            value=DEFAULT_SYSTEM_PROMPTS[0],
-            lines=5,
         )
-        with gr.Row():
-            sample_dataset = gr.Dataframe(
-                value=DEFAULT_DATASETS[0],
-                label="Sample dataset. Prompts and completions truncated to 256 tokens.",
-                interactive=False,
-                wrap=True,
             )
-        with gr.Row():
-            gr.Column(scale=1)
-            btn_generate_sample_dataset = gr.Button(
-                value="Generate sample dataset",
             )
-            gr.Column(scale=1)
-        result = btn_generate_system_prompt.click(
-            fn=generate_system_prompt,
-            inputs=[dataset_description],
-            outputs=[system_prompt],
-            show_progress=True,
-        ).then(
-            fn=generate_sample_dataset,
-            inputs=[system_prompt],
-            outputs=[sample_dataset],
-            show_progress=True,
-        )
-        btn_generate_sample_dataset.click(
-            fn=generate_sample_dataset,
-            inputs=[system_prompt],
-            outputs=[sample_dataset],
-            show_progress=True,
-        )
-        # Add a header for the full dataset generation section
-        gr.Markdown("## Generate full dataset")
-        gr.Markdown(
-            "Once you're satisfied with the sample, generate a larger dataset and push it to Argilla or the Hugging Face Hub."
-        )
-        with gr.Column() as push_to_hub_ui:
-            with gr.Row(variant="panel"):
-                num_turns = gr.Number(
-                    value=1,
-                    label="Number of turns in the conversation",
-                    minimum=1,
-                    maximum=4,
-                    step=1,
-                    info="Choose between 1 (single turn with 'instruction-response' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
-                )
-                num_rows = gr.Number(
-                    value=10,
-                    label="Number of rows in the dataset",
-                    minimum=1,
-                    maximum=500,
-                    info="The number of rows in the dataset. Note that you are able to generate more rows at once but that this will take time.",
-                )
-            with gr.Tab(label="Argilla"):
-                if get_argilla_client() is not None:
-                    with gr.Row(variant="panel"):
-                        dataset_name = gr.Textbox(
-                            label="Dataset name",
-                            placeholder="dataset_name",
-                            value="my-distiset",
-                        )
-                        add_to_existing_dataset = gr.Checkbox(
-                            label="Allow adding records to existing dataset",
-                            info="When selected, you do need to ensure the number of turns in the conversation is the same as the number of turns in the existing dataset.",
-                            value=False,
-                            interactive=True,
-                            scale=0.5,
-                        )
-                    with gr.Row(variant="panel"):
-                        btn_generate_full_dataset_copy = gr.Button(
-                            value="Generate", variant="primary", scale=2
-                        )
-                        btn_generate_and_push_to_argilla = gr.Button(
-                            value="Generate and Push to Argilla",
-                            variant="primary",
-                            scale=2,
-                        )
-                        btn_push_to_argilla = gr.Button(
-                            value="Push to Argilla", variant="primary", scale=2
-                        )
-                else:
-                    gr.Markdown(
-                        "Please add `ARGILLA_API_URL` and `ARGILLA_API_KEY` to use Argilla or export the dataset to the Hugging Face Hub."
-                    )
-            with gr.Tab("Hugging Face Hub"):
-                with gr.Row(variant="panel"):
-                    org_name = get_org_dropdown()
-                    repo_name = gr.Textbox(
-                        label="Repo name",
-                        placeholder="dataset_name",
-                        value="my-distiset",
-                    )
-                    private = gr.Checkbox(
-                        label="Private dataset",
-                        value=True,
-                        interactive=True,
-                        scale=0.5,
-                    )
-                with gr.Row(variant="panel"):
-                    btn_generate_full_dataset = gr.Button(
-                        value="Generate", variant="primary", scale=2
-                    )
-                    btn_generate_and_push_to_hub = gr.Button(
-                        value="Generate and Push to Hub", variant="primary", scale=2
-                    )
-                    btn_push_to_hub = gr.Button(
-                        value="Push to Hub", variant="primary", scale=2
-                    )
-            with gr.Row():
-                final_dataset = gr.Dataframe(
-                    value=DEFAULT_DATASETS[0],
-                    label="Generated dataset",
-                    interactive=False,
-                    wrap=True,
-                )
-            with gr.Row():
-                success_message = gr.Markdown(visible=False)
-    def show_success_message_argilla():
-        client = get_argilla_client()
-        argilla_api_url = client.api_url
-        return gr.Markdown(
-            value=f"""
-            <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
-                <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
-                <p style="margin-top: 0.5em;">
-                    Your dataset is now available at:
-                    <a href="{argilla_api_url}" target="_blank" style="color: #1565c0; text-decoration: none;">
-                        {argilla_api_url}
-                    </a>
-                    <br>Unfamiliar with Argilla? Here are some docs to help you get started:
-                    <br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
-                    <br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>
-                </p>
-            </div>
-            """,
-            visible=True,
-        )
-    def show_success_message_hub(org_name, repo_name):
-        return gr.Markdown(
-            value=f"""
-            <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
-                <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
-                <p style="margin-top: 0.5em;">
-                    The generated dataset is in the right format for fine-tuning with TRL, AutoTrain or other frameworks.
-                    Your dataset is now available at:
-                    <a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" style="color: #1565c0; text-decoration: none;">
-                        https://huggingface.co/datasets/{org_name}/{repo_name}
-                    </a>
-                </p>
-            </div>
-            """,
-            visible=True,
-        )
-    def hide_success_message():
-        return gr.Markdown(visible=False)
-    gr.Markdown("## Or run this pipeline locally with distilabel")
-    gr.Markdown(
-        "You can run this pipeline locally with distilabel. For more information, please refer to the [distilabel documentation](https://distilabel.argilla.io/) or go to the FAQ tab at the top of the page for more information."
-    )
-    with gr.Accordion(
-        "Run this pipeline using distilabel",
-        open=False,
-    ):
-        pipeline_code = gr.Code(
-            value=generate_pipeline_code(
-                system_prompt.value, num_turns.value, num_rows.value
-            ),
-            language="python",
-            label="Distilabel Pipeline Code",
         )
-    sample_dataset.change(
-        fn=lambda x: x,
-        inputs=[sample_dataset],
-        outputs=[final_dataset],
-    )
     gr.on(
         triggers=[
             btn_generate_full_dataset.click,
@@ -645,7 +398,7 @@ with get_base_app() as app:
     )
     btn_generate_and_push_to_argilla.click(
-        fn=validate_argilla_dataset_name,
         inputs=[dataset_name, final_dataset, add_to_existing_dataset],
         outputs=[final_dataset],
         show_progress=True,
@@ -658,7 +411,7 @@ with get_base_app() as app:
         outputs=[final_dataset],
         show_progress=True,
     ).success(
-        fn=push_to_argilla,
         inputs=[final_dataset, dataset_name],
         outputs=[final_dataset],
         show_progress=True,
@@ -677,12 +430,12 @@ with get_base_app() as app:
         outputs=[final_dataset],
         show_progress=True,
     ).then(
-        fn=push_to_hub,
         inputs=[final_dataset, private, org_name, repo_name],
         outputs=[final_dataset],
         show_progress=True,
     ).then(
-        fn=upload_pipeline_code,
         inputs=[pipeline_code, org_name, repo_name],
         outputs=[],
         show_progress=True,
@@ -696,12 +449,12 @@ with get_base_app() as app:
         fn=hide_success_message,
         outputs=[success_message],
     ).then(
-        fn=push_to_hub,
         inputs=[final_dataset, private, org_name, repo_name],
         outputs=[final_dataset],
         show_progress=True,
     ).then(
-        fn=upload_pipeline_code,
         inputs=[pipeline_code, org_name, repo_name],
         outputs=[],
         show_progress=True,
@@ -715,12 +468,12 @@ with get_base_app() as app:
         fn=hide_success_message,
         outputs=[success_message],
     ).success(
-        fn=validate_argilla_dataset_name,
         inputs=[dataset_name, final_dataset, add_to_existing_dataset],
         outputs=[final_dataset],
         show_progress=True,
     ).success(
-        fn=push_to_argilla,
         inputs=[final_dataset, dataset_name],
         outputs=[final_dataset],
         show_progress=True,
@@ -745,5 +498,3 @@ with get_base_app() as app:
         inputs=[system_prompt, num_turns, num_rows],
         outputs=[pipeline_code],
     )
-    app.load(get_org_dropdown, outputs=[org_name])
-    app.load(fn=swap_visibilty, outputs=main_ui)

 import ast
 from typing import Dict, List, Union
 import argilla as rg
 import pandas as pd
 from datasets import Dataset
 from distilabel.distiset import Distiset
+from huggingface_hub import HfApi
+from src.distilabel_dataset_generator.apps.base import (
+    get_argilla_client,
+    get_main_ui,
+    get_pipeline_code_ui,
+    hide_success_message,
+    push_pipeline_code_to_hub,
+    show_success_message_argilla,
+    show_success_message_hub,
+    validate_argilla_user_workspace_dataset,
+)
+from src.distilabel_dataset_generator.apps.base import (
+    push_dataset_to_hub as push_to_hub_base,
+)
+from src.distilabel_dataset_generator.pipelines.base import (
+    DEFAULT_BATCH_SIZE,
+)
 from src.distilabel_dataset_generator.pipelines.embeddings import (
     get_embeddings,
     get_sentence_embedding_dimensions,
 )
 from src.distilabel_dataset_generator.pipelines.sft import (
     DEFAULT_DATASET_DESCRIPTIONS,
     DEFAULT_DATASETS,
     DEFAULT_SYSTEM_PROMPTS,
     get_prompt_generator,
     get_response_generator,
 )
+def convert_dataframe_messages(dataframe: pd.DataFrame) -> pd.DataFrame:
+    """Parse string-encoded ``messages`` cells back into lists of dicts.
+
+    Only string cells of the ``messages`` column are parsed; other cells, and
+    frames without that column, are left as they are. The parsed column is
+    assigned back onto the frame that was passed in, and that same frame is
+    returned.
+    """
+
+    def convert_to_list_of_dicts(messages: str) -> List[Dict[str, str]]:
+        # Insert a comma after every dict that closes on a role value, then let
+        # ``ast.literal_eval`` read the text as a Python literal.
+        # NOTE(review): presumably the incoming text has no separators between
+        # the dicts -- confirm against what the dataframe component returns.
+        return ast.literal_eval(
+            messages.replace("'user'}", "'user'},")
+            .replace("'system'}", "'system'},")
+            .replace("'assistant'}", "'assistant'},")
         )
+    if "messages" in dataframe.columns:
+        dataframe["messages"] = dataframe["messages"].apply(
+            lambda x: convert_to_list_of_dicts(x) if isinstance(x, str) else x
         )
     return dataframe
+def push_dataset_to_hub(
     dataframe: pd.DataFrame,
     private: bool = True,
     org_name: str = None,
     repo_name: str = None,
+    oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
+):
+    """Convert the ``messages`` column and hand the frame to the base Hub push.
+
+    All arguments are forwarded unchanged to ``push_to_hub_base``. The value
+    returned is the deep copy taken before conversion, so the caller gets the
+    frame back in the form it was passed in.
+    """
+    # Snapshot first: convert_dataframe_messages assigns into the frame it gets.
     original_dataframe = dataframe.copy(deep=True)
+    dataframe = convert_dataframe_messages(dataframe)
+    push_to_hub_base(dataframe, private, org_name, repo_name, oauth_token, progress)
     return original_dataframe
+def push_dataset_to_argilla(
     dataframe: pd.DataFrame,
     dataset_name: str,
+    oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
     original_dataframe = dataframe.copy(deep=True)
+    dataframe = convert_dataframe_messages(dataframe)
     try:
         progress(0.1, desc="Setting up user and workspace")
         client = get_argilla_client()
     return original_dataframe
+def generate_system_prompt(dataset_description, progress=gr.Progress()):
+    """Return a system prompt for ``dataset_description``.
+
+    A description found in ``DEFAULT_DATASET_DESCRIPTIONS`` is answered with the
+    entry at the same index of ``DEFAULT_SYSTEM_PROMPTS`` when that index
+    exists; otherwise the prompt generator is asked to write one.
+    """
+    progress(0.0, desc="Generating system prompt")
+    if dataset_description in DEFAULT_DATASET_DESCRIPTIONS:
+        index = DEFAULT_DATASET_DESCRIPTIONS.index(dataset_description)
+        # Only use the canned prompt when the prompts sequence reaches that index.
+        if index < len(DEFAULT_SYSTEM_PROMPTS):
+            return DEFAULT_SYSTEM_PROMPTS[index]
+    progress(0.3, desc="Initializing text generation")
+    generate_description = get_prompt_generator()
+    progress(0.7, desc="Generating system prompt")
+    # Take the first batch yielded by ``process`` and return the "generation"
+    # field of its first row.
+    result = next(
+        generate_description.process(
+            [
+                {
+                    "system_prompt": PROMPT_CREATION_PROMPT,
+                    "instruction": dataset_description,
+                }
+            ]
         )
+    )[0]["generation"]
+    progress(1.0, desc="System prompt generated")
+    return result
+def generate_dataset(
+    system_prompt: str,
+    num_turns: int = 1,
+    num_rows: int = 5,
+    is_sample: bool = False,
+    progress=gr.Progress(),
+) -> pd.DataFrame:
+    """Generate instructions and responses in batches and return them as a frame.
+
+    Args:
+        system_prompt: Prompt sent with every instruction request and stored in
+            the output (as a column, or as the leading system message).
+        num_turns: ``1`` yields ``system_prompt``/``prompt``/``completion``
+            columns; any other value yields a single ``messages`` column.
+        num_rows: Number of rows to request.
+        is_sample: Forwarded to the generator factories.
+        progress: Progress callback updated throughout both phases.
+
+    Returns:
+        A dataframe holding only the columns for the chosen ``num_turns`` mode.
+    """
+    progress(0.0, desc="(1/2) Generating instructions")
+    magpie_generator = get_magpie_generator(
+        num_turns, num_rows, system_prompt, is_sample
+    )
+    response_generator = get_response_generator(num_turns, system_prompt, is_sample)
+    total_steps: int = num_rows * 2
+    batch_size = DEFAULT_BATCH_SIZE
+    # create instructions
+    n_processed = 0
+    magpie_results = []
+    while n_processed < num_rows:
+        progress(
+            0.5 * n_processed / num_rows,
+            total=total_steps,
+            desc="(1/2) Generating instructions",
         )
+        # Size only this request to the rows still missing. ``batch_size`` must
+        # stay untouched: the response phase slices with it, and shrinking it
+        # here would split the responses into needlessly small batches.
+        current_batch_size = min(batch_size, num_rows - n_processed)
+        inputs = [
+            {"system_prompt": system_prompt} for _ in range(current_batch_size)
+        ]
+        batch = list(magpie_generator.process(inputs=inputs))
+        magpie_results.extend(batch[0])
+        n_processed += current_batch_size
+    progress(0.5, desc="(1/2) Generating instructions")
+    # generate responses
+    n_processed = 0
+    response_results = []
+    if num_turns == 1:
+        while n_processed < num_rows:
+            progress(
+                0.5 + 0.5 * n_processed / num_rows,
+                total=total_steps,
+                desc="(2/2) Generating responses",
             )
+            batch = magpie_results[n_processed : n_processed + batch_size]
+            responses = list(response_generator.process(inputs=batch))
+            response_results.extend(responses[0])
+            n_processed += batch_size
+        # Rename the step outputs to the column names of the single-turn format.
+        for result in response_results:
+            result["prompt"] = result["instruction"]
+            result["completion"] = result["generation"]
+            result["system_prompt"] = system_prompt
+    else:
+        # Put the system message first so it is part of the chat that the
+        # response generator receives.
+        for result in magpie_results:
+            result["conversation"].insert(
+                0, {"role": "system", "content": system_prompt}
             )
+            result["messages"] = result["conversation"]
+        while n_processed < num_rows:
+            progress(
+                0.5 + 0.5 * n_processed / num_rows,
+                total=total_steps,
+                desc="(2/2) Generating responses",
+            )
+            batch = magpie_results[n_processed : n_processed + batch_size]
+            responses = list(response_generator.process(inputs=batch))
+            response_results.extend(responses[0])
+            n_processed += batch_size
+        for result in response_results:
+            result["messages"].append(
+                {"role": "assistant", "content": result["generation"]}
+            )
+    progress(
+        1,
+        total=total_steps,
+        desc="(2/2) Generating responses",
+    )
+    # create distiset
+    distiset_results = []
+    for result in response_results:
+        record = {}
+        for relevant_keys in [
+            "messages",
+            "prompt",
+            "completion",
+            "model_name",
+            "system_prompt",
+        ]:
+            if relevant_keys in result:
+                record[relevant_keys] = result[relevant_keys]
+        distiset_results.append(record)
+    distiset = Distiset(
+        {
+            "default": Dataset.from_list(distiset_results),
+        }
+    )
+    # If not pushing to hub generate the dataset directly
+    distiset = distiset["default"]
+    if num_turns == 1:
+        outputs = distiset.to_pandas()[["system_prompt", "prompt", "completion"]]
+    else:
+        outputs = distiset.to_pandas()[["messages"]]
+    dataframe = pd.DataFrame(outputs)
+    progress(1.0, desc="Dataset generation completed")
+    return dataframe
+# Build the shared UI with the SFT-specific defaults and generator callbacks,
+# and bind every returned component to a module-level name so the trigger
+# wiring further down can reference it.
+(
+    app,
+    main_ui,
+    custom_input_ui,
+    dataset_description,
+    examples,
+    btn_generate_system_prompt,
+    system_prompt,
+    sample_dataset,
+    btn_generate_sample_dataset,
+    dataset_name,
+    add_to_existing_dataset,
+    btn_generate_full_dataset_copy,
+    btn_generate_and_push_to_argilla,
+    btn_push_to_argilla,
+    org_name,
+    repo_name,
+    private,
+    btn_generate_full_dataset,
+    btn_generate_and_push_to_hub,
+    btn_push_to_hub,
+    final_dataset,
+    success_message,
+) = get_main_ui(
+    default_dataset_descriptions=DEFAULT_DATASET_DESCRIPTIONS,
+    default_system_prompts=DEFAULT_SYSTEM_PROMPTS,
+    default_datasets=DEFAULT_DATASETS,
+    fn_generate_system_prompt=generate_system_prompt,
+    fn_generate_dataset=generate_dataset,
+)
+with app:
+    with main_ui:
+        with custom_input_ui:
+            num_turns = gr.Number(
+                value=1,
+                label="Number of turns in the conversation",
+                minimum=1,
+                maximum=4,
+                step=1,
+                info="Choose between 1 (single turn with 'instruction-response' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
+            )
+            num_rows = gr.Number(
+                value=10,
+                label="Number of rows in the dataset",
+                minimum=1,
+                maximum=500,
+                info="The number of rows in the dataset. Note that you are able to generate more rows at once but that this will take time.",
+            )
+        pipeline_code = get_pipeline_code_ui(
+            generate_pipeline_code(system_prompt.value, num_turns.value, num_rows.value)
         )
+    # define app triggers
     gr.on(
         triggers=[
             btn_generate_full_dataset.click,
     )
     btn_generate_and_push_to_argilla.click(
+        fn=validate_argilla_user_workspace_dataset,
         inputs=[dataset_name, final_dataset, add_to_existing_dataset],
         outputs=[final_dataset],
         show_progress=True,
         outputs=[final_dataset],
         show_progress=True,
     ).success(
+        fn=push_dataset_to_argilla,
         inputs=[final_dataset, dataset_name],
         outputs=[final_dataset],
         show_progress=True,
         outputs=[final_dataset],
         show_progress=True,
     ).then(
+        fn=push_dataset_to_hub,
         inputs=[final_dataset, private, org_name, repo_name],
         outputs=[final_dataset],
         show_progress=True,
     ).then(
+        fn=push_pipeline_code_to_hub,
         inputs=[pipeline_code, org_name, repo_name],
         outputs=[],
         show_progress=True,
         fn=hide_success_message,
         outputs=[success_message],
     ).then(
+        fn=push_dataset_to_hub,
         inputs=[final_dataset, private, org_name, repo_name],
         outputs=[final_dataset],
         show_progress=True,
     ).then(
+        fn=push_pipeline_code_to_hub,
         inputs=[pipeline_code, org_name, repo_name],
         outputs=[],
         show_progress=True,
         fn=hide_success_message,
         outputs=[success_message],
     ).success(
+        fn=validate_argilla_user_workspace_dataset,
         inputs=[dataset_name, final_dataset, add_to_existing_dataset],
         outputs=[final_dataset],
         show_progress=True,
     ).success(
+        fn=push_dataset_to_argilla,
         inputs=[final_dataset, dataset_name],
         outputs=[final_dataset],
         show_progress=True,
         inputs=[system_prompt, num_turns, num_rows],
         outputs=[pipeline_code],
     )

src/distilabel_dataset_generator/pipelines/base.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from src.distilabel_dataset_generator.utils import HF_TOKENS
+# Batch size imported by the SFT app for its generation loops.
+DEFAULT_BATCH_SIZE = 5
+# Running counter advanced by _get_next_api_key to rotate through HF_TOKENS.
+TOKEN_INDEX = 0
+# Model id that the SFT pipeline passes as both model_id and tokenizer_id.
+MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+def _get_next_api_key():
+    """Return the next token from ``HF_TOKENS``, cycling round-robin.
+
+    Each call advances the module-level ``TOKEN_INDEX`` by one; the index is
+    wrapped with a modulo so the tokens are reused in order.
+
+    Raises:
+        RuntimeError: If ``HF_TOKENS`` is empty, instead of the opaque
+            ``ZeroDivisionError`` the modulo would otherwise produce.
+    """
+    global TOKEN_INDEX
+    if not HF_TOKENS:
+        raise RuntimeError("No Hugging Face tokens are configured in HF_TOKENS.")
+    api_key = HF_TOKENS[TOKEN_INDEX % len(HF_TOKENS)]
+    TOKEN_INDEX += 1
+    return api_key

src/distilabel_dataset_generator/pipelines/sft.py CHANGED Viewed

@@ -1,12 +1,11 @@
 import pandas as pd
-from datasets import Dataset
-from distilabel.distiset import Distiset
 from distilabel.llms import InferenceEndpointsLLM
-from distilabel.pipeline import Pipeline
-from distilabel.steps import KeepColumns
 from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
-from src.distilabel_dataset_generator.utils import HF_TOKENS
 INFORMATION_SEEKING_PROMPT = (
     "You are an AI assistant designed to provide accurate and concise information on a wide"
@@ -120,7 +119,6 @@ The prompt you write should follow the same style and structure as the following
 User dataset description:
 """
-MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_DATASET_DESCRIPTIONS = (
     "rude customer assistant for a phone company",
     "assistant that solves math puzzles using python",
@@ -157,8 +155,6 @@ _STOP_SEQUENCES = [
     "assistant",
     " \n\n",
 ]
-DEFAULT_BATCH_SIZE = 5
-TOKEN_INDEX = 0
 def _get_output_mappings(num_turns):
@@ -213,13 +209,6 @@ if __name__ == "__main__":
     return code
-def _get_next_api_key():
-    global TOKEN_INDEX
-    api_key = HF_TOKENS[TOKEN_INDEX % len(HF_TOKENS)]
-    TOKEN_INDEX += 1
-    return api_key
 def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
     input_mappings = _get_output_mappings(num_turns)
     output_mappings = input_mappings.copy()
@@ -300,12 +289,9 @@ def get_response_generator(num_turns, system_prompt, is_sample):
 def get_prompt_generator():
-    global TOKEN_INDEX
-    api_key = HF_TOKENS[TOKEN_INDEX % len(HF_TOKENS)]
-    TOKEN_INDEX += 1
     prompt_generator = TextGeneration(
         llm=InferenceEndpointsLLM(
-            api_key=api_key,
             model_id=MODEL,
             tokenizer_id=MODEL,
             generation_kwargs={
@@ -318,95 +304,3 @@ def get_prompt_generator():
     )
     prompt_generator.load()
     return prompt_generator
-def get_pipeline(num_turns, num_rows, system_prompt, is_sample):
-    input_mappings = _get_output_mappings(num_turns)
-    output_mappings = input_mappings
-    with Pipeline(name="sft") as pipeline:
-        magpie = get_magpie_generator(num_turns, num_rows, system_prompt, is_sample)
-        generate_response = get_response_generator(system_prompt, is_sample)
-        keep_columns = KeepColumns(
-            columns=list(output_mappings.values()) + ["model_name"],
-        )
-        magpie.connect(generate_response)
-        generate_response.connect(keep_columns)
-        return pipeline
-if __name__ == "__main__":
-    prompt_generation_step = get_prompt_generator()
-    system_prompt = next(
-        prompt_generation_step.process(
-            [
-                {
-                    "system_prompt": PROMPT_CREATION_PROMPT,
-                    "instruction": DEFAULT_DATASET_DESCRIPTIONS[0],
-                }
-            ]
-        )
-    )[0]["generation"]
-    num_rows = 2
-    num_turns = 1
-    magpie_generator = get_magpie_generator(num_turns, num_rows, system_prompt, False)
-    response_generator = get_response_generator(num_turns, system_prompt, False)
-    total_steps = num_rows * 2
-    batch_size = 5  # Adjust this value as needed
-    # create instructions
-    magpie_results = []
-    for i in range(0, num_rows, batch_size):
-        batch = list(magpie_generator.process())[:batch_size]
-        magpie_results.extend([item[0] for item in batch])
-    # generate responses
-    response_results = []
-    if num_turns == 1:
-        for i in range(0, len(magpie_results), batch_size):
-            batch = magpie_results[i : i + batch_size]
-            batch = [entry[0] for entry in batch]
-            responses = list(response_generator.process(inputs=batch))
-            response_results.extend(responses)
-        for result in response_results:
-            result[0]["prompt"] = result[0]["instruction"]
-            result[0]["completion"] = result[0]["generation"]
-            result[0]["system_prompt"] = system_prompt
-    else:
-        for result in magpie_results:
-            result[0]["conversation"].insert(
-                0, {"role": "system", "content": system_prompt}
-            )
-            result[0]["messages"] = result[0]["conversation"]
-        for i in range(0, len(magpie_results), batch_size):
-            batch = magpie_results[i : i + batch_size]
-            batch = [entry[0] for entry in batch]
-            responses = list(response_generator.process(inputs=batch))
-            response_results.extend(responses)
-        for result in response_results:
-            result[0]["messages"].append(
-                {"role": "assistant", "content": result[0]["generation"]}
-            )
-    distiset_results = []
-    for result in response_results[0]:
-        record = {}
-        for relevant_keys in [
-            "messages",
-            "prompt",
-            "completion",
-            "model_name",
-            "system_prompt",
-        ]:
-            if relevant_keys in result:
-                record[relevant_keys] = result[relevant_keys]
-        distiset_results.append(record)
-    distiset = Distiset(
-        {
-            "default": Dataset.from_list(distiset_results),
-        }
-    )

 import pandas as pd
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
+from src.distilabel_dataset_generator.pipelines.base import (
+    MODEL,
+    _get_next_api_key,
+)
 INFORMATION_SEEKING_PROMPT = (
     "You are an AI assistant designed to provide accurate and concise information on a wide"
 User dataset description:
 """
 DEFAULT_DATASET_DESCRIPTIONS = (
     "rude customer assistant for a phone company",
     "assistant that solves math puzzles using python",
     "assistant",
     " \n\n",
 ]
 def _get_output_mappings(num_turns):
     return code
 def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
     input_mappings = _get_output_mappings(num_turns)
     output_mappings = input_mappings.copy()
 def get_prompt_generator():
     prompt_generator = TextGeneration(
         llm=InferenceEndpointsLLM(
+            api_key=_get_next_api_key(),
             model_id=MODEL,
             tokenizer_id=MODEL,
             generation_kwargs={
     )
     prompt_generator.load()
     return prompt_generator