Spaces:

argilla
/

synthetic-data-generator

Running

App Files Files Community

feat/text-classification

#11

by davidberenstein1957 HF staff - opened Oct 15, 2024

base: refs/heads/main

←

from: refs/pr/11

Discussion Files changed

+2858

-1101

Files changed (14) hide show

.python-version +1 -0
app.py +15 -3
pdm.lock +0 -0
pyproject.toml +4 -2
requirements.txt +5 -2
src/distilabel_dataset_generator/apps/base.py +526 -0
src/distilabel_dataset_generator/apps/faq.py +1 -1
src/distilabel_dataset_generator/apps/sft.py +291 -263
src/distilabel_dataset_generator/apps/textcat.py +548 -0
src/distilabel_dataset_generator/pipelines/base.py +12 -0
src/distilabel_dataset_generator/pipelines/embeddings.py +16 -0
src/distilabel_dataset_generator/pipelines/sft.py +8 -114
src/distilabel_dataset_generator/pipelines/textcat.py +224 -0
src/distilabel_dataset_generator/utils.py +45 -1

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ synthetic-data-generator

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import gradio as gr
 from src.distilabel_dataset_generator.apps.faq import app as faq_app
 from src.distilabel_dataset_generator.apps.sft import app as sft_app
 theme = gr.themes.Monochrome(
     spacing_size="md",
@@ -25,8 +26,8 @@ css = """
 """
 demo = gr.TabbedInterface(
-    [sft_app, faq_app],
-    ["Supervised Fine-Tuning", "FAQ"],
     css=css,
     title="""
     <style>
@@ -54,6 +55,17 @@ demo = gr.TabbedInterface(
                 margin-bottom: 20px;
             }
         }
     </style>
     <div class="header-container">
         <div class="logo-container">
@@ -62,7 +74,7 @@ demo = gr.TabbedInterface(
             </a>
         </div>
         <div class="title-container">
-            <h1 style="margin: 0; font-size: 2em;">🧬  Synthetic Data Generator</h1>
             <p style="margin: 10px 0 0 0; color: #666; font-size: 1.1em;">Build datasets using natural language</p>
         </div>
     </div>

 from src.distilabel_dataset_generator.apps.faq import app as faq_app
 from src.distilabel_dataset_generator.apps.sft import app as sft_app
+from src.distilabel_dataset_generator.apps.textcat import app as textcat_app
 theme = gr.themes.Monochrome(
     spacing_size="md",
 """
 demo = gr.TabbedInterface(
+    [textcat_app, sft_app, faq_app],
+    ["Text Classification", "Supervised Fine-Tuning", "FAQ"],
     css=css,
     title="""
     <style>
                 margin-bottom: 20px;
             }
         }
+        button[role="tab"].selected,
+        button[role="tab"][aria-selected="true"],
+        button[role="tab"][data-tab-id][aria-selected="true"] {
+            background-color: #000000;
+            color: white;
+            border: none;
+            font-size: 16px;
+            font-weight: bold;
+            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
+            transition: background-color 0.3s ease, color 0.3s ease;
+        }
     </style>
     <div class="header-container">
         <div class="logo-container">
             </a>
         </div>
         <div class="title-container">
+            <h1 style="margin: 0; font-size: 2em;">🧬 Synthetic Data Generator</h1>
             <p style="margin: 10px 0 0 0; color: #666; font-size: 1.1em;">Build datasets using natural language</p>
         </div>
     </div>

pdm.lock CHANGED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml CHANGED Viewed

@@ -6,11 +6,13 @@ authors = [
     {name = "davidberenstein1957", email = "[email protected]"},
 ]
 dependencies = [
-    "distilabel[hf-inference-endpoints] @ git+https://github.com/argilla-io/distilabel.git@develop",
     "gradio[oauth]<5,>=4.38",
     "transformers>=4.44.2",
 ]
-requires-python = ">=3.10"
 readme = "README.md"
 license = {text = "apache 2"}

     {name = "davidberenstein1957", email = "[email protected]"},
 ]
 dependencies = [
+    "distilabel[hf-inference-endpoints,argilla]==1.4.0",
     "gradio[oauth]<5,>=4.38",
     "transformers>=4.44.2",
+    "sentence-transformers>=3.2.0",
+    "model2vec>=0.2.4",
 ]
+requires-python = "<3.13,>=3.10"
 readme = "README.md"
 license = {text = "apache 2"}

requirements.txt CHANGED Viewed

@@ -1,4 +1,7 @@
 transformers
 gradio[oauth]
-distilabel[hf-inference-endpoints] @ git+https://github.com/argilla-io/distilabel.git@develop
-beautifulsoup4

 transformers
 gradio[oauth]
+distilabel[hf-inference-endpoints,argilla]
+beautifulsoup4
+sentence-transformers
+model2vec
+outlines

src/distilabel_dataset_generator/apps/base.py ADDED Viewed

	@@ -0,0 +1,526 @@

+import io
+import uuid
+from typing import Any, Callable, List, Tuple, Union
+import argilla as rg
+import gradio as gr
+import pandas as pd
+from datasets import ClassLabel, Dataset, Features, Sequence, Value
+from distilabel.distiset import Distiset
+from gradio import OAuthToken
+from huggingface_hub import HfApi, upload_file
+from src.distilabel_dataset_generator.utils import (
+    _LOGGED_OUT_CSS,
+    get_argilla_client,
+    list_orgs,
+    swap_visibilty,
+    get_login_button,
+)
+# Task identifiers. The helpers below compare their `task` argument against
+# TEXTCAT_TASK to switch on text-classification specific behaviour.
+TEXTCAT_TASK = "text_classification"
+SFT_TASK = "supervised_fine_tuning"
+def get_main_ui(
+    default_dataset_descriptions: List[str],
+    default_system_prompts: List[str],
+    default_datasets: List[pd.DataFrame],
+    fn_generate_system_prompt: Callable,
+    fn_generate_dataset: Callable,
+    task: str,
+):
+    """Build the Gradio Blocks app shared by the task-specific tabs.
+
+    Args:
+        default_dataset_descriptions: Example dataset descriptions shown in the UI.
+        default_system_prompts: System prompts matching the default descriptions.
+        default_datasets: Pre-computed dataframes; the one at the same index as a
+            default system prompt is returned instead of generating a sample.
+        fn_generate_system_prompt: Callback wired to the "generate system prompt"
+            button; receives the dataset description, fills the system prompt.
+        fn_generate_dataset: Callback used to generate the one-row sample dataset.
+        task: Task identifier; compared against ``TEXTCAT_TASK`` to choose the
+            keyword arguments passed to ``fn_generate_dataset``.
+
+    Returns:
+        A tuple with the app, the ``main_ui`` column, the ``custom_input_ui`` row
+        and every component created by the iterate / push-to sub-UIs, in the
+        order listed in the ``return`` statement.
+    """
+
+    def fn_generate_sample_dataset(system_prompt, progress=gr.Progress()):
+        # Serve the pre-computed sample when the prompt is one of the defaults.
+        if system_prompt in default_system_prompts:
+            index = default_system_prompts.index(system_prompt)
+            if index < len(default_datasets):
+                return default_datasets[index]
+        # Otherwise generate a single-row sample with task-specific arguments.
+        if task == TEXTCAT_TASK:
+            result = fn_generate_dataset(
+                system_prompt=system_prompt,
+                difficulty="mixed",
+                clarity="mixed",
+                labels=[],
+                num_labels=1,
+                num_rows=1,
+                progress=progress,
+                is_sample=True,
+            )
+        else:
+            result = fn_generate_dataset(
+                system_prompt=system_prompt,
+                num_turns=1,
+                num_rows=1,
+                progress=progress,
+                is_sample=True,
+            )
+        return result
+    with gr.Blocks(
+        title="🧬 Synthetic Data Generator",
+        head="🧬 Synthetic Data Generator",
+        css=_LOGGED_OUT_CSS,
+    ) as app:
+        with gr.Row():
+            gr.Markdown(
+                "Want to run this locally or with other LLMs? Take a look at the FAQ tab. distilabel Synthetic Data Generator is free, we use the authentication token to push the dataset to the Hugging Face Hub and not for data generation."
+            )
+        # Empty columns on both sides of the login button.
+        with gr.Row():
+            gr.Column()
+            get_login_button()
+            gr.Column()
+        gr.Markdown("## Iterate on a sample dataset")
+        with gr.Column() as main_ui:
+            (
+                dataset_description,
+                examples,
+                btn_generate_system_prompt,
+                system_prompt,
+                sample_dataset,
+                btn_generate_sample_dataset,
+            ) = get_iterate_on_sample_dataset_ui(
+                default_dataset_descriptions=default_dataset_descriptions,
+                default_system_prompts=default_system_prompts,
+                default_datasets=default_datasets,
+                task=task,
+            )
+            gr.Markdown("## Generate full dataset")
+            gr.Markdown(
+                "Once you're satisfied with the sample, generate a larger dataset and push it to Argilla or the Hugging Face Hub."
+            )
+            # Left empty here and returned to the caller.
+            # NOTE(review): presumably filled with task-specific inputs by the
+            # caller - confirm in the task apps.
+            with gr.Row(variant="panel") as custom_input_ui:
+                pass
+            (
+                dataset_name,
+                add_to_existing_dataset,
+                btn_generate_full_dataset_argilla,
+                btn_generate_and_push_to_argilla,
+                btn_push_to_argilla,
+                org_name,
+                repo_name,
+                private,
+                btn_generate_full_dataset,
+                btn_generate_and_push_to_hub,
+                btn_push_to_hub,
+                final_dataset,
+                success_message,
+            ) = get_push_to_ui(default_datasets)
+        # Mirror every sample-dataset change into the final dataset table.
+        sample_dataset.change(
+            fn=lambda x: x,
+            inputs=[sample_dataset],
+            outputs=[final_dataset],
+        )
+        # Generate the system prompt first, then a sample dataset from it.
+        btn_generate_system_prompt.click(
+            fn=fn_generate_system_prompt,
+            inputs=[dataset_description],
+            outputs=[system_prompt],
+            show_progress=True,
+        ).then(
+            fn=fn_generate_sample_dataset,
+            inputs=[system_prompt],
+            outputs=[sample_dataset],
+            show_progress=True,
+        )
+        btn_generate_sample_dataset.click(
+            fn=fn_generate_sample_dataset,
+            inputs=[system_prompt],
+            outputs=[sample_dataset],
+            show_progress=True,
+        )
+        # On page load: update main_ui via swap_visibilty and refresh the
+        # organization dropdown.
+        app.load(fn=swap_visibilty, outputs=main_ui)
+        app.load(get_org_dropdown, outputs=[org_name])
+    return (
+        app,
+        main_ui,
+        custom_input_ui,
+        dataset_description,
+        examples,
+        btn_generate_system_prompt,
+        system_prompt,
+        sample_dataset,
+        btn_generate_sample_dataset,
+        dataset_name,
+        add_to_existing_dataset,
+        btn_generate_full_dataset_argilla,
+        btn_generate_and_push_to_argilla,
+        btn_push_to_argilla,
+        org_name,
+        repo_name,
+        private,
+        btn_generate_full_dataset,
+        btn_generate_and_push_to_hub,
+        btn_push_to_hub,
+        final_dataset,
+        success_message,
+    )
+def validate_argilla_user_workspace_dataset(
+    dataset_name: str,
+    final_dataset: pd.DataFrame,
+    add_to_existing_dataset: bool,
+    oauth_token: Union[OAuthToken, None] = None,
+    progress=gr.Progress(),
+) -> str:
+    progress(0, desc="Validating dataset configuration")
+    hf_user = HfApi().whoami(token=oauth_token.token)["name"]
+    client = get_argilla_client()
+    if dataset_name is None or dataset_name == "":
+        raise gr.Error("Dataset name is required")
+    # Create user if it doesn't exist
+    rg_user = client.users(username=hf_user)
+    if rg_user is None:
+        rg_user = client.users.add(
+            rg.User(username=hf_user, role="admin", password=str(uuid.uuid4()))
+        )
+    # Create workspace if it doesn't exist
+    workspace = client.workspaces(name=hf_user)
+    if workspace is None:
+        workspace = client.workspaces.add(rg.Workspace(name=hf_user))
+        workspace.add_user(hf_user)
+    # Check if dataset exists
+    dataset = client.datasets(name=dataset_name, workspace=hf_user)
+    if dataset and not add_to_existing_dataset:
+        raise gr.Error(f"Dataset {dataset_name} already exists")
+    return final_dataset
+def get_org_dropdown(oauth_token: OAuthToken = None):
+    orgs = list_orgs(oauth_token)
+    return gr.Dropdown(
+        label="Organization",
+        choices=orgs,
+        value=orgs[0] if orgs else None,
+        allow_custom_value=True,
+    )
+def get_push_to_ui(default_datasets):
+    """Build the "push to" section: Argilla tab, Hub tab, dataset table, message.
+
+    Args:
+        default_datasets: Dataframes; forwarded to ``get_final_dataset_row``.
+
+    Returns:
+        A flat tuple with the Argilla tab components, the Hub tab components,
+        the final dataset table and the success message, in that order.
+    """
+    with gr.Column() as push_to_ui:
+        (
+            dataset_name,
+            add_to_existing_dataset,
+            btn_generate_full_dataset_argilla,
+            btn_generate_and_push_to_argilla,
+            btn_push_to_argilla,
+        ) = get_argilla_tab()
+        (
+            org_name,
+            repo_name,
+            private,
+            btn_generate_full_dataset,
+            btn_generate_and_push_to_hub,
+            btn_push_to_hub,
+        ) = get_hf_tab()
+        final_dataset = get_final_dataset_row(default_datasets)
+        success_message = get_success_message_row()
+    return (
+        dataset_name,
+        add_to_existing_dataset,
+        btn_generate_full_dataset_argilla,
+        btn_generate_and_push_to_argilla,
+        btn_push_to_argilla,
+        org_name,
+        repo_name,
+        private,
+        btn_generate_full_dataset,
+        btn_generate_and_push_to_hub,
+        btn_push_to_hub,
+        final_dataset,
+        success_message,
+    )
+def get_iterate_on_sample_dataset_ui(
+    default_dataset_descriptions: List[str],
+    default_system_prompts: List[str],
+    default_datasets: List[pd.DataFrame],
+    task: str,
+):
+    """Build the "iterate on a sample dataset" section of the UI.
+
+    Args:
+        default_dataset_descriptions: Example descriptions; the first one
+            pre-fills the description box and all of them are listed as examples.
+        default_system_prompts: The first one pre-fills the system prompt box.
+        default_datasets: The first dataframe pre-fills the sample table.
+        task: Task identifier; the system prompt box gets 2 lines for
+            ``TEXTCAT_TASK`` and 5 lines otherwise.
+
+    Returns:
+        Tuple of (dataset_description, examples, btn_generate_system_prompt,
+        system_prompt, sample_dataset, btn_generate_sample_dataset).
+    """
+    with gr.Column():
+        dataset_description = gr.TextArea(
+            label="Give a precise description of your desired application. Check the examples for inspiration.",
+            value=default_dataset_descriptions[0],
+            lines=2,
+        )
+        examples = gr.Examples(
+            elem_id="system_prompt_examples",
+            examples=[[example] for example in default_dataset_descriptions],
+            inputs=[dataset_description],
+        )
+        # Empty columns on both sides of the button.
+        with gr.Row():
+            gr.Column(scale=1)
+            btn_generate_system_prompt = gr.Button(
+                value="Generate system prompt and sample dataset"
+            )
+            gr.Column(scale=1)
+        system_prompt = gr.TextArea(
+            label="System prompt for dataset generation. You can tune it and regenerate the sample.",
+            value=default_system_prompts[0],
+            lines=2 if task == TEXTCAT_TASK else 5,
+        )
+        with gr.Row():
+            sample_dataset = gr.Dataframe(
+                value=default_datasets[0],
+                label="Sample dataset. Prompts and completions truncated to 256 tokens.",
+                interactive=False,
+                wrap=True,
+            )
+        with gr.Row():
+            gr.Column(scale=1)
+            btn_generate_sample_dataset = gr.Button(
+                value="Generate sample dataset",
+            )
+            gr.Column(scale=1)
+    return (
+        dataset_description,
+        examples,
+        btn_generate_system_prompt,
+        system_prompt,
+        sample_dataset,
+        btn_generate_sample_dataset,
+    )
+def get_pipeline_code_ui(pipeline_code: str) -> gr.Code:
+    gr.Markdown("## Or run this pipeline locally with distilabel")
+    gr.Markdown(
+        "You can run this pipeline locally with distilabel. For more information, please refer to the [distilabel documentation](https://distilabel.argilla.io/) or go to the FAQ tab at the top of the page for more information."
+    )
+    with gr.Accordion(
+        "Run this pipeline using distilabel",
+        open=False,
+    ):
+        pipeline_code = gr.Code(
+            value=pipeline_code,
+            language="python",
+            label="Distilabel Pipeline Code",
+        )
+    return pipeline_code
+def get_argilla_tab() -> Tuple[Any]:
+    with gr.Tab(label="Argilla"):
+        if get_argilla_client() is not None:
+            with gr.Row(variant="panel"):
+                dataset_name = gr.Textbox(
+                    label="Dataset name",
+                    placeholder="dataset_name",
+                    value="my-distiset",
+                )
+                add_to_existing_dataset = gr.Checkbox(
+                    label="Allow adding records to existing dataset",
+                    info="When selected, you do need to ensure the dataset options are the same as in the existing dataset.",
+                    value=False,
+                    interactive=True,
+                    scale=1,
+                )
+            with gr.Row(variant="panel"):
+                btn_generate_full_dataset_argilla = gr.Button(
+                    value="Generate", variant="primary", scale=2
+                )
+                btn_generate_and_push_to_argilla = gr.Button(
+                    value="Generate and Push to Argilla",
+                    variant="primary",
+                    scale=2,
+                )
+                btn_push_to_argilla = gr.Button(
+                    value="Push to Argilla", variant="primary", scale=2
+                )
+        else:
+            gr.Markdown(
+                "Please add `ARGILLA_API_URL` and `ARGILLA_API_KEY` to use Argilla or export the dataset to the Hugging Face Hub."
+            )
+    return (
+        dataset_name,
+        add_to_existing_dataset,
+        btn_generate_full_dataset_argilla,
+        btn_generate_and_push_to_argilla,
+        btn_push_to_argilla,
+    )
+def get_hf_tab() -> Tuple[Any, ...]:
+    """Build the "Hugging Face Hub" tab and return its input components.
+
+    Returns:
+        Tuple of (org_name, repo_name, private, btn_generate_full_dataset,
+        btn_generate_and_push_to_hub, btn_push_to_hub).
+    """
+    with gr.Tab("Hugging Face Hub"):
+        with gr.Row(variant="panel"):
+            # Built without an OAuth token at construction time.
+            org_name = get_org_dropdown()
+            repo_name = gr.Textbox(
+                label="Repo name",
+                placeholder="dataset_name",
+                value="my-distiset",
+            )
+            private = gr.Checkbox(
+                label="Private dataset",
+                value=True,
+                interactive=True,
+                scale=1,
+            )
+        with gr.Row(variant="panel"):
+            btn_generate_full_dataset = gr.Button(
+                value="Generate", variant="primary", scale=2
+            )
+            btn_generate_and_push_to_hub = gr.Button(
+                value="Generate and Push to Hub", variant="primary", scale=2
+            )
+            btn_push_to_hub = gr.Button(value="Push to Hub", variant="primary", scale=2)
+    return (
+        org_name,
+        repo_name,
+        private,
+        btn_generate_full_dataset,
+        btn_generate_and_push_to_hub,
+        btn_push_to_hub,
+    )
+def push_pipeline_code_to_hub(
+    pipeline_code: str,
+    org_name: str,
+    repo_name: str,
+    oauth_token: Union[OAuthToken, None] = None,
+    progress=gr.Progress(),
+):
+    repo_id = _check_push_to_hub(org_name, repo_name)
+    progress(0.1, desc="Uploading pipeline code")
+    with io.BytesIO(pipeline_code.encode("utf-8")) as f:
+        upload_file(
+            path_or_fileobj=f,
+            path_in_repo="pipeline.py",
+            repo_id=repo_id,
+            repo_type="dataset",
+            token=oauth_token.token,
+            commit_message="Include pipeline script",
+            create_pr=False,
+        )
+    progress(1.0, desc="Pipeline code uploaded")
+def push_dataset_to_hub(
+    dataframe: pd.DataFrame,
+    private: bool = True,
+    org_name: str = None,
+    repo_name: str = None,
+    oauth_token: Union[OAuthToken, None] = None,
+    progress=gr.Progress(),
+    labels: List[str] = None,
+    num_labels: int = None,
+    task: str = TEXTCAT_TASK,
+) -> pd.DataFrame:
+    progress(0.1, desc="Setting up dataset")
+    repo_id = _check_push_to_hub(org_name, repo_name)
+    if task == TEXTCAT_TASK:
+        if num_labels == 1:
+            features = Features(
+                {"text": Value("string"), "label": ClassLabel(names=labels)}
+            )
+        else:
+            features = Features({
+                "text": Value("string"),
+                "labels": Sequence(feature=ClassLabel(names=labels))
+            })
+        distiset = Distiset({
+            "default": Dataset.from_pandas(dataframe, features=features)
+        })
+    else:
+        distiset = Distiset({
+            "default": Dataset.from_pandas(dataframe)
+        })
+    progress(0.2, desc="Pushing dataset to hub")
+    distiset.push_to_hub(
+        repo_id=repo_id,
+        private=private,
+        include_script=False,
+        token=oauth_token.token,
+        create_pr=False,
+    )
+    progress(1.0, desc="Dataset pushed to hub")
+    return dataframe
+def _check_push_to_hub(org_name, repo_name):
+    repo_id = (
+        f"{org_name}/{repo_name}"
+        if repo_name is not None and org_name is not None
+        else None
+    )
+    if repo_id is not None:
+        if not all([repo_id, org_name, repo_name]):
+            raise gr.Error(
+                "Please provide a `repo_name` and `org_name` to push the dataset to."
+            )
+    return repo_id
+def get_final_dataset_row(default_datasets) -> gr.Dataframe:
+    with gr.Row():
+        final_dataset = gr.Dataframe(
+            value=default_datasets[0],
+            label="Generated dataset",
+            interactive=False,
+            wrap=True,
+            min_width=300,
+        )
+    return final_dataset
+def get_success_message_row() -> gr.Markdown:
+    with gr.Row():
+        success_message = gr.Markdown(visible=False)
+    return success_message
+def show_success_message_argilla() -> gr.Markdown:
+    client = get_argilla_client()
+    argilla_api_url = client.api_url
+    return gr.Markdown(
+        value=f"""
+        <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
+            <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
+            <p style="margin-top: 0.5em;">
+                Your dataset is now available at:
+                <a href="{argilla_api_url}" target="_blank" style="color: #1565c0; text-decoration: none;">
+                    {argilla_api_url}
+                </a>
+                <br>Unfamiliar with Argilla? Here are some docs to help you get started:
+                <br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
+                <br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>
+            </p>
+        </div>
+        """,
+        visible=True,
+    )
+def show_success_message_hub(org_name, repo_name) -> gr.Markdown:
+    return gr.Markdown(
+        value=f"""
+        <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
+            <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
+            <p style="margin-top: 0.5em;">
+                The generated dataset is in the right format for fine-tuning with TRL, AutoTrain or other frameworks.
+                Your dataset is now available at:
+                <a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" style="color: #1565c0; text-decoration: none;">
+                    https://huggingface.co/datasets/{org_name}/{repo_name}
+                </a>
+            </p>
+        </div>
+        """,
+        visible=True,
+    )
+def hide_success_message() -> gr.Markdown:
+    """Return a hidden Markdown update, used to clear the success banner."""
+    return gr.Markdown(visible=False)

src/distilabel_dataset_generator/apps/faq.py CHANGED Viewed

@@ -15,7 +15,7 @@ with gr.Blocks() as app:
                     <p>This tool simplifies the process of creating custom datasets, enabling you to:</p>
                     <ul>
                         <li>Define the characteristics of your desired application</li>
-                        <li>Generate system prompts automatically</li>
                         <li>Create sample datasets for quick iteration</li>
                         <li>Produce full-scale datasets with customizable parameters</li>
                         <li>Push your generated datasets directly to the Hugging Face Hub</li>

                     <p>This tool simplifies the process of creating custom datasets, enabling you to:</p>
                     <ul>
                         <li>Define the characteristics of your desired application</li>
+                        <li>Generate system prompts and tasks automatically</li>
                         <li>Create sample datasets for quick iteration</li>
                         <li>Produce full-scale datasets with customizable parameters</li>
                         <li>Push your generated datasets directly to the Hugging Face Hub</li>

src/distilabel_dataset_generator/apps/sft.py CHANGED Viewed

@@ -1,16 +1,34 @@
-import io
-from typing import Union
 import gradio as gr
 import pandas as pd
 from datasets import Dataset
 from distilabel.distiset import Distiset
-from distilabel.steps.tasks.text_generation import TextGeneration
-from gradio.oauth import OAuthToken
-from huggingface_hub import upload_file
-from src.distilabel_dataset_generator.pipelines.sft import (
     DEFAULT_BATCH_SIZE,
     DEFAULT_DATASET_DESCRIPTIONS,
     DEFAULT_DATASETS,
     DEFAULT_SYSTEM_PROMPTS,
@@ -20,11 +38,169 @@ from src.distilabel_dataset_generator.pipelines.sft import (
     get_prompt_generator,
     get_response_generator,
 )
-from src.distilabel_dataset_generator.utils import (
-    get_login_button,
-    get_org_dropdown,
-    swap_visibilty,
-)
 def generate_system_prompt(dataset_description, progress=gr.Progress()):
@@ -35,7 +211,7 @@ def generate_system_prompt(dataset_description, progress=gr.Progress()):
             return DEFAULT_SYSTEM_PROMPTS[index]
     progress(0.3, desc="Initializing text generation")
-    generate_description: TextGeneration = get_prompt_generator()
     progress(0.7, desc="Generating system prompt")
     result = next(
         generate_description.process(
@@ -51,38 +227,13 @@ def generate_system_prompt(dataset_description, progress=gr.Progress()):
     return result
-def generate_sample_dataset(system_prompt, progress=gr.Progress()):
-    if system_prompt in DEFAULT_SYSTEM_PROMPTS:
-        index = DEFAULT_SYSTEM_PROMPTS.index(system_prompt)
-        if index < len(DEFAULT_DATASETS):
-            return DEFAULT_DATASETS[index]
-    result = generate_dataset(
-        system_prompt, num_turns=1, num_rows=1, progress=progress, is_sample=True
-    )
-    return result
-def _check_push_to_hub(org_name, repo_name):
-    repo_id = (
-        f"{org_name}/{repo_name}"
-        if repo_name is not None and org_name is not None
-        else None
-    )
-    if repo_id is not None:
-        if not all([repo_id, org_name, repo_name]):
-            raise gr.Error(
-                "Please provide a `repo_name` and `org_name` to push the dataset to."
-            )
-    return repo_id
 def generate_dataset(
     system_prompt: str,
     num_turns: int = 1,
     num_rows: int = 5,
     is_sample: bool = False,
     progress=gr.Progress(),
-):
     progress(0.0, desc="(1/2) Generating instructions")
     magpie_generator = get_magpie_generator(
         num_turns, num_rows, system_prompt, is_sample
@@ -149,7 +300,7 @@ def generate_dataset(
     progress(
         1,
         total=total_steps,
-        desc="(2/2) Generating responses",
     )
     # create distiset
@@ -184,238 +335,98 @@ def generate_dataset(
     return dataframe
-def push_to_hub(
-    dataframe: pd.DataFrame,
-    private: bool = True,
-    org_name: str = None,
-    repo_name: str = None,
-    oauth_token: Union[OAuthToken, None] = None,
-    progress=gr.Progress(),
-):
-    progress(0.1, desc="Setting up dataset")
-    repo_id = _check_push_to_hub(org_name, repo_name)
-    distiset = Distiset(
-        {
-            "default": Dataset.from_pandas(dataframe),
-        }
-    )
-    progress(0.2, desc="Pushing dataset to hub")
-    distiset.push_to_hub(
-        repo_id=repo_id,
-        private=private,
-        include_script=False,
-        token=oauth_token.token,
-        create_pr=False,
-    )
-    progress(1.0, desc="Dataset pushed to hub")
-    return dataframe
-def upload_pipeline_code(
-    pipeline_code,
     org_name,
     repo_name,
-    oauth_token: Union[OAuthToken, None] = None,
-    progress=gr.Progress(),
-):
-    repo_id = _check_push_to_hub(org_name, repo_name)
-    progress(0.1, desc="Uploading pipeline code")
-    with io.BytesIO(pipeline_code.encode("utf-8")) as f:
-        upload_file(
-            path_or_fileobj=f,
-            path_in_repo="pipeline.py",
-            repo_id=repo_id,
-            repo_type="dataset",
-            token=oauth_token.token,
-            commit_message="Include pipeline script",
-            create_pr=False,
-        )
-    progress(1.0, desc="Pipeline code uploaded")
-css = """
-.main_ui_logged_out{opacity: 0.3; pointer-events: none}
-"""
-with gr.Blocks(
-    title="🧬 Synthetic Data Generator",
-    head="🧬  Synthetic Data Generator",
-    css=css,
-) as app:
-    with gr.Row():
-        gr.Markdown(
-            "Want to run this locally or with other LLMs? Take a look at the FAQ tab. distilabel Synthetic Data Generator is free, we use the authentication token to push the dataset to the Hugging Face Hub and not for data generation."
-        )
-    with gr.Row():
-        gr.Column()
-        get_login_button()
-        gr.Column()
-    gr.Markdown("## Iterate on a sample dataset")
-    with gr.Column() as main_ui:
-        dataset_description = gr.TextArea(
-            label="Give a precise description of the assistant or tool. Don't describe the dataset",
-            value=DEFAULT_DATASET_DESCRIPTIONS[0],
-            lines=2,
-        )
-        examples = gr.Examples(
-            elem_id="system_prompt_examples",
-            examples=[[example] for example in DEFAULT_DATASET_DESCRIPTIONS],
-            inputs=[dataset_description],
-        )
-        with gr.Row():
-            gr.Column(scale=1)
-            btn_generate_system_prompt = gr.Button(
-                value="Generate system prompt and sample dataset"
-            )
-            gr.Column(scale=1)
-        system_prompt = gr.TextArea(
-            label="System prompt for dataset generation. You can tune it and regenerate the sample",
-            value=DEFAULT_SYSTEM_PROMPTS[0],
-            lines=5,
-        )
-        with gr.Row():
-            sample_dataset = gr.Dataframe(
-                value=DEFAULT_DATASETS[0],
-                label="Sample dataset. Prompts and completions truncated to 256 tokens.",
-                interactive=False,
-                wrap=True,
             )
-        with gr.Row():
-            gr.Column(scale=1)
-            btn_generate_sample_dataset = gr.Button(
-                value="Generate sample dataset",
             )
-            gr.Column(scale=1)
-        result = btn_generate_system_prompt.click(
-            fn=generate_system_prompt,
-            inputs=[dataset_description],
-            outputs=[system_prompt],
-            show_progress=True,
-        ).then(
-            fn=generate_sample_dataset,
-            inputs=[system_prompt],
-            outputs=[sample_dataset],
-            show_progress=True,
-        )
-        btn_generate_sample_dataset.click(
-            fn=generate_sample_dataset,
-            inputs=[system_prompt],
-            outputs=[sample_dataset],
-            show_progress=True,
-        )
-        # Add a header for the full dataset generation section
-        gr.Markdown("## Generate full dataset")
-        gr.Markdown(
-            "Once you're satisfied with the sample, generate a larger dataset and push it to the Hub."
-        )
-        with gr.Column() as push_to_hub_ui:
-            with gr.Row(variant="panel"):
-                num_turns = gr.Number(
-                    value=1,
-                    label="Number of turns in the conversation",
-                    minimum=1,
-                    maximum=4,
-                    step=1,
-                    info="Choose between 1 (single turn with 'instruction-response' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
-                )
-                num_rows = gr.Number(
-                    value=10,
-                    label="Number of rows in the dataset",
-                    minimum=1,
-                    maximum=500,
-                    info="The number of rows in the dataset. Note that you are able to generate more rows at once but that this will take time.",
-                )
-            with gr.Row(variant="panel"):
-                org_name = get_org_dropdown()
-                repo_name = gr.Textbox(
-                    label="Repo name", placeholder="dataset_name", value="my-distiset"
-                )
-                private = gr.Checkbox(
-                    label="Private dataset",
-                    value=True,
-                    interactive=True,
-                    scale=0.5,
-                )
-            with gr.Row() as regenerate_row:
-                btn_generate_full_dataset = gr.Button(
-                    value="Generate", variant="primary", scale=2
-                )
-                btn_generate_and_push_to_hub = gr.Button(
-                    value="Generate and Push to Hub", variant="primary", scale=2
-                )
-                btn_push_to_hub = gr.Button(
-                    value="Push to Hub", variant="primary", scale=2
-                )
-            with gr.Row():
-                final_dataset = gr.Dataframe(
-                    value=DEFAULT_DATASETS[0],
-                    label="Generated dataset",
-                    interactive=False,
-                    wrap=True,
-                )
-            with gr.Row():
-                success_message = gr.Markdown(visible=False)
-    def show_success_message(org_name, repo_name):
-        return gr.Markdown(
-            value=f"""
-            <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
-                <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
-                <p style="margin-top: 0.5em;">
-                    The generated dataset is in the right format for fine-tuning with TRL, AutoTrain or other frameworks.
-                    Your dataset is now available at:
-                    <a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" style="color: #1565c0; text-decoration: none;">
-                        https://huggingface.co/datasets/{org_name}/{repo_name}
-                    </a>
-                </p>
-            </div>
-        """,
-            visible=True,
-        )
-    def hide_success_message():
-        return gr.Markdown(visible=False)
-    gr.Markdown("## Or run this pipeline locally with distilabel")
-    gr.Markdown(
-        "You can run this pipeline locally with distilabel. For more information, please refer to the [distilabel documentation](https://distilabel.argilla.io/) or go to the FAQ tab at the top of the page for more information."
-    )
-    with gr.Accordion(
-        "Run this pipeline using distilabel",
-        open=False,
-    ):
-        pipeline_code = gr.Code(
-            value=generate_pipeline_code(
-                system_prompt.value, num_turns.value, num_rows.value
-            ),
-            language="python",
-            label="Distilabel Pipeline Code",
         )
-    sample_dataset.change(
-        fn=lambda x: x,
-        inputs=[sample_dataset],
         outputs=[final_dataset],
     )
-    btn_generate_full_dataset.click(
         fn=hide_success_message,
         outputs=[success_message],
-    ).then(
         fn=generate_dataset,
         inputs=[system_prompt, num_turns, num_rows],
         outputs=[final_dataset],
         show_progress=True,
     )
     btn_generate_and_push_to_hub.click(
@@ -427,17 +438,17 @@ with gr.Blocks(
         outputs=[final_dataset],
         show_progress=True,
     ).then(
-        fn=push_to_hub,
         inputs=[final_dataset, private, org_name, repo_name],
         outputs=[final_dataset],
         show_progress=True,
     ).then(
-        fn=upload_pipeline_code,
         inputs=[pipeline_code, org_name, repo_name],
         outputs=[],
         show_progress=True,
     ).success(
-        fn=show_success_message,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     )
@@ -446,21 +457,40 @@ with gr.Blocks(
         fn=hide_success_message,
         outputs=[success_message],
     ).then(
-        fn=push_to_hub,
         inputs=[final_dataset, private, org_name, repo_name],
         outputs=[final_dataset],
         show_progress=True,
     ).then(
-        fn=upload_pipeline_code,
         inputs=[pipeline_code, org_name, repo_name],
         outputs=[],
         show_progress=True,
     ).success(
-        fn=show_success_message,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     )
     system_prompt.change(
         fn=generate_pipeline_code,
         inputs=[system_prompt, num_turns, num_rows],
@@ -476,5 +506,3 @@ with gr.Blocks(
         inputs=[system_prompt, num_turns, num_rows],
         outputs=[pipeline_code],
     )
-    app.load(get_org_dropdown, outputs=[org_name])
-    app.load(fn=swap_visibilty, outputs=main_ui)

+import ast
+from typing import Dict, List, Union
+import argilla as rg
 import gradio as gr
 import pandas as pd
 from datasets import Dataset
 from distilabel.distiset import Distiset
+from huggingface_hub import HfApi
+from src.distilabel_dataset_generator.apps.base import (
+    get_argilla_client,
+    get_main_ui,
+    get_pipeline_code_ui,
+    hide_success_message,
+    push_pipeline_code_to_hub,
+    show_success_message_argilla,
+    show_success_message_hub,
+    validate_argilla_user_workspace_dataset,
+)
+from src.distilabel_dataset_generator.apps.base import (
+    push_dataset_to_hub as push_to_hub_base,
+)
+from src.distilabel_dataset_generator.pipelines.base import (
     DEFAULT_BATCH_SIZE,
+)
+from src.distilabel_dataset_generator.pipelines.embeddings import (
+    get_embeddings,
+    get_sentence_embedding_dimensions,
+)
+from src.distilabel_dataset_generator.pipelines.sft import (
     DEFAULT_DATASET_DESCRIPTIONS,
     DEFAULT_DATASETS,
     DEFAULT_SYSTEM_PROMPTS,
     get_prompt_generator,
     get_response_generator,
 )
+TASK = "supervised_fine_tuning"
+def convert_dataframe_messages(dataframe: pd.DataFrame) -> pd.DataFrame:
+    def convert_to_list_of_dicts(messages: str) -> List[Dict[str, str]]:
+        return ast.literal_eval(
+            messages.replace("'user'}", "'user'},")
+            .replace("'system'}", "'system'},")
+            .replace("'assistant'}", "'assistant'},")
+        )
+    if "messages" in dataframe.columns:
+        dataframe["messages"] = dataframe["messages"].apply(
+            lambda x: convert_to_list_of_dicts(x) if isinstance(x, str) else x
+        )
+    return dataframe
+def push_dataset_to_hub(
+    dataframe: pd.DataFrame,
+    private: bool = True,
+    org_name: str = None,
+    repo_name: str = None,
+    oauth_token: Union[gr.OAuthToken, None] = None,
+    progress=gr.Progress(),
+):
+    original_dataframe = dataframe.copy(deep=True)
+    dataframe = convert_dataframe_messages(dataframe)
+    try:
+        push_to_hub_base(
+            dataframe, private, org_name, repo_name, oauth_token, progress, task=TASK
+        )
+    except Exception as e:
+        raise gr.Error(f"Error pushing dataset to the Hub: {e}")
+    return original_dataframe
+def push_dataset_to_argilla(
+    dataframe: pd.DataFrame,
+    dataset_name: str,
+    oauth_token: Union[gr.OAuthToken, None] = None,
+    progress=gr.Progress(),
+) -> pd.DataFrame:
+    """Log the SFT dataframe to an Argilla dataset in the user's workspace.
+    A frame with a ``messages`` column is logged as a chat dataset; otherwise
+    the ``prompt`` / ``completion`` columns are used. Length metadata and an
+    embedding column are added to the frame before logging.
+    Returns a deep copy of the dataframe taken before any conversion.
+    Raises ``gr.Error`` for any exception raised while pushing.
+    """
+    original_dataframe = dataframe.copy(deep=True)
+    dataframe = convert_dataframe_messages(dataframe)
+    try:
+        progress(0.1, desc="Setting up user and workspace")
+        client = get_argilla_client()
+        # The Hugging Face user name is used as the Argilla workspace name below.
+        hf_user = HfApi().whoami(token=oauth_token.token)["name"]
+        if "messages" in dataframe.columns:
+            # Multi-turn layout: one chat field holding the whole conversation.
+            settings = rg.Settings(
+                fields=[
+                    rg.ChatField(
+                        name="messages",
+                        description="The messages in the conversation",
+                        title="Messages",
+                    ),
+                ],
+                questions=[
+                    rg.RatingQuestion(
+                        name="rating",
+                        title="Rating",
+                        description="The rating of the conversation",
+                        values=list(range(1, 6)),
+                    ),
+                ],
+                metadata=[
+                    rg.IntegerMetadataProperty(
+                        name="user_message_length", title="User Message Length"
+                    ),
+                    rg.IntegerMetadataProperty(
+                        name="assistant_message_length",
+                        title="Assistant Message Length",
+                    ),
+                ],
+                vectors=[
+                    rg.VectorField(
+                        name="messages_embeddings",
+                        dimensions=get_sentence_embedding_dimensions(),
+                    )
+                ],
+                guidelines="Please review the conversation and provide a score for the assistant's response.",
+            )
+            # Total characters written by each role across the conversation.
+            dataframe["user_message_length"] = dataframe["messages"].apply(
+                lambda x: sum([len(y["content"]) for y in x if y["role"] == "user"])
+            )
+            dataframe["assistant_message_length"] = dataframe["messages"].apply(
+                lambda x: sum(
+                    [len(y["content"]) for y in x if y["role"] == "assistant"]
+                )
+            )
+            # The embedding input is every message content joined by spaces.
+            dataframe["messages_embeddings"] = get_embeddings(
+                dataframe["messages"].apply(
+                    lambda x: " ".join([y["content"] for y in x])
+                )
+            )
+        else:
+            # Single-turn layout: separate system prompt / prompt / completion fields.
+            settings = rg.Settings(
+                fields=[
+                    rg.TextField(
+                        name="system_prompt",
+                        title="System Prompt",
+                        description="The system prompt used for the conversation",
+                        required=False,
+                    ),
+                    rg.TextField(
+                        name="prompt",
+                        title="Prompt",
+                        description="The prompt used for the conversation",
+                    ),
+                    rg.TextField(
+                        name="completion",
+                        title="Completion",
+                        description="The completion from the assistant",
+                    ),
+                ],
+                questions=[
+                    rg.RatingQuestion(
+                        name="rating",
+                        title="Rating",
+                        description="The rating of the conversation",
+                        values=list(range(1, 6)),
+                    ),
+                ],
+                metadata=[
+                    rg.IntegerMetadataProperty(
+                        name="prompt_length", title="Prompt Length"
+                    ),
+                    rg.IntegerMetadataProperty(
+                        name="completion_length", title="Completion Length"
+                    ),
+                ],
+                vectors=[
+                    rg.VectorField(
+                        name="prompt_embeddings",
+                        dimensions=get_sentence_embedding_dimensions(),
+                    )
+                ],
+                guidelines="Please review the conversation and correct the prompt and completion where needed.",
+            )
+            dataframe["prompt_length"] = dataframe["prompt"].apply(len)
+            dataframe["completion_length"] = dataframe["completion"].apply(len)
+            dataframe["prompt_embeddings"] = get_embeddings(dataframe["prompt"])
+        progress(0.5, desc="Creating dataset")
+        # Reuse the dataset when the lookup finds one; the settings built above
+        # are only applied when a new dataset is created.
+        rg_dataset = client.datasets(name=dataset_name, workspace=hf_user)
+        if rg_dataset is None:
+            rg_dataset = rg.Dataset(
+                name=dataset_name,
+                workspace=hf_user,
+                settings=settings,
+                client=client,
+            )
+            rg_dataset = rg_dataset.create()
+        progress(0.7, desc="Pushing dataset to Argilla")
+        hf_dataset = Dataset.from_pandas(dataframe)
+        rg_dataset.records.log(records=hf_dataset)
+        progress(1.0, desc="Dataset pushed to Argilla")
+    except Exception as e:
+        raise gr.Error(f"Error pushing dataset to Argilla: {e}")
+    return original_dataframe
 def generate_system_prompt(dataset_description, progress=gr.Progress()):
             return DEFAULT_SYSTEM_PROMPTS[index]
     progress(0.3, desc="Initializing text generation")
+    generate_description = get_prompt_generator()
     progress(0.7, desc="Generating system prompt")
     result = next(
         generate_description.process(
     return result
 def generate_dataset(
     system_prompt: str,
     num_turns: int = 1,
     num_rows: int = 5,
     is_sample: bool = False,
     progress=gr.Progress(),
+) -> pd.DataFrame:
     progress(0.0, desc="(1/2) Generating instructions")
     magpie_generator = get_magpie_generator(
         num_turns, num_rows, system_prompt, is_sample
     progress(
         1,
         total=total_steps,
+        desc="(2/2) Creating dataset",
     )
     # create distiset
     return dataframe
+(
+    app,
+    main_ui,
+    custom_input_ui,
+    dataset_description,
+    examples,
+    btn_generate_system_prompt,
+    system_prompt,
+    sample_dataset,
+    btn_generate_sample_dataset,
+    dataset_name,
+    add_to_existing_dataset,
+    btn_generate_full_dataset_argilla,
+    btn_generate_and_push_to_argilla,
+    btn_push_to_argilla,
     org_name,
     repo_name,
+    private,
+    btn_generate_full_dataset,
+    btn_generate_and_push_to_hub,
+    btn_push_to_hub,
+    final_dataset,
+    success_message,
+) = get_main_ui(
+    default_dataset_descriptions=DEFAULT_DATASET_DESCRIPTIONS,
+    default_system_prompts=DEFAULT_SYSTEM_PROMPTS,
+    default_datasets=DEFAULT_DATASETS,
+    fn_generate_system_prompt=generate_system_prompt,
+    fn_generate_dataset=generate_dataset,
+    task=TASK,
+)
+with app:
+    with main_ui:
+        with custom_input_ui:
+            num_turns = gr.Number(
+                value=1,
+                label="Number of turns in the conversation",
+                minimum=1,
+                maximum=4,
+                step=1,
+                info="Choose between 1 (single turn with 'instruction-response' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
             )
+            num_rows = gr.Number(
+                value=10,
+                label="Number of rows in the dataset",
+                minimum=1,
+                maximum=500,
+                info="The number of rows in the dataset. Note that you are able to generate more rows at once but that this will take time.",
             )
+        pipeline_code = get_pipeline_code_ui(
+            generate_pipeline_code(system_prompt.value, num_turns.value, num_rows.value)
         )
+    # define app triggers
+    gr.on(
+        triggers=[
+            btn_generate_full_dataset.click,
+            btn_generate_full_dataset_argilla.click,
+        ],
+        fn=hide_success_message,
+        outputs=[success_message],
+    ).then(
+        fn=generate_dataset,
+        inputs=[system_prompt, num_turns, num_rows],
         outputs=[final_dataset],
+        show_progress=True,
     )
+    btn_generate_and_push_to_argilla.click(
+        fn=validate_argilla_user_workspace_dataset,
+        inputs=[dataset_name, final_dataset, add_to_existing_dataset],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
         fn=hide_success_message,
         outputs=[success_message],
+    ).success(
         fn=generate_dataset,
         inputs=[system_prompt, num_turns, num_rows],
         outputs=[final_dataset],
         show_progress=True,
+    ).success(
+        fn=push_dataset_to_argilla,
+        inputs=[final_dataset, dataset_name],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=show_success_message_argilla,
+        inputs=[],
+        outputs=[success_message],
     )
     btn_generate_and_push_to_hub.click(
         outputs=[final_dataset],
         show_progress=True,
     ).then(
+        fn=push_dataset_to_hub,
         inputs=[final_dataset, private, org_name, repo_name],
         outputs=[final_dataset],
         show_progress=True,
     ).then(
+        fn=push_pipeline_code_to_hub,
         inputs=[pipeline_code, org_name, repo_name],
         outputs=[],
         show_progress=True,
     ).success(
+        fn=show_success_message_hub,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     )
         fn=hide_success_message,
         outputs=[success_message],
     ).then(
+        fn=push_dataset_to_hub,
         inputs=[final_dataset, private, org_name, repo_name],
         outputs=[final_dataset],
         show_progress=True,
     ).then(
+        fn=push_pipeline_code_to_hub,
         inputs=[pipeline_code, org_name, repo_name],
         outputs=[],
         show_progress=True,
     ).success(
+        fn=show_success_message_hub,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     )
+    btn_push_to_argilla.click(
+        fn=hide_success_message,
+        outputs=[success_message],
+    ).success(
+        fn=validate_argilla_user_workspace_dataset,
+        inputs=[dataset_name, final_dataset, add_to_existing_dataset],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=push_dataset_to_argilla,
+        inputs=[final_dataset, dataset_name],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=show_success_message_argilla,
+        inputs=[],
+        outputs=[success_message],
+    )
     system_prompt.change(
         fn=generate_pipeline_code,
         inputs=[system_prompt, num_turns, num_rows],
         inputs=[system_prompt, num_turns, num_rows],
         outputs=[pipeline_code],
     )

src/distilabel_dataset_generator/apps/textcat.py ADDED Viewed

	@@ -0,0 +1,548 @@

+import re
+from typing import List, Union
+import argilla as rg
+import gradio as gr
+import pandas as pd
+from datasets import Dataset
+from huggingface_hub import HfApi
+from src.distilabel_dataset_generator.apps.base import (
+    get_argilla_client,
+    get_main_ui,
+    get_pipeline_code_ui,
+    hide_success_message,
+    push_pipeline_code_to_hub,
+    show_success_message_argilla,
+    show_success_message_hub,
+    validate_argilla_user_workspace_dataset,
+)
+from src.distilabel_dataset_generator.apps.base import (
+    push_dataset_to_hub as push_to_hub_base,
+)
+from src.distilabel_dataset_generator.pipelines.base import (
+    DEFAULT_BATCH_SIZE,
+)
+from src.distilabel_dataset_generator.pipelines.embeddings import (
+    get_embeddings,
+    get_sentence_embedding_dimensions,
+)
+from src.distilabel_dataset_generator.pipelines.textcat import (
+    DEFAULT_DATASET_DESCRIPTIONS,
+    DEFAULT_DATASETS,
+    DEFAULT_SYSTEM_PROMPTS,
+    PROMPT_CREATION_PROMPT,
+    generate_pipeline_code,
+    get_labeller_generator,
+    get_prompt_generator,
+    get_textcat_generator,
+)
+from src.distilabel_dataset_generator.utils import get_preprocess_labels
+TASK = "text_classification"
+def push_dataset_to_hub(
+    dataframe: pd.DataFrame,
+    private: bool = True,
+    org_name: str = None,
+    repo_name: str = None,
+    oauth_token: Union[gr.OAuthToken, None] = None,
+    progress=gr.Progress(),
+    labels: List[str] = None,
+    num_labels: int = 1,
+):
+    original_dataframe = dataframe.copy(deep=True)
+    labels = get_preprocess_labels(labels)
+    try:
+        push_to_hub_base(
+            dataframe,
+            private,
+            org_name,
+            repo_name,
+            oauth_token,
+            progress,
+            labels,
+            num_labels,
+            task=TASK,
+        )
+    except Exception as e:
+        raise gr.Error(f"Error pushing dataset to the Hub: {e}")
+    return original_dataframe
+def push_dataset_to_argilla(
+    dataframe: pd.DataFrame,
+    dataset_name: str,
+    oauth_token: Union[gr.OAuthToken, None] = None,
+    progress=gr.Progress(),
+    num_labels: int = 1,
+    labels: List[str] = None,
+) -> pd.DataFrame:
+    original_dataframe = dataframe.copy(deep=True)
+    try:
+        progress(0.1, desc="Setting up user and workspace")
+        client = get_argilla_client()
+        hf_user = HfApi().whoami(token=oauth_token.token)["name"]
+        labels = get_preprocess_labels(labels)
+        settings = rg.Settings(
+            fields=[
+                rg.TextField(
+                    name="text",
+                    description="The text classification data",
+                    title="Text",
+                ),
+            ],
+            questions=[
+                (
+                    rg.LabelQuestion(
+                        name="label",
+                        title="Label",
+                        description="The label of the text",
+                        labels=labels,
+                    )
+                    if num_labels == 1
+                    else rg.MultiLabelQuestion(
+                        name="labels",
+                        title="Labels",
+                        description="The labels of the conversation",
+                        labels=labels,
+                    )
+                ),
+            ],
+            metadata=[
+                rg.IntegerMetadataProperty(name="text_length", title="Text Length"),
+            ],
+            vectors=[
+                rg.VectorField(
+                    name="text_embeddings",
+                    dimensions=get_sentence_embedding_dimensions(),
+                )
+            ],
+            guidelines="Please review the text and provide or correct the label where needed.",
+        )
+        dataframe["text_length"] = dataframe["text"].apply(len)
+        dataframe["text_embeddings"] = get_embeddings(dataframe["text"])
+        progress(0.5, desc="Creating dataset")
+        rg_dataset = client.datasets(name=dataset_name, workspace=hf_user)
+        if rg_dataset is None:
+            rg_dataset = rg.Dataset(
+                name=dataset_name,
+                workspace=hf_user,
+                settings=settings,
+                client=client,
+            )
+            rg_dataset = rg_dataset.create()
+        progress(0.7, desc="Pushing dataset to Argilla")
+        hf_dataset = Dataset.from_pandas(dataframe)
+        records = [
+            rg.Record(
+                fields={
+                    "text": sample["text"],
+                },
+                metadata={"text_length": sample["text_length"]},
+                vectors={"text_embeddings": sample["text_embeddings"]},
+                suggestions=(
+                    [
+                        rg.Suggestion(
+                            question_name="label" if num_labels == 1 else "labels",
+                            value=(
+                                sample["label"] if num_labels == 1 else sample["labels"]
+                            ),
+                        )
+                    ]
+                    if (
+                        (num_labels == 1 and sample["label"] in labels)
+                        or (
+                            num_labels > 1
+                            and all(label in labels for label in sample["labels"])
+                        )
+                    )
+                    else []
+                ),
+            )
+            for sample in hf_dataset
+        ]
+        rg_dataset.records.log(records=records)
+        progress(1.0, desc="Dataset pushed to Argilla")
+    except Exception as e:
+        raise gr.Error(f"Error pushing dataset to Argilla: {e}")
+    return original_dataframe
+def generate_system_prompt(dataset_description, progress=gr.Progress()):
+    progress(0.0, desc="Generating text classification task")
+    if dataset_description in DEFAULT_DATASET_DESCRIPTIONS:
+        index = DEFAULT_DATASET_DESCRIPTIONS.index(dataset_description)
+        if index < len(DEFAULT_SYSTEM_PROMPTS):
+            return DEFAULT_SYSTEM_PROMPTS[index]
+    progress(0.3, desc="Initializing text generation")
+    generate_description = get_prompt_generator()
+    progress(0.7, desc="Generating text classification task")
+    result = next(
+        generate_description.process(
+            [
+                {
+                    "system_prompt": PROMPT_CREATION_PROMPT,
+                    "instruction": dataset_description,
+                }
+            ]
+        )
+    )[0]["generation"]
+    progress(1.0, desc="Text classification task generated")
+    return result
+def generate_dataset(
+    system_prompt: str,
+    difficulty: str,
+    clarity: str,
+    labels: List[str] = None,
+    num_labels: int = 1,
+    num_rows: int = 10,
+    is_sample: bool = False,
+    progress=gr.Progress(),
+) -> pd.DataFrame:
+    progress(0.0, desc="(1/2) Generating text classification data")
+    labels = get_preprocess_labels(labels)
+    textcat_generator = get_textcat_generator(
+        difficulty=difficulty, clarity=clarity, is_sample=is_sample
+    )
+    labeller_generator = get_labeller_generator(
+        system_prompt=system_prompt,
+        labels=labels,
+        num_labels=num_labels,
+        is_sample=is_sample,
+    )
+    total_steps: int = num_rows * 2
+    batch_size = DEFAULT_BATCH_SIZE
+    # create text classification data
+    n_processed = 0
+    textcat_results = []
+    while n_processed < num_rows:
+        progress(
+            0.5 * n_processed / num_rows,
+            total=total_steps,
+            desc="(1/2) Generating text classification data",
+        )
+        remaining_rows = num_rows - n_processed
+        batch_size = min(batch_size, remaining_rows)
+        inputs = [{"task": system_prompt} for _ in range(batch_size)]
+        batch = list(textcat_generator.process(inputs=inputs))
+        textcat_results.extend(batch[0])
+        n_processed += batch_size
+    for result in textcat_results:
+        result["text"] = result["input_text"]
+    # label text classification data
+    progress(0.5, desc="(1/2) Generating text classification data")
+    if not is_sample:
+        n_processed = 0
+        labeller_results = []
+        while n_processed < num_rows:
+            progress(
+                0.5 + 0.5 * n_processed / num_rows,
+                total=total_steps,
+                desc="(1/2) Labeling text classification data",
+            )
+            batch = textcat_results[n_processed : n_processed + batch_size]
+            labels_batch = list(labeller_generator.process(inputs=batch))
+            labeller_results.extend(labels_batch[0])
+            n_processed += batch_size
+        progress(
+            1,
+            total=total_steps,
+            desc="(2/2) Creating dataset",
+        )
+    # create final dataset
+    distiset_results = []
+    source_results = textcat_results if is_sample else labeller_results
+    for result in source_results:
+        record = {
+            key: result[key]
+            for key in ["text", "label" if is_sample else "labels"]
+            if key in result
+        }
+        distiset_results.append(record)
+    dataframe = pd.DataFrame(distiset_results)
+    if not is_sample:
+        if num_labels == 1:
+            dataframe = dataframe.rename(columns={"labels": "label"})
+            dataframe["label"] = dataframe["label"].apply(
+                lambda x: x.lower().strip() if x.lower().strip() in labels else None
+            )
+        else:
+            dataframe["labels"] = dataframe["labels"].apply(
+                lambda x: (
+                    [
+                        label.lower().strip()
+                        for label in x
+                        if label.lower().strip() in labels
+                    ]
+                    if isinstance(x, list)
+                    else None
+                )
+            )
+    progress(1.0, desc="Dataset generation completed")
+    return dataframe
+def update_suggested_labels(system_prompt):
+    new_labels = re.findall(r"'(\b[\w-]+\b)'", system_prompt)
+    if not new_labels:
+        return gr.Warning(
+            "No labels found in the system prompt. Please add labels manually."
+        )
+    return gr.update(choices=new_labels, value=new_labels)
+def validate_input_labels(labels):
+    if not labels or len(labels) < 2:
+        raise gr.Error(
+            f"Please select at least 2 labels to classify your text. You selected {len(labels) if labels else 0}."
+        )
+    return labels
+# Build the shared UI for the text-classification task and unpack the
+# components that the event wiring below refers to. The order of the names
+# must match the tuple returned by get_main_ui.
+(
+    app,
+    main_ui,
+    custom_input_ui,
+    dataset_description,
+    examples,
+    btn_generate_system_prompt,
+    system_prompt,
+    sample_dataset,
+    btn_generate_sample_dataset,
+    dataset_name,
+    add_to_existing_dataset,
+    btn_generate_full_dataset_argilla,
+    btn_generate_and_push_to_argilla,
+    btn_push_to_argilla,
+    org_name,
+    repo_name,
+    private,
+    btn_generate_full_dataset,
+    btn_generate_and_push_to_hub,
+    btn_push_to_hub,
+    final_dataset,
+    success_message,
+) = get_main_ui(
+    default_dataset_descriptions=DEFAULT_DATASET_DESCRIPTIONS,
+    default_system_prompts=DEFAULT_SYSTEM_PROMPTS,
+    default_datasets=DEFAULT_DATASETS,
+    fn_generate_system_prompt=generate_system_prompt,
+    fn_generate_dataset=generate_dataset,
+    task=TASK,
+)
+with app:
+    with main_ui:
+        with custom_input_ui:
+            difficulty = gr.Dropdown(
+                choices=[
+                    ("High School", "high school"),
+                    ("College", "college"),
+                    ("PhD", "PhD"),
+                    ("Mixed", "mixed"),
+                ],
+                value="mixed",
+                label="Difficulty",
+                info="The difficulty of the text to be generated.",
+            )
+            clarity = gr.Dropdown(
+                choices=[
+                    ("Clear", "clear"),
+                    (
+                        "Understandable",
+                        "understandable with some effort",
+                    ),
+                    ("Ambiguous", "ambiguous"),
+                    ("Mixed", "mixed"),
+                ],
+                value="mixed",
+                label="Clarity",
+                info="The clarity of the text to be generated.",
+            )
+            with gr.Column():
+                labels = gr.Dropdown(
+                    choices=[],
+                    allow_custom_value=True,
+                    interactive=True,
+                    label="Labels",
+                    multiselect=True,
+                    info="Add the labels to classify the text.",
+                )
+                with gr.Blocks():
+                    btn_suggested_labels = gr.Button(
+                        value="Add suggested labels",
+                        size="sm",
+                    )
+            num_labels = gr.Number(
+                label="Number of labels",
+                value=1,
+                minimum=1,
+                maximum=10,
+                info="The number of labels to classify the text.",
+            )
+            num_rows = gr.Number(
+                label="Number of rows",
+                value=10,
+                minimum=1,
+                maximum=500,
+                info="More rows will take longer to generate.",
+            )
+        pipeline_code = get_pipeline_code_ui(
+            generate_pipeline_code(
+                system_prompt.value,
+                difficulty=difficulty.value,
+                clarity=clarity.value,
+                labels=labels.value,
+                num_labels=num_labels.value,
+                num_rows=num_rows.value,
+            )
+        )
+    # define app triggers
+    btn_suggested_labels.click(
+        fn=update_suggested_labels,
+        inputs=[system_prompt],
+        outputs=labels,
+    )
+    gr.on(
+        triggers=[
+            btn_generate_full_dataset.click,
+            btn_generate_full_dataset_argilla.click,
+        ],
+        fn=hide_success_message,
+        outputs=[success_message],
+    ).then(
+        fn=validate_input_labels,
+        inputs=[labels],
+        outputs=[labels],
+    ).success(
+        fn=generate_dataset,
+        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
+        outputs=[final_dataset],
+        show_progress=True,
+    )
+    btn_generate_and_push_to_argilla.click(
+        fn=validate_argilla_user_workspace_dataset,
+        inputs=[dataset_name, final_dataset, add_to_existing_dataset],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=hide_success_message,
+        outputs=[success_message],
+    ).success(
+        fn=generate_dataset,
+        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=push_dataset_to_argilla,
+        inputs=[final_dataset, dataset_name, num_labels, labels],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=show_success_message_argilla,
+        inputs=[],
+        outputs=[success_message],
+    )
+    btn_generate_and_push_to_hub.click(
+        fn=hide_success_message,
+        outputs=[success_message],
+    ).then(
+        fn=generate_dataset,
+        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).then(
+        fn=push_dataset_to_hub,
+        inputs=[final_dataset, private, org_name, repo_name, labels, num_labels],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).then(
+        fn=push_pipeline_code_to_hub,
+        inputs=[pipeline_code, org_name, repo_name],
+        outputs=[],
+        show_progress=True,
+    ).success(
+        fn=show_success_message_hub,
+        inputs=[org_name, repo_name],
+        outputs=[success_message],
+    )
+    btn_push_to_hub.click(
+        fn=hide_success_message,
+        outputs=[success_message],
+    ).then(
+        fn=push_dataset_to_hub,
+        inputs=[final_dataset, private, org_name, repo_name, labels, num_labels],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).then(
+        fn=push_pipeline_code_to_hub,
+        inputs=[pipeline_code, org_name, repo_name],
+        outputs=[],
+        show_progress=True,
+    ).success(
+        fn=show_success_message_hub,
+        inputs=[org_name, repo_name],
+        outputs=[success_message],
+    )
+    btn_push_to_argilla.click(
+        fn=hide_success_message,
+        outputs=[success_message],
+    ).success(
+        fn=validate_argilla_user_workspace_dataset,
+        inputs=[dataset_name, final_dataset, add_to_existing_dataset],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=push_dataset_to_argilla,
+        inputs=[final_dataset, dataset_name, num_labels, labels],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=show_success_message_argilla,
+        inputs=[],
+        outputs=[success_message],
+    )
+    system_prompt.change(
+        fn=generate_pipeline_code,
+        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
+        outputs=[pipeline_code],
+    )
+    difficulty.change(
+        fn=generate_pipeline_code,
+        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
+        outputs=[pipeline_code],
+    )
+    clarity.change(
+        fn=generate_pipeline_code,
+        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
+        outputs=[pipeline_code],
+    )
+    labels.change(
+        fn=generate_pipeline_code,
+        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
+        outputs=[pipeline_code],
+    )
+    num_labels.change(
+        fn=generate_pipeline_code,
+        inputs=[system_prompt, difficulty, clarity, labels, num_labels, num_rows],
+        outputs=[pipeline_code],
+    )

src/distilabel_dataset_generator/pipelines/base.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from src.distilabel_dataset_generator.utils import HF_TOKENS
+DEFAULT_BATCH_SIZE = 5
+TOKEN_INDEX = 0
+MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+def _get_next_api_key():
+    global TOKEN_INDEX
+    api_key = HF_TOKENS[TOKEN_INDEX % len(HF_TOKENS)]
+    TOKEN_INDEX += 1
+    return api_key

src/distilabel_dataset_generator/pipelines/embeddings.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from typing import List
+from sentence_transformers import SentenceTransformer
+from sentence_transformers.models import StaticEmbedding
+# Initialize a StaticEmbedding module
+static_embedding = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")
+model = SentenceTransformer(modules=[static_embedding])
+def get_embeddings(texts: List[str]) -> List[List[float]]:
+    return [embedding.tolist() for embedding in model.encode(texts)]
+def get_sentence_embedding_dimensions() -> int:
+    return model.get_sentence_embedding_dimension()

src/distilabel_dataset_generator/pipelines/sft.py CHANGED Viewed

@@ -1,12 +1,11 @@
 import pandas as pd
-from datasets import Dataset
-from distilabel.distiset import Distiset
 from distilabel.llms import InferenceEndpointsLLM
-from distilabel.pipeline import Pipeline
-from distilabel.steps import KeepColumns
 from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
-from src.distilabel_dataset_generator.utils import HF_TOKENS
 INFORMATION_SEEKING_PROMPT = (
     "You are an AI assistant designed to provide accurate and concise information on a wide"
@@ -120,7 +119,6 @@ The prompt you write should follow the same style and structure as the following
 User dataset description:
 """
-MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_DATASET_DESCRIPTIONS = (
     "rude customer assistant for a phone company",
     "assistant that solves math puzzles using python",
@@ -157,8 +155,6 @@ _STOP_SEQUENCES = [
     "assistant",
     " \n\n",
 ]
-DEFAULT_BATCH_SIZE = 5
-TOKEN_INDEX = 0
 def _get_output_mappings(num_turns):
@@ -189,7 +185,7 @@ with Pipeline(name="sft") as pipeline:
             tokenizer_id=MODEL,
             magpie_pre_query_template="llama3",
             generation_kwargs={{
-                "temperature": 0.8,
                 "do_sample": True,
                 "max_new_tokens": 2048,
                 "stop_sequences": {_STOP_SEQUENCES}
@@ -213,13 +209,6 @@ if __name__ == "__main__":
     return code
-def _get_next_api_key():
-    global TOKEN_INDEX
-    api_key = HF_TOKENS[TOKEN_INDEX % len(HF_TOKENS)]
-    TOKEN_INDEX += 1
-    return api_key
 def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
     input_mappings = _get_output_mappings(num_turns)
     output_mappings = input_mappings.copy()
@@ -231,7 +220,7 @@ def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
                 api_key=_get_next_api_key(),
                 magpie_pre_query_template="llama3",
                 generation_kwargs={
-                    "temperature": 0.8,
                     "do_sample": True,
                     "max_new_tokens": 256 if is_sample else 512,
                     "stop_sequences": _STOP_SEQUENCES,
@@ -250,7 +239,7 @@ def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
                 api_key=_get_next_api_key(),
                 magpie_pre_query_template="llama3",
                 generation_kwargs={
-                    "temperature": 0.8,
                     "do_sample": True,
                     "max_new_tokens": 256 if is_sample else 1024,
                     "stop_sequences": _STOP_SEQUENCES,
@@ -300,12 +289,9 @@ def get_response_generator(num_turns, system_prompt, is_sample):
 def get_prompt_generator():
-    global TOKEN_INDEX
-    api_key = HF_TOKENS[TOKEN_INDEX % len(HF_TOKENS)]
-    TOKEN_INDEX += 1
     prompt_generator = TextGeneration(
         llm=InferenceEndpointsLLM(
-            api_key=api_key,
             model_id=MODEL,
             tokenizer_id=MODEL,
             generation_kwargs={
@@ -318,95 +304,3 @@ def get_prompt_generator():
     )
     prompt_generator.load()
     return prompt_generator
-def get_pipeline(num_turns, num_rows, system_prompt, is_sample):
-    input_mappings = _get_output_mappings(num_turns)
-    output_mappings = input_mappings
-    with Pipeline(name="sft") as pipeline:
-        magpie = get_magpie_generator(num_turns, num_rows, system_prompt, is_sample)
-        generate_response = get_response_generator(system_prompt, is_sample)
-        keep_columns = KeepColumns(
-            columns=list(output_mappings.values()) + ["model_name"],
-        )
-        magpie.connect(generate_response)
-        generate_response.connect(keep_columns)
-        return pipeline
-if __name__ == "__main__":
-    prompt_generation_step = get_prompt_generator()
-    system_prompt = next(
-        prompt_generation_step.process(
-            [
-                {
-                    "system_prompt": PROMPT_CREATION_PROMPT,
-                    "instruction": DEFAULT_DATASET_DESCRIPTIONS[0],
-                }
-            ]
-        )
-    )[0]["generation"]
-    num_rows = 2
-    num_turns = 1
-    magpie_generator = get_magpie_generator(num_turns, num_rows, system_prompt, False)
-    response_generator = get_response_generator(num_turns, system_prompt, False)
-    total_steps = num_rows * 2
-    batch_size = 5  # Adjust this value as needed
-    # create instructions
-    magpie_results = []
-    for i in range(0, num_rows, batch_size):
-        batch = list(magpie_generator.process())[:batch_size]
-        magpie_results.extend([item[0] for item in batch])
-    # generate responses
-    response_results = []
-    if num_turns == 1:
-        for i in range(0, len(magpie_results), batch_size):
-            batch = magpie_results[i : i + batch_size]
-            batch = [entry[0] for entry in batch]
-            responses = list(response_generator.process(inputs=batch))
-            response_results.extend(responses)
-        for result in response_results:
-            result[0]["prompt"] = result[0]["instruction"]
-            result[0]["completion"] = result[0]["generation"]
-            result[0]["system_prompt"] = system_prompt
-    else:
-        for result in magpie_results:
-            result[0]["conversation"].insert(
-                0, {"role": "system", "content": system_prompt}
-            )
-            result[0]["messages"] = result[0]["conversation"]
-        for i in range(0, len(magpie_results), batch_size):
-            batch = magpie_results[i : i + batch_size]
-            batch = [entry[0] for entry in batch]
-            responses = list(response_generator.process(inputs=batch))
-            response_results.extend(responses)
-        for result in response_results:
-            result[0]["messages"].append(
-                {"role": "assistant", "content": result[0]["generation"]}
-            )
-    distiset_results = []
-    for result in response_results[0]:
-        record = {}
-        for relevant_keys in [
-            "messages",
-            "prompt",
-            "completion",
-            "model_name",
-            "system_prompt",
-        ]:
-            if relevant_keys in result:
-                record[relevant_keys] = result[relevant_keys]
-        distiset_results.append(record)
-    distiset = Distiset(
-        {
-            "default": Dataset.from_list(distiset_results),
-        }
-    )

 import pandas as pd
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
+from src.distilabel_dataset_generator.pipelines.base import (
+    MODEL,
+    _get_next_api_key,
+)
 INFORMATION_SEEKING_PROMPT = (
     "You are an AI assistant designed to provide accurate and concise information on a wide"
 User dataset description:
 """
 DEFAULT_DATASET_DESCRIPTIONS = (
     "rude customer assistant for a phone company",
     "assistant that solves math puzzles using python",
     "assistant",
     " \n\n",
 ]
 def _get_output_mappings(num_turns):
             tokenizer_id=MODEL,
             magpie_pre_query_template="llama3",
             generation_kwargs={{
+                "temperature": 0.9,
                 "do_sample": True,
                 "max_new_tokens": 2048,
                 "stop_sequences": {_STOP_SEQUENCES}
     return code
 def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
     input_mappings = _get_output_mappings(num_turns)
     output_mappings = input_mappings.copy()
                 api_key=_get_next_api_key(),
                 magpie_pre_query_template="llama3",
                 generation_kwargs={
+                    "temperature": 0.9,
                     "do_sample": True,
                     "max_new_tokens": 256 if is_sample else 512,
                     "stop_sequences": _STOP_SEQUENCES,
                 api_key=_get_next_api_key(),
                 magpie_pre_query_template="llama3",
                 generation_kwargs={
+                    "temperature": 0.9,
                     "do_sample": True,
                     "max_new_tokens": 256 if is_sample else 1024,
                     "stop_sequences": _STOP_SEQUENCES,
 def get_prompt_generator():
     prompt_generator = TextGeneration(
         llm=InferenceEndpointsLLM(
+            api_key=_get_next_api_key(),
             model_id=MODEL,
             tokenizer_id=MODEL,
             generation_kwargs={
     )
     prompt_generator.load()
     return prompt_generator

src/distilabel_dataset_generator/pipelines/textcat.py ADDED Viewed

	@@ -0,0 +1,224 @@

+from typing import List
+import pandas as pd
+from distilabel.llms import InferenceEndpointsLLM
+from distilabel.steps.tasks import (
+    GenerateTextClassificationData,
+    TextClassification,
+    TextGeneration,
+)
+from src.distilabel_dataset_generator.pipelines.base import (
+    MODEL,
+    _get_next_api_key,
+)
+from src.distilabel_dataset_generator.utils import get_preprocess_labels
+# Instruction text asking an LLM to write a text-classification task prompt from a
+# user's dataset description, followed by example prompts to imitate. The text ends
+# with "User dataset description:" so the description can follow it.
+PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
+Your task is to write a prompt following the instruction of the user. Respond with the prompt and nothing else.
+The prompt you write should follow the same style and structure as the following example prompts, clearly specifying the possible classification labels.
+If a label is composed of multiple words, use a hyphen to separate them. For example, 'smartphone-review', 'customer-service', 'product-quality'.:
+Classify the following customer review of a cinema as either 'positive' or 'negative'.
+Classify the following news article into one or more of the following categories: 'politics', 'sports', 'technology', 'entertainment', 'health', 'business', 'environment', 'education', 'science', 'international'.
+Determine the sentiment of the following social media post: 'ambiguous', 'sarcastic', 'informative', 'emotional'.
+Identify the issue category for the following technical support ticket: 'billing', 'technical', 'account', 'shipping', 'returns', 'installation', 'subscription'.
+Classify the following movie review into one of the following categories: 'critical', 'praise', 'disappointed', 'enthusiastic'.
+Determine the level of customer satisfaction from the following customer service transcript: 'satisfied', 'dissatisfied', 'highly-satisfied', 'somewhat-dissatisfied', 'indifferent'.
+Categorize the following product description into one of the following product types: 'smartphone', 'laptop', 'tablet', 'smartwatch', 'e-reader', 'headphones'.
+Classify the following tweet as expressing either 'support' or 'opposition' to the political event discussed.
+Classify the following restaurant review into one of the following categories: 'food-quality', 'service', 'ambiance', or 'price'.
+Classify the following blog post based on its primary fashion trend or style: 'casual', 'formal', 'streetwear', 'vintage' or 'sustainable-fashion'.
+User dataset description:
+"""
+# Example dataset descriptions (customer reviews, news articles).
+# NOTE(review): the three DEFAULT_* lists below look index-aligned — confirm against the app code.
+DEFAULT_DATASET_DESCRIPTIONS = [
+    "A dataset covering customer reviews for an e-commerce website.",
+    "A dataset covering news articles about various topics.",
+]
+# Example dataframes: the first has one "label" string per text,
+# the second a list of "labels" per text.
+DEFAULT_DATASETS = [
+    pd.DataFrame.from_dict(
+        {
+            "text": [
+                "I love the product! It's amazing and I'll buy it again.",
+                "The product was okay, but I wouldn't buy it again.",
+            ],
+            "label": ["positive", "negative"],
+        }
+    ),
+    pd.DataFrame.from_dict(
+        {
+            "text": [
+                "Yesterday, the US stock market had a significant increase.",
+                "New research suggests that the Earth is not a perfect sphere.",
+            ],
+            "labels": [["economy", "politics"], ["science", "environment"]],
+        }
+    ),
+]
+# Example classification task prompts that name their labels in single quotes.
+DEFAULT_SYSTEM_PROMPTS = [
+    "Classify the following customer review as either 'positive' or 'negative'.",
+    "Classify the following news article into one of the following categories: 'politics', 'economy', 'environment', 'science', 'health'.",
+]
+def generate_pipeline_code(
+    system_prompt: str,
+    difficulty: str = None,
+    clarity: str = None,
+    labels: List[str] = None,
+    num_labels: int = 1,
+    num_rows: int = 10,
+) -> str:
+    labels = get_preprocess_labels(labels)
+    base_code = f"""
+# Requirements: `pip install distilabel[hf-inference-endpoints]`
+import os
+from distilabel.llms import InferenceEndpointsLLM
+from distilabel.pipeline import Pipeline
+from distilabel.steps import LoadDataFromDicts, KeepColumns
+from distilabel.steps.tasks import {"GenerateTextClassificationData" if num_labels == 1 else "GenerateTextClassificationData, TextClassification"}
+MODEL = "{MODEL}"
+TEXT_CLASSIFICATION_TASK = "{system_prompt}"
+os.environ["HF_TOKEN"] = (
+    "hf_xxx"  # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
+)
+with Pipeline(name="textcat") as pipeline:
+    task_generator = LoadDataFromDicts(data=[{{"task": TEXT_CLASSIFICATION_TASK}}])
+    textcat_generation = GenerateTextClassificationData(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=os.environ["HF_TOKEN"],
+            generation_kwargs={{
+                "temperature": 0.8,
+                "max_new_tokens": 2048,
+            }},
+        ),
+        difficulty={None if difficulty == "mixed" else repr(difficulty)},
+        clarity={None if clarity == "mixed" else repr(clarity)},
+        num_generations={num_rows},
+        output_mappings={{"input_text": "text"}},
+    )
+    """
+    if num_labels == 1:
+        return (
+            base_code
+            + """
+    keep_columns = KeepColumns(
+        columns=["text", "label"],
+    )
+    # Connect steps in the pipeline
+    task_generator >> textcat_generation >> keep_columns
+    if __name__ == "__main__":
+        distiset = pipeline.run()
+    """
+        )
+    return (
+        base_code
+        + f"""
+    keep_columns = KeepColumns(
+        columns=["text"],
+    )
+    textcat_labeller = TextClassification(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=os.environ["HF_TOKEN"],
+            generation_kwargs={{
+                "temperature": 0.8,
+                "max_new_tokens": 2048,
+            }},
+        ),
+        n={num_labels},
+        available_labels={labels},
+        context=TEXT_CLASSIFICATION_TASK,
+        default_label="unknown"
+    )
+    # Connect steps in the pipeline
+    task_generator >> textcat_generation >> keep_columns >> textcat_labeller
+    if __name__ == "__main__":
+        distiset = pipeline.run()
+    """
+    )
+def get_textcat_generator(difficulty, clarity, is_sample):
+    textcat_generator = GenerateTextClassificationData(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=_get_next_api_key(),
+            generation_kwargs={
+                "temperature": 0.8,
+                "max_new_tokens": 256 if is_sample else 1024,
+            },
+        ),
+        difficulty=None if difficulty == "mixed" else difficulty,
+        clarity=None if clarity == "mixed" else clarity,
+    )
+    textcat_generator.load()
+    return textcat_generator
+def get_labeller_generator(system_prompt, labels, num_labels, is_sample):
+    labeller_generator = TextClassification(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=_get_next_api_key(),
+            generation_kwargs={
+                "temperature": 0.8,
+                "max_new_tokens": 256 if is_sample else 1024,
+            },
+        ),
+        context=system_prompt,
+        available_labels=labels,
+        n=num_labels,
+        default_label="unknown",
+    )
+    labeller_generator.load()
+    return labeller_generator
+def get_prompt_generator():
+    prompt_generator = TextGeneration(
+        llm=InferenceEndpointsLLM(
+            api_key=_get_next_api_key(),
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            generation_kwargs={
+                "temperature": 0.8,
+                "max_new_tokens": 2048,
+                "do_sample": True,
+            },
+        ),
+        use_system_prompt=True,
+    )
+    prompt_generator.load()
+    return prompt_generator

src/distilabel_dataset_generator/utils.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import os
 import gradio as gr
 from gradio.oauth import (
     OAUTH_CLIENT_ID,
@@ -10,6 +12,8 @@ from gradio.oauth import (
 )
 from huggingface_hub import whoami
 HF_TOKENS = [os.getenv("HF_TOKEN")] + [os.getenv(f"HF_TOKEN_{i}") for i in range(1, 10)]
 HF_TOKENS = [token for token in HF_TOKENS if token]
@@ -76,8 +80,48 @@ def get_token(oauth_token: OAuthToken = None):
         return ""
-def swap_visibilty(oauth_token: OAuthToken = None):
     if oauth_token:
         return gr.update(elem_classes=["main_ui_logged_in"])
     else:
         return gr.update(elem_classes=["main_ui_logged_out"])

 import os
+from typing import Union, List, Optional
+import argilla as rg
 import gradio as gr
 from gradio.oauth import (
     OAUTH_CLIENT_ID,
 )
 from huggingface_hub import whoami
+# CSS rule that dims elements with the "main_ui_logged_out" class and disables pointer events on them.
+_LOGGED_OUT_CSS = ".main_ui_logged_out{opacity: 0.3; pointer-events: none}"
+# Collect HF_TOKEN and HF_TOKEN_1 .. HF_TOKEN_9 from the environment, then drop unset or empty ones.
 HF_TOKENS = [os.getenv("HF_TOKEN")] + [os.getenv(f"HF_TOKEN_{i}") for i in range(1, 10)]
 HF_TOKENS = [token for token in HF_TOKENS if token]
         return ""
+def swap_visibilty(oauth_token: Optional[OAuthToken] = None):
+    """Return a Gradio update setting the logged-in or logged-out CSS class, depending on whether a token is present."""
     if oauth_token:
         return gr.update(elem_classes=["main_ui_logged_in"])
     else:
         return gr.update(elem_classes=["main_ui_logged_out"])
+def get_base_app():
+    """Build and return a ``gr.Blocks`` skeleton for the generator UI.
+
+    The layout is an informational notice, a row with the login button between
+    two empty columns, a section heading, and an empty main column.
+    """
+    # NOTE(review): `head` receives plain text here; Gradio documents it as custom
+    # HTML for the page <head> — confirm this is intended.
+    with gr.Blocks(
+        title="🧬 Synthetic Data Generator",
+        head="🧬  Synthetic Data Generator",
+        css=_LOGGED_OUT_CSS,
+    ) as app:
+        with gr.Row():
+            gr.Markdown(
+                "Want to run this locally or with other LLMs? Take a look at the FAQ tab. distilabel Synthetic Data Generator is free, we use the authentication token to push the dataset to the Hugging Face Hub and not for data generation."
+            )
+        with gr.Row():
+            gr.Column()
+            get_login_button()
+            gr.Column()
+        gr.Markdown("## Iterate on a sample dataset")
+        # Empty column: nothing is added to it here and `main_ui` is not used afterwards.
+        with gr.Column() as main_ui:
+            pass
+    return app
+def get_argilla_client() -> Union[rg.Argilla, None]:
+    try:
+        api_url = os.getenv("ARGILLA_API_URL_SDG_REVIEWER")
+        api_key = os.getenv("ARGILLA_API_KEY_SDG_REVIEWER")
+        if api_url is None or api_key is None:
+            api_url = os.getenv("ARGILLA_API_URL")
+            api_key = os.getenv("ARGILLA_API_KEY")
+        return rg.Argilla(
+            api_url=api_url,
+            api_key=api_key,
+        )
+    except Exception:
+        return None
+def get_preprocess_labels(labels: Optional[List[str]]) -> List[str]:
+    return [label.lower().strip() for label in labels] if labels else []