Spaces:

argilla
/

synthetic-data-generator

Running

@@ -1,11 +1,12 @@
 import io
 import uuid
 from typing import Any, Callable, List, Optional, Tuple, Union
 import argilla as rg
 import gradio as gr
 import pandas as pd
-from datasets import Dataset
 from distilabel.distiset import Distiset
 from gradio import OAuthToken
 from huggingface_hub import HfApi, upload_file
@@ -16,6 +17,8 @@ from src.distilabel_dataset_generator.utils import (
     list_orgs,
 )
 def swap_visibilty(oauth_token: Optional[OAuthToken] = None):
     if oauth_token:
@@ -30,15 +33,21 @@ def get_main_ui(
     default_datasets: List[pd.DataFrame],
     fn_generate_system_prompt: Callable,
     fn_generate_dataset: Callable,
 ):
     def fn_generate_sample_dataset(system_prompt, progress=gr.Progress()):
         if system_prompt in default_system_prompts:
             index = default_system_prompts.index(system_prompt)
             if index < len(default_datasets):
                 return default_datasets[index]
-        result = fn_generate_dataset(
-            system_prompt, num_turns=1, num_rows=1, progress=progress, is_sample=True
-        )
         return result
     with gr.Blocks(
@@ -109,7 +118,7 @@ def get_main_ui(
             outputs=[sample_dataset],
             show_progress=True,
         )
         btn_generate_sample_dataset.click(
             fn=fn_generate_sample_dataset,
             inputs=[system_prompt],
@@ -306,14 +315,14 @@ def get_argilla_tab() -> Tuple[Any]:
                 dataset_name = gr.Textbox(
                     label="Dataset name",
                     placeholder="dataset_name",
-                    value="my-distiset",
                 )
                 add_to_existing_dataset = gr.Checkbox(
                     label="Allow adding records to existing dataset",
-                    info="When selected, you do need to ensure the number of turns in the conversation is the same as the number of turns in the existing dataset.",
                     value=False,
                     interactive=True,
-                    scale=0.5,
                 )
             with gr.Row(variant="panel"):
@@ -354,7 +363,7 @@ def get_hf_tab() -> Tuple[Any]:
                 label="Private dataset",
                 value=True,
                 interactive=True,
-                scale=0.5,
             )
         with gr.Row(variant="panel"):
             btn_generate_full_dataset = gr.Button(
@@ -403,14 +412,33 @@ def push_dataset_to_hub(
     repo_name: str = None,
     oauth_token: Union[OAuthToken, None] = None,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
     progress(0.1, desc="Setting up dataset")
     repo_id = _check_push_to_hub(org_name, repo_name)
-    distiset = Distiset(
-        {
-            "default": Dataset.from_pandas(dataframe),
-        }
-    )
     progress(0.2, desc="Pushing dataset to hub")
     distiset.push_to_hub(
         repo_id=repo_id,
@@ -444,6 +472,7 @@ def get_final_dataset_row(default_datasets) -> gr.Dataframe:
             label="Generated dataset",
             interactive=False,
             wrap=True,
         )
     return final_dataset

 import io
+import re
 import uuid
 from typing import Any, Callable, List, Optional, Tuple, Union
 import argilla as rg
 import gradio as gr
 import pandas as pd
+from datasets import Dataset, Features, ClassLabel, Value
 from distilabel.distiset import Distiset
 from gradio import OAuthToken
 from huggingface_hub import HfApi, upload_file
     list_orgs,
 )
+TEXTCAT_TASK = "text_classification"
+SFT_TASK = "supervised_finetuning"
 def swap_visibilty(oauth_token: Optional[OAuthToken] = None):
     if oauth_token:
     default_datasets: List[pd.DataFrame],
     fn_generate_system_prompt: Callable,
     fn_generate_dataset: Callable,
+    task: str,
 ):
     def fn_generate_sample_dataset(system_prompt, progress=gr.Progress()):
         if system_prompt in default_system_prompts:
             index = default_system_prompts.index(system_prompt)
             if index < len(default_datasets):
                 return default_datasets[index]
+        if task == TEXTCAT_TASK:
+            result = fn_generate_dataset(
+                system_prompt, difficulty="mixed", clarity="mixed", labels=[], num_labels=1, num_rows=1, progress=progress, is_sample=True
+            )
+        else:
+            result = fn_generate_dataset(
+                system_prompt, num_turns=1, num_rows=1, progress=progress, is_sample=True
+            )
         return result
     with gr.Blocks(
             outputs=[sample_dataset],
             show_progress=True,
         )
         btn_generate_sample_dataset.click(
             fn=fn_generate_sample_dataset,
             inputs=[system_prompt],
                 dataset_name = gr.Textbox(
                     label="Dataset name",
                     placeholder="dataset_name",
+                    value=f"my-distiset-{uuid.uuid4()}", ######## CHANGE AFTER TESTING
                 )
                 add_to_existing_dataset = gr.Checkbox(
                     label="Allow adding records to existing dataset",
+                    info="When selected, you do need to ensure the dataset options are the same as in the existing dataset.",
                     value=False,
                     interactive=True,
+                    scale=1,
                 )
             with gr.Row(variant="panel"):
                 label="Private dataset",
                 value=True,
                 interactive=True,
+                scale=1,
             )
         with gr.Row(variant="panel"):
             btn_generate_full_dataset = gr.Button(
     repo_name: str = None,
     oauth_token: Union[OAuthToken, None] = None,
     progress=gr.Progress(),
+    labels: List[str] = None,
+    num_labels: int = None,
+    task: str = TEXTCAT_TASK,
 ) -> pd.DataFrame:
     progress(0.1, desc="Setting up dataset")
     repo_id = _check_push_to_hub(org_name, repo_name)
+    if task == TEXTCAT_TASK and num_labels == 1:
+        distiset = Distiset(
+            {
+                "default": Dataset.from_pandas(
+                    dataframe,
+                    features=Features(
+                        {
+                            "text": Value("string"),
+                            "label": ClassLabel(names=labels),
+                        }
+                    ),
+                ),
+            }
+        )
+    else:
+        distiset = Distiset(
+            {
+                "default": Dataset.from_pandas(dataframe),
+            }
+        )
     progress(0.2, desc="Pushing dataset to hub")
     distiset.push_to_hub(
         repo_id=repo_id,
             label="Generated dataset",
             interactive=False,
             wrap=True,
+            min_width=300,
         )
     return final_dataset

src/distilabel_dataset_generator/apps/sft.py CHANGED Viewed

@@ -39,6 +39,8 @@ from src.distilabel_dataset_generator.pipelines.sft import (
     get_response_generator,
 )
 def convert_dataframe_messages(dataframe: pd.DataFrame) -> pd.DataFrame:
     def convert_to_list_of_dicts(messages: str) -> List[Dict[str, str]]:
@@ -65,7 +67,9 @@ def push_dataset_to_hub(
 ):
     original_dataframe = dataframe.copy(deep=True)
     dataframe = convert_dataframe_messages(dataframe)
-    push_to_hub_base(dataframe, private, org_name, repo_name, oauth_token, progress)
     return original_dataframe
@@ -357,6 +361,7 @@ def generate_dataset(
     default_datasets=DEFAULT_DATASETS,
     fn_generate_system_prompt=generate_system_prompt,
     fn_generate_dataset=generate_dataset,
 )
 with app:

     get_response_generator,
 )
+TASK = "supervised_fine_tuning"
 def convert_dataframe_messages(dataframe: pd.DataFrame) -> pd.DataFrame:
     def convert_to_list_of_dicts(messages: str) -> List[Dict[str, str]]:
 ):
     original_dataframe = dataframe.copy(deep=True)
     dataframe = convert_dataframe_messages(dataframe)
+    push_to_hub_base(
+        dataframe, private, org_name, repo_name, oauth_token, progress, task=TASK
+    )
     return original_dataframe
     default_datasets=DEFAULT_DATASETS,
     fn_generate_system_prompt=generate_system_prompt,
     fn_generate_dataset=generate_dataset,
+    task=TASK,
 )
 with app:

src/distilabel_dataset_generator/apps/textcat.py CHANGED Viewed

@@ -1,43 +1,250 @@
-from typing import List
 import gradio as gr
 import pandas as pd
 from src.distilabel_dataset_generator.apps.base import (
     get_main_ui,
     get_pipeline_code_ui,
     hide_success_message,
-    push_dataset_to_hub,
     push_pipeline_code_to_hub,
     show_success_message_argilla,
     show_success_message_hub,
     validate_argilla_user_workspace_dataset,
 )
 from src.distilabel_dataset_generator.pipelines.textcat import (
     DEFAULT_DATASET_DESCRIPTIONS,
     DEFAULT_DATASETS,
     DEFAULT_SYSTEM_PROMPTS,
     generate_pipeline_code,
 )
-def push_dataset_to_argilla(dataset: pd.DataFrame, dataset_name: str) -> pd.DataFrame:
-    return dataset
-def generate_system_prompt(dataset_description: str) -> str:
-    return dataset_description
 def generate_dataset(
     system_prompt: str,
     difficulty: str,
     clarity: str,
-    labels: List[str],
-    num_labels: int,
-    num_rows: int,
 ) -> pd.DataFrame:
-    return pd.DataFrame({"prompt": [system_prompt], "completion": [system_prompt]})
 (
@@ -69,8 +276,20 @@ def generate_dataset(
     default_datasets=DEFAULT_DATASETS,
     fn_generate_system_prompt=generate_system_prompt,
     fn_generate_dataset=generate_dataset,
 )
 with app:
     with main_ui:
         with custom_input_ui:
@@ -78,7 +297,7 @@ with app:
                 choices=[
                     ("High School", "high school"),
                     ("College", "college"),
-                    ("PhD", "phd"),
                     ("Mixed", "mixed"),
                 ],
                 value="mixed",
@@ -86,29 +305,38 @@ with app:
             )
             clarity = gr.Dropdown(
                 choices=[
-                    ("Clear", "Clear"),
                     (
                         "Understandable",
                         "understandable with some effort",
                     ),
-                    ("Ambiguous", "Ambiguous"),
                     ("Mixed", "mixed"),
                 ],
                 value="mixed",
                 label="Clarity",
             )
-            labels = gr.Dropdown(
-                choices=[],
-                allow_custom_value=True,
-                interactive=True,
-                label="Labels",
-                multiselect=True,
-            )
             num_labels = gr.Number(
                 label="Number of labels", value=1, minimum=1, maximum=10
             )
             num_rows = gr.Number(
-                label="Number of rows", value=10, minimum=1, maximum=500
             )
         pipeline_code = get_pipeline_code_ui(
@@ -123,6 +351,12 @@ with app:
         )
     # define app triggers
     gr.on(
         triggers=[
             btn_generate_full_dataset.click,
@@ -152,7 +386,7 @@ with app:
         show_progress=True,
     ).success(
         fn=push_dataset_to_argilla,
-        inputs=[final_dataset, dataset_name],
         outputs=[final_dataset],
         show_progress=True,
     ).success(
@@ -171,7 +405,7 @@ with app:
         show_progress=True,
     ).then(
         fn=push_dataset_to_hub,
-        inputs=[final_dataset, private, org_name, repo_name],
         outputs=[final_dataset],
         show_progress=True,
     ).then(
@@ -190,7 +424,7 @@ with app:
         outputs=[success_message],
     ).then(
         fn=push_dataset_to_hub,
-        inputs=[final_dataset, private, org_name, repo_name],
         outputs=[final_dataset],
         show_progress=True,
     ).then(
@@ -214,7 +448,7 @@ with app:
         show_progress=True,
     ).success(
         fn=push_dataset_to_argilla,
-        inputs=[final_dataset, dataset_name],
         outputs=[final_dataset],
         show_progress=True,
     ).success(

+import re
+from typing import Dict, List, Union
+import argilla as rg
 import gradio as gr
 import pandas as pd
+from datasets import Dataset
+from distilabel.distiset import Distiset
+from huggingface_hub import HfApi
 from src.distilabel_dataset_generator.apps.base import (
+    get_argilla_client,
     get_main_ui,
     get_pipeline_code_ui,
     hide_success_message,
     push_pipeline_code_to_hub,
     show_success_message_argilla,
     show_success_message_hub,
     validate_argilla_user_workspace_dataset,
 )
+from src.distilabel_dataset_generator.apps.base import (
+    push_dataset_to_hub as push_to_hub_base,
+)
+from src.distilabel_dataset_generator.pipelines.base import (
+    DEFAULT_BATCH_SIZE,
+)
+from src.distilabel_dataset_generator.pipelines.embeddings import (
+    get_embeddings,
+    get_sentence_embedding_dimensions,
+)
 from src.distilabel_dataset_generator.pipelines.textcat import (
     DEFAULT_DATASET_DESCRIPTIONS,
     DEFAULT_DATASETS,
     DEFAULT_SYSTEM_PROMPTS,
+    PROMPT_CREATION_PROMPT,
     generate_pipeline_code,
+    get_textcat_generator,
+    get_prompt_generator,
+    get_labeller_generator,
 )
+TASK = "text_classification"
+def push_dataset_to_hub(
+    dataframe: pd.DataFrame,
+    private: bool = True,
+    org_name: str = None,
+    repo_name: str = None,
+    oauth_token: Union[gr.OAuthToken, None] = None,
+    progress=gr.Progress(),
+    labels: List[str] = None,
+    num_labels: int = 1,
+):
+    original_dataframe = dataframe.copy(deep=True)
+    push_to_hub_base(
+        dataframe,
+        private,
+        org_name,
+        repo_name,
+        oauth_token,
+        progress,
+        labels,
+        num_labels,
+        task=TASK,
+    )
+    return original_dataframe
+def push_dataset_to_argilla(
+    dataframe: pd.DataFrame,
+    dataset_name: str,
+    oauth_token: Union[gr.OAuthToken, None] = None,
+    progress=gr.Progress(),
+    num_labels: int = 1,
+    labels: List[str] = None,
+) -> pd.DataFrame:
+    original_dataframe = dataframe.copy(deep=True)
+    try:
+        progress(0.1, desc="Setting up user and workspace")
+        client = get_argilla_client()
+        hf_user = HfApi().whoami(token=oauth_token.token)["name"]
+        settings = rg.Settings(
+            fields=[
+                rg.TextField(
+                    name="text",
+                    description="The text classification data",
+                    title="Text",
+                ),
+            ],
+            questions=[
+                (
+                    rg.LabelQuestion(
+                        name="label",
+                        title="Label",
+                        description="The label of the text",
+                        labels=labels,
+                    )
+                    if num_labels == 1
+                    else rg.MultiLabelQuestion(
+                        name="labels",
+                        title="Labels",
+                        description="The labels of the conversation",
+                        labels=labels,
+                    )
+                ),
+            ],
+            metadata=[
+                rg.IntegerMetadataProperty(name="text_length", title="Text Length"),
+            ],
+            vectors=[
+                rg.VectorField(
+                    name="text_embeddings",
+                    dimensions=get_sentence_embedding_dimensions(),
+                )
+            ],
+            guidelines="Please review the text and provide or correct the label where needed.",
+        )
+        dataframe["text_length"] = dataframe["text"].apply(len)
+        dataframe["text_embeddings"] = get_embeddings(dataframe["text"])
+        progress(0.5, desc="Creating dataset")
+        rg_dataset = client.datasets(name=dataset_name, workspace=hf_user)
+        if rg_dataset is None:
+            rg_dataset = rg.Dataset(
+                name=dataset_name,
+                workspace=hf_user,
+                settings=settings,
+                client=client,
+            )
+            rg_dataset = rg_dataset.create()
+        progress(0.7, desc="Pushing dataset to Argilla")
+        hf_dataset = Dataset.from_pandas(dataframe)
+        rg_dataset.records.log(records=hf_dataset)
+        progress(1.0, desc="Dataset pushed to Argilla")
+    except Exception as e:
+        raise gr.Error(f"Error pushing dataset to Argilla: {e}")
+    return original_dataframe
+def generate_system_prompt(dataset_description, progress=gr.Progress()):
+    progress(0.0, desc="Generating text classification task")
+    if dataset_description in DEFAULT_DATASET_DESCRIPTIONS:
+        index = DEFAULT_DATASET_DESCRIPTIONS.index(dataset_description)
+        if index < len(DEFAULT_SYSTEM_PROMPTS):
+            return DEFAULT_SYSTEM_PROMPTS[index]
+    progress(0.3, desc="Initializing text generation")
+    generate_description = get_prompt_generator()
+    progress(0.7, desc="Generating text classification task")
+    result = next(
+        generate_description.process(
+            [
+                {
+                    "system_prompt": PROMPT_CREATION_PROMPT,
+                    "instruction": dataset_description,
+                }
+            ]
+        )
+    )[0]["generation"]
+    progress(1.0, desc="Text classification task generated")
+    return result
 def generate_dataset(
     system_prompt: str,
     difficulty: str,
     clarity: str,
+    labels: List[str] = [],
+    num_labels: int = 2,
+    num_rows: int = 10,
+    is_sample: bool = False,
+    progress=gr.Progress(),
 ) -> pd.DataFrame:
+    progress(0.0, desc="(1/2) Generating text classification data")
+    textcat_generator = get_textcat_generator(difficulty, clarity, is_sample)
+    labeler_generator = get_labeller_generator(num_labels, labels, is_sample)
+    total_steps: int = num_rows * 2
+    batch_size = DEFAULT_BATCH_SIZE
+    # create text classification data
+    n_processed = 0
+    textcat_results = []
+    while n_processed < num_rows:
+        progress(
+            0.5 * n_processed / num_rows,
+            total=total_steps,
+            desc="(1/2) Generating text classification data",
+        )
+        remaining_rows = num_rows - n_processed
+        batch_size = min(batch_size, remaining_rows)
+        inputs = [{"task": system_prompt} for _ in range(batch_size)]
+        batch = list(textcat_generator.process(inputs=inputs))
+        textcat_results.extend(batch[0])
+        n_processed += batch_size
+    for result in textcat_results:
+        result["text"] = result["input_text"]
+    # label text classification data
+    progress(0.5, desc="(1/2) Labeling text classification data")
+    if not is_sample:
+        n_processed = 0
+        labeler_results = []
+        while n_processed < num_rows:
+            progress(
+                0.5 + 0.5 * n_processed / num_rows,
+                total=total_steps,
+                desc="(1/2) Generating text classification data",
+            )
+            batch = textcat_results[n_processed : n_processed + batch_size]
+            labels = list(labeler_generator.process(inputs=batch))
+            labeler_results.extend(labels[0])
+            n_processed += batch_size
+        progress(
+            1,
+            total=total_steps,
+            desc="(2/2) Labeling text classification data",
+        )
+    # create final dataset
+    distiset_results = []
+    if is_sample:
+        for result in textcat_results:
+            record = {}
+            for relevant_keys in [
+                "text",
+                "label",
+            ]:
+                if relevant_keys in result:
+                    record[relevant_keys] = result[relevant_keys]
+            distiset_results.append(record)
+    else:
+        for result in labeler_results:
+            record = {}
+            for relevant_keys in [
+                "text",
+                "labels",
+            ]:
+                if relevant_keys in result:
+                    record[relevant_keys] = result[relevant_keys]
+            distiset_results.append(record)
+    dataframe = pd.DataFrame(distiset_results)
+    if num_labels == 1:
+        dataframe = dataframe.rename(columns={"labels": "label"})
+    progress(1.0, desc="Dataset generation completed")
+    return dataframe
 (
     default_datasets=DEFAULT_DATASETS,
     fn_generate_system_prompt=generate_system_prompt,
     fn_generate_dataset=generate_dataset,
+    task=TASK,
 )
+def update_labels_based_on_checkbox(checked, system_prompt):
+    if checked:
+        pattern = r"'(\b\w+\b)'"
+        new_labels = re.findall(pattern, system_prompt)
+        gr.update(choices=new_labels)
+        return gr.update(value=new_labels)
+    else:
+        return gr.update(choices=[])
 with app:
     with main_ui:
         with custom_input_ui:
                 choices=[
                     ("High School", "high school"),
                     ("College", "college"),
+                    ("PhD", "PhD"),
                     ("Mixed", "mixed"),
                 ],
                 value="mixed",
             )
             clarity = gr.Dropdown(
                 choices=[
+                    ("Clear", "CLEAR"),
                     (
                         "Understandable",
                         "understandable with some effort",
                     ),
+                    ("Ambiguous", "ambiguous"),
                     ("Mixed", "mixed"),
                 ],
                 value="mixed",
                 label="Clarity",
             )
+            with gr.Row(variant="default"):
+                labels = gr.Dropdown(
+                    choices=[],
+                    allow_custom_value=True,
+                    interactive=True,
+                    label="Labels",
+                    multiselect=True,
+                )
+                suggested_labels = gr.Checkbox(
+                    label="Add suggested labels",
+                    value=False,
+                    interactive=True,
+                )
             num_labels = gr.Number(
                 label="Number of labels", value=1, minimum=1, maximum=10
             )
             num_rows = gr.Number(
+                label="Number of rows",
+                value=1,
+                minimum=1,
+                maximum=500,  ###### CHANGE AFTER TESTING
             )
         pipeline_code = get_pipeline_code_ui(
         )
     # define app triggers
+    suggested_labels.change(
+        update_labels_based_on_checkbox,
+        inputs=[suggested_labels, system_prompt],
+        outputs=labels,
+    )
     gr.on(
         triggers=[
             btn_generate_full_dataset.click,
         show_progress=True,
     ).success(
         fn=push_dataset_to_argilla,
+        inputs=[final_dataset, dataset_name, num_labels, labels],
         outputs=[final_dataset],
         show_progress=True,
     ).success(
         show_progress=True,
     ).then(
         fn=push_dataset_to_hub,
+        inputs=[final_dataset, private, org_name, repo_name, labels, num_labels],
         outputs=[final_dataset],
         show_progress=True,
     ).then(
         outputs=[success_message],
     ).then(
         fn=push_dataset_to_hub,
+        inputs=[final_dataset, private, org_name, repo_name, labels],
         outputs=[final_dataset],
         show_progress=True,
     ).then(
         show_progress=True,
     ).success(
         fn=push_dataset_to_argilla,
+        inputs=[final_dataset, dataset_name, num_labels, labels],
         outputs=[final_dataset],
         show_progress=True,
     ).success(

src/distilabel_dataset_generator/pipelines/textcat.py CHANGED Viewed

@@ -1,6 +1,42 @@
 from typing import List
-import pandas as pd
 DEFAULT_DATASET_DESCRIPTIONS = [
     "A dataset covering customer reviews for an e-commerce website.",
@@ -23,14 +59,14 @@ DEFAULT_DATASETS = [
                 "Yesterday, the US stock market had a significant increase.",
                 "New research suggests that the Earth is not a perfect sphere.",
             ],
-            "label": [["economy", "politics"], ["science", "environment"]],
         }
     ),
 ]
 DEFAULT_SYSTEM_PROMPTS = [
-    "Classify the following customer review as positive or negative.",
-    "Classify the following news article into one or more categories.",
 ]
@@ -42,8 +78,118 @@ def generate_pipeline_code(
     num_labels: int,
     num_rows: int,
 ) -> str:
-    return """
-    from distilabel import Distilabel
-    #### PIPELINE CODE HERE
     """

+import pandas as pd
 from typing import List
+from distilabel.llms import InferenceEndpointsLLM
+from distilabel.steps.tasks import GenerateTextClassificationData, TextClassification, TextGeneration
+from src.distilabel_dataset_generator.pipelines.base import (
+    MODEL,
+    _get_next_api_key,
+)
+PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
+Your task is to write a prompt following the instruction of the user. Respond with the prompt and nothing else.
+The prompt you write should follow the same style and structure as the following example prompts, clearly specifying the possible classification labels where applicable:
+Classify the following customer review of a cinema as either 'positive' or 'negative'.
+Classify the following news article into one or more of the following categories: 'politics', 'sports', 'technology', 'entertainment', 'health', 'business', 'environment', 'education', 'science', 'international'.
+Determine the sentiment of the following social media post: 'ambiguous', 'sarcastic', 'informative', 'emotional'.
+Identify the issue category for the following technical support ticket: 'billing', 'technical', 'account', 'shipping', 'returns', 'installation', 'subscription'.
+Classify the following movie review into one of the following categories: 'critical', 'praise', 'disappointed', 'enthusiastic'.
+Determine the level of customer satisfaction from the following customer service transcript: 'satisfied', 'dissatisfied', 'highly satisfied', 'somewhat dissatisfied', 'indifferent'.
+Categorize the following product description into one of the following product types: 'smartphone', 'laptop', 'tablet', 'smartwatch', 'e-reader', 'headphones'.
+Classify the following tweet as expressing either 'support' or 'opposition' to the political event discussed.
+Classify the following restaurant review into one of the following categories: 'food quality', 'service', 'ambiance', or 'price'.
+Classify the following blog post based on its primary fashion trend or style: 'casual', 'formal', 'streetwear', 'vintage' or 'sustainable fashion'.
+User dataset description:
+"""
 DEFAULT_DATASET_DESCRIPTIONS = [
     "A dataset covering customer reviews for an e-commerce website.",
                 "Yesterday, the US stock market had a significant increase.",
                 "New research suggests that the Earth is not a perfect sphere.",
             ],
+            "labels": [["economy", "politics"], ["science", "environment"]],
         }
     ),
 ]
 DEFAULT_SYSTEM_PROMPTS = [
+    "Classify the following customer review as either 'positive' or 'negative'.",
+    "Classify the following news article into one of the following categories: 'politics', 'economy', 'environment', 'science', 'health'.",
 ]
     num_labels: int,
     num_rows: int,
 ) -> str:
+    base = f"""
+# Requirements: `pip install distilabel[hf-inference-endpoints]`
+import os
+from distilabel.llms import InferenceEndpointsLLM
+from distilabel.pipeline import Pipeline
+from distilabel.steps import LoadDataFromDicts
+from distilabel.steps.tasks import GenerateTextClassificationData
+MODEL = "{MODEL}"
+TEXTCAT_TASK = "{system_prompt}"
+os.environ["HF_TOKEN"] = (
+    "hf_xxx"  # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
+)
+with Pipeline(name="textcat") as pipeline:
+    textcat_generation = GenerateTextClassificationData(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=_get_next_api_key(),
+            generation_kwargs={{
+                "temperature": 0.8,
+                "max_new_tokens": 2048,
+            }},
+        ),
+        difficulty={None if difficulty == "mixed" else difficulty},
+        clarity={None if clarity == "mixed" else clarity},
+        num_generations={num_rows},
+    )
+    keep_columns = KeepColumns(
+        columns=["input_text", "model_name"],
+    )
     """
+    if num_labels > 1:
+        return base + """
+    textcat_generation >> keep_columns >> textcat_labeler
+    if __name__ == "__main__":
+        distiset = pipeline.run()
+    """
+    return f"""
+textcat_labeler = TextClassification(
+    llm=InferenceEndpointsLLM(
+        model_id=MODEL,
+        tokenizer_id=MODEL,
+        api_key=_get_next_api_key(),
+        generation_kwargs={{
+            "temperature": 0.8,
+            "max_new_tokens": 2048,
+        }},
+    ),
+    n= {num_labels},
+    available_labels={labels},
+)
+textcat_generation >> keep_columns >> textcat_labeler
+if __name__ == "__main__":
+    distiset = pipeline.run()
+"""
+def get_textcat_generator(difficulty, clarity, is_sample):
+    textcat_generator = GenerateTextClassificationData(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=_get_next_api_key(),
+            generation_kwargs={
+                "temperature": 0.8,
+                "max_new_tokens": 256 if is_sample else 1024,
+            },
+        ),
+        difficulty=None if difficulty == "mixed" else difficulty,
+        clarity=None if clarity == "mixed" else clarity,
+    )
+    textcat_generator.load()
+    return textcat_generator
+def get_labeller_generator(num_labels, labels, is_sample):
+    labeller_generator = TextClassification(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=_get_next_api_key(),
+            generation_kwargs={
+                "temperature": 0.8,
+                "max_new_tokens": 256 if is_sample else 1024,
+            },
+        ),
+        n= num_labels,
+        available_labels=labels,
+    )
+    labeller_generator.load()
+    return labeller_generator
+def get_prompt_generator():
+    prompt_generator = TextGeneration(
+        llm=InferenceEndpointsLLM(
+            api_key=_get_next_api_key(),
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            generation_kwargs={
+                "temperature": 0.8,
+                "max_new_tokens": 2048,
+                "do_sample": True,
+            },
+        ),
+        use_system_prompt=True,
+    )
+    prompt_generator.load()
+    return prompt_generator