Spaces:

argilla
/

synthetic-data-generator

Running

App Files Files Community

sdiazlor HF staff committed 18 days ago

Commit

28b1761

•

1 Parent(s): 5c28c1d

fix: minor bug and feat: use Sequence(ClassLabel) for multilabel

Browse files

Files changed (4) hide show

src/distilabel_dataset_generator/apps/base.py +17 -24
src/distilabel_dataset_generator/apps/textcat.py +25 -6
src/distilabel_dataset_generator/pipelines/textcat.py +3 -3
src/distilabel_dataset_generator/utils.py +4 -1

src/distilabel_dataset_generator/apps/base.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import Any, Callable, List, Tuple, Union
 import argilla as rg
 import gradio as gr
 import pandas as pd
-from datasets import ClassLabel, Dataset, Features, Value
 from distilabel.distiset import Distiset
 from gradio import OAuthToken
 from huggingface_hub import HfApi, upload_file
@@ -421,30 +421,23 @@ def push_dataset_to_hub(
     progress(0.1, desc="Setting up dataset")
     repo_id = _check_push_to_hub(org_name, repo_name)
-    if task == TEXTCAT_TASK and num_labels == 1:
-        labels = [label.lower().strip() for label in labels]
-        dataframe["label"] = dataframe["label"].apply(
-            lambda x: x if x in labels else None
-        )
-        distiset = Distiset(
-            {
-                "default": Dataset.from_pandas(
-                    dataframe,
-                    features=Features(
-                        {
-                            "text": Value("string"),
-                            "label": ClassLabel(names=labels),
-                        }
-                    ),
-                ),
-            }
-        )
     else:
-        distiset = Distiset(
-            {
-                "default": Dataset.from_pandas(dataframe),
-            }
-        )
     progress(0.2, desc="Pushing dataset to hub")
     distiset.push_to_hub(
         repo_id=repo_id,

 import argilla as rg
 import gradio as gr
 import pandas as pd
+from datasets import ClassLabel, Dataset, Features, Sequence, Value
 from distilabel.distiset import Distiset
 from gradio import OAuthToken
 from huggingface_hub import HfApi, upload_file
     progress(0.1, desc="Setting up dataset")
     repo_id = _check_push_to_hub(org_name, repo_name)
+    if task == TEXTCAT_TASK:
+        if num_labels == 1:
+            features = Features(
+                {"text": Value("string"), "label": ClassLabel(names=labels)}
+            )
+        else:
+            features = Features({
+                "text": Value("string"),
+                "labels": Sequence(feature=ClassLabel(names=labels))
+            })
+        distiset = Distiset({
+            "default": Dataset.from_pandas(dataframe, features=features)
+        })
     else:
+        distiset = Distiset({
+            "default": Dataset.from_pandas(dataframe)
+        })
     progress(0.2, desc="Pushing dataset to hub")
     distiset.push_to_hub(
         repo_id=repo_id,

src/distilabel_dataset_generator/apps/textcat.py CHANGED Viewed

@@ -37,6 +37,7 @@ from src.distilabel_dataset_generator.pipelines.textcat import (
     get_prompt_generator,
     get_textcat_generator,
 )
 TASK = "text_classification"
@@ -52,6 +53,7 @@ def push_dataset_to_hub(
     num_labels: int = 1,
 ):
     original_dataframe = dataframe.copy(deep=True)
     try:
         push_to_hub_base(
             dataframe,
@@ -82,7 +84,7 @@ def push_dataset_to_argilla(
         progress(0.1, desc="Setting up user and workspace")
         client = get_argilla_client()
         hf_user = HfApi().whoami(token=oauth_token.token)["name"]
-        labels = [label.lower().strip() for label in labels]
         settings = rg.Settings(
             fields=[
                 rg.TextField(
@@ -205,6 +207,7 @@ def generate_dataset(
     progress=gr.Progress(),
 ) -> pd.DataFrame:
     progress(0.0, desc="(1/2) Generating text classification data")
     textcat_generator = get_textcat_generator(
         difficulty=difficulty, clarity=clarity, is_sample=is_sample
     )
@@ -247,8 +250,8 @@ def generate_dataset(
                 desc="(1/2) Labeling text classification data",
             )
             batch = textcat_results[n_processed : n_processed + batch_size]
-            labels = list(labeller_generator.process(inputs=batch))
-            labeller_results.extend(labels[0])
             n_processed += batch_size
         progress(
             1,
@@ -268,8 +271,24 @@ def generate_dataset(
         distiset_results.append(record)
     dataframe = pd.DataFrame(distiset_results)
-    if num_labels == 1:
-        dataframe = dataframe.rename(columns={"labels": "label"})
     progress(1.0, desc="Dataset generation completed")
     return dataframe
@@ -339,7 +358,7 @@ with app:
             )
             clarity = gr.Dropdown(
                 choices=[
-                    ("Clear", "CLEAR"),
                     (
                         "Understandable",
                         "understandable with some effort",

     get_prompt_generator,
     get_textcat_generator,
 )
+from src.distilabel_dataset_generator.utils import get_preprocess_labels
 TASK = "text_classification"
     num_labels: int = 1,
 ):
     original_dataframe = dataframe.copy(deep=True)
+    labels = get_preprocess_labels(labels)
     try:
         push_to_hub_base(
             dataframe,
         progress(0.1, desc="Setting up user and workspace")
         client = get_argilla_client()
         hf_user = HfApi().whoami(token=oauth_token.token)["name"]
+        labels = get_preprocess_labels(labels)
         settings = rg.Settings(
             fields=[
                 rg.TextField(
     progress=gr.Progress(),
 ) -> pd.DataFrame:
     progress(0.0, desc="(1/2) Generating text classification data")
+    labels = get_preprocess_labels(labels)
     textcat_generator = get_textcat_generator(
         difficulty=difficulty, clarity=clarity, is_sample=is_sample
     )
                 desc="(1/2) Labeling text classification data",
             )
             batch = textcat_results[n_processed : n_processed + batch_size]
+            labels_batch = list(labeller_generator.process(inputs=batch))
+            labeller_results.extend(labels_batch[0])
             n_processed += batch_size
         progress(
             1,
         distiset_results.append(record)
     dataframe = pd.DataFrame(distiset_results)
+    if not is_sample:
+        if num_labels == 1:
+            dataframe = dataframe.rename(columns={"labels": "label"})
+            dataframe["label"] = dataframe["label"].apply(
+                lambda x: x.lower().strip() if x.lower().strip() in labels else None
+            )
+        else:
+            dataframe["labels"] = dataframe["labels"].apply(
+                lambda x: (
+                    [
+                        label.lower().strip()
+                        for label in x
+                        if label.lower().strip() in labels
+                    ]
+                    if isinstance(x, list)
+                    else None
+                )
+            )
     progress(1.0, desc="Dataset generation completed")
     return dataframe
             )
             clarity = gr.Dropdown(
                 choices=[
+                    ("Clear", "clear"),
                     (
                         "Understandable",
                         "understandable with some effort",

src/distilabel_dataset_generator/pipelines/textcat.py CHANGED Viewed

@@ -7,11 +7,11 @@ from distilabel.steps.tasks import (
     TextClassification,
     TextGeneration,
 )
 from src.distilabel_dataset_generator.pipelines.base import (
     MODEL,
     _get_next_api_key,
 )
 PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
@@ -84,7 +84,7 @@ def generate_pipeline_code(
     num_labels: int = 1,
     num_rows: int = 10,
 ) -> str:
-    labels = [label.lower().strip() for label in labels or []]
     base_code = f"""
 # Requirements: `pip install distilabel[hf-inference-endpoints]`
 import os
@@ -159,6 +159,7 @@ with Pipeline(name="textcat") as pipeline:
         default_label="unknown"
     )
     task_generator >> textcat_generation >> keep_columns >> textcat_labeller
     if __name__ == "__main__":
@@ -186,7 +187,6 @@ def get_textcat_generator(difficulty, clarity, is_sample):
 def get_labeller_generator(system_prompt, labels, num_labels, is_sample):
-    labels = [label.lower().strip() for label in labels]
     labeller_generator = TextClassification(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,

     TextClassification,
     TextGeneration,
 )
 from src.distilabel_dataset_generator.pipelines.base import (
     MODEL,
     _get_next_api_key,
 )
+from src.distilabel_dataset_generator.utils import get_preprocess_labels
 PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
     num_labels: int = 1,
     num_rows: int = 10,
 ) -> str:
+    labels = get_preprocess_labels(labels)
     base_code = f"""
 # Requirements: `pip install distilabel[hf-inference-endpoints]`
 import os
         default_label="unknown"
     )
+    # Connect steps in the pipeline
     task_generator >> textcat_generation >> keep_columns >> textcat_labeller
     if __name__ == "__main__":
 def get_labeller_generator(system_prompt, labels, num_labels, is_sample):
     labeller_generator = TextClassification(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,

src/distilabel_dataset_generator/utils.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-from typing import Union, Optional
 import argilla as rg
 import gradio as gr
@@ -122,3 +122,6 @@ def get_argilla_client() -> Union[rg.Argilla, None]:
         )
     except Exception:
         return None

 import os
+from typing import Union, List, Optional
 import argilla as rg
 import gradio as gr
         )
     except Exception:
         return None
+def get_preprocess_labels(labels: Optional[List[str]]) -> List[str]:
+    """Return the given labels lower-cased and stripped of surrounding whitespace.
+
+    Args:
+        labels: Label strings to normalize; may be ``None`` or empty.
+
+    Returns:
+        A new list with ``.lower().strip()`` applied to every label, in the
+        original order. A falsy ``labels`` value (``None`` or an empty list)
+        yields an empty list.
+    """
+    return [label.lower().strip() for label in labels] if labels else []