Spaces:

argilla
/

synthetic-data-generator

Running

App Files Files Community

davidberenstein1957 HF staff committed on Oct 16, 2024

Commit

2e2beb7

2 Parent(s): f6a1e43 abbcdf5

Merge branch 'main' into pr/11

Browse files

Files changed (8) hide show

app.py +12 -1
pdm.lock +0 -0
pyproject.toml +4 -2
requirements.txt +4 -2
src/distilabel_dataset_generator/apps/sft.py +318 -32
src/distilabel_dataset_generator/pipelines/embeddings.py +16 -0
src/distilabel_dataset_generator/pipelines/sft.py +3 -3
src/distilabel_dataset_generator/utils.py +17 -1

app.py CHANGED Viewed

@@ -55,6 +55,17 @@ demo = gr.TabbedInterface(
                 margin-bottom: 20px;
             }
         }
     </style>
     <div class="header-container">
         <div class="logo-container">
@@ -63,7 +74,7 @@ demo = gr.TabbedInterface(
             </a>
         </div>
         <div class="title-container">
-            <h1 style="margin: 0; font-size: 2em;">🧬  Synthetic Data Generator</h1>
             <p style="margin: 10px 0 0 0; color: #666; font-size: 1.1em;">Build datasets using natural language</p>
         </div>
     </div>

                 margin-bottom: 20px;
             }
         }
+        button[role="tab"].selected,
+        button[role="tab"][aria-selected="true"],
+        button[role="tab"][data-tab-id][aria-selected="true"] {
+            background-color: #000000;
+            color: white;
+            border: none;
+            font-size: 16px;
+            font-weight: bold;
+            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
+            transition: background-color 0.3s ease, color 0.3s ease;
+        }
     </style>
     <div class="header-container">
         <div class="logo-container">
             </a>
         </div>
         <div class="title-container">
+            <h1 style="margin: 0; font-size: 2em;">🧬 Synthetic Data Generator</h1>
             <p style="margin: 10px 0 0 0; color: #666; font-size: 1.1em;">Build datasets using natural language</p>
         </div>
     </div>

pdm.lock CHANGED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml CHANGED Viewed

@@ -6,11 +6,13 @@ authors = [
     {name = "davidberenstein1957", email = "[email protected]"},
 ]
 dependencies = [
-    "distilabel[hf-inference-endpoints] @ git+https://github.com/argilla-io/distilabel.git@develop",
     "gradio[oauth]<5,>=4.38",
     "transformers>=4.44.2",
 ]
-requires-python = ">=3.10"
 readme = "README.md"
 license = {text = "apache 2"}

     {name = "davidberenstein1957", email = "[email protected]"},
 ]
 dependencies = [
+    "distilabel[hf-inference-endpoints,argilla]==1.4.0",
     "gradio[oauth]<5,>=4.38",
     "transformers>=4.44.2",
+    "sentence-transformers>=3.2.0",
+    "model2vec>=0.2.4",
 ]
+requires-python = "<3.13,>=3.10"
 readme = "README.md"
 license = {text = "apache 2"}

requirements.txt CHANGED Viewed

@@ -1,4 +1,6 @@
 transformers
 gradio[oauth]
-distilabel[hf-inference-endpoints] @ git+https://github.com/argilla-io/distilabel.git@develop
-beautifulsoup4

 transformers
 gradio[oauth]
+distilabel[hf-inference-endpoints,argilla]
+beautifulsoup4
+sentence-transformers
+model2vec

src/distilabel_dataset_generator/apps/sft.py CHANGED Viewed

@@ -1,6 +1,9 @@
 import io
-from typing import Union
 import gradio as gr
 import pandas as pd
 from datasets import Dataset
@@ -8,7 +11,12 @@ from distilabel.distiset import Distiset
 from distilabel.steps.tasks.text_generation import TextGeneration
 from gradio.oauth import OAuthToken
 from huggingface_hub import upload_file
 from src.distilabel_dataset_generator.pipelines.sft import (
     DEFAULT_BATCH_SIZE,
     DEFAULT_DATASET_DESCRIPTIONS,
@@ -21,12 +29,21 @@ from src.distilabel_dataset_generator.pipelines.sft import (
     get_response_generator,
 )
 from src.distilabel_dataset_generator.utils import (
     get_base_app,
     get_org_dropdown,
     swap_visibilty,
 )
 def generate_system_prompt(dataset_description, progress=gr.Progress()):
     progress(0.0, desc="Generating system prompt")
     if dataset_description in DEFAULT_DATASET_DESCRIPTIONS:
@@ -82,7 +99,7 @@ def generate_dataset(
     num_rows: int = 5,
     is_sample: bool = False,
     progress=gr.Progress(),
-):
     progress(0.0, desc="(1/2) Generating instructions")
     magpie_generator = get_magpie_generator(
         num_turns, num_rows, system_prompt, is_sample
@@ -191,7 +208,12 @@ def push_to_hub(
     repo_name: str = None,
     oauth_token: Union[OAuthToken, None] = None,
     progress=gr.Progress(),
-):
     progress(0.1, desc="Setting up dataset")
     repo_id = _check_push_to_hub(org_name, repo_name)
     distiset = Distiset(
@@ -208,7 +230,167 @@ def push_to_hub(
         create_pr=False,
     )
     progress(1.0, desc="Dataset pushed to hub")
-    return dataframe
 def upload_pipeline_code(
@@ -296,7 +478,7 @@ with get_base_app() as app:
         # Add a header for the full dataset generation section
         gr.Markdown("## Generate full dataset")
         gr.Markdown(
-            "Once you're satisfied with the sample, generate a larger dataset and push it to the Hub."
         )
         with gr.Column() as push_to_hub_ui:
@@ -316,27 +498,64 @@ with get_base_app() as app:
                     maximum=500,
                     info="The number of rows in the dataset. Note that you are able to generate more rows at once but that this will take time.",
                 )
-            with gr.Row(variant="panel"):
-                org_name = get_org_dropdown()
-                repo_name = gr.Textbox(
-                    label="Repo name", placeholder="dataset_name", value="my-distiset"
-                )
-                private = gr.Checkbox(
-                    label="Private dataset",
-                    value=True,
-                    interactive=True,
-                    scale=0.5,
-                )
-            with gr.Row() as regenerate_row:
-                btn_generate_full_dataset = gr.Button(
-                    value="Generate", variant="primary", scale=2
-                )
-                btn_generate_and_push_to_hub = gr.Button(
-                    value="Generate and Push to Hub", variant="primary", scale=2
-                )
-                btn_push_to_hub = gr.Button(
-                    value="Push to Hub", variant="primary", scale=2
-                )
             with gr.Row():
                 final_dataset = gr.Dataframe(
                     value=DEFAULT_DATASETS[0],
@@ -348,7 +567,28 @@ with get_base_app() as app:
             with gr.Row():
                 success_message = gr.Markdown(visible=False)
-    def show_success_message(org_name, repo_name):
         return gr.Markdown(
             value=f"""
             <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
@@ -361,7 +601,7 @@ with get_base_app() as app:
                     </a>
                 </p>
             </div>
-        """,
             visible=True,
         )
@@ -390,8 +630,11 @@ with get_base_app() as app:
         inputs=[sample_dataset],
         outputs=[final_dataset],
     )
-    btn_generate_full_dataset.click(
         fn=hide_success_message,
         outputs=[success_message],
     ).then(
@@ -401,6 +644,30 @@ with get_base_app() as app:
         show_progress=True,
     )
     btn_generate_and_push_to_hub.click(
         fn=hide_success_message,
         outputs=[success_message],
@@ -420,7 +687,7 @@ with get_base_app() as app:
         outputs=[],
         show_progress=True,
     ).success(
-        fn=show_success_message,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     )
@@ -439,11 +706,30 @@ with get_base_app() as app:
         outputs=[],
         show_progress=True,
     ).success(
-        fn=show_success_message,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     )
     system_prompt.change(
         fn=generate_pipeline_code,
         inputs=[system_prompt, num_turns, num_rows],

+import ast
 import io
+import uuid
+from typing import Dict, List, Union
+import argilla as rg
 import gradio as gr
 import pandas as pd
 from datasets import Dataset
 from distilabel.steps.tasks.text_generation import TextGeneration
 from gradio.oauth import OAuthToken
 from huggingface_hub import upload_file
+from huggingface_hub.hf_api import HfApi
+from src.distilabel_dataset_generator.pipelines.embeddings import (
+    get_embeddings,
+    get_sentence_embedding_dimensions,
+)
 from src.distilabel_dataset_generator.pipelines.sft import (
     DEFAULT_BATCH_SIZE,
     DEFAULT_DATASET_DESCRIPTIONS,
     get_response_generator,
 )
 from src.distilabel_dataset_generator.utils import (
+    get_argilla_client,
     get_base_app,
     get_org_dropdown,
     swap_visibilty,
 )
+def convert_to_list_of_dicts(messages: str) -> List[Dict[str, str]]:
+    """Parse a stringified list of chat messages back into Python objects.
+
+    A comma is appended after every dict that closes with a ``'user'``,
+    ``'system'`` or ``'assistant'`` value, and the resulting text is then
+    evaluated with ``ast.literal_eval``.
+    """
+    # NOTE(review): presumably the input is a repr in which consecutive dicts
+    # are not comma-separated and the role is the last key of each dict --
+    # confirm against the caller. Text that already contains the separating
+    # commas would end up with ",," and fail to parse.
+    return ast.literal_eval(
+        messages.replace("'user'}", "'user'},")
+        .replace("'system'}", "'system'},")
+        .replace("'assistant'}", "'assistant'},")
+    )
 def generate_system_prompt(dataset_description, progress=gr.Progress()):
     progress(0.0, desc="Generating system prompt")
     if dataset_description in DEFAULT_DATASET_DESCRIPTIONS:
     num_rows: int = 5,
     is_sample: bool = False,
     progress=gr.Progress(),
+) -> pd.DataFrame:
     progress(0.0, desc="(1/2) Generating instructions")
     magpie_generator = get_magpie_generator(
         num_turns, num_rows, system_prompt, is_sample
     repo_name: str = None,
     oauth_token: Union[OAuthToken, None] = None,
     progress=gr.Progress(),
+) -> pd.DataFrame:
+    original_dataframe = dataframe.copy(deep=True)
+    if "messages" in dataframe.columns:
+        dataframe["messages"] = dataframe["messages"].apply(
+            lambda x: convert_to_list_of_dicts(x) if isinstance(x, str) else x
+        )
     progress(0.1, desc="Setting up dataset")
     repo_id = _check_push_to_hub(org_name, repo_name)
     distiset = Distiset(
         create_pr=False,
     )
     progress(1.0, desc="Dataset pushed to hub")
+    return original_dataframe
+def push_to_argilla(
+    dataframe: pd.DataFrame,
+    dataset_name: str,
+    oauth_token: Union[OAuthToken, None] = None,
+    progress=gr.Progress(),
+) -> pd.DataFrame:
+    """Log the generated dataset as records in an Argilla dataset.
+
+    The frame is treated as a chat dataset when it has a ``messages`` column
+    and as a prompt/completion dataset otherwise. In both cases length
+    metadata and an embedding column are added before the records are logged.
+    The Argilla dataset is created in the workspace named after the Hugging
+    Face user when ``client.datasets`` does not return one.
+
+    Args:
+        dataframe: The generated rows to push.
+        dataset_name: Name of the Argilla dataset to create or extend.
+        oauth_token: Hugging Face OAuth token used to resolve the user name.
+        progress: Gradio progress tracker.
+
+    Returns:
+        A deep copy of ``dataframe`` taken before any column was added or
+        converted.
+
+    Raises:
+        gr.Error: If anything fails while talking to Argilla; the original
+            exception is attached as the cause.
+    """
+    # Keep an untouched copy: the helper columns added below are only meant
+    # for Argilla and must not be returned to the caller.
+    original_dataframe = dataframe.copy(deep=True)
+    if "messages" in dataframe.columns:
+        # Cells that arrive as strings are parsed back into lists of dicts.
+        dataframe["messages"] = dataframe["messages"].apply(
+            lambda x: convert_to_list_of_dicts(x) if isinstance(x, str) else x
+        )
+    try:
+        progress(0.1, desc="Setting up user and workspace")
+        client = get_argilla_client()
+        hf_user = HfApi().whoami(token=oauth_token.token)["name"]
+        if "messages" in dataframe.columns:
+            # Chat-style dataset: one ChatField holding the whole conversation.
+            settings = rg.Settings(
+                fields=[
+                    rg.ChatField(
+                        name="messages",
+                        description="The messages in the conversation",
+                        title="Messages",
+                    ),
+                ],
+                questions=[
+                    rg.RatingQuestion(
+                        name="rating",
+                        title="Rating",
+                        description="The rating of the conversation",
+                        values=list(range(1, 6)),
+                    ),
+                ],
+                metadata=[
+                    rg.IntegerMetadataProperty(
+                        name="user_message_length", title="User Message Length"
+                    ),
+                    rg.IntegerMetadataProperty(
+                        name="assistant_message_length",
+                        title="Assistant Message Length",
+                    ),
+                ],
+                vectors=[
+                    rg.VectorField(
+                        name="messages_embeddings",
+                        dimensions=get_sentence_embedding_dimensions(),
+                    )
+                ],
+                guidelines="Please review the conversation and provide a score for the assistant's response.",
+            )
+            # Total characters written by each role across the conversation.
+            dataframe["user_message_length"] = dataframe["messages"].apply(
+                lambda x: sum([len(y["content"]) for y in x if y["role"] == "user"])
+            )
+            dataframe["assistant_message_length"] = dataframe["messages"].apply(
+                lambda x: sum(
+                    [len(y["content"]) for y in x if y["role"] == "assistant"]
+                )
+            )
+            # One embedding per conversation, computed over all message
+            # contents joined with spaces.
+            dataframe["messages_embeddings"] = get_embeddings(
+                dataframe["messages"].apply(
+                    lambda x: " ".join([y["content"] for y in x])
+                )
+            )
+        else:
+            # Prompt/completion dataset: separate text fields per column.
+            settings = rg.Settings(
+                fields=[
+                    rg.TextField(
+                        name="system_prompt",
+                        title="System Prompt",
+                        description="The system prompt used for the conversation",
+                        required=False,
+                    ),
+                    rg.TextField(
+                        name="prompt",
+                        title="Prompt",
+                        description="The prompt used for the conversation",
+                    ),
+                    rg.TextField(
+                        name="completion",
+                        title="Completion",
+                        description="The completion from the assistant",
+                    ),
+                ],
+                questions=[
+                    rg.RatingQuestion(
+                        name="rating",
+                        title="Rating",
+                        description="The rating of the conversation",
+                        values=list(range(1, 6)),
+                    ),
+                ],
+                metadata=[
+                    rg.IntegerMetadataProperty(
+                        name="prompt_length", title="Prompt Length"
+                    ),
+                    rg.IntegerMetadataProperty(
+                        name="completion_length", title="Completion Length"
+                    ),
+                ],
+                vectors=[
+                    rg.VectorField(
+                        name="prompt_embeddings",
+                        dimensions=get_sentence_embedding_dimensions(),
+                    )
+                ],
+                guidelines="Please review the conversation and correct the prompt and completion where needed.",
+            )
+            dataframe["prompt_length"] = dataframe["prompt"].apply(len)
+            dataframe["completion_length"] = dataframe["completion"].apply(len)
+            dataframe["prompt_embeddings"] = get_embeddings(dataframe["prompt"])
+        progress(0.5, desc="Creating dataset")
+        rg_dataset = client.datasets(name=dataset_name, workspace=hf_user)
+        if rg_dataset is None:
+            # No dataset was returned for this name/workspace: create it with
+            # the settings built above.
+            rg_dataset = rg.Dataset(
+                name=dataset_name,
+                workspace=hf_user,
+                settings=settings,
+                client=client,
+            )
+            rg_dataset = rg_dataset.create()
+        progress(0.7, desc="Pushing dataset to Argilla")
+        hf_dataset = Dataset.from_pandas(dataframe)
+        rg_dataset.records.log(records=hf_dataset)
+        progress(1.0, desc="Dataset pushed to Argilla")
+    except Exception as e:
+        # Surface every failure in the UI, but chain the cause so the
+        # original traceback is still available in the server logs.
+        raise gr.Error(f"Error pushing dataset to Argilla: {e}") from e
+    return original_dataframe
+def validate_argilla_dataset_name(
+    dataset_name: str,
+    final_dataset: pd.DataFrame,
+    add_to_existing_dataset: bool,
+    oauth_token: Union[OAuthToken, None] = None,
+    progress=gr.Progress(),
+) -> pd.DataFrame:
+    """Check that the dataset can be pushed to Argilla under ``dataset_name``.
+
+    Makes sure an Argilla user and a workspace named after the Hugging Face
+    user exist (creating them when the client returns ``None``), then rejects
+    the push when the dataset already exists and adding to an existing
+    dataset was not allowed.
+
+    Args:
+        dataset_name: Name of the Argilla dataset to create or extend.
+        final_dataset: The current dataset; passed through untouched.
+        add_to_existing_dataset: Whether logging into an existing dataset is
+            acceptable.
+        oauth_token: Hugging Face OAuth token used to resolve the user name.
+        progress: Gradio progress tracker.
+
+    Returns:
+        ``final_dataset`` unchanged, so the function can sit in a Gradio
+        event chain that writes back to the same component.
+
+    Raises:
+        gr.Error: If the name is empty, the user is not signed in, or the
+            dataset exists and ``add_to_existing_dataset`` is false.
+    """
+    progress(0, desc="Validating dataset configuration")
+    # Local checks first, so obviously invalid input fails before any
+    # network round-trip to the Hub or to Argilla.
+    if dataset_name is None or dataset_name == "":
+        raise gr.Error("Dataset name is required")
+    if oauth_token is None:
+        raise gr.Error("Please sign in with Hugging Face to push to Argilla")
+    hf_user = HfApi().whoami(token=oauth_token.token)["name"]
+    client = get_argilla_client()
+    # Create user if it doesn't exist
+    rg_user = client.users(username=hf_user)
+    if rg_user is None:
+        rg_user = client.users.add(
+            rg.User(username=hf_user, role="admin", password=str(uuid.uuid4()))
+        )
+    # Create workspace if it doesn't exist
+    workspace = client.workspaces(name=hf_user)
+    if workspace is None:
+        workspace = client.workspaces.add(rg.Workspace(name=hf_user))
+        workspace.add_user(rg_user)
+    # Check if dataset exists
+    dataset = client.datasets(name=dataset_name, workspace=hf_user)
+    if dataset and not add_to_existing_dataset:
+        raise gr.Error(f"Dataset {dataset_name} already exists")
+    return final_dataset
 def upload_pipeline_code(
         # Add a header for the full dataset generation section
         gr.Markdown("## Generate full dataset")
         gr.Markdown(
+            "Once you're satisfied with the sample, generate a larger dataset and push it to Argilla or the Hugging Face Hub."
         )
         with gr.Column() as push_to_hub_ui:
                     maximum=500,
                     info="The number of rows in the dataset. Note that you are able to generate more rows at once but that this will take time.",
                 )
+            with gr.Tab(label="Argilla"):
+                # Always create the Argilla widgets: the event wiring further
+                # down references them by name, so skipping their creation
+                # when no client is configured would raise a NameError while
+                # the app is being built. Without a client the rows are
+                # hidden and only the hint is shown.
+                argilla_available = get_argilla_client() is not None
+                if not argilla_available:
+                    gr.Markdown(
+                        "Please add `ARGILLA_API_URL` and `ARGILLA_API_KEY` to use Argilla or export the dataset to the Hugging Face Hub."
+                    )
+                with gr.Row(variant="panel", visible=argilla_available):
+                    dataset_name = gr.Textbox(
+                        label="Dataset name",
+                        placeholder="dataset_name",
+                        value="my-distiset",
+                    )
+                    add_to_existing_dataset = gr.Checkbox(
+                        label="Allow adding records to existing dataset",
+                        info="When selected, you do need to ensure the number of turns in the conversation is the same as the number of turns in the existing dataset.",
+                        value=False,
+                        interactive=True,
+                        scale=0.5,
+                    )
+                with gr.Row(variant="panel", visible=argilla_available):
+                    btn_generate_full_dataset_copy = gr.Button(
+                        value="Generate", variant="primary", scale=2
+                    )
+                    btn_generate_and_push_to_argilla = gr.Button(
+                        value="Generate and Push to Argilla",
+                        variant="primary",
+                        scale=2,
+                    )
+                    btn_push_to_argilla = gr.Button(
+                        value="Push to Argilla", variant="primary", scale=2
+                    )
+            with gr.Tab("Hugging Face Hub"):
+                with gr.Row(variant="panel"):
+                    org_name = get_org_dropdown()
+                    repo_name = gr.Textbox(
+                        label="Repo name",
+                        placeholder="dataset_name",
+                        value="my-distiset",
+                    )
+                    private = gr.Checkbox(
+                        label="Private dataset",
+                        value=True,
+                        interactive=True,
+                        scale=0.5,
+                    )
+                with gr.Row(variant="panel"):
+                    btn_generate_full_dataset = gr.Button(
+                        value="Generate", variant="primary", scale=2
+                    )
+                    btn_generate_and_push_to_hub = gr.Button(
+                        value="Generate and Push to Hub", variant="primary", scale=2
+                    )
+                    btn_push_to_hub = gr.Button(
+                        value="Push to Hub", variant="primary", scale=2
+                    )
             with gr.Row():
                 final_dataset = gr.Dataframe(
                     value=DEFAULT_DATASETS[0],
             with gr.Row():
                 success_message = gr.Markdown(visible=False)
+    def show_success_message_argilla():
+        """Return a visible Markdown banner announcing a successful Argilla push.
+
+        The banner links to the ``api_url`` of the configured Argilla client
+        and to two Argilla how-to guides.
+        """
+        # NOTE(review): get_argilla_client() can return None, in which case
+        # the attribute access below would fail -- presumably this handler
+        # only runs after a successful push; confirm.
+        client = get_argilla_client()
+        argilla_api_url = client.api_url
+        return gr.Markdown(
+            value=f"""
+            <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
+                <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
+                <p style="margin-top: 0.5em;">
+                    Your dataset is now available at:
+                    <a href="{argilla_api_url}" target="_blank" style="color: #1565c0; text-decoration: none;">
+                        {argilla_api_url}
+                    </a>
+                    <br>Unfamiliar with Argilla? Here are some docs to help you get started:
+                    <br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
+                    <br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>
+                </p>
+            </div>
+            """,
+            visible=True,
+        )
+    def show_success_message_hub(org_name, repo_name):
         return gr.Markdown(
             value=f"""
             <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
                     </a>
                 </p>
             </div>
+            """,
             visible=True,
         )
         inputs=[sample_dataset],
         outputs=[final_dataset],
     )
+    gr.on(
+        triggers=[
+            btn_generate_full_dataset.click,
+            btn_generate_full_dataset_copy.click,
+        ],
         fn=hide_success_message,
         outputs=[success_message],
     ).then(
         show_progress=True,
     )
+    btn_generate_and_push_to_argilla.click(
+        fn=validate_argilla_dataset_name,
+        inputs=[dataset_name, final_dataset, add_to_existing_dataset],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=hide_success_message,
+        outputs=[success_message],
+    ).success(
+        fn=generate_dataset,
+        inputs=[system_prompt, num_turns, num_rows],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=push_to_argilla,
+        inputs=[final_dataset, dataset_name],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=show_success_message_argilla,
+        inputs=[],
+        outputs=[success_message],
+    )
     btn_generate_and_push_to_hub.click(
         fn=hide_success_message,
         outputs=[success_message],
         outputs=[],
         show_progress=True,
     ).success(
+        fn=show_success_message_hub,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     )
         outputs=[],
         show_progress=True,
     ).success(
+        fn=show_success_message_hub,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     )
+    btn_push_to_argilla.click(
+        fn=hide_success_message,
+        outputs=[success_message],
+    ).success(
+        fn=validate_argilla_dataset_name,
+        inputs=[dataset_name, final_dataset, add_to_existing_dataset],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=push_to_argilla,
+        inputs=[final_dataset, dataset_name],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
+        fn=show_success_message_argilla,
+        inputs=[],
+        outputs=[success_message],
+    )
     system_prompt.change(
         fn=generate_pipeline_code,
         inputs=[system_prompt, num_turns, num_rows],

src/distilabel_dataset_generator/pipelines/embeddings.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from typing import List
+from sentence_transformers import SentenceTransformer
+from sentence_transformers.models import StaticEmbedding
+# Build the shared embedding model once, when this module is imported: a
+# StaticEmbedding module loaded from the "minishlab/M2V_base_output"
+# model2vec checkpoint, wrapped in a SentenceTransformer.
+static_embedding = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")
+model = SentenceTransformer(modules=[static_embedding])
+def get_embeddings(texts: List[str]) -> List[List[float]]:
+    """Encode ``texts`` with the module-level model.
+
+    Returns one plain Python list per embedding, obtained by calling
+    ``tolist()`` on each item of ``model.encode(texts)``.
+    """
+    return [embedding.tolist() for embedding in model.encode(texts)]
+def get_sentence_embedding_dimensions() -> int:
+    """Return the sentence-embedding dimension reported by the module-level model."""
+    return model.get_sentence_embedding_dimension()

src/distilabel_dataset_generator/pipelines/sft.py CHANGED Viewed

@@ -189,7 +189,7 @@ with Pipeline(name="sft") as pipeline:
             tokenizer_id=MODEL,
             magpie_pre_query_template="llama3",
             generation_kwargs={{
-                "temperature": 0.8,
                 "do_sample": True,
                 "max_new_tokens": 2048,
                 "stop_sequences": {_STOP_SEQUENCES}
@@ -231,7 +231,7 @@ def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
                 api_key=_get_next_api_key(),
                 magpie_pre_query_template="llama3",
                 generation_kwargs={
-                    "temperature": 0.8,
                     "do_sample": True,
                     "max_new_tokens": 256 if is_sample else 512,
                     "stop_sequences": _STOP_SEQUENCES,
@@ -250,7 +250,7 @@ def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
                 api_key=_get_next_api_key(),
                 magpie_pre_query_template="llama3",
                 generation_kwargs={
-                    "temperature": 0.8,
                     "do_sample": True,
                     "max_new_tokens": 256 if is_sample else 1024,
                     "stop_sequences": _STOP_SEQUENCES,

             tokenizer_id=MODEL,
             magpie_pre_query_template="llama3",
             generation_kwargs={{
+                "temperature": 0.9,
                 "do_sample": True,
                 "max_new_tokens": 2048,
                 "stop_sequences": {_STOP_SEQUENCES}
                 api_key=_get_next_api_key(),
                 magpie_pre_query_template="llama3",
                 generation_kwargs={
+                    "temperature": 0.9,
                     "do_sample": True,
                     "max_new_tokens": 256 if is_sample else 512,
                     "stop_sequences": _STOP_SEQUENCES,
                 api_key=_get_next_api_key(),
                 magpie_pre_query_template="llama3",
                 generation_kwargs={
+                    "temperature": 0.9,
                     "do_sample": True,
                     "max_new_tokens": 256 if is_sample else 1024,
                     "stop_sequences": _STOP_SEQUENCES,

src/distilabel_dataset_generator/utils.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import os
 import gradio as gr
 from gradio.oauth import (
     OAUTH_CLIENT_ID,
@@ -10,6 +12,8 @@ from gradio.oauth import (
 )
 from huggingface_hub import whoami
 HF_TOKENS = [os.getenv("HF_TOKEN")] + [os.getenv(f"HF_TOKEN_{i}") for i in range(1, 10)]
 HF_TOKENS = [token for token in HF_TOKENS if token]
@@ -105,4 +109,16 @@ def get_base_app():
     return app
-_LOGGED_OUT_CSS = ".main_ui_logged_out{opacity: 0.3; pointer-events: none}"

 import os
+from typing import Union
+import argilla as rg
 import gradio as gr
 from gradio.oauth import (
     OAUTH_CLIENT_ID,
 )
 from huggingface_hub import whoami
+_LOGGED_OUT_CSS = ".main_ui_logged_out{opacity: 0.3; pointer-events: none}"
 HF_TOKENS = [os.getenv("HF_TOKEN")] + [os.getenv(f"HF_TOKEN_{i}") for i in range(1, 10)]
 HF_TOKENS = [token for token in HF_TOKENS if token]
     return app
+def get_argilla_client() -> Union[rg.Argilla, None]:
+    """Create an Argilla client from environment variables, if configured.
+
+    ``ARGILLA_API_URL_SDG_REVIEWER`` / ``ARGILLA_API_KEY_SDG_REVIEWER`` are
+    read first; when either is unset, ``ARGILLA_API_URL`` /
+    ``ARGILLA_API_KEY`` are used instead.
+
+    Returns:
+        An ``rg.Argilla`` client, or ``None`` when the settings are missing
+        or the client cannot be created.
+    """
+    api_url = os.getenv("ARGILLA_API_URL_SDG_REVIEWER")
+    api_key = os.getenv("ARGILLA_API_KEY_SDG_REVIEWER")
+    if api_url is None or api_key is None:
+        api_url = os.getenv("ARGILLA_API_URL")
+        api_key = os.getenv("ARGILLA_API_KEY")
+    # Report "not configured" explicitly instead of relying on the client
+    # constructor to raise on missing values.
+    if not api_url or not api_key:
+        return None
+    try:
+        return rg.Argilla(
+            api_url=api_url,
+            api_key=api_key,
+        )
+    except Exception:
+        # Deliberate best-effort: callers treat None as "Argilla unavailable".
+        return None