Spaces:

datasets-topics
/

topics-generator

Sleeping

App Files Community

asoria HF staff committed on Oct 24, 2024

Commit

dc70c7b

1 Parent(s): a893b55

some refactor

Browse files

Files changed (4) hide show

.vscode/launch.json +14 -0
app.py +21 -63
src/templates.py +3 -11
src/viewer_api.py +51 -0

.vscode/launch.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "purpose": ["debug-test"],
+            "justMyCode": false
+        }
+    ]
+}

app.py CHANGED Viewed

@@ -5,9 +5,7 @@ import logging
 import os
 import datamapplot
-import duckdb
 import numpy as np
-import requests
 from dotenv import load_dotenv
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
@@ -28,26 +26,28 @@ from transformers import (
 from src.hub import create_space_with_content
 from src.templates import REPRESENTATION_PROMPT, SPACE_REPO_CARD_CONTENT
 load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
 assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
-EXPORTS_REPOSITORY = os.getenv("EXPORTS_REPOSITORY")
-assert (
-    EXPORTS_REPOSITORY is not None
-), "You need to set EXPORTS_REPOSITORY in your environment variables"
 MAX_ROWS = int(os.getenv("MAX_ROWS", "8_000"))
 CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "2_000"))
-DATASET_VIEWE_API_URL = "https://datasets-server.huggingface.co/"
 DATASETS_TOPICS_ORGANIZATION = os.getenv(
     "DATASETS_TOPICS_ORGANIZATION", "datasets-topics"
 )
 USE_ARROW_STYLE = int(os.getenv("USE_ARROW_STYLE", "0"))
-USE_CUML = int(os.getenv("USE_CUML", "0"))
 if USE_CUML:
     from cuml.manifold import UMAP
     from cuml.cluster import HDBSCAN
@@ -55,14 +55,12 @@ else:
     from umap import UMAP
     from hdbscan import HDBSCAN
-USE_LLM_TEXT_GENERATION = int(os.getenv("USE_LLM_TEXT_GENERATION", "1"))
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
 api = HfApi(token=HF_TOKEN)
-session = requests.Session()
 sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
 # Representation model
@@ -98,41 +96,6 @@ else:
 vectorizer_model = CountVectorizer(stop_words="english")
-def get_split_rows(dataset, config, split):
-    config_size = session.get(
-        f"{DATASET_VIEWE_API_URL}/size?dataset={dataset}&config={config}",
-        timeout=20,
-    ).json()
-    if "error" in config_size:
-        raise Exception(f"Error fetching config size: {config_size['error']}")
-    split_size = next(
-        (s for s in config_size["size"]["splits"] if s["split"] == split),
-        None,
-    )
-    if split_size is None:
-        raise Exception(f"Error fetching split {split} in config {config}")
-    return split_size["num_rows"]
-def get_parquet_urls(dataset, config, split):
-    parquet_files = session.get(
-        f"{DATASET_VIEWE_API_URL}/parquet?dataset={dataset}&config={config}&split={split}",
-        timeout=20,
-    ).json()
-    if "error" in parquet_files:
-        raise Exception(f"Error fetching parquet files: {parquet_files['error']}")
-    parquet_urls = [file["url"] for file in parquet_files["parquet_files"]]
-    logging.debug(f"Parquet files: {parquet_urls}")
-    return ",".join(f"'{url}'" for url in parquet_urls)
-def get_docs_from_parquet(parquet_urls, column, offset, limit):
-    SQL_QUERY = f"SELECT {column} FROM read_parquet([{parquet_urls}]) LIMIT {limit} OFFSET {offset};"
-    df = duckdb.sql(SQL_QUERY).to_df()
-    return df[column].tolist()
-# @spaces.GPU
 def calculate_embeddings(docs):
     return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
@@ -143,7 +106,6 @@ def calculate_n_neighbors_and_components(n_rows):
     return n_neighbors, n_components
-# @spaces.GPU
 def fit_model(docs, embeddings, n_neighbors, n_components):
     umap_model = UMAP(
         n_neighbors=n_neighbors,
@@ -254,18 +216,16 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
         reduced_embeddings_array = np.vstack(reduced_embeddings_list)
         topics_info = base_model.get_topic_info()
-        all_topics, _ = base_model.transform(all_docs)
-        all_topics = np.array(all_topics)
         sub_title = (
             f"Data map for the entire dataset ({limit} rows) using the column '{column}'"
             if full_processing
             else f"Data map for a sample of the dataset (first {limit} rows) using the column '{column}'"
         )
         topic_plot = (
             base_model.visualize_document_datamap(
                 docs=all_docs,
                 reduced_embeddings=reduced_embeddings_array,
                 title=dataset,
                 sub_title=sub_title,
@@ -291,7 +251,6 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
                 title=dataset,
             )
         )
         rows_processed += len(docs)
         progress = min(rows_processed / limit, 1.0)
         logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
@@ -320,10 +279,10 @@ def generate_topics(dataset, config, split, column, nested_column, plot_type):
     else:
         topic_plot.write_image(plot_png)
-    all_topics, _ = base_model.transform(all_docs)
     topic_info = base_model.get_topic_info()
-    topic_names = {row["Topic"]: row["Name"] for index, row in topic_info.iterrows()}
     topic_names_array = np.array(
         [
             topic_names.get(topic, "No Topic").split("_")[1].strip("-")
@@ -461,21 +420,20 @@ with gr.Blocks() as demo:
                 text_column_dropdown: gr.Dropdown(label="Text column name"),
                 nested_text_column_dropdown: gr.Dropdown(visible=False),
             }
-        info_resp = session.get(
-            f"{DATASET_VIEWE_API_URL}/info?dataset={dataset}", timeout=20
-        ).json()
-        if "error" in info_resp:
             return {
                 subset_dropdown: gr.Dropdown(visible=False),
                 split_dropdown: gr.Dropdown(visible=False),
                 text_column_dropdown: gr.Dropdown(label="Text column name"),
                 nested_text_column_dropdown: gr.Dropdown(visible=False),
             }
-        subsets: list[str] = list(info_resp["dataset_info"])
         subset = default_subset if default_subset in subsets else subsets[0]
-        splits: list[str] = list(info_resp["dataset_info"][subset]["splits"])
         split = default_split if default_split in splits else splits[0]
-        features = info_resp["dataset_info"][subset]["features"]
         def _is_string_feature(feature):
             return isinstance(feature, dict) and feature.get("dtype") == "string"

 import os
 import datamapplot
 import numpy as np
 from dotenv import load_dotenv
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
 from src.hub import create_space_with_content
 from src.templates import REPRESENTATION_PROMPT, SPACE_REPO_CARD_CONTENT
+from src.viewer_api import (
+    get_split_rows,
+    get_parquet_urls,
+    get_docs_from_parquet,
+    get_info,
+)
+# Load environment variables
 load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
 assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
 MAX_ROWS = int(os.getenv("MAX_ROWS", "8_000"))
 CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "2_000"))
 DATASETS_TOPICS_ORGANIZATION = os.getenv(
     "DATASETS_TOPICS_ORGANIZATION", "datasets-topics"
 )
 USE_ARROW_STYLE = int(os.getenv("USE_ARROW_STYLE", "0"))
+USE_CUML = int(os.getenv("USE_CUML", "1"))
+USE_LLM_TEXT_GENERATION = int(os.getenv("USE_LLM_TEXT_GENERATION", "1"))
+# Use cuml lib only if configured
 if USE_CUML:
     from cuml.manifold import UMAP
     from cuml.cluster import HDBSCAN
     from umap import UMAP
     from hdbscan import HDBSCAN
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
 api = HfApi(token=HF_TOKEN)
 sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
 # Representation model
 vectorizer_model = CountVectorizer(stop_words="english")
 def calculate_embeddings(docs):
     return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
     return n_neighbors, n_components
 def fit_model(docs, embeddings, n_neighbors, n_components):
     umap_model = UMAP(
         n_neighbors=n_neighbors,
         reduced_embeddings_array = np.vstack(reduced_embeddings_list)
         topics_info = base_model.get_topic_info()
+        all_topics = base_model.topics_
         sub_title = (
             f"Data map for the entire dataset ({limit} rows) using the column '{column}'"
             if full_processing
             else f"Data map for a sample of the dataset (first {limit} rows) using the column '{column}'"
         )
         topic_plot = (
             base_model.visualize_document_datamap(
                 docs=all_docs,
+                topics=all_topics,
                 reduced_embeddings=reduced_embeddings_array,
                 title=dataset,
                 sub_title=sub_title,
                 title=dataset,
             )
         )
         rows_processed += len(docs)
         progress = min(rows_processed / limit, 1.0)
         logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
     else:
         topic_plot.write_image(plot_png)
+    all_topics = base_model.topics_
     topic_info = base_model.get_topic_info()
+    topic_names = {row["Topic"]: row["Name"] for _, row in topic_info.iterrows()}
     topic_names_array = np.array(
         [
             topic_names.get(topic, "No Topic").split("_")[1].strip("-")
                 text_column_dropdown: gr.Dropdown(label="Text column name"),
                 nested_text_column_dropdown: gr.Dropdown(visible=False),
             }
+        try:
+            info_resp = get_info(dataset)
+        except Exception:
             return {
                 subset_dropdown: gr.Dropdown(visible=False),
                 split_dropdown: gr.Dropdown(visible=False),
                 text_column_dropdown: gr.Dropdown(label="Text column name"),
                 nested_text_column_dropdown: gr.Dropdown(visible=False),
             }
+        subsets: list[str] = list(info_resp)
         subset = default_subset if default_subset in subsets else subsets[0]
+        splits: list[str] = list(info_resp[subset]["splits"])
         split = default_split if default_split in splits else splits[0]
+        features = info_resp[subset]["features"]
         def _is_string_feature(feature):
             return isinstance(feature, dict) and feature.get("dtype") == "string"

src/templates.py CHANGED Viewed

@@ -5,12 +5,7 @@ You are a helpful, respectful and honest assistant for labeling topics.
 """
 EXAMPLE_PROMPT = """
-I have a topic that contains the following documents:
-- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
-- Meat, but especially beef, is the word food in terms of emissions.
-- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
-The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
 Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
@@ -19,10 +14,7 @@ Based on the information about the topic above, please create a short label of t
 MAIN_PROMPT = """
 [INST]
-I have a topic that contains the following documents:
-[DOCUMENTS]
-The topic is described by the following keywords: '[KEYWORDS]'.
 Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
 [/INST]
@@ -32,7 +24,7 @@ REPRESENTATION_PROMPT = SYSTEM_PROMPT + EXAMPLE_PROMPT + MAIN_PROMPT
 SPACE_REPO_CARD_CONTENT = """
 ---
-title: {dataset_id} topic modeling
 sdk: static
 pinned: false
 datasets:

 """
 EXAMPLE_PROMPT = """
+I have a topic that is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
 Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
 MAIN_PROMPT = """
 [INST]
+I have a topic that is described by the following keywords: '[KEYWORDS]'.
 Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
 [/INST]
 SPACE_REPO_CARD_CONTENT = """
 ---
+title: {dataset_id}
 sdk: static
 pinned: false
 datasets:

src/viewer_api.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import requests
+import duckdb
+DATASET_VIEWER_API_URL = "https://datasets-server.huggingface.co/"
+session = requests.Session()
+def fetch_json(url, params=None, timeout=20):
+    response = session.get(url, params=params, timeout=timeout)
+    response.raise_for_status()
+    data = response.json()
+    if "error" in data:
+        raise Exception(f"Error fetching data: {data['error']}")
+    return data
+def get_split_rows(dataset, config, split):
+    url = f"{DATASET_VIEWER_API_URL}/size"
+    params = {"dataset": dataset, "config": config}
+    config_size = fetch_json(url, params)
+    split_size = next(
+        (s for s in config_size["size"]["splits"] if s["split"] == split), None
+    )
+    if split_size is None:
+        raise Exception(f"Error fetching split {split} in config {config}")
+    return split_size["num_rows"]
+def get_parquet_urls(dataset, config, split):
+    url = f"{DATASET_VIEWER_API_URL}/parquet"
+    params = {"dataset": dataset, "config": config, "split": split}
+    parquet_files = fetch_json(url, params)
+    parquet_urls = [file["url"] for file in parquet_files["parquet_files"]]
+    return ",".join(f"'{url}'" for url in parquet_urls)
+def get_docs_from_parquet(parquet_urls, column, offset, limit):
+    sql_query = f"SELECT {column} FROM read_parquet([{parquet_urls}]) LIMIT {limit} OFFSET {offset};"
+    df = duckdb.sql(sql_query).to_df()
+    return df[column].tolist()
+def get_info(dataset):
+    url = f"{DATASET_VIEWER_API_URL}/info"
+    params = {"dataset": dataset}
+    info_resp = fetch_json(url, params)
+    return info_resp["dataset_info"]