Update app.py
app.py
CHANGED
@@ -1,85 +1,601 @@
Removed:

```python
# =============================================================================
#
# =============================================================================
#
#
# Install dependencies with:
#   pip install polars marimo
# =============================================================================

import marimo as mo  # Marimo provides UI and lazy-loading decorators

# ------------------------------------------------------------------------------
# 2. Lazy Load the Dataset
#
# Use the recursive globbing pattern "**/*.parquet" to read all Parquet files
# from all subdirectories on Hugging Face.
# ------------------------------------------------------------------------------
dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"

@mo.lazy  # Use Marimo's lazy decorator to defer data loading until needed.
def load_dataset():
    # Load all Parquet files matching the recursive pattern.
    df = pl.read_parquet(dataset_url)
    # Uncomment the next line to read local JSONL files instead:
    # df = pl.read_ndjson("/local/path/to/*.jsonl")
    return df

preview
```
Added:

````python
import os
import polars as pl
import marimo

__generated_with = "0.10.15"
app = marimo.App(
    app_title="Polars & Hugging Face Data Exploration",
    css_file="../custom.css",
)

# marimo resolves cell parameters such as `mo` and `pl` by the names that
# other cells return, so define them in a setup cell that the rest of the
# notebook can depend on.
@app.cell
def imports():
    import marimo as mo
    import polars as pl
    return mo, pl
# =============================================================================
# Intro Cell
# =============================================================================
@app.cell
def introduction(mo):
    mo.md(
        r"""
        # Exploring a Hugging Face Dataset with Polars

        In this notebook we demonstrate how to:

        - **Lazy-load** a Hugging Face dataset (all Parquet files, using a recursive globbing pattern).
        - **Preview** the loaded DataFrame with metadata.
        - **Interactively expand** the DataFrame view.
        - Explore over 30 additional examples of Polars I/O functions and DataFrame manipulations, especially for handling large text data.

        **Prerequisites:**

        - Install dependencies via:

          ```bash
          pip install polars marimo
          ```

        - Make sure your Hugging Face API token is available in the `HF_TOKEN` environment variable.

        ![](https://raw.githubusercontent.com/pola-rs/polars-static/master/banner/polars_github_banner.svg)
        """
    )
    return
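# Tip (not part of the original commit): with marimo installed, this file can
# be opened for interactive editing with `marimo edit app.py`, or served as a
# read-only app with `marimo run app.py`.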
# =============================================================================
# Load HF_TOKEN from the environment
# =============================================================================
@app.cell
def load_token(mo):
    hf_token = os.environ.get("HF_TOKEN")
    # Report only whether the token is present; echoing the raw secret into
    # the notebook output would leak it.
    token_status = "set" if hf_token else "NOT set"
    mo.md(f"""
    **Hugging Face Token:** `{token_status}`

    *(Ensure that HF_TOKEN is set in your environment.)*
    """)
    return
+
# =============================================================================
|
48 |
+
# 1. Lazy-load the Dataset
|
|
|
|
|
49 |
# =============================================================================
|
50 |
+
@app.cell
|
51 |
+
def lazy_load_dataset(mo, pl):
|
52 |
+
# Use a recursive globbing pattern to load all Parquet files from all subdirectories.
|
53 |
+
dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"
|
54 |
+
|
55 |
+
@mo.lazy # The mo.lazy decorator defers execution until the data is needed.
|
56 |
+
def load_dataset():
|
57 |
+
# Load all Parquet files matching the recursive pattern.
|
58 |
+
df = pl.read_parquet(dataset_url)
|
59 |
+
# --- Alternative for local JSONL files (uncomment if needed):
|
60 |
+
# df = pl.read_ndjson("/local/path/to/*.jsonl")
|
61 |
+
return df
|
62 |
|
63 |
+
df = load_dataset()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
return df
|
65 |
|
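# A hedged sketch, not in the original commit: the same dataset read truly
# lazily with Polars' scan_parquet. The scan only builds a query plan; rows
# are read when .collect() runs.
@app.cell
def lazy_scan_example(pl):
    scan = pl.scan_parquet("hf://datasets/cicero-im/processed_prompt1/**/*.parquet")
    sample = scan.head(5).collect()  # materialize just the first five rows
    return sample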
# =============================================================================
# 2. Preview the DataFrame with Metadata
# =============================================================================
@app.cell
def preview_data(df, mo):
    # `df` is the (eager) DataFrame produced by the lazy_load_dataset cell;
    # marimo passes it in by name.
    preview = mo.ui.table(df.head())
    mo.md(
        r"""
        ## Data Preview

        Below is a preview of the first few rows along with basic metadata.
        """
    )
    return preview
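# A hedged addition, not in the original commit: basic metadata to accompany
# the preview, using standard Polars DataFrame attributes.
@app.cell
def dataset_metadata(df, mo):
    mo.md(
        f"**Rows:** {df.height} | **Columns:** {df.width}\n\n"
        f"**Schema:** `{dict(df.schema)}`"
    )
    return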
# =============================================================================
# 3. Expand the DataFrame for Better Visualization
# =============================================================================
@app.cell
def expand_view(df, mo):
    # mo.ui.button takes the click handler as a constructor argument; the
    # handler maps the button's value, so after a click expand_button.value
    # holds a full table view of the DataFrame.
    expand_button = mo.ui.button(
        label="Expand Dataframe",
        on_click=lambda _: mo.ui.table(df),
    )
    mo.md(
        r"""
        ## Expand Dataframe

        Click the button below to expand the DataFrame view.
        """
    )
    return expand_button
# =============================================================================
# 4. Column Selection Tips (as Markdown)
# =============================================================================
@app.cell
def column_selection_tips(mo):
    mo.md(
        r"""
        ## Column Selection Tips

        **Example 1: Select specific columns by name:**
        ```python
        selected_columns_df = df.select(["column1", "column2"])
        ```

        **Example 2: Select all columns except column 'a':**
        ```python
        all_except_a_df = df.select(pl.exclude("a"))
        ```

        **Example 3: Select a range of columns (e.g., from the 2nd to the 4th column):**
        ```python
        range_columns_df = df.select(pl.col(df.columns[1:4]))
        ```
        """
    )
    return
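# A hedged addition, not in the original commit: two more selection patterns
# that pair well with the tips above (column names are illustrative).
#
#   # By regular expression: every column whose name starts with "text_"
#   text_cols_df = df.select(pl.col("^text_.*$"))
#
#   # By dtype: all UTF-8 string columns
#   string_cols_df = df.select(pl.col(pl.Utf8))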
# =============================================================================
# Additional Polars I/O and DataFrame Examples (Markdown Cells)
# =============================================================================

@app.cell
def example_1(mo):
    mo.md(
        r"""
        ### Example 1: Eagerly Read a Single Parquet File

        ```python
        df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
        ```
        """
    )
    return

@app.cell
def example_2(mo):
    mo.md(
        r"""
        ### Example 2: Read Multiple Parquet Files Using Globbing

        ```python
        df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-*.parquet")
        ```
        """
    )
    return

@app.cell
def example_3(mo):
    mo.md(
        r"""
        ### Example 3: Lazily Scan Parquet Files with Recursive Globbing

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/cicero-im/processed_prompt1/**/*.parquet")
        ```
        """
    )
    return

@app.cell
def example_4(mo):
    mo.md(
        r"""
        ### Example 4: Read a JSON File into a DataFrame

        ```python
        df_json = pl.read_json("data/sample.json")
        ```
        """
    )
    return

@app.cell
def example_5(mo):
    mo.md(
        r"""
        ### Example 5: Read JSON with a Specified Schema

        ```python
        schema = {"name": pl.Utf8, "age": pl.Int64}
        df_json = pl.read_json("data/sample.json", schema=schema)
        ```
        """
    )
    return

@app.cell
def example_6(mo):
    mo.md(
        r"""
        ### Example 6: Write a DataFrame to NDJSON Format

        ```python
        df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
        ndjson_str = df.write_ndjson()
        print(ndjson_str)
        ```
        """
    )
    return

@app.cell
def example_7(mo):
    mo.md(
        r"""
        ### Example 7: Get the Schema of a Parquet File Without Reading Data

        ```python
        schema = pl.read_parquet_schema("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
        print(schema)
        ```
        """
    )
    return

@app.cell
def example_8(mo):
    mo.md(
        r"""
        ### Example 8: Scan Parquet Files with Hive Partitioning Enabled

        ```python
        df = pl.scan_parquet("hf://datasets/myuser/my-dataset/data/**/*.parquet", hive_partitioning=True)
        ```
        """
    )
    return

@app.cell
def example_9(mo):
    mo.md(
        r"""
        ### Example 9: Lazily Scan NDJSON Files Using Globbing

        ```python
        df_lazy = pl.scan_ndjson("data/*.jsonl")
        ```
        """
    )
    return

@app.cell
def example_10(mo):
    mo.md(
        r"""
        ### Example 10: Write a DataFrame to Partitioned Parquet Files

        ```python
        df = pl.DataFrame({"date": ["2025-01-01", "2025-01-02"], "value": [100, 200]})
        df.write_parquet("output/", partition_by=["date"])
        ```
        """
    )
    return

@app.cell
def example_11(mo):
    mo.md(
        r"""
        ### Example 11: Read JSON with Custom Inference Length

        ```python
        df = pl.read_json("data/large_text.json", infer_schema_length=500)
        ```
        """
    )
    return

@app.cell
def example_12(mo):
    mo.md(
        r"""
        ### Example 12: Read JSON with Schema Overrides

        ```python
        schema = {"id": pl.Int64, "text": pl.Utf8}
        overrides = {"id": pl.Int32}
        df = pl.read_json("data/large_text.json", schema=schema, schema_overrides=overrides)
        ```
        """
    )
    return

@app.cell
def example_13(mo):
    mo.md(
        r"""
        ### Example 13: Write a DataFrame to NDJSON and Return as String

        ```python
        df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
        ndjson_output = df.write_ndjson()
        print(ndjson_output)
        ```
        """
    )
    return

@app.cell
def example_14(mo):
    mo.md(
        r"""
        ### Example 14: Scan Parquet Files with Cloud Storage Options

        ```python
        storage_options = {"token": os.environ.get("HF_TOKEN")}
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", storage_options=storage_options)
        ```
        """
    )
    return

@app.cell
def example_15(mo):
    mo.md(
        r"""
        ### Example 15: Scan NDJSON Files with Cloud Storage Options

        ```python
        storage_options = {"token": os.environ.get("HF_TOKEN")}
        df_lazy = pl.scan_ndjson("hf://datasets/myuser/my-dataset/**/*.jsonl", storage_options=storage_options)
        ```
        """
    )
    return

@app.cell
def example_16(mo):
    mo.md(
        r"""
        ### Example 16: Predicate Pushdown Example

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Only load rows where 'value' > 100
        df_filtered = df_lazy.filter(pl.col("value") > 100)
        result = df_filtered.collect()
        ```
        """
    )
    return
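# A hedged addition, not in the original commit: both pushdowns can be checked
# with LazyFrame.explain(), which prints the optimized query plan. A sketch,
# with an illustrative dataset path:
#
#   plan = (
#       pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
#       .filter(pl.col("value") > 100)
#       .select(["id", "value"])
#       .explain()
#   )
#   print(plan)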
@app.cell
def example_17(mo):
    mo.md(
        r"""
        ### Example 17: Projection Pushdown Example

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Only select the 'text' and 'id' columns to reduce memory footprint
        df_proj = df_lazy.select(["id", "text"])
        result = df_proj.collect()
        ```
        """
    )
    return

@app.cell
def example_18(mo):
    mo.md(
        r"""
        ### Example 18: Collecting a Lazy DataFrame

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Perform lazy operations...
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return

@app.cell
def example_19(mo):
    mo.md(
        r"""
        ### Example 19: Filtering on a Large Text Column

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Filter rows where the 'text' column contains a long string pattern
        df_filtered = df.filter(pl.col("text").str.contains("important keyword"))
        print(df_filtered.head())
        ```
        """
    )
    return
@app.cell
def example_20(mo):
    mo.md(
        r"""
        ### Example 20: Using String Length on a Text Column

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Compute the length (in characters) of text in the 'text' column;
        # str.len_chars() is the current Polars API.
        df = df.with_columns(text_length=pl.col("text").str.len_chars())
        print(df.head())
        ```
        """
    )
    return

@app.cell
def example_21(mo):
    mo.md(
        r"""
        ### Example 21: Grouping by a Large Text Field

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        grouped = df.group_by("category").agg(
            pl.col("text").str.len_chars().mean().alias("avg_text_length")
        )
        # df is eager here, so the aggregation already yields a DataFrame
        # (no .collect() needed).
        print(grouped)
        ```
        """
    )
    return
@app.cell
def example_22(mo):
    mo.md(
        r"""
        ### Example 22: Joining Two DataFrames on a Common Key

        ```python
        df1 = pl.DataFrame({"id": [1, 2, 3], "text": ["A", "B", "C"]})
        df2 = pl.DataFrame({"id": [1, 2, 3], "value": [100, 200, 300]})
        joined = df1.join(df2, on="id")
        print(joined)
        ```
        """
    )
    return
@app.cell
def example_23(mo):
    mo.md(
        r"""
        ### Example 23: Using join_asof for Time-based Joins

        ```python
        from datetime import datetime

        # pl.datetime_range generates hourly timestamps; the old
        # pl.date_range(low=..., high=...) signature no longer exists.
        df1 = pl.DataFrame({
            "time": pl.datetime_range(
                datetime(2025, 1, 1), datetime(2025, 1, 2), interval="1h", eager=True
            ),
            "text": ["sample text"] * 25,
        })
        df2 = pl.DataFrame({
            "time": pl.datetime_range(
                datetime(2025, 1, 1, 0, 30), datetime(2025, 1, 2), interval="1h", eager=True
            ),
            "value": list(range(24)),  # 24 half-hour-offset timestamps
        })
        # Perform an asof join to match each row with the nearest earlier timestamp
        joined = df1.sort("time").join_asof(df2.sort("time"), on="time")
        print(joined)
        ```
        """
    )
    return
@app.cell
def example_24(mo):
    mo.md(
        r"""
        ### Example 24: Reading a Parquet File with Low Memory Option

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", low_memory=True)
        print(df.head())
        ```
        """
    )
    return

@app.cell
def example_25(mo):
    mo.md(
        r"""
        ### Example 25: Scanning Parquet Files with a Parallel Strategy

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", parallel="auto")
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return

@app.cell
def example_26(mo):
    mo.md(
        r"""
        ### Example 26: Reading a Large JSON File into a DataFrame

        ```python
        df = pl.read_json("data/large_text.json", infer_schema_length=200)
        print(df.head())
        ```
        """
    )
    return

@app.cell
def example_27(mo):
    mo.md(
        r"""
        ### Example 27: Using DataFrame.head() on a Large Text Dataset

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        print(df.head(10))
        ```
        """
    )
    return

@app.cell
def example_28(mo):
    mo.md(
        r"""
        ### Example 28: Using DataFrame.tail() on a Large Text Dataset

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        print(df.tail(10))
        ```
        """
    )
    return

@app.cell
def example_29(mo):
    mo.md(
        r"""
        ### Example 29: Scanning NDJSON Files with Rechunking

        ```python
        df_lazy = pl.scan_ndjson("data/*.jsonl", rechunk=True)
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return

@app.cell
def example_30(mo):
    mo.md(
        r"""
        ### Example 30: Scanning Parquet Files While Allowing Missing Columns

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", allow_missing_columns=True)
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return
# =============================================================================
# End of Notebook
# =============================================================================
@app.cell
def conclusion(mo):
    mo.md(
        r"""
        # Conclusion

        This notebook showcased:

        - How to lazy-load a Hugging Face dataset using Polars with recursive globbing.
        - How to preview and interactively expand the DataFrame.
        - Over 30 examples covering various Polars I/O functions and DataFrame operations, which are especially useful when working with large text data.

        For more information, please refer to:

        - [Polars Documentation](https://docs.pola.rs/)
        - [Hugging Face Hub Documentation](https://huggingface.co/docs)
        - [Marimo Notebook Documentation](https://marimo.io/)

        Happy Data Exploring!
        """
    )
    return

if __name__ == "__main__":
    app.run()
````