# =============================================================================
# Marimo Notebook Template: Lazy Load & Interactively View a Hugging Face
# Parquet Dataset
# =============================================================================
# This template demonstrates how to:
#   • Lazily scan a Hugging Face dataset across all directories using a
#     recursive globbing pattern for Parquet files.
#   • Preview the loaded data along with basic metadata (row count, columns).
#   • Provide an interactive button to expand to the full DataFrame view.
#   • (Optionally) Read local JSONL files (commented out).
#
# Note: Polars accepts Hugging Face paths of the form
#     hf://datasets/{username}/{dataset}/{path_to_file}
# and globbing patterns such as "**/*.parquet" match files recursively.
# `pl.scan_parquet` builds a lazy query; nothing is materialized until
# `.collect()` is called.
#
# Install dependencies with:
#     pip install polars marimo
# =============================================================================

# ------------------------------------------------------------------------------
# 1. Imports
# ------------------------------------------------------------------------------
import marimo as mo
import polars as pl

# ------------------------------------------------------------------------------
# 2. Lazy Load the Dataset
#
# Use the recursive globbing pattern "**/*.parquet" to scan all Parquet files
# from all subdirectories on Hugging Face.
# ------------------------------------------------------------------------------
dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"


def load_dataset() -> pl.LazyFrame:
    """Return a lazy query over every Parquet file matching ``dataset_url``.

    Laziness comes from ``pl.scan_parquet``; rows are only read when the
    returned frame is collected. ``mo.lazy`` is intentionally not used here:
    it defers *rendering* of a UI element and is not a function decorator for
    data loading.

    Returns:
        A ``pl.LazyFrame`` describing the dataset.
    """
    # Uncomment the next line to scan local JSONL files instead:
    # return pl.scan_ndjson("/local/path/to/*.jsonl")
    return pl.scan_parquet(dataset_url)


# `df` is a lazy query; it is materialized on demand by the helpers below.
df = load_dataset()


# ------------------------------------------------------------------------------
# 3. Preview the DataFrame
#
# Show the first few rows in an interactive table, with the metadata
# (row count and column names) placed in the table's label.
# ------------------------------------------------------------------------------
def preview_dataframe(df: "pl.DataFrame | pl.LazyFrame", n: int = 5) -> mo.ui.table:
    """Build an interactive table of the first ``n`` rows plus metadata.

    Args:
        df: The data to preview; eager and lazy frames are both accepted.
        n: Number of leading rows to display (defaults to 5, matching
            ``DataFrame.head()``).

    Returns:
        A ``mo.ui.table`` whose label carries the row count and column names.
    """
    # `.lazy()` is valid on both DataFrame and LazyFrame, so a single code
    # path serves either input type.
    frame = df.lazy()
    head = frame.head(n).collect()
    # Count rows through the query engine instead of collecting every column.
    row_count = frame.select(pl.len()).collect().item()

    column_list = ", ".join("`{}`".format(name) for name in head.columns)
    label = (
        f"Showing first {head.height} of {row_count:,} rows · "
        f"{head.width} columns: {column_list}"
    )
    # `mo.ui.table` has no `metadata` argument; the metadata is passed via
    # `label`, and per-column summaries are enabled explicitly.
    return mo.ui.table(
        head,
        selection=None,
        show_column_summaries=True,
        label=label,
    )


# Obtain and render the preview.
preview = preview_dataframe(df)
preview

# ------------------------------------------------------------------------------
# 4. Expand the DataFrame for Better Visualization
#
# Create an interactive button that, once clicked, renders the full DataFrame.
# `mo.ui.button` takes `on_click` as a constructor callback that maps the
# current value to the new value; here the value flips from False to True on
# the first click and stays True afterwards.
# ------------------------------------------------------------------------------
expand_option = mo.ui.button(
    label="Expand Dataframe",
    value=False,
    on_click=lambda _already_expanded: True,
)

# Render the expand button.
expand_option


def expand_dataframe():
    """Return a paginated table of the full dataset once the button is clicked.

    Returns:
        ``None`` while ``expand_option`` has not been clicked; otherwise a
        ``mo.ui.table`` over the fully collected dataset.
    """
    if not expand_option.value:
        return None
    # Collecting materializes the entire dataset in memory.
    full_frame = df.lazy().collect()
    return mo.ui.table(full_frame, pagination=True, selection=None)


# NOTE(review): in a marimo notebook, read `expand_option.value` from a cell
# other than the one that creates `expand_option`, so that clicking the button
# re-runs this code — confirm against the notebook's cell layout.
expand_dataframe()

# ------------------------------------------------------------------------------
# 5. Commented-Out Formulas for Column Selection
#
# The following examples (commented out) demonstrate different column
# selection techniques:
#
# Example 1: Select specific columns by name:
# selected_columns_df = df.select(["column1", "column2"])
#
# Example 2: Select all columns except column 'a':
# all_except_a_df = df.select(pl.exclude("a"))
#
# Example 3: Select a range of columns (e.g., from the second to the fourth
# column). `collect_schema()` resolves column names without loading any rows:
# range_columns_df = df.select(pl.col(df.collect_schema().names()[1:4]))
#
# Each result is still lazy; call `.collect()` to materialize it.
# ------------------------------------------------------------------------------