arthrod committed
Commit d25a3d8 · verified · 1 Parent(s): b381380

Update app.py

Files changed (1):
  app.py +595 -79
app.py CHANGED
@@ -1,85 +1,601 @@
  # =============================================================================
- # Marimo Notebook Template: Lazy Load & Interactively View a Hugging Face Parquet Dataset
  # =============================================================================
- # This template demonstrates how to:
- #   • Lazy load a Hugging Face dataset from all directories using a recursive
- #     globbing pattern for Parquet files.
- #   • Preview the loaded DataFrame along with metadata using a custom command.
- #   • Provide an interactive button to expand the DataFrame view.
- #   • (Optionally) Read local JSONL files (commented out).
- #
- # Note: According to the Polars documentation, you can read multiple files with:
- #   pl.read_parquet("hf://datasets/{username}/{dataset}/{path_to_file}")
- # and globbing patterns such as "**/*.parquet" work to query all files recursively.
- #
- # Install dependencies with:
- #   pip install polars marimo
  # =============================================================================

- import polars as pl
- import marimo as mo  # Marimo provides UI and lazy-loading decorators
-
- # ------------------------------------------------------------------------------
- # 2. Lazy Load the Dataset
- #
- # Use the recursive globbing pattern "**/*.parquet" to read all Parquet files
- # from all subdirectories on Hugging Face.
- # ------------------------------------------------------------------------------
- dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"
-
- @mo.lazy  # Use Marimo's lazy decorator to defer data loading until needed.
- def load_dataset():
-     # Load all Parquet files matching the recursive pattern.
-     df = pl.read_parquet(dataset_url)
-     # Uncomment the next line to read local JSONL files instead:
-     # df = pl.read_ndjson("/local/path/to/*.jsonl")
      return df

- # Calling load_dataset() returns a lazy DataFrame that is materialized on demand.
- df = load_dataset()
-
- # ------------------------------------------------------------------------------
- # 3. Preview the DataFrame
- #
- # Define a custom command to preview the DataFrame with metadata.
- # mo.ui.table is assumed to render a rich interactive table.
- # ------------------------------------------------------------------------------
- def preview_dataframe(df: pl.DataFrame):
-     # Display a preview (first few rows) along with metadata (e.g., row count, column names).
-     return mo.ui.table(df.head(), metadata=True)
-
- # Obtain and render the preview.
- preview = preview_dataframe(df)
- preview
-
- # ------------------------------------------------------------------------------
- # 4. Expand the DataFrame for Better Visualization
- #
- # Create an interactive button that, when clicked, renders the full DataFrame
- # with expanded display options (e.g. full width).
- # ------------------------------------------------------------------------------
- expand_option = mo.ui.button(label="Expand Dataframe")
-
- @expand_option.on_click
- def expand_dataframe():
-     # Render the complete DataFrame view using the UI table component.
-     # Adjust display parameters such as width and height.
-     mo.ui.table(df, width="100%", height="auto")
-
- # Render the expand button.
- expand_option
-
- # ------------------------------------------------------------------------------
- # 5. Commented-Out Formulas for Column Selection
- #
- # The following examples (commented out) demonstrate different column selection techniques:
- #
- # Example 1: Select specific columns by name:
- # selected_columns_df = df.select(["column1", "column2"])
- #
- # Example 2: Select all columns except column 'a':
- # all_except_a_df = df.select(pl.exclude("a"))
- #
- # Example 3: Select a range of columns (e.g., from the second to the fourth column):
- # range_columns_df = df.select(pl.col(df.columns[1:4]))
- # ------------------------------------------------------------------------------
+ import marimo
+
+ __generated_with = "0.10.15"
+ app = marimo.App(app_title="Polars & Hugging Face Data Exploration", css_file="../custom.css")
+
+ # =============================================================================
+ # Setup Cell
+ # =============================================================================
+ @app.cell
+ def setup():
+     # Import inside a cell so that marimo can pass `mo`, `os`, and `pl` as
+     # parameters to the cells below.
+     import os
+
+     import marimo as mo
+     import polars as pl
+     return mo, os, pl
+
+ # =============================================================================
+ # Intro Cell
+ # =============================================================================
+ @app.cell
+ def introduction(mo):
+     mo.md(
+         r"""
+         # Exploring a Hugging Face Dataset with Polars
+
+         In this notebook we demonstrate how to:
+
+         - **Lazy-load** a Hugging Face dataset (all Parquet files, using a recursive globbing pattern).
+         - **Preview** the loaded DataFrame with metadata.
+         - **Interactively expand** the DataFrame view.
+         - Explore 30 additional examples of Polars I/O functions and DataFrame manipulations, especially for handling large text data.
+
+         **Prerequisites:**
+
+         - Install dependencies via:
+           ```bash
+           pip install polars marimo
+           ```
+         - Make sure your Hugging Face API token is available in the `HF_TOKEN` environment variable.
+
+         ![Hugging Face logo](https://huggingface.co/front/assets/huggingface_logo.svg)
+         """
+     )
+     return
+
  # =============================================================================
+ # Load HF_TOKEN from the environment
  # =============================================================================
+ @app.cell
+ def load_token(mo, os):
+     hf_token = os.environ.get("HF_TOKEN")
+     # Report only whether the token is set; never echo the secret itself.
+     token_status = "set" if hf_token else "NOT set"
+     mo.md(f"""
+     **Hugging Face Token:** `{token_status}`
+     *(Ensure that HF_TOKEN is set in your environment.)*
+     """)
+     return
+
+ # =============================================================================
+ # 1. Lazy-load the Dataset
  # =============================================================================
+ @app.cell
+ def lazy_load_dataset(pl):
+     # Use a recursive globbing pattern to load all Parquet files from all subdirectories.
+     dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"
+
+     # pl.scan_parquet defers execution: it builds a LazyFrame and reads no data
+     # until .collect() is called. (marimo's mo.lazy lazily renders UI elements,
+     # so Polars' own lazy API is the right tool for deferred data loading.)
+     df = pl.scan_parquet(dataset_url)
+     # --- Alternative for local JSONL files (uncomment if needed):
+     # df = pl.scan_ndjson("/local/path/to/*.jsonl")
      return df
+
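+ # A minimal usage sketch for the LazyFrame above: .head(5) trims the query
+ # plan to five rows and .collect() executes it, so only a small sample is
+ # actually downloaded.
+ @app.cell
+ def sample_rows(df):
+     sample = df.head(5).collect()
+     sample
+     return
+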
+ # =============================================================================
+ # 2. Preview the DataFrame with Metadata
+ # =============================================================================
+ @app.cell
+ def preview_data(mo, df):
+     # Collect only the first few rows so the preview stays cheap; the rest of
+     # the dataset is never downloaded.
+     preview = mo.ui.table(df.head(10).collect())
+     mo.vstack([
+         mo.md(
+             r"""
+             ## Data Preview
+
+             Below is a preview of the first few rows along with the schema.
+             """
+         ),
+         # collect_schema() resolves the schema from the lazy plan without
+         # reading the data (Polars >= 1.0).
+         mo.md(f"**Schema:** `{df.collect_schema()}`"),
+         preview,
+     ])
+     return preview
+
+ # =============================================================================
+ # 3. Expand the DataFrame for Better Visualization
+ # =============================================================================
+ @app.cell
+ def expand_view(mo):
+     # mo.ui.button takes its click handler as a constructor argument; each
+     # click flips the stored value to True.
+     expand_button = mo.ui.button(label="Expand Dataframe", value=False, on_click=lambda _: True)
+     mo.vstack([
+         mo.md(
+             r"""
+             ## Expand Dataframe
+
+             Click the button below to expand the DataFrame view.
+             """
+         ),
+         expand_button,
+     ])
+     return expand_button
+
+ @app.cell
+ def expanded_table(mo, df, expand_button):
+     # Render the expanded table only after the button is clicked; cap the row
+     # count so a huge remote dataset cannot exhaust memory.
+     mo.ui.table(df.head(1000).collect()) if expand_button.value else None
+     return
+
+ # =============================================================================
+ # 4. Column Selection Tips (as Markdown)
+ # =============================================================================
+ @app.cell
+ def column_selection_tips(mo):
+     mo.md(
+         r"""
+         ## Column Selection Tips
+
+         **Example 1: Select specific columns by name:**
+         ```python
+         selected_columns_df = df.select(["column1", "column2"])
+         ```
+
+         **Example 2: Select all columns except column 'a':**
+         ```python
+         all_except_a_df = df.select(pl.exclude("a"))
+         ```
+
+         **Example 3: Select a range of columns (e.g., from the 2nd to the 4th column):**
+         ```python
+         range_columns_df = df.select(pl.col(df.columns[1:4]))
+         ```
+         """
+     )
+     return
+
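+ # A runnable sketch of the tips above on a small in-memory DataFrame (the
+ # column names "a", "column1", "column2" are placeholders, not dataset columns):
+ @app.cell
+ def column_selection_demo(pl):
+     toy = pl.DataFrame({"a": [1, 2], "column1": ["x", "y"], "column2": [0.1, 0.2]})
+     # Select everything except column "a"; the toy frame makes the result easy to check.
+     toy.select(pl.exclude("a"))
+     return
+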
+ # =============================================================================
+ # Additional Polars I/O and DataFrame Examples (Markdown Cells)
+ # =============================================================================
+
+ @app.cell
+ def example_1(mo):
+     mo.md(
+         r"""
+         ### Example 1: Eagerly Read a Single Parquet File
+
+         ```python
+         df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_2(mo):
+     mo.md(
+         r"""
+         ### Example 2: Read Multiple Parquet Files Using Globbing
+
+         ```python
+         df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-*.parquet")
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_3(mo):
+     mo.md(
+         r"""
+         ### Example 3: Lazily Scan Parquet Files with Recursive Globbing
+
+         ```python
+         df_lazy = pl.scan_parquet("hf://datasets/cicero-im/processed_prompt1/**/*.parquet")
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_4(mo):
+     mo.md(
+         r"""
+         ### Example 4: Read a JSON File into a DataFrame
+
+         ```python
+         df_json = pl.read_json("data/sample.json")
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_5(mo):
+     mo.md(
+         r"""
+         ### Example 5: Read JSON with a Specified Schema
+
+         ```python
+         schema = {"name": pl.Utf8, "age": pl.Int64}
+         df_json = pl.read_json("data/sample.json", schema=schema)
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_6(mo):
+     mo.md(
+         r"""
+         ### Example 6: Write a DataFrame to NDJSON Format
+
+         ```python
+         df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
+         ndjson_str = df.write_ndjson()
+         print(ndjson_str)
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_7(mo):
+     mo.md(
+         r"""
+         ### Example 7: Get the Schema of a Parquet File Without Reading Data
+
+         ```python
+         schema = pl.read_parquet_schema("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
+         print(schema)
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_8(mo):
+     mo.md(
+         r"""
+         ### Example 8: Scan Parquet Files with Hive Partitioning Enabled
+
+         ```python
+         df = pl.scan_parquet("hf://datasets/myuser/my-dataset/data/**/*.parquet", hive_partitioning=True)
+         ```
+         """
+     )
+     return
+
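+ # A short illustration of what hive partitioning means on disk (the paths and
+ # the "year" column are hypothetical):
+ @app.cell
+ def example_8b(mo):
+     mo.md(
+         r"""
+         With `hive_partitioning=True`, directory names of the form `key=value`
+         are parsed into columns, so filters on them can skip whole directories:
+
+         ```python
+         # Layout: data/year=2024/part-0.parquet, data/year=2025/part-0.parquet
+         df = pl.scan_parquet("data/**/*.parquet", hive_partitioning=True)
+         # The scan now exposes a "year" column usable in filters:
+         df.filter(pl.col("year") == 2025).collect()
+         ```
+         """
+     )
+     return
+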
+ @app.cell
+ def example_9(mo):
+     mo.md(
+         r"""
+         ### Example 9: Lazily Scan NDJSON Files Using Globbing
+
+         ```python
+         df_lazy = pl.scan_ndjson("data/*.jsonl")
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_10(mo):
+     mo.md(
+         r"""
+         ### Example 10: Write a DataFrame to Partitioned Parquet Files
+
+         ```python
+         df = pl.DataFrame({"date": ["2025-01-01", "2025-01-02"], "value": [100, 200]})
+         df.write_parquet("output/", partition_by=["date"])
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_11(mo):
+     mo.md(
+         r"""
+         ### Example 11: Read JSON with Custom Inference Length
+
+         ```python
+         df = pl.read_json("data/large_text.json", infer_schema_length=500)
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_12(mo):
+     mo.md(
+         r"""
+         ### Example 12: Read JSON with Schema Overrides
+
+         ```python
+         schema = {"id": pl.Int64, "text": pl.Utf8}
+         overrides = {"id": pl.Int32}
+         df = pl.read_json("data/large_text.json", schema=schema, schema_overrides=overrides)
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_13(mo):
+     mo.md(
+         r"""
+         ### Example 13: Write a DataFrame to NDJSON and Return as String
+
+         ```python
+         df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+         ndjson_output = df.write_ndjson()
+         print(ndjson_output)
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_14(mo):
+     mo.md(
+         r"""
+         ### Example 14: Scan Parquet Files with Cloud Storage Options
+
+         ```python
+         import os
+
+         storage_options = {"token": os.environ.get("HF_TOKEN")}
+         df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", storage_options=storage_options)
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_15(mo):
+     mo.md(
+         r"""
+         ### Example 15: Scan NDJSON Files with Cloud Storage Options
+
+         ```python
+         import os
+
+         storage_options = {"token": os.environ.get("HF_TOKEN")}
+         df_lazy = pl.scan_ndjson("hf://datasets/myuser/my-dataset/**/*.jsonl", storage_options=storage_options)
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_16(mo):
+     mo.md(
+         r"""
+         ### Example 16: Predicate Pushdown Example
+
+         ```python
+         df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
+         # Only load rows where 'value' > 100
+         df_filtered = df_lazy.filter(pl.col("value") > 100)
+         result = df_filtered.collect()
+         ```
+         """
+     )
+     return
+
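+ # A quick way to verify the pushdown actually happened (a sketch; "df_filtered"
+ # and the "value" column come from the hypothetical Example 16 snippet):
+ @app.cell
+ def example_16b(mo):
+     mo.md(
+         r"""
+         To confirm the filter is pushed down into the Parquet scan, print the
+         optimized query plan:
+
+         ```python
+         print(df_filtered.explain())
+         # The plan shows the predicate applied at the scan node, so rows are
+         # skipped while reading instead of filtered after loading.
+         ```
+         """
+     )
+     return
+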
+ @app.cell
+ def example_17(mo):
+     mo.md(
+         r"""
+         ### Example 17: Projection Pushdown Example
+
+         ```python
+         df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
+         # Only select the 'text' and 'id' columns to reduce memory footprint
+         df_proj = df_lazy.select(["id", "text"])
+         result = df_proj.collect()
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_18(mo):
+     mo.md(
+         r"""
+         ### Example 18: Collecting a Lazy DataFrame
+
+         ```python
+         df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
+         # Perform lazy operations...
+         result = df_lazy.collect()
+         print(result)
+         ```
+         """
+     )
+     return
+
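+ # A follow-up sketch on out-of-core execution; the exact flag depends on your
+ # Polars version (1.x accepts streaming=True, newer releases prefer
+ # collect(engine="streaming")):
+ @app.cell
+ def example_18b(mo):
+     mo.md(
+         r"""
+         For datasets larger than memory, the same plan can run on Polars'
+         streaming engine instead of being collected all at once:
+
+         ```python
+         result = df_lazy.collect(streaming=True)
+         ```
+         """
+     )
+     return
+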
+ @app.cell
+ def example_19(mo):
+     mo.md(
+         r"""
+         ### Example 19: Filtering on a Large Text Column
+
+         ```python
+         df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
+         # Filter rows where the 'text' column contains a long string pattern
+         df_filtered = df.filter(pl.col("text").str.contains("important keyword"))
+         print(df_filtered.head())
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_20(mo):
+     mo.md(
+         r"""
+         ### Example 20: Using String Length on a Text Column
+
+         ```python
+         df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
+         # Compute the length (in characters) of text in the 'text' column
+         df = df.with_columns(text_length=pl.col("text").str.len_chars())
+         print(df.head())
+         ```
+         """
+     )
+     return
+
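+ # A short note on character vs. byte lengths, which differ on non-ASCII text
+ # (the "café" frame is illustrative):
+ @app.cell
+ def example_20b(mo):
+     mo.md(
+         r"""
+         Note that `str.len_chars()` counts Unicode characters while
+         `str.len_bytes()` counts UTF-8 bytes:
+
+         ```python
+         df = pl.DataFrame({"text": ["café"]})
+         df.with_columns(
+             chars=pl.col("text").str.len_chars(),  # 4
+             bytes=pl.col("text").str.len_bytes(),  # 5
+         )
+         ```
+         """
+     )
+     return
+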
+ @app.cell
+ def example_21(mo):
+     mo.md(
+         r"""
+         ### Example 21: Grouping by a Large Text Field
+
+         ```python
+         df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
+         grouped = df.group_by("category").agg(
+             pl.col("text").str.len_chars().mean().alias("avg_text_length")
+         )
+         print(grouped)  # df is eager here, so no .collect() is needed
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_22(mo):
+     mo.md(
+         r"""
+         ### Example 22: Joining Two DataFrames on a Common Key
+
+         ```python
+         df1 = pl.DataFrame({"id": [1, 2, 3], "text": ["A", "B", "C"]})
+         df2 = pl.DataFrame({"id": [1, 2, 3], "value": [100, 200, 300]})
+         joined = df1.join(df2, on="id")
+         print(joined)
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_23(mo):
+     mo.md(
+         r"""
+         ### Example 23: Using join_asof for Time-based Joins
+
+         ```python
+         from datetime import datetime
+
+         df1 = pl.DataFrame({
+             "time": pl.datetime_range(
+                 datetime(2025, 1, 1), datetime(2025, 1, 2), interval="1h", eager=True
+             ),
+             "text": ["sample text"] * 25,
+         })
+         df2 = pl.DataFrame({
+             "time": pl.datetime_range(
+                 datetime(2025, 1, 1, 0, 30), datetime(2025, 1, 2), interval="1h", eager=True
+             ),
+             "value": list(range(24)),
+         })
+         # Perform an asof join to match each row with the nearest earlier timestamp
+         joined = df1.sort("time").join_asof(df2.sort("time"), on="time")
+         print(joined)
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_24(mo):
+     mo.md(
+         r"""
+         ### Example 24: Reading a Parquet File with Low Memory Option
+
+         ```python
+         df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", low_memory=True)
+         print(df.head())
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_25(mo):
+     mo.md(
+         r"""
+         ### Example 25: Scanning Parquet Files with a Parallel Strategy
+
+         ```python
+         df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", parallel="auto")
+         result = df_lazy.collect()
+         print(result)
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_26(mo):
+     mo.md(
+         r"""
+         ### Example 26: Reading a Large JSON File into a DataFrame
+
+         ```python
+         df = pl.read_json("data/large_text.json", infer_schema_length=200)
+         print(df.head())
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_27(mo):
+     mo.md(
+         r"""
+         ### Example 27: Using DataFrame.head() on a Large Text Dataset
+
+         ```python
+         df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
+         print(df.head(10))
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_28(mo):
+     mo.md(
+         r"""
+         ### Example 28: Using DataFrame.tail() on a Large Text Dataset
+
+         ```python
+         df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
+         print(df.tail(10))
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_29(mo):
+     mo.md(
+         r"""
+         ### Example 29: Scanning NDJSON Files with Rechunking
+
+         ```python
+         df_lazy = pl.scan_ndjson("data/*.jsonl", rechunk=True)
+         result = df_lazy.collect()
+         print(result)
+         ```
+         """
+     )
+     return
+
+ @app.cell
+ def example_30(mo):
+     mo.md(
+         r"""
+         ### Example 30: Scanning Parquet Files with Allowing Missing Columns
+
+         ```python
+         df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", allow_missing_columns=True)
+         result = df_lazy.collect()
+         print(result)
+         ```
+         """
+     )
+     return
+
+ # =============================================================================
+ # End of Notebook
+ # =============================================================================
+ @app.cell
+ def conclusion(mo):
+     mo.md(
+         r"""
+         # Conclusion
+
+         This notebook showcased:
+
+         - How to lazy-load a Hugging Face dataset using Polars with recursive globbing.
+         - How to preview and interactively expand the DataFrame.
+         - 30 examples covering various Polars I/O functions and DataFrame operations,
+           which are especially useful when working with large text data.
+
+         For more information, please refer to:
+
+         - [Polars Documentation](https://docs.pola.rs/)
+         - [Hugging Face Hub Documentation](https://huggingface.co/docs)
+         - [Marimo Notebook Documentation](https://marimo.io/)
+
+         Happy Data Exploring!
+         """
+     )
+     return
+
+ if __name__ == "__main__":
+     app.run()