Update app.py
app.py
CHANGED
@@ -1,85 +1,601 @@
Removed:

```python
# =============================================================================
#
# =============================================================================
#
#
# Install dependencies with:
#   pip install polars marimo
# =============================================================================

import marimo as mo  # Marimo provides UI and lazy-loading decorators

# ------------------------------------------------------------------------------
# 2. Lazy Load the Dataset
#
# Use the recursive globbing pattern "**/*.parquet" to read all Parquet files
# from all subdirectories on Hugging Face.
# ------------------------------------------------------------------------------
dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"

@mo.lazy  # Use Marimo's lazy decorator to defer data loading until needed.
def load_dataset():
    # Load all Parquet files matching the recursive pattern.
    df = pl.read_parquet(dataset_url)
    # Uncomment the next line to read local JSONL files instead:
    # df = pl.read_ndjson("/local/path/to/*.jsonl")
    return df

preview
```
Added:

````python
import os
import polars as pl
import marimo

__generated_with = "0.10.15"
app = marimo.App(
    app_title="Polars & Hugging Face Data Exploration",
    css_file="../custom.css",
)

# marimo resolves cell parameters such as `mo` and `pl` by the names that
# other cells return, so define them in a setup cell that the rest of the
# notebook can depend on.
@app.cell
def imports():
    import marimo as mo
    import polars as pl
    return mo, pl
# =============================================================================
# Intro Cell
# =============================================================================
@app.cell
def introduction(mo):
    mo.md(
        r"""
        # Exploring a Hugging Face Dataset with Polars

        In this notebook we demonstrate how to:

        - **Lazy-load** a Hugging Face dataset (all Parquet files, using a recursive globbing pattern).
        - **Preview** the loaded DataFrame with metadata.
        - **Interactively expand** the DataFrame view.
        - Explore over 30 additional examples of Polars I/O functions and DataFrame manipulations, especially for handling large text data.

        **Prerequisites:**

        - Install dependencies via:

          ```bash
          pip install polars marimo
          ```

        - Make sure your Hugging Face API token is available in the `HF_TOKEN` environment variable.

        ![](https://raw.githubusercontent.com/pola-rs/polars-static/master/banner/polars_github_banner.svg)
        """
    )
    return
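# Tip (not part of the original commit): with marimo installed, this file can
# be opened for interactive editing with `marimo edit app.py`, or served as a
# read-only app with `marimo run app.py`.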
# =============================================================================
# Load HF_TOKEN from the environment
# =============================================================================
@app.cell
def load_token(mo):
    hf_token = os.environ.get("HF_TOKEN")
    # Report only whether the token is present; echoing the raw secret into
    # the notebook output would leak it.
    token_status = "set" if hf_token else "NOT set"
    mo.md(f"""
    **Hugging Face Token:** `{token_status}`

    *(Ensure that HF_TOKEN is set in your environment.)*
    """)
    return
+
# =============================================================================
|
48 |
+
# 1. Lazy-load the Dataset
|
|
|
|
|
49 |
# =============================================================================
|
50 |
+
@app.cell
|
51 |
+
def lazy_load_dataset(mo, pl):
|
52 |
+
# Use a recursive globbing pattern to load all Parquet files from all subdirectories.
|
53 |
+
dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"
|
54 |
+
|
55 |
+
@mo.lazy # The mo.lazy decorator defers execution until the data is needed.
|
56 |
+
def load_dataset():
|
57 |
+
# Load all Parquet files matching the recursive pattern.
|
58 |
+
df = pl.read_parquet(dataset_url)
|
59 |
+
# --- Alternative for local JSONL files (uncomment if needed):
|
60 |
+
# df = pl.read_ndjson("/local/path/to/*.jsonl")
|
61 |
+
return df
|
62 |
|
63 |
+
df = load_dataset()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
return df
|
65 |
|
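# A hedged sketch, not in the original commit: the same dataset read truly
# lazily with Polars' scan_parquet. The scan only builds a query plan; rows
# are read when .collect() runs.
@app.cell
def lazy_scan_example(pl):
    scan = pl.scan_parquet("hf://datasets/cicero-im/processed_prompt1/**/*.parquet")
    sample = scan.head(5).collect()  # materialize just the first five rows
    return sample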
# =============================================================================
# 2. Preview the DataFrame with Metadata
# =============================================================================
@app.cell
def preview_data(df, mo):
    # `df` is the (eager) DataFrame produced by the lazy_load_dataset cell;
    # marimo passes it in by name.
    preview = mo.ui.table(df.head())
    mo.md(
        r"""
        ## Data Preview

        Below is a preview of the first few rows along with basic metadata.
        """
    )
    return preview
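# A hedged addition, not in the original commit: basic metadata to accompany
# the preview, using standard Polars DataFrame attributes.
@app.cell
def dataset_metadata(df, mo):
    mo.md(
        f"**Rows:** {df.height} | **Columns:** {df.width}\n\n"
        f"**Schema:** `{dict(df.schema)}`"
    )
    return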
# =============================================================================
# 3. Expand the DataFrame for Better Visualization
# =============================================================================
@app.cell
def expand_view(df, mo):
    # mo.ui.button takes the click handler as a constructor argument; the
    # handler maps the button's value, so after a click expand_button.value
    # holds a full table view of the DataFrame.
    expand_button = mo.ui.button(
        label="Expand Dataframe",
        on_click=lambda _: mo.ui.table(df),
    )
    mo.md(
        r"""
        ## Expand Dataframe

        Click the button below to expand the DataFrame view.
        """
    )
    return expand_button
# =============================================================================
# 4. Column Selection Tips (as Markdown)
# =============================================================================
@app.cell
def column_selection_tips(mo):
    mo.md(
        r"""
        ## Column Selection Tips

        **Example 1: Select specific columns by name:**
        ```python
        selected_columns_df = df.select(["column1", "column2"])
        ```

        **Example 2: Select all columns except column 'a':**
        ```python
        all_except_a_df = df.select(pl.exclude("a"))
        ```

        **Example 3: Select a range of columns (e.g., from the 2nd to the 4th column):**
        ```python
        range_columns_df = df.select(pl.col(df.columns[1:4]))
        ```
        """
    )
    return
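# A hedged addition, not in the original commit: two more selection patterns
# that pair well with the tips above (column names are illustrative).
#
#   # By regular expression: every column whose name starts with "text_"
#   text_cols_df = df.select(pl.col("^text_.*$"))
#
#   # By dtype: all UTF-8 string columns
#   string_cols_df = df.select(pl.col(pl.Utf8))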
# =============================================================================
# Additional Polars I/O and DataFrame Examples (Markdown Cells)
# =============================================================================

@app.cell
def example_1(mo):
    mo.md(
        r"""
        ### Example 1: Eagerly Read a Single Parquet File

        ```python
        df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
        ```
        """
    )
    return

@app.cell
def example_2(mo):
    mo.md(
        r"""
        ### Example 2: Read Multiple Parquet Files Using Globbing

        ```python
        df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-*.parquet")
        ```
        """
    )
    return

@app.cell
def example_3(mo):
    mo.md(
        r"""
        ### Example 3: Lazily Scan Parquet Files with Recursive Globbing

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/cicero-im/processed_prompt1/**/*.parquet")
        ```
        """
    )
    return

@app.cell
def example_4(mo):
    mo.md(
        r"""
        ### Example 4: Read a JSON File into a DataFrame

        ```python
        df_json = pl.read_json("data/sample.json")
        ```
        """
    )
    return

@app.cell
def example_5(mo):
    mo.md(
        r"""
        ### Example 5: Read JSON with a Specified Schema

        ```python
        schema = {"name": pl.Utf8, "age": pl.Int64}
        df_json = pl.read_json("data/sample.json", schema=schema)
        ```
        """
    )
    return

@app.cell
def example_6(mo):
    mo.md(
        r"""
        ### Example 6: Write a DataFrame to NDJSON Format

        ```python
        df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
        ndjson_str = df.write_ndjson()
        print(ndjson_str)
        ```
        """
    )
    return

@app.cell
def example_7(mo):
    mo.md(
        r"""
        ### Example 7: Get the Schema of a Parquet File Without Reading Data

        ```python
        schema = pl.read_parquet_schema("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
        print(schema)
        ```
        """
    )
    return

@app.cell
def example_8(mo):
    mo.md(
        r"""
        ### Example 8: Scan Parquet Files with Hive Partitioning Enabled

        ```python
        df = pl.scan_parquet("hf://datasets/myuser/my-dataset/data/**/*.parquet", hive_partitioning=True)
        ```
        """
    )
    return

@app.cell
def example_9(mo):
    mo.md(
        r"""
        ### Example 9: Lazily Scan NDJSON Files Using Globbing

        ```python
        df_lazy = pl.scan_ndjson("data/*.jsonl")
        ```
        """
    )
    return

@app.cell
def example_10(mo):
    mo.md(
        r"""
        ### Example 10: Write a DataFrame to Partitioned Parquet Files

        ```python
        df = pl.DataFrame({"date": ["2025-01-01", "2025-01-02"], "value": [100, 200]})
        df.write_parquet("output/", partition_by=["date"])
        ```
        """
    )
    return

@app.cell
def example_11(mo):
    mo.md(
        r"""
        ### Example 11: Read JSON with Custom Inference Length

        ```python
        df = pl.read_json("data/large_text.json", infer_schema_length=500)
        ```
        """
    )
    return

@app.cell
def example_12(mo):
    mo.md(
        r"""
        ### Example 12: Read JSON with Schema Overrides

        ```python
        schema = {"id": pl.Int64, "text": pl.Utf8}
        overrides = {"id": pl.Int32}
        df = pl.read_json("data/large_text.json", schema=schema, schema_overrides=overrides)
        ```
        """
    )
    return

@app.cell
def example_13(mo):
    mo.md(
        r"""
        ### Example 13: Write a DataFrame to NDJSON and Return as String

        ```python
        df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
        ndjson_output = df.write_ndjson()
        print(ndjson_output)
        ```
        """
    )
    return

@app.cell
def example_14(mo):
    mo.md(
        r"""
        ### Example 14: Scan Parquet Files with Cloud Storage Options

        ```python
        storage_options = {"token": os.environ.get("HF_TOKEN")}
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", storage_options=storage_options)
        ```
        """
    )
    return

@app.cell
def example_15(mo):
    mo.md(
        r"""
        ### Example 15: Scan NDJSON Files with Cloud Storage Options

        ```python
        storage_options = {"token": os.environ.get("HF_TOKEN")}
        df_lazy = pl.scan_ndjson("hf://datasets/myuser/my-dataset/**/*.jsonl", storage_options=storage_options)
        ```
        """
    )
    return

@app.cell
def example_16(mo):
    mo.md(
        r"""
        ### Example 16: Predicate Pushdown Example

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Only load rows where 'value' > 100
        df_filtered = df_lazy.filter(pl.col("value") > 100)
        result = df_filtered.collect()
        ```
        """
    )
    return
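# A hedged addition, not in the original commit: both pushdowns can be checked
# with LazyFrame.explain(), which prints the optimized query plan. A sketch,
# with an illustrative dataset path:
#
#   plan = (
#       pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
#       .filter(pl.col("value") > 100)
#       .select(["id", "value"])
#       .explain()
#   )
#   print(plan)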
@app.cell
def example_17(mo):
    mo.md(
        r"""
        ### Example 17: Projection Pushdown Example

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Only select the 'text' and 'id' columns to reduce memory footprint
        df_proj = df_lazy.select(["id", "text"])
        result = df_proj.collect()
        ```
        """
    )
    return

@app.cell
def example_18(mo):
    mo.md(
        r"""
        ### Example 18: Collecting a Lazy DataFrame

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Perform lazy operations...
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return

@app.cell
def example_19(mo):
    mo.md(
        r"""
        ### Example 19: Filtering on a Large Text Column

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Filter rows where the 'text' column contains a long string pattern
        df_filtered = df.filter(pl.col("text").str.contains("important keyword"))
        print(df_filtered.head())
        ```
        """
    )
    return
@app.cell
def example_20(mo):
    mo.md(
        r"""
        ### Example 20: Using String Length on a Text Column

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Compute the length (in characters) of text in the 'text' column;
        # str.len_chars() is the current Polars API.
        df = df.with_columns(text_length=pl.col("text").str.len_chars())
        print(df.head())
        ```
        """
    )
    return

@app.cell
def example_21(mo):
    mo.md(
        r"""
        ### Example 21: Grouping by a Large Text Field

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        grouped = df.group_by("category").agg(
            pl.col("text").str.len_chars().mean().alias("avg_text_length")
        )
        # df is eager here, so the aggregation already yields a DataFrame
        # (no .collect() needed).
        print(grouped)
        ```
        """
    )
    return
@app.cell
def example_22(mo):
    mo.md(
        r"""
        ### Example 22: Joining Two DataFrames on a Common Key

        ```python
        df1 = pl.DataFrame({"id": [1, 2, 3], "text": ["A", "B", "C"]})
        df2 = pl.DataFrame({"id": [1, 2, 3], "value": [100, 200, 300]})
        joined = df1.join(df2, on="id")
        print(joined)
        ```
        """
    )
    return
@app.cell
def example_23(mo):
    mo.md(
        r"""
        ### Example 23: Using join_asof for Time-based Joins

        ```python
        from datetime import datetime

        # pl.datetime_range generates hourly timestamps; the old
        # pl.date_range(low=..., high=...) signature no longer exists.
        df1 = pl.DataFrame({
            "time": pl.datetime_range(
                datetime(2025, 1, 1), datetime(2025, 1, 2), interval="1h", eager=True
            ),
            "text": ["sample text"] * 25,
        })
        df2 = pl.DataFrame({
            "time": pl.datetime_range(
                datetime(2025, 1, 1, 0, 30), datetime(2025, 1, 2), interval="1h", eager=True
            ),
            "value": list(range(24)),  # 24 half-hour-offset timestamps
        })
        # Perform an asof join to match each row with the nearest earlier timestamp
        joined = df1.sort("time").join_asof(df2.sort("time"), on="time")
        print(joined)
        ```
        """
    )
    return
@app.cell
def example_24(mo):
    mo.md(
        r"""
        ### Example 24: Reading a Parquet File with Low Memory Option

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", low_memory=True)
        print(df.head())
        ```
        """
    )
    return

@app.cell
def example_25(mo):
    mo.md(
        r"""
        ### Example 25: Scanning Parquet Files with a Parallel Strategy

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", parallel="auto")
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return

@app.cell
def example_26(mo):
    mo.md(
        r"""
        ### Example 26: Reading a Large JSON File into a DataFrame

        ```python
        df = pl.read_json("data/large_text.json", infer_schema_length=200)
        print(df.head())
        ```
        """
    )
    return

@app.cell
def example_27(mo):
    mo.md(
        r"""
        ### Example 27: Using DataFrame.head() on a Large Text Dataset

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        print(df.head(10))
        ```
        """
    )
    return

@app.cell
def example_28(mo):
    mo.md(
        r"""
        ### Example 28: Using DataFrame.tail() on a Large Text Dataset

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        print(df.tail(10))
        ```
        """
    )
    return

@app.cell
def example_29(mo):
    mo.md(
        r"""
        ### Example 29: Scanning NDJSON Files with Rechunking

        ```python
        df_lazy = pl.scan_ndjson("data/*.jsonl", rechunk=True)
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return

@app.cell
def example_30(mo):
    mo.md(
        r"""
        ### Example 30: Scanning Parquet Files While Allowing Missing Columns

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", allow_missing_columns=True)
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return
# =============================================================================
# End of Notebook
# =============================================================================
@app.cell
def conclusion(mo):
    mo.md(
        r"""
        # Conclusion

        This notebook showcased:

        - How to lazy-load a Hugging Face dataset using Polars with recursive globbing.
        - How to preview and interactively expand the DataFrame.
        - Over 30 examples covering various Polars I/O functions and DataFrame operations, which are especially useful when working with large text data.

        For more information, please refer to:

        - [Polars Documentation](https://docs.pola.rs/)
        - [Hugging Face Hub Documentation](https://huggingface.co/docs)
        - [Marimo Notebook Documentation](https://marimo.io/)

        Happy Data Exploring!
        """
    )
    return

if __name__ == "__main__":
    app.run()
````