Spaces:
Sleeping
Sleeping
Upload 9 files
Browse files- Dockerfile +106 -0
- README.md +9 -4
- data/hf_spark_utils.py +183 -0
- data/spark.ipynb +121 -0
- login.html +70 -0
- on_startup.sh +5 -0
- packages.txt +2 -0
- requirements.txt +8 -0
- start_server.sh +23 -0
Dockerfile
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM nvidia/cuda:11.3.1-base-ubuntu20.04
|
2 |
+
|
3 |
+
ENV DEBIAN_FRONTEND=noninteractive \
|
4 |
+
TZ=Europe/Paris
|
5 |
+
|
6 |
+
# Remove any third-party apt sources to avoid issues with expiring keys.
|
7 |
+
# Install some basic utilities
|
8 |
+
RUN rm -f /etc/apt/sources.list.d/*.list && \
|
9 |
+
apt-get update && apt-get install -y --no-install-recommends \
|
10 |
+
curl \
|
11 |
+
ca-certificates \
|
12 |
+
sudo \
|
13 |
+
git \
|
14 |
+
wget \
|
15 |
+
procps \
|
16 |
+
git-lfs \
|
17 |
+
zip \
|
18 |
+
unzip \
|
19 |
+
htop \
|
20 |
+
vim \
|
21 |
+
nano \
|
22 |
+
bzip2 \
|
23 |
+
libx11-6 \
|
24 |
+
build-essential \
|
25 |
+
libsndfile-dev \
|
26 |
+
software-properties-common \
|
27 |
+
&& rm -rf /var/lib/apt/lists/*
|
28 |
+
|
29 |
+
RUN add-apt-repository ppa:flexiondotorg/nvtop && \
|
30 |
+
apt-get upgrade -y && \
|
31 |
+
apt-get install -y --no-install-recommends nvtop
|
32 |
+
|
33 |
+
RUN curl -sL https://deb.nodesource.com/setup_20.x | bash - && \
|
34 |
+
apt-get install -y nodejs && \
|
35 |
+
npm install -g configurable-http-proxy
|
36 |
+
|
37 |
+
# Create a working directory
|
38 |
+
WORKDIR /app
|
39 |
+
|
40 |
+
# Create a non-root user and switch to it
|
41 |
+
RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
|
42 |
+
&& chown -R user:user /app
|
43 |
+
RUN echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
|
44 |
+
USER user
|
45 |
+
|
46 |
+
# All users can use /home/user as their home directory
|
47 |
+
ENV HOME=/home/user
|
48 |
+
RUN mkdir $HOME/.cache $HOME/.config \
|
49 |
+
&& chmod -R 777 $HOME
|
50 |
+
|
51 |
+
# Set up the Conda environment
|
52 |
+
ENV CONDA_AUTO_UPDATE_CONDA=false \
|
53 |
+
PATH=$HOME/miniconda/bin:$PATH
|
54 |
+
RUN curl -sLo ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh \
|
55 |
+
&& chmod +x ~/miniconda.sh \
|
56 |
+
&& ~/miniconda.sh -b -p ~/miniconda \
|
57 |
+
&& rm ~/miniconda.sh \
|
58 |
+
&& conda clean -ya
|
59 |
+
|
60 |
+
WORKDIR $HOME/app
|
61 |
+
|
62 |
+
#######################################
|
63 |
+
# Start root user section
|
64 |
+
#######################################
|
65 |
+
|
66 |
+
USER root
|
67 |
+
|
68 |
+
# User Debian packages
|
69 |
+
## Security warning : Potential user code executed as root (build time)
|
70 |
+
RUN --mount=target=/root/packages.txt,source=packages.txt \
|
71 |
+
apt-get update && \
|
72 |
+
xargs -r -a /root/packages.txt apt-get install -y --no-install-recommends \
|
73 |
+
&& rm -rf /var/lib/apt/lists/*
|
74 |
+
|
75 |
+
RUN --mount=target=/root/on_startup.sh,source=on_startup.sh,readwrite \
|
76 |
+
bash /root/on_startup.sh
|
77 |
+
|
78 |
+
RUN mkdir /data && chown user:user /data
|
79 |
+
|
80 |
+
#######################################
|
81 |
+
# End root user section
|
82 |
+
#######################################
|
83 |
+
|
84 |
+
USER user
|
85 |
+
|
86 |
+
# Python packages
|
87 |
+
RUN --mount=target=requirements.txt,source=requirements.txt \
|
88 |
+
pip install --no-cache-dir --upgrade -r requirements.txt
|
89 |
+
|
90 |
+
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
91 |
+
COPY --chown=user . $HOME/app
|
92 |
+
|
93 |
+
RUN chmod +x start_server.sh
|
94 |
+
|
95 |
+
COPY --chown=user login.html /home/user/miniconda/lib/python3.9/site-packages/jupyter_server/templates/login.html
|
96 |
+
|
97 |
+
ENV PYTHONUNBUFFERED=1 \
|
98 |
+
GRADIO_ALLOW_FLAGGING=never \
|
99 |
+
GRADIO_NUM_PORTS=1 \
|
100 |
+
GRADIO_SERVER_NAME=0.0.0.0 \
|
101 |
+
GRADIO_THEME=huggingface \
|
102 |
+
HF_HOME=/data/.cache/huggingface \
|
103 |
+
SYSTEM=spaces \
|
104 |
+
SHELL=/bin/bash
|
105 |
+
|
106 |
+
CMD ["./start_server.sh"]
|
README.md
CHANGED
@@ -1,10 +1,15 @@
|
|
1 |
---
|
2 |
-
title: Spark
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
8 |
---
|
9 |
|
10 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Spark on HF JupyterLab
|
3 |
+
emoji: 🌅
|
4 |
+
colorFrom: gray
|
5 |
+
colorTo: red
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
+
tags:
|
9 |
+
- jupyterlab
|
10 |
+
- spark
|
11 |
+
- datasets
|
12 |
+
suggested_storage: small
|
13 |
---
|
14 |
|
15 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
data/hf_spark_utils.py
ADDED
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import pickle
|
3 |
+
import tempfile
|
4 |
+
from functools import partial
|
5 |
+
from typing import Iterator, Optional, Union
|
6 |
+
|
7 |
+
import pyarrow as pa
|
8 |
+
import pyarrow.parquet as pq
|
9 |
+
from huggingface_hub import CommitOperationAdd, HfFileSystem
|
10 |
+
from pyspark.sql.dataframe import DataFrame
|
11 |
+
from pyspark.sql.pandas.types import from_arrow_schema, to_arrow_schema
|
12 |
+
|
13 |
+
spark = None
|
14 |
+
|
15 |
+
def set_session(session):
    """Register the Spark session used by this module.

    Stores ``session`` in the module-level ``spark`` variable, which
    ``read_parquet`` uses to build DataFrames.
    """
    global spark
    spark = session
|
18 |
+
|
19 |
+
|
20 |
+
def _read(iterator: Iterator[pa.RecordBatch], columns: Optional[list[str]], filters: Optional[Union[list[tuple], list[list[tuple]]]], **kwargs) -> Iterator[pa.RecordBatch]:
    """Read the Parquet files listed in each incoming batch and stream their record batches.

    The first column of every incoming batch holds file paths. Those paths are
    opened as one ``pyarrow.parquet.ParquetDataset`` (``kwargs`` are forwarded
    to its constructor), and the resulting batches are yielded, restricted to
    ``columns`` and ``filters`` when given.
    """
    for path_batch in iterator:
        file_paths = path_batch[0].to_pylist()
        dataset = pq.ParquetDataset(file_paths, **kwargs)
        # A falsy `filters` (None or an empty list) means "no filtering".
        expression = pq.filters_to_expression(filters) if filters else None
        yield from dataset._dataset.to_batches(columns=columns, filter=expression)
|
25 |
+
|
26 |
+
|
27 |
+
def read_parquet(
    path: str,
    columns: Optional[list[str]] = None,
    filters: Optional[Union[list[tuple], list[list[tuple]]]] = None,
    **kwargs,
) -> DataFrame:
    """
    Loads Parquet files from Hugging Face using PyArrow, returning a PySpark `DataFrame`.

    It reads Parquet files in a distributed manner.

    Access private or gated repositories using `huggingface-cli login` or passing a token
    using the `storage_options` argument: `storage_options={"token": "hf_xxx"}`

    Parameters
    ----------
    path : str
        Path to the file. Prefix with a protocol like `hf://` to read from Hugging Face.
        You can read from multiple files if you pass a globstring.
    columns : list, default None
        If not None, only these columns will be read from the file.
        The resulting DataFrame has its columns in the order given here.
    filters : List[Tuple] or List[List[Tuple]], default None
        To filter out data.
        Filter syntax: [[(column, op, val), ...],...]
        where op is [==, =, >, >=, <, <=, !=, in, not in]
        The innermost tuples are transposed into a set of filters applied
        through an `AND` operation.
        The outer list combines these sets of filters through an `OR`
        operation.
        A single list of tuples can also be used, meaning that no `OR`
        operation between set of filters is to be conducted.

    **kwargs
        `filesystem` (an `HfFileSystem`) and `storage_options` (a dict used to
        build one when `filesystem` is not given) are consumed here.
        Any additional kwargs are passed to pyarrow.parquet.ParquetDataset.

    Returns
    -------
    DataFrame
        DataFrame based on parquet file.

    Raises
    ------
    FileNotFoundError
        If `path` does not match any file.
    RuntimeError
        If no Spark session was registered with `set_session`.

    Examples
    --------
    >>> path = "hf://datasets/username/dataset/data.parquet"
    >>> pd.DataFrame({"foo": range(5), "bar": range(5, 10)}).to_parquet(path)
    >>> read_parquet(path).show()
    +---+---+
    |foo|bar|
    +---+---+
    |  0|  5|
    |  1|  6|
    |  2|  7|
    |  3|  8|
    |  4|  9|
    +---+---+
    >>> read_parquet(path, columns=["bar"]).show()
    +---+
    |bar|
    +---+
    |  5|
    |  6|
    |  7|
    |  8|
    |  9|
    +---+
    >>> sel = [("foo", ">", 2)]
    >>> read_parquet(path, filters=sel).show()
    +---+---+
    |foo|bar|
    +---+---+
    |  3|  8|
    |  4|  9|
    +---+---+
    """
    # Always consume `storage_options` so it is never forwarded to
    # ParquetDataset, even when an explicit `filesystem` is supplied.
    storage_options = kwargs.pop("storage_options", None) or {}
    filesystem: HfFileSystem = kwargs.pop("filesystem", None)
    if filesystem is None:
        filesystem = HfFileSystem(**storage_options)

    paths = filesystem.glob(path)
    if not paths:
        raise FileNotFoundError(f"Couldn't find any file at {path}")
    if spark is None:
        raise RuntimeError("No Spark session is set; call set_session(spark) before read_parquet().")

    # One row (and one partition) per file so that files are read in parallel.
    rdd = spark.sparkContext.parallelize([{"path": file_path} for file_path in paths], len(paths))
    df = spark.createDataFrame(rdd)

    # The schema of the first file is taken as the schema of the whole dataset.
    with filesystem.open(paths[0]) as schema_file:
        arrow_schema = pq.read_schema(schema_file)

    if columns is None:
        fields = list(arrow_schema)
    else:
        # Follow the order of `columns`: that is the order in which the
        # projected record batches are produced by `_read`.
        available = set(arrow_schema.names)
        fields = [arrow_schema.field(name) for name in columns if name in available]
    schema = pa.schema(fields, metadata=arrow_schema.metadata)

    return df.mapInArrow(
        partial(_read, columns=columns, filters=filters, filesystem=filesystem, schema=arrow_schema, **kwargs),
        from_arrow_schema(schema),
    )
|
112 |
+
|
113 |
+
|
114 |
+
def _preupload(iterator: Iterator[pa.RecordBatch], path: str, schema: pa.Schema, filesystem: HfFileSystem, row_group_size: Optional[int] = None, **kwargs) -> Iterator[pa.RecordBatch]:
    """Write the incoming batches to one temporary Parquet file and pre-upload it.

    All batches from ``iterator`` are written with ``pyarrow.parquet.ParquetWriter``
    (``kwargs`` are forwarded to it) into a temporary file, which is then
    pre-uploaded to the repository that ``path`` resolves to. A single record
    batch is yielded, holding the pickled ``CommitOperationAdd`` in a binary
    ``addition`` column.
    """
    # NOTE(review): indentation was reconstructed from a whitespace-stripped
    # rendering of this file; confirm the nesting against the original.
    resolved_path = filesystem.resolve_path(path)
    with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
        with pq.ParquetWriter(temp_file.name, schema=schema, **kwargs) as writer:
            for batch in iterator:
                writer.write_batch(batch, row_group_size=row_group_size)
        # `path_in_repo` is only a placeholder here: `_commit` overwrites it
        # with the final shard path before committing.
        addition = CommitOperationAdd(path_in_repo=temp_file.name, path_or_fileobj=temp_file.name)
        filesystem._api.preupload_lfs_files(repo_id=resolved_path.repo_id, additions=[addition], repo_type=resolved_path.repo_type, revision=resolved_path.revision)
    yield pa.record_batch({"addition": [pickle.dumps(addition)]}, schema=pa.schema({"addition": pa.binary()}))
|
123 |
+
|
124 |
+
|
125 |
+
def _commit(iterator: Iterator[pa.RecordBatch], path: str, filesystem: HfFileSystem, max_operations_per_commit=50) -> Iterator[pa.RecordBatch]:
    """Commit pickled additions to the repository in bounded-size commits.

    The ``addition`` column of the incoming batches holds pickled
    ``CommitOperationAdd`` objects. Each one gets its final ``path_in_repo``
    by substituting its index for the ``{shard_idx:05d}`` placeholder in the
    resolved path. The additions are then committed in groups of at most
    ``max_operations_per_commit``; after each commit a record batch listing
    the committed paths (string column ``path``) is yielded.
    """
    resolved_path = filesystem.resolve_path(path)
    addition_schema = pa.schema({"addition": pa.binary()})
    pickled_additions = pa.Table.from_batches(iterator, schema=addition_schema)[0].to_pylist()
    additions: list[CommitOperationAdd] = [pickle.loads(blob) for blob in pickled_additions]
    num_commits = math.ceil(len(additions) / max_operations_per_commit)

    path_template = resolved_path.path_in_repo
    for shard_idx, addition in enumerate(additions):
        addition.path_in_repo = path_template.replace("{shard_idx:05d}", f"{shard_idx:05d}")

    for i in range(num_commits):
        start = i * max_operations_per_commit
        operations = additions[start : start + max_operations_per_commit]
        commit_message = "Upload using PySpark"
        if num_commits > 1:
            commit_message += f" (part {i:05d}-of-{num_commits:05d})"
        filesystem._api.create_commit(
            repo_id=resolved_path.repo_id,
            repo_type=resolved_path.repo_type,
            revision=resolved_path.revision,
            operations=operations,
            commit_message=commit_message,
        )
        committed_paths = [operation.path_in_repo for operation in operations]
        yield pa.record_batch({"path": committed_paths}, schema=pa.schema({"path": pa.string()}))
|
136 |
+
|
137 |
+
|
138 |
+
def write_parquet(df: DataFrame, path: str, **kwargs) -> None:
    """
    Write Parquet files to Hugging Face using PyArrow.

    It uploads Parquet files in a distributed manner in two steps:

    1. Preupload the Parquet files in parallel in a distributed manner
    2. Commit the preuploaded files

    Authenticate using `huggingface-cli login` or passing a token
    using the `storage_options` argument: `storage_options={"token": "hf_xxx"}`

    Parameters
    ----------
    df : DataFrame
        The PySpark DataFrame to write.
    path : str
        Path of the file or directory. Prefix with a protocol like `hf://` to write to Hugging Face.
        It writes Parquet files in the form "part-xxxxx.parquet", or to a single file if `path`
        ends with ".parquet" or ".pq".

    **kwargs
        `filesystem` (an `HfFileSystem`) and `storage_options` (a dict used to
        build one when `filesystem` is not given) are consumed here.
        Any additional kwargs are passed to pyarrow.parquet.ParquetWriter.

    Returns
    -------
    None

    Examples
    --------
    >>> df = spark.createDataFrame(pd.DataFrame({"foo": range(5), "bar": range(5, 10)}))
    >>> # Save to one file
    >>> write_parquet(df, "hf://datasets/username/dataset/data.parquet")
    >>> # OR save to a directory (possibly in many files)
    >>> write_parquet(df, "hf://datasets/username/dataset")
    """
    # Only build a default filesystem when the caller did not provide one;
    # `storage_options` is consumed either way so it never reaches ParquetWriter.
    storage_options = kwargs.pop("storage_options", None) or {}
    filesystem: HfFileSystem = kwargs.pop("filesystem", None)
    if filesystem is None:
        filesystem = HfFileSystem(**storage_options)

    if path.endswith((".parquet", ".pq")):
        # Single output file: gather everything into one partition.
        df = df.coalesce(1)
    else:
        # Directory output: one shard per partition. The placeholder is filled
        # in by `_commit`. Avoid a double slash when `path` already ends with "/".
        if not path.endswith("/"):
            path += "/"
        path += "part-{shard_idx:05d}.parquet"

    df.mapInArrow(
        partial(_preupload, path=path, schema=to_arrow_schema(df.schema), filesystem=filesystem, **kwargs),
        from_arrow_schema(pa.schema({"addition": pa.binary()})),
    ).coalesce(1).mapInArrow(
        # All additions are committed from a single partition.
        partial(_commit, path=path, filesystem=filesystem),
        from_arrow_schema(pa.schema({"path": pa.string()})),
    ).collect()
|
data/spark.ipynb
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "6fb06d81-1778-403c-b15b-d68200a5e6b5",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# Spark on Hugging Face"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "code",
|
13 |
+
"execution_count": null,
|
14 |
+
"id": "7399a5ed-aea8-45cf-866f-2decd7097456",
|
15 |
+
"metadata": {
|
16 |
+
"tags": []
|
17 |
+
},
|
18 |
+
"outputs": [],
|
19 |
+
"source": [
|
20 |
+
"from pyspark.sql import SparkSession\n",
|
21 |
+
"spark = SparkSession.builder.appName(\"demo\").getOrCreate()"
|
22 |
+
]
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"cell_type": "markdown",
|
26 |
+
"id": "8bf07f63-6fed-4cf9-8fee-5f3a5fb6bed1",
|
27 |
+
"metadata": {
|
28 |
+
"tags": []
|
29 |
+
},
|
30 |
+
"source": [
|
31 |
+
"Example:\n",
|
32 |
+
"\n",
|
33 |
+
"```python\n",
|
34 |
+
"# Load the BAAI/Infinity-Instruct dataset\n",
|
35 |
+
"df = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\")\n",
|
36 |
+
"\n",
|
37 |
+
"# Load only one column\n",
|
38 |
+
"df_langdetect_only = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\", columns=[\"langdetect\"])\n",
|
39 |
+
"\n",
|
40 |
+
"# Load values within certain ranges\n",
|
41 |
+
"criteria = [(\"langdetect\", \"=\", \"zh-cn\")]\n",
|
42 |
+
"df_chinese_only = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\", filters=criteria)\n",
|
43 |
+
"\n",
|
44 |
+
"# Save dataset\n",
|
45 |
+
"write_parquet(df_chinese_only, \"hf://datasets/username/Infinity-Instruct-Chinese-Only\")\n",
|
46 |
+
"```"
|
47 |
+
]
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"cell_type": "code",
|
51 |
+
"execution_count": null,
|
52 |
+
"id": "ca71b3ac-3291-4e4e-8fee-b3550b0426d6",
|
53 |
+
"metadata": {
|
54 |
+
"tags": []
|
55 |
+
},
|
56 |
+
"outputs": [],
|
57 |
+
"source": [
|
58 |
+
"from hf_spark_utils import read_parquet, write_parquet, set_session\n",
|
59 |
+
"set_session(spark)"
|
60 |
+
]
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"cell_type": "markdown",
|
64 |
+
"id": "07ea62a4-7549-4a75-8a12-9d830f6e3cde",
|
65 |
+
"metadata": {},
|
66 |
+
"source": [
|
67 |
+
"#### (Optional) Login"
|
68 |
+
]
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"cell_type": "code",
|
72 |
+
"execution_count": null,
|
73 |
+
"id": "343b3a9a-2dce-492b-9384-703368ba3975",
|
74 |
+
"metadata": {
|
75 |
+
"tags": []
|
76 |
+
},
|
77 |
+
"outputs": [],
|
78 |
+
"source": [
|
79 |
+
"from huggingface_hub import notebook_login\n",
|
80 |
+
"notebook_login(new_session=False)"
|
81 |
+
]
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"cell_type": "markdown",
|
85 |
+
"id": "332b7609-f0eb-4703-aea6-fec3d09f5870",
|
86 |
+
"metadata": {},
|
87 |
+
"source": [
|
88 |
+
"#### Run your code:"
|
89 |
+
]
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"cell_type": "code",
|
93 |
+
"execution_count": null,
|
94 |
+
"id": "6c0dfe01-9190-454c-9c52-216f74d339e1",
|
95 |
+
"metadata": {},
|
96 |
+
"outputs": [],
|
97 |
+
"source": []
|
98 |
+
}
|
99 |
+
],
|
100 |
+
"metadata": {
|
101 |
+
"kernelspec": {
|
102 |
+
"display_name": "Python 3 (ipykernel)",
|
103 |
+
"language": "python",
|
104 |
+
"name": "python3"
|
105 |
+
},
|
106 |
+
"language_info": {
|
107 |
+
"codemirror_mode": {
|
108 |
+
"name": "ipython",
|
109 |
+
"version": 3
|
110 |
+
},
|
111 |
+
"file_extension": ".py",
|
112 |
+
"mimetype": "text/x-python",
|
113 |
+
"name": "python",
|
114 |
+
"nbconvert_exporter": "python",
|
115 |
+
"pygments_lexer": "ipython3",
|
116 |
+
"version": "3.9.5"
|
117 |
+
}
|
118 |
+
},
|
119 |
+
"nbformat": 4,
|
120 |
+
"nbformat_minor": 5
|
121 |
+
}
|
login.html
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{% extends "page.html" %}
|
2 |
+
|
3 |
+
|
4 |
+
{% block stylesheet %}
|
5 |
+
{% endblock %}
|
6 |
+
|
7 |
+
{% block site %}
|
8 |
+
|
9 |
+
<div id="jupyter-main-app" class="container">
|
10 |
+
|
11 |
+
<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face Logo" style="height: 96px; vertical-align: bottom;"><span style="font-size: 52px;">×</span><img src="https://upload.wikimedia.org/wikipedia/commons/f/f3/Apache_Spark_logo.svg" alt="Apache Spark Logo" style="height: 96px; vertical-align: bottom;">
|
12 |
+
<h4>You must duplicate this Space to use it.</h4>
|
13 |
+
<br>
|
14 |
+
<a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/DockerTemplates/jupyterlab?duplicate=true">
|
15 |
+
<img style="margin: 0" src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a>
|
16 |
+
<br>
|
17 |
+
<br>
|
18 |
+
<h4>The default token is <span style="color:orange;">huggingface</span></h4>
|
19 |
+
<h4>Duplicate the Space to run your own instance</h4>
|
20 |
+
|
21 |
+
{% if login_available %}
|
22 |
+
{# login_available means password-login is allowed. Show the form. #}
|
23 |
+
<div class="row">
|
24 |
+
<div class="navbar col-sm-8">
|
25 |
+
<div class="navbar-inner">
|
26 |
+
<div class="container">
|
27 |
+
<div class="center-nav">
|
28 |
+
<form action="{{base_url}}login?next={{next}}" method="post" class="navbar-form pull-left">
|
29 |
+
{{ xsrf_form_html() | safe }}
|
30 |
+
{% if token_available %}
|
31 |
+
<label for="password_input"><strong>{% trans %}Token:{% endtrans
|
32 |
+
%}</strong></label>
|
33 |
+
{% else %}
|
34 |
+
<label for="password_input"><strong>{% trans %}Password:{% endtrans %}</strong></label>
|
35 |
+
{% endif %}
|
36 |
+
<input type="password" name="password" id="password_input" class="form-control">
|
37 |
+
<button type="submit" class="btn btn-default" id="login_submit">{% trans %}Log in{% endtrans
|
38 |
+
%}</button>
|
39 |
+
</form>
|
40 |
+
</div>
|
41 |
+
</div>
|
42 |
+
</div>
|
43 |
+
</div>
|
44 |
+
</div>
|
45 |
+
{% else %}
|
46 |
+
<p>{% trans %}No login available, you shouldn't be seeing this page.{% endtrans %}</p>
|
47 |
+
{% endif %}
|
48 |
+
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/spark-ex-min.png" alt="Spark on Hugging Face example Python code" style="width: 100%; margin-bottom: 40px; border-radius: 5px; box-shadow: rgba(149, 157, 165, 0.2) 0px 8px 24px; border: 1rem solid white;">
|
49 |
+
<p>This template was created by <a href="https://twitter.com/camenduru" target="_blank" >camenduru</a> and <a href="https://huggingface.co/nateraw" target="_blank" >nateraw</a>, with contributions of <a href="https://huggingface.co/osanseviero" target="_blank" >osanseviero</a>, <a href="https://huggingface.co/azzr" target="_blank" >azzr</a> and <a href="https://huggingface.co/lhoestq" target="_blank">lhoestq</a></p>
|
50 |
+
{% if message %}
|
51 |
+
<div class="row">
|
52 |
+
{% for key in message %}
|
53 |
+
<div class="message {{key}}">
|
54 |
+
{{message[key]}}
|
55 |
+
</div>
|
56 |
+
{% endfor %}
|
57 |
+
</div>
|
58 |
+
{% endif %}
|
59 |
+
{% if token_available %}
|
60 |
+
{% block token_message %}
|
61 |
+
|
62 |
+
{% endblock token_message %}
|
63 |
+
{% endif %}
|
64 |
+
</div>
|
65 |
+
|
66 |
+
{% endblock %}
|
67 |
+
|
68 |
+
|
69 |
+
{% block script %}
|
70 |
+
{% endblock %}
|
on_startup.sh
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
# Write commands here; they are run as the root user while the image is built (before the server starts).
|
3 |
+
# For example, to clone transformers and install it in dev mode:
|
4 |
+
# git clone https://github.com/huggingface/transformers.git
|
5 |
+
# cd transformers && pip install -e ".[dev]"
|
packages.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
tree
|
2 |
+
openjdk-8-jdk
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
jupyterlab==3.6.1
|
2 |
+
jupyter-server==2.3.0
|
3 |
+
tornado==6.2
|
4 |
+
ipywidgets
|
5 |
+
huggingface_hub
|
6 |
+
pyarrow
|
7 |
+
pyspark[sql,pandas_on_spark]
|
8 |
+
plotly
|
start_server.sh
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
# Start JupyterLab on port 7860, serving notebooks from $NOTEBOOK_DIR.
# The login token comes from the JUPYTER_TOKEN environment variable and
# falls back to "huggingface" when it is unset or empty.
JUPYTER_TOKEN="${JUPYTER_TOKEN:=huggingface}"

echo "Starting Jupyter Lab with token $JUPYTER_TOKEN"

NOTEBOOK_DIR="/data"
# Seed the notebook directory with the bundled files; -n keeps any copy
# that already exists there.
cp -n data/hf_spark_utils.py "$NOTEBOOK_DIR/hf_spark_utils.py"
cp -n data/spark.ipynb "$NOTEBOOK_DIR/spark.ipynb"
DEFAULT_URL="/lab/tree/spark.ipynb"

# exec replaces this shell with the server process, so signals sent to the
# container's main process reach JupyterLab directly.
exec jupyter-lab \
    --ip 0.0.0.0 \
    --port 7860 \
    --no-browser \
    --allow-root \
    --ServerApp.token="$JUPYTER_TOKEN" \
    --ServerApp.tornado_settings="{'headers': {'Content-Security-Policy': 'frame-ancestors *'}}" \
    --ServerApp.cookie_options="{'SameSite': 'None', 'Secure': True}" \
    --ServerApp.disable_check_xsrf=True \
    --LabApp.news_url=None \
    --LabApp.check_for_updates_class="jupyterlab.NeverCheckForUpdate" \
    --LabApp.default_url="$DEFAULT_URL" \
    --notebook-dir="$NOTEBOOK_DIR"
|