Upload 9 files

- Dockerfile +106 -0
- README.md +9 -4
- data/hf_spark_utils.py +183 -0
- data/spark.ipynb +121 -0
- login.html +70 -0
- on_startup.sh +5 -0
- packages.txt +2 -0
- requirements.txt +8 -0
- start_server.sh +23 -0
Dockerfile
ADDED
@@ -0,0 +1,106 @@
FROM nvidia/cuda:11.3.1-base-ubuntu20.04

ENV DEBIAN_FRONTEND=noninteractive \
    TZ=Europe/Paris

# Remove any third-party apt sources to avoid issues with expiring keys.
# Install some basic utilities
RUN rm -f /etc/apt/sources.list.d/*.list && \
    apt-get update && apt-get install -y --no-install-recommends \
    curl \
    ca-certificates \
    sudo \
    git \
    wget \
    procps \
    git-lfs \
    zip \
    unzip \
    htop \
    vim \
    nano \
    bzip2 \
    libx11-6 \
    build-essential \
    libsndfile-dev \
    software-properties-common \
    && rm -rf /var/lib/apt/lists/*

RUN add-apt-repository ppa:flexiondotorg/nvtop && \
    apt-get upgrade -y && \
    apt-get install -y --no-install-recommends nvtop

RUN curl -sL https://deb.nodesource.com/setup_20.x | bash - && \
    apt-get install -y nodejs && \
    npm install -g configurable-http-proxy

# Create a working directory
WORKDIR /app

# Create a non-root user and switch to it
RUN adduser --disabled-password --gecos '' --shell /bin/bash user \
    && chown -R user:user /app
RUN echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
USER user

# All users can use /home/user as their home directory
ENV HOME=/home/user
RUN mkdir $HOME/.cache $HOME/.config \
    && chmod -R 777 $HOME

# Set up the Conda environment
ENV CONDA_AUTO_UPDATE_CONDA=false \
    PATH=$HOME/miniconda/bin:$PATH
RUN curl -sLo ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh \
    && chmod +x ~/miniconda.sh \
    && ~/miniconda.sh -b -p ~/miniconda \
    && rm ~/miniconda.sh \
    && conda clean -ya

WORKDIR $HOME/app

#######################################
# Start root user section
#######################################

USER root

# User Debian packages
## Security warning : Potential user code executed as root (build time)
RUN --mount=target=/root/packages.txt,source=packages.txt \
    apt-get update && \
    xargs -r -a /root/packages.txt apt-get install -y --no-install-recommends \
    && rm -rf /var/lib/apt/lists/*

RUN --mount=target=/root/on_startup.sh,source=on_startup.sh,readwrite \
    bash /root/on_startup.sh

RUN mkdir /data && chown user:user /data

#######################################
# End root user section
#######################################

USER user

# Python packages
RUN --mount=target=requirements.txt,source=requirements.txt \
    pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app

RUN chmod +x start_server.sh

COPY --chown=user login.html /home/user/miniconda/lib/python3.9/site-packages/jupyter_server/templates/login.html

ENV PYTHONUNBUFFERED=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_THEME=huggingface \
    HF_HOME=/data/.cache/huggingface \
    SYSTEM=spaces \
    SHELL=/bin/bash

CMD ["./start_server.sh"]
README.md
CHANGED
@@ -1,10 +1,15 @@
 ---
-title: Spark
-emoji:
-colorFrom:
-colorTo:
+title: Spark on HF JupyterLab
+emoji: 🌅
+colorFrom: gray
+colorTo: red
 sdk: docker
 pinned: false
+tags:
+  - jupyterlab
+  - spark
+  - datasets
+suggested_storage: small
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
data/hf_spark_utils.py
ADDED
@@ -0,0 +1,183 @@
import math
import pickle
import tempfile
from functools import partial
from typing import Iterator, Optional, Union

import pyarrow as pa
import pyarrow.parquet as pq
from huggingface_hub import CommitOperationAdd, HfFileSystem
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.pandas.types import from_arrow_schema, to_arrow_schema

spark = None

def set_session(session):
    global spark
    spark = session


def _read(iterator: Iterator[pa.RecordBatch], columns: Optional[list[str]], filters: Optional[Union[list[tuple], list[list[tuple]]]], **kwargs) -> Iterator[pa.RecordBatch]:
    for batch in iterator:
        paths = batch[0].to_pylist()
        ds = pq.ParquetDataset(paths, **kwargs)
        yield from ds._dataset.to_batches(columns=columns, filter=pq.filters_to_expression(filters) if filters else None)


def read_parquet(
    path: str,
    columns: Optional[list[str]] = None,
    filters: Optional[Union[list[tuple], list[list[tuple]]]] = None,
    **kwargs,
) -> DataFrame:
    """
    Loads Parquet files from Hugging Face using PyArrow, returning a PySpark `DataFrame`.

    It reads Parquet files in a distributed manner.

    Access private or gated repositories using `huggingface-cli login` or by passing a token
    using the `storage_options` argument: `storage_options={"token": "hf_xxx"}`

    Parameters
    ----------
    path : str
        Path to the file. Prefix with a protocol like `hf://` to read from Hugging Face.
        You can read from multiple files if you pass a globstring.
    columns : list, default None
        If not None, only these columns will be read from the file.
    filters : List[Tuple] or List[List[Tuple]], default None
        To filter out data.
        Filter syntax: [[(column, op, val), ...],...]
        where op is [==, =, >, >=, <, <=, !=, in, not in]
        The innermost tuples are transposed into a set of filters applied
        through an `AND` operation.
        The outer list combines these sets of filters through an `OR`
        operation.
        A single list of tuples can also be used, meaning that no `OR`
        operation between sets of filters is to be conducted.

    **kwargs
        Any additional kwargs are passed to pyarrow.parquet.ParquetDataset.

    Returns
    -------
    DataFrame
        DataFrame based on the Parquet file.

    Examples
    --------
    >>> path = "hf://datasets/username/dataset/data.parquet"
    >>> pd.DataFrame({"foo": range(5), "bar": range(5, 10)}).to_parquet(path)
    >>> read_parquet(path).show()
    +---+---+
    |foo|bar|
    +---+---+
    |  0|  5|
    |  1|  6|
    |  2|  7|
    |  3|  8|
    |  4|  9|
    +---+---+
    >>> read_parquet(path, columns=["bar"]).show()
    +---+
    |bar|
    +---+
    |  5|
    |  6|
    |  7|
    |  8|
    |  9|
    +---+
    >>> sel = [("foo", ">", 2)]
    >>> read_parquet(path, filters=sel).show()
    +---+---+
    |foo|bar|
    +---+---+
    |  3|  8|
    |  4|  9|
    +---+---+
    """
    filesystem: HfFileSystem = kwargs.pop("filesystem") if "filesystem" in kwargs else HfFileSystem(**kwargs.pop("storage_options", {}))
    paths = filesystem.glob(path)
    if not paths:
        raise FileNotFoundError(f"Couldn't find any file at {path}")
    rdd = spark.sparkContext.parallelize([{"path": path} for path in paths], len(paths))
    df = spark.createDataFrame(rdd)
    arrow_schema = pq.read_schema(filesystem.open(paths[0]))
    schema = pa.schema([field for field in arrow_schema if (columns is None or field.name in columns)], metadata=arrow_schema.metadata)
    return df.mapInArrow(
        partial(_read, columns=columns, filters=filters, filesystem=filesystem, schema=arrow_schema, **kwargs),
        from_arrow_schema(schema),
    )


def _preupload(iterator: Iterator[pa.RecordBatch], path: str, schema: pa.Schema, filesystem: HfFileSystem, row_group_size: Optional[int] = None, **kwargs) -> Iterator[pa.RecordBatch]:
    resolved_path = filesystem.resolve_path(path)
    with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
        with pq.ParquetWriter(temp_file.name, schema=schema, **kwargs) as writer:
            for batch in iterator:
                writer.write_batch(batch, row_group_size=row_group_size)
        addition = CommitOperationAdd(path_in_repo=temp_file.name, path_or_fileobj=temp_file.name)
        filesystem._api.preupload_lfs_files(repo_id=resolved_path.repo_id, additions=[addition], repo_type=resolved_path.repo_type, revision=resolved_path.revision)
        yield pa.record_batch({"addition": [pickle.dumps(addition)]}, schema=pa.schema({"addition": pa.binary()}))


def _commit(iterator: Iterator[pa.RecordBatch], path: str, filesystem: HfFileSystem, max_operations_per_commit=50) -> Iterator[pa.RecordBatch]:
    resolved_path = filesystem.resolve_path(path)
    additions: list[CommitOperationAdd] = [pickle.loads(addition) for addition in pa.Table.from_batches(iterator, schema=pa.schema({"addition": pa.binary()}))[0].to_pylist()]
    num_commits = math.ceil(len(additions) / max_operations_per_commit)
    for shard_idx, addition in enumerate(additions):
        addition.path_in_repo = resolved_path.path_in_repo.replace("{shard_idx:05d}", f"{shard_idx:05d}")
    for i in range(0, num_commits):
        operations = additions[i * max_operations_per_commit : (i + 1) * max_operations_per_commit]
        commit_message = "Upload using PySpark" + (f" (part {i:05d}-of-{num_commits:05d})" if num_commits > 1 else "")
        filesystem._api.create_commit(repo_id=resolved_path.repo_id, repo_type=resolved_path.repo_type, revision=resolved_path.revision, operations=operations, commit_message=commit_message)
        yield pa.record_batch({"path": [addition.path_in_repo for addition in operations]}, schema=pa.schema({"path": pa.string()}))


def write_parquet(df: DataFrame, path: str, **kwargs) -> None:
    """
    Write Parquet files to Hugging Face using PyArrow.

    It uploads Parquet files in a distributed manner in two steps:

    1. Preupload the Parquet files in parallel in a distributed manner
    2. Commit the preuploaded files

    Authenticate using `huggingface-cli login` or by passing a token
    using the `storage_options` argument: `storage_options={"token": "hf_xxx"}`

    Parameters
    ----------
    path : str
        Path of the file or directory. Prefix with a protocol like `hf://` to write to Hugging Face.
        It writes Parquet files in the form "part-xxxxx.parquet", or to a single file if `path` ends with ".parquet".

    **kwargs
        Any additional kwargs are passed to pyarrow.parquet.ParquetWriter.

    Returns
    -------
    None

    Examples
    --------
    >>> df = spark.createDataFrame(pd.DataFrame({"foo": range(5), "bar": range(5, 10)}))
    >>> # Save to one file
    >>> write_parquet(df, "hf://datasets/username/dataset/data.parquet")
    >>> # OR save to a directory (possibly in many files)
    >>> write_parquet(df, "hf://datasets/username/dataset")
    """
    filesystem: HfFileSystem = kwargs.pop("filesystem", HfFileSystem(**kwargs.pop("storage_options", {})))
    if path.endswith(".parquet") or path.endswith(".pq"):
        df = df.coalesce(1)
    else:
        path += "/part-{shard_idx:05d}.parquet"
    df.mapInArrow(
        partial(_preupload, path=path, schema=to_arrow_schema(df.schema), filesystem=filesystem, **kwargs),
        from_arrow_schema(pa.schema({"addition": pa.binary()})),
    ).coalesce(1).mapInArrow(
        partial(_commit, path=path, filesystem=filesystem),
        from_arrow_schema(pa.schema({"path": pa.string()})),
    ).collect()
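For reference, here is a minimal end-to-end sketch of how these helpers are intended to be used together, based on the docstrings above and on the notebook below; the repository paths, column names, filter, and token are placeholders, not values from this Space:

```python
# Minimal usage sketch for hf_spark_utils (placeholder repo paths, columns, and token).
from pyspark.sql import SparkSession

from hf_spark_utils import read_parquet, set_session, write_parquet

spark = SparkSession.builder.appName("demo").getOrCreate()
set_session(spark)  # the helpers pick up the session from this module-level variable

# Read a dataset, selecting columns and pushing row filters down to PyArrow
df = read_parquet(
    "hf://datasets/username/dataset/*.parquet",
    columns=["foo", "bar"],
    filters=[("foo", ">", 2)],
)
df.show()

# For private or gated repos, pass a token instead of relying on `huggingface-cli login`
private_df = read_parquet(
    "hf://datasets/username/private-dataset/*.parquet",
    storage_options={"token": "hf_xxx"},  # placeholder token
)

# Writing to a directory shards the output into part-xxxxx.parquet files;
# a path ending in ".parquet" coalesces to a single file instead.
write_parquet(df, "hf://datasets/username/output-dataset")
```

The same flow appears in data/spark.ipynb below, which additionally calls notebook_login() for authentication.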
data/spark.ipynb
ADDED
@@ -0,0 +1,121 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6fb06d81-1778-403c-b15b-d68200a5e6b5",
   "metadata": {},
   "source": [
    "# Spark on Hugging Face"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7399a5ed-aea8-45cf-866f-2decd7097456",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "spark = SparkSession.builder.appName(\"demo\").getOrCreate()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8bf07f63-6fed-4cf9-8fee-5f3a5fb6bed1",
   "metadata": {
    "tags": []
   },
   "source": [
    "Example:\n",
    "\n",
    "```python\n",
    "# Load the BAAI/Infinity-Instruct dataset\n",
    "df = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\")\n",
    "\n",
    "# Load only one column\n",
    "df_langdetect_only = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\", columns=[\"langdetect\"])\n",
    "\n",
    "# Load values within certain ranges\n",
    "criteria = [(\"langdetect\", \"=\", \"zh-cn\")]\n",
    "df_chinese_only = read_parquet(\"hf://datasets/BAAI/Infinity-Instruct/7M/*.parquet\", filters=criteria)\n",
    "\n",
    "# Save dataset\n",
    "write_parquet(df_chinese_only, \"hf://datasets/username/Infinity-Instruct-Chinese-Only\")\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca71b3ac-3291-4e4e-8fee-b3550b0426d6",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from hf_spark_utils import read_parquet, write_parquet, set_session\n",
    "set_session(spark)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07ea62a4-7549-4a75-8a12-9d830f6e3cde",
   "metadata": {},
   "source": [
    "#### (Optional) Login"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "343b3a9a-2dce-492b-9384-703368ba3975",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "notebook_login(new_session=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "332b7609-f0eb-4703-aea6-fec3d09f5870",
   "metadata": {},
   "source": [
    "#### Run your code:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c0dfe01-9190-454c-9c52-216f74d339e1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
login.html
ADDED
@@ -0,0 +1,70 @@
{% extends "page.html" %}


{% block stylesheet %}
{% endblock %}

{% block site %}

<div id="jupyter-main-app" class="container">

  <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face Logo" style="height: 96px; vertical-align: bottom;"><span style="font-size: 52px;">×</span><img src="https://upload.wikimedia.org/wikipedia/commons/f/f3/Apache_Spark_logo.svg" alt="Apache Spark Logo" style="height: 96px; vertical-align: bottom;">
  <h4>You must duplicate this Space to use it.</h4>
  <br>
  <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/DockerTemplates/jupyterlab?duplicate=true">
    <img style="margin: 0" src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>
  <br>
  <br>
  <h4>The default token is <span style="color:orange;">huggingface</span></h4>
  <h4>Duplicate the Space to run your own instance</h4>

  {% if login_available %}
    {# login_available means password-login is allowed. Show the form. #}
    <div class="row">
      <div class="navbar col-sm-8">
        <div class="navbar-inner">
          <div class="container">
            <div class="center-nav">
              <form action="{{base_url}}login?next={{next}}" method="post" class="navbar-form pull-left">
                {{ xsrf_form_html() | safe }}
                {% if token_available %}
                <label for="password_input"><strong>{% trans %}Token:{% endtrans
                %}</strong></label>
                {% else %}
                <label for="password_input"><strong>{% trans %}Password:{% endtrans %}</strong></label>
                {% endif %}
                <input type="password" name="password" id="password_input" class="form-control">
                <button type="submit" class="btn btn-default" id="login_submit">{% trans %}Log in{% endtrans
                %}</button>
              </form>
            </div>
          </div>
        </div>
      </div>
    </div>
  {% else %}
    <p>{% trans %}No login available, you shouldn't be seeing this page.{% endtrans %}</p>
  {% endif %}
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/spark-ex-min.png" alt="Spark on Hugging Face example Python code" style="width: 100%; margin-bottom: 40px; border-radius: 5px; box-shadow: rgba(149, 157, 165, 0.2) 0px 8px 24px; border: 1rem solid white;">
  <p>This template was created by <a href="https://twitter.com/camenduru" target="_blank">camenduru</a> and <a href="https://huggingface.co/nateraw" target="_blank">nateraw</a>, with contributions from <a href="https://huggingface.co/osanseviero" target="_blank">osanseviero</a>, <a href="https://huggingface.co/azzr" target="_blank">azzr</a> and <a href="https://huggingface.co/lhoestq" target="_blank">lhoestq</a></p>
  {% if message %}
    <div class="row">
      {% for key in message %}
      <div class="message {{key}}">
        {{message[key]}}
      </div>
      {% endfor %}
    </div>
  {% endif %}
  {% if token_available %}
    {% block token_message %}

    {% endblock token_message %}
  {% endif %}
</div>

{% endblock %}


{% block script %}
{% endblock %}
on_startup.sh
ADDED
@@ -0,0 +1,5 @@
#!/bin/bash
# Write some commands here that will run as the root user before startup.
# For example, to clone transformers and install it in dev mode:
# git clone https://github.com/huggingface/transformers.git
# cd transformers && pip install -e ".[dev]"
packages.txt
ADDED
@@ -0,0 +1,2 @@
tree
openjdk-8-jdk
requirements.txt
ADDED
@@ -0,0 +1,8 @@
jupyterlab==3.6.1
jupyter-server==2.3.0
tornado==6.2
ipywidgets
huggingface_hub
pyarrow
pyspark[sql,pandas_on_spark]
plotly
start_server.sh
ADDED
@@ -0,0 +1,23 @@
#!/bin/bash
JUPYTER_TOKEN="${JUPYTER_TOKEN:=huggingface}"

echo "Starting Jupyter Lab with token $JUPYTER_TOKEN"

NOTEBOOK_DIR="/data"
cp -n data/hf_spark_utils.py $NOTEBOOK_DIR/hf_spark_utils.py
cp -n data/spark.ipynb $NOTEBOOK_DIR/spark.ipynb
DEFAULT_URL="/lab/tree/spark.ipynb"

jupyter-lab \
    --ip 0.0.0.0 \
    --port 7860 \
    --no-browser \
    --allow-root \
    --ServerApp.token="$JUPYTER_TOKEN" \
    --ServerApp.tornado_settings="{'headers': {'Content-Security-Policy': 'frame-ancestors *'}}" \
    --ServerApp.cookie_options="{'SameSite': 'None', 'Secure': True}" \
    --ServerApp.disable_check_xsrf=True \
    --LabApp.news_url=None \
    --LabApp.check_for_updates_class="jupyterlab.NeverCheckForUpdate" \
    --LabApp.default_url=$DEFAULT_URL \
    --notebook-dir=$NOTEBOOK_DIR