Spaces:

lhoestq
/

run-duckdb-jobs

Running

App Files Files Community

lhoestq HF staff committed 15 days ago

Commit

43b024e

1 Parent(s): 641f494

rename files

Browse files

Files changed (4) hide show

Dockerfile +2 -2
requirements.txt +1 -0
run.py → run_job.py +3 -1
app.py → start_app.py +56 -12

Dockerfile CHANGED Viewed

@@ -20,8 +20,8 @@ RUN pip install --no-cache-dir --upgrade pip
 COPY --chown=user run.py app.py requirements.txt README.md $HOME/app/
 # Install dependencies
-RUN pip install "gradio[oauth]" fire
 RUN pip install -r requirements.txt
 # Run app
-ENTRYPOINT python app.py

 # Copy the application sources. run.py and app.py were renamed to run_job.py
 # and start_app.py; COPY fails the build when a listed source is missing, and
 # the ENTRYPOINT below needs start_app.py to be present in the image.
 COPY --chown=user run_job.py start_app.py requirements.txt README.md $HOME/app/
 # Install dependencies
 RUN pip install "gradio[oauth]"
 RUN pip install -r requirements.txt
 # Run app
 ENTRYPOINT python start_app.py

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 duckdb
 huggingface_hub
 tabulate

+fire
 duckdb
 huggingface_hub
 tabulate

run.py → run_job.py RENAMED Viewed

@@ -50,11 +50,13 @@ def sql(src: str, dst: str, query: str, config: str = "default", split: str = "t
         src_kwargs = con.sql(CMD_SRC_KWARGS.format(src=src, config=config, split=split)).df().to_dict(orient="records")
         if not src_kwargs:
             raise ValueError(f'Invalid --config "{config}" for dataset "{src}", please select a valid dataset config/subset.')
         con.sql((CMD_SRC_DRY_RUN if dry_run else CMD_SRC).format(**src_kwargs[0]))
         if dry_run:
-            print(f"Sample data from '{src}' that would be written to '{dst}':\n")
         else:
             con.sql("PRAGMA enable_progress_bar;")
         result = con.sql((CMD_DST_DRY_RUN if dry_run else CMD_DST).format(query=query.rstrip("\n ;")))
         if dry_run:
             print(result.df().to_markdown())

         src_kwargs = con.sql(CMD_SRC_KWARGS.format(src=src, config=config, split=split)).df().to_dict(orient="records")
         if not src_kwargs:
             raise ValueError(f'Invalid --config "{config}" for dataset "{src}", please select a valid dataset config/subset.')
         con.sql((CMD_SRC_DRY_RUN if dry_run else CMD_SRC).format(**src_kwargs[0]))
         if dry_run:
+            print(f"Sample data from '{src}' that would be written to dataset '{dst}':\n")
         else:
             con.sql("PRAGMA enable_progress_bar;")
         result = con.sql((CMD_DST_DRY_RUN if dry_run else CMD_DST).format(query=query.rstrip("\n ;")))
         if dry_run:
             print(result.df().to_markdown())

app.py → start_app.py RENAMED Viewed

@@ -1,18 +1,20 @@
 import re
 import subprocess
 import yaml
 import gradio as gr
 import requests
-from huggingface_hub import HfApi
-CMD = ["python" ,"run.py"]
 with open("README.md") as f:
     METADATA = yaml.safe_load(f.read().split("---\n")[1])
 TITLE = METADATA["title"]
 EMOJI = METADATA["emoji"]
 try:
     process = subprocess.run(CMD + ["--help"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@@ -22,26 +24,68 @@ except Exception:
 DRY_RUN = bool(HELP) and bool(m :=re.search("--dry(-|_)run", HELP)) and m.group(0)
-def update_pbars(pbars: dict[str, float], line: str):
     if (percent_match := re.search("\\d+(?:\\.\\d+)?%", line)) and any(c in line.split("%")[1][:10] for c in "|█▌"):
         [pbars.pop(desc) for desc, percent in pbars.items() if percent == 1.]
         percent = float(percent_match.group(0)[:-1]) / 100
         desc = line[:percent_match.start()].strip() or "Progress"
         pbars[desc] = percent
 def dry_run(src, config, split, dst, query):
     if not all([src, config, split, dst, query]):
         raise gr.Error("Please fill source, destination and query.")
-    process = subprocess.Popen(CMD + ["--src", src, "--config", config, "--split", split, "--dst", dst, "--query", query, DRY_RUN], stdout=subprocess.PIPE)
-    logs = ""
     for line in iter(process.stdout.readline, b""):
         logs += line.decode()
-        yield {output_markdown: logs, progress_labels: gr.Label(visible=False)}
-def run(src, config, split, dst, query):
     if not all([src, config, split, dst, query]):
         raise gr.Error("Please fill source, destination and query.")
-    raise gr.Error("NotImplemented")
 READ_FUNCTIONS = ("pl.read_parquet", "pl.read_csv", "pl.read_json")
 NUM_TRENDING_DATASETS = 10
@@ -51,17 +95,17 @@ with gr.Blocks() as demo:
         with gr.Column(scale=10):
             gr.Markdown(f"# {TITLE} {EMOJI}")
         with gr.Column():
-            gr.LoginButton(scale=0.1)
     with gr.Row():
-        with gr.Column():
             with gr.Row():
                 loading_codes_json = gr.JSON([], visible=False)
                 dataset_dropdown = gr.Dropdown(label="Source Dataset", allow_custom_value=True, scale=10)
                 subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
                 split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
-        with gr.Column(scale=0.1, min_width=60):
             gr.HTML("<div style='font-size: 4em;'>→</div>")
-        with gr.Column():
             dst_dropdown = gr.Dropdown(label="Destination Dataset", allow_custom_value=True)
     query_textarea = gr.TextArea(label="SQL Query", placeholder="SELECT * FROM src;", value="SELECT * FROM src;", container=False, show_label=False)
     with gr.Row():

+import os
 import re
 import subprocess
 import yaml
 import gradio as gr
 import requests
+from huggingface_hub import HfApi, get_token
+CMD = ["python" ,"run_job.py"]
 with open("README.md") as f:
     METADATA = yaml.safe_load(f.read().split("---\n")[1])
 TITLE = METADATA["title"]
 EMOJI = METADATA["emoji"]
+spaceId = os.environ.get("SPACE_ID") or "lhoestq/run-duckdb"
 try:
     process = subprocess.run(CMD + ["--help"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 DRY_RUN = bool(HELP) and bool(m :=re.search("--dry(-|_)run", HELP)) and m.group(0)
def parse_log(line: str, pbars: dict[str, float]):
    """Separate progress-bar updates from regular log text.

    A line counts as a progress bar when it contains a percentage such as
    ``42%`` or ``99.5%`` and one of the bar glyphs ``|``, ``█`` or ``▌``
    appears within the 10 characters that follow the first ``%``.

    This is a generator: ``pbars`` is only updated once it is iterated.

    Args:
        line: One decoded line of job output.
        pbars: Maps a progress-bar description to its completed fraction
            (percentage / 100). Mutated in place: bars that already reached
            1.0 are dropped, then the bar described by ``line`` is set.

    Yields:
        ``""`` when ``line`` was consumed as a progress-bar update, otherwise
        ``line`` unchanged so the caller can append it to the visible logs.
    """
    percent_match = re.search(r"\d+(?:\.\d+)?%", line)
    if percent_match and any(c in line.split("%")[1][:10] for c in "|█▌"):
        # Snapshot the finished bars before deleting them: popping entries
        # while iterating pbars.items() raises
        # "RuntimeError: dictionary changed size during iteration".
        finished = [desc for desc, percent in pbars.items() if percent == 1.0]
        for desc in finished:
            del pbars[desc]
        percent = float(percent_match.group(0)[:-1]) / 100
        # Text before the percentage is the bar's label; fall back to a
        # generic one when the line starts directly with the percentage.
        desc = line[:percent_match.start()].strip() or "Progress"
        pbars[desc] = percent
        yield ""
    else:
        yield line
 def dry_run(src, config, split, dst, query):
     """Run the job locally in dry-run mode and stream its output.

     Builds the ``CMD`` command line with the module-level ``DRY_RUN`` flag
     appended, starts it as a subprocess and yields Gradio updates as output
     lines arrive.

     Args:
         src: Value passed to ``--src``.
         config: Value passed to ``--config``.
         split: Value passed to ``--split``.
         dst: Value passed to ``--dst``.
         query: Value passed to ``--query``.

     Yields:
         Dicts keyed by the ``output_markdown`` / ``progress_labels``
         components. The first update shows the command and hides the
         progress labels; each later update carries the accumulated logs.

     Raises:
         gr.Error: If any field is empty, or if no dry-run flag was detected
             in the job's ``--help`` output (``DRY_RUN`` is falsy).
     """
     import shlex  # local import so this block does not rely on the file's import list

     if not all([src, config, split, dst, query]):
         raise gr.Error("Please fill source, destination and query.")
     if not DRY_RUN:
         # DRY_RUN is False when the flag was not found; passing it on would
         # only surface later as an opaque TypeError.
         raise gr.Error("Dry run is not supported by this job.")
     args = ["--src", src, "--config", config, "--split", split, "--dst", dst, "--query", query, DRY_RUN]
     cmd = CMD + args
     # shlex.join quotes every argument so the displayed command can be pasted
     # into a shell as-is (the previous hand-rolled quoting did not round-trip).
     logs = "Job:\n\n```bash\n" + shlex.join(cmd) + "\n```\nOutput:\n\n"
     yield {output_markdown: logs, progress_labels: gr.Label(visible=False)}
     # The context manager closes the pipe and waits for the child on exit,
     # including when the generator is closed early.
     with subprocess.Popen(cmd, stdout=subprocess.PIPE) as process:
         for line in iter(process.stdout.readline, b""):
             logs += line.decode()
             yield {output_markdown: logs}
def run(src, config, split, dst, query, oauth_token: gr.OAuthToken | None, profile: gr.OAuthProfile | None):
    """Submit the job to the Hugging Face jobs endpoint and stream its logs.

    Credentials come from the Gradio OAuth session when both ``oauth_token``
    and ``profile`` are set, otherwise from the locally stored token returned
    by ``get_token()``.

    Args:
        src: Value passed to ``--src``.
        config: Value passed to ``--config``.
        split: Value passed to ``--split``.
        dst: Value passed to ``--dst``.
        query: Value passed to ``--query``.
        oauth_token: OAuth token of the logged-in user, or ``None``.
        profile: OAuth profile of the logged-in user, or ``None``.

    Yields:
        Dicts keyed by the ``output_markdown`` / ``progress_labels``
        components, carrying the accumulated logs and the current progress
        bars.

    Raises:
        gr.Error: If any field is empty or no credentials are available.
    """
    import shlex  # local import so this block does not rely on the file's import list

    if not all([src, config, split, dst, query]):
        raise gr.Error("Please fill source, destination and query.")
    if oauth_token and profile:
        token = oauth_token.token
        username = profile.username
    elif (token := get_token()):
        username = HfApi().whoami(token=token)["name"]
    else:
        raise gr.Error("Please log in to run the job.")
    args = ["--src", src, "--config", config, "--split", split, "--dst", dst, "--query", query]
    cmd = CMD + args
    # shlex.join quotes every argument so the displayed command can be pasted
    # into a shell as-is (the previous hand-rolled quoting did not round-trip).
    logs = "Job:\n\n```bash\n" + shlex.join(cmd) + "\n```\nOutput:\n\n"
    pbars = {}
    yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
    auth_headers = {"Authorization": f"Bearer {token}"}
    resp = requests.post(
        f"https://huggingface.co/api/jobs/{username}",
        json={
            "spaceId": spaceId,
            "arguments": args,
            "command": CMD,
            "environment": {},
            "flavor": "cpu-basic",
        },
        headers=auth_headers,
    )
    if resp.status_code != 200:
        logs += resp.text
        pbars = {"Finished with an error ❌": 1.0}
    else:
        job_id = resp.json()["metadata"]["job_id"]
        # stream=True is required to read from resp.raw: without it requests
        # consumes the whole body up front and readline() returns b"" at once.
        with requests.get(
            f"https://huggingface.co/api/jobs/{username}/{job_id}/logs-stream",
            headers=auth_headers,
            stream=True,
        ) as log_resp:
            stream_ok = log_resp.status_code == 200
            for line in iter(log_resp.raw.readline, b""):
                # parse_log is a generator; join its output to get the text
                # to append (empty for progress-bar lines).
                logs += "".join(parse_log(line.decode(), pbars=pbars))
                yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
        # NOTE(review): the job's exit status is not available from the log
        # stream read here (the old code looked at the unrelated module-level
        # `--help` subprocess). Report a neutral completion unless the log
        # request itself failed — TODO query the job status once its API
        # response shape is confirmed.
        pbars = {("Finished" if stream_ok else "Finished with an error ❌"): 1.0}
    yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
 READ_FUNCTIONS = ("pl.read_parquet", "pl.read_csv", "pl.read_json")
 NUM_TRENDING_DATASETS = 10
         with gr.Column(scale=10):
             gr.Markdown(f"# {TITLE} {EMOJI}")
         with gr.Column():
+            gr.LoginButton()
     with gr.Row():
+        with gr.Column(scale=10):
             with gr.Row():
                 loading_codes_json = gr.JSON([], visible=False)
                 dataset_dropdown = gr.Dropdown(label="Source Dataset", allow_custom_value=True, scale=10)
                 subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
                 split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
+        with gr.Column(min_width=60):
             gr.HTML("<div style='font-size: 4em;'>→</div>")
+        with gr.Column(scale=10):
             dst_dropdown = gr.Dropdown(label="Destination Dataset", allow_custom_value=True)
     query_textarea = gr.TextArea(label="SQL Query", placeholder="SELECT * FROM src;", value="SELECT * FROM src;", container=False, show_label=False)
     with gr.Row():