Spaces:
Running
Running
rename files
Browse files- Dockerfile +2 -2
- requirements.txt +1 -0
- run.py → run_job.py +3 -1
- app.py → start_app.py +56 -12
Dockerfile
CHANGED
@@ -20,8 +20,8 @@ RUN pip install --no-cache-dir --upgrade pip
|
|
20 |
COPY --chown=user run.py app.py requirements.txt README.md $HOME/app/
|
21 |
|
22 |
# Install dependencies
|
23 |
-
RUN pip install "gradio[oauth]"
|
24 |
RUN pip install -r requirements.txt
|
25 |
|
26 |
# Run app
|
27 |
-
ENTRYPOINT python
|
|
|
20 |
COPY --chown=user run.py app.py requirements.txt README.md $HOME/app/
|
21 |
|
22 |
# Install dependencies
|
23 |
+
RUN pip install "gradio[oauth]"
|
24 |
RUN pip install -r requirements.txt
|
25 |
|
26 |
# Run app
|
27 |
+
ENTRYPOINT python start_app.py
|
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
duckdb
|
2 |
huggingface_hub
|
3 |
tabulate
|
|
|
1 |
+
fire
|
2 |
duckdb
|
3 |
huggingface_hub
|
4 |
tabulate
|
run.py → run_job.py
RENAMED
@@ -50,11 +50,13 @@ def sql(src: str, dst: str, query: str, config: str = "default", split: str = "t
|
|
50 |
src_kwargs = con.sql(CMD_SRC_KWARGS.format(src=src, config=config, split=split)).df().to_dict(orient="records")
|
51 |
if not src_kwargs:
|
52 |
raise ValueError(f'Invalid --config "{config}" for dataset "{src}", please select a valid dataset config/subset.')
|
|
|
53 |
con.sql((CMD_SRC_DRY_RUN if dry_run else CMD_SRC).format(**src_kwargs[0]))
|
54 |
if dry_run:
|
55 |
-
print(f"Sample data from '{src}' that would be written to '{dst}':\n")
|
56 |
else:
|
57 |
con.sql("PRAGMA enable_progress_bar;")
|
|
|
58 |
result = con.sql((CMD_DST_DRY_RUN if dry_run else CMD_DST).format(query=query.rstrip("\n ;")))
|
59 |
if dry_run:
|
60 |
print(result.df().to_markdown())
|
|
|
50 |
src_kwargs = con.sql(CMD_SRC_KWARGS.format(src=src, config=config, split=split)).df().to_dict(orient="records")
|
51 |
if not src_kwargs:
|
52 |
raise ValueError(f'Invalid --config "{config}" for dataset "{src}", please select a valid dataset config/subset.')
|
53 |
+
|
54 |
con.sql((CMD_SRC_DRY_RUN if dry_run else CMD_SRC).format(**src_kwargs[0]))
|
55 |
if dry_run:
|
56 |
+
print(f"Sample data from '{src}' that would be written to dataset '{dst}':\n")
|
57 |
else:
|
58 |
con.sql("PRAGMA enable_progress_bar;")
|
59 |
+
|
60 |
result = con.sql((CMD_DST_DRY_RUN if dry_run else CMD_DST).format(query=query.rstrip("\n ;")))
|
61 |
if dry_run:
|
62 |
print(result.df().to_markdown())
|
app.py → start_app.py
RENAMED
@@ -1,18 +1,20 @@
|
|
|
|
1 |
import re
|
2 |
import subprocess
|
3 |
import yaml
|
4 |
|
5 |
import gradio as gr
|
6 |
import requests
|
7 |
-
from huggingface_hub import HfApi
|
8 |
|
9 |
|
10 |
-
CMD = ["python" ,"
|
11 |
|
12 |
with open("README.md") as f:
|
13 |
METADATA = yaml.safe_load(f.read().split("---\n")[1])
|
14 |
TITLE = METADATA["title"]
|
15 |
EMOJI = METADATA["emoji"]
|
|
|
16 |
|
17 |
try:
|
18 |
process = subprocess.run(CMD + ["--help"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
@@ -22,26 +24,68 @@ except Exception:
|
|
22 |
|
23 |
DRY_RUN = bool(HELP) and bool(m :=re.search("--dry(-|_)run", HELP)) and m.group(0)
|
24 |
|
25 |
-
def
|
26 |
if (percent_match := re.search("\\d+(?:\\.\\d+)?%", line)) and any(c in line.split("%")[1][:10] for c in "|ββ"):
|
27 |
[pbars.pop(desc) for desc, percent in pbars.items() if percent == 1.]
|
28 |
percent = float(percent_match.group(0)[:-1]) / 100
|
29 |
desc = line[:percent_match.start()].strip() or "Progress"
|
30 |
pbars[desc] = percent
|
|
|
|
|
|
|
31 |
|
32 |
def dry_run(src, config, split, dst, query):
|
33 |
if not all([src, config, split, dst, query]):
|
34 |
raise gr.Error("Please fill source, destination and query.")
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
37 |
for line in iter(process.stdout.readline, b""):
|
38 |
logs += line.decode()
|
39 |
-
yield {output_markdown: logs
|
40 |
|
41 |
-
def run(src, config, split, dst, query):
|
42 |
if not all([src, config, split, dst, query]):
|
43 |
raise gr.Error("Please fill source, destination and query.")
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
READ_FUNCTIONS = ("pl.read_parquet", "pl.read_csv", "pl.read_json")
|
47 |
NUM_TRENDING_DATASETS = 10
|
@@ -51,17 +95,17 @@ with gr.Blocks() as demo:
|
|
51 |
with gr.Column(scale=10):
|
52 |
gr.Markdown(f"# {TITLE} {EMOJI}")
|
53 |
with gr.Column():
|
54 |
-
gr.LoginButton(
|
55 |
with gr.Row():
|
56 |
-
with gr.Column():
|
57 |
with gr.Row():
|
58 |
loading_codes_json = gr.JSON([], visible=False)
|
59 |
dataset_dropdown = gr.Dropdown(label="Source Dataset", allow_custom_value=True, scale=10)
|
60 |
subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
|
61 |
split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
|
62 |
-
with gr.Column(
|
63 |
gr.HTML("<div style='font-size: 4em;'>β</div>")
|
64 |
-
with gr.Column():
|
65 |
dst_dropdown = gr.Dropdown(label="Destination Dataset", allow_custom_value=True)
|
66 |
query_textarea = gr.TextArea(label="SQL Query", placeholder="SELECT * FROM src;", value="SELECT * FROM src;", container=False, show_label=False)
|
67 |
with gr.Row():
|
|
|
1 |
+
import os
|
2 |
import re
|
3 |
import subprocess
|
4 |
import yaml
|
5 |
|
6 |
import gradio as gr
|
7 |
import requests
|
8 |
+
from huggingface_hub import HfApi, get_token
|
9 |
|
10 |
|
11 |
+
CMD = ["python" ,"run_job.py"]
|
12 |
|
13 |
with open("README.md") as f:
|
14 |
METADATA = yaml.safe_load(f.read().split("---\n")[1])
|
15 |
TITLE = METADATA["title"]
|
16 |
EMOJI = METADATA["emoji"]
|
17 |
+
spaceId = os.environ.get("SPACE_ID") or "lhoestq/run-duckdb"
|
18 |
|
19 |
try:
|
20 |
process = subprocess.run(CMD + ["--help"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
|
24 |
|
25 |
DRY_RUN = bool(HELP) and bool(m :=re.search("--dry(-|_)run", HELP)) and m.group(0)
|
26 |
|
27 |
+
def parse_log(line: str, pbars: dict[str, float]):
    """Classify one log line as progress-bar output or plain text.

    A line counts as progress output when it contains a percentage
    (e.g. ``42%`` or ``42.5%``) and one of the bar glyphs appears within the
    ten characters that follow the first ``%``.

    Args:
        line: A single decoded log line.
        pbars: Mapping of progress-bar description to completion ratio
            (0.0-1.0). Updated in place.

    Yields:
        ``""`` for a progress line (it is recorded in ``pbars`` instead of
        being shown as text), otherwise ``line`` unchanged.
    """
    percent_match = re.search(r"\d+(?:\.\d+)?%", line)
    # NOTE(review): the two block glyphs were mis-encoded in the reviewed copy
    # of this file; restored here as full/half blocks - confirm against the
    # repository.
    looks_like_bar = percent_match is not None and any(
        glyph in line.split("%")[1][:10] for glyph in "|█▌"
    )
    if not looks_like_bar:
        yield line
        return

    # Drop bars that already reached 100%. Collect the keys first: removing
    # entries while iterating over the dict raises RuntimeError.
    for finished in [desc for desc, ratio in pbars.items() if ratio == 1.0]:
        del pbars[finished]

    percent = float(percent_match.group(0)[:-1]) / 100
    # Text before the percentage is the bar's description, if any.
    desc = line[:percent_match.start()].strip() or "Progress"
    pbars[desc] = percent
    yield ""
|
36 |
|
37 |
def dry_run(src, config, split, dst, query):
    """Run the job script locally with the dry-run flag and stream its output.

    Args:
        src: Source dataset identifier, passed as ``--src``.
        config: Dataset config/subset, passed as ``--config``.
        split: Dataset split, passed as ``--split``.
        dst: Destination dataset identifier, passed as ``--dst``.
        query: SQL query, passed as ``--query``.

    Yields:
        Dicts keyed by ``output_markdown`` / ``progress_labels`` (presumably
        Gradio components defined elsewhere in this file) carrying the
        accumulated log text.

    Raises:
        gr.Error: If a field is empty, or if no dry-run flag was detected in
            the job script's ``--help`` output.
    """
    import shlex  # local import keeps this block self-contained

    if not all([src, config, split, dst, query]):
        raise gr.Error("Please fill source, destination and query.")
    # DRY_RUN is False when the help text was unavailable or lists no dry-run
    # flag; passing False through would put a non-string into argv.
    if not DRY_RUN:
        raise gr.Error("Dry run is not supported by the job script.")

    args = ["--src", src, "--config", config, "--split", split, "--dst", dst, "--query", query, DRY_RUN]
    cmd = CMD + args
    # shlex.join gives a correctly shell-quoted, copy-pasteable command line.
    logs = "Job:\n\n```bash\n" + shlex.join(cmd) + "\n```\nOutput:\n\n"
    yield {output_markdown: logs, progress_labels: gr.Label(visible=False)}

    # The context manager closes the pipe and waits for the child on exit,
    # including when the consumer stops iterating early.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE) as process:
        for line in iter(process.stdout.readline, b""):
            logs += line.decode(errors="replace")
            yield {output_markdown: logs}
|
48 |
|
49 |
+
def run(src, config, split, dst, query, oauth_token: gr.OAuthToken | None, profile: gr.OAuthProfile | None):
    """Submit the job to the Hugging Face jobs API and stream its logs.

    Credentials come from the Gradio OAuth login when both ``oauth_token``
    and ``profile`` are present, otherwise from the locally stored token
    returned by ``get_token()``.

    Args:
        src: Source dataset identifier, passed as ``--src``.
        config: Dataset config/subset, passed as ``--config``.
        split: Dataset split, passed as ``--split``.
        dst: Destination dataset identifier, passed as ``--dst``.
        query: SQL query, passed as ``--query``.
        oauth_token: OAuth token of the logged-in user, or ``None``.
        profile: OAuth profile of the logged-in user, or ``None``.

    Yields:
        Dicts keyed by ``output_markdown`` / ``progress_labels`` (presumably
        Gradio components defined elsewhere in this file) carrying the
        accumulated log text and the current progress bars.

    Raises:
        gr.Error: If a field is empty or no credentials are available.
    """
    import shlex  # local import keeps this block self-contained

    if not all([src, config, split, dst, query]):
        raise gr.Error("Please fill source, destination and query.")
    if oauth_token and profile:
        token = oauth_token.token
        username = profile.username
    elif (token := get_token()):
        username = HfApi().whoami(token=token)["name"]
    else:
        raise gr.Error("Please log in to run the job.")

    args = ["--src", src, "--config", config, "--split", split, "--dst", dst, "--query", query]
    cmd = CMD + args
    # shlex.join gives a correctly shell-quoted, copy-pasteable command line.
    logs = "Job:\n\n```bash\n" + shlex.join(cmd) + "\n```\nOutput:\n\n"
    pbars = {}
    yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}

    auth_headers = {"Authorization": f"Bearer {token}"}
    resp = requests.post(
        f"https://huggingface.co/api/jobs/{username}",
        json={
            "spaceId": spaceId,
            "arguments": args,
            "command": CMD,
            "environment": {},
            "flavor": "cpu-basic",
        },
        headers=auth_headers,
    )
    if resp.status_code != 200:
        logs += resp.text
        succeeded = False
    else:
        job_id = resp.json()["metadata"]["job_id"]
        # stream=True is required to read the log lines as they arrive;
        # without it the body is consumed before iteration starts.
        with requests.get(
            f"https://huggingface.co/api/jobs/{username}/{job_id}/logs-stream",
            headers=auth_headers,
            stream=True,
        ) as log_resp:
            # There is no local process whose exit code could be inspected,
            # so success only reflects whether the log stream was served.
            succeeded = log_resp.ok
            for raw_line in log_resp.iter_lines():
                # iter_lines strips the line terminator; restore it.
                line = raw_line.decode(errors="replace") + "\n"
                # parse_log is a generator; join its output to get text.
                logs += "".join(parse_log(line, pbars=pbars))
                yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}

    # NOTE(review): the status emojis were mis-encoded in the reviewed copy of
    # this file; restored as a check mark / cross - confirm against the repo.
    pbars = {"Finished" + (" ✅" if succeeded else " with an error ❌"): 1.0}
    yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
|
89 |
|
90 |
READ_FUNCTIONS = ("pl.read_parquet", "pl.read_csv", "pl.read_json")
|
91 |
NUM_TRENDING_DATASETS = 10
|
|
|
95 |
with gr.Column(scale=10):
|
96 |
gr.Markdown(f"# {TITLE} {EMOJI}")
|
97 |
with gr.Column():
|
98 |
+
gr.LoginButton()
|
99 |
with gr.Row():
|
100 |
+
with gr.Column(scale=10):
|
101 |
with gr.Row():
|
102 |
loading_codes_json = gr.JSON([], visible=False)
|
103 |
dataset_dropdown = gr.Dropdown(label="Source Dataset", allow_custom_value=True, scale=10)
|
104 |
subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
|
105 |
split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
|
106 |
+
with gr.Column(min_width=60):
|
107 |
gr.HTML("<div style='font-size: 4em;'>β</div>")
|
108 |
+
with gr.Column(scale=10):
|
109 |
dst_dropdown = gr.Dropdown(label="Destination Dataset", allow_custom_value=True)
|
110 |
query_textarea = gr.TextArea(label="SQL Query", placeholder="SELECT * FROM src;", value="SELECT * FROM src;", container=False, show_label=False)
|
111 |
with gr.Row():
|