Tristan Thrush
committed on
Commit
·
ac14940
1
Parent(s):
6d09417
added force push capability
Browse files
README.md
CHANGED
@@ -16,14 +16,19 @@ A basic example of dynamic adversarial data collection with a Gradio app.
|
|
16 |
|
17 |
*Setting up the Space*
|
18 |
1. Clone this repo and deploy it on your own Hugging Face space.
|
19 |
-
2. Add
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
*Running Data Collection*
|
29 |
1. On your local repo that you pulled, create a copy of `config.py.example`,
|
@@ -38,11 +43,9 @@ Now, you should be watching hits come into your Hugging Face dataset
|
|
38 |
automatically!
|
39 |
|
40 |
*Tips and Tricks*
|
41 |
-
-
|
42 |
-
|
43 |
-
|
44 |
-
When you redeploy your app on Hugging Face spaces, the data directory is deleted
|
45 |
-
automatically.
|
46 |
- huggingface spaces have limited computational resources and memory. If you
|
47 |
run too many HITs and/or assignments at once, then you could encounter issues.
|
48 |
You could also encounter issues if you are trying to create a dataset that is
|
|
|
16 |
|
17 |
*Setting up the Space*
|
18 |
1. Clone this repo and deploy it on your own Hugging Face space.
|
19 |
+
2. Add the following secrets to your space:
|
20 |
+
- `HF_TOKEN`: One of your Hugging Face tokens.
|
21 |
+
- `DATASET_REPO_URL`: The URL to an empty dataset that you created on the hub. It
|
22 |
+
can be a private or public dataset.
|
23 |
+
- `FORCE_PUSH`: "yes"
|
24 |
+
When you run this space on mturk and when people visit your space on
|
25 |
+
huggingface.co, the app will use your token to automatically store new HITs
|
26 |
+
in your dataset. Setting `FORCE_PUSH` to "yes" ensures that your repo will
|
27 |
+
force push changes to the dataset during data collection. Otherwise,
|
28 |
+
accidental manual changes to your dataset could result in your space getting
|
29 |
+
merge conflicts as it automatically tries to push the dataset to the hub. For
|
30 |
+
local development, add these three keys to a `.env` file, and consider setting
|
31 |
+
`FORCE_PUSH` to "no".
|
32 |
|
33 |
*Running Data Collection*
|
34 |
1. On your local repo that you pulled, create a copy of `config.py.example`,
|
|
|
43 |
automatically!
|
44 |
|
45 |
*Tips and Tricks*
|
46 |
+
- Use caution while doing local development of your space and
|
47 |
+
simultaneously running it on mturk. Consider setting `FORCE_PUSH` to "no" in
|
48 |
+
your local `.env` file.
|
|
|
|
|
49 |
- huggingface spaces have limited computational resources and memory. If you
|
50 |
run too many HITs and/or assignments at once, then you could encounter issues.
|
51 |
You could also encounter issues if you are trying to create a dataset that is
|
app.py
CHANGED
@@ -12,11 +12,13 @@ from dotenv import load_dotenv
|
|
12 |
from pathlib import Path
|
13 |
import json
|
14 |
from filelock import FileLock
|
|
|
15 |
|
16 |
# These variables are for storing the mturk HITs in a Hugging Face dataset.
|
17 |
if Path(".env").is_file():
|
18 |
load_dotenv(".env")
|
19 |
DATASET_REPO_URL = os.getenv("DATASET_REPO_URL")
|
|
|
20 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
21 |
DATA_FILENAME = "data.jsonl"
|
22 |
DATA_FILE = os.path.join("data", DATA_FILENAME)
|
@@ -106,7 +108,16 @@ with demo:
|
|
106 |
json_data_with_assignment_id =\
|
107 |
[json.dumps(dict({"assignmentId": state["assignmentId"]}, **datum)) for datum in state["data"]]
|
108 |
jsonlfile.write("\n".join(json_data_with_assignment_id) + "\n")
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
finally:
|
111 |
lock.release()
|
112 |
return state
|
|
|
12 |
from pathlib import Path
|
13 |
import json
|
14 |
from filelock import FileLock
|
15 |
+
from utils import force_git_push
|
16 |
|
17 |
# These variables are for storing the mturk HITs in a Hugging Face dataset.
|
18 |
if Path(".env").is_file():
|
19 |
load_dotenv(".env")
|
20 |
DATASET_REPO_URL = os.getenv("DATASET_REPO_URL")
|
21 |
+
FORCE_PUSH = os.getenv("FORCE_PUSH")
|
22 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
23 |
DATA_FILENAME = "data.jsonl"
|
24 |
DATA_FILE = os.path.join("data", DATA_FILENAME)
|
|
|
108 |
json_data_with_assignment_id =\
|
109 |
[json.dumps(dict({"assignmentId": state["assignmentId"]}, **datum)) for datum in state["data"]]
|
110 |
jsonlfile.write("\n".join(json_data_with_assignment_id) + "\n")
|
111 |
+
|
112 |
+
if repo.is_repo_clean():
|
113 |
+
logger.info("Repo currently clean. Ignoring push_to_hub")
|
114 |
+
return None
|
115 |
+
repo.git_add(auto_lfs_track=True)
|
116 |
+
repo.git_commit("Auto commit by space")
|
117 |
+
if FORCE_PUSH == "yes":
|
118 |
+
force_git_push(repo)
|
119 |
+
else:
|
120 |
+
repo.git_push()
|
121 |
finally:
|
122 |
lock.release()
|
123 |
return state
|
utils.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import subprocess
|
2 |
+
from huggingface_hub.repository import _lfs_log_progress
|
3 |
+
|
4 |
+
def force_git_push(
    repo,
):
    """
    Force-push the checked-out branch of ``repo`` to its remote.

    Runs ``git push --force`` inside ``repo.local_dir`` and blocks until the
    command has finished. Anything git writes to stderr is printed, whether
    or not the push succeeded.

    Args:
        repo: Object exposing ``local_dir`` (the directory git is run in) and
            ``git_head_commit_url()``. Presumably a
            ``huggingface_hub.Repository`` -- confirm against the caller.

    Returns:
        Whatever ``repo.git_head_commit_url()`` returns after the push.

    Raises:
        EnvironmentError: If git exits with a non-zero status. The message is
            git's stderr output; a ``CalledProcessError`` carrying the return
            code, command and captured output is attached as ``__cause__``.
    """
    command = ["git", "push", "--force"]

    # NOTE(review): _lfs_log_progress is a private huggingface_hub helper;
    # presumably it reports git-lfs progress while the push runs -- confirm.
    with _lfs_log_progress():
        # check=False: a failing push is reported below as EnvironmentError,
        # after stderr has been printed.
        completed = subprocess.run(
            command,
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            encoding="utf-8",
            cwd=repo.local_dir,
            check=False,
        )

        if completed.stderr:
            print(completed.stderr)

        if completed.returncode:
            failure = subprocess.CalledProcessError(
                completed.returncode,
                completed.args,
                output=completed.stdout,
                stderr=completed.stderr,
            )
            raise EnvironmentError(completed.stderr) from failure

    return repo.git_head_commit_url()
|