Tristan Thrush committed on
Commit
ac14940
·
1 Parent(s): 6d09417

added force push capability

Browse files
Files changed (3) hide show
  1. README.md +16 -13
  2. app.py +12 -1
  3. utils.py +39 -0
README.md CHANGED
@@ -16,14 +16,19 @@ A basic example of dynamic adversarial data collection with a Gradio app.
16
 
17
  *Setting up the Space*
18
  1. Clone this repo and deploy it on your own Hugging Face space.
19
- 2. Add one of your Hugging Face tokens to the secrets for your space, with the
20
- name `HF_TOKEN`. Now, create an empty Hugging Face dataset on the hub. Put
21
- the url of this dataset in the secrets for your space, with the name
22
- `DATASET_REPO_URL`. It can be a private or public dataset. When you run this
23
- space on mturk and when people visit your space on huggingface.co, the app
24
- will use your token to automatically store new HITs in your dataset. NOTE:
25
- if you push something to your dataset manually, you need to reboot your space
26
- or it could get merge conflicts when trying to push HIT data.
 
 
 
 
 
27
 
28
  *Running Data Collection*
29
  1. On your local repo that you pulled, create a copy of `config.py.example`,
@@ -38,11 +43,9 @@ Now, you should be watching hits come into your Hugging Face dataset
38
  automatically!
39
 
40
  *Tips and Tricks*
41
- - If you are developing and running this space locally to test it out, try
42
- deleting the data directory that the app clones before running the app again.
43
- Otherwise, the app could get merge conflicts when storing new HITs on the hub.
44
- When you redeploy your app on Hugging Face spaces, the data directory is deleted
45
- automatically.
46
  - huggingface spaces have limited computational resources and memory. If you
47
  run too many HITs and/or assignments at once, then you could encounter issues.
48
  You could also encounter issues if you are trying to create a dataset that is
 
16
 
17
  *Setting up the Space*
18
  1. Clone this repo and deploy it on your own Hugging Face space.
19
+ 2. Add the following secrets to your space:
20
+ - `HF_TOKEN`: One of your Hugging Face tokens.
21
+ - `DATASET_REPO_URL`: The url to an empty dataset that you created on the hub. It
22
+ can be a private or public dataset.
23
+ - `FORCE_PUSH`: "yes"
24
+ When you run this space on mturk and when people visit your space on
25
+ huggingface.co, the app will use your token to automatically store new HITs
26
+ in your dataset. Setting `FORCE_PUSH` to "yes" ensures that your repo will
27
+ force push changes to the dataset during data collection. Otherwise,
28
+ accidental manual changes to your dataset could result in your space getting
29
+ merge conflicts as it automatically tries to push the dataset to the hub. For
30
+ local development, add these three keys to a `.env` file, and consider setting
31
+ `FORCE_PUSH` to "no".
32
 
33
  *Running Data Collection*
34
  1. On your local repo that you pulled, create a copy of `config.py.example`,
 
43
  automatically!
44
 
45
  *Tips and Tricks*
46
+ - Use caution while doing local development of your space and
47
+ simultaneously running it on mturk. Consider setting `FORCE_PUSH` to "no" in
48
+ your local `.env` file.
 
 
49
  - huggingface spaces have limited computational resources and memory. If you
50
  run too many HITs and/or assignments at once, then you could encounter issues.
51
  You could also encounter issues if you are trying to create a dataset that is
app.py CHANGED
@@ -12,11 +12,13 @@ from dotenv import load_dotenv
12
  from pathlib import Path
13
  import json
14
  from filelock import FileLock
 
15
 
16
  # These variables are for storing the mturk HITs in a Hugging Face dataset.
17
  if Path(".env").is_file():
18
  load_dotenv(".env")
19
  DATASET_REPO_URL = os.getenv("DATASET_REPO_URL")
 
20
  HF_TOKEN = os.getenv("HF_TOKEN")
21
  DATA_FILENAME = "data.jsonl"
22
  DATA_FILE = os.path.join("data", DATA_FILENAME)
@@ -106,7 +108,16 @@ with demo:
106
  json_data_with_assignment_id =\
107
  [json.dumps(dict({"assignmentId": state["assignmentId"]}, **datum)) for datum in state["data"]]
108
  jsonlfile.write("\n".join(json_data_with_assignment_id) + "\n")
109
- repo.push_to_hub()
 
 
 
 
 
 
 
 
 
110
  finally:
111
  lock.release()
112
  return state
 
12
  from pathlib import Path
13
  import json
14
  from filelock import FileLock
15
+ from utils import force_git_push
16
 
17
  # These variables are for storing the mturk HITs in a Hugging Face dataset.
18
  if Path(".env").is_file():
19
  load_dotenv(".env")
20
  DATASET_REPO_URL = os.getenv("DATASET_REPO_URL")
21
+ FORCE_PUSH = os.getenv("FORCE_PUSH")
22
  HF_TOKEN = os.getenv("HF_TOKEN")
23
  DATA_FILENAME = "data.jsonl"
24
  DATA_FILE = os.path.join("data", DATA_FILENAME)
 
108
  json_data_with_assignment_id =\
109
  [json.dumps(dict({"assignmentId": state["assignmentId"]}, **datum)) for datum in state["data"]]
110
  jsonlfile.write("\n".join(json_data_with_assignment_id) + "\n")
111
+
112
+ if repo.is_repo_clean():
113
+ logger.info("Repo currently clean. Ignoring push_to_hub")
114
+ return None
115
+ repo.git_add(auto_lfs_track=True)
116
+ repo.git_commit("Auto commit by space")
117
+ if FORCE_PUSH == "yes":
118
+ force_git_push(repo)
119
+ else:
120
+ repo.git_push()
121
  finally:
122
  lock.release()
123
  return state
utils.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ from huggingface_hub.repository import _lfs_log_progress
3
+
4
def force_git_push(
    repo,
):
    """
    Force-push the local clone of ``repo`` to its remote.

    Runs ``git push --force`` with ``repo.local_dir`` as the working
    directory and blocks until git exits. Anything git writes to stderr is
    printed, whether or not the push succeeded.

    Args:
        repo: Object exposing a ``local_dir`` attribute (the directory git is
            run in) and a ``git_head_commit_url()`` method. Presumably a
            ``huggingface_hub.Repository`` -- confirm against the caller.

    Returns:
        The value of ``repo.git_head_commit_url()``, i.e. the url to the
        commit on the remote repo.

    Raises:
        EnvironmentError: If git exits with a non-zero status. The message is
            git's stderr output and the original ``CalledProcessError`` is
            attached as the cause.
    """
    command = ["git", "push", "--force"]

    try:
        with _lfs_log_progress():
            # subprocess.run waits for git to exit and collects both pipes,
            # so no manual communicate()/poll()/kill() is needed.
            result = subprocess.run(
                command,
                stderr=subprocess.PIPE,
                stdout=subprocess.PIPE,
                encoding="utf-8",
                cwd=repo.local_dir,
            )

            # Surface git's stderr output even when the push succeeded.
            if result.stderr:
                print(result.stderr)

            # Raises CalledProcessError (carrying stdout/stderr) on failure.
            result.check_returncode()

    except subprocess.CalledProcessError as exc:
        # Keep the exception type callers already handle, but preserve the
        # underlying error as the cause for debugging.
        raise EnvironmentError(exc.stderr) from exc

    return repo.git_head_commit_url()