csyaas committed on
Commit
91d7d48
·
verified ·
1 Parent(s): 2ff47a9

Create sync_data.sh

Browse files
Files changed (1) hide show
  1. sync_data.sh +129 -0
sync_data.sh ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # 检查环境变量
4
+ if [[ -z "$HF_TOKEN" ]] || [[ -z "$DATASET_ID" ]]; then
5
+ echo "Starting without backup functionality - missing HF_TOKEN or DATASET_ID"
6
+ exec java ${JVM_OPTS} -jar /opt/halo/halo.jar
7
+ exit 0
8
+ fi
9
+
10
# Activate the Python virtual environment that provides huggingface_hub.
. /opt/venv/bin/activate
12
+
13
# Upload a backup archive to the HuggingFace dataset repo, then prune old
# backups so at most 50 are kept.
# Arguments:
#   $1 - local path of the archive to upload
#   $2 - file name to use inside the repo
# Reads HF_TOKEN and DATASET_ID from the environment.
upload_backup() {
  # Pass values to Python through the environment and use a quoted heredoc,
  # instead of interpolating shell variables into the Python source: the
  # original broke on quotes in paths and allowed code injection.
  BACKUP_FILE_PATH="$1" BACKUP_FILE_NAME="$2" python3 - <<'PYEOF'
import os
from huggingface_hub import HfApi

def manage_backups(api, repo_id, max_files=50):
    # Keep at most max_files halo_backup_*.tar.gz files; delete oldest first
    # (names sort chronologically because they embed a YYYYmmdd_HHMMSS stamp).
    files = api.list_repo_files(repo_id=repo_id, repo_type='dataset')
    backup_files = sorted(
        f for f in files
        if f.startswith('halo_backup_') and f.endswith('.tar.gz'))
    if len(backup_files) >= max_files:
        for stale in backup_files[:len(backup_files) - max_files + 1]:
            try:
                api.delete_file(path_in_repo=stale,
                                repo_id=repo_id, repo_type='dataset')
                print(f'Deleted old backup: {stale}')
            except Exception as e:
                print(f'Error deleting {stale}: {str(e)}')

file_path = os.environ['BACKUP_FILE_PATH']
file_name = os.environ['BACKUP_FILE_NAME']
repo_id = os.environ['DATASET_ID']
api = HfApi(token=os.environ['HF_TOKEN'])
try:
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=file_name,
        repo_id=repo_id,
        repo_type='dataset'
    )
    print(f'Successfully uploaded {file_name}')
    manage_backups(api, repo_id)
except Exception as e:
    print(f'Error uploading file: {str(e)}')
PYEOF
}
54
+
55
# Download the newest halo_backup_*.tar.gz from the dataset repo and extract
# it into ~/.halo2. Reads HF_TOKEN and DATASET_ID from the environment.
# Prints a message and returns normally when no backup exists.
download_latest_backup() {
  # Quoted heredoc + environment variables: no shell interpolation into the
  # Python source (the original embedded $token/$repo_id directly, which
  # broke on special characters and allowed code injection).
  python3 - <<'PYEOF'
import os
import sys
import tarfile
import tempfile
from huggingface_hub import HfApi

repo_id = os.environ['DATASET_ID']
api = HfApi(token=os.environ['HF_TOKEN'])
try:
    files = api.list_repo_files(repo_id=repo_id, repo_type='dataset')
    backups = sorted(
        f for f in files
        if f.startswith('halo_backup_') and f.endswith('.tar.gz'))
    if not backups:
        print('No backup files found')
        sys.exit()

    # Timestamped names sort chronologically, so the last one is newest.
    latest_backup = backups[-1]

    with tempfile.TemporaryDirectory() as temp_dir:
        filepath = api.hf_hub_download(
            repo_id=repo_id,
            filename=latest_backup,
            repo_type='dataset',
            local_dir=temp_dir
        )
        if filepath and os.path.exists(filepath):
            with tarfile.open(filepath, 'r:gz') as tar:
                # NOTE(review): extractall without member filtering trusts
                # the archive contents; acceptable only because these
                # archives are produced by this script itself — confirm.
                tar.extractall(os.path.expanduser('~/.halo2'))
            print(f'Successfully restored backup from {latest_backup}')
except Exception as e:
    print(f'Error downloading backup: {str(e)}')
PYEOF
}
94
+
95
# On container start, restore the newest backup (if any) before Halo runs.
printf '%s\n' "Downloading latest backup from HuggingFace..."
download_latest_backup
98
+
99
# Periodically archive ~/.halo2 and push it to the dataset repo.
# SYNC_INTERVAL (seconds) controls the period; defaults to 7200 (2 hours).
# Runs forever; intended to be started in the background.
sync_data() {
  while true; do
    echo "Starting sync process at $(date)"

    if [ -d "$HOME/.halo2" ]; then
      timestamp=$(date +%Y%m%d_%H%M%S)
      backup_file="halo_backup_${timestamp}.tar.gz"

      # Archive the data directory contents (paths relative to ~/.halo2).
      # Check tar's status so a failed/truncated archive is never uploaded
      # (the original uploaded unconditionally).
      if tar -czf "/tmp/${backup_file}" -C "$HOME/.halo2" .; then
        echo "Uploading backup to HuggingFace..."
        upload_backup "/tmp/${backup_file}" "${backup_file}"
      else
        echo "tar failed; skipping upload for this cycle" >&2
      fi

      rm -f "/tmp/${backup_file}"
    else
      echo "Data directory does not exist yet, waiting for next sync..."
    fi

    SYNC_INTERVAL=${SYNC_INTERVAL:-7200}
    echo "Next sync in ${SYNC_INTERVAL} seconds..."
    sleep "$SYNC_INTERVAL"
  done
}
124
+
125
# Run the sync loop in the background, then hand this process over to Halo.
sync_data &

# Launch Halo; JVM_OPTS is intentionally unquoted so multiple flags
# word-split into separate arguments.
exec java ${JVM_OPTS} -jar /opt/halo/halo.jar