"""Sync standalone JSON metadata files in a Hub dataset repo into its parquet split."""
import json
import os

from datasets import load_dataset, DownloadMode
from huggingface_hub import HfApi, hf_hub_download

dataset_id = "SDSC/digiwild-dataset"
token = os.getenv("HUGGINGFACE_TOKEN")

# Authenticated client used to enumerate (and later download) repo files.
api = HfApi(token=token)

# Every standalone .json metadata file currently in the dataset repo.
files = api.list_repo_files(dataset_id, repo_type="dataset")
json_files = [name for name in files if name.endswith(".json")]
# Merge each standalone JSON record into the existing parquet compilation,
# de-duplicating on "image_md5", then push the result back to the Hub.
try:
    data_files = "data/train-00000-of-00001.parquet"
    metadata = load_dataset(
        dataset_id,
        data_files=data_files)
    # Build the membership set once: the original re-read the whole
    # "image_md5" column on every iteration (O(n) per file).
    known_md5s = set(metadata["train"]["image_md5"])
    # Add new JSON entries to the dataset.
    for json_name in json_files:
        local_path = hf_hub_download(
            repo_id=dataset_id, filename=json_name, repo_type="dataset")
        with open(local_path, "r", encoding="utf-8") as f:
            new = json.load(f)
        if new["image_md5"] not in known_md5s:
            metadata["train"] = metadata["train"].add_item(new)
            # Track it so duplicates within this batch are skipped too.
            known_md5s.add(new["image_md5"])
except Exception:
    # First run / no parquet compilation yet: build the dataset directly
    # from the raw JSON files. (Narrowed from a bare `except:` so
    # SystemExit/KeyboardInterrupt still propagate.)
    metadata = load_dataset(
        dataset_id,
        data_files=json_files)
metadata.push_to_hub(dataset_id, token=token)