"""Fold per-record JSON metadata files into the compiled dataset on the Hub.

The script lists every ``.json`` file in the dataset repository, appends the
records whose ``image_md5`` is not yet present in the compiled parquet split,
and pushes the result back to the same repository.  If the incremental path
fails for any reason, the dataset is rebuilt from all JSON files instead.
"""
import json
import logging
import os

from datasets import load_dataset, DownloadMode
from huggingface_hub import HfApi, hf_hub_download

logger = logging.getLogger(__name__)

dataset_id = "SDSC/digiwild-dataset"
# May be None when the variable is unset; it is passed through unchanged.
token = os.getenv("HUGGINGFACE_TOKEN")

# Initialize API client
api = HfApi(token=token)

# Collect the names of all JSON metadata files stored in the dataset repo.
files = api.list_repo_files(dataset_id, repo_type="dataset")
json_files = [name for name in files if name.endswith(".json")]

try:
    # Load the existing metadata compilation.
    data_files = "data/train-00000-of-00001.parquet"
    metadata = load_dataset(dataset_id, data_files=data_files)

    # Build the set of known hashes once instead of re-reading the whole
    # column for every JSON file.
    # NOTE(review): assumes image_md5 values are hashable (strings) -- confirm.
    known_md5 = set(metadata["train"]["image_md5"])

    # Add new JSON entries to the dataset.
    for repo_path in json_files:
        local_path = hf_hub_download(
            repo_id=dataset_id, filename=repo_path, repo_type="dataset"
        )
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(local_path, "r", encoding="utf-8") as f:
            new = json.load(f)
        if new["image_md5"] not in known_md5:
            metadata["train"] = metadata["train"].add_item(new)
            # Keep the set in sync so duplicates among the JSON files
            # themselves are skipped too, as with the original column check.
            known_md5.add(new["image_md5"])
except Exception:
    # Best-effort fallback kept from the original script: rebuild the dataset
    # from every JSON file.  Presumably this covers the first run, when the
    # parquet compilation does not exist yet -- verify.  Unlike a bare
    # ``except:``, this lets KeyboardInterrupt/SystemExit propagate, and the
    # cause of the fallback is logged rather than silently dropped.
    logger.warning(
        "Incremental update failed; rebuilding the dataset from JSON files.",
        exc_info=True,
    )
    metadata = load_dataset(dataset_id, data_files=json_files)

metadata.push_to_hub(dataset_id, token=token)