File size: 1,129 Bytes
951051a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
from datasets import load_dataset, DownloadMode
import json
import os
from huggingface_hub import HfApi , hf_hub_download
# Compile the per-image JSON metadata files stored in the Hub dataset repo
# into the dataset's "train" split and push the result back to the Hub.

# Hub dataset repository that is both read from and pushed to.
dataset_id = "SDSC/digiwild-dataset"
# Access token taken from the environment; None when HUGGINGFACE_TOKEN is unset.
token = os.getenv("HUGGINGFACE_TOKEN")

# Initialize API client
api = HfApi(token=token)

# List every file in the dataset repo and keep only the JSON metadata files
files = api.list_repo_files(dataset_id, repo_type="dataset")
json_files = [file for file in files if file.endswith(".json")]

# Load the existing metadata compilation and append the JSON entries whose
# "image_md5" is not present yet.
try:
    data_files = "data/train-00000-of-00001.parquet"
    metadata = load_dataset(dataset_id, data_files=data_files)

    # Build the lookup once instead of re-reading the whole column for every
    # JSON file; it is kept in sync below so duplicates among the new files
    # are still skipped.
    known_md5 = set(metadata["train"]["image_md5"])

    # Add new json entries to dataset
    for json_file in json_files:
        local_path = hf_hub_download(
            repo_id=dataset_id, filename=json_file, repo_type="dataset"
        )
        with open(local_path, "r", encoding="utf-8") as f:
            new = json.load(f)
        if new["image_md5"] not in known_md5:
            metadata["train"] = metadata["train"].add_item(new)
            known_md5.add(new["image_md5"])
except Exception as err:
    # Best-effort fallback: if the compiled parquet cannot be loaded or
    # extended (for any reason), rebuild the dataset from all JSON files.
    # `Exception` rather than a bare `except` so Ctrl-C / SystemExit still
    # abort the script, and the cause is reported instead of hidden.
    print(f"Incremental update failed ({err!r}); rebuilding from JSON files.")
    metadata = load_dataset(dataset_id, data_files=json_files)

# Publish the updated compilation back to the same dataset repo
metadata.push_to_hub(dataset_id, token=token)