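"""Compile the individual JSON metadata files in the SDSC/digiwild-dataset repo
into the dataset's parquet train split and push the result back to the Hub.

New JSON entries are deduplicated against existing rows via their image_md5 field.
"""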
import json
import os

from datasets import load_dataset
from huggingface_hub import HfApi, hf_hub_download

dataset_id = "SDSC/digiwild-dataset"
token = os.getenv("HUGGINGFACE_TOKEN")

# Initialize API client
api = HfApi(token=token)

# List all files in the dataset repo and keep the individual JSON metadata files
files = api.list_repo_files(dataset_id, repo_type="dataset")
json_files = [file for file in files if file.endswith(".json")]

# Load the existing metadata compilation (the parquet train split) if it exists,
# otherwise fall back to building the dataset directly from the JSON files.
try:
    data_files = "data/train-00000-of-00001.parquet"
    metadata = load_dataset(dataset_id, data_files=data_files)

    # Append any JSON entry whose image_md5 is not already in the train split
    existing_md5s = set(metadata["train"]["image_md5"])
    for json_file in json_files:
        local_path = hf_hub_download(
            repo_id=dataset_id, filename=json_file, repo_type="dataset"
        )
        with open(local_path, "r") as f:
            new = json.load(f)
        if new["image_md5"] not in existing_md5s:
            metadata["train"] = metadata["train"].add_item(new)
            existing_md5s.add(new["image_md5"])
except Exception:
    # No parquet compilation yet: build the dataset from the JSON files alone
    metadata = load_dataset(dataset_id, data_files=json_files)

# Push the updated compilation back to the Hub
metadata.push_to_hub(dataset_id, token=token)
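
# Usage sketch (script filename is hypothetical; the token must have write
# access to the dataset repo):
#   HUGGINGFACE_TOKEN=hf_xxx python compile_metadata.py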