Spaces:

OAOA
/

DifFace

Sleeping

Zongsheng

first upload

06f26d7 almost 2 years ago

7.15 kB

	import cv2
	import lmdb
	import sys
	from multiprocessing import Pool
	from os import path as osp
	from tqdm import tqdm


	def make_lmdb_from_imgs(data_path,
	                        lmdb_path,
	                        img_path_list,
	                        keys,
	                        batch=5000,
	                        compress_level=1,
	                        multiprocessing_read=False,
	                        n_thread=40,
	                        map_size=None):
	    """Make lmdb from images.

	    Contents of lmdb. The file structure is:

	    ::

	        example.lmdb
	        ├── data.mdb
	        ├── lock.mdb
	        ├── meta_info.txt

	    The data.mdb and lock.mdb are standard lmdb files and you can refer to
	    https://lmdb.readthedocs.io/en/release/ for more details.

	    The meta_info.txt is a text file written by this function. Each line
	    records 1) image name (the key followed by ``.png``), 2) image shape,
	    and 3) compression level, separated by a white space.

	    For example, the meta information could be:
	    `000_00000000.png (720,1280,3) 1`, which means:
	    1) image name (with extension): 000_00000000.png;
	    2) image shape: (720,1280,3);
	    3) compression level: 1

	    Each entry of `keys` is ASCII-encoded and used as the lmdb key.

	    If `multiprocessing_read` is True, it will read all the images to memory
	    using multiprocessing. Thus, your server needs to have enough memory.

	    Args:
	        data_path (str): Data path for reading images.
	        lmdb_path (str): Lmdb save path. Must end with '.lmdb' and must not
	            exist yet.
	        img_path_list (list[str]): Image paths, joined onto `data_path`.
	        keys (list[str]): Lmdb keys, one per entry of `img_path_list`.
	        batch (int): After processing batch images, lmdb commits.
	            Default: 5000.
	        compress_level (int): Compress level when encoding images. Default: 1.
	        multiprocessing_read (bool): Whether use multiprocessing to read all
	            the images to memory. Default: False.
	        n_thread (int): Number of worker processes for multiprocessing.
	        map_size (int | None): Map size for lmdb env. If None, use ten times
	            the encoded size of the first image multiplied by the number of
	            images. Default: None

	    Raises:
	        AssertionError: If `img_path_list` and `keys` differ in length.
	        ValueError: If `lmdb_path` does not end with '.lmdb'.
	        SystemExit: If `lmdb_path` already exists.
	        RuntimeError: If `multiprocessing_read` is True and at least one
	            worker failed to read its image.
	    """

	    assert len(img_path_list) == len(keys), ('img_path_list and keys should have the same length, '
	                                             f'but got {len(img_path_list)} and {len(keys)}')
	    print(f'Create lmdb for {data_path}, save to {lmdb_path}...')
	    print(f'Totoal images: {len(img_path_list)}')
	    if not lmdb_path.endswith('.lmdb'):
	        raise ValueError("lmdb_path must end with '.lmdb'.")
	    if osp.exists(lmdb_path):
	        print(f'Folder {lmdb_path} already exists. Exit.')
	        sys.exit(1)

	    if multiprocessing_read:
	        # read all the images to memory (multiprocessing)
	        dataset = {}  # keyed by lmdb key, so completion order does not matter
	        shapes = {}
	        read_errors = []
	        print(f'Read images with multiprocessing, #thread: {n_thread} ...')
	        pbar = tqdm(total=len(img_path_list), unit='image')

	        def callback(arg):
	            """Store one worker result and advance the progress bar."""
	            key, dataset[key], shapes[key] = arg
	            pbar.update(1)
	            pbar.set_description(f'Read {key}')

	        def error_callback(exc):
	            """Remember a worker failure instead of silently dropping it."""
	            read_errors.append(exc)

	        pool = Pool(n_thread)
	        for path, key in zip(img_path_list, keys):
	            pool.apply_async(
	                read_img_worker,
	                args=(osp.join(data_path, path), key, compress_level),
	                callback=callback,
	                error_callback=error_callback)
	        pool.close()
	        pool.join()
	        pbar.close()
	        if read_errors:
	            # Fail before the lmdb folder is created; otherwise the missing
	            # entry would only show up later as a KeyError on dataset[key].
	            raise RuntimeError(
	                f'Failed to read {len(read_errors)} of {len(img_path_list)} images.') from read_errors[0]
	        print(f'Finish reading {len(img_path_list)} images.')

	    # create lmdb environment
	    if map_size is None:
	        # obtain data size for one image
	        img = cv2.imread(osp.join(data_path, img_path_list[0]), cv2.IMREAD_UNCHANGED)
	        _, img_byte = cv2.imencode('.png', img, [cv2.IMWRITE_PNG_COMPRESSION, compress_level])
	        data_size_per_img = img_byte.nbytes
	        print('Data size per image is: ', data_size_per_img)
	        data_size = data_size_per_img * len(img_path_list)
	        map_size = data_size * 10

	    env = lmdb.open(lmdb_path, map_size=map_size)

	    # write data to lmdb
	    pbar = tqdm(total=len(img_path_list), unit='chunk')
	    try:
	        with open(osp.join(lmdb_path, 'meta_info.txt'), 'w') as txt_file:
	            txn = env.begin(write=True)
	            # 1-based count so a commit happens after every `batch` images
	            for count, (path, key) in enumerate(zip(img_path_list, keys), start=1):
	                pbar.update(1)
	                pbar.set_description(f'Write {key}')
	                key_byte = key.encode('ascii')
	                if multiprocessing_read:
	                    img_byte = dataset[key]
	                    h, w, c = shapes[key]
	                else:
	                    _, img_byte, img_shape = read_img_worker(osp.join(data_path, path), key, compress_level)
	                    h, w, c = img_shape

	                txn.put(key_byte, img_byte)
	                # write meta information
	                txt_file.write(f'{key}.png ({h},{w},{c}) {compress_level}\n')
	                if count % batch == 0:
	                    txn.commit()
	                    txn = env.begin(write=True)
	            # flush the last (possibly partial) batch
	            txn.commit()
	    finally:
	        # release the progress bar and the environment even if a write failed
	        pbar.close()
	        env.close()
	    print('\nFinish writing lmdb.')


	def read_img_worker(path, key, compress_level):
	    """Read one image from disk and re-encode it as PNG.

	    Args:
	        path (str): Image path.
	        key (str): Image key; returned unchanged so that asynchronous callers
	            can match the result to its request.
	        compress_level (int): Compress level when encoding images.

	    Returns:
	        tuple: ``(key, img_byte, (h, w, c))`` where ``img_byte`` is the
	        encoded PNG buffer returned by ``cv2.imencode`` and ``c`` is 1 for a
	        two-dimensional (single channel) image.

	    Raises:
	        OSError: If the image cannot be read or cannot be encoded.
	    """

	    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
	    if img is None:
	        # cv2.imread reports failure by returning None instead of raising
	        raise OSError(f'Failed to read image: {path}')
	    if img.ndim == 2:
	        h, w = img.shape
	        c = 1
	    else:
	        h, w, c = img.shape
	    success, img_byte = cv2.imencode('.png', img, [cv2.IMWRITE_PNG_COMPRESSION, compress_level])
	    if not success:
	        raise OSError(f'Failed to encode image as PNG: {path}')
	    return (key, img_byte, (h, w, c))


	class LmdbMaker():
	    """LMDB Maker.

	    Incrementally writes encoded images into an lmdb folder and records one
	    line per image in ``meta_info.txt`` inside that folder. Can be used as a
	    context manager, in which case :meth:`close` is called on exit.

	    Args:
	        lmdb_path (str): Lmdb save path. Must end with '.lmdb' and must not
	            exist yet.
	        map_size (int): Map size for lmdb env. Default: 1024 ** 4, 1TB.
	        batch (int): After processing batch images, lmdb commits.
	            Default: 5000.
	        compress_level (int): Compress level recorded in the meta file.
	            Default: 1.

	    Raises:
	        ValueError: If `lmdb_path` does not end with '.lmdb'.
	        SystemExit: If `lmdb_path` already exists.
	    """

	    # Class-level default so close() also works on instances that were
	    # created without running __init__.
	    _closed = False

	    def __init__(self, lmdb_path, map_size=1024**4, batch=5000, compress_level=1):
	        if not lmdb_path.endswith('.lmdb'):
	            raise ValueError("lmdb_path must end with '.lmdb'.")
	        if osp.exists(lmdb_path):
	            print(f'Folder {lmdb_path} already exists. Exit.')
	            sys.exit(1)

	        self.lmdb_path = lmdb_path
	        self.batch = batch
	        self.compress_level = compress_level
	        self.counter = 0
	        self.env = lmdb.open(lmdb_path, map_size=map_size)
	        try:
	            self.txn = self.env.begin(write=True)
	            self.txt_file = open(osp.join(lmdb_path, 'meta_info.txt'), 'w')
	        except BaseException:
	            # do not leave the environment open if setup fails half-way
	            self.env.close()
	            raise

	    def __enter__(self):
	        """Return the maker itself for use in a ``with`` statement."""
	        return self

	    def __exit__(self, exc_type, exc_value, traceback):
	        """Close the maker on leaving the ``with`` block.

	        Exceptions raised inside the block are not suppressed.
	        """
	        self.close()

	    def put(self, img_byte, key, img_shape):
	        """Store one encoded image and append its meta information.

	        Args:
	            img_byte: Encoded image data stored as the lmdb value.
	            key (str): Lmdb key; must be ASCII-encodable.
	            img_shape (tuple[int]): Image shape as ``(h, w, c)``.
	        """
	        self.counter += 1
	        key_byte = key.encode('ascii')
	        self.txn.put(key_byte, img_byte)
	        # write meta information
	        h, w, c = img_shape
	        self.txt_file.write(f'{key}.png ({h},{w},{c}) {self.compress_level}\n')
	        # commit after every `batch` images and start a fresh transaction
	        if self.counter % self.batch == 0:
	            self.txn.commit()
	            self.txn = self.env.begin(write=True)

	    def close(self):
	        """Commit pending writes and release the environment and meta file.

	        Calling it again after the first call is a no-op. The environment
	        and the meta file are closed even if the final commit raises.
	        """
	        if self._closed:
	            return
	        self._closed = True
	        try:
	            self.txn.commit()
	        finally:
	            try:
	                self.env.close()
	            finally:
	                self.txt_file.close()