"""This file contains functions that are used to perform data augmentation."""
import cv2
import numpy as np
import torch
from loguru import logger
def get_transform(center, scale, res, rot=0):
    """Generate transformation matrix.

    Builds the 3x3 homogeneous matrix that maps pixel coordinates of the
    source image into a crop of size ``res``. The cropped region is a square
    with a side of ``200 * scale`` pixels centred on ``center``.

    Args:
        center: ``(x, y)`` centre of the region in the source image.
        scale: Size of the region in units of 200 pixels.
        res: ``(height, width)`` of the target crop (``res[1]`` scales x,
            ``res[0]`` scales y).
        rot: Rotation in degrees, applied about the centre of the crop.

    Returns:
        A ``(3, 3)`` float64 ``numpy`` array.
    """
    h = 200 * scale
    t = np.zeros((3, 3))
    t[0, 0] = float(res[1]) / h
    t[1, 1] = float(res[0]) / h
    t[0, 2] = res[1] * (-float(center[0]) / h + 0.5)
    t[1, 2] = res[0] * (-float(center[1]) / h + 0.5)
    t[2, 2] = 1
    if rot != 0:
        rot = -rot  # To match direction of rotation from cropping
        rot_mat = np.zeros((3, 3))
        rot_rad = rot * np.pi / 180
        sn, cs = np.sin(rot_rad), np.cos(rot_rad)
        rot_mat[0, :2] = [cs, -sn]
        rot_mat[1, :2] = [sn, cs]
        rot_mat[2, 2] = 1
        # Need to rotate around center: shift the crop centre to the origin,
        # rotate, then shift back.
        t_mat = np.eye(3)
        t_mat[0, 2] = -res[1] / 2
        t_mat[1, 2] = -res[0] / 2
        t_inv = t_mat.copy()
        t_inv[:2, 2] *= -1
        t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
    return t
def transform(pt, center, scale, res, invert=0, rot=0):
    """Transform pixel location to different reference.

    Args:
        pt: ``(x, y)`` pixel location to map.
        center: Region centre passed on to ``get_transform``.
        scale: Region scale passed on to ``get_transform``.
        res: Target resolution passed on to ``get_transform``.
        invert: If truthy, apply the inverse of the matrix (crop -> image
            instead of image -> crop).
        rot: Rotation in degrees passed on to ``get_transform``.

    Returns:
        Integer ``numpy`` array with the two transformed coordinates. The
        result is truncated to ``int``, so sub-pixel precision is lost.
    """
    t = get_transform(center, scale, res, rot=rot)
    if invert:
        t = np.linalg.inv(t)
    # The point is shifted by -1 before the matrix product and by +1 after
    # it (presumably a 1-based pixel convention -- confirm with callers).
    new_pt = np.array([pt[0] - 1, pt[1] - 1, 1.0]).T
    new_pt = np.dot(t, new_pt)
    return new_pt[:2].astype(int) + 1
def rotate_2d(pt_2d, rot_rad):
    """Rotate a 2D point about the origin.

    Args:
        pt_2d: Indexable holding the x and y coordinate at positions 0 and 1.
        rot_rad: Rotation angle in radians.

    Returns:
        float32 ``numpy`` array ``[x*cos - y*sin, x*sin + y*cos]``.
    """
    sin_a = np.sin(rot_rad)
    cos_a = np.cos(rot_rad)
    px, py = pt_2d[0], pt_2d[1]
    rotated = [px * cos_a - py * sin_a, px * sin_a + py * cos_a]
    return np.array(rotated, dtype=np.float32)
def gen_trans_from_patch_cv(
    c_x, c_y, src_width, src_height, dst_width, dst_height, scale, rot, inv=False
):
    """Build the 2x3 affine matrix between a source box and an output patch.

    The matrix is defined by three point correspondences: the box centre,
    the midpoint of its lower edge and the midpoint of its right edge, each
    mapped onto the matching point of the destination patch.

    Args:
        c_x: x coordinate of the source box centre.
        c_y: y coordinate of the source box centre.
        src_width: Width of the source box before scaling.
        src_height: Height of the source box before scaling.
        dst_width: Width of the destination patch.
        dst_height: Height of the destination patch.
        scale: Factor applied to the source box size.
        rot: Rotation of the source box in degrees.
        inv: If true, return the patch -> source transform instead of the
            source -> patch transform.

    Returns:
        float32 ``numpy`` array produced by ``cv2.getAffineTransform``.
    """
    # augment size with scale
    src_w = src_width * scale
    src_h = src_height * scale
    src_center = np.array([c_x, c_y], dtype=np.float32)

    # augment rotation
    rot_rad = np.pi * rot / 180
    src_downdir = rotate_2d(np.array([0, src_h * 0.5], dtype=np.float32), rot_rad)
    src_rightdir = rotate_2d(np.array([src_w * 0.5, 0], dtype=np.float32), rot_rad)

    dst_w = dst_width
    dst_h = dst_height
    dst_center = np.array([dst_w * 0.5, dst_h * 0.5], dtype=np.float32)
    dst_downdir = np.array([0, dst_h * 0.5], dtype=np.float32)
    dst_rightdir = np.array([dst_w * 0.5, 0], dtype=np.float32)

    src = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = src_center
    src[1, :] = src_center + src_downdir
    src[2, :] = src_center + src_rightdir

    dst = np.zeros((3, 2), dtype=np.float32)
    dst[0, :] = dst_center
    dst[1, :] = dst_center + dst_downdir
    dst[2, :] = dst_center + dst_rightdir

    # The two branches are mutually exclusive; without the ``else`` the
    # inverse transform would be overwritten by the forward one.
    # ``src`` and ``dst`` are already float32, as getAffineTransform needs.
    if inv:
        trans = cv2.getAffineTransform(dst, src)
    else:
        trans = cv2.getAffineTransform(src, dst)

    trans = trans.astype(np.float32)
    return trans
def generate_patch_image(
# Crop, rotate and resize a region of an image with an affine warp.
#
# NOTE(review): the parameter list that should follow the line above is
# missing from this text. The body reads the names cvimg, bbox, scale, rot,
# out_shape, interpl_strategy, gauss_kernel and gauss_sigma, so those are
# presumably the parameters; their order and defaults cannot be determined
# from here -- restore the signature from version control.
#
# Returns the 3-tuple (img_patch, trans, inv_trans): the warped patch cast to
# float32 and the two matrices produced by gen_trans_from_patch_cv.
# Work on a copy of the input image.
img = cvimg.copy()
# bbox is indexed as (centre x, centre y, width, height); these values are
# passed to gen_trans_from_patch_cv as c_x, c_y, src_width, src_height.
bb_c_x = float(bbox[0])
bb_c_y = float(bbox[1])
bb_width = float(bbox[2])
bb_height = float(bbox[3])
# Forward transform; out_shape[1] is used as the destination width and
# out_shape[0] as the destination height.
# NOTE(review): the closing parenthesis of this call is missing.
trans = gen_trans_from_patch_cv(
bb_c_x, bb_c_y, bb_width, bb_height, out_shape[1], out_shape[0], scale, rot
# anti-aliasing
# The whole image is blurred with a square gauss_kernel x gauss_kernel
# Gaussian before it is warped.
blur = cv2.GaussianBlur(img, (gauss_kernel, gauss_kernel), gauss_sigma)
# Warp to an output of size (out_shape[1], out_shape[0]) using the
# interpolation flag supplied by the caller.
# NOTE(review): the closing parenthesis of this call is missing.
img_patch = cv2.warpAffine(
blur, trans, (int(out_shape[1]), int(out_shape[0])), flags=interpl_strategy
img_patch = img_patch.astype(np.float32)
# NOTE(review): the arguments of this call are missing; presumably the same
# arguments as the forward call plus inv=True -- confirm before restoring.
inv_trans = gen_trans_from_patch_cv(
return img_patch, trans, inv_trans
def augm_params(is_train, flip_prob, noise_factor, rot_factor, scale_factor):
    """Get augmentation parameters.

    Args:
        is_train: When falsy, the neutral parameters are returned and no
            random numbers are drawn.
        flip_prob: Probability of requesting a flip. Flipping is not
            supported, so a drawn flip raises ``AssertionError``.
        noise_factor: Half-width of the per-channel pixel-noise range.
        rot_factor: Standard deviation of the rotation, in the unit used by
            the consumers of ``rot``; the sample is clamped to
            ``[-2 * rot_factor, 2 * rot_factor]``.
        scale_factor: Standard deviation of the scale around 1; the sample is
            clamped to ``[1 - scale_factor, 1 + scale_factor]``.

    Returns:
        dict with the keys ``"flip"``, ``"pn"``, ``"rot"`` and ``"sc"``.

    Raises:
        AssertionError: If a flip is drawn while ``is_train`` is truthy.
    """
    flip = 0  # flipping
    pn = np.ones(3)  # per channel pixel-noise
    rot = 0  # rotation
    sc = 1  # scaling
    if is_train:
        # We flip with probability flip_prob
        if np.random.uniform() <= flip_prob:
            flip = 1
            # Raised explicitly so the check is not stripped under ``-O``.
            raise AssertionError("Flipping not supported")

        # Each channel is multiplied with a number
        # in the area [1-noise_factor, 1+noise_factor]
        pn = np.random.uniform(1 - noise_factor, 1 + noise_factor, 3)

        # The rotation is a number in the area [-2*rot_factor, 2*rot_factor]
        rot = min(
            2 * rot_factor,
            max(
                -2 * rot_factor,
                np.random.randn() * rot_factor,
            ),
        )

        # The scale is multiplied with a number
        # in the area [1-scale_factor, 1+scale_factor]
        sc = min(
            1 + scale_factor,
            max(
                1 - scale_factor,
                np.random.randn() * scale_factor + 1,
            ),
        )
        # but the rotation is zero with probability 3/5
        if np.random.uniform() <= 0.6:
            rot = 0

    augm_dict = {}
    augm_dict["flip"] = flip
    augm_dict["pn"] = pn
    augm_dict["rot"] = rot
    augm_dict["sc"] = sc
    return augm_dict
def rgb_processing(is_train, rgb_img, center, bbox_dim, augm_dict, img_res):
# Crop an RGB image around `center`, apply per-channel pixel noise and return
# it channel-first, divided by 255.
# `is_train` is not read anywhere in the visible body.
# Augmentation parameters: rotation, scale factor and per-channel noise.
rot = augm_dict["rot"]
sc = augm_dict["sc"]
pn = augm_dict["pn"]
# Augmented box scale; one scale unit corresponds to 200 pixels.
scale = sc * bbox_dim
crop_dim = int(scale * 200)
# faster cropping!!
# NOTE(review): several arguments of this call and its closing parenthesis
# are missing from this text. Only the square box
# [center x, center y, crop_dim, crop_dim] and the [img_res, img_res] output
# shape are visible. `rot` is otherwise unused, so it is presumably passed
# here. generate_patch_image returns a 3-tuple while the lines below index
# the result as an array, so presumably its first element is selected --
# confirm against version control.
rgb_img = generate_patch_image(
[center[0], center[1], crop_dim, crop_dim],
[img_res, img_res],
# in the rgb image we add pixel noise in a channel-wise manner
# Each channel is multiplied by its noise factor and clamped to [0, 255].
rgb_img[:, :, 0] = np.minimum(255.0, np.maximum(0.0, rgb_img[:, :, 0] * pn[0]))
rgb_img[:, :, 1] = np.minimum(255.0, np.maximum(0.0, rgb_img[:, :, 1] * pn[1]))
rgb_img[:, :, 2] = np.minimum(255.0, np.maximum(0.0, rgb_img[:, :, 2] * pn[2]))
# Move the last axis (channels) to the front and divide by 255.
rgb_img = np.transpose(rgb_img.astype("float32"), (2, 0, 1)) / 255.0
return rgb_img
def transform_kp2d(kp2d, bbox):
    """Map 2D keypoints into a fixed 1000 px window around a bounding box.

    The window is a square of side ``1.5 * 200 * scale`` pixels centred on
    ``(cx, cy)``; its top-left corner becomes the origin and it is resized
    so that its side measures 1000 pixels.

    Args:
        kp2d: 2-D ``numpy`` array whose first two columns are x and y.
        bbox: ``(cx, cy, scale)`` in the original image space; ``scale`` is
            normalized (one unit corresponds to 200 px).

    Returns:
        A copy of ``kp2d`` with the first two columns transformed; the input
        array is left untouched.
    """
    assert isinstance(kp2d, np.ndarray)
    assert len(kp2d.shape) == 2

    center_x, center_y, norm_scale = bbox
    side_px = 200 * norm_scale  # to px
    zoom = 1000 / (1.5 * side_px)  # 1000 px is the side of the output window
    half_window = 1.5 / 2 * side_px

    cropped = np.copy(kp2d)
    for axis, origin in enumerate((center_x, center_y)):
        cropped[:, axis] -= origin - half_window
        cropped[:, axis] *= zoom
    return cropped
def j2d_processing(kp, center, bbox_dim, augm_dict, img_res):
    """Process gt 2D keypoints and apply all augmentation transforms.

    Args:
        kp: ``numpy`` array with one keypoint per row and three columns
            (``normalize_kp2d_np`` asserts ``kp.shape[1] == 3``); the first
            two columns are x and y. The x/y columns are overwritten in
            place.
        center: Box centre forwarded to ``transform``.
        bbox_dim: Box scale before augmentation.
        augm_dict: Augmentation parameters; ``"sc"`` and ``"rot"`` are read.
        img_res: Side of the square output patch in pixels.

    Returns:
        float32 array of the same shape with x/y normalized by
        ``normalize_kp2d_np``. ``transform`` truncates to integers, so the
        keypoints are quantized to whole pixels before normalization.
    """
    scale = augm_dict["sc"] * bbox_dim
    rot = augm_dict["rot"]

    nparts = kp.shape[0]
    for i in range(nparts):
        # +1 because ``transform`` subtracts 1 from the point it is given.
        kp[i, 0:2] = transform(
            kp[i, 0:2] + 1,
            center,
            scale,
            [img_res, img_res],
            rot=rot,
        )
    # convert to normalized coordinates
    kp = normalize_kp2d_np(kp, img_res)
    kp = kp.astype("float32")
    return kp
def pose_processing(pose, augm_dict):
    """Process SMPL theta parameters and apply all augmentation transforms.

    Args:
        pose: Pose vector; its first three entries are passed through
            ``rot_aa`` and written back in place.
        augm_dict: Augmentation parameters; only ``"rot"`` is read.

    Returns:
        The pose converted to float32.
    """
    rotation = augm_dict["rot"]
    # Only the leading three entries are rotated; the rest of the pose
    # vector is left as it is.
    pose[:3] = rot_aa(pose[:3], rotation)
    return pose.astype("float32")
def rot_aa(aa, rot):
    """Rotate axis angle parameters.

    Args:
        aa: Axis-angle rotation vector accepted by ``cv2.Rodrigues``.
        rot: In-plane rotation in degrees; the rotation is applied with the
            opposite sign, about the z axis.

    Returns:
        The rotated axis-angle vector as a 1-D ``numpy`` array.
    """
    # Rotation about the z axis by -rot degrees; the trigonometric terms are
    # computed once and reused.
    angle = np.deg2rad(-rot)
    cs, sn = np.cos(angle), np.sin(angle)
    R = np.array(
        [
            [cs, -sn, 0],
            [sn, cs, 0],
            [0, 0, 1],
        ]
    )
    # find the rotation of the body in camera frame
    per_rdg, _ = cv2.Rodrigues(aa)
    # apply the global rotation to the global orientation
    resrot, _ = cv2.Rodrigues(np.dot(R, per_rdg))
    # Rodrigues returns a column vector; flatten it to a 1-D array.
    aa = (resrot.T)[0]
    return aa
def denormalize_images(images):
    """Undo per-channel mean/std normalization of an image batch.

    Args:
        images: Tensor broadcastable against shape ``(1, 3, 1, 1)``, i.e.
            with three channels on axis 1.

    Returns:
        A new tensor ``images * std + mean``, using the per-channel constants
        below; the input tensor is not modified.
    """
    images = images * torch.tensor([0.229, 0.224, 0.225], device=images.device).reshape(
        1, 3, 1, 1
    )
    images = images + torch.tensor([0.485, 0.456, 0.406], device=images.device).reshape(
        1, 3, 1, 1
    )
    return images
def read_img(img_fn, dummy_shape):
    """Read an image, falling back to a blank image when loading fails.

    Args:
        img_fn: Path of the image file.
        dummy_shape: Shape of the all-zero float32 placeholder returned when
            the image cannot be loaded.

    Returns:
        ``(image, ok)`` where ``ok`` is ``True`` when the file was loaded and
        ``False`` when the placeholder is returned.
    """
    try:
        cv_img = _read_img(img_fn)
    except Exception:
        # Best effort: any loading failure is logged and replaced by a
        # placeholder so the caller can continue.
        logger.warning(f"Unable to load {img_fn}")
        cv_img = np.zeros(dummy_shape, dtype=np.float32)
        return cv_img, False
    return cv_img, True
def _read_img(img_fn):
    """Load an image file and return it as a float32 array in RGB order."""
    bgr = cv2.imread(img_fn)
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    return rgb.astype(np.float32)
def normalize_kp2d_np(kp2d: np.ndarray, img_res):
    """Map pixel keypoints from ``[0, img_res]`` to ``[-1, 1]``.

    Args:
        kp2d: Array with three columns; only the first two are rescaled.
        img_res: Image resolution in pixels.

    Returns:
        A copy of ``kp2d`` with normalized x/y; the third column is kept.
    """
    assert kp2d.shape[1] == 3
    result = kp2d.copy()
    rescaled_xy = 2.0 * kp2d[:, :2] / img_res
    result[:, :2] = rescaled_xy - 1.0
    return result
def unnormalize_2d_kp(kp_2d_np: np.ndarray, res):
    """Map keypoints from ``[-1, 1]`` back to pixels in ``[0, res]``.

    Args:
        kp_2d_np: Array with three columns; only the first two are rescaled.
        res: Image resolution in pixels.

    Returns:
        A copy of the input with x/y in pixels; the third column is kept.
    """
    assert kp_2d_np.shape[1] == 3
    pixels = np.copy(kp_2d_np)
    shifted_xy = pixels[:, :2] + 1
    pixels[:, :2] = 0.5 * res * shifted_xy
    return pixels
def normalize_kp2d(kp2d: torch.Tensor, img_res):
    """Map pixel keypoints from ``[0, img_res]`` to ``[-1, 1]``.

    Args:
        kp2d: 3-D tensor; the first two entries of the last axis are x and y.
        img_res: Image resolution in pixels.

    Returns:
        A clone of ``kp2d`` with normalized x/y; other entries are kept.
    """
    assert len(kp2d.shape) == 3
    result = kp2d.clone()
    rescaled_xy = 2.0 * kp2d[..., :2] / img_res
    result[..., :2] = rescaled_xy - 1.0
    return result
def unormalize_kp2d(kp2d_normalized: torch.Tensor, img_res):
    """Map keypoints from ``[-1, 1]`` back to pixels in ``[0, img_res]``.

    Args:
        kp2d_normalized: 3-D tensor whose last axis has exactly two entries.
        img_res: Image resolution in pixels.

    Returns:
        A new tensor with the keypoints in pixels.
    """
    assert len(kp2d_normalized.shape) == 3
    assert kp2d_normalized.shape[2] == 2
    shifted = kp2d_normalized.clone() + 1
    return 0.5 * img_res * shifted
def get_wp_intrix(fixed_focal: float, img_res):
    """Construct weak-perspective intrinsics for a square patch.

    Args:
        fixed_focal: Focal length used for both axes.
        img_res: Side of the patch in pixels; the principal point is placed
            at ``img_res // 2`` on both axes.

    Returns:
        ``(3, 3)`` tensor ``[[f, 0, c], [0, f, c], [0, 0, 1]]``.
    """
    principal_point = np.array([img_res // 2, img_res // 2])
    k_mat = torch.zeros([3, 3])
    # Focal length on the diagonal.
    k_mat[0, 0] = k_mat[1, 1] = fixed_focal
    # Principal point in the last column.
    k_mat[0, 2] = principal_point[0]
    k_mat[1, 2] = principal_point[1]
    k_mat[2, 2] = 1.0
    return k_mat
def get_aug_intrix(
    intrx, fixed_focal: float, img_res, use_gt_k, bbox_cx, bbox_cy, scale
):
    """Return camera intrinsics under scaling.

    If ``use_gt_k``, the GT K is used, but scaled based on the amount of
    scaling in the patch. Else, we construct an intrinsic camera with a fixed
    focal length and fixed camera center.

    Args:
        intrx: 3x3 ground-truth intrinsics in full image space. Only read
            when ``use_gt_k`` is truthy, in which case it is updated in
            place and returned.
        fixed_focal: Focal length of the weak-perspective camera.
        img_res: Side of the square patch in pixels.
        use_gt_k: Whether to use (and rescale) the ground-truth intrinsics.
        bbox_cx: x coordinate of the bounding box centre.
        bbox_cy: y coordinate of the bounding box centre.
        scale: Bounding box scale; the box side is ``scale * 200`` pixels.

    Returns:
        The 3x3 intrinsics matching the patch.
    """
    if not use_gt_k:
        # construct weak perspective on patch
        return get_wp_intrix(fixed_focal, img_res)

    # update the GT intrinsics (full image space)
    # such that it matches the scale of the patch
    dim = scale * 200.0  # bbox size
    k_scale = float(img_res) / dim  # resized_dim / bbox_size in full image space

    # x1 and y1: top-left corner of bbox
    # intrinsics after data augmentation
    #   fx' = k*fx
    #   fy' = k*fy
    #   cx' = k*(cx - x1)
    #   cy' = k*(cy - y1)
    intrx[0, 0] *= k_scale  # k*fx
    intrx[1, 1] *= k_scale  # k*fy
    intrx[0, 2] -= bbox_cx - dim / 2.0
    intrx[1, 2] -= bbox_cy - dim / 2.0
    intrx[0, 2] *= k_scale
    intrx[1, 2] *= k_scale
    return intrx