"""Useful geometric operations.

E.g. perspective projection and a differentiable Rodrigues formula.

Parts of the code are taken from an external project.
NOTE(review): the source reference is missing from this header.
"""

import numpy as np
import torch


def _to_numpy(value):
    """Return ``value`` as a NumPy array, detaching/moving torch tensors first."""
    if isinstance(value, torch.Tensor):
        return value.detach().cpu().numpy()
    return np.asarray(value)


def convert_perspective_to_weak_perspective(
    perspective_camera,
    focal_length,
    img_res,
):
    """Convert perspective translations [tx, ty, tz] to weak-perspective [s, tx, ty].

    The scale is ``s = 2 * focal_length / (img_res * tz + 1e-9)``; the small
    constant guards against a division by zero when ``tz`` is 0.

    Args:
        perspective_camera: tensor indexed as ``[:, 0..2]`` = (tx, ty, tz).
        focal_length: focal length (scalar, or anything broadcastable to tz).
        img_res: image resolution used to normalise the scale.

    Returns:
        Tensor with ``[s, tx, ty]`` stacked along the last dimension.
    """
    tx = perspective_camera[:, 0]
    ty = perspective_camera[:, 1]
    tz = perspective_camera[:, 2]
    scale = 2 * focal_length / (img_res * tz + 1e-9)
    return torch.stack([scale, tx, ty], dim=-1)


def perspective_to_weak_perspective_torch(
    perspective_camera,
    focal_length,
    img_res,
):
    """Convert perspective translations [tx, ty, tz] to weak-perspective [s, tx, ty].

    Same computation as ``convert_perspective_to_weak_perspective``; kept as a
    separate public name for existing callers.
    """
    return convert_perspective_to_weak_perspective(
        perspective_camera, focal_length, img_res
    )


def convert_weak_perspective_to_perspective(
    weak_perspective_camera, focal_length, img_res
):
    """Convert weak-perspective cameras [s, tx, ty] to translations [tx, ty, tz].

    The depth is ``tz = 2 * focal_length / (img_res * s + 1e-9)``. The resulting
    translation can be used in a full perspective projection.

    Args:
        weak_perspective_camera: tensor indexed as ``[:, 0..2]`` = (s, tx, ty).
        focal_length: focal length (scalar, or anything broadcastable to s).
        img_res: image resolution used to normalise the scale.

    Returns:
        Tensor with ``[tx, ty, tz]`` stacked along the last dimension.
    """
    scale = weak_perspective_camera[:, 0]
    tx = weak_perspective_camera[:, 1]
    ty = weak_perspective_camera[:, 2]
    tz = 2 * focal_length / (img_res * scale + 1e-9)
    return torch.stack([tx, ty, tz], dim=-1)


def get_default_cam_t(f, img_res):
    """Return the perspective translation of the weak-perspective camera [5, 0, 0].

    Args:
        f: focal length.
        img_res: image resolution.

    Returns:
        (1, 3) tensor ``[tx, ty, tz]``.
    """
    cam = torch.tensor([[5.0, 0.0, 0.0]])
    return convert_weak_perspective_to_perspective(cam, f, img_res)


def _solve_translation(S, joints_2d, joints_conf, focal, center):
    """Solve the confidence-weighted least-squares problem for the translation.

    For every joint the projection equations
    ``u = f * (X + tx) / (Z + tz) + o`` are rearranged into the linear form
    ``f * t_xy + (o - u) * tz = (u - o) * Z - f * XY`` and solved for
    ``(tx, ty, tz)`` via the normal equations.

    Args:
        S: (N, 3) 3D joint locations.
        joints_2d: (N, 2) 2D joint locations.
        joints_conf: (N,) per-joint confidences (their square roots weight rows).
        focal: (2,) focal lengths (fx, fy).
        center: (2,) principal point (cx, cy).

    Returns:
        (3,) float64 translation vector.

    Raises:
        numpy.linalg.LinAlgError: if the normal matrix is singular (for example
            when all confidences are zero).
    """
    # Work in float64 throughout so float32 inputs do not degrade the solve.
    S = np.asarray(S, dtype=np.float64)
    joints_flat = np.asarray(joints_2d, dtype=np.float64).reshape(-1)
    joints_conf = np.asarray(joints_conf, dtype=np.float64)
    focal = np.asarray(focal, dtype=np.float64)
    center = np.asarray(center, dtype=np.float64)

    num_joints = S.shape[0]

    # Interleaved per-coordinate vectors: [x0, y0, x1, y1, ...].
    Z = np.repeat(S[:, 2], 2)
    XY = S[:, 0:2].reshape(-1)
    O = np.tile(center, num_joints)
    F = np.tile(focal, num_joints)
    weight2 = np.repeat(np.sqrt(joints_conf), 2)

    # Linear system Q @ t = c.
    Q = np.stack(
        [
            F * np.tile(np.array([1, 0]), num_joints),
            F * np.tile(np.array([0, 1]), num_joints),
            O - joints_flat,
        ],
        axis=1,
    )
    c = (joints_flat - O) * Z - F * XY

    # Weighted least squares: scaling the rows is equivalent to multiplying by
    # diag(weight2) without materialising the dense (2N, 2N) matrix.
    Q = weight2[:, None] * Q
    c = weight2 * c

    # Normal equations.
    A = np.dot(Q.T, Q)
    b = np.dot(Q.T, c)

    return np.linalg.solve(A, b)


def estimate_translation_np(S, joints_2d, joints_conf, focal_length, img_size):
    """Find camera translation that brings 3D joints S closest to the 2D joints_2d.

    Input:
        S: (N, 3) 3D joint locations (e.g. N = 25)
        joints_2d: (N, 2) 2D joint locations
        joints_conf: (N,) joint confidences
        focal_length: indexable, ``focal_length[0]`` / ``[1]`` are (fx, fy)
        img_size: indexable; the optical centre is taken as
            ``(img_size[1] / 2, img_size[0] / 2)`` — presumably
            (height, width), verify against callers
    Returns:
        (3,) camera translation vector
    """
    focal = np.array([focal_length[0], focal_length[1]])
    center = np.array([img_size[1] / 2.0, img_size[0] / 2.0])
    return _solve_translation(S, joints_2d, joints_conf, focal, center)


def estimate_translation_k_np(S, joints_2d, joints_conf, K):
    """Find camera translation that brings 3D joints S closest to the 2D joints_2d.

    Same solver as ``estimate_translation_np`` but the focal lengths and the
    optical centre are read from an intrinsics matrix.

    Input:
        S: (N, 3) 3D joint locations (e.g. N = 25)
        joints_2d: (N, 2) 2D joint locations
        joints_conf: (N,) joint confidences
        K: camera intrinsics; ``K[0, 0]``, ``K[1, 1]`` are the focal lengths
            and ``K[0, 2]``, ``K[1, 2]`` the optical centre
    Returns:
        (3,) camera translation vector
    """
    K = _to_numpy(K)
    focal = np.array([K[0, 0], K[1, 1]])
    center = np.array([K[0, 2], K[1, 2]])
    return _solve_translation(S, joints_2d, joints_conf, focal, center)


def _prepare_correspondences(S, joints_2d, use_all_joints, rotation, pad_2d):
    """Turn batched torch inputs into NumPy arrays for the per-sample solver.

    Returns ``(S, joints_2d, joints_conf)`` where the confidence is the last
    channel of ``joints_2d`` (all ones when ``pad_2d`` is set).
    """
    if pad_2d:
        # joints_2d carries no confidence channel: append a column of ones.
        batch, num_pts = joints_2d.shape[:2]
        joints_2d_pad = torch.ones((batch, num_pts, 3))
        joints_2d_pad[:, :, :2] = joints_2d
        joints_2d = joints_2d_pad

    if rotation is not None:
        # Rotate every joint: out[b, k, :] = rotation[b] @ S[b, k, :].
        S = torch.einsum("bij,bkj->bki", rotation, S)

    if not use_all_joints:
        # Use only joints 25 onwards (GT joints).
        S = S[:, 25:, :]
        joints_2d = joints_2d[:, 25:, :]

    # detach() so tensors that require grad can be converted as well.
    S_np = S.detach().cpu().numpy()
    joints_np = joints_2d.detach().cpu().numpy()
    return S_np, joints_np[:, :, :-1], joints_np[:, :, -1]


def estimate_translation(
    S,
    joints_2d,
    focal_length,
    img_size,
    use_all_joints=False,
    rotation=None,
    pad_2d=False,
):
    """Find camera translation that brings 3D joints S closest to the 2D joints_2d.

    Input:
        S: (B, N, 3) 3D joint locations (e.g. N = 49)
        joints_2d: (B, N, 3) 2D joint locations and confidence, or (B, N, 2)
            locations only when ``pad_2d`` is True
        focal_length: forwarded to ``estimate_translation_np``
        img_size: forwarded to ``estimate_translation_np``
        use_all_joints: if False only joints 25 onwards are used
        rotation: optional (B, 3, 3) rotations applied to S first
        pad_2d: append a confidence of 1 to every 2D joint
    Returns:
        (B, 3) float32 camera translation vectors on the device of S
    """
    device = S.device
    S_np, joints_np, conf_np = _prepare_correspondences(
        S, joints_2d, use_all_joints, rotation, pad_2d
    )

    trans = np.zeros((S_np.shape[0], 3), dtype=np.float32)
    # Find the translation for each example in the batch.
    for i, (S_i, joints_i, conf_i) in enumerate(zip(S_np, joints_np, conf_np)):
        trans[i] = estimate_translation_np(
            S_i, joints_i, conf_i, focal_length=focal_length, img_size=img_size
        )
    return torch.from_numpy(trans).to(device)


def estimate_translation_cam(
    S, joints_2d, focal_length, img_size, use_all_joints=False, rotation=None
):
    """Find camera translation that brings 3D joints S closest to the 2D joints_2d.

    NOTE(review): the body previously attached to this signature read ``K`` and
    ``joints_conf``, neither of which is a parameter, so every call raised a
    NameError; that body now lives in ``estimate_translation_k_np``. This
    function delegates to ``estimate_translation`` with the same arguments —
    confirm that its optical-centre convention is the one intended here.

    Input:
        S: (B, N, 3) 3D joint locations
        joints_2d: (B, N, 3) 2D joint locations and confidence
    Returns:
        (B, 3) camera translation vectors
    """
    return estimate_translation(
        S,
        joints_2d,
        focal_length,
        img_size,
        use_all_joints=use_all_joints,
        rotation=rotation,
    )


def estimate_translation_k(
    S,
    joints_2d,
    K,
    use_all_joints=False,
    rotation=None,
    pad_2d=False,
):
    """Find camera translation that brings 3D joints S closest to the 2D joints_2d.

    Input:
        S: (B, N, 3) 3D joint locations (e.g. N = 49)
        joints_2d: (B, N, 3) 2D joint locations and confidence, or (B, N, 2)
            locations only when ``pad_2d`` is True
        K: per-sample camera intrinsics, indexed as ``K[i]`` (tensor or array)
        use_all_joints: if False only joints 25 onwards are used
        rotation: optional (B, 3, 3) rotations applied to S first
        pad_2d: append a confidence of 1 to every 2D joint
    Returns:
        (B, 3) float32 camera translation vectors on the device of S
    """
    device = S.device
    S_np, joints_np, conf_np = _prepare_correspondences(
        S, joints_2d, use_all_joints, rotation, pad_2d
    )
    # Convert once so intrinsics on any device (or requiring grad) are usable.
    K_np = _to_numpy(K)

    trans = np.zeros((S_np.shape[0], 3), dtype=np.float32)
    # Find the translation for each example in the batch.
    for i, (S_i, joints_i, conf_i) in enumerate(zip(S_np, joints_np, conf_np)):
        trans[i] = estimate_translation_k_np(S_i, joints_i, conf_i, K_np[i])
    return torch.from_numpy(trans).to(device)


def weak_perspective_to_perspective_torch(
    weak_perspective_camera, focal_length, img_res, min_s
):
    """Convert weak-perspective cameras [s, tx, ty] to translations [tx, ty, tz].

    The scale is clamped from below to ``min_s`` before computing
    ``tz = 2 * focal_length / (img_res * s + 1e-9)``, which bounds the depth
    from above. The result can be used in a full perspective projection.

    Args:
        weak_perspective_camera: tensor indexed as ``[:, 0..2]`` = (s, tx, ty).
        focal_length: focal length (scalar, or anything broadcastable to s).
        img_res: image resolution used to normalise the scale.
        min_s: lower bound applied to the scale.

    Returns:
        Tensor with ``[tx, ty, tz]`` stacked along the last dimension.
    """
    scale = torch.clamp(weak_perspective_camera[:, 0], min_s)
    tx = weak_perspective_camera[:, 1]
    ty = weak_perspective_camera[:, 2]
    tz = 2 * focal_length / (img_res * scale + 1e-9)
    return torch.stack([tx, ty, tz], dim=-1)