"""
Useful geometric operations, e.g. perspective projection and a differentiable
Rodrigues formula. Parts of the code are taken from
https://github.com/MandyMo/pytorch_HMR
"""
import numpy as np
import torch


def perspective_to_weak_perspective_torch(
    perspective_camera,
    focal_length,
    img_res,
):
    # Convert a perspective camera translation [tx, ty, tz] to a weak
    # perspective camera [s, tx, ty] given the bounding box size.
    # if isinstance(focal_length, torch.Tensor):
    #     focal_length = focal_length[:, 0]
    tx = perspective_camera[:, 0]
    ty = perspective_camera[:, 1]
    tz = perspective_camera[:, 2]

    weak_perspective_camera = torch.stack(
        [2 * focal_length / (img_res * tz + 1e-9), tx, ty],
        dim=-1,
    )
    return weak_perspective_camera


def convert_perspective_to_weak_perspective(
    perspective_camera,
    focal_length,
    img_res,
):
    # Convert a perspective camera translation [tx, ty, tz] to a weak
    # perspective camera [s, tx, ty] given the bounding box size.
    # if isinstance(focal_length, torch.Tensor):
    #     focal_length = focal_length[:, 0]
    weak_perspective_camera = torch.stack(
        [
            2 * focal_length / (img_res * perspective_camera[:, 2] + 1e-9),
            perspective_camera[:, 0],
            perspective_camera[:, 1],
        ],
        dim=-1,
    )
    return weak_perspective_camera


def convert_weak_perspective_to_perspective(
    weak_perspective_camera,
    focal_length,
    img_res,
):
    # Convert a weak perspective camera [s, tx, ty] to a camera translation
    # [tx, ty, tz] in 3D given the bounding box size.
    # This camera translation can be used in a full perspective projection.
    # if isinstance(focal_length, torch.Tensor):
    #     focal_length = focal_length[:, 0]
    perspective_camera = torch.stack(
        [
            weak_perspective_camera[:, 1],
            weak_perspective_camera[:, 2],
            2 * focal_length / (img_res * weak_perspective_camera[:, 0] + 1e-9),
        ],
        dim=-1,
    )
    return perspective_camera


def get_default_cam_t(f, img_res):
    # Default weak perspective camera [s, tx, ty]
    cam = torch.tensor([[5.0, 0.0, 0.0]])
    return convert_weak_perspective_to_perspective(cam, f, img_res)


def estimate_translation_np(S, joints_2d, joints_conf, focal_length, img_size):
    """Find the camera translation that brings the 3D joints S closest to
    the corresponding 2D joints.
    Input:
        S: (25, 3) 3D joint locations
        joints_2d: (25, 2) 2D joint locations
        joints_conf: (25,) confidence of the 2D joints
    Returns:
        (3,) camera translation vector
    """
    num_joints = S.shape[0]
    # focal length
    f = np.array([focal_length[0], focal_length[1]])
    # optical center; img_size is (height, width)
    center = np.array([img_size[1] / 2.0, img_size[0] / 2.0])

    # transformations
    Z = np.reshape(np.tile(S[:, 2], (2, 1)).T, -1)
    XY = np.reshape(S[:, 0:2], -1)
    O = np.tile(center, num_joints)
    F = np.tile(f, num_joints)
    weight2 = np.reshape(np.tile(np.sqrt(joints_conf), (2, 1)).T, -1)

    # least squares
    Q = np.array(
        [
            F * np.tile(np.array([1, 0]), num_joints),
            F * np.tile(np.array([0, 1]), num_joints),
            O - np.reshape(joints_2d, -1),
        ]
    ).T
    c = (np.reshape(joints_2d, -1) - O) * Z - F * XY

    # weighted least squares
    W = np.diagflat(weight2)
    Q = np.dot(W, Q)
    c = np.dot(W, c)

    # square matrix
    A = np.dot(Q.T, Q)
    b = np.dot(Q.T, c)

    # solution
    trans = np.linalg.solve(A, b)
    return trans
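
# The sketch below is an illustrative usage example, not part of the original
# module: it round-trips a weak perspective camera [s, tx, ty] through the
# perspective conversions above. The focal length (5000.) and crop resolution
# (224) are assumed values, not constants defined in this file.
def _example_weak_perspective_round_trip():
    cam = torch.tensor([[0.9, 0.1, -0.2]])  # [s, tx, ty]
    # [tx, ty, tz] with tz = 2 * f / (img_res * s)
    cam_t = convert_weak_perspective_to_perspective(cam, 5000.0, 224)
    # recovers [s, tx, ty] up to the 1e-9 stabilizer
    cam_s = convert_perspective_to_weak_perspective(cam_t, 5000.0, 224)
    return cam_t, cam_s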
def estimate_translation(
    S,
    joints_2d,
    focal_length,
    img_size,
    use_all_joints=False,
    rotation=None,
    pad_2d=False,
):
    """Find the camera translation that brings the 3D joints S closest to
    the corresponding 2D joints.
    Input:
        S: (B, 49, 3) 3D joint locations
        joints_2d: (B, 49, 3) 2D joint locations and confidence
    Returns:
        (B, 3) camera translation vectors
    """
    if pad_2d:
        batch, num_pts = joints_2d.shape[:2]
        joints_2d_pad = torch.ones((batch, num_pts, 3))
        joints_2d_pad[:, :, :2] = joints_2d
        joints_2d_pad = joints_2d_pad.to(joints_2d.device)
        joints_2d = joints_2d_pad

    device = S.device

    if rotation is not None:
        S = torch.einsum("bij,bkj->bki", rotation, S)

    # Use only joints 25:49 (GT joints)
    if use_all_joints:
        S = S.cpu().numpy()
        joints_2d = joints_2d.cpu().numpy()
    else:
        S = S[:, 25:, :].cpu().numpy()
        joints_2d = joints_2d[:, 25:, :].cpu().numpy()

    joints_conf = joints_2d[:, :, -1]
    joints_2d = joints_2d[:, :, :-1]

    trans = np.zeros((S.shape[0], 3), dtype=np.float32)
    # Find the translation for each example in the batch
    for i in range(S.shape[0]):
        S_i = S[i]
        joints_i = joints_2d[i]
        conf_i = joints_conf[i]
        trans[i] = estimate_translation_np(
            S_i, joints_i, conf_i, focal_length=focal_length, img_size=img_size
        )
    return torch.from_numpy(trans).to(device)


def estimate_translation_cam(
    S, joints_2d, focal_length, img_size, use_all_joints=False, rotation=None
):
    """Find the camera translation that brings the 3D joints S closest to
    the corresponding 2D joints.
    Input:
        S: (B, 49, 3) 3D joint locations
        joints_2d: (B, 49, 3) 2D joint locations and confidence
    Returns:
        (B, 3) camera translation vectors
    """

    def estimate_translation_np(S, joints_2d, joints_conf, focal_length, img_size):
        """Find the camera translation that brings the 3D joints S closest to
        the corresponding 2D joints.
        Input:
            S: (25, 3) 3D joint locations
            joints_2d: (25, 2) 2D joint locations
            joints_conf: (25,) confidence of the 2D joints
        Returns:
            (3,) camera translation vector
        """
        num_joints = S.shape[0]
        # focal length
        f = np.array([focal_length[0], focal_length[1]])
        # optical center; note that, unlike the module-level
        # estimate_translation_np, img_size is taken as (width, height) here
        center = np.array([img_size[0] / 2.0, img_size[1] / 2.0])

        # transformations
        Z = np.reshape(np.tile(S[:, 2], (2, 1)).T, -1)
        XY = np.reshape(S[:, 0:2], -1)
        O = np.tile(center, num_joints)
        F = np.tile(f, num_joints)
        weight2 = np.reshape(np.tile(np.sqrt(joints_conf), (2, 1)).T, -1)

        # least squares
        Q = np.array(
            [
                F * np.tile(np.array([1, 0]), num_joints),
                F * np.tile(np.array([0, 1]), num_joints),
                O - np.reshape(joints_2d, -1),
            ]
        ).T
        c = (np.reshape(joints_2d, -1) - O) * Z - F * XY

        # weighted least squares
        W = np.diagflat(weight2)
        Q = np.dot(W, Q)
        c = np.dot(W, c)

        # square matrix
        A = np.dot(Q.T, Q)
        b = np.dot(Q.T, c)

        # solution
        trans = np.linalg.solve(A, b)
        return trans

    device = S.device

    if rotation is not None:
        S = torch.einsum("bij,bkj->bki", rotation, S)

    # Use only joints 25:49 (GT joints)
    if use_all_joints:
        S = S.cpu().numpy()
        joints_2d = joints_2d.cpu().numpy()
    else:
        S = S[:, 25:, :].cpu().numpy()
        joints_2d = joints_2d[:, 25:, :].cpu().numpy()

    joints_conf = joints_2d[:, :, -1]
    joints_2d = joints_2d[:, :, :-1]

    trans = np.zeros((S.shape[0], 3), dtype=np.float32)
    # Find the translation for each example in the batch
    for i in range(S.shape[0]):
        S_i = S[i]
        joints_i = joints_2d[i]
        conf_i = joints_conf[i]
        trans[i] = estimate_translation_np(
            S_i, joints_i, conf_i, focal_length=focal_length, img_size=img_size
        )
    return torch.from_numpy(trans).to(device)
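
# The sketch below is an illustrative usage example, not part of the original
# module: it solves for per-sample camera translations from a random 49-joint
# batch. The focal length (5000., 5000.) and image size (224, 224) are assumed
# values; real callers would pass detector keypoints with confidences.
def _example_estimate_translation():
    S = torch.randn(2, 49, 3)               # (B, 49, 3) 3D joints
    joints_2d = torch.rand(2, 49, 3) * 224  # (B, 49, 3) pixel coords + conf
    joints_2d[:, :, 2] = 1.0                # full confidence for every joint
    return estimate_translation(
        S, joints_2d, focal_length=(5000.0, 5000.0), img_size=(224, 224)
    )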
def get_coord_maps(size=56):
    # Return a (1, 2, size, size) CoordConv-style grid of x/y coordinates,
    # normalized to [-1, 1].
    xx_ones = torch.ones([1, size], dtype=torch.int32)
    xx_ones = xx_ones.unsqueeze(-1)

    xx_range = torch.arange(size, dtype=torch.int32).unsqueeze(0)
    xx_range = xx_range.unsqueeze(1)

    xx_channel = torch.matmul(xx_ones, xx_range)
    xx_channel = xx_channel.unsqueeze(-1)

    yy_ones = torch.ones([1, size], dtype=torch.int32)
    yy_ones = yy_ones.unsqueeze(1)

    yy_range = torch.arange(size, dtype=torch.int32).unsqueeze(0)
    yy_range = yy_range.unsqueeze(-1)

    yy_channel = torch.matmul(yy_range, yy_ones)
    yy_channel = yy_channel.unsqueeze(-1)

    xx_channel = xx_channel.permute(0, 3, 1, 2)
    yy_channel = yy_channel.permute(0, 3, 1, 2)

    xx_channel = xx_channel.float() / (size - 1)
    yy_channel = yy_channel.float() / (size - 1)

    xx_channel = xx_channel * 2 - 1
    yy_channel = yy_channel * 2 - 1

    out = torch.cat([xx_channel, yy_channel], dim=1)
    return out


def look_at(eye, at=np.array([0, 0, 0]), up=np.array([0, 0, 1]), eps=1e-5):
    # Build batched camera rotation matrices whose z-axis points from `at`
    # toward `eye`; eps guards the normalizations against zero-length axes.
    at = at.astype(float).reshape(1, 3)
    up = up.astype(float).reshape(1, 3)
    eye = eye.reshape(-1, 3)
    up = up.repeat(eye.shape[0] // up.shape[0], axis=0)
    eps = np.array([eps]).reshape(1, 1).repeat(up.shape[0], axis=0)

    z_axis = eye - at
    z_axis /= np.max(np.stack([np.linalg.norm(z_axis, axis=1, keepdims=True), eps]))

    x_axis = np.cross(up, z_axis)
    x_axis /= np.max(np.stack([np.linalg.norm(x_axis, axis=1, keepdims=True), eps]))

    y_axis = np.cross(z_axis, x_axis)
    y_axis /= np.max(np.stack([np.linalg.norm(y_axis, axis=1, keepdims=True), eps]))

    r_mat = np.concatenate(
        (x_axis.reshape(-1, 3, 1), y_axis.reshape(-1, 3, 1), z_axis.reshape(-1, 3, 1)),
        axis=2,
    )
    return r_mat


def to_sphere(u, v):
    # Map (u, v) in [0, 1]^2 to a point on the unit sphere.
    theta = 2 * np.pi * u
    phi = np.arccos(1 - 2 * v)
    cx = np.sin(phi) * np.cos(theta)
    cy = np.sin(phi) * np.sin(theta)
    cz = np.cos(phi)
    s = np.stack([cx, cy, cz])
    return s


def sample_on_sphere(range_u=(0, 1), range_v=(0, 1)):
    u = np.random.uniform(*range_u)
    v = np.random.uniform(*range_v)
    return to_sphere(u, v)


def sample_pose_on_sphere(range_v=(0, 1), range_u=(0, 1), radius=1, up=[0, 1, 0]):
    # sample location on unit sphere
    loc = sample_on_sphere(range_u, range_v)

    # sample radius if necessary
    if isinstance(radius, tuple):
        radius = np.random.uniform(*radius)

    loc = loc * radius
    R = look_at(loc, up=np.array(up))[0]
    RT = np.concatenate([R, loc.reshape(3, 1)], axis=1)
    RT = torch.Tensor(RT.astype(np.float32))
    return RT


def rectify_pose(camera_r, body_aa, rotate_x=False):
    # batch_rodrigues (axis-angle -> rotation matrix) and batch_rot2aa
    # (rotation matrix -> axis-angle) are assumed to be provided elsewhere
    # in the repo; they are not defined in this module.
    body_r = batch_rodrigues(body_aa).reshape(-1, 3, 3)
    if rotate_x:
        # 180-degree rotation around the x-axis
        rotate_x = torch.tensor(
            [[[1.0, 0.0, 0.0], [0.0, -1.0, 0.0], [0.0, 0.0, -1.0]]]
        )
        body_r = body_r @ rotate_x
    final_r = camera_r @ body_r
    body_aa = batch_rot2aa(final_r)
    return body_aa
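
# The sketch below is an illustrative usage example, not part of the original
# module: it samples a camera pose on a sphere of radius 2 around the origin
# and checks that the resulting rotation is orthonormal. A degenerate pose
# where `up` is parallel to the view direction is possible in principle but
# has probability zero under uniform sampling.
def _example_sample_camera_pose():
    RT = sample_pose_on_sphere(radius=2.0)  # (3, 4) [R | t]
    R = RT[:, :3]
    assert torch.allclose(R @ R.T, torch.eye(3), atol=1e-5)
    return RT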
def estimate_translation_k_np(S, joints_2d, joints_conf, K):
    """Find the camera translation that brings the 3D joints S closest to
    the corresponding 2D joints.
    Input:
        S: (25, 3) 3D joint locations
        joints_2d: (25, 2) 2D joint locations
        joints_conf: (25,) confidence of the 2D joints
        K: (3, 3) camera intrinsics matrix
    Returns:
        (3,) camera translation vector
    """
    num_joints = S.shape[0]
    # focal length
    focal = np.array([K[0, 0], K[1, 1]])
    # optical center
    center = np.array([K[0, 2], K[1, 2]])

    # transformations
    Z = np.reshape(np.tile(S[:, 2], (2, 1)).T, -1)
    XY = np.reshape(S[:, 0:2], -1)
    O = np.tile(center, num_joints)
    F = np.tile(focal, num_joints)
    weight2 = np.reshape(np.tile(np.sqrt(joints_conf), (2, 1)).T, -1)

    # least squares
    Q = np.array(
        [
            F * np.tile(np.array([1, 0]), num_joints),
            F * np.tile(np.array([0, 1]), num_joints),
            O - np.reshape(joints_2d, -1),
        ]
    ).T
    c = (np.reshape(joints_2d, -1) - O) * Z - F * XY

    # weighted least squares
    W = np.diagflat(weight2)
    Q = np.dot(W, Q)
    c = np.dot(W, c)

    # square matrix
    A = np.dot(Q.T, Q)
    b = np.dot(Q.T, c)

    # solution
    trans = np.linalg.solve(A, b)
    return trans


def estimate_translation_k(
    S,
    joints_2d,
    K,
    use_all_joints=False,
    rotation=None,
    pad_2d=False,
):
    """Find the camera translation that brings the 3D joints S closest to
    the corresponding 2D joints.
    Input:
        S: (B, 49, 3) 3D joint locations
        joints_2d: (B, 49, 3) 2D joint locations and confidence
        K: (B, 3, 3) camera intrinsics matrices
    Returns:
        (B, 3) camera translation vectors
    """
    if pad_2d:
        batch, num_pts = joints_2d.shape[:2]
        joints_2d_pad = torch.ones((batch, num_pts, 3))
        joints_2d_pad[:, :, :2] = joints_2d
        joints_2d_pad = joints_2d_pad.to(joints_2d.device)
        joints_2d = joints_2d_pad

    device = S.device

    if rotation is not None:
        S = torch.einsum("bij,bkj->bki", rotation, S)

    # Use only joints 25:49 (GT joints)
    if use_all_joints:
        S = S.cpu().numpy()
        joints_2d = joints_2d.cpu().numpy()
    else:
        S = S[:, 25:, :].cpu().numpy()
        joints_2d = joints_2d[:, 25:, :].cpu().numpy()

    joints_conf = joints_2d[:, :, -1]
    joints_2d = joints_2d[:, :, :-1]

    trans = np.zeros((S.shape[0], 3), dtype=np.float32)
    # Find the translation for each example in the batch
    for i in range(S.shape[0]):
        S_i = S[i]
        joints_i = joints_2d[i]
        conf_i = joints_conf[i]
        K_i = K[i]
        trans[i] = estimate_translation_k_np(S_i, joints_i, conf_i, K_i)
    return torch.from_numpy(trans).to(device)


def weak_perspective_to_perspective_torch(
    weak_perspective_camera, focal_length, img_res, min_s
):
    # Convert a weak perspective camera [s, tx, ty] to a camera translation
    # [tx, ty, tz] in 3D given the bounding box size, clamping the scale to
    # min_s so that the recovered depth stays bounded.
    # This camera translation can be used in a full perspective projection.
    s = weak_perspective_camera[:, 0]
    s = torch.clamp(s, min_s)
    tx = weak_perspective_camera[:, 1]
    ty = weak_perspective_camera[:, 2]
    perspective_camera = torch.stack(
        [
            tx,
            ty,
            2 * focal_length / (img_res * s + 1e-9),
        ],
        dim=-1,
    )
    return perspective_camera
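
# The sketch below is an illustrative usage example, not part of the original
# module: the same weighted least-squares translation fit as above, but driven
# by per-sample 3x3 intrinsics K instead of a shared focal length and image
# size. All numeric values are assumed.
def _example_estimate_translation_k():
    B = 2
    S = torch.randn(B, 49, 3)
    joints_2d = torch.rand(B, 49, 3) * 224
    joints_2d[:, :, 2] = 1.0  # full confidence
    K = np.tile(
        np.array(
            [
                [5000.0, 0.0, 112.0],
                [0.0, 5000.0, 112.0],
                [0.0, 0.0, 1.0],
            ]
        ),
        (B, 1, 1),
    )
    return estimate_translation_k(S, joints_2d, K)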