meow
init
d6d3a5b
raw
history blame
14.1 kB
import numpy as np
import torch
"""
Useful geometric operations, e.g. Perspective projection and a differentiable Rodrigues formula
Parts of the code are taken from https://github.com/MandyMo/pytorch_HMR
"""
def perspective_to_weak_perspective_torch(
    perspective_camera,
    focal_length,
    img_res,
):
    """Convert a perspective camera translation to weak-perspective parameters.

    Columns 0, 1 and 2 of ``perspective_camera`` are read as ``tx, ty, tz``.
    The result stacks ``[s, tx, ty]`` along a new last dimension, with
    ``s = 2 * focal_length / (img_res * tz + 1e-9)``.

    Input:
        perspective_camera: tensor indexed as ``[:, k]`` for k in 0..2
            (assumed to be (B, 3) -- TODO confirm against callers)
        focal_length: scalar or tensor broadcastable against ``tz``
        img_res: scalar or tensor broadcastable against ``tz``
            (presumably the image/crop resolution in pixels -- TODO confirm)
    Returns:
        tensor of weak-perspective cameras ``[s, tx, ty]``
    """
    depth = perspective_camera[:, 2]
    # The 1e-9 offset keeps the denominator from being exactly zero.
    scale = 2 * focal_length / (img_res * depth + 1e-9)
    return torch.stack(
        [scale, perspective_camera[:, 0], perspective_camera[:, 1]],
        dim=-1,
    )
def convert_perspective_to_weak_perspective(
    perspective_camera,
    focal_length,
    img_res,
):
    """Turn a perspective translation ``[tx, ty, tz]`` into ``[s, tx, ty]``.

    The weak-perspective scale is derived from the depth column as
    ``s = 2 * focal_length / (img_res * tz + 1e-9)``; ``tx`` and ``ty`` are
    copied through unchanged.

    Input:
        perspective_camera: tensor whose columns 0..2 hold tx, ty, tz
            (assumed to be (B, 3) -- TODO confirm against callers)
        focal_length: scalar or tensor broadcastable against the depth column
        img_res: scalar or tensor broadcastable against the depth column
    Returns:
        tensor with ``[s, tx, ty]`` stacked on the last dimension
    """
    tx = perspective_camera[:, 0]
    ty = perspective_camera[:, 1]
    tz = perspective_camera[:, 2]
    # Small offset so a zero product in the denominator does not divide by zero.
    s = 2 * focal_length / (img_res * tz + 1e-9)
    weak_perspective_camera = torch.stack([s, tx, ty], dim=-1)
    return weak_perspective_camera
def convert_weak_perspective_to_perspective(
    weak_perspective_camera, focal_length, img_res
):
    """Convert a weak-perspective camera ``[s, tx, ty]`` to ``[tx, ty, tz]``.

    The depth is recovered from the scale as
    ``tz = 2 * focal_length / (img_res * s + 1e-9)``; the result can be used
    as a camera translation in a full perspective projection.

    Input:
        weak_perspective_camera: tensor whose columns 0..2 hold s, tx, ty
            (assumed to be (B, 3) -- TODO confirm against callers)
        focal_length: scalar or tensor broadcastable against the scale column
        img_res: scalar or tensor broadcastable against the scale column
    Returns:
        tensor with ``[tx, ty, tz]`` stacked on the last dimension
    """
    scale = weak_perspective_camera[:, 0]
    shift_x = weak_perspective_camera[:, 1]
    shift_y = weak_perspective_camera[:, 2]
    # Small offset so a zero scale does not produce a division by zero.
    depth = 2 * focal_length / (img_res * scale + 1e-9)
    return torch.stack([shift_x, shift_y, depth], dim=-1)
def get_default_cam_t(f, img_res):
    """Return the perspective translation of the default weak-perspective camera.

    The default camera is the single row ``[s, tx, ty] = [5.0, 0.0, 0.0]``; it is
    converted with ``convert_weak_perspective_to_perspective``.

    Input:
        f: focal length forwarded to the conversion
        img_res: image resolution forwarded to the conversion
    Returns:
        the converted camera translation ``[tx, ty, tz]``
    """
    default_weak_cam = torch.tensor([[5.0, 0.0, 0.0]])
    cam_t = convert_weak_perspective_to_perspective(default_weak_cam, f, img_res)
    return cam_t
def estimate_translation_np(S, joints_2d, joints_conf, focal_length, img_size):
    """Find camera translation that brings 3D joints S closest to the corresponding joints_2d.

    Solves a confidence-weighted linear least-squares problem for
    ``t = (tx, ty, tz)``.  Every joint contributes one equation per image axis:

        f * t_xy + (center - x_2d) * tz = (x_2d - center) * Z - f * XY

    Input:
        S: (N, 3) numpy array of 3D joint locations
        joints_2d: (N, 2) numpy array of 2D joint locations
        joints_conf: (N,) per-joint confidences; their square roots scale the equations
        focal_length: indexable pair, read as (focal_length[0], focal_length[1])
        img_size: indexable pair; the optical center is taken as
            (img_size[1] / 2, img_size[0] / 2)
    Returns:
        (3,) camera translation vector
    Raises:
        numpy.linalg.LinAlgError: if the 3x3 normal-equation matrix is singular
    """
    num_joints = S.shape[0]
    # focal length
    f = np.array([focal_length[0], focal_length[1]])
    # optical center
    center = np.array([img_size[1] / 2.0, img_size[0] / 2.0])

    # Flatten everything to length 2N, interleaved per joint as [x0, y0, x1, y1, ...].
    Z = np.reshape(np.tile(S[:, 2], (2, 1)).T, -1)
    XY = np.reshape(S[:, 0:2], -1)
    center_flat = np.tile(center, num_joints)
    F = np.tile(f, num_joints)
    weight2 = np.reshape(np.tile(np.sqrt(joints_conf), (2, 1)).T, -1)
    xy_2d = np.reshape(joints_2d, -1)

    # least squares: columns multiply tx, ty, tz respectively
    Q = np.array(
        [
            F * np.tile(np.array([1, 0]), num_joints),
            F * np.tile(np.array([0, 1]), num_joints),
            center_flat - xy_2d,
        ]
    ).T
    c = (xy_2d - center_flat) * Z - F * XY

    # weighted least squares: scaling each row by its weight is the same as
    # left-multiplying by diag(weight2), without materialising a 2N x 2N matrix.
    Q = weight2[:, np.newaxis] * Q
    c = weight2 * c

    # normal equations
    A = np.dot(Q.T, Q)
    b = np.dot(Q.T, c)

    # solution
    trans = np.linalg.solve(A, b)
    return trans
def estimate_translation(
    S,
    joints_2d,
    focal_length,
    img_size,
    use_all_joints=False,
    rotation=None,
    pad_2d=False,
):
    """Find camera translation that brings 3D joints S closest to the corresponding joints_2d.

    The per-sample solve is delegated to ``estimate_translation_np``; the result is
    computed in numpy, so no gradient flows through this function.

    Input:
        S: (B, J, 3) tensor of 3D joint locations
        joints_2d: (B, J, 3) tensor of 2D joint locations with the confidence in the
            last channel, or (B, J, 2) locations when ``pad_2d`` is True
        focal_length: forwarded unchanged to ``estimate_translation_np``
        img_size: forwarded unchanged to ``estimate_translation_np``
        use_all_joints: if False, only joints from index 25 onward are used
            (the original comment calls these the GT joints)
        rotation: optional (B, 3, 3) tensor applied to every joint of S first
        pad_2d: if True, append a confidence channel filled with ones to joints_2d
    Returns:
        (B, 3) float32 tensor of camera translations on the device S arrived on
    """
    if pad_2d:
        batch, num_pts = joints_2d.shape[:2]
        # Allocate directly on the target device; the untouched third channel
        # stays at 1 and acts as the confidence.
        joints_2d_pad = torch.ones((batch, num_pts, 3), device=joints_2d.device)
        joints_2d_pad[:, :, :2] = joints_2d
        joints_2d = joints_2d_pad

    device = S.device

    if rotation is not None:
        # out[b, k, :] = rotation[b] @ S[b, k, :]
        S = torch.einsum("bij,bkj->bki", rotation, S)

    # Use only joints 25: (GT joints) unless told otherwise
    if not use_all_joints:
        S = S[:, 25:, :]
        joints_2d = joints_2d[:, 25:, :]

    # detach() so inputs that require grad can still be converted to numpy.
    S = S.detach().cpu().numpy()
    joints_2d = joints_2d.detach().cpu().numpy()

    joints_conf = joints_2d[:, :, -1]
    joints_2d = joints_2d[:, :, :-1]

    trans = np.zeros((S.shape[0], 3), dtype=np.float32)
    # Find the translation for each example in the batch
    for i in range(S.shape[0]):
        trans[i] = estimate_translation_np(
            S[i],
            joints_2d[i],
            joints_conf[i],
            focal_length=focal_length,
            img_size=img_size,
        )
    return torch.from_numpy(trans).to(device)
def estimate_translation_cam(
    S, joints_2d, focal_length, img_size, use_all_joints=False, rotation=None
):
    """Find camera translation that brings 3D joints S closest to the corresponding joints_2d.

    Uses its own per-sample solver, which takes the optical center as
    ``(img_size[0] / 2, img_size[1] / 2)``.  The result is computed in numpy, so
    no gradient flows through this function.

    Input:
        S: (B, J, 3) tensor of 3D joint locations
        joints_2d: (B, J, 3) tensor of 2D joint locations with the confidence in
            the last channel
        focal_length: indexable pair, read as (focal_length[0], focal_length[1])
        img_size: indexable pair, see the optical center above
        use_all_joints: if False, only joints from index 25 onward are used
            (the original comment calls these the GT joints)
        rotation: optional (B, 3, 3) tensor applied to every joint of S first
    Returns:
        (B, 3) float32 tensor of camera translations on the device S arrived on
    Raises:
        numpy.linalg.LinAlgError: if a sample's normal equations are singular
    """

    def _solve_translation(S, joints_2d, joints_conf, focal_length, img_size):
        """Weighted least-squares translation for one sample.

        Input:
            S: (N, 3) 3D joint locations
            joints_2d: (N, 2) 2D joint locations
            joints_conf: (N,) confidences; their square roots scale the equations
        Returns:
            (3,) camera translation vector
        """
        num_joints = S.shape[0]
        # focal length
        f = np.array([focal_length[0], focal_length[1]])
        # optical center
        center = np.array([img_size[0] / 2.0, img_size[1] / 2.0])

        # Flatten to length 2N, interleaved per joint as [x0, y0, x1, y1, ...].
        Z = np.reshape(np.tile(S[:, 2], (2, 1)).T, -1)
        XY = np.reshape(S[:, 0:2], -1)
        center_flat = np.tile(center, num_joints)
        F = np.tile(f, num_joints)
        weight2 = np.reshape(np.tile(np.sqrt(joints_conf), (2, 1)).T, -1)
        xy_2d = np.reshape(joints_2d, -1)

        # least squares: columns multiply tx, ty, tz respectively
        Q = np.array(
            [
                F * np.tile(np.array([1, 0]), num_joints),
                F * np.tile(np.array([0, 1]), num_joints),
                center_flat - xy_2d,
            ]
        ).T
        c = (xy_2d - center_flat) * Z - F * XY

        # weighted least squares: row scaling equals multiplying by
        # diag(weight2) without building the 2N x 2N matrix.
        Q = weight2[:, np.newaxis] * Q
        c = weight2 * c

        # normal equations and solution
        A = np.dot(Q.T, Q)
        b = np.dot(Q.T, c)
        return np.linalg.solve(A, b)

    device = S.device

    if rotation is not None:
        # out[b, k, :] = rotation[b] @ S[b, k, :]
        S = torch.einsum("bij,bkj->bki", rotation, S)

    # Use only joints 25: (GT joints) unless told otherwise
    if not use_all_joints:
        S = S[:, 25:, :]
        joints_2d = joints_2d[:, 25:, :]

    # detach() so inputs that require grad can still be converted to numpy.
    S = S.detach().cpu().numpy()
    joints_2d = joints_2d.detach().cpu().numpy()

    joints_conf = joints_2d[:, :, -1]
    joints_2d = joints_2d[:, :, :-1]

    trans = np.zeros((S.shape[0], 3), dtype=np.float32)
    # Find the translation for each example in the batch
    for i in range(S.shape[0]):
        trans[i] = _solve_translation(
            S[i],
            joints_2d[i],
            joints_conf[i],
            focal_length=focal_length,
            img_size=img_size,
        )
    return torch.from_numpy(trans).to(device)
def get_coord_maps(size=56):
    """Build a pair of normalized coordinate maps for a ``size`` x ``size`` grid.

    Channel 0 holds the column index and channel 1 the row index of every
    cell, each rescaled linearly so that index 0 maps to -1 and index
    ``size - 1`` maps to 1.

    Input:
        size: side length of the square grid
    Returns:
        (1, 2, size, size) float tensor
    """
    ticks = torch.arange(size, dtype=torch.int32)
    ones = torch.ones([size], dtype=torch.int32)

    # Integer outer products: col_index[i, j] == j and row_index[i, j] == i.
    col_index = ones.unsqueeze(-1) * ticks.unsqueeze(0)
    row_index = ticks.unsqueeze(-1) * ones.unsqueeze(0)

    grid = torch.cat([col_index.unsqueeze(0), row_index.unsqueeze(0)], dim=0)
    grid = grid.unsqueeze(0)

    # Rescale indices from [0, size - 1] to [-1, 1].
    grid = grid.float() / (size - 1)
    return grid * 2 - 1
def look_at(eye, at=np.array([0, 0, 0]), up=np.array([0, 0, 1]), eps=1e-5):
    """Build rotation matrices whose z-axis points from ``at`` towards ``eye``.

    For every eye position the axes are
    ``z = normalize(eye - at)``, ``x = normalize(up x z)``, ``y = normalize(z x x)``
    and they are placed as the columns ``[x, y, z]`` of the returned matrix.

    Each vector is normalized by its own length (clamped below by ``eps``), so
    batches containing eyes at different distances yield unit axes for every row.

    Input:
        eye: numpy array reshapeable to (B, 3), camera positions
        at: numpy array with 3 elements, the point being looked at
        up: numpy array with 3 elements, the up direction
        eps: lower bound on the norm used for normalization
    Returns:
        (B, 3, 3) numpy array of rotation matrices
    """
    at = at.astype(float).reshape(1, 3)
    up = up.astype(float).reshape(1, 3)

    eye = eye.reshape(-1, 3)
    up = up.repeat(eye.shape[0] // up.shape[0], axis=0)

    def _normalize(vectors):
        # Per-row norm; np.maximum clamps each row separately instead of
        # taking one maximum over the whole batch.
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        return vectors / np.maximum(norms, eps)

    z_axis = _normalize(eye - at)
    x_axis = _normalize(np.cross(up, z_axis))
    y_axis = _normalize(np.cross(z_axis, x_axis))

    r_mat = np.concatenate(
        (x_axis.reshape(-1, 3, 1), y_axis.reshape(-1, 3, 1), z_axis.reshape(-1, 3, 1)),
        axis=2,
    )
    return r_mat
def to_sphere(u, v):
    """Map parameters ``(u, v)`` to a point on the unit sphere.

    ``u`` sets the azimuth ``theta = 2 * pi * u`` and ``v`` sets the polar
    angle ``phi = arccos(1 - 2 * v)``.

    Input:
        u: scalar or array, azimuth parameter
        v: scalar or array, polar parameter
    Returns:
        numpy array stacking ``[x, y, z]`` along the first axis
    """
    azimuth = 2 * np.pi * u
    polar = np.arccos(1 - 2 * v)
    sin_polar = np.sin(polar)
    return np.stack(
        [
            sin_polar * np.cos(azimuth),
            sin_polar * np.sin(azimuth),
            np.cos(polar),
        ]
    )
def sample_on_sphere(range_u=(0, 1), range_v=(0, 1)):
    """Draw a random point on the unit sphere.

    ``u`` and ``v`` are drawn (in that order) with ``np.random.uniform`` from
    ``range_u`` and ``range_v`` and then mapped through ``to_sphere``.

    Input:
        range_u: arguments unpacked into ``np.random.uniform`` for u
        range_v: arguments unpacked into ``np.random.uniform`` for v
    Returns:
        the point returned by ``to_sphere(u, v)``
    """
    u_sample = np.random.uniform(*range_u)
    v_sample = np.random.uniform(*range_v)
    point = to_sphere(u_sample, v_sample)
    return point
def sample_pose_on_sphere(range_v=(0, 1), range_u=(0, 1), radius=1, up=[0, 1, 0]):
    """Sample a random camera pose located on a sphere.

    A location is drawn with ``sample_on_sphere``, scaled by ``radius`` and
    combined with the rotation returned by ``look_at`` for that location.

    Input:
        range_v: sampling range for v, forwarded to ``sample_on_sphere``
        range_u: sampling range for u, forwarded to ``sample_on_sphere``
        radius: sphere radius; when given as a tuple it is unpacked into
            ``np.random.uniform`` and the drawn value is used
        up: up direction handed to ``look_at`` (only read, never modified)
    Returns:
        (3, 4) float32 tensor ``[R | loc]``
    """
    # location on the unit sphere
    position = sample_on_sphere(range_u, range_v)

    # draw the radius when a range was supplied
    if isinstance(radius, tuple):
        radius = np.random.uniform(*radius)
    position = position * radius

    rotation = look_at(position, up=np.array(up))[0]
    pose = np.concatenate([rotation, position.reshape(3, 1)], axis=1)
    return torch.Tensor(pose.astype(np.float32))
def rectify_pose(camera_r, body_aa, rotate_x=False):
    """Compose a camera rotation with a body rotation given in axis-angle form.

    The body rotation is converted to matrices with ``batch_rodrigues``,
    optionally right-multiplied by diag(1, -1, -1) (a 180 degree rotation about
    the x-axis), left-multiplied by ``camera_r`` and converted back with
    ``batch_rot2aa``.

    Input:
        camera_r: rotation matrix (or batch of matrices) applied on the left
        body_aa: body rotation(s) accepted by ``batch_rodrigues``
            (presumably (B, 3) axis-angle vectors -- TODO confirm)
        rotate_x: if truthy, apply the x-axis flip to the body rotation first
    Returns:
        the result of ``batch_rot2aa`` on the composed rotation
    """
    body_r = batch_rodrigues(body_aa).reshape(-1, 3, 3)
    if rotate_x:
        # Create the flip with body_r's dtype and device so the matmul also
        # works for non-default dtypes and tensors that are not on the CPU.
        flip_yz = torch.tensor(
            [[[1.0, 0.0, 0.0], [0.0, -1.0, 0.0], [0.0, 0.0, -1.0]]],
            dtype=body_r.dtype,
            device=body_r.device,
        )
        body_r = body_r @ flip_yz
    final_r = camera_r @ body_r
    return batch_rot2aa(final_r)
def estimate_translation_k_np(S, joints_2d, joints_conf, K):
    """Find camera translation that brings 3D joints S closest to the corresponding joints_2d.

    Solves a confidence-weighted linear least-squares problem for
    ``t = (tx, ty, tz)`` using the intrinsics in ``K``.  Every joint
    contributes one equation per image axis:

        f * t_xy + (center - x_2d) * tz = (x_2d - center) * Z - f * XY

    Input:
        S: (N, 3) numpy array of 3D joint locations
        joints_2d: (N, 2) numpy array of 2D joint locations
        joints_conf: (N,) per-joint confidences; their square roots scale the equations
        K: camera matrix; the focal lengths are read from K[0, 0], K[1, 1] and
            the optical center from K[0, 2], K[1, 2]
    Returns:
        (3,) camera translation vector
    Raises:
        numpy.linalg.LinAlgError: if the 3x3 normal-equation matrix is singular
    """
    num_joints = S.shape[0]
    # focal length
    focal = np.array([K[0, 0], K[1, 1]])
    # optical center
    center = np.array([K[0, 2], K[1, 2]])

    # Flatten everything to length 2N, interleaved per joint as [x0, y0, x1, y1, ...].
    Z = np.reshape(np.tile(S[:, 2], (2, 1)).T, -1)
    XY = np.reshape(S[:, 0:2], -1)
    center_flat = np.tile(center, num_joints)
    F = np.tile(focal, num_joints)
    weight2 = np.reshape(np.tile(np.sqrt(joints_conf), (2, 1)).T, -1)
    xy_2d = np.reshape(joints_2d, -1)

    # least squares: columns multiply tx, ty, tz respectively
    Q = np.array(
        [
            F * np.tile(np.array([1, 0]), num_joints),
            F * np.tile(np.array([0, 1]), num_joints),
            center_flat - xy_2d,
        ]
    ).T
    c = (xy_2d - center_flat) * Z - F * XY

    # weighted least squares: scaling each row by its weight is the same as
    # left-multiplying by diag(weight2), without materialising a 2N x 2N matrix.
    Q = weight2[:, np.newaxis] * Q
    c = weight2 * c

    # normal equations
    A = np.dot(Q.T, Q)
    b = np.dot(Q.T, c)

    # solution
    trans = np.linalg.solve(A, b)
    return trans
def estimate_translation_k(
    S,
    joints_2d,
    K,
    use_all_joints=False,
    rotation=None,
    pad_2d=False,
):
    """Find camera translation that brings 3D joints S closest to the corresponding joints_2d.

    The per-sample solve is delegated to ``estimate_translation_k_np``; the result
    is computed in numpy, so no gradient flows through this function.

    Input:
        S: (B, J, 3) tensor of 3D joint locations
        joints_2d: (B, J, 3) tensor of 2D joint locations with the confidence in the
            last channel, or (B, J, 2) locations when ``pad_2d`` is True
        K: per-sample camera matrices; ``K[i]`` is passed as-is to
            ``estimate_translation_k_np`` (assumed to be numpy-compatible --
            TODO confirm against callers)
        use_all_joints: if False, only joints from index 25 onward are used
            (the original comment calls these the GT joints)
        rotation: optional (B, 3, 3) tensor applied to every joint of S first
        pad_2d: if True, append a confidence channel filled with ones to joints_2d
    Returns:
        (B, 3) float32 tensor of camera translations on the device S arrived on
    """
    if pad_2d:
        batch, num_pts = joints_2d.shape[:2]
        # Allocate directly on the target device; the untouched third channel
        # stays at 1 and acts as the confidence.
        joints_2d_pad = torch.ones((batch, num_pts, 3), device=joints_2d.device)
        joints_2d_pad[:, :, :2] = joints_2d
        joints_2d = joints_2d_pad

    device = S.device

    if rotation is not None:
        # out[b, k, :] = rotation[b] @ S[b, k, :]
        S = torch.einsum("bij,bkj->bki", rotation, S)

    # Use only joints 25: (GT joints) unless told otherwise
    if not use_all_joints:
        S = S[:, 25:, :]
        joints_2d = joints_2d[:, 25:, :]

    # detach() so inputs that require grad can still be converted to numpy.
    S = S.detach().cpu().numpy()
    joints_2d = joints_2d.detach().cpu().numpy()

    joints_conf = joints_2d[:, :, -1]
    joints_2d = joints_2d[:, :, :-1]

    trans = np.zeros((S.shape[0], 3), dtype=np.float32)
    # Find the translation for each example in the batch
    for i in range(S.shape[0]):
        trans[i] = estimate_translation_k_np(S[i], joints_2d[i], joints_conf[i], K[i])
    return torch.from_numpy(trans).to(device)
def weak_perspective_to_perspective_torch(
    weak_perspective_camera, focal_length, img_res, min_s
):
    """Convert a weak-perspective camera ``[s, tx, ty]`` to ``[tx, ty, tz]``.

    The scale is first clamped from below at ``min_s``; the depth is then
    ``tz = 2 * focal_length / (img_res * s + 1e-9)``.  The result can be used
    as a camera translation in a full perspective projection.

    Input:
        weak_perspective_camera: tensor whose columns 0..2 hold s, tx, ty
            (assumed to be (B, 3) -- TODO confirm against callers)
        focal_length: scalar or tensor broadcastable against the scale column
        img_res: scalar or tensor broadcastable against the scale column
        min_s: lower bound applied to the scale before the conversion
    Returns:
        tensor with ``[tx, ty, tz]`` stacked on the last dimension
    """
    scale = torch.clamp(weak_perspective_camera[:, 0], min_s)
    # Small offset so the denominator cannot be exactly zero.
    depth = 2 * focal_length / (img_res * scale + 1e-9)
    return torch.stack(
        [weak_perspective_camera[:, 1], weak_perspective_camera[:, 2], depth],
        dim=-1,
    )