Spaces:

xymeow7
/

gene-hoi-denoising

Runtime error

File size: 14,076 Bytes

d6d3a5b

import numpy as np
import torch

"""
Useful geometric operations, e.g. Perspective projection and a differentiable Rodrigues formula
Parts of the code are taken from https://github.com/MandyMo/pytorch_HMR
"""


def perspective_to_weak_perspective_torch(
    perspective_camera,
    focal_length,
    img_res,
):
    """Map a perspective camera translation to weak-perspective parameters.

    Args:
        perspective_camera: Indexed as ``[:, 0]``, ``[:, 1]`` and ``[:, 2]``
            for tx, ty and tz (presumably a (B, 3) tensor -- confirm with
            callers).
        focal_length: Focal length entering the scale formula.
        img_res: Image resolution entering the scale formula.

    Returns:
        Tensor whose last dimension holds ``[s, tx, ty]`` with
        ``s = 2 * focal_length / (img_res * tz + 1e-9)``.
    """
    trans_x, trans_y, depth = (perspective_camera[:, k] for k in range(3))
    # The small constant keeps the denominator non-zero when img_res * tz is 0.
    scale = 2 * focal_length / (img_res * depth + 1e-9)
    return torch.stack([scale, trans_x, trans_y], dim=-1)


def convert_perspective_to_weak_perspective(
    perspective_camera,
    focal_length,
    img_res,
):
    """Convert a perspective translation [tx, ty, tz] to weak perspective [s, tx, ty].

    Args:
        perspective_camera: Indexed as ``[:, 0..2]`` for tx, ty, tz
            (presumably a (B, 3) tensor -- confirm with callers).
        focal_length: Focal length entering the scale formula.
        img_res: Image resolution entering the scale formula.

    Returns:
        Tensor stacked on the last dimension as ``[s, tx, ty]`` where
        ``s = 2 * focal_length / (img_res * tz + 1e-9)``.
    """
    depth = perspective_camera[:, 2]
    components = (
        # Scale is inversely proportional to depth; 1e-9 avoids a zero divisor.
        2 * focal_length / (img_res * depth + 1e-9),
        perspective_camera[:, 0],
        perspective_camera[:, 1],
    )
    return torch.stack(components, dim=-1)


def convert_weak_perspective_to_perspective(
    weak_perspective_camera, focal_length, img_res
):
    """Convert a weak-perspective camera [s, tx, ty] to a translation [tx, ty, tz].

    Args:
        weak_perspective_camera: Indexed as ``[:, 0..2]`` for s, tx, ty
            (presumably a (B, 3) tensor -- confirm with callers).
        focal_length: Focal length entering the depth formula.
        img_res: Image resolution entering the depth formula.

    Returns:
        Tensor stacked on the last dimension as ``[tx, ty, tz]`` where
        ``tz = 2 * focal_length / (img_res * s + 1e-9)``.
    """
    scale = weak_perspective_camera[:, 0]
    trans_x = weak_perspective_camera[:, 1]
    trans_y = weak_perspective_camera[:, 2]
    # Depth is inversely proportional to scale; 1e-9 avoids a zero divisor.
    depth = 2 * focal_length / (img_res * scale + 1e-9)
    return torch.stack([trans_x, trans_y, depth], dim=-1)


def get_default_cam_t(f, img_res):
    """Return the perspective translation of the default weak-perspective camera.

    The default camera is ``[s, tx, ty] = [5.0, 0.0, 0.0]`` (a single row); it
    is converted with ``convert_weak_perspective_to_perspective`` using focal
    length ``f`` and image resolution ``img_res``.
    """
    default_weak_cam = torch.tensor([[5.0, 0.0, 0.0]])
    return convert_weak_perspective_to_perspective(default_weak_cam, f, img_res)


def estimate_translation_np(S, joints_2d, joints_conf, focal_length, img_size):
    """Find the camera translation that best projects 3D joints onto 2D joints.

    Solves a confidence-weighted linear least-squares problem (via the normal
    equations) for the translation ``t`` such that the pinhole projection of
    ``S + t`` matches ``joints_2d``.

    Args:
        S: (N, 3) array of 3D joint locations.
        joints_2d: (N, 2) array of 2D joint locations.
        joints_conf: (N,) array of per-joint confidences; each residual is
            weighted by ``sqrt(confidence)``.
        focal_length: Indexable with two entries, paired with the first and
            second column of ``joints_2d`` respectively.
        img_size: Indexable with two entries; the optical centre is taken as
            ``(img_size[1] / 2, img_size[0] / 2)``.

    Returns:
        (3,) camera translation vector.

    Raises:
        numpy.linalg.LinAlgError: If the normal-equation matrix is singular
            (for example when every confidence is zero).
    """
    num_joints = S.shape[0]
    focal = np.array([focal_length[0], focal_length[1]])
    center = np.array([img_size[1] / 2.0, img_size[0] / 2.0])
    flat_2d = np.reshape(joints_2d, -1)

    # Everything below is interleaved per joint as [x0, y0, x1, y1, ...].
    Z = np.repeat(S[:, 2], 2)
    XY = np.reshape(S[:, 0:2], -1)
    O = np.tile(center, num_joints)
    F = np.tile(focal, num_joints)
    weights = np.repeat(np.sqrt(joints_conf), 2)

    # Linear system Q @ t = c obtained by multiplying the projection equation
    # through by the (translated) depth.
    Q = np.array(
        [
            F * np.tile(np.array([1, 0]), num_joints),
            F * np.tile(np.array([0, 1]), num_joints),
            O - flat_2d,
        ]
    ).T
    c = (flat_2d - O) * Z - F * XY

    # Row-wise weighting by broadcasting; avoids materialising a dense
    # (2N x 2N) diagonal matrix just to scale rows.
    Q = weights[:, None] * Q
    c = weights * c

    # Normal equations.
    A = np.dot(Q.T, Q)
    b = np.dot(Q.T, c)

    return np.linalg.solve(A, b)


def estimate_translation(
    S,
    joints_2d,
    focal_length,
    img_size,
    use_all_joints=False,
    rotation=None,
    pad_2d=False,
):
    """Estimate a camera translation for every example in a batch.

    Each example is solved independently with ``estimate_translation_np``.

    Args:
        S: (B, N, 3) tensor of 3D joint locations.
        joints_2d: (B, N, 3) tensor whose last column is the per-joint
            confidence, or (B, N, 2) when ``pad_2d`` is True.
        focal_length: Forwarded unchanged to ``estimate_translation_np``.
        img_size: Forwarded unchanged to ``estimate_translation_np``.
        use_all_joints: When False only joints ``25:`` are used (the original
            comment calls these the GT joints of a 49-joint layout).
        rotation: Optional (B, 3, 3) tensor applied to every joint of ``S``
            before solving.
        pad_2d: When True a confidence column of ones is appended to
            ``joints_2d``.

    Returns:
        (B, 3) float32 tensor of camera translations on ``S``'s device.
    """
    if pad_2d:
        # Build the confidence column directly on the input's device and
        # dtype instead of allocating a CPU buffer and moving it afterwards.
        ones = joints_2d.new_ones((*joints_2d.shape[:2], 1))
        joints_2d = torch.cat([joints_2d, ones], dim=-1)

    device = S.device

    if rotation is not None:
        # Rotate each joint: S[b, k] <- rotation[b] @ S[b, k].
        S = torch.einsum("bij,bkj->bki", rotation, S)

    if not use_all_joints:
        S = S[:, 25:, :]
        joints_2d = joints_2d[:, 25:, :]

    # detach() so tensors attached to an autograd graph can be converted;
    # Tensor.numpy() refuses tensors that require grad.
    S = S.detach().cpu().numpy()
    joints_2d = joints_2d.detach().cpu().numpy()

    joints_conf = joints_2d[:, :, -1]
    joints_xy = joints_2d[:, :, :-1]

    trans = np.zeros((S.shape[0], 3), dtype=np.float32)
    for i, (S_i, joints_i, conf_i) in enumerate(zip(S, joints_xy, joints_conf)):
        trans[i] = estimate_translation_np(
            S_i, joints_i, conf_i, focal_length=focal_length, img_size=img_size
        )
    return torch.from_numpy(trans).to(device)


def estimate_translation_cam(
    S, joints_2d, focal_length, img_size, use_all_joints=False, rotation=None
):
    """Estimate a camera translation for every example in a batch.

    Args:
        S: (B, N, 3) tensor of 3D joint locations.
        joints_2d: (B, N, 3) tensor whose last column is the per-joint
            confidence.
        focal_length: Indexable with two entries, paired with the first and
            second column of the 2D joints respectively.
        img_size: Indexable with two entries; the optical centre is taken as
            ``(img_size[0] / 2, img_size[1] / 2)``.
            NOTE(review): the module-level ``estimate_translation_np`` reads
            ``img_size`` in the opposite order -- confirm which is intended.
        use_all_joints: When False only joints ``25:`` are used (the original
            comment calls these the GT joints of a 49-joint layout).
        rotation: Optional (B, 3, 3) tensor applied to every joint of ``S``
            before solving.

    Returns:
        (B, 3) float32 tensor of camera translations on ``S``'s device.

    Raises:
        numpy.linalg.LinAlgError: If an example's normal-equation matrix is
            singular (for example when every confidence is zero).
    """

    def _solve_translation(S, joints_2d, joints_conf, focal_length, img_size):
        """Solve the weighted least-squares translation for one example.

        ``S`` is (N, 3), ``joints_2d`` is (N, 2) and ``joints_conf`` is (N,);
        returns the (3,) translation.
        """
        num_joints = S.shape[0]
        focal = np.array([focal_length[0], focal_length[1]])
        center = np.array([img_size[0] / 2.0, img_size[1] / 2.0])
        flat_2d = np.reshape(joints_2d, -1)

        # Everything below is interleaved per joint as [x0, y0, x1, y1, ...].
        Z = np.repeat(S[:, 2], 2)
        XY = np.reshape(S[:, 0:2], -1)
        O = np.tile(center, num_joints)
        F = np.tile(focal, num_joints)
        weights = np.repeat(np.sqrt(joints_conf), 2)

        Q = np.array(
            [
                F * np.tile(np.array([1, 0]), num_joints),
                F * np.tile(np.array([0, 1]), num_joints),
                O - flat_2d,
            ]
        ).T
        c = (flat_2d - O) * Z - F * XY

        # Row-wise weighting by broadcasting; avoids a dense (2N x 2N)
        # diagonal matrix.
        Q = weights[:, None] * Q
        c = weights * c

        # Normal equations.
        A = np.dot(Q.T, Q)
        b = np.dot(Q.T, c)
        return np.linalg.solve(A, b)

    device = S.device

    if rotation is not None:
        # Rotate each joint: S[b, k] <- rotation[b] @ S[b, k].
        S = torch.einsum("bij,bkj->bki", rotation, S)

    if not use_all_joints:
        S = S[:, 25:, :]
        joints_2d = joints_2d[:, 25:, :]

    # detach() so tensors attached to an autograd graph can be converted;
    # Tensor.numpy() refuses tensors that require grad.
    S = S.detach().cpu().numpy()
    joints_2d = joints_2d.detach().cpu().numpy()

    joints_conf = joints_2d[:, :, -1]
    joints_xy = joints_2d[:, :, :-1]

    trans = np.zeros((S.shape[0], 3), dtype=np.float32)
    for i, (S_i, joints_i, conf_i) in enumerate(zip(S, joints_xy, joints_conf)):
        trans[i] = _solve_translation(
            S_i, joints_i, conf_i, focal_length=focal_length, img_size=img_size
        )
    return torch.from_numpy(trans).to(device)


def get_coord_maps(size=56):
    """Build normalised x/y coordinate maps for a square grid.

    Args:
        size: Side length of the grid.

    Returns:
        Float tensor of shape (1, 2, size, size). Channel 0 holds the column
        coordinate and channel 1 the row coordinate, each mapped linearly so
        that index 0 becomes -1 and index ``size - 1`` becomes +1.
    """
    # Integer indices -> [0, 1] -> [-1, 1].
    ticks = torch.arange(size, dtype=torch.int32).float() / (size - 1)
    ticks = ticks * 2 - 1

    # Broadcast the 1D ramp along rows (x) and along columns (y).
    x_map = ticks.view(1, 1, 1, size).expand(1, 1, size, size)
    y_map = ticks.view(1, 1, size, 1).expand(1, 1, size, size)

    return torch.cat([x_map, y_map], dim=1)


def look_at(eye, at=np.array([0, 0, 0]), up=np.array([0, 0, 1]), eps=1e-5):
    """Build look-at rotation matrices for one or more eye positions.

    Args:
        eye: Array reshaped to (N, 3) eye positions.
        at: Single 3-vector the cameras look at. The default array is never
            mutated (it is copied by ``astype``).
        up: Single 3-vector giving the up direction, shared by all eyes. The
            default array is never mutated.
        eps: Lower bound applied to each vector norm before dividing, so a
            zero-length axis does not cause a division by zero.

    Returns:
        (N, 3, 3) array whose columns are the x, y and z axes. The z axis
        points from ``at`` towards ``eye``, x is ``up x z`` and y is ``z x x``,
        each normalised to unit length.
    """
    at = np.asarray(at).astype(float).reshape(1, 3)
    up = np.asarray(up).astype(float).reshape(1, 3)

    eye = np.asarray(eye).reshape(-1, 3)
    up = np.repeat(up, eye.shape[0], axis=0)

    def _normalize(vectors):
        # Normalise every row by its OWN norm. Taking a single maximum over
        # the whole batch would scale all rows by the largest norm and yield
        # non-unit axes whenever N > 1.
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        return vectors / np.maximum(norms, eps)

    z_axis = _normalize(eye - at)
    x_axis = _normalize(np.cross(up, z_axis))
    y_axis = _normalize(np.cross(z_axis, x_axis))

    return np.stack((x_axis, y_axis, z_axis), axis=2)


def to_sphere(u, v):
    """Map two parameters to a point on the unit sphere.

    ``u`` sets the azimuth ``theta = 2 * pi * u`` and ``v`` sets the polar
    angle ``phi = arccos(1 - 2 * v)``.

    Returns:
        Array stacking ``[x, y, z]`` along the first axis.
    """
    azimuth = 2 * np.pi * u
    polar = np.arccos(1 - 2 * v)
    sin_polar = np.sin(polar)
    return np.stack(
        [
            sin_polar * np.cos(azimuth),
            sin_polar * np.sin(azimuth),
            np.cos(polar),
        ]
    )


def sample_on_sphere(range_u=(0, 1), range_v=(0, 1)):
    """Draw a random point on the unit sphere.

    ``u`` and ``v`` are drawn uniformly (in that order) from ``range_u`` and
    ``range_v`` using NumPy's global random state, then mapped through
    ``to_sphere``.
    """
    low_u, high_u = range_u
    low_v, high_v = range_v
    sampled_u = np.random.uniform(low_u, high_u)
    sampled_v = np.random.uniform(low_v, high_v)
    return to_sphere(sampled_u, sampled_v)


def sample_pose_on_sphere(range_v=(0, 1), range_u=(0, 1), radius=1, up=[0, 1, 0]):
    """Sample a random camera pose located on a sphere.

    Args:
        range_v: Range forwarded to ``sample_on_sphere`` as ``range_v``.
        range_u: Range forwarded to ``sample_on_sphere`` as ``range_u``.
        radius: Sphere radius; when given as a tuple, a radius is drawn
            uniformly from it.
        up: Up direction handed to ``look_at``.

    Returns:
        (3, 4) float32 tensor ``[R | t]`` where ``R`` comes from ``look_at``
        and ``t`` is the sampled location.
    """
    # Location on the unit sphere first, so the random draws keep their order.
    position = sample_on_sphere(range_u, range_v)

    if isinstance(radius, tuple):
        radius = np.random.uniform(*radius)
    position = position * radius

    rotation = look_at(position, up=np.array(up))[0]
    pose = np.hstack((rotation, position.reshape(3, 1)))
    return torch.Tensor(pose.astype(np.float32))


def rectify_pose(camera_r, body_aa, rotate_x=False):
    """Compose a camera rotation with a body rotation given in axis-angle form.

    Args:
        camera_r: Rotation left-multiplied onto the body rotation
            (presumably (3, 3) or (B, 3, 3) -- confirm with callers).
        body_aa: Body rotation passed to ``batch_rodrigues``; presumably
            axis-angle vectors -- confirm against that helper.
        rotate_x: When truthy, the body rotation is first right-multiplied by
            ``diag(1, -1, -1)``, i.e. a 180 degree rotation about the x axis.

    Returns:
        The result of ``batch_rot2aa`` applied to ``camera_r @ body_r``.
    """
    body_r = batch_rodrigues(body_aa).reshape(-1, 3, 3)

    if rotate_x:
        # Create the flip on body_r's device and dtype; a default CPU float32
        # tensor makes the matmul fail for CUDA or float64 inputs.
        flip_x = body_r.new_tensor(
            [[[1.0, 0.0, 0.0], [0.0, -1.0, 0.0], [0.0, 0.0, -1.0]]]
        )
        body_r = body_r @ flip_x

    final_r = camera_r @ body_r
    return batch_rot2aa(final_r)


def estimate_translation_k_np(S, joints_2d, joints_conf, K):
    """Find the camera translation that best projects 3D joints onto 2D joints.

    Solves a confidence-weighted linear least-squares problem (via the normal
    equations) using the focal lengths and optical centre read from ``K``.

    Args:
        S: (N, 3) array of 3D joint locations.
        joints_2d: (N, 2) array of 2D joint locations.
        joints_conf: (N,) array of per-joint confidences; each residual is
            weighted by ``sqrt(confidence)``.
        K: Intrinsics indexed as ``K[0, 0]``, ``K[1, 1]`` (focal lengths) and
            ``K[0, 2]``, ``K[1, 2]`` (optical centre).

    Returns:
        (3,) camera translation vector.

    Raises:
        numpy.linalg.LinAlgError: If the normal-equation matrix is singular
            (for example when every confidence is zero).
    """
    num_joints = S.shape[0]
    focal = np.array([K[0, 0], K[1, 1]])
    center = np.array([K[0, 2], K[1, 2]])
    flat_2d = np.reshape(joints_2d, -1)

    # Everything below is interleaved per joint as [x0, y0, x1, y1, ...].
    Z = np.repeat(S[:, 2], 2)
    XY = np.reshape(S[:, 0:2], -1)
    O = np.tile(center, num_joints)
    F = np.tile(focal, num_joints)
    weights = np.repeat(np.sqrt(joints_conf), 2)

    # Linear system Q @ t = c obtained by multiplying the projection equation
    # through by the (translated) depth.
    Q = np.array(
        [
            F * np.tile(np.array([1, 0]), num_joints),
            F * np.tile(np.array([0, 1]), num_joints),
            O - flat_2d,
        ]
    ).T
    c = (flat_2d - O) * Z - F * XY

    # Row-wise weighting by broadcasting; avoids materialising a dense
    # (2N x 2N) diagonal matrix just to scale rows.
    Q = weights[:, None] * Q
    c = weights * c

    # Normal equations.
    A = np.dot(Q.T, Q)
    b = np.dot(Q.T, c)

    return np.linalg.solve(A, b)


def estimate_translation_k(
    S,
    joints_2d,
    K,
    use_all_joints=False,
    rotation=None,
    pad_2d=False,
):
    """Estimate a camera translation for every example using per-example intrinsics.

    Each example is solved independently with ``estimate_translation_k_np``.

    Args:
        S: (B, N, 3) tensor of 3D joint locations.
        joints_2d: (B, N, 3) tensor whose last column is the per-joint
            confidence, or (B, N, 2) when ``pad_2d`` is True.
        K: Per-example intrinsics indexed as ``K[i]``; presumably (B, 3, 3)
            -- confirm with callers. A tensor or a NumPy array is accepted.
        use_all_joints: When False only joints ``25:`` are used (the original
            comment calls these the GT joints of a 49-joint layout).
        rotation: Optional (B, 3, 3) tensor applied to every joint of ``S``
            before solving.
        pad_2d: When True a confidence column of ones is appended to
            ``joints_2d``.

    Returns:
        (B, 3) float32 tensor of camera translations on ``S``'s device.
    """
    if pad_2d:
        # Build the confidence column directly on the input's device and
        # dtype instead of allocating a CPU buffer and moving it afterwards.
        ones = joints_2d.new_ones((*joints_2d.shape[:2], 1))
        joints_2d = torch.cat([joints_2d, ones], dim=-1)

    device = S.device

    if rotation is not None:
        # Rotate each joint: S[b, k] <- rotation[b] @ S[b, k].
        S = torch.einsum("bij,bkj->bki", rotation, S)

    if not use_all_joints:
        S = S[:, 25:, :]
        joints_2d = joints_2d[:, 25:, :]

    # detach() so tensors attached to an autograd graph can be converted;
    # Tensor.numpy() refuses tensors that require grad.
    S = S.detach().cpu().numpy()
    joints_2d = joints_2d.detach().cpu().numpy()
    if torch.is_tensor(K):
        # The NumPy solver reads scalar entries of K; convert once up front so
        # CUDA or grad-tracking intrinsics work too.
        K = K.detach().cpu().numpy()

    joints_conf = joints_2d[:, :, -1]
    joints_xy = joints_2d[:, :, :-1]

    trans = np.zeros((S.shape[0], 3), dtype=np.float32)
    for i, (S_i, joints_i, conf_i) in enumerate(zip(S, joints_xy, joints_conf)):
        trans[i] = estimate_translation_k_np(S_i, joints_i, conf_i, K[i])
    return torch.from_numpy(trans).to(device)


def weak_perspective_to_perspective_torch(
    weak_perspective_camera, focal_length, img_res, min_s
):
    """Convert a weak-perspective camera [s, tx, ty] to a translation [tx, ty, tz].

    Args:
        weak_perspective_camera: Indexed as ``[:, 0..2]`` for s, tx, ty
            (presumably a (B, 3) tensor -- confirm with callers).
        focal_length: Focal length entering the depth formula.
        img_res: Image resolution entering the depth formula.
        min_s: Lower bound applied to the scale before it is inverted.

    Returns:
        Tensor stacked on the last dimension as ``[tx, ty, tz]`` where
        ``tz = 2 * focal_length / (img_res * clamp(s, min_s) + 1e-9)``.
    """
    # Clamp the scale from below so small scales cannot blow up the depth.
    scale = torch.clamp(weak_perspective_camera[:, 0], min_s)
    depth = 2 * focal_length / (img_res * scale + 1e-9)
    offsets = (weak_perspective_camera[:, 1], weak_perspective_camera[:, 2])
    return torch.stack([*offsets, depth], dim=-1)