first commit

331  lingbot_map/utils/pose_enc.py  Normal file
@@ -0,0 +1,331 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import torch

from lingbot_map.utils.geometry import closed_form_inverse_se3
from .rotation import quat_to_mat, mat_to_quat


def extri_intri_to_pose_encoding(
    extrinsics, intrinsics, image_size_hw=None, pose_encoding_type="absT_quaR_FoV"  # image_size_hw e.g. (256, 512)
):
    """Convert camera extrinsics and intrinsics to a compact pose encoding.

    This function transforms camera parameters into a unified pose encoding format,
    which can be used for downstream tasks such as pose prediction or representation learning.

    Args:
        extrinsics (torch.Tensor): Camera extrinsic parameters with shape BxSx3x4,
            where B is batch size and S is sequence length.
            Expressed in the OpenCV coordinate system (x-right, y-down, z-forward),
            representing the camera-from-world (world-to-camera) transformation.
            The format is [R|t], where R is a 3x3 rotation matrix and t is a 3x1 translation vector.
        intrinsics (torch.Tensor): Camera intrinsic parameters with shape BxSx3x3.
            Defined in pixels, with format:
            [[fx, 0, cx],
             [0, fy, cy],
             [0,  0,  1]]
            where fx, fy are focal lengths and (cx, cy) is the principal point.
        image_size_hw (tuple): Tuple of (height, width) of the image in pixels.
            Required for computing field of view values. For example: (256, 512).
        pose_encoding_type (str): Type of pose encoding to use. Currently only
            "absT_quaR_FoV" (absolute translation, quaternion rotation, field of view) is supported.

    Returns:
        torch.Tensor: Encoded camera pose parameters with shape BxSx9.
            For the "absT_quaR_FoV" type, the 9 dimensions are:
            - [:3] = absolute translation vector T (3D)
            - [3:7] = rotation as a quaternion (4D)
            - [7:] = field of view (2D)
    """
    # extrinsics: BxSx3x4
    # intrinsics: BxSx3x3
    if pose_encoding_type == "absT_quaR_FoV":
        R = extrinsics[:, :, :3, :3]  # BxSx3x3
        T = extrinsics[:, :, :3, 3]  # BxSx3

        quat = mat_to_quat(R)
        # Note the (height, width) order here.
        H, W = image_size_hw
        fov_h = 2 * torch.atan((H / 2) / intrinsics[..., 1, 1])
        fov_w = 2 * torch.atan((W / 2) / intrinsics[..., 0, 0])
        pose_encoding = torch.cat([T, quat, fov_h[..., None], fov_w[..., None]], dim=-1).float()
    else:
        raise NotImplementedError(f"Unsupported pose encoding type: {pose_encoding_type}")

    return pose_encoding
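

# A minimal usage sketch (illustrative only; the shapes, focal length, and the
# `_demo_*` helper name are made up, not part of the original API): encode a
# batch of identity cameras and check the layout of the 9-D output.
def _demo_extri_intri_to_pose_encoding():
    B, S, H, W = 2, 4, 256, 512
    extrinsics = torch.eye(4)[:3].expand(B, S, 3, 4)  # identity [R|t] for every frame
    intrinsics = torch.tensor(
        [[300.0, 0.0, W / 2], [0.0, 300.0, H / 2], [0.0, 0.0, 1.0]]
    ).expand(B, S, 3, 3)
    enc = extri_intri_to_pose_encoding(extrinsics, intrinsics, image_size_hw=(H, W))
    assert enc.shape == (B, S, 9)  # [T(3) | quat(4) | fov_h, fov_w]
    return enc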


def pose_encoding_to_extri_intri(
    pose_encoding, image_size_hw=None, pose_encoding_type="absT_quaR_FoV", build_intrinsics=True  # image_size_hw e.g. (256, 512)
):
    """Convert a pose encoding back to camera extrinsics and intrinsics.

    This function performs the inverse operation of extri_intri_to_pose_encoding,
    reconstructing the full camera parameters from the compact encoding.

    Args:
        pose_encoding (torch.Tensor): Encoded camera pose parameters with shape BxSx9,
            where B is batch size and S is sequence length.
            For the "absT_quaR_FoV" type, the 9 dimensions are:
            - [:3] = absolute translation vector T (3D)
            - [3:7] = rotation as a quaternion (4D)
            - [7:] = field of view (2D)
        image_size_hw (tuple): Tuple of (height, width) of the image in pixels.
            Required for reconstructing intrinsics from field of view values.
            For example: (256, 512).
        pose_encoding_type (str): Type of pose encoding used. Supports
            "absT_quaR_FoV" (absolute translation, quaternion rotation, field of view)
            and "absT_quaR" (no field of view; intrinsics are not reconstructed).
        build_intrinsics (bool): Whether to reconstruct the intrinsics matrix.
            If False, only extrinsics are returned and intrinsics will be None.

    Returns:
        tuple: (extrinsics, intrinsics)
            - extrinsics (torch.Tensor): Camera extrinsic parameters with shape BxSx3x4.
              Expressed in the OpenCV coordinate system (x-right, y-down, z-forward),
              representing the camera-from-world (world-to-camera) transformation.
              The format is [R|t], where R is a 3x3 rotation matrix and t is a 3x1 translation vector.
            - intrinsics (torch.Tensor or None): Camera intrinsic parameters with shape BxSx3x3,
              or None if build_intrinsics is False. Defined in pixels, with format:
              [[fx, 0, cx],
               [0, fy, cy],
               [0,  0,  1]]
              where fx, fy are focal lengths and (cx, cy) is the principal point,
              assumed to be at the center of the image (W/2, H/2).
    """
    intrinsics = None

    if pose_encoding_type == "absT_quaR_FoV":
        T = pose_encoding[..., :3]
        quat = pose_encoding[..., 3:7]
        fov_h = pose_encoding[..., 7]
        fov_w = pose_encoding[..., 8]

        R = quat_to_mat(quat)
        extrinsics = torch.cat([R, T[..., None]], dim=-1)

        if build_intrinsics:
            H, W = image_size_hw
            fy = (H / 2.0) / torch.tan(fov_h / 2.0)
            fx = (W / 2.0) / torch.tan(fov_w / 2.0)
            intrinsics = torch.zeros(pose_encoding.shape[:2] + (3, 3), device=pose_encoding.device)
            intrinsics[..., 0, 0] = fx
            intrinsics[..., 1, 1] = fy
            intrinsics[..., 0, 2] = W / 2
            intrinsics[..., 1, 2] = H / 2
            intrinsics[..., 2, 2] = 1.0  # homogeneous row
    elif pose_encoding_type == "absT_quaR":
        T = pose_encoding[..., :3]
        quat = pose_encoding[..., 3:7]

        R = quat_to_mat(quat)
        extrinsics = torch.cat([R, T[..., None]], dim=-1)

        intrinsics = None
    else:
        raise NotImplementedError(f"Unsupported pose encoding type: {pose_encoding_type}")

    return extrinsics, intrinsics
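

# A minimal round-trip sketch (illustrative; assumes mat_to_quat and quat_to_mat
# from .rotation are mutually inverse for unit quaternions): encoding then
# decoding should recover the extrinsics and the centered-principal-point
# intrinsics up to numerical precision.
def _demo_pose_encoding_roundtrip():
    H, W = 256, 512
    extrinsics = torch.eye(4)[:3].expand(1, 2, 3, 4)
    intrinsics = torch.tensor(
        [[400.0, 0.0, W / 2], [0.0, 400.0, H / 2], [0.0, 0.0, 1.0]]
    ).expand(1, 2, 3, 3)
    enc = extri_intri_to_pose_encoding(extrinsics, intrinsics, image_size_hw=(H, W))
    extri_rec, intri_rec = pose_encoding_to_extri_intri(enc, image_size_hw=(H, W))
    assert torch.allclose(extri_rec, extrinsics, atol=1e-5)
    assert torch.allclose(intri_rec, intrinsics, atol=1e-3)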


def convert_pt3d_RT_to_opencv(Rot, Trans):
    """
    Convert PyTorch3D camera extrinsics to the OpenCV convention.

    PyTorch3D uses a row-vector convention with x-left, y-up axes, so the
    conversion negates the x and y components and transposes the rotation.

    Args:
        Rot: 3x3 rotation matrix in PyTorch3D format
        Trans: 3D translation vector in PyTorch3D format

    Returns:
        extri_opencv: 3x4 extrinsic matrix in OpenCV format
    """
    rot_pt3d = np.array(Rot)
    trans_pt3d = np.array(Trans)

    trans_pt3d[:2] *= -1
    rot_pt3d[:, :2] *= -1
    rot_pt3d = rot_pt3d.transpose(1, 0)
    extri_opencv = np.hstack((rot_pt3d, trans_pt3d[:, None]))
    return extri_opencv
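

# A small sanity sketch (illustrative): for an identity PyTorch3D pose, the
# resulting OpenCV extrinsic flips the x and y axes.
def _demo_convert_pt3d_RT_to_opencv():
    extri = convert_pt3d_RT_to_opencv(np.eye(3), np.zeros(3))
    assert extri.shape == (3, 4)
    assert np.allclose(extri[:, :3], np.diag([-1.0, -1.0, 1.0]))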


def build_pair_index(N, B=1):
    """
    Build indices for all possible pairs of frames.

    Args:
        N: Number of frames
        B: Batch size

    Returns:
        i1, i2: Indices for all possible pairs
    """
    i1_, i2_ = torch.combinations(torch.arange(N), 2, with_replacement=False).unbind(-1)
    i1, i2 = [(i[None] + torch.arange(B)[:, None] * N).reshape(-1) for i in [i1_, i2_]]
    return i1, i2
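

# Example (illustrative): for N=3 frames the upper-triangular pairs are
# (0, 1), (0, 2), (1, 2), flattened across the batch dimension.
def _demo_build_pair_index():
    i1, i2 = build_pair_index(3)
    assert i1.tolist() == [0, 0, 1]
    assert i2.tolist() == [1, 2, 2]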


def rotation_angle(rot_gt, rot_pred, batch_size=None, eps=1e-15):
    """
    Calculate the rotation angle error between ground-truth and predicted rotations.

    Args:
        rot_gt: Ground truth rotation matrices
        rot_pred: Predicted rotation matrices
        batch_size: Batch size for reshaping the result
        eps: Small value to avoid numerical issues

    Returns:
        Rotation angle error in degrees
    """
    q_pred = mat_to_quat(rot_pred)
    q_gt = mat_to_quat(rot_gt)

    # For unit quaternions, cos(theta) = 2 * <q_pred, q_gt>^2 - 1, so the
    # relative rotation angle is arccos(1 - 2 * (1 - <q_pred, q_gt>^2)).
    loss_q = (1 - (q_pred * q_gt).sum(dim=1) ** 2).clamp(min=eps)
    err_q = torch.arccos(1 - 2 * loss_q)

    rel_rangle_deg = err_q * 180 / np.pi

    if batch_size is not None:
        rel_rangle_deg = rel_rangle_deg.reshape(batch_size, -1)

    return rel_rangle_deg
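

# Example (illustrative): a 90-degree rotation about the z-axis measured
# against the identity yields a 90-degree error.
def _demo_rotation_angle():
    eye = torch.eye(3)[None]
    rot_z_90 = torch.tensor([[[0.0, -1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 1.0]]])
    err = rotation_angle(eye, rot_z_90)
    assert torch.allclose(err, torch.tensor([90.0]), atol=0.1)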


def translation_angle(tvec_gt, tvec_pred, batch_size=None, ambiguity=True):
    """
    Calculate the translation angle error between ground-truth and predicted translations.

    Args:
        tvec_gt: Ground truth translation vectors
        tvec_pred: Predicted translation vectors
        batch_size: Batch size for reshaping the result
        ambiguity: Whether to treat opposite directions as equivalent

    Returns:
        Translation angle error in degrees
    """
    rel_tangle_deg = compare_translation_by_angle(tvec_gt, tvec_pred)
    rel_tangle_deg = rel_tangle_deg * 180.0 / np.pi

    if ambiguity:
        # A translation direction and its negation are considered equivalent.
        rel_tangle_deg = torch.min(rel_tangle_deg, (180 - rel_tangle_deg).abs())

    if batch_size is not None:
        rel_tangle_deg = rel_tangle_deg.reshape(batch_size, -1)

    return rel_tangle_deg
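

# Example (illustrative): with ambiguity=True an exactly opposite direction
# counts as zero error, while an orthogonal direction gives 90 degrees.
def _demo_translation_angle():
    t = torch.tensor([[1.0, 0.0, 0.0]])
    assert translation_angle(t, -t).max() < 1e-2
    err = translation_angle(t, torch.tensor([[0.0, 1.0, 0.0]]))
    assert torch.allclose(err, torch.tensor([90.0]), atol=0.1)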


def compare_translation_by_angle(t_gt, t, eps=1e-15, default_err=1e6):
    """
    Normalize the translation vectors and compute the angle between them.

    Args:
        t_gt: Ground truth translation vectors
        t: Predicted translation vectors
        eps: Small value to avoid division by zero
        default_err: Default error value for invalid cases

    Returns:
        Angular error between translation vectors in radians
    """
    t_norm = torch.norm(t, dim=1, keepdim=True)
    t = t / (t_norm + eps)

    t_gt_norm = torch.norm(t_gt, dim=1, keepdim=True)
    t_gt = t_gt / (t_gt_norm + eps)

    # sqrt(1 - (1 - <t, t_gt>^2)) = |<t, t_gt>|, so err_t = acos(|cos(angle)|).
    loss_t = torch.clamp_min(1.0 - torch.sum(t * t_gt, dim=1) ** 2, eps)
    err_t = torch.acos(torch.sqrt(1 - loss_t))

    err_t[torch.isnan(err_t) | torch.isinf(err_t)] = default_err
    return err_t


def calculate_auc_np(r_error, t_error, max_threshold=30):
    """
    Calculate the Area Under the Curve (AUC) for the given error arrays using NumPy.

    Args:
        r_error: numpy array of rotation error values (degrees)
        t_error: numpy array of translation error values (degrees)
        max_threshold: Maximum threshold value (in degrees) for binning the histogram

    Returns:
        AUC value and the normalized histogram
    """
    error_matrix = np.concatenate((r_error[:, None], t_error[:, None]), axis=1)
    max_errors = np.max(error_matrix, axis=1)
    bins = np.arange(max_threshold + 1)
    histogram, _ = np.histogram(max_errors, bins=bins)
    num_pairs = float(len(max_errors))
    normalized_histogram = histogram.astype(float) / num_pairs
    return np.mean(np.cumsum(normalized_histogram)), normalized_histogram
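

# Example (illustrative): with all pairwise errors at zero, every error falls
# into the first bin and the AUC saturates at 1.0.
def _demo_calculate_auc_np():
    zeros = np.zeros(10)
    auc, hist = calculate_auc_np(zeros, zeros, max_threshold=30)
    assert np.isclose(auc, 1.0)
    assert np.isclose(hist[0], 1.0)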


def se3_to_relative_pose_error(pred_se3, gt_se3, num_frames):
    """
    Compute rotation and translation errors between predicted and ground truth poses.
    This function assumes the input poses are world-to-camera (w2c) transformations.

    Args:
        pred_se3: Predicted SE(3) transformations (w2c), shape (N, 4, 4)
        gt_se3: Ground truth SE(3) transformations (w2c), shape (N, 4, 4)
        num_frames: Number of frames (N)

    Returns:
        Rotation and translation angle errors in degrees
    """
    pair_idx_i1, pair_idx_i2 = build_pair_index(num_frames)

    # Relative pose mapping camera i2's frame to camera i1's frame: w2c_i1 @ c2w_i2.
    relative_pose_gt = gt_se3[pair_idx_i1].bmm(
        closed_form_inverse_se3(gt_se3[pair_idx_i2])
    )
    relative_pose_pred = pred_se3[pair_idx_i1].bmm(
        closed_form_inverse_se3(pred_se3[pair_idx_i2])
    )

    rel_rangle_deg = rotation_angle(
        relative_pose_gt[:, :3, :3], relative_pose_pred[:, :3, :3]
    )
    rel_tangle_deg = translation_angle(
        relative_pose_gt[:, :3, 3], relative_pose_pred[:, :3, 3]
    )

    return rel_rangle_deg, rel_tangle_deg
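

# A minimal sanity sketch (illustrative; assumes closed_form_inverse_se3 inverts
# a batch of 4x4 poses): identical predictions and ground truth give near-zero
# angular errors, provided the relative translations are non-degenerate.
def _demo_se3_to_relative_pose_error():
    poses = torch.eye(4)[None].repeat(3, 1, 1)  # three w2c poses
    poses[:, 0, 3] = torch.arange(3, dtype=torch.float32)  # distinct translations
    r_err, t_err = se3_to_relative_pose_error(poses, poses, num_frames=3)
    assert r_err.max() < 1e-2 and t_err.max() < 1e-2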


def colmap_to_opencv_intrinsics(K):
    """
    Modify camera intrinsics to follow a different convention.
    Coordinates of the center of the top-left pixel are by default:
    - (0.5, 0.5) in COLMAP
    - (0, 0) in OpenCV
    """
    K = K.copy()
    K[..., 0, 2] -= 0.5
    K[..., 1, 2] -= 0.5
    return K
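

# Example (illustrative): only the principal point changes, and the input
# matrix is left untouched because the function copies it.
def _demo_colmap_to_opencv_intrinsics():
    K = np.array([[100.0, 0.0, 64.0], [0.0, 100.0, 48.0], [0.0, 0.0, 1.0]])
    K_cv = colmap_to_opencv_intrinsics(K)
    assert np.allclose(K_cv[:2, 2], [63.5, 47.5])
    assert np.allclose(K[:2, 2], [64.0, 48.0])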


def read_camera_parameters(filename):
    """Read a 4x4 extrinsic and a 3x3 intrinsic matrix from a text file."""
    with open(filename) as f:
        lines = f.readlines()
        lines = [line.rstrip() for line in lines]
    # extrinsics: lines [1, 5), 4x4 matrix
    extrinsics = np.fromstring(' '.join(lines[1:5]), dtype=np.float32, sep=' ').reshape((4, 4))
    # intrinsics: lines [7, 10), 3x3 matrix
    intrinsics = np.fromstring(' '.join(lines[7:10]), dtype=np.float32, sep=' ').reshape((3, 3))

    return intrinsics, extrinsics
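

# A usage sketch (illustrative; the exact file layout here is an assumption that
# matches the slicing above: a header line, the 4x4 extrinsic on lines 1-4, a
# blank line plus a header, then the 3x3 intrinsic on lines 7-9).
def _demo_read_camera_parameters():
    import tempfile

    def rows(M):
        return "\n".join(" ".join(str(v) for v in r) for r in M)

    content = "extrinsic\n" + rows(np.eye(4)) + "\n\nintrinsic\n" + rows(np.eye(3)) + "\n"
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
        f.write(content)
        path = f.name
    intrinsics, extrinsics = read_camera_parameters(path)
    assert extrinsics.shape == (4, 4) and intrinsics.shape == (3, 3)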