# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import torch

from lingbot_map.utils.geometry import closed_form_inverse_se3

from .rotation import mat_to_quat, quat_to_mat


def extri_intri_to_pose_encoding(
    extrinsics,
    intrinsics,
    image_size_hw=None,  # e.g., (256, 512)
    pose_encoding_type="absT_quaR_FoV",
):
    """Convert camera extrinsics and intrinsics to a compact pose encoding.

    This function transforms camera parameters into a unified pose encoding
    format, which can be used for various downstream tasks like pose
    prediction or representation.

    Args:
        extrinsics (torch.Tensor): Camera extrinsic parameters with shape
            BxSx3x4, where B is batch size and S is sequence length. In the
            OpenCV coordinate system (x-right, y-down, z-forward), representing
            the camera-from-world transformation. The format is [R|t], where R
            is a 3x3 rotation matrix and t is a 3x1 translation vector.
        intrinsics (torch.Tensor): Camera intrinsic parameters with shape
            BxSx3x3, defined in pixels, with format:
            [[fx, 0, cx], [0, fy, cy], [0, 0, 1]]
            where fx, fy are focal lengths and (cx, cy) is the principal point.
        image_size_hw (tuple): Tuple of (height, width) of the image in pixels.
            Required for computing field-of-view values. For example: (256, 512).
        pose_encoding_type (str): Type of pose encoding to use. Currently only
            "absT_quaR_FoV" (absolute translation, quaternion rotation,
            field of view) is supported.

    Returns:
        torch.Tensor: Encoded camera pose parameters with shape BxSx9.
            For the "absT_quaR_FoV" type, the 9 dimensions are:
            - [:3] = absolute translation vector T (3D)
            - [3:7] = rotation as quaternion quat (4D)
            - [7:] = field of view (2D)
    """
    # extrinsics: BxSx3x4
    # intrinsics: BxSx3x3
    if pose_encoding_type == "absT_quaR_FoV":
        R = extrinsics[:, :, :3, :3]  # BxSx3x3
        T = extrinsics[:, :, :3, 3]  # BxSx3
        quat = mat_to_quat(R)
        # Note the order of h and w here
        H, W = image_size_hw
        fov_h = 2 * torch.atan((H / 2) / intrinsics[..., 1, 1])
        fov_w = 2 * torch.atan((W / 2) / intrinsics[..., 0, 0])
        pose_encoding = torch.cat([T, quat, fov_h[..., None], fov_w[..., None]], dim=-1).float()
    else:
        raise NotImplementedError

    return pose_encoding
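
# Illustrative sketch (not part of the library API): encoding a batch of
# synthetic cameras with the function above. The batch/sequence sizes, focal
# length, and identity extrinsics are arbitrary assumptions for the example.
def _example_encode_cameras():
    """Encode synthetic cameras into the 9-D absT_quaR_FoV encoding."""
    B, S, H, W = 2, 4, 256, 512
    # Identity rotation, zero translation: camera-from-world [R|t] = [I|0].
    extrinsics = torch.eye(4)[:3].expand(B, S, 3, 4).contiguous()
    # Simple pinhole intrinsics with the principal point at the image center.
    intrinsics = torch.zeros(B, S, 3, 3)
    intrinsics[..., 0, 0] = 300.0  # fx (assumed focal length in pixels)
    intrinsics[..., 1, 1] = 300.0  # fy
    intrinsics[..., 0, 2] = W / 2
    intrinsics[..., 1, 2] = H / 2
    intrinsics[..., 2, 2] = 1.0
    pose_enc = extri_intri_to_pose_encoding(extrinsics, intrinsics, (H, W))
    assert pose_enc.shape == (B, S, 9)
    return pose_enc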

def pose_encoding_to_extri_intri(
    pose_encoding,
    image_size_hw=None,  # e.g., (256, 512)
    pose_encoding_type="absT_quaR_FoV",
    build_intrinsics=True,
):
    """Convert a pose encoding back to camera extrinsics and intrinsics.

    This function performs the inverse operation of extri_intri_to_pose_encoding,
    reconstructing the full camera parameters from the compact encoding.

    Args:
        pose_encoding (torch.Tensor): Encoded camera pose parameters with shape
            BxSx9, where B is batch size and S is sequence length.
            For the "absT_quaR_FoV" type, the 9 dimensions are:
            - [:3] = absolute translation vector T (3D)
            - [3:7] = rotation as quaternion quat (4D)
            - [7:] = field of view (2D)
        image_size_hw (tuple): Tuple of (height, width) of the image in pixels.
            Required for reconstructing intrinsics from field-of-view values.
            For example: (256, 512).
        pose_encoding_type (str): Type of pose encoding used. Supports
            "absT_quaR_FoV" (absolute translation, quaternion rotation,
            field of view) and "absT_quaR" (absolute translation and quaternion
            rotation only; no intrinsics are reconstructed).
        build_intrinsics (bool): Whether to reconstruct the intrinsics matrix.
            If False, only extrinsics are returned and intrinsics will be None.

    Returns:
        tuple: (extrinsics, intrinsics)
            - extrinsics (torch.Tensor): Camera extrinsic parameters with shape
              BxSx3x4. In the OpenCV coordinate system (x-right, y-down,
              z-forward), representing the camera-from-world transformation.
              The format is [R|t], where R is a 3x3 rotation matrix and t is
              a 3x1 translation vector.
            - intrinsics (torch.Tensor or None): Camera intrinsic parameters
              with shape BxSx3x3, or None if build_intrinsics is False.
              Defined in pixels, with format:
              [[fx, 0, cx], [0, fy, cy], [0, 0, 1]]
              where fx, fy are focal lengths and (cx, cy) is the principal
              point, assumed to be at the center of the image (W/2, H/2).
    """
    intrinsics = None

    if pose_encoding_type == "absT_quaR_FoV":
        T = pose_encoding[..., :3]
        quat = pose_encoding[..., 3:7]
        fov_h = pose_encoding[..., 7]
        fov_w = pose_encoding[..., 8]

        R = quat_to_mat(quat)
        extrinsics = torch.cat([R, T[..., None]], dim=-1)

        if build_intrinsics:
            H, W = image_size_hw
            fy = (H / 2.0) / torch.tan(fov_h / 2.0)
            fx = (W / 2.0) / torch.tan(fov_w / 2.0)
            intrinsics = torch.zeros(pose_encoding.shape[:2] + (3, 3), device=pose_encoding.device)
            intrinsics[..., 0, 0] = fx
            intrinsics[..., 1, 1] = fy
            intrinsics[..., 0, 2] = W / 2
            intrinsics[..., 1, 2] = H / 2
            intrinsics[..., 2, 2] = 1.0  # Set the homogeneous coordinate to 1
    elif pose_encoding_type == "absT_quaR":
        T = pose_encoding[..., :3]
        quat = pose_encoding[..., 3:7]
        R = quat_to_mat(quat)
        extrinsics = torch.cat([R, T[..., None]], dim=-1)
    else:
        raise NotImplementedError

    return extrinsics, intrinsics


def convert_pt3d_RT_to_opencv(Rot, Trans):
    """
    Convert PyTorch3D camera extrinsics to the OpenCV convention.

    Args:
        Rot: 3x3 rotation matrix in PyTorch3D format
        Trans: 3D translation vector in PyTorch3D format

    Returns:
        extri_opencv: 3x4 extrinsic matrix in OpenCV format
    """
    rot_pt3d = np.array(Rot)
    trans_pt3d = np.array(Trans)

    # PyTorch3D and OpenCV differ by a flip of the x and y axes, and
    # PyTorch3D stores the rotation transposed relative to OpenCV.
    trans_pt3d[:2] *= -1
    rot_pt3d[:, :2] *= -1
    rot_pt3d = rot_pt3d.transpose(1, 0)
    extri_opencv = np.hstack((rot_pt3d, trans_pt3d[:, None]))
    return extri_opencv


def build_pair_index(N, B=1):
    """
    Build indices for all possible pairs of frames.

    Args:
        N: Number of frames
        B: Batch size

    Returns:
        i1, i2: Indices for all possible pairs
    """
    i1_, i2_ = torch.combinations(torch.arange(N), 2, with_replacement=False).unbind(-1)
    i1, i2 = [(i[None] + torch.arange(B)[:, None] * N).reshape(-1) for i in [i1_, i2_]]
    return i1, i2


def rotation_angle(rot_gt, rot_pred, batch_size=None, eps=1e-15):
    """
    Calculate the rotation angle error between ground-truth and predicted rotations.

    Args:
        rot_gt: Ground truth rotation matrices
        rot_pred: Predicted rotation matrices
        batch_size: Batch size for reshaping the result
        eps: Small value to avoid numerical issues

    Returns:
        Rotation angle error in degrees
    """
    q_pred = mat_to_quat(rot_pred)
    q_gt = mat_to_quat(rot_gt)
    loss_q = (1 - (q_pred * q_gt).sum(dim=1) ** 2).clamp(min=eps)
    err_q = torch.arccos(1 - 2 * loss_q)

    rel_rangle_deg = err_q * 180 / np.pi

    if batch_size is not None:
        rel_rangle_deg = rel_rangle_deg.reshape(batch_size, -1)

    return rel_rangle_deg
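
# Illustrative sketch: round-tripping the encoding produced by
# _example_encode_cameras above. The image size must match the one used for
# encoding; the shape checks reflect that example's assumptions, not API contracts.
def _example_decode_cameras():
    """Decode the 9-D pose encoding back to camera parameters."""
    H, W = 256, 512
    pose_enc = _example_encode_cameras()
    extrinsics, intrinsics = pose_encoding_to_extri_intri(pose_enc, (H, W))
    assert extrinsics.shape == pose_enc.shape[:2] + (3, 4)
    assert intrinsics.shape == pose_enc.shape[:2] + (3, 3)
    return extrinsics, intrinsics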

def translation_angle(tvec_gt, tvec_pred, batch_size=None, ambiguity=True):
    """
    Calculate the translation angle error between ground-truth and predicted translations.

    Args:
        tvec_gt: Ground truth translation vectors
        tvec_pred: Predicted translation vectors
        batch_size: Batch size for reshaping the result
        ambiguity: Whether to handle direction ambiguity

    Returns:
        Translation angle error in degrees
    """
    rel_tangle_deg = compare_translation_by_angle(tvec_gt, tvec_pred)
    rel_tangle_deg = rel_tangle_deg * 180.0 / np.pi

    if ambiguity:
        rel_tangle_deg = torch.min(rel_tangle_deg, (180 - rel_tangle_deg).abs())

    if batch_size is not None:
        rel_tangle_deg = rel_tangle_deg.reshape(batch_size, -1)

    return rel_tangle_deg


def compare_translation_by_angle(t_gt, t, eps=1e-15, default_err=1e6):
    """
    Normalize the translation vectors and compute the angle between them.

    Args:
        t_gt: Ground truth translation vectors
        t: Predicted translation vectors
        eps: Small value to avoid division by zero
        default_err: Default error value for invalid cases

    Returns:
        Angular error between translation vectors in radians
    """
    t_norm = torch.norm(t, dim=1, keepdim=True)
    t = t / (t_norm + eps)

    t_gt_norm = torch.norm(t_gt, dim=1, keepdim=True)
    t_gt = t_gt / (t_gt_norm + eps)

    loss_t = torch.clamp_min(1.0 - torch.sum(t * t_gt, dim=1) ** 2, eps)
    err_t = torch.acos(torch.sqrt(1 - loss_t))

    err_t[torch.isnan(err_t) | torch.isinf(err_t)] = default_err
    return err_t


def calculate_auc_np(r_error, t_error, max_threshold=30):
    """
    Calculate the Area Under the Curve (AUC) for the given error arrays using NumPy.

    Args:
        r_error: numpy array of rotation error values (degrees)
        t_error: numpy array of translation error values (degrees)
        max_threshold: Maximum threshold value for binning the histogram

    Returns:
        AUC value and the normalized histogram
    """
    error_matrix = np.concatenate((r_error[:, None], t_error[:, None]), axis=1)
    max_errors = np.max(error_matrix, axis=1)
    bins = np.arange(max_threshold + 1)
    histogram, _ = np.histogram(max_errors, bins=bins)
    num_pairs = float(len(max_errors))
    normalized_histogram = histogram.astype(float) / num_pairs
    return np.mean(np.cumsum(normalized_histogram)), normalized_histogram


def se3_to_relative_pose_error(pred_se3, gt_se3, num_frames):
    """
    Compute rotation and translation errors between predicted and ground-truth poses.

    This function assumes the input poses are world-to-camera (w2c) transformations.

    Args:
        pred_se3: Predicted SE(3) transformations (w2c), shape (N, 4, 4)
        gt_se3: Ground truth SE(3) transformations (w2c), shape (N, 4, 4)
        num_frames: Number of frames (N)

    Returns:
        Rotation and translation angle errors in degrees
    """
    pair_idx_i1, pair_idx_i2 = build_pair_index(num_frames)

    # Relative pose between each pair of frames: T_i * inv(T_j).
    relative_pose_gt = gt_se3[pair_idx_i1].bmm(closed_form_inverse_se3(gt_se3[pair_idx_i2]))
    relative_pose_pred = pred_se3[pair_idx_i1].bmm(closed_form_inverse_se3(pred_se3[pair_idx_i2]))

    rel_rangle_deg = rotation_angle(relative_pose_gt[:, :3, :3], relative_pose_pred[:, :3, :3])
    rel_tangle_deg = translation_angle(relative_pose_gt[:, :3, 3], relative_pose_pred[:, :3, 3])

    return rel_rangle_deg, rel_tangle_deg
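
# Illustrative sketch (synthetic inputs): evaluating the relative pose error
# and its AUC on randomly perturbed poses. The frame count and noise scale are
# arbitrary assumptions for the example, not a prescribed evaluation protocol.
def _example_relative_pose_error():
    """Compare random ground-truth w2c poses against a perturbed copy."""
    num_frames = 5
    gt_se3 = torch.eye(4).repeat(num_frames, 1, 1)
    # Distinct translations so the pairwise relative translations are nonzero.
    gt_se3[:, :3, 3] = torch.randn(num_frames, 3)
    pred_se3 = gt_se3.clone()
    pred_se3[:, :3, 3] += 0.01 * torch.randn(num_frames, 3)
    rel_rangle_deg, rel_tangle_deg = se3_to_relative_pose_error(pred_se3, gt_se3, num_frames)
    auc, _ = calculate_auc_np(rel_rangle_deg.numpy(), rel_tangle_deg.numpy(), max_threshold=30)
    return auc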

def colmap_to_opencv_intrinsics(K):
    """
    Modify camera intrinsics to follow a different convention.

    Coordinates of the center of the top-left pixel are by default:
    - (0.5, 0.5) in COLMAP
    - (0, 0) in OpenCV
    """
    K = K.copy()
    K[..., 0, 2] -= 0.5
    K[..., 1, 2] -= 0.5
    return K


def read_camera_parameters(filename):
    """Read a 4x4 extrinsic matrix and a 3x3 intrinsic matrix from a camera parameter text file."""
    with open(filename) as f:
        lines = [line.rstrip() for line in f.readlines()]
    # extrinsics: lines [1, 5), 4x4 matrix
    extrinsics = np.fromstring(' '.join(lines[1:5]), dtype=np.float32, sep=' ').reshape((4, 4))
    # intrinsics: lines [7, 10), 3x3 matrix
    intrinsics = np.fromstring(' '.join(lines[7:10]), dtype=np.float32, sep=' ').reshape((3, 3))

    return intrinsics, extrinsics
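
# Illustrative sketch (hypothetical path): loading a camera file in the layout
# read above and shifting its intrinsics to the OpenCV pixel-center convention,
# assuming the file's intrinsics follow the COLMAP convention. The filename is
# a placeholder, not a file shipped with this repository.
def _example_load_camera(filename="path/to/00000000_cam.txt"):
    """Read camera parameters from disk and convert the intrinsics convention."""
    intrinsics, extrinsics = read_camera_parameters(filename)
    intrinsics_cv = colmap_to_opencv_intrinsics(intrinsics)
    return intrinsics_cv, extrinsics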