// ----------------------------------------------------------------------------
// - Open3D: www.open3d.org -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------
#include "open3d/core/linalg/kernel/SVD3x3.h"
#include "open3d/t/geometry/kernel/GeometryIndexer.h"
#include "open3d/t/pipelines/kernel/FillInLinearSystem.h"
namespace open3d {
namespace t {
namespace pipelines {
namespace kernel {
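// Fills in the 12 x 12 sub-block of the Gauss-Newton system (AtA, Atb)
// coupling the poses of fragments i and j. Each correspondence contributes
// a point-to-plane residual r = (Ti p - Tj q) . (Ri n_p); correspondences
// with |r| above the threshold are rejected as outliers.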
#if defined(__CUDACC__)
void FillInRigidAlignmentTermCUDA
#else
void FillInRigidAlignmentTermCPU
#endif
(core::Tensor &AtA,
core::Tensor &Atb,
core::Tensor &residual,
const core::Tensor &Ti_ps,
const core::Tensor &Tj_qs,
const core::Tensor &Ri_normal_ps,
int i,
int j,
float threshold) {
core::Device device = AtA.GetDevice();
int64_t n = Ti_ps.GetLength();
if (Tj_qs.GetLength() != n || Ri_normal_ps.GetLength() != n) {
utility::LogError(
"Unable to setup linear system: input length mismatch.");
}
// First fill in a small 12 x 12 linear system
core::Tensor AtA_local =
core::Tensor::Zeros({12, 12}, core::Float32, device);
core::Tensor Atb_local = core::Tensor::Zeros({12}, core::Float32, device);
float *AtA_local_ptr = static_cast<float *>(AtA_local.GetDataPtr());
float *Atb_local_ptr = static_cast<float *>(Atb_local.GetDataPtr());
float *residual_ptr = static_cast<float *>(residual.GetDataPtr());
const float *Ti_ps_ptr = static_cast<const float *>(Ti_ps.GetDataPtr());
const float *Tj_qs_ptr = static_cast<const float *>(Tj_qs.GetDataPtr());
const float *Ri_normal_ps_ptr =
static_cast<const float *>(Ri_normal_ps.GetDataPtr());
core::ParallelFor(
AtA.GetDevice(), n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
const float *p_prime = Ti_ps_ptr + 3 * workload_idx;
const float *q_prime = Tj_qs_ptr + 3 * workload_idx;
const float *normal_p_prime =
Ri_normal_ps_ptr + 3 * workload_idx;
float r = (p_prime[0] - q_prime[0]) * normal_p_prime[0] +
(p_prime[1] - q_prime[1]) * normal_p_prime[1] +
(p_prime[2] - q_prime[2]) * normal_p_prime[2];
// Reject outlier correspondences.
if (fabsf(r) > threshold) return;
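// Point-to-plane Jacobian for the two poses: entries 0-2 hold the
// rotational part cross(Tj q, Ri n_p), entries 3-5 the translational
// part Ri n_p, and entries 6-11 the negated counterparts for pose j.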
float J_ij[12];
J_ij[0] = -q_prime[2] * normal_p_prime[1] +
q_prime[1] * normal_p_prime[2];
J_ij[1] = q_prime[2] * normal_p_prime[0] -
q_prime[0] * normal_p_prime[2];
J_ij[2] = -q_prime[1] * normal_p_prime[0] +
q_prime[0] * normal_p_prime[1];
J_ij[3] = normal_p_prime[0];
J_ij[4] = normal_p_prime[1];
J_ij[5] = normal_p_prime[2];
for (int k = 0; k < 6; ++k) {
J_ij[k + 6] = -J_ij[k];
}
// Not optimized; switch to a reduction if this becomes a bottleneck.
#if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
for (int i_local = 0; i_local < 12; ++i_local) {
for (int j_local = 0; j_local < 12; ++j_local) {
atomicAdd(&AtA_local_ptr[i_local * 12 + j_local],
J_ij[i_local] * J_ij[j_local]);
}
atomicAdd(&Atb_local_ptr[i_local], J_ij[i_local] * r);
}
atomicAdd(residual_ptr, r * r);
#else
#pragma omp critical(FillInRigidAlignmentTermCPU)
{
for (int i_local = 0; i_local < 12; ++i_local) {
for (int j_local = 0; j_local < 12; ++j_local) {
AtA_local_ptr[i_local * 12 + j_local]
+= J_ij[i_local] * J_ij[j_local];
}
Atb_local_ptr[i_local] += J_ij[i_local] * r;
}
*residual_ptr += r * r;
}
#endif
});
// Then scatter the local 12 x 12 system into the large linear system
std::vector<int64_t> indices_vec(12);
for (int k = 0; k < 6; ++k) {
indices_vec[k] = i * 6 + k;
indices_vec[k + 6] = j * 6 + k;
}
std::vector<int64_t> indices_i_vec;
std::vector<int64_t> indices_j_vec;
for (int local_i = 0; local_i < 12; ++local_i) {
for (int local_j = 0; local_j < 12; ++local_j) {
indices_i_vec.push_back(indices_vec[local_i]);
indices_j_vec.push_back(indices_vec[local_j]);
}
}
core::Tensor indices(indices_vec, {12}, core::Int64, device);
core::Tensor indices_i(indices_i_vec, {12 * 12}, core::Int64, device);
core::Tensor indices_j(indices_j_vec, {12 * 12}, core::Int64, device);
core::Tensor AtA_sub = AtA.IndexGet({indices_i, indices_j});
AtA.IndexSet({indices_i, indices_j}, AtA_sub + AtA_local.View({12 * 12}));
core::Tensor Atb_sub = Atb.IndexGet({indices});
Atb.IndexSet({indices}, Atb_sub + Atb_local.View({12, 1}));
}
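// Fills in the SLAC alignment term. Each correspondence contributes a
// 60-dim Jacobian: 6 pose parameters for each of fragments i and j, plus
// 3 coordinates for each of the 8 control-grid corners that interpolate
// p and q, scattered into the global system below.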
#if defined(__CUDACC__)
void FillInSLACAlignmentTermCUDA
#else
void FillInSLACAlignmentTermCPU
#endif
(core::Tensor &AtA,
core::Tensor &Atb,
core::Tensor &residual,
const core::Tensor &Ti_Cps,
const core::Tensor &Tj_Cqs,
const core::Tensor &Cnormal_ps,
const core::Tensor &Ri_Cnormal_ps,
const core::Tensor &RjT_Ri_Cnormal_ps,
const core::Tensor &cgrid_idx_ps,
const core::Tensor &cgrid_idx_qs,
const core::Tensor &cgrid_ratio_qs,
const core::Tensor &cgrid_ratio_ps,
int i,
int j,
int n_frags,
float threshold) {
int64_t n = Ti_Cps.GetLength();
if (Tj_Cqs.GetLength() != n || Cnormal_ps.GetLength() != n ||
Ri_Cnormal_ps.GetLength() != n || RjT_Ri_Cnormal_ps.GetLength() != n ||
cgrid_idx_ps.GetLength() != n || cgrid_ratio_ps.GetLength() != n ||
cgrid_idx_qs.GetLength() != n || cgrid_ratio_qs.GetLength() != n) {
utility::LogError(
"Unable to setup linear system: input length mismatch.");
}
const int64_t n_vars = Atb.GetLength();
float *AtA_ptr = static_cast<float *>(AtA.GetDataPtr());
float *Atb_ptr = static_cast<float *>(Atb.GetDataPtr());
float *residual_ptr = static_cast<float *>(residual.GetDataPtr());
// Geometric properties
const float *Ti_Cps_ptr = static_cast<const float *>(Ti_Cps.GetDataPtr());
const float *Tj_Cqs_ptr = static_cast<const float *>(Tj_Cqs.GetDataPtr());
const float *Cnormal_ps_ptr =
static_cast<const float *>(Cnormal_ps.GetDataPtr());
const float *Ri_Cnormal_ps_ptr =
static_cast<const float *>(Ri_Cnormal_ps.GetDataPtr());
const float *RjT_Ri_Cnormal_ps_ptr =
static_cast<const float *>(RjT_Ri_Cnormal_ps.GetDataPtr());
// Association properties
const int *cgrid_idx_ps_ptr =
static_cast<const int *>(cgrid_idx_ps.GetDataPtr());
const int *cgrid_idx_qs_ptr =
static_cast<const int *>(cgrid_idx_qs.GetDataPtr());
const float *cgrid_ratio_ps_ptr =
static_cast<const float *>(cgrid_ratio_ps.GetDataPtr());
const float *cgrid_ratio_qs_ptr =
static_cast<const float *>(cgrid_ratio_qs.GetDataPtr());
core::ParallelFor(
AtA.GetDevice(), n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx;
const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx;
const float *Cnormal_p = Cnormal_ps_ptr + 3 * workload_idx;
const float *Ri_Cnormal_p =
Ri_Cnormal_ps_ptr + 3 * workload_idx;
const float *RjTRi_Cnormal_p =
RjT_Ri_Cnormal_ps_ptr + 3 * workload_idx;
const int *cgrid_idx_p = cgrid_idx_ps_ptr + 8 * workload_idx;
const int *cgrid_idx_q = cgrid_idx_qs_ptr + 8 * workload_idx;
const float *cgrid_ratio_p =
cgrid_ratio_ps_ptr + 8 * workload_idx;
const float *cgrid_ratio_q =
cgrid_ratio_qs_ptr + 8 * workload_idx;
float r = (Ti_Cp[0] - Tj_Cq[0]) * Ri_Cnormal_p[0] +
(Ti_Cp[1] - Tj_Cq[1]) * Ri_Cnormal_p[1] +
(Ti_Cp[2] - Tj_Cq[2]) * Ri_Cnormal_p[2];
// Reject outlier correspondences.
if (fabsf(r) > threshold) return;
// Now we fill in a 60 x 60 sub-matrix: 2 x (6 + 8 x 3)
float J[60];
int idx[60];
// Jacobian w.r.t. Ti: 0-6
J[0] = -Tj_Cq[2] * Ri_Cnormal_p[1] + Tj_Cq[1] * Ri_Cnormal_p[2];
J[1] = Tj_Cq[2] * Ri_Cnormal_p[0] - Tj_Cq[0] * Ri_Cnormal_p[2];
J[2] = -Tj_Cq[1] * Ri_Cnormal_p[0] + Tj_Cq[0] * Ri_Cnormal_p[1];
J[3] = Ri_Cnormal_p[0];
J[4] = Ri_Cnormal_p[1];
J[5] = Ri_Cnormal_p[2];
// Jacobian w.r.t. Tj: 6-12
for (int k = 0; k < 6; ++k) {
J[k + 6] = -J[k];
idx[k + 0] = 6 * i + k;
idx[k + 6] = 6 * j + k;
}
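// Global variable layout: 6 pose parameters per fragment come first,
// followed by 3 coordinates per control-grid point; hence the
// 6 * n_frags offset in the grid indices below.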
// Jacobian w.r.t. C over p: 12-36
for (int k = 0; k < 8; ++k) {
J[12 + k * 3 + 0] = cgrid_ratio_p[k] * Cnormal_p[0];
J[12 + k * 3 + 1] = cgrid_ratio_p[k] * Cnormal_p[1];
J[12 + k * 3 + 2] = cgrid_ratio_p[k] * Cnormal_p[2];
idx[12 + k * 3 + 0] = 6 * n_frags + cgrid_idx_p[k] * 3 + 0;
idx[12 + k * 3 + 1] = 6 * n_frags + cgrid_idx_p[k] * 3 + 1;
idx[12 + k * 3 + 2] = 6 * n_frags + cgrid_idx_p[k] * 3 + 2;
}
// Jacobian w.r.t. C over q: 36-60
for (int k = 0; k < 8; ++k) {
J[36 + k * 3 + 0] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[0];
J[36 + k * 3 + 1] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[1];
J[36 + k * 3 + 2] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[2];
idx[36 + k * 3 + 0] = 6 * n_frags + cgrid_idx_q[k] * 3 + 0;
idx[36 + k * 3 + 1] = 6 * n_frags + cgrid_idx_q[k] * 3 + 1;
idx[36 + k * 3 + 2] = 6 * n_frags + cgrid_idx_q[k] * 3 + 2;
}
// Not optimized; switch to a reduction if this becomes a bottleneck.
#if defined(__CUDACC__)
for (int ki = 0; ki < 60; ++ki) {
for (int kj = 0; kj < 60; ++kj) {
float AtA_ij = J[ki] * J[kj];
const int64_t ij = idx[ki] * n_vars + idx[kj];
atomicAdd(AtA_ptr + ij, AtA_ij);
}
float Atb_i = J[ki] * r;
atomicAdd(Atb_ptr + idx[ki], Atb_i);
}
atomicAdd(residual_ptr, r * r);
#else
#pragma omp critical(FillInSLACAlignmentTermCPU)
{
for (int ki = 0; ki < 60; ++ki) {
for (int kj = 0; kj < 60; ++kj) {
AtA_ptr[idx[ki] * n_vars + idx[kj]]
+= J[ki] * J[kj];
}
Atb_ptr[idx[ki]] += J[ki] * r;
}
*residual_ptr += r * r;
}
#endif
});
}
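// Fills in the SLAC regularizer term. For every control-grid point, the
// best-fit local rotation R between its initial and current 6-neighbor
// offsets is estimated via SVD (orthogonal Procrustes), and the
// as-rigid-as-possible residual diff_curr - R * diff_init is penalized
// for every valid neighbor.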
#if defined(__CUDACC__)
void FillInSLACRegularizerTermCUDA
#else
void FillInSLACRegularizerTermCPU
#endif
(core::Tensor &AtA,
core::Tensor &Atb,
core::Tensor &residual,
const core::Tensor &grid_idx,
const core::Tensor &grid_nbs_idx,
const core::Tensor &grid_nbs_mask,
const core::Tensor &positions_init,
const core::Tensor &positions_curr,
float weight,
int n_frags,
int anchor_idx) {
int64_t n = grid_idx.GetLength();
int64_t n_vars = Atb.GetLength();
float *AtA_ptr = static_cast<float *>(AtA.GetDataPtr());
float *Atb_ptr = static_cast<float *>(Atb.GetDataPtr());
float *residual_ptr = static_cast<float *>(residual.GetDataPtr());
const int *grid_idx_ptr = static_cast<const int *>(grid_idx.GetDataPtr());
const int *grid_nbs_idx_ptr =
static_cast<const int *>(grid_nbs_idx.GetDataPtr());
const bool *grid_nbs_mask_ptr =
static_cast<const bool *>(grid_nbs_mask.GetDataPtr());
const float *positions_init_ptr =
static_cast<const float *>(positions_init.GetDataPtr());
const float *positions_curr_ptr =
static_cast<const float *>(positions_curr.GetDataPtr());
core::ParallelFor(
AtA.GetDevice(), n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
// Enumerate 6 neighbors
int idx_i = grid_idx_ptr[workload_idx];
const int *idx_nbs = grid_nbs_idx_ptr + 6 * workload_idx;
const bool *mask_nbs = grid_nbs_mask_ptr + 6 * workload_idx;
// Accumulate a 3x3 covariance matrix to estimate the local rotation R
float cov[3][3] = {{0}};
float U[3][3], V[3][3], S[3];
int cnt = 0;
for (int k = 0; k < 6; ++k) {
bool mask_k = mask_nbs[k];
if (!mask_k) continue;
int idx_k = idx_nbs[k];
// Offsets to neighbor k in the initial and current grids
float diff_ik_init[3] = {
positions_init_ptr[idx_i * 3 + 0] -
positions_init_ptr[idx_k * 3 + 0],
positions_init_ptr[idx_i * 3 + 1] -
positions_init_ptr[idx_k * 3 + 1],
positions_init_ptr[idx_i * 3 + 2] -
positions_init_ptr[idx_k * 3 + 2]};
float diff_ik_curr[3] = {
positions_curr_ptr[idx_i * 3 + 0] -
positions_curr_ptr[idx_k * 3 + 0],
positions_curr_ptr[idx_i * 3 + 1] -
positions_curr_ptr[idx_k * 3 + 1],
positions_curr_ptr[idx_i * 3 + 2] -
positions_curr_ptr[idx_k * 3 + 2]};
// Build the system by accumulating the covariance X * Y^T for the
// formulation Y = R * X (Y: curr, X: init)
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 3; ++j) {
cov[i][j] += diff_ik_init[i] * diff_ik_curr[j];
}
}
++cnt;
}
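// Skip under-constrained neighborhoods: fewer than 3 valid neighbors
// do not give a reliable rotation estimate.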
if (cnt < 3) {
return;
}
core::linalg::kernel::svd3x3(*cov, *U, S, *V);
float R[3][3];
core::linalg::kernel::transpose3x3_(*U);
core::linalg::kernel::matmul3x3_3x3(*V, *U, *R);
float d = core::linalg::kernel::det3x3(*R);
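// det(R) < 0 indicates a reflection; apply the usual sign correction
// R = V * diag(1, 1, -1) * U^T by negating the third row of the stored
// U^T before re-multiplying.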
if (d < 0) {
U[2][0] = -U[2][0];
U[2][1] = -U[2][1];
U[2][2] = -U[2][2];
core::linalg::kernel::matmul3x3_3x3(*V, *U, *R);
}
// Now that we have R, build the Hessian and residuals.
// But first, anchor one point by pinning its local rotation to identity.
if (idx_i == anchor_idx) {
R[0][0] = R[1][1] = R[2][2] = 1;
R[0][1] = R[0][2] = R[1][0] = R[1][2] = R[2][0] = R[2][1] =
0;
}
for (int k = 0; k < 6; ++k) {
bool mask_k = mask_nbs[k];
if (mask_k) {
int idx_k = idx_nbs[k];
float diff_ik_init[3] = {
positions_init_ptr[idx_i * 3 + 0] -
positions_init_ptr[idx_k * 3 + 0],
positions_init_ptr[idx_i * 3 + 1] -
positions_init_ptr[idx_k * 3 + 1],
positions_init_ptr[idx_i * 3 + 2] -
positions_init_ptr[idx_k * 3 + 2]};
float diff_ik_curr[3] = {
positions_curr_ptr[idx_i * 3 + 0] -
positions_curr_ptr[idx_k * 3 + 0],
positions_curr_ptr[idx_i * 3 + 1] -
positions_curr_ptr[idx_k * 3 + 1],
positions_curr_ptr[idx_i * 3 + 2] -
positions_curr_ptr[idx_k * 3 + 2]};
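// As-rigid-as-possible residual: the current offset minus the rotated
// initial offset.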
float R_diff_ik_curr[3];
core::linalg::kernel::matmul3x3_3x1(*R, diff_ik_init,
R_diff_ik_curr);
float local_r[3];
local_r[0] = diff_ik_curr[0] - R_diff_ik_curr[0];
local_r[1] = diff_ik_curr[1] - R_diff_ik_curr[1];
local_r[2] = diff_ik_curr[2] - R_diff_ik_curr[2];
int offset_idx_i = 3 * idx_i + 6 * n_frags;
int offset_idx_k = 3 * idx_k + 6 * n_frags;
#if defined(__CUDACC__)
// Update residual
atomicAdd(residual_ptr,
weight * (local_r[0] * local_r[0] +
local_r[1] * local_r[1] +
local_r[2] * local_r[2]));
for (int axis = 0; axis < 3; ++axis) {
// Update AtA: 2x2
atomicAdd(&AtA_ptr[(offset_idx_i + axis) * n_vars +
offset_idx_i + axis],
weight);
atomicAdd(&AtA_ptr[(offset_idx_k + axis) * n_vars +
offset_idx_k + axis],
weight);
atomicAdd(&AtA_ptr[(offset_idx_i + axis) * n_vars +
offset_idx_k + axis],
-weight);
atomicAdd(&AtA_ptr[(offset_idx_k + axis) * n_vars +
offset_idx_i + axis],
-weight);
// Update Atb: 2x1
atomicAdd(&Atb_ptr[offset_idx_i + axis],
+weight * local_r[axis]);
atomicAdd(&Atb_ptr[offset_idx_k + axis],
-weight * local_r[axis]);
}
#else
#pragma omp critical(FillInSLACRegularizerTermCPU)
{
// Update residual
*residual_ptr += weight * (local_r[0] * local_r[0] +
local_r[1] * local_r[1] +
local_r[2] * local_r[2]);
for (int axis = 0; axis < 3; ++axis) {
// Update AtA: 2x2
AtA_ptr[(offset_idx_i + axis) * n_vars +
offset_idx_i + axis] += weight;
AtA_ptr[(offset_idx_k + axis) * n_vars +
offset_idx_k + axis] += weight;
AtA_ptr[(offset_idx_i + axis) * n_vars +
offset_idx_k + axis] -= weight;
AtA_ptr[(offset_idx_k + axis) * n_vars +
offset_idx_i + axis] -= weight;
// Update Atb: 2x1
Atb_ptr[offset_idx_i + axis] += weight * local_r[axis];
Atb_ptr[offset_idx_k + axis] -= weight * local_r[axis];
}
}
#endif
}
}
});
}
} // namespace kernel
} // namespace pipelines
} // namespace t
} // namespace open3d