image_framework_ymj/include/open3d/t/geometry/kernel/VoxelBlockGridImpl.h
2024-12-06 16:25:16 +08:00

1695 lines
65 KiB
C++
Executable File

// ----------------------------------------------------------------------------
// - Open3D: www.open3d.org -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------
#include <atomic>
#include <cmath>
#include "open3d/core/Dispatch.h"
#include "open3d/core/Dtype.h"
#include "open3d/core/MemoryManager.h"
#include "open3d/core/SizeVector.h"
#include "open3d/core/Tensor.h"
#include "open3d/core/hashmap/Dispatch.h"
#include "open3d/t/geometry/Utility.h"
#include "open3d/t/geometry/kernel/GeometryIndexer.h"
#include "open3d/t/geometry/kernel/GeometryMacros.h"
#include "open3d/t/geometry/kernel/VoxelBlockGrid.h"
#include "open3d/utility/Logging.h"
#include "open3d/utility/Timer.h"
namespace open3d {
namespace t {
namespace geometry {
namespace kernel {
namespace voxel_grid {
using index_t = int;
using ArrayIndexer = TArrayIndexer<index_t>;
#if defined(__CUDACC__)
void GetVoxelCoordinatesAndFlattenedIndicesCUDA
#else
void GetVoxelCoordinatesAndFlattenedIndicesCPU
#endif
(const core::Tensor& buf_indices,
const core::Tensor& block_keys,
core::Tensor& voxel_coords,
core::Tensor& flattened_indices,
index_t resolution,
float voxel_size) {
    // For every voxel of every active block, write its world coordinate in
    // meters (voxel_coords) and its flattened index into the dense per-voxel
    // value buffers (flattened_indices).
    const core::Device device = buf_indices.GetDevice();
    const index_t num_entries = flattened_indices.GetLength();
    const index_t voxels_per_block = resolution * resolution * resolution;

    const index_t* buf_indices_ptr = buf_indices.GetDataPtr<index_t>();
    const index_t* keys_ptr = block_keys.GetDataPtr<index_t>();
    float* coords_out = voxel_coords.GetDataPtr<float>();
    int64_t* flat_out = flattened_indices.GetDataPtr<int64_t>();

    // Maps a linear in-block voxel index to (x, y, z) offsets.
    ArrayIndexer voxel_indexer({resolution, resolution, resolution});

    core::ParallelFor(device, num_entries, [=] OPEN3D_DEVICE(index_t w_idx) {
        const index_t block_idx = buf_indices_ptr[w_idx / voxels_per_block];
        const index_t voxel_idx = w_idx % voxels_per_block;

        // Block key -> integer block coordinate.
        const index_t key_base = block_idx * 3;
        const index_t bx = keys_ptr[key_base + 0];
        const index_t by = keys_ptr[key_base + 1];
        const index_t bz = keys_ptr[key_base + 2];

        // Voxel offset within its block.
        index_t vx, vy, vz;
        voxel_indexer.WorkloadToCoord(voxel_idx, &vx, &vy, &vz);

        // Flattened position in the dense value buffers.
        flat_out[w_idx] = block_idx * voxels_per_block + voxel_idx;

        // World-space voxel coordinate (meters).
        const index_t out_base = w_idx * 3;
        coords_out[out_base + 0] = (bx * resolution + vx) * voxel_size;
        coords_out[out_base + 1] = (by * resolution + vy) * voxel_size;
        coords_out[out_base + 2] = (bz * resolution + vz) * voxel_size;
    });
}
// Resolve a voxel offset (xo, yo, zo) -- possibly one step outside the
// current block -- into a flattened index in the dense value buffers, using
// the precomputed 3x3x3 neighbor-block mask/index tables.
// Returns -1 when the required neighbor block is absent.
inline OPEN3D_DEVICE index_t
DeviceGetLinearIdx(index_t xo,
                   index_t yo,
                   index_t zo,
                   index_t curr_block_idx,
                   index_t resolution,
                   const ArrayIndexer& nb_block_masks_indexer,
                   const ArrayIndexer& nb_block_indices_indexer) {
    // Wrap each offset into [0, resolution).
    const index_t wrapped_x = (xo + resolution) % resolution;
    const index_t wrapped_y = (yo + resolution) % resolution;
    const index_t wrapped_z = (zo + resolution) % resolution;

    // Per-axis step (-1, 0, or +1) toward the block the offset falls in.
    const index_t step_x = Sign(xo - wrapped_x);
    const index_t step_y = Sign(yo - wrapped_y);
    const index_t step_z = Sign(zo - wrapped_z);

    // Index into the 27-entry neighbor table (center is 13).
    const index_t nb_idx = (step_x + 1) + (step_y + 1) * 3 + (step_z + 1) * 9;

    const bool nb_valid =
            *nb_block_masks_indexer.GetDataPtr<bool>(curr_block_idx, nb_idx);
    if (!nb_valid) {
        return -1;
    }

    const index_t nb_block_idx = *nb_block_indices_indexer.GetDataPtr<index_t>(
            curr_block_idx, nb_idx);
    // Flattened index: block, then z-major voxel layout within the block.
    return (((nb_block_idx * resolution) + wrapped_z) * resolution +
            wrapped_y) *
                   resolution +
           wrapped_x;
}
// Estimate the (unnormalized) TSDF gradient at voxel (xo, yo, zo) of block
// `curr_block_idx` via central differences over the 6 axis neighbors,
// looked up through the precomputed neighbor-block tables.
// A component is written only when BOTH of its +/- neighbors exist, so the
// caller must pre-initialize n[0..2] (callers in this file zero-init).
template <typename tsdf_t>
inline OPEN3D_DEVICE void DeviceGetNormal(
        const tsdf_t* tsdf_base_ptr,
        index_t xo,
        index_t yo,
        index_t zo,
        index_t curr_block_idx,
        float* n,
        index_t resolution,
        const ArrayIndexer& nb_block_masks_indexer,
        const ArrayIndexer& nb_block_indices_indexer) {
    // Map a (possibly out-of-block) offset to a flattened voxel index;
    // -1 marks a missing neighbor block.
    auto GetLinearIdx = [&] OPEN3D_DEVICE(index_t xo, index_t yo,
                                          index_t zo) -> index_t {
        return DeviceGetLinearIdx(xo, yo, zo, curr_block_idx, resolution,
                                  nb_block_masks_indexer,
                                  nb_block_indices_indexer);
    };
    index_t vxp = GetLinearIdx(xo + 1, yo, zo);
    index_t vxn = GetLinearIdx(xo - 1, yo, zo);
    index_t vyp = GetLinearIdx(xo, yo + 1, zo);
    index_t vyn = GetLinearIdx(xo, yo - 1, zo);
    index_t vzp = GetLinearIdx(xo, yo, zo + 1);
    index_t vzn = GetLinearIdx(xo, yo, zo - 1);
    // Central differences (no 1/(2*voxel_size) scaling; callers normalize).
    if (vxp >= 0 && vxn >= 0) n[0] = tsdf_base_ptr[vxp] - tsdf_base_ptr[vxn];
    if (vyp >= 0 && vyn >= 0) n[1] = tsdf_base_ptr[vyp] - tsdf_base_ptr[vyn];
    if (vzp >= 0 && vzn >= 0) n[2] = tsdf_base_ptr[vzp] - tsdf_base_ptr[vzn];
}  // NOTE: was terminated with a stray ';' (benign but trips -Wextra-semi).
// Fuse one RGB-D frame into the TSDF voxel grid. Every voxel of every block
// listed in `indices` is transformed into the camera frame, projected into
// the depth image, and the observed signed distance (truncated to sdf_trunc,
// normalized to [-1, 1]) is folded into the running weighted average stored
// in the "tsdf"/"weight" buffers. When a color image and a "color" buffer
// are both present, per-channel colors are averaged with the same weights.
// NOTE(review): assumes the dense value buffers store blocks contiguously
// with resolution^3 voxels each, matching the linear_idx computed below.
template <typename input_depth_t,
typename input_color_t,
typename tsdf_t,
typename weight_t,
typename color_t>
#if defined(__CUDACC__)
void IntegrateCUDA
#else
void IntegrateCPU
#endif
(const core::Tensor& depth,
const core::Tensor& color,
const core::Tensor& indices,
const core::Tensor& block_keys,
TensorMap& block_value_map,
const core::Tensor& depth_intrinsic,
const core::Tensor& color_intrinsic,
const core::Tensor& extrinsics,
index_t resolution,
float voxel_size,
float sdf_trunc,
float depth_scale,
float depth_max) {
// Parameters
index_t resolution2 = resolution * resolution;
index_t resolution3 = resolution2 * resolution;
// Voxel-grid -> camera transform; voxel_size scales integer voxel
// coordinates to meters.
TransformIndexer transform_indexer(depth_intrinsic, extrinsics, voxel_size);
// Color projection uses the color intrinsics with an identity extrinsic:
// points are re-projected from the depth camera frame (see Unproject below).
TransformIndexer colormap_indexer(
color_intrinsic,
core::Tensor::Eye(4, core::Dtype::Float64, core::Device("CPU:0")));
ArrayIndexer voxel_indexer({resolution, resolution, resolution});
ArrayIndexer block_keys_indexer(block_keys, 1);
ArrayIndexer depth_indexer(depth, 2);
core::Device device = block_keys.GetDevice();
const index_t* indices_ptr = indices.GetDataPtr<index_t>();
if (!block_value_map.Contains("tsdf") ||
!block_value_map.Contains("weight")) {
utility::LogError(
"TSDF and/or weight not allocated in blocks, please implement "
"customized integration.");
}
tsdf_t* tsdf_base_ptr = block_value_map.at("tsdf").GetDataPtr<tsdf_t>();
weight_t* weight_base_ptr =
block_value_map.at("weight").GetDataPtr<weight_t>();
// Color integration is optional: needs both the buffer and a non-empty image.
bool integrate_color =
block_value_map.Contains("color") && color.NumElements() > 0;
color_t* color_base_ptr = nullptr;
ArrayIndexer color_indexer;
float color_multiplier = 1.0;
if (integrate_color) {
color_base_ptr = block_value_map.at("color").GetDataPtr<color_t>();
color_indexer = ArrayIndexer(color, 2);
// Float32: [0, 1] -> [0, 255]
if (color.GetDtype() == core::Float32) {
color_multiplier = 255.0;
}
}
// One workload item per voxel over all selected blocks.
index_t n = indices.GetLength() * resolution3;
core::ParallelFor(device, n, [=] OPEN3D_DEVICE(index_t workload_idx) {
// Natural index (0, N) -> (block_idx, voxel_idx)
index_t block_idx = indices_ptr[workload_idx / resolution3];
index_t voxel_idx = workload_idx % resolution3;
/// Coordinate transform
// block_idx -> (x_block, y_block, z_block)
index_t* block_key_ptr =
block_keys_indexer.GetDataPtr<index_t>(block_idx);
index_t xb = block_key_ptr[0];
index_t yb = block_key_ptr[1];
index_t zb = block_key_ptr[2];
// voxel_idx -> (x_voxel, y_voxel, z_voxel)
index_t xv, yv, zv;
voxel_indexer.WorkloadToCoord(voxel_idx, &xv, &yv, &zv);
// coordinate in world (in voxel)
index_t x = xb * resolution + xv;
index_t y = yb * resolution + yv;
index_t z = zb * resolution + zv;
// coordinate in camera (in voxel -> in meter)
float xc, yc, zc, u, v;
transform_indexer.RigidTransform(static_cast<float>(x),
static_cast<float>(y),
static_cast<float>(z), &xc, &yc, &zc);
// coordinate in image (in pixel)
transform_indexer.Project(xc, yc, zc, &u, &v);
// Skip voxels that project outside the depth image.
if (!depth_indexer.InBoundary(u, v)) {
return;
}
index_t ui = static_cast<index_t>(u);
index_t vi = static_cast<index_t>(v);
// Associate image workload and compute SDF and
// TSDF.
float depth =
*depth_indexer.GetDataPtr<input_depth_t>(ui, vi) / depth_scale;
// Signed distance along the optical axis; positive in front of surface.
float sdf = depth - zc;
// Reject invalid/far depth, voxels behind the camera, and voxels more
// than sdf_trunc behind the observed surface.
if (depth <= 0 || depth > depth_max || zc <= 0 || sdf < -sdf_trunc) {
return;
}
// Truncate, then normalize to [-1, 1].
sdf = sdf < sdf_trunc ? sdf : sdf_trunc;
sdf /= sdf_trunc;
index_t linear_idx = block_idx * resolution3 + voxel_idx;
tsdf_t* tsdf_ptr = tsdf_base_ptr + linear_idx;
weight_t* weight_ptr = weight_base_ptr + linear_idx;
// Running average: new = (w * old + observation) / (w + 1).
float inv_wsum = 1.0f / (*weight_ptr + 1);
float weight = *weight_ptr;
*tsdf_ptr = (weight * (*tsdf_ptr) + sdf) * inv_wsum;
if (integrate_color) {
color_t* color_ptr = color_base_ptr + 3 * linear_idx;
// Unproject ui, vi with depth_intrinsic, then project back with
// color_intrinsic
float x, y, z;
transform_indexer.Unproject(ui, vi, 1.0, &x, &y, &z);
float uf, vf;
colormap_indexer.Project(x, y, z, &uf, &vf);
if (color_indexer.InBoundary(uf, vf)) {
ui = round(uf);
vi = round(vf);
input_color_t* input_color_ptr =
color_indexer.GetDataPtr<input_color_t>(ui, vi);
// Same running average as the TSDF, per channel.
for (index_t i = 0; i < 3; ++i) {
color_ptr[i] = (weight * color_ptr[i] +
input_color_ptr[i] * color_multiplier) *
inv_wsum;
}
}
}
// Commit the incremented weight last, after tsdf/color updates.
*weight_ptr = weight + 1;
});
#if defined(__CUDACC__)
core::cuda::Synchronize();
#endif
}
// Rasterize the active block set into a per-pixel depth range map (min, max)
// at 1/down_factor resolution, used to bound later ray marching.
// Pass 0 projects each block's 8 corners into the image, covers the bounding
// rectangle with 16x16-pixel fragments, and appends them to fragment_buffer
// via an atomic counter. Pass 0.5 initializes the range map to the inverted
// extremes (depth_max, depth_min). Pass 1 scatters each fragment's
// [z_min, z_max] into the range map with atomic min/max (CUDA) or an OpenMP
// critical section (CPU). If the fragment buffer was too small the overflow
// fragments are dropped and the buffer is reallocated at the end, so a
// subsequent call can produce the full range map.
#if defined(__CUDACC__)
void EstimateRangeCUDA
#else
void EstimateRangeCPU
#endif
(const core::Tensor& block_keys,
core::Tensor& range_minmax_map,
const core::Tensor& intrinsics,
const core::Tensor& extrinsics,
int h,
int w,
int down_factor,
int64_t block_resolution,
float voxel_size,
float depth_min,
float depth_max,
core::Tensor& fragment_buffer) {
// TODO(wei): reserve it in a reusable buffer
// Every 2 channels: (min, max)
int h_down = h / down_factor;
int w_down = w / down_factor;
range_minmax_map = core::Tensor({h_down, w_down, 2}, core::Float32,
block_keys.GetDevice());
NDArrayIndexer range_map_indexer(range_minmax_map, 2);
// Every 6 channels: (v_min, u_min, v_max, u_max, z_min, z_max)
const int fragment_size = 16;
// Allocate the fragment buffer on first use (empty or null input).
if (fragment_buffer.GetDataPtr() == 0 ||
fragment_buffer.NumElements() == 0) {
// Rough heuristic; should tend to overallocate
const int reserve_frag_buffer_size =
h_down * w_down / (fragment_size * fragment_size) / voxel_size;
fragment_buffer = core::Tensor({reserve_frag_buffer_size, 6},
core::Float32, block_keys.GetDevice());
}
const int frag_buffer_size = fragment_buffer.NumElements() / 6;
NDArrayIndexer frag_buffer_indexer(fragment_buffer, 1);
NDArrayIndexer block_keys_indexer(block_keys, 1);
TransformIndexer w2c_transform_indexer(intrinsics, extrinsics);
// Fragment counter: a device tensor on CUDA, a std::atomic on CPU.
#if defined(__CUDACC__)
core::Tensor count(std::vector<int>{0}, {1}, core::Int32,
block_keys.GetDevice());
int* count_ptr = count.GetDataPtr<int>();
#else
std::atomic<int> count_atomic(0);
std::atomic<int>* count_ptr = &count_atomic;
#endif
#ifndef __CUDACC__
using std::max;
using std::min;
#endif
// Pass 0: iterate over blocks, fill-in an rendering fragment array
core::ParallelFor(
block_keys.GetDevice(), block_keys.GetLength(),
[=] OPEN3D_DEVICE(int64_t workload_idx) {
int* key = block_keys_indexer.GetDataPtr<int>(workload_idx);
// Start from the inverted extremes so min/max shrink-wrap the block.
int u_min = w_down - 1, v_min = h_down - 1, u_max = 0,
v_max = 0;
float z_min = depth_max, z_max = depth_min;
float xc, yc, zc, u, v;
// Project 8 corners to low-res image and form a rectangle
for (int i = 0; i < 8; ++i) {
// Bits 0/1/2 of i select the +1 corner along x/y/z.
float xw = (key[0] + ((i & 1) > 0)) * block_resolution *
voxel_size;
float yw = (key[1] + ((i & 2) > 0)) * block_resolution *
voxel_size;
float zw = (key[2] + ((i & 4) > 0)) * block_resolution *
voxel_size;
w2c_transform_indexer.RigidTransform(xw, yw, zw, &xc, &yc,
&zc);
// Ignore corners behind the camera.
if (zc <= 0) continue;
// Project to the down sampled image buffer
w2c_transform_indexer.Project(xc, yc, zc, &u, &v);
u /= down_factor;
v /= down_factor;
v_min = min(static_cast<int>(floorf(v)), v_min);
v_max = max(static_cast<int>(ceilf(v)), v_max);
u_min = min(static_cast<int>(floorf(u)), u_min);
u_max = max(static_cast<int>(ceilf(u)), u_max);
z_min = min(z_min, zc);
z_max = max(z_max, zc);
}
// Clamp the rectangle to the image; an empty/degenerate rectangle
// means the block is invisible at this resolution.
v_min = max(0, v_min);
v_max = min(h_down - 1, v_max);
u_min = max(0, u_min);
u_max = min(w_down - 1, u_max);
if (v_min >= v_max || u_min >= u_max || z_min >= z_max) return;
// Divide the rectangle into small 16x16 fragments
int frag_v_count =
ceilf(float(v_max - v_min + 1) / float(fragment_size));
int frag_u_count =
ceilf(float(u_max - u_min + 1) / float(fragment_size));
int frag_count = frag_v_count * frag_u_count;
// Reserve a contiguous slot range in the fragment buffer.
int frag_count_start = OPEN3D_ATOMIC_ADD(count_ptr, frag_count);
int frag_count_end = frag_count_start + frag_count;
// Buffer full: drop this block's fragments. The counter still
// records the need, and the buffer is reallocated below.
if (frag_count_end >= frag_buffer_size) {
return;
}
int offset = 0;
for (int frag_v = 0; frag_v < frag_v_count; ++frag_v) {
for (int frag_u = 0; frag_u < frag_u_count;
++frag_u, ++offset) {
float* frag_ptr = frag_buffer_indexer.GetDataPtr<float>(
frag_count_start + offset);
// zmin, zmax
frag_ptr[0] = z_min;
frag_ptr[1] = z_max;
// vmin, umin
frag_ptr[2] = v_min + frag_v * fragment_size;
frag_ptr[3] = u_min + frag_u * fragment_size;
// vmax, umax
frag_ptr[4] = min(frag_ptr[2] + fragment_size - 1,
static_cast<float>(v_max));
frag_ptr[5] = min(frag_ptr[3] + fragment_size - 1,
static_cast<float>(u_max));
}
}
});
// Read back how many fragments were actually required.
#if defined(__CUDACC__)
int needed_frag_count = count[0].Item<int>();
#else
int needed_frag_count = (*count_ptr).load();
#endif
int frag_count = needed_frag_count;
if (frag_count >= frag_buffer_size) {
utility::LogWarning(
"Could not generate full range map; allocated {} fragments but "
"needed {}",
frag_buffer_size, frag_count);
frag_count = frag_buffer_size - 1;
} else {
utility::LogDebug("EstimateRange Allocated {} fragments and needed {}",
frag_buffer_size, frag_count);
}
// Pass 0.5: Fill in range map to prepare for atomic min/max
core::ParallelFor(block_keys.GetDevice(), h_down * w_down,
[=] OPEN3D_DEVICE(int64_t workload_idx) {
int v = workload_idx / w_down;
int u = workload_idx % w_down;
float* range_ptr =
range_map_indexer.GetDataPtr<float>(u, v);
range_ptr[0] = depth_max;
range_ptr[1] = depth_min;
});
// Pass 1: iterate over rendering fragment array, fill-in range
core::ParallelFor(
block_keys.GetDevice(), frag_count * fragment_size * fragment_size,
[=] OPEN3D_DEVICE(int64_t workload_idx) {
// One workload item per pixel per fragment.
int frag_idx = workload_idx / (fragment_size * fragment_size);
int local_idx = workload_idx % (fragment_size * fragment_size);
int dv = local_idx / fragment_size;
int du = local_idx % fragment_size;
float* frag_ptr =
frag_buffer_indexer.GetDataPtr<float>(frag_idx);
int v_min = static_cast<int>(frag_ptr[2]);
int u_min = static_cast<int>(frag_ptr[3]);
int v_max = static_cast<int>(frag_ptr[4]);
int u_max = static_cast<int>(frag_ptr[5]);
int v = v_min + dv;
int u = u_min + du;
// Fragments at rectangle edges may not cover all 16x16 pixels.
if (v > v_max || u > u_max) return;
float z_min = frag_ptr[0];
float z_max = frag_ptr[1];
float* range_ptr = range_map_indexer.GetDataPtr<float>(u, v);
// Concurrent updates from overlapping fragments need atomic
// min/max (CUDA) or mutual exclusion (OpenMP).
#ifdef __CUDACC__
atomicMinf(&(range_ptr[0]), z_min);
atomicMaxf(&(range_ptr[1]), z_max);
#else
#pragma omp critical(EstimateRangeCPU)
{
range_ptr[0] = min(z_min, range_ptr[0]);
range_ptr[1] = max(z_max, range_ptr[1]);
}
#endif
});
#if defined(__CUDACC__)
core::cuda::Synchronize();
#endif
// Grow the buffer for the next call if this one overflowed.
if (needed_frag_count != frag_count) {
utility::LogInfo("Reallocating {} fragments for EstimateRange (was {})",
needed_frag_count, frag_count);
fragment_buffer = core::Tensor({needed_frag_count, 6}, core::Float32,
block_keys.GetDevice());
}
}
// One-entry cache of the most recent block-coordinate -> buffer-index
// hash map lookup. Kept an aggregate so callers can brace-initialize it.
struct MiniVecCache {
    index_t x;
    index_t y;
    index_t z;
    index_t block_idx;

    // Returns the cached buffer index when (xin, yin, zin) matches the
    // cached coordinate; otherwise -1 (caller falls back to the hash map).
    inline index_t OPEN3D_DEVICE Check(index_t xin, index_t yin, index_t zin) {
        if (xin != x || yin != y || zin != z) {
            return -1;
        }
        return block_idx;
    }

    // Replaces the cached coordinate/index pair.
    inline void OPEN3D_DEVICE Update(index_t xin,
                                     index_t yin,
                                     index_t zin,
                                     index_t block_idx_in) {
        x = xin;
        y = yin;
        z = zin;
        block_idx = block_idx_in;
    }
};
// Render the TSDF voxel grid by ray marching. For each pixel, a ray is
// marched from range[0] to range[1] (the precomputed depth range map),
// stepping by a TSDF-proportional amount inside blocks and by a full block
// size across empty space. On a +/- zero crossing of the TSDF with
// sufficient weight, the surface point is found by linear interpolation and
// the requested output maps (depth, vertex, normal, color, plus the
// differentiable-rendering maps: index, mask, interp_ratio and its x/y/z
// derivatives) are filled from the 8 surrounding voxels.
// NOTE(review): only supports STDGPU (CUDA) / TBB (CPU) hash backends; any
// other backend aborts via LogError.
template <typename tsdf_t, typename weight_t, typename color_t>
#if defined(__CUDACC__)
void RayCastCUDA
#else
void RayCastCPU
#endif
(std::shared_ptr<core::HashMap>& hashmap,
const TensorMap& block_value_map,
const core::Tensor& range,
TensorMap& renderings_map,
const core::Tensor& intrinsic,
const core::Tensor& extrinsics,
index_t h,
index_t w,
index_t block_resolution,
float voxel_size,
float depth_scale,
float depth_min,
float depth_max,
float weight_threshold,
float trunc_voxel_multiplier,
int range_map_down_factor) {
using Key = utility::MiniVec<index_t, 3>;
using Hash = utility::MiniVecHash<index_t, 3>;
using Eq = utility::MiniVecEq<index_t, 3>;
// Grab the raw backend implementation so device lambdas can query it.
auto device_hashmap = hashmap->GetDeviceHashBackend();
#if defined(__CUDACC__)
auto cuda_hashmap =
std::dynamic_pointer_cast<core::StdGPUHashBackend<Key, Hash, Eq>>(
device_hashmap);
if (cuda_hashmap == nullptr) {
utility::LogError(
"Unsupported backend: CUDA raycasting only supports STDGPU.");
}
auto hashmap_impl = cuda_hashmap->GetImpl();
#else
auto cpu_hashmap =
std::dynamic_pointer_cast<core::TBBHashBackend<Key, Hash, Eq>>(
device_hashmap);
if (cpu_hashmap == nullptr) {
utility::LogError(
"Unsupported backend: CPU raycasting only supports TBB.");
}
auto hashmap_impl = *cpu_hashmap->GetImpl();
#endif
core::Device device = hashmap->GetDevice();
ArrayIndexer range_indexer(range, 2);
// Geometry
ArrayIndexer depth_indexer;
ArrayIndexer vertex_indexer;
ArrayIndexer normal_indexer;
// Diff rendering
ArrayIndexer index_indexer;
ArrayIndexer mask_indexer;
ArrayIndexer interp_ratio_indexer;
ArrayIndexer interp_ratio_dx_indexer;
ArrayIndexer interp_ratio_dy_indexer;
ArrayIndexer interp_ratio_dz_indexer;
// Color
ArrayIndexer color_indexer;
if (!block_value_map.Contains("tsdf") ||
!block_value_map.Contains("weight")) {
utility::LogError(
"TSDF and/or weight not allocated in blocks, please implement "
"customized integration.");
}
const tsdf_t* tsdf_base_ptr =
block_value_map.at("tsdf").GetDataPtr<tsdf_t>();
const weight_t* weight_base_ptr =
block_value_map.at("weight").GetDataPtr<weight_t>();
// Each output map below is rendered only if present in renderings_map;
// default-constructed indexers (null data pointer) mark disabled outputs.
// Geometry
if (renderings_map.Contains("depth")) {
depth_indexer = ArrayIndexer(renderings_map.at("depth"), 2);
}
if (renderings_map.Contains("vertex")) {
vertex_indexer = ArrayIndexer(renderings_map.at("vertex"), 2);
}
if (renderings_map.Contains("normal")) {
normal_indexer = ArrayIndexer(renderings_map.at("normal"), 2);
}
// Diff rendering
if (renderings_map.Contains("index")) {
index_indexer = ArrayIndexer(renderings_map.at("index"), 2);
}
if (renderings_map.Contains("mask")) {
mask_indexer = ArrayIndexer(renderings_map.at("mask"), 2);
}
if (renderings_map.Contains("interp_ratio")) {
interp_ratio_indexer =
ArrayIndexer(renderings_map.at("interp_ratio"), 2);
}
if (renderings_map.Contains("interp_ratio_dx")) {
interp_ratio_dx_indexer =
ArrayIndexer(renderings_map.at("interp_ratio_dx"), 2);
}
if (renderings_map.Contains("interp_ratio_dy")) {
interp_ratio_dy_indexer =
ArrayIndexer(renderings_map.at("interp_ratio_dy"), 2);
}
if (renderings_map.Contains("interp_ratio_dz")) {
interp_ratio_dz_indexer =
ArrayIndexer(renderings_map.at("interp_ratio_dz"), 2);
}
// Color
bool render_color = false;
if (block_value_map.Contains("color") && renderings_map.Contains("color")) {
render_color = true;
color_indexer = ArrayIndexer(renderings_map.at("color"), 2);
}
const color_t* color_base_ptr =
render_color ? block_value_map.at("color").GetDataPtr<color_t>()
: nullptr;
// Only visit the 8-voxel neighborhood when some output needs it.
bool visit_neighbors = render_color || normal_indexer.GetDataPtr() ||
mask_indexer.GetDataPtr() ||
index_indexer.GetDataPtr() ||
interp_ratio_indexer.GetDataPtr() ||
interp_ratio_dx_indexer.GetDataPtr() ||
interp_ratio_dy_indexer.GetDataPtr() ||
interp_ratio_dz_indexer.GetDataPtr();
// camera -> world (ray generation) and world -> camera (output) transforms.
TransformIndexer c2w_transform_indexer(
intrinsic, t::geometry::InverseTransformation(extrinsics));
TransformIndexer w2c_transform_indexer(intrinsic, extrinsics);
index_t rows = h;
index_t cols = w;
index_t n = rows * cols;
float block_size = voxel_size * block_resolution;
index_t resolution2 = block_resolution * block_resolution;
index_t resolution3 = resolution2 * block_resolution;
#ifndef __CUDACC__
using std::max;
using std::sqrt;
#endif
core::ParallelFor(device, n, [=] OPEN3D_DEVICE(index_t workload_idx) {
// Flattened voxel index for a voxel offset (x_v, y_v, z_v) relative to
// block (x_b, y_b, z_b); resolves one-block-out offsets through the
// cache/hash map. Returns -1 if the neighbor block is missing.
auto GetLinearIdxAtP = [&] OPEN3D_DEVICE(
index_t x_b, index_t y_b, index_t z_b,
index_t x_v, index_t y_v, index_t z_v,
core::buf_index_t block_buf_idx,
MiniVecCache & cache) -> index_t {
index_t x_vn = (x_v + block_resolution) % block_resolution;
index_t y_vn = (y_v + block_resolution) % block_resolution;
index_t z_vn = (z_v + block_resolution) % block_resolution;
index_t dx_b = Sign(x_v - x_vn);
index_t dy_b = Sign(y_v - y_vn);
index_t dz_b = Sign(z_v - z_vn);
if (dx_b == 0 && dy_b == 0 && dz_b == 0) {
// Offset stays inside the current block.
return block_buf_idx * resolution3 + z_v * resolution2 +
y_v * block_resolution + x_v;
} else {
// Offset crosses into a neighbor block: cache, then hash map.
Key key(x_b + dx_b, y_b + dy_b, z_b + dz_b);
index_t block_buf_idx = cache.Check(key[0], key[1], key[2]);
if (block_buf_idx < 0) {
auto iter = hashmap_impl.find(key);
if (iter == hashmap_impl.end()) return -1;
block_buf_idx = iter->second;
cache.Update(key[0], key[1], key[2], block_buf_idx);
}
return block_buf_idx * resolution3 + z_vn * resolution2 +
y_vn * block_resolution + x_vn;
}
};
// Flattened voxel index at parameter t along the ray (origin o,
// direction d); -1 if the containing block is not allocated.
auto GetLinearIdxAtT = [&] OPEN3D_DEVICE(
float x_o, float y_o, float z_o,
float x_d, float y_d, float z_d, float t,
MiniVecCache& cache) -> index_t {
float x_g = x_o + t * x_d;
float y_g = y_o + t * y_d;
float z_g = z_o + t * z_d;
// MiniVec coordinate and look up
index_t x_b = static_cast<index_t>(floorf(x_g / block_size));
index_t y_b = static_cast<index_t>(floorf(y_g / block_size));
index_t z_b = static_cast<index_t>(floorf(z_g / block_size));
Key key(x_b, y_b, z_b);
index_t block_buf_idx = cache.Check(x_b, y_b, z_b);
if (block_buf_idx < 0) {
auto iter = hashmap_impl.find(key);
if (iter == hashmap_impl.end()) return -1;
block_buf_idx = iter->second;
cache.Update(x_b, y_b, z_b, block_buf_idx);
}
// Voxel coordinate and look up
index_t x_v = index_t((x_g - x_b * block_size) / voxel_size);
index_t y_v = index_t((y_g - y_b * block_size) / voxel_size);
index_t z_v = index_t((z_g - z_b * block_size) / voxel_size);
return block_buf_idx * resolution3 + z_v * resolution2 +
y_v * block_resolution + x_v;
};
index_t y = workload_idx / cols;
index_t x = workload_idx % cols;
// Depth search interval for this pixel from the low-res range map.
const float* range = range_indexer.GetDataPtr<float>(
x / range_map_down_factor, y / range_map_down_factor);
float* depth_ptr = nullptr;
float* vertex_ptr = nullptr;
float* color_ptr = nullptr;
float* normal_ptr = nullptr;
int64_t* index_ptr = nullptr;
bool* mask_ptr = nullptr;
float* interp_ratio_ptr = nullptr;
float* interp_ratio_dx_ptr = nullptr;
float* interp_ratio_dy_ptr = nullptr;
float* interp_ratio_dz_ptr = nullptr;
// Zero-initialize every enabled output so missed rays render empty.
if (vertex_indexer.GetDataPtr()) {
vertex_ptr = vertex_indexer.GetDataPtr<float>(x, y);
vertex_ptr[0] = 0;
vertex_ptr[1] = 0;
vertex_ptr[2] = 0;
}
if (depth_indexer.GetDataPtr()) {
depth_ptr = depth_indexer.GetDataPtr<float>(x, y);
depth_ptr[0] = 0;
}
if (normal_indexer.GetDataPtr()) {
normal_ptr = normal_indexer.GetDataPtr<float>(x, y);
normal_ptr[0] = 0;
normal_ptr[1] = 0;
normal_ptr[2] = 0;
}
if (mask_indexer.GetDataPtr()) {
mask_ptr = mask_indexer.GetDataPtr<bool>(x, y);
#ifdef __CUDACC__
#pragma unroll
#endif
for (int i = 0; i < 8; ++i) {
mask_ptr[i] = false;
}
}
if (index_indexer.GetDataPtr()) {
index_ptr = index_indexer.GetDataPtr<int64_t>(x, y);
#ifdef __CUDACC__
#pragma unroll
#endif
for (int i = 0; i < 8; ++i) {
index_ptr[i] = 0;
}
}
if (interp_ratio_indexer.GetDataPtr()) {
interp_ratio_ptr = interp_ratio_indexer.GetDataPtr<float>(x, y);
#ifdef __CUDACC__
#pragma unroll
#endif
for (int i = 0; i < 8; ++i) {
interp_ratio_ptr[i] = 0;
}
}
if (interp_ratio_dx_indexer.GetDataPtr()) {
interp_ratio_dx_ptr =
interp_ratio_dx_indexer.GetDataPtr<float>(x, y);
#ifdef __CUDACC__
#pragma unroll
#endif
for (int i = 0; i < 8; ++i) {
interp_ratio_dx_ptr[i] = 0;
}
}
if (interp_ratio_dy_indexer.GetDataPtr()) {
interp_ratio_dy_ptr =
interp_ratio_dy_indexer.GetDataPtr<float>(x, y);
#ifdef __CUDACC__
#pragma unroll
#endif
for (int i = 0; i < 8; ++i) {
interp_ratio_dy_ptr[i] = 0;
}
}
if (interp_ratio_dz_indexer.GetDataPtr()) {
interp_ratio_dz_ptr =
interp_ratio_dz_indexer.GetDataPtr<float>(x, y);
#ifdef __CUDACC__
#pragma unroll
#endif
for (int i = 0; i < 8; ++i) {
interp_ratio_dz_ptr[i] = 0;
}
}
if (color_indexer.GetDataPtr()) {
color_ptr = color_indexer.GetDataPtr<float>(x, y);
color_ptr[0] = 0;
color_ptr[1] = 0;
color_ptr[2] = 0;
}
float t = range[0];
const float t_max = range[1];
// Empty range: no blocks along this pixel's ray.
if (t >= t_max) return;
// Coordinates in camera and global
float x_c = 0, y_c = 0, z_c = 0;
float x_g = 0, y_g = 0, z_g = 0;
float x_o = 0, y_o = 0, z_o = 0;
// Iterative ray intersection check
float t_prev = t;
float tsdf_prev = -1.0f;
float tsdf = 1.0;
float sdf_trunc = voxel_size * trunc_voxel_multiplier;
float w = 0.0;
// Camera origin
c2w_transform_indexer.RigidTransform(0, 0, 0, &x_o, &y_o, &z_o);
// Direction
c2w_transform_indexer.Unproject(static_cast<float>(x),
static_cast<float>(y), 1.0f, &x_c, &y_c,
&z_c);
c2w_transform_indexer.RigidTransform(x_c, y_c, z_c, &x_g, &y_g, &z_g);
float x_d = (x_g - x_o);
float y_d = (y_g - y_o);
float z_d = (z_g - z_o);
MiniVecCache cache{0, 0, 0, -1};
bool surface_found = false;
// March until a +/- TSDF zero crossing or the end of the range.
while (t < t_max) {
index_t linear_idx =
GetLinearIdxAtT(x_o, y_o, z_o, x_d, y_d, z_d, t, cache);
if (linear_idx < 0) {
// Unallocated space: skip a whole block.
t_prev = t;
t += block_size;
} else {
tsdf_prev = tsdf;
tsdf = tsdf_base_ptr[linear_idx];
w = weight_base_ptr[linear_idx];
if (tsdf_prev > 0 && w >= weight_threshold && tsdf <= 0) {
surface_found = true;
break;
}
t_prev = t;
// Step proportional to the TSDF, at least one voxel.
float delta = tsdf * sdf_trunc;
t += delta < voxel_size ? voxel_size : delta;
}
}
if (surface_found) {
// Linear interpolation of t at the zero crossing.
float t_intersect =
(t * tsdf_prev - t_prev * tsdf) / (tsdf_prev - tsdf);
x_g = x_o + t_intersect * x_d;
y_g = y_o + t_intersect * y_d;
z_g = z_o + t_intersect * z_d;
// Trivial vertex assignment
if (depth_ptr) {
*depth_ptr = t_intersect * depth_scale;
}
if (vertex_ptr) {
w2c_transform_indexer.RigidTransform(
x_g, y_g, z_g, vertex_ptr + 0, vertex_ptr + 1,
vertex_ptr + 2);
}
if (!visit_neighbors) return;
// Trilinear interpolation
// TODO(wei): simplify the flow by splitting the
// functions given what is enabled
index_t x_b = static_cast<index_t>(floorf(x_g / block_size));
index_t y_b = static_cast<index_t>(floorf(y_g / block_size));
index_t z_b = static_cast<index_t>(floorf(z_g / block_size));
float x_v = (x_g - float(x_b) * block_size) / voxel_size;
float y_v = (y_g - float(y_b) * block_size) / voxel_size;
float z_v = (z_g - float(z_b) * block_size) / voxel_size;
Key key(x_b, y_b, z_b);
index_t block_buf_idx = cache.Check(x_b, y_b, z_b);
if (block_buf_idx < 0) {
auto iter = hashmap_impl.find(key);
if (iter == hashmap_impl.end()) return;
block_buf_idx = iter->second;
cache.Update(x_b, y_b, z_b, block_buf_idx);
}
index_t x_v_floor = static_cast<index_t>(floorf(x_v));
index_t y_v_floor = static_cast<index_t>(floorf(y_v));
index_t z_v_floor = static_cast<index_t>(floorf(z_v));
float ratio_x = x_v - float(x_v_floor);
float ratio_y = y_v - float(y_v_floor);
float ratio_z = z_v - float(z_v_floor);
float sum_r = 0.0;
// Visit the 8 corners of the surrounding voxel cell; bits 0/1/2
// of k select the +1 corner along x/y/z.
for (index_t k = 0; k < 8; ++k) {
index_t dx_v = (k & 1) > 0 ? 1 : 0;
index_t dy_v = (k & 2) > 0 ? 1 : 0;
index_t dz_v = (k & 4) > 0 ? 1 : 0;
index_t linear_idx_k = GetLinearIdxAtP(
x_b, y_b, z_b, x_v_floor + dx_v, y_v_floor + dy_v,
z_v_floor + dz_v, block_buf_idx, cache);
if (linear_idx_k >= 0 && weight_base_ptr[linear_idx_k] > 0) {
// Trilinear weight of this corner.
float rx = dx_v * (ratio_x) + (1 - dx_v) * (1 - ratio_x);
float ry = dy_v * (ratio_y) + (1 - dy_v) * (1 - ratio_y);
float rz = dz_v * (ratio_z) + (1 - dz_v) * (1 - ratio_z);
float r = rx * ry * rz;
if (interp_ratio_ptr) {
interp_ratio_ptr[k] = r;
}
if (mask_ptr) {
mask_ptr[k] = true;
}
if (index_ptr) {
index_ptr[k] = linear_idx_k;
}
float tsdf_k = tsdf_base_ptr[linear_idx_k];
// Partial derivatives of r w.r.t. the corner offsets;
// (2 * d - 1) maps {0, 1} to {-1, +1}.
float interp_ratio_dx = ry * rz * (2 * dx_v - 1);
float interp_ratio_dy = rx * rz * (2 * dy_v - 1);
float interp_ratio_dz = rx * ry * (2 * dz_v - 1);
if (interp_ratio_dx_ptr) {
interp_ratio_dx_ptr[k] = interp_ratio_dx;
}
if (interp_ratio_dy_ptr) {
interp_ratio_dy_ptr[k] = interp_ratio_dy;
}
if (interp_ratio_dz_ptr) {
interp_ratio_dz_ptr[k] = interp_ratio_dz;
}
if (normal_ptr) {
// Normal = gradient of the interpolated TSDF.
normal_ptr[0] += interp_ratio_dx * tsdf_k;
normal_ptr[1] += interp_ratio_dy * tsdf_k;
normal_ptr[2] += interp_ratio_dz * tsdf_k;
}
if (color_ptr) {
index_t color_linear_idx = linear_idx_k * 3;
color_ptr[0] +=
r * color_base_ptr[color_linear_idx + 0];
color_ptr[1] +=
r * color_base_ptr[color_linear_idx + 1];
color_ptr[2] +=
r * color_base_ptr[color_linear_idx + 2];
}
sum_r += r;
}
} // loop over 8 neighbors
if (sum_r > 0) {
// Normalize color by total weight (x255 to map to [0, 1]).
sum_r *= 255.0;
if (color_ptr) {
color_ptr[0] /= sum_r;
color_ptr[1] /= sum_r;
color_ptr[2] /= sum_r;
}
if (normal_ptr) {
// Normalize and flip the gradient (outward normal faces the
// camera), then rotate into the camera frame.
constexpr float EPSILON = 1e-5f;
float norm = sqrt(normal_ptr[0] * normal_ptr[0] +
normal_ptr[1] * normal_ptr[1] +
normal_ptr[2] * normal_ptr[2]);
norm = std::max(norm, EPSILON);
w2c_transform_indexer.Rotate(
-normal_ptr[0] / norm, -normal_ptr[1] / norm,
-normal_ptr[2] / norm, normal_ptr + 0,
normal_ptr + 1, normal_ptr + 2);
}
}
} // surface-found
});
#if defined(__CUDACC__)
core::cuda::Synchronize();
#endif
}
template <typename tsdf_t, typename weight_t, typename color_t>
#if defined(__CUDACC__)
void ExtractPointCloudCUDA
#else
void ExtractPointCloudCPU
#endif
(const core::Tensor& indices,
const core::Tensor& nb_indices,
const core::Tensor& nb_masks,
const core::Tensor& block_keys,
const TensorMap& block_value_map,
core::Tensor& points,
core::Tensor& normals,
core::Tensor& colors,
index_t resolution,
float voxel_size,
float weight_threshold,
int& valid_size) {
core::Device device = block_keys.GetDevice();
// Parameters
index_t resolution2 = resolution * resolution;
index_t resolution3 = resolution2 * resolution;
// Shape / transform indexers, no data involved
ArrayIndexer voxel_indexer({resolution, resolution, resolution});
// Real data indexer
ArrayIndexer block_keys_indexer(block_keys, 1);
ArrayIndexer nb_block_masks_indexer(nb_masks, 2);
ArrayIndexer nb_block_indices_indexer(nb_indices, 2);
// Plain arrays that does not require indexers
const index_t* indices_ptr = indices.GetDataPtr<index_t>();
if (!block_value_map.Contains("tsdf") ||
!block_value_map.Contains("weight")) {
utility::LogError(
"TSDF and/or weight not allocated in blocks, please implement "
"customized integration.");
}
const tsdf_t* tsdf_base_ptr =
block_value_map.at("tsdf").GetDataPtr<tsdf_t>();
const weight_t* weight_base_ptr =
block_value_map.at("weight").GetDataPtr<weight_t>();
const color_t* color_base_ptr = nullptr;
if (block_value_map.Contains("color")) {
color_base_ptr = block_value_map.at("color").GetDataPtr<color_t>();
}
index_t n_blocks = indices.GetLength();
index_t n = n_blocks * resolution3;
// Output
#if defined(__CUDACC__)
core::Tensor count(std::vector<index_t>{0}, {1}, core::Int32,
block_keys.GetDevice());
index_t* count_ptr = count.GetDataPtr<index_t>();
#else
std::atomic<index_t> count_atomic(0);
std::atomic<index_t>* count_ptr = &count_atomic;
#endif
if (valid_size < 0) {
utility::LogDebug(
"No estimated max point cloud size provided, using a 2-pass "
"estimation. Surface extraction could be slow.");
// This pass determines valid number of points.
core::ParallelFor(device, n, [=] OPEN3D_DEVICE(index_t workload_idx) {
auto GetLinearIdx = [&] OPEN3D_DEVICE(
index_t xo, index_t yo, index_t zo,
index_t curr_block_idx) -> index_t {
return DeviceGetLinearIdx(xo, yo, zo, curr_block_idx,
resolution, nb_block_masks_indexer,
nb_block_indices_indexer);
};
// Natural index (0, N) -> (block_idx,
// voxel_idx)
index_t workload_block_idx = workload_idx / resolution3;
index_t block_idx = indices_ptr[workload_block_idx];
index_t voxel_idx = workload_idx % resolution3;
// voxel_idx -> (x_voxel, y_voxel, z_voxel)
index_t xv, yv, zv;
voxel_indexer.WorkloadToCoord(voxel_idx, &xv, &yv, &zv);
index_t linear_idx = block_idx * resolution3 + voxel_idx;
float tsdf_o = tsdf_base_ptr[linear_idx];
float weight_o = weight_base_ptr[linear_idx];
if (weight_o <= weight_threshold) return;
// Enumerate x-y-z directions
for (index_t i = 0; i < 3; ++i) {
index_t linear_idx_i =
GetLinearIdx(xv + (i == 0), yv + (i == 1),
zv + (i == 2), workload_block_idx);
if (linear_idx_i < 0) continue;
float tsdf_i = tsdf_base_ptr[linear_idx_i];
float weight_i = weight_base_ptr[linear_idx_i];
if (weight_i > weight_threshold && tsdf_i * tsdf_o < 0) {
OPEN3D_ATOMIC_ADD(count_ptr, 1);
}
}
});
#if defined(__CUDACC__)
valid_size = count[0].Item<index_t>();
count[0] = 0;
#else
valid_size = (*count_ptr).load();
(*count_ptr) = 0;
#endif
}
if (points.GetLength() == 0) {
points = core::Tensor({valid_size, 3}, core::Float32, device);
}
ArrayIndexer point_indexer(points, 1);
// Normals
ArrayIndexer normal_indexer;
normals = core::Tensor({valid_size, 3}, core::Float32, device);
normal_indexer = ArrayIndexer(normals, 1);
// This pass extracts exact surface points.
// Colors
ArrayIndexer color_indexer;
if (color_base_ptr) {
colors = core::Tensor({valid_size, 3}, core::Float32, device);
color_indexer = ArrayIndexer(colors, 1);
}
core::ParallelFor(device, n, [=] OPEN3D_DEVICE(index_t workload_idx) {
auto GetLinearIdx = [&] OPEN3D_DEVICE(
index_t xo, index_t yo, index_t zo,
index_t curr_block_idx) -> index_t {
return DeviceGetLinearIdx(xo, yo, zo, curr_block_idx, resolution,
nb_block_masks_indexer,
nb_block_indices_indexer);
};
auto GetNormal = [&] OPEN3D_DEVICE(index_t xo, index_t yo, index_t zo,
index_t curr_block_idx, float* n) {
return DeviceGetNormal<tsdf_t>(
tsdf_base_ptr, xo, yo, zo, curr_block_idx, n, resolution,
nb_block_masks_indexer, nb_block_indices_indexer);
};
// Natural index (0, N) -> (block_idx, voxel_idx)
index_t workload_block_idx = workload_idx / resolution3;
index_t block_idx = indices_ptr[workload_block_idx];
index_t voxel_idx = workload_idx % resolution3;
/// Coordinate transform
// block_idx -> (x_block, y_block, z_block)
index_t* block_key_ptr =
block_keys_indexer.GetDataPtr<index_t>(block_idx);
index_t xb = block_key_ptr[0];
index_t yb = block_key_ptr[1];
index_t zb = block_key_ptr[2];
// voxel_idx -> (x_voxel, y_voxel, z_voxel)
index_t xv, yv, zv;
voxel_indexer.WorkloadToCoord(voxel_idx, &xv, &yv, &zv);
index_t linear_idx = block_idx * resolution3 + voxel_idx;
float tsdf_o = tsdf_base_ptr[linear_idx];
float weight_o = weight_base_ptr[linear_idx];
if (weight_o <= weight_threshold) return;
float no[3] = {0}, ne[3] = {0};
// Get normal at origin
GetNormal(xv, yv, zv, workload_block_idx, no);
index_t x = xb * resolution + xv;
index_t y = yb * resolution + yv;
index_t z = zb * resolution + zv;
// Enumerate x-y-z axis
for (index_t i = 0; i < 3; ++i) {
index_t linear_idx_i =
GetLinearIdx(xv + (i == 0), yv + (i == 1), zv + (i == 2),
workload_block_idx);
if (linear_idx_i < 0) continue;
float tsdf_i = tsdf_base_ptr[linear_idx_i];
float weight_i = weight_base_ptr[linear_idx_i];
if (weight_i > weight_threshold && tsdf_i * tsdf_o < 0) {
float ratio = (0 - tsdf_o) / (tsdf_i - tsdf_o);
index_t idx = OPEN3D_ATOMIC_ADD(count_ptr, 1);
if (idx >= valid_size) {
printf("Point cloud size larger than "
"estimated, please increase the "
"estimation!\n");
return;
}
float* point_ptr = point_indexer.GetDataPtr<float>(idx);
point_ptr[0] = voxel_size * (x + ratio * int(i == 0));
point_ptr[1] = voxel_size * (y + ratio * int(i == 1));
point_ptr[2] = voxel_size * (z + ratio * int(i == 2));
// Get normal at edge and interpolate
float* normal_ptr = normal_indexer.GetDataPtr<float>(idx);
GetNormal(xv + (i == 0), yv + (i == 1), zv + (i == 2),
workload_block_idx, ne);
float nx = (1 - ratio) * no[0] + ratio * ne[0];
float ny = (1 - ratio) * no[1] + ratio * ne[1];
float nz = (1 - ratio) * no[2] + ratio * ne[2];
float norm = static_cast<float>(
sqrt(nx * nx + ny * ny + nz * nz) + 1e-5);
normal_ptr[0] = nx / norm;
normal_ptr[1] = ny / norm;
normal_ptr[2] = nz / norm;
if (color_base_ptr) {
float* color_ptr = color_indexer.GetDataPtr<float>(idx);
const color_t* color_o_ptr =
color_base_ptr + 3 * linear_idx;
float r_o = color_o_ptr[0];
float g_o = color_o_ptr[1];
float b_o = color_o_ptr[2];
const color_t* color_i_ptr =
color_base_ptr + 3 * linear_idx_i;
float r_i = color_i_ptr[0];
float g_i = color_i_ptr[1];
float b_i = color_i_ptr[2];
color_ptr[0] = ((1 - ratio) * r_o + ratio * r_i) / 255.0f;
color_ptr[1] = ((1 - ratio) * g_o + ratio * g_i) / 255.0f;
color_ptr[2] = ((1 - ratio) * b_o + ratio * b_i) / 255.0f;
}
}
}
});
#if defined(__CUDACC__)
index_t total_count = count.Item<index_t>();
#else
index_t total_count = (*count_ptr).load();
#endif
utility::LogDebug("{} vertices extracted", total_count);
valid_size = total_count;
#if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
core::cuda::Synchronize();
#endif
}
template <typename tsdf_t, typename weight_t, typename color_t>
#if defined(__CUDACC__)
void ExtractTriangleMeshCUDA
#else
void ExtractTriangleMeshCPU
#endif
        (const core::Tensor& block_indices,
         const core::Tensor& inv_block_indices,
         const core::Tensor& nb_block_indices,
         const core::Tensor& nb_block_masks,
         const core::Tensor& block_keys,
         const TensorMap& block_value_map,
         core::Tensor& vertices,
         core::Tensor& triangles,
         core::Tensor& vertex_normals,
         core::Tensor& vertex_colors,
         index_t block_resolution,
         float voxel_size,
         float weight_threshold,
         index_t& vertex_count) {
    // Marching-Cubes mesh extraction over the active voxel blocks.
    // Runs in 4 passes over all (block, voxel) pairs:
    //   Pass 0: classify each cube (table index) and mark edges that will
    //           host a vertex.
    //   Pass 1: count vertices (skipped if vertex_count is preset >= 0).
    //   Pass 2: compute vertex positions / normals / colors by linear
    //           interpolation along zero-crossing edges.
    //   Pass 3: assemble triangles from the per-edge vertex indices.
    // On exit, vertex_count holds the number of extracted vertices and
    // `triangles` is sliced to the actual triangle count.
    core::Device device = block_indices.GetDevice();

    index_t resolution = block_resolution;
    index_t resolution3 = resolution * resolution * resolution;

    // Shape / transform indexers, no data involved
    ArrayIndexer voxel_indexer({resolution, resolution, resolution});
    index_t n_blocks = static_cast<index_t>(block_indices.GetLength());

    // TODO(wei): profile performance by replacing the table to a hashmap.
    // Voxel-wise mesh info. 4 channels correspond to:
    // 3 edges' corresponding vertex index + 1 table index.
    core::Tensor mesh_structure;
    try {
        mesh_structure = core::Tensor::Zeros(
                {n_blocks, resolution, resolution, resolution, 4}, core::Int32,
                device);
    } catch (const std::runtime_error&) {
        utility::LogError(
                "Unable to allocate assistance mesh structure for Marching "
                "Cubes with {} active voxel blocks. Please consider using a "
                "larger voxel size (currently {}) for TSDF integration, or "
                "using tsdf_volume.cpu() to perform mesh extraction on CPU.",
                n_blocks, voxel_size);
    }

    // Real data indexer
    ArrayIndexer mesh_structure_indexer(mesh_structure, 4);
    ArrayIndexer nb_block_masks_indexer(nb_block_masks, 2);
    ArrayIndexer nb_block_indices_indexer(nb_block_indices, 2);

    // Plain arrays that does not require indexers
    const index_t* indices_ptr = block_indices.GetDataPtr<index_t>();
    const index_t* inv_indices_ptr = inv_block_indices.GetDataPtr<index_t>();

    if (!block_value_map.Contains("tsdf") ||
        !block_value_map.Contains("weight")) {
        utility::LogError(
                "TSDF and/or weight not allocated in blocks, please implement "
                "customized integration.");
    }
    const tsdf_t* tsdf_base_ptr =
            block_value_map.at("tsdf").GetDataPtr<tsdf_t>();
    const weight_t* weight_base_ptr =
            block_value_map.at("weight").GetDataPtr<weight_t>();

    // Color is optional; a null base pointer disables color interpolation.
    const color_t* color_base_ptr = nullptr;
    if (block_value_map.Contains("color")) {
        color_base_ptr = block_value_map.at("color").GetDataPtr<color_t>();
    }

    index_t n = n_blocks * resolution3;

    // Pass 0: analyze mesh structure, set up one-on-one correspondences
    // from edges to vertices.
    core::ParallelFor(device, n, [=] OPEN3D_DEVICE(index_t widx) {
        // Resolve a voxel (possibly in a neighbor block) to a flat index
        // into the value arrays; negative return means not allocated.
        auto GetLinearIdx = [&] OPEN3D_DEVICE(
                                    index_t xo, index_t yo, index_t zo,
                                    index_t curr_block_idx) -> index_t {
            return DeviceGetLinearIdx(xo, yo, zo, curr_block_idx,
                                      static_cast<index_t>(resolution),
                                      nb_block_masks_indexer,
                                      nb_block_indices_indexer);
        };

        // Natural index (0, N) -> (block_idx, voxel_idx)
        index_t workload_block_idx = widx / resolution3;
        index_t voxel_idx = widx % resolution3;

        // voxel_idx -> (x_voxel, y_voxel, z_voxel)
        index_t xv, yv, zv;
        voxel_indexer.WorkloadToCoord(voxel_idx, &xv, &yv, &zv);

        // Check per-vertex sign in the cube to determine cube
        // type
        index_t table_idx = 0;
        for (index_t i = 0; i < 8; ++i) {
            index_t linear_idx_i =
                    GetLinearIdx(xv + vtx_shifts[i][0], yv + vtx_shifts[i][1],
                                 zv + vtx_shifts[i][2], workload_block_idx);
            // Any missing or under-observed corner invalidates the cube.
            if (linear_idx_i < 0) return;

            float tsdf_i = tsdf_base_ptr[linear_idx_i];
            float weight_i = weight_base_ptr[linear_idx_i];
            if (weight_i <= weight_threshold) return;

            table_idx |= ((tsdf_i < 0) ? (1 << i) : 0);
        }

        index_t* mesh_struct_ptr = mesh_structure_indexer.GetDataPtr<index_t>(
                xv, yv, zv, workload_block_idx);
        mesh_struct_ptr[3] = table_idx;

        // Fully inside or fully outside: no surface in this cube.
        if (table_idx == 0 || table_idx == 255) return;

        // Check per-edge sign determine the cube type
        index_t edges_with_vertices = edge_table[table_idx];
        for (index_t i = 0; i < 12; ++i) {
            if (edges_with_vertices & (1 << i)) {
                index_t xv_i = xv + edge_shifts[i][0];
                index_t yv_i = yv + edge_shifts[i][1];
                index_t zv_i = zv + edge_shifts[i][2];
                index_t edge_i = edge_shifts[i][3];

                // The owning voxel of this edge may live in a neighbor
                // block; map (dxb, dyb, dzb) into the 3x3x3 neighbor table.
                index_t dxb = xv_i / resolution;
                index_t dyb = yv_i / resolution;
                index_t dzb = zv_i / resolution;

                index_t nb_idx = (dxb + 1) + (dyb + 1) * 3 + (dzb + 1) * 9;

                index_t block_idx_i =
                        *nb_block_indices_indexer.GetDataPtr<index_t>(
                                workload_block_idx, nb_idx);
                index_t* mesh_ptr_i =
                        mesh_structure_indexer.GetDataPtr<index_t>(
                                xv_i - dxb * resolution,
                                yv_i - dyb * resolution,
                                zv_i - dzb * resolution,
                                inv_indices_ptr[block_idx_i]);

                // Non-atomic write, but we are safe: every writer stores
                // the same sentinel (-1 = "vertex needed on this edge").
                mesh_ptr_i[edge_i] = -1;
            }
        }
    });

    // Pass 1: determine valid number of vertices (if not preset)
#if defined(__CUDACC__)
    core::Tensor count(std::vector<index_t>{0}, {}, core::Int32, device);
    index_t* count_ptr = count.GetDataPtr<index_t>();
#else
    std::atomic<index_t> count_atomic(0);
    std::atomic<index_t>* count_ptr = &count_atomic;
#endif

    if (vertex_count < 0) {
        core::ParallelFor(device, n, [=] OPEN3D_DEVICE(index_t widx) {
            // Natural index (0, N) -> (block_idx, voxel_idx)
            index_t workload_block_idx = widx / resolution3;
            index_t voxel_idx = widx % resolution3;

            // voxel_idx -> (x_voxel, y_voxel, z_voxel)
            index_t xv, yv, zv;
            voxel_indexer.WorkloadToCoord(voxel_idx, &xv, &yv, &zv);

            // Obtain voxel's mesh struct ptr
            index_t* mesh_struct_ptr =
                    mesh_structure_indexer.GetDataPtr<index_t>(
                            xv, yv, zv, workload_block_idx);

            // Early quit -- no allocated vertex to compute
            if (mesh_struct_ptr[0] != -1 && mesh_struct_ptr[1] != -1 &&
                mesh_struct_ptr[2] != -1) {
                return;
            }

            // Enumerate 3 edges in the voxel
            for (index_t e = 0; e < 3; ++e) {
                index_t vertex_idx = mesh_struct_ptr[e];
                if (vertex_idx != -1) continue;

                OPEN3D_ATOMIC_ADD(count_ptr, 1);
            }
        });

#if defined(__CUDACC__)
        vertex_count = count.Item<index_t>();
#else
        vertex_count = (*count_ptr).load();
#endif
    }

    utility::LogDebug("Total vertex count = {}", vertex_count);
    vertices = core::Tensor({vertex_count, 3}, core::Float32, device);

    vertex_normals = core::Tensor({vertex_count, 3}, core::Float32, device);
    ArrayIndexer normal_indexer = ArrayIndexer(vertex_normals, 1);

    ArrayIndexer color_indexer;
    if (color_base_ptr) {
        vertex_colors = core::Tensor({vertex_count, 3}, core::Float32, device);
        color_indexer = ArrayIndexer(vertex_colors, 1);
    }

    ArrayIndexer block_keys_indexer(block_keys, 1);
    ArrayIndexer vertex_indexer(vertices, 1);

    // Reset the shared counter before reusing it to assign vertex slots.
#if defined(__CUDACC__)
    count = core::Tensor(std::vector<index_t>{0}, {}, core::Int32, device);
    count_ptr = count.GetDataPtr<index_t>();
#else
    (*count_ptr) = 0;
#endif

    // Pass 2: extract vertices.
    core::ParallelFor(device, n, [=] OPEN3D_DEVICE(index_t widx) {
        auto GetLinearIdx = [&] OPEN3D_DEVICE(
                                    index_t xo, index_t yo, index_t zo,
                                    index_t curr_block_idx) -> index_t {
            return DeviceGetLinearIdx(xo, yo, zo, curr_block_idx, resolution,
                                      nb_block_masks_indexer,
                                      nb_block_indices_indexer);
        };

        // Central-difference TSDF gradient, used as the surface normal.
        auto GetNormal = [&] OPEN3D_DEVICE(index_t xo, index_t yo, index_t zo,
                                           index_t curr_block_idx, float* n) {
            return DeviceGetNormal<tsdf_t>(
                    tsdf_base_ptr, xo, yo, zo, curr_block_idx, n, resolution,
                    nb_block_masks_indexer, nb_block_indices_indexer);
        };

        // Natural index (0, N) -> (block_idx, voxel_idx)
        index_t workload_block_idx = widx / resolution3;
        index_t block_idx = indices_ptr[workload_block_idx];
        index_t voxel_idx = widx % resolution3;

        // block_idx -> (x_block, y_block, z_block)
        index_t* block_key_ptr =
                block_keys_indexer.GetDataPtr<index_t>(block_idx);
        index_t xb = block_key_ptr[0];
        index_t yb = block_key_ptr[1];
        index_t zb = block_key_ptr[2];

        // voxel_idx -> (x_voxel, y_voxel, z_voxel)
        index_t xv, yv, zv;
        voxel_indexer.WorkloadToCoord(voxel_idx, &xv, &yv, &zv);

        // global coordinate (in voxels)
        index_t x = xb * resolution + xv;
        index_t y = yb * resolution + yv;
        index_t z = zb * resolution + zv;

        // Obtain voxel's mesh struct ptr
        index_t* mesh_struct_ptr = mesh_structure_indexer.GetDataPtr<index_t>(
                xv, yv, zv, workload_block_idx);

        // Early quit -- no allocated vertex to compute
        if (mesh_struct_ptr[0] != -1 && mesh_struct_ptr[1] != -1 &&
            mesh_struct_ptr[2] != -1) {
            return;
        }

        // Obtain voxel ptr
        index_t linear_idx = resolution3 * block_idx + voxel_idx;
        float tsdf_o = tsdf_base_ptr[linear_idx];

        float no[3] = {0}, ne[3] = {0};

        // Get normal at origin
        GetNormal(xv, yv, zv, workload_block_idx, no);

        // Enumerate 3 edges in the voxel
        for (index_t e = 0; e < 3; ++e) {
            index_t vertex_idx = mesh_struct_ptr[e];
            if (vertex_idx != -1) continue;

            index_t linear_idx_e =
                    GetLinearIdx(xv + (e == 0), yv + (e == 1), zv + (e == 2),
                                 workload_block_idx);
            // 0 is a valid linear index (buffer block 0, voxel 0);
            // only negative values signal an unallocated neighbor.
            OPEN3D_ASSERT(linear_idx_e >= 0 &&
                          "Internal error: GetVoxelAt returns nullptr.");
            float tsdf_e = tsdf_base_ptr[linear_idx_e];
            // Zero-crossing parameter along the edge (0 at origin voxel).
            float ratio = (0 - tsdf_o) / (tsdf_e - tsdf_o);

            // Claim an output slot and publish it for Pass 3.
            // NOTE(review): no bounds check against vertex_count here; a
            // preset vertex_count that is too small would overflow the
            // output buffers -- confirm callers always pass -1 or a safe
            // upper bound.
            index_t idx = OPEN3D_ATOMIC_ADD(count_ptr, 1);
            mesh_struct_ptr[e] = idx;

            float ratio_x = ratio * index_t(e == 0);
            float ratio_y = ratio * index_t(e == 1);
            float ratio_z = ratio * index_t(e == 2);

            float* vertex_ptr = vertex_indexer.GetDataPtr<float>(idx);
            vertex_ptr[0] = voxel_size * (x + ratio_x);
            vertex_ptr[1] = voxel_size * (y + ratio_y);
            vertex_ptr[2] = voxel_size * (z + ratio_z);

            // Get normal at edge and interpolate
            float* normal_ptr = normal_indexer.GetDataPtr<float>(idx);
            GetNormal(xv + (e == 0), yv + (e == 1), zv + (e == 2),
                      workload_block_idx, ne);
            float nx = (1 - ratio) * no[0] + ratio * ne[0];
            float ny = (1 - ratio) * no[1] + ratio * ne[1];
            float nz = (1 - ratio) * no[2] + ratio * ne[2];
            // Epsilon avoids division by zero for degenerate gradients.
            float norm = static_cast<float>(sqrt(nx * nx + ny * ny + nz * nz) +
                                            1e-5);
            normal_ptr[0] = nx / norm;
            normal_ptr[1] = ny / norm;
            normal_ptr[2] = nz / norm;

            if (color_base_ptr) {
                // Interpolate uint8-range colors and normalize to [0, 1].
                float* color_ptr = color_indexer.GetDataPtr<float>(idx);
                float r_o = color_base_ptr[linear_idx * 3 + 0];
                float g_o = color_base_ptr[linear_idx * 3 + 1];
                float b_o = color_base_ptr[linear_idx * 3 + 2];

                float r_e = color_base_ptr[linear_idx_e * 3 + 0];
                float g_e = color_base_ptr[linear_idx_e * 3 + 1];
                float b_e = color_base_ptr[linear_idx_e * 3 + 2];

                color_ptr[0] = ((1 - ratio) * r_o + ratio * r_e) / 255.0f;
                color_ptr[1] = ((1 - ratio) * g_o + ratio * g_e) / 255.0f;
                color_ptr[2] = ((1 - ratio) * b_o + ratio * b_e) / 255.0f;
            }
        }
    });

    // Pass 3: connect vertices and form triangles.
    // Heuristic upper bound: at most 3 triangles share each vertex slot.
    index_t triangle_count = vertex_count * 3;
    triangles = core::Tensor({triangle_count, 3}, core::Int32, device);
    ArrayIndexer triangle_indexer(triangles, 1);

#if defined(__CUDACC__)
    count = core::Tensor(std::vector<index_t>{0}, {}, core::Int32, device);
    count_ptr = count.GetDataPtr<index_t>();
#else
    (*count_ptr) = 0;
#endif
    core::ParallelFor(device, n, [=] OPEN3D_DEVICE(index_t widx) {
        // Natural index (0, N) -> (block_idx, voxel_idx)
        index_t workload_block_idx = widx / resolution3;
        index_t voxel_idx = widx % resolution3;

        // voxel_idx -> (x_voxel, y_voxel, z_voxel)
        index_t xv, yv, zv;
        voxel_indexer.WorkloadToCoord(voxel_idx, &xv, &yv, &zv);

        // Obtain voxel's mesh struct ptr
        index_t* mesh_struct_ptr = mesh_structure_indexer.GetDataPtr<index_t>(
                xv, yv, zv, workload_block_idx);

        index_t table_idx = mesh_struct_ptr[3];
        if (tri_count[table_idx] == 0) return;

        // Up to 5 triangles (15 indices) per cube; -1 terminates the list.
        for (index_t tri = 0; tri < 16; tri += 3) {
            if (tri_table[table_idx][tri] == -1) return;

            index_t tri_idx = OPEN3D_ATOMIC_ADD(count_ptr, 1);

            for (index_t vertex = 0; vertex < 3; ++vertex) {
                index_t edge = tri_table[table_idx][tri + vertex];

                index_t xv_i = xv + edge_shifts[edge][0];
                index_t yv_i = yv + edge_shifts[edge][1];
                index_t zv_i = zv + edge_shifts[edge][2];
                index_t edge_i = edge_shifts[edge][3];

                // Look up the vertex index stored by Pass 2, possibly in a
                // neighbor block.
                index_t dxb = xv_i / resolution;
                index_t dyb = yv_i / resolution;
                index_t dzb = zv_i / resolution;

                index_t nb_idx = (dxb + 1) + (dyb + 1) * 3 + (dzb + 1) * 9;

                index_t block_idx_i =
                        *nb_block_indices_indexer.GetDataPtr<index_t>(
                                workload_block_idx, nb_idx);
                index_t* mesh_struct_ptr_i =
                        mesh_structure_indexer.GetDataPtr<index_t>(
                                xv_i - dxb * resolution,
                                yv_i - dyb * resolution,
                                zv_i - dzb * resolution,
                                inv_indices_ptr[block_idx_i]);

                index_t* triangle_ptr =
                        triangle_indexer.GetDataPtr<index_t>(tri_idx);
                // Reverse winding order (2 - vertex) for outward-facing
                // normals.
                triangle_ptr[2 - vertex] = mesh_struct_ptr_i[edge_i];
            }
        }
    });

#if defined(__CUDACC__)
    triangle_count = count.Item<index_t>();
#else
    triangle_count = (*count_ptr).load();
#endif
    utility::LogDebug("Total triangle count = {}", triangle_count);
    // Trim the over-allocated triangle buffer to the actual count.
    triangles = triangles.Slice(0, 0, triangle_count);
}
} // namespace voxel_grid
} // namespace kernel
} // namespace geometry
} // namespace t
} // namespace open3d