// ----------------------------------------------------------------------------
// -                        Open3D: www.open3d.org                            -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#pragma once

#include <cub/cub.cuh>

#include <limits>

#include "open3d/ml/impl/misc/MemoryAllocation.h"
#include "open3d/utility/Helper.h"
#include "open3d/utility/MiniVec.h"

namespace open3d {
namespace ml {
namespace impl {

namespace {

using namespace open3d::utility;

template <class T, bool LARGE_ARRAY>
__global__ void IotaCUDAKernel(T* first, int64_t len, T value) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    if (LARGE_ARRAY) {
        const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
        const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
        linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                     y * gridDim.x * blockDim.x + x;
    } else {
        linear_idx = x;
    }
    if (linear_idx < len) {
        T* ptr = first + linear_idx;
        value += linear_idx;
        *ptr = value;
    }
}

/// Iota function for CUDA
template <class T>
void IotaCUDA(const cudaStream_t& stream, T* first, T* last, T value) {
    ptrdiff_t len = last - first;
    if (len) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        if (len > block.x * INT32_MAX) {
            grid.y = std::ceil(std::cbrt(len));
            grid.z = grid.y;
            grid.x = DivUp(len, int64_t(grid.z) * grid.y * block.x);
            IotaCUDAKernel<T, true>
                    <<<grid, block, 0, stream>>>(first, len, value);
        } else {
            grid = dim3(DivUp(len, block.x), 1, 1);
            IotaCUDAKernel<T, false>
                    <<<grid, block, 0, stream>>>(first, len, value);
        }
    }
}

__global__ void ComputeBatchIdKernel(int64_t* hashes,
                                     const int64_t num_voxels,
                                     const int64_t batch_hash) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_voxels) return;

    hashes[linear_idx] /= batch_hash;
}

/// This function computes batch_id from hash value.
///
/// \param hashes Input and output array.
/// \param num_voxels Number of valid voxels.
/// \param batch_hash The value used to hash batch dimension.
///
void ComputeBatchId(const cudaStream_t& stream,
                    int64_t* hashes,
                    const int64_t num_voxels,
                    const int64_t batch_hash) {
    if (num_voxels) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(num_voxels));
        grid.z = grid.y;
        grid.x = DivUp(num_voxels, int64_t(grid.z) * grid.y * block.x);

        ComputeBatchIdKernel<<<grid, block, 0, stream>>>(hashes, num_voxels,
                                                         batch_hash);
    }
}

__global__ void ComputeVoxelPerBatchKernel(int64_t* num_voxels_per_batch,
                                           int64_t* unique_batches_count,
                                           int64_t* unique_batches,
                                           const int64_t num_batches) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_batches) return;

    int64_t out_idx = unique_batches[linear_idx];
    num_voxels_per_batch[out_idx] = unique_batches_count[linear_idx];
}
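// Note (added for clarity): ComputeVoxelPerBatchKernel scatters the counts of
// the non-empty batches into a dense per-batch array. Illustrative example
// (hypothetical numbers, not from the original code): with batch_size 4,
// unique_batches = [0, 2, 3] and unique_batches_count = [5, 1, 2], a
// zero-initialized output becomes num_voxels_per_batch = [5, 0, 1, 2].
// VoxelizeCUDA below zeroes the output with cudaMemset before launching this
// kernel, so entries for empty batches stay 0.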
/// This function computes number of voxels per batch element.
///
/// \param num_voxels_per_batch The output array.
/// \param unique_batches_count Counts for unique batch_id.
/// \param unique_batches Unique batch_id.
/// \param num_batches Number of non empty batches (<= batch_size).
///
void ComputeVoxelPerBatch(const cudaStream_t& stream,
                          int64_t* num_voxels_per_batch,
                          int64_t* unique_batches_count,
                          int64_t* unique_batches,
                          const int64_t num_batches) {
    if (num_batches) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(num_batches));
        grid.z = grid.y;
        grid.x = DivUp(num_batches, int64_t(grid.z) * grid.y * block.x);

        ComputeVoxelPerBatchKernel<<<grid, block, 0, stream>>>(
                num_voxels_per_batch, unique_batches_count, unique_batches,
                num_batches);
    }
}

__global__ void ComputeIndicesBatchesKernel(int64_t* indices_batches,
                                            const int64_t* row_splits,
                                            const int64_t batch_size) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= batch_size) return;

    for (int64_t i = row_splits[linear_idx]; i < row_splits[linear_idx + 1];
         ++i) {
        indices_batches[i] = linear_idx;
    }
}

/// This function computes mapping of index to batch_id.
///
/// \param indices_batches The output array.
/// \param row_splits The row_splits for defining batches.
/// \param batch_size The batch_size of given points.
///
void ComputeIndicesBatches(const cudaStream_t& stream,
                           int64_t* indices_batches,
                           const int64_t* row_splits,
                           const int64_t batch_size) {
    if (batch_size) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(batch_size));
        grid.z = grid.y;
        grid.x = DivUp(batch_size, int64_t(grid.z) * grid.y * block.x);

        ComputeIndicesBatchesKernel<<<grid, block, 0, stream>>>(
                indices_batches, row_splits, batch_size);
    }
}

template <class T, int NDIM>
__global__ void ComputeHashKernel(
        int64_t* __restrict__ hashes,
        int64_t num_points,
        const T* const __restrict__ points,
        const int64_t batch_size,
        const int64_t* row_splits,
        const int64_t* indices_batches,
        const open3d::utility::MiniVec<T, NDIM> points_range_min_vec,
        const open3d::utility::MiniVec<T, NDIM> points_range_max_vec,
        const open3d::utility::MiniVec<T, NDIM> inv_voxel_size,
        const open3d::utility::MiniVec<int64_t, NDIM> strides,
        const int64_t batch_hash,
        const int64_t invalid_hash) {
    typedef MiniVec<T, NDIM> Vec_t;
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_points) return;

    Vec_t point(points + linear_idx * NDIM);

    if ((point >= points_range_min_vec && point <= points_range_max_vec)
                .all()) {
        auto coords = ((point - points_range_min_vec) * inv_voxel_size)
                              .template cast<int64_t>();
        int64_t h = coords.dot(strides);
        h += indices_batches[linear_idx] * batch_hash;  // add hash for batch_id
        hashes[linear_idx] = h;
    } else {
        hashes[linear_idx] = invalid_hash;  // max hash value used as invalid
    }
}
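// Note (added for clarity): the hash computed above is the row-major linear
// cell index plus a per-batch offset, i.e.
//   h = coords[0]*strides[0] + ... + coords[NDIM-1]*strides[NDIM-1]
//       + batch_id * batch_hash.
// Illustrative example (hypothetical numbers): for NDIM=3 with extents
// [10, 10, 10] the strides are [1, 10, 100] and batch_hash = 1000; a point in
// cell (2, 3, 4) of batch 1 gets h = 2 + 30 + 400 + 1000 = 1432. Every valid
// hash is smaller than batch_hash * batch_size, which is why that value can
// serve as the invalid_hash sentinel.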
/// This function computes the hash (linear index) for each point.
/// Points outside the range will get a specific hash value.
///
/// \tparam T The floating point type for the points
/// \tparam NDIM The number of dimensions, e.g., 3.
///
/// \param hashes The output vector with the hashes/linear indexes.
/// \param num_points The number of points.
/// \param points The array with the point coordinates. The shape is
///        [num_points,NDIM] and the storage order is row-major.
/// \param batch_size The batch size of points.
/// \param row_splits row_splits for defining batches.
/// \param indices_batches Mapping of index to batch_id.
/// \param points_range_min_vec The minimum range for a point to be valid.
/// \param points_range_max_vec The maximum range for a point to be valid.
/// \param inv_voxel_size The reciprocal of the voxel edge lengths in each
///        dimension.
/// \param strides The strides for computing the linear index.
/// \param batch_hash The value for hashing batch dimension.
/// \param invalid_hash The value to use for points outside the range.
template <class T, int NDIM>
void ComputeHash(const cudaStream_t& stream,
                 int64_t* hashes,
                 int64_t num_points,
                 const T* const points,
                 const int64_t batch_size,
                 const int64_t* row_splits,
                 const int64_t* indices_batches,
                 const MiniVec<T, NDIM> points_range_min_vec,
                 const MiniVec<T, NDIM> points_range_max_vec,
                 const MiniVec<T, NDIM> inv_voxel_size,
                 const MiniVec<int64_t, NDIM> strides,
                 const int64_t batch_hash,
                 const int64_t invalid_hash) {
    if (num_points) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(num_points));
        grid.z = grid.y;
        grid.x = DivUp(num_points, int64_t(grid.z) * grid.y * block.x);

        ComputeHashKernel<<<grid, block, 0, stream>>>(
                hashes, num_points, points, batch_size, row_splits,
                indices_batches, points_range_min_vec, points_range_max_vec,
                inv_voxel_size, strides, batch_hash, invalid_hash);
    }
}

template <class T>
__global__ void LimitCountsKernel(T* counts, int64_t num, T limit) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num) return;

    if (counts[linear_idx] > limit) {
        counts[linear_idx] = limit;
    }
}

/// This function performs an element-wise minimum operation.
///
/// \param counts The input and output array.
/// \param num Number of input elements.
/// \param limit The second operator for the minimum operation.
template <class T>
void LimitCounts(const cudaStream_t& stream, T* counts, int64_t num, T limit) {
    if (num) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(num));
        grid.z = grid.y;
        grid.x = DivUp(num, int64_t(grid.z) * grid.y * block.x);

        LimitCountsKernel<<<grid, block, 0, stream>>>(counts, num, limit);
    }
}

__global__ void ComputeStartIdxKernel(
        int64_t* start_idx,
        int64_t* points_count,
        const int64_t* num_voxels_prefix_sum,
        const int64_t* unique_hashes_count_prefix_sum,
        const int64_t* out_batch_splits,
        const int64_t batch_size,
        const int64_t max_voxels,
        const int64_t max_points_per_voxel) {
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= batch_size) return;

    int64_t voxel_idx;
    if (0 == linear_idx) {
        voxel_idx = 0;
    } else {
        voxel_idx = num_voxels_prefix_sum[linear_idx - 1];
    }

    int64_t begin_out = out_batch_splits[linear_idx];
    int64_t end_out = out_batch_splits[linear_idx + 1];

    for (int64_t out_idx = begin_out; out_idx < end_out;
         out_idx++, voxel_idx++) {
        if (voxel_idx == 0) {
            start_idx[out_idx] = 0;
            points_count[out_idx] = min(max_points_per_voxel,
                                        unique_hashes_count_prefix_sum[0]);
        } else {
            start_idx[out_idx] = unique_hashes_count_prefix_sum[voxel_idx - 1];
            points_count[out_idx] =
                    min(max_points_per_voxel,
                        unique_hashes_count_prefix_sum[voxel_idx] -
                                unique_hashes_count_prefix_sum[voxel_idx - 1]);
        }
    }
}
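// Note (added for clarity): ComputeStartIdxKernel is only used on the
// truncation path, i.e. when the total voxel count exceeds max_voxels and only
// the first max_voxels voxels of each batch are kept. Illustrative example
// (hypothetical numbers): per-batch voxel counts [3, 5] with max_voxels = 4
// give num_voxels_prefix_sum = [3, 8] (uncapped) and out_batch_splits =
// [0, 3, 7] (capped). Batch 1 then fills output slots 3..6, which map back to
// its first four voxels (global voxel indices 3..6) in
// unique_hashes_count_prefix_sum.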
/// Computes the starting index of each voxel.
///
/// \param start_idx The output array for storing starting index.
/// \param points_count The output array for storing points count.
/// \param num_voxels_prefix_sum The inclusive prefix sum which gives
///        the index of the starting voxel for each batch.
/// \param unique_hashes_count_prefix_sum Inclusive prefix sum defining
///        where the point indices for each voxel end.
/// \param out_batch_splits Defines starting and ending voxels for
///        each batch element.
/// \param batch_size The batch size.
/// \param max_voxels Maximum voxels per batch.
/// \param max_points_per_voxel Maximum points per voxel.
///
void ComputeStartIdx(const cudaStream_t& stream,
                     int64_t* start_idx,
                     int64_t* points_count,
                     const int64_t* num_voxels_prefix_sum,
                     const int64_t* unique_hashes_count_prefix_sum,
                     const int64_t* out_batch_splits,
                     const int64_t batch_size,
                     const int64_t max_voxels,
                     const int64_t max_points_per_voxel) {
    if (batch_size) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(batch_size));
        grid.z = grid.y;
        grid.x = DivUp(batch_size, int64_t(grid.z) * grid.y * block.x);

        ComputeStartIdxKernel<<<grid, block, 0, stream>>>(
                start_idx, points_count, num_voxels_prefix_sum,
                unique_hashes_count_prefix_sum, out_batch_splits, batch_size,
                max_voxels, max_points_per_voxel);
    }
}

template <class T, int NDIM>
__global__ void ComputeVoxelCoordsKernel(
        int32_t* __restrict__ voxel_coords,
        const T* const __restrict__ points,
        const int64_t* const __restrict__ point_indices,
        const int64_t* const __restrict__ prefix_sum,
        const MiniVec<T, NDIM> points_range_min_vec,
        const MiniVec<T, NDIM> inv_voxel_size,
        int64_t num_voxels) {
    typedef MiniVec<T, NDIM> Vec_t;
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_voxels) return;

    int64_t point_idx = point_indices[prefix_sum[linear_idx]];
    Vec_t point(points + point_idx * NDIM);
    auto coords = ((point - points_range_min_vec) * inv_voxel_size)
                          .template cast<int32_t>();
    for (int i = 0; i < NDIM; ++i) {
        voxel_coords[linear_idx * NDIM + i] = coords[i];
    }
}
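// Note (added for clarity): the kernel above recovers the integer cell
// coordinates of a voxel from its first assigned point:
// point_indices[prefix_sum[voxel]] is the first sorted point that fell into
// the voxel, and re-applying the (point - range_min) * inv_voxel_size
// quantization to that point reproduces the voxel's coordinates. Any point of
// the voxel would give the same result since all of them quantize to the
// same cell.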
/// Computes the coordinates for each voxel.
///
/// \param voxel_coords The output array with shape [num_voxels, NDIM].
/// \param points The array with the point coordinates.
/// \param point_indices The array with the point indices for all voxels.
/// \param prefix_sum Array defining where the point indices for each voxel
///        start.
/// \param points_range_min The lower bound of the domain to be voxelized.
/// \param points_range_max The upper bound of the domain to be voxelized.
/// \param inv_voxel_size The reciprocal of the voxel edge lengths for each
///        dimension.
/// \param num_voxels The number of voxels.
template <class T, int NDIM>
void ComputeVoxelCoords(const cudaStream_t& stream,
                        int32_t* voxel_coords,
                        const T* const points,
                        const int64_t* const point_indices,
                        const int64_t* const prefix_sum,
                        const MiniVec<T, NDIM> points_range_min_vec,
                        const MiniVec<T, NDIM> inv_voxel_size,
                        int64_t num_voxels) {
    if (num_voxels) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(num_voxels));
        grid.z = grid.y;
        grid.x = DivUp(num_voxels, int64_t(grid.z) * grid.y * block.x);

        ComputeVoxelCoordsKernel<<<grid, block, 0, stream>>>(
                voxel_coords, points, point_indices, prefix_sum,
                points_range_min_vec, inv_voxel_size, num_voxels);
    }
}

__global__ void CopyPointIndicesKernel(
        int64_t* __restrict__ out,
        const int64_t* const __restrict__ point_indices,
        const int64_t* const __restrict__ prefix_sum_in,
        const int64_t* const __restrict__ prefix_sum_out,
        const int64_t num_voxels) {
    // TODO data coalescing can be optimized
    int64_t linear_idx;
    const int64_t x = blockDim.x * blockIdx.x + threadIdx.x;
    const int64_t y = blockDim.y * blockIdx.y + threadIdx.y;
    const int64_t z = blockDim.z * blockIdx.z + threadIdx.z;
    linear_idx = z * gridDim.x * blockDim.x * gridDim.y +
                 y * gridDim.x * blockDim.x + x;

    if (linear_idx >= num_voxels) return;

    int64_t begin_out;
    if (0 == linear_idx) {
        begin_out = 0;
    } else {
        begin_out = prefix_sum_out[linear_idx - 1];
    }
    int64_t end_out = prefix_sum_out[linear_idx];
    int64_t num_points = end_out - begin_out;

    int64_t in_idx = prefix_sum_in[linear_idx];

    for (int64_t out_idx = begin_out; out_idx < end_out; ++out_idx, ++in_idx) {
        out[out_idx] = point_indices[in_idx];
    }
}

/// Copies the point indices for each voxel to the output.
///
/// \param out The output array with the point indices for all voxels.
/// \param point_indices The array with the point indices for all voxels.
/// \param prefix_sum_in Array defining where the point indices for each
///        voxel start in \p point_indices.
/// \param prefix_sum_out Inclusive prefix sum defining where the point
///        indices for each voxel end in \p out.
/// \param num_voxels The number of voxels.
///
void CopyPointIndices(const cudaStream_t& stream,
                      int64_t* out,
                      const int64_t* const point_indices,
                      const int64_t* const prefix_sum_in,
                      const int64_t* const prefix_sum_out,
                      const int64_t num_voxels) {
    if (num_voxels) {
        const int BLOCKSIZE = 128;
        dim3 block(BLOCKSIZE, 1, 1);
        dim3 grid;
        grid.y = std::ceil(std::cbrt(num_voxels));
        grid.z = grid.y;
        grid.x = DivUp(num_voxels, int64_t(grid.z) * grid.y * block.x);

        CopyPointIndicesKernel<<<grid, block, 0, stream>>>(
                out, point_indices, prefix_sum_in, prefix_sum_out, num_voxels);
    }
}

}  // namespace
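// Usage sketch (added for illustration; the name CudaOutputAllocator and the
// cudaMalloc-based allocation are assumptions, not part of the original code).
// It shows the two-phase calling convention described below: first query the
// required temporary memory size with temp == nullptr, then run the
// voxelization with a real temp buffer.
//
//   struct CudaOutputAllocator {
//       void AllocVoxelCoords(int32_t** ptr, int64_t rows, int64_t cols) {
//           if (rows * cols) cudaMalloc(ptr, rows * cols * sizeof(int32_t));
//       }
//       void AllocVoxelPointIndices(int64_t** ptr, int64_t size) {
//           if (size) cudaMalloc(ptr, size * sizeof(int64_t));
//       }
//       void AllocVoxelPointRowSplits(int64_t** ptr, int64_t size) {
//           if (size) cudaMalloc(ptr, size * sizeof(int64_t));
//       }
//       void AllocVoxelBatchSplits(int64_t** ptr, int64_t size) {
//           if (size) cudaMalloc(ptr, size * sizeof(int64_t));
//       }
//   };
//
//   CudaOutputAllocator alloc;
//   size_t temp_size = 0;
//   VoxelizeCUDA<float, 3>(stream, nullptr, temp_size, texture_alignment,
//                          num_points, points, batch_size, row_splits,
//                          voxel_size, points_range_min, points_range_max,
//                          max_points_per_voxel, max_voxels, alloc);
//   void* temp = nullptr;
//   cudaMalloc(&temp, temp_size);
//   VoxelizeCUDA<float, 3>(stream, temp, temp_size, texture_alignment,
//                          num_points, points, batch_size, row_splits,
//                          voxel_size, points_range_min, points_range_max,
//                          max_points_per_voxel, max_voxels, alloc);
//   cudaFree(temp);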
/// This function voxelizes a point cloud.
/// The function returns the integer coordinates of the voxels that
/// contain points and a compact list of the indices that associate the
/// voxels to the points.
///
/// All pointer arguments point to device memory unless stated otherwise.
///
/// \tparam T Floating-point data type for the point positions.
///
/// \tparam NDIM The number of dimensions of the points.
///
/// \tparam OUTPUT_ALLOCATOR Type of the output_allocator. See
///         \p output_allocator for more information.
///
/// \param stream The cuda stream for all kernel launches.
///
/// \param temp Pointer to temporary memory. If nullptr then the required
///        size of temporary memory will be written to \p temp_size and no
///        work is done.
///
/// \param temp_size The size of the temporary memory in bytes. This is
///        used as an output if temp is nullptr.
///
/// \param texture_alignment The texture alignment in bytes. This is used
///        for allocating segments within the temporary memory.
///
/// \param num_points The number of points.
///
/// \param points Array with the point positions. The shape is
///        [num_points,NDIM].
///
/// \param batch_size The batch size of points.
///
/// \param row_splits row_splits for defining batches.
///
/// \param voxel_size The edge lengths of the voxel. The shape is [NDIM].
///        This pointer points to host memory!
///
/// \param points_range_min The lower bound of the domain to be voxelized.
///        The shape is [NDIM]. This pointer points to host memory!
///
/// \param points_range_max The upper bound of the domain to be voxelized.
///        The shape is [NDIM]. This pointer points to host memory!
///
/// \param max_points_per_voxel This parameter limits the number of points
///        that are recorded for each voxel.
///
/// \param max_voxels This parameter limits the number of voxels that
///        will be generated.
///
/// \param output_allocator An object that implements functions for
///        allocating the output arrays. The object must implement functions
///        AllocVoxelCoords(int32_t** ptr, int64_t rows, int64_t cols),
///        AllocVoxelPointIndices(int64_t** ptr, int64_t size),
///        AllocVoxelPointRowSplits(int64_t** ptr, int64_t size) and
///        AllocVoxelBatchSplits(int64_t** ptr, int64_t size).
///        All functions should allocate memory and return a pointer to that
///        memory in ptr. The arguments size, rows, and cols define the size
///        of the array as the number of elements.
///        All functions must accept zero size arguments. In this case ptr
///        does not need to be set.
///
template <class T, int NDIM, class OUTPUT_ALLOCATOR>
void VoxelizeCUDA(const cudaStream_t& stream,
                  void* temp,
                  size_t& temp_size,
                  int texture_alignment,
                  size_t num_points,
                  const T* const points,
                  const size_t batch_size,
                  const int64_t* const row_splits,
                  const T* const voxel_size,
                  const T* const points_range_min,
                  const T* const points_range_max,
                  const int64_t max_points_per_voxel,
                  const int64_t max_voxels,
                  OUTPUT_ALLOCATOR& output_allocator) {
    using namespace open3d::utility;
    typedef MiniVec<T, NDIM> Vec_t;

    const bool get_temp_size = !temp;

    if (get_temp_size) {
        temp = (char*)1;  // worst case pointer alignment
        temp_size = std::numeric_limits<int64_t>::max();
    }

    MemoryAllocation mem_temp(temp, temp_size, texture_alignment);

    const Vec_t inv_voxel_size = T(1) / Vec_t(voxel_size);
    const Vec_t points_range_min_vec(points_range_min);
    const Vec_t points_range_max_vec(points_range_max);
    MiniVec<int32_t, NDIM> extents =
            ceil((points_range_max_vec - points_range_min_vec) *
                 inv_voxel_size)
                    .template cast<int32_t>();
    MiniVec<int64_t, NDIM> strides;
    for (int i = 0; i < NDIM; ++i) {
        strides[i] = 1;
        for (int j = 0; j < i; ++j) {
            strides[i] *= extents[j];
        }
    }
    const int64_t batch_hash = strides[NDIM - 1] * extents[NDIM - 1];
    const int64_t invalid_hash = batch_hash * batch_size;

    /// store batch_id for each point
    std::pair<int64_t*, size_t> indices_batches =
            mem_temp.Alloc<int64_t>(num_points);
    if (!get_temp_size) {
        ComputeIndicesBatches(stream, indices_batches.first, row_splits,
                              batch_size);
    }

    // use double buffers for the sorting
    std::pair<int64_t*, size_t> point_indices =
            mem_temp.Alloc<int64_t>(num_points);
    std::pair<int64_t*, size_t> point_indices_alt =
            mem_temp.Alloc<int64_t>(num_points);
    std::pair<int64_t*, size_t> hashes = mem_temp.Alloc<int64_t>(num_points);
    std::pair<int64_t*, size_t> hashes_alt =
            mem_temp.Alloc<int64_t>(num_points);

    cub::DoubleBuffer<int64_t> point_indices_dbuf(point_indices.first,
                                                  point_indices_alt.first);
    cub::DoubleBuffer<int64_t> hashes_dbuf(hashes.first, hashes_alt.first);

    if (!get_temp_size) {
        IotaCUDA(stream, point_indices.first,
                 point_indices.first + point_indices.second, int64_t(0));
        ComputeHash(stream, hashes.first, num_points, points, batch_size,
                    row_splits, indices_batches.first, points_range_min_vec,
                    points_range_max_vec, inv_voxel_size, strides, batch_hash,
                    invalid_hash);
    }
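    // Note (added for clarity): at this point hashes holds one voxel hash per
    // point (or invalid_hash for points outside the range) and point_indices
    // holds 0..num_points-1. The radix sort below reorders the point indices
    // by hash so that points of the same voxel become contiguous; points with
    // invalid_hash sort to the end because invalid_hash is larger than any
    // valid hash.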
    {
        // TODO compute end_bit for radix sort
        std::pair<void*, size_t> sort_pairs_temp(nullptr, 0);
        cub::DeviceRadixSort::SortPairs(
                sort_pairs_temp.first, sort_pairs_temp.second, hashes_dbuf,
                point_indices_dbuf, num_points, 0, sizeof(int64_t) * 8,
                stream);
        sort_pairs_temp = mem_temp.Alloc(sort_pairs_temp.second);

        if (!get_temp_size) {
            cub::DeviceRadixSort::SortPairs(
                    sort_pairs_temp.first, sort_pairs_temp.second, hashes_dbuf,
                    point_indices_dbuf, num_points, 0, sizeof(int64_t) * 8,
                    stream);
        }
        mem_temp.Free(sort_pairs_temp);
    }

    // reuse the alternate buffers
    std::pair<int64_t*, size_t> unique_hashes(hashes_dbuf.Alternate(),
                                              hashes.second);
    std::pair<int64_t*, size_t> unique_hashes_count(
            point_indices_dbuf.Alternate(), point_indices.second);

    // encode unique hashes(voxels) and their counts(points per voxel)
    int64_t num_voxels = 0;
    int64_t last_hash = 0;  // 0 is a valid hash value
    {
        std::pair<void*, size_t> encode_temp(nullptr, 0);
        std::pair<int64_t*, size_t> num_voxels_mem = mem_temp.Alloc<int64_t>(1);

        cub::DeviceRunLengthEncode::Encode(
                encode_temp.first, encode_temp.second, hashes_dbuf.Current(),
                unique_hashes.first, unique_hashes_count.first,
                num_voxels_mem.first, num_points, stream);

        encode_temp = mem_temp.Alloc(encode_temp.second);

        if (!get_temp_size) {
            cub::DeviceRunLengthEncode::Encode(
                    encode_temp.first, encode_temp.second,
                    hashes_dbuf.Current(), unique_hashes.first,
                    unique_hashes_count.first, num_voxels_mem.first,
                    num_points, stream);

            // get the number of voxels
            cudaMemcpyAsync(&num_voxels, num_voxels_mem.first, sizeof(int64_t),
                            cudaMemcpyDeviceToHost, stream);

            // get the last hash value
            cudaMemcpyAsync(&last_hash,
                            hashes_dbuf.Current() + hashes.second - 1,
                            sizeof(int64_t), cudaMemcpyDeviceToHost, stream);

            // wait for the async copies
            while (cudaErrorNotReady == cudaStreamQuery(stream)) { /*empty*/
            }
        }
        mem_temp.Free(encode_temp);
    }

    if (invalid_hash == last_hash) {
        // the last hash is invalid we have one voxel less
        --num_voxels;
    }

    // reuse the hashes buffer
    std::pair<int64_t*, size_t> unique_hashes_count_prefix_sum(
            hashes_dbuf.Current(), hashes.second);

    // compute the prefix sum for unique_hashes_count
    // gives starting index of each voxel
    {
        std::pair<void*, size_t> inclusive_scan_temp(nullptr, 0);

        cub::DeviceScan::InclusiveSum(
                inclusive_scan_temp.first, inclusive_scan_temp.second,
                unique_hashes_count.first,
                unique_hashes_count_prefix_sum.first,
                unique_hashes_count.second, stream);

        inclusive_scan_temp = mem_temp.Alloc(inclusive_scan_temp.second);

        if (!get_temp_size) {
            // We only need the prefix sum for the first num_voxels.
            cub::DeviceScan::InclusiveSum(
                    inclusive_scan_temp.first, inclusive_scan_temp.second,
                    unique_hashes_count.first,
                    unique_hashes_count_prefix_sum.first, num_voxels, stream);
        }
        mem_temp.Free(inclusive_scan_temp);
    }
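    // Note (added for clarity): after the scan,
    // unique_hashes_count_prefix_sum[i] is the end (one past the last) of
    // voxel i's points in the sorted point index array, i.e. voxel i owns the
    // range [prefix_sum[i-1], prefix_sum[i]) with prefix_sum[-1] taken as 0.
    // The scan is limited to the first num_voxels entries, which also
    // excludes the count of a trailing invalid_hash run (num_voxels was
    // already decremented above if such a run exists).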
    // Limit the number of output points to max_points_per_voxel by
    // limiting the unique_hashes_count.
    if (!get_temp_size) {
        if (max_points_per_voxel < num_points) {
            LimitCounts(stream, unique_hashes_count.first, num_voxels,
                        max_points_per_voxel);
        }
    }

    // Convert unique_hashes to batch_id (divide with batch_hash)
    int64_t* unique_hashes_batch_id = unique_hashes.first;
    if (!get_temp_size) {
        ComputeBatchId(stream, unique_hashes_batch_id, num_voxels, batch_hash);
    }

    std::pair<int64_t*, size_t> unique_batches =
            mem_temp.Alloc<int64_t>(batch_size);
    std::pair<int64_t*, size_t> unique_batches_count =
            mem_temp.Alloc<int64_t>(batch_size);
    int64_t num_batches = 0;  // Store non empty batches

    // Convert batch_id to counts (array of num_voxels per batch)
    {
        std::pair<void*, size_t> encode_temp(nullptr, 0);
        std::pair<int64_t*, size_t> num_batches_mem =
                mem_temp.Alloc<int64_t>(1);

        cub::DeviceRunLengthEncode::Encode(
                encode_temp.first, encode_temp.second, unique_hashes_batch_id,
                unique_batches.first, unique_batches_count.first,
                num_batches_mem.first, num_voxels, stream);

        encode_temp = mem_temp.Alloc(encode_temp.second);

        if (!get_temp_size) {
            cub::DeviceRunLengthEncode::Encode(
                    encode_temp.first, encode_temp.second,
                    unique_hashes_batch_id, unique_batches.first,
                    unique_batches_count.first, num_batches_mem.first,
                    num_voxels, stream);

            // get the number of non empty batches.
            cudaMemcpyAsync(&num_batches, num_batches_mem.first,
                            sizeof(int64_t), cudaMemcpyDeviceToHost, stream);

            // wait for the async copies
            while (cudaErrorNotReady == cudaStreamQuery(stream)) { /*empty*/
            }
        }
        mem_temp.Free(encode_temp);
    }

    // Insert count(0) for empty batches
    std::pair<int64_t*, size_t> num_voxels_per_batch =
            mem_temp.Alloc<int64_t>(batch_size);
    if (!get_temp_size) {
        cudaMemset(num_voxels_per_batch.first, 0,
                   batch_size * sizeof(int64_t));
        ComputeVoxelPerBatch(stream, num_voxels_per_batch.first,
                             unique_batches_count.first, unique_batches.first,
                             num_batches);
    }

    std::pair<int64_t*, size_t> num_voxels_prefix_sum(unique_batches.first,
                                                      batch_size);
    // compute the prefix sum for number of voxels per batch
    // gives starting voxel index for each batch
    // used only when voxel count exceeds max_voxels
    {
        std::pair<void*, size_t> inclusive_scan_temp(nullptr, 0);

        cub::DeviceScan::InclusiveSum(
                inclusive_scan_temp.first, inclusive_scan_temp.second,
                num_voxels_per_batch.first, num_voxels_prefix_sum.first,
                num_voxels_per_batch.second, stream);

        inclusive_scan_temp = mem_temp.Alloc(inclusive_scan_temp.second);

        if (!get_temp_size) {
            if (num_voxels > max_voxels) {
                cub::DeviceScan::InclusiveSum(
                        inclusive_scan_temp.first, inclusive_scan_temp.second,
                        num_voxels_per_batch.first,
                        num_voxels_prefix_sum.first,
                        num_voxels_per_batch.second, stream);
            }
        }
        mem_temp.Free(inclusive_scan_temp);
    }

    // Limit the number of voxels per batch to max_voxels
    if (!get_temp_size) {
        if (num_voxels >= max_voxels)
            LimitCounts(stream, num_voxels_per_batch.first, batch_size,
                        max_voxels);
    }
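    // Note (added for clarity): num_voxels_per_batch now holds
    // min(count, max_voxels) for every batch element. Illustrative example
    // (hypothetical numbers): counts [2, 7, 0] with max_voxels = 5 become
    // [2, 5, 0], and the inclusive scan below (with a leading 0) yields
    // out_batch_splits = [0, 2, 7, 7].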
    // Prefix sum of limited counts to get batch splits.
    int64_t* out_batch_splits = nullptr;
    if (!get_temp_size) {
        output_allocator.AllocVoxelBatchSplits(&out_batch_splits,
                                               batch_size + 1);
        cudaMemsetAsync(out_batch_splits, 0, sizeof(int64_t), stream);
    }

    // Prefix sum of counts to get batch splits
    {
        std::pair<void*, size_t> inclusive_scan_temp(nullptr, 0);

        cub::DeviceScan::InclusiveSum(
                inclusive_scan_temp.first, inclusive_scan_temp.second,
                num_voxels_per_batch.first, out_batch_splits + 1,
                num_voxels_per_batch.second, stream);

        inclusive_scan_temp = mem_temp.Alloc(inclusive_scan_temp.second);

        if (!get_temp_size) {
            cub::DeviceScan::InclusiveSum(
                    inclusive_scan_temp.first, inclusive_scan_temp.second,
                    num_voxels_per_batch.first, out_batch_splits + 1,
                    batch_size, stream);
        }
        mem_temp.Free(inclusive_scan_temp);
    }

    // num_valid_voxels excludes voxels exceeding max_voxels
    int64_t num_valid_voxels = num_points;
    if (!get_temp_size) {
        // get the number of valid voxels.
        cudaMemcpyAsync(&num_valid_voxels, out_batch_splits + batch_size,
                        sizeof(int64_t), cudaMemcpyDeviceToHost, stream);

        // wait for the async copies
        while (cudaErrorNotReady == cudaStreamQuery(stream)) { /*empty*/
        }
    }

    // start_idx stores starting index of each valid voxel.
    // points_count stores number of valid points in respective voxel.
    std::pair<int64_t*, size_t> start_idx(indices_batches.first,
                                          num_valid_voxels);
    std::pair<int64_t*, size_t> points_count =
            mem_temp.Alloc<int64_t>(num_valid_voxels);

    if (!get_temp_size) {
        if (num_voxels <= max_voxels) {
            // starting index and points_count will be the same as
            // unique_hashes_count_prefix_sum and unique_hashes_count when the
            // voxel count doesn't exceed max_voxels
            cudaMemsetAsync(start_idx.first, 0, sizeof(int64_t), stream);
            cudaMemcpyAsync(start_idx.first + 1,
                            unique_hashes_count_prefix_sum.first,
                            (num_voxels - 1) * sizeof(int64_t),
                            cudaMemcpyDeviceToDevice, stream);
            mem_temp.Free(points_count);
            points_count.first = unique_hashes_count.first;
        } else {
            ComputeStartIdx(stream, start_idx.first, points_count.first,
                            num_voxels_prefix_sum.first,
                            unique_hashes_count_prefix_sum.first,
                            out_batch_splits, batch_size, max_voxels,
                            max_points_per_voxel);
        }
    }

    int64_t* out_voxel_row_splits = nullptr;
    if (!get_temp_size) {
        output_allocator.AllocVoxelPointRowSplits(&out_voxel_row_splits,
                                                  num_valid_voxels + 1);
    }

    if (!get_temp_size) {
        // set first element to 0
        cudaMemsetAsync(out_voxel_row_splits, 0, sizeof(int64_t), stream);
    }
    {
        std::pair<void*, size_t> inclusive_scan_temp(nullptr, 0);

        cub::DeviceScan::InclusiveSum(
                inclusive_scan_temp.first, inclusive_scan_temp.second,
                points_count.first, out_voxel_row_splits + 1, num_valid_voxels,
                stream);

        inclusive_scan_temp = mem_temp.Alloc(inclusive_scan_temp.second);

        if (!get_temp_size) {
            cub::DeviceScan::InclusiveSum(
                    inclusive_scan_temp.first, inclusive_scan_temp.second,
                    points_count.first, out_voxel_row_splits + 1,
                    num_valid_voxels, stream);
        }
        mem_temp.Free(inclusive_scan_temp);
    }
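    // Note (added for clarity): out_voxel_row_splits is now a row-splits
    // (CSR-style) array over the kept voxels: the points of voxel i are
    // out_point_indices[out_voxel_row_splits[i]:out_voxel_row_splits[i+1]],
    // with each voxel contributing at most max_points_per_voxel entries. The
    // remaining steps fill in the voxel coordinates and copy the point
    // indices; they are skipped in the temp-size query below.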
    if (get_temp_size) {
        // return the memory peak as the required temporary memory size.
        temp_size = mem_temp.MaxUsed();
        return;
    }

    int32_t* out_voxel_coords = nullptr;
    output_allocator.AllocVoxelCoords(&out_voxel_coords, num_valid_voxels,
                                      NDIM);

    ComputeVoxelCoords(stream, out_voxel_coords, points,
                       point_indices_dbuf.Current(), start_idx.first,
                       points_range_min_vec, inv_voxel_size, num_valid_voxels);

    int64_t num_valid_points = 0;
    {
        cudaMemcpyAsync(&num_valid_points,
                        out_voxel_row_splits + num_valid_voxels,
                        sizeof(int64_t), cudaMemcpyDeviceToHost, stream);

        // wait for the async copies
        while (cudaErrorNotReady == cudaStreamQuery(stream)) { /*empty*/
        }
    }

    int64_t* out_point_indices = nullptr;
    output_allocator.AllocVoxelPointIndices(&out_point_indices,
                                            num_valid_points);

    CopyPointIndices(stream, out_point_indices, point_indices_dbuf.Current(),
                     start_idx.first, out_voxel_row_splits + 1,
                     num_valid_voxels);
}

}  // namespace impl
}  // namespace ml
}  // namespace open3d