Skip to content

Commit

Permalink
Voxel Downsample for Tensor interface (#6249)
Browse files Browse the repository at this point in the history
* infra for tensor index-reduction and interface for voxeldownsample

* basic sum reduction

* temp fix for workload nums

* temp fix with contiguous input

* separate index tensor

* add CPU counterpart

* clean up cpp part

* add unit test for index_add_

* fix point attribute shape

* fix unit tests

* fix doc, also fix several doc issues in t.pointcloud

* fix benchmark
  • Loading branch information
theNded authored Aug 11, 2023
1 parent f8fbef1 commit a9370f2
Show file tree
Hide file tree
Showing 14 changed files with 478 additions and 83 deletions.
50 changes: 28 additions & 22 deletions cpp/benchmarks/t/geometry/PointCloud.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,18 +69,18 @@ void LegacyVoxelDownSample(benchmark::State& state, float voxel_size) {
// Benchmarks t::geometry::PointCloud::VoxelDownSample on `device` for the
// given `voxel_size`: the cloud is read from `path`, moved to `device`,
// downsampled once as a warm-up and then once per benchmark iteration.
//
// NOTE(review): this span is a rendered diff without +/- markers. Each line
// mentioning `backend` looks like the pre-change version of the `reduction`
// line that follows it; only one line of each pair belongs in the compiled
// file -- confirm against the repository.
void VoxelDownSample(benchmark::State& state,
const core::Device& device,
float voxel_size,
const core::HashBackendType& backend) {
const std::string& reduction) {
t::geometry::PointCloud pcd;
// t::io::CreatePointCloudFromFile lacks support of remove_inf_points and
// remove_nan_points, so ReadPointCloud is called with explicit options.
t::io::ReadPointCloud(path, pcd, {"auto", false, false, false});
pcd = pcd.To(device);

// Warm up: one call before the measured loop.
pcd.VoxelDownSample(voxel_size, backend);
pcd.VoxelDownSample(voxel_size, reduction);

for (auto _ : state) {
pcd.VoxelDownSample(voxel_size, backend);
pcd.VoxelDownSample(voxel_size, reduction);
// Synchronize inside the loop so each iteration waits for the device;
// presumably a no-op for CPU devices -- TODO confirm.
core::cuda::Synchronize(device);
}
}
Expand Down Expand Up @@ -387,28 +387,34 @@ BENCHMARK_CAPTURE(ToLegacyPointCloud, CUDA, core::Device("CUDA:0"))
->Unit(benchmark::kMillisecond);
#endif

#define ENUM_VOXELSIZE(DEVICE, BACKEND) \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_01, DEVICE, 0.01, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_02, DEVICE, 0.08, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_04, DEVICE, 0.04, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_08, DEVICE, 0.08, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_16, DEVICE, 0.16, BACKEND) \
->Unit(benchmark::kMillisecond); \
BENCHMARK_CAPTURE(VoxelDownSample, BACKEND##_0_32, DEVICE, 0.32, BACKEND) \
// Registers one VoxelDownSample benchmark per voxel size (0.01, 0.02, 0.04,
// 0.08, 0.16, 0.32) on DEVICE with the given REDUCTION, reported in
// milliseconds. REDUCTION is token-pasted into each benchmark name, so it must
// be a plain identifier (e.g. a named constant), not a literal or expression.
// The numeric suffix of every name must match the voxel size passed to the
// benchmark so that reported results are attributed to the right size.
#define ENUM_VOXELSIZE(DEVICE, REDUCTION)                              \
    BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_01, DEVICE, 0.01, \
                      REDUCTION)                                       \
            ->Unit(benchmark::kMillisecond);                           \
    BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_02, DEVICE, 0.02, \
                      REDUCTION)                                       \
            ->Unit(benchmark::kMillisecond);                           \
    BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_04, DEVICE, 0.04, \
                      REDUCTION)                                       \
            ->Unit(benchmark::kMillisecond);                           \
    BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_08, DEVICE, 0.08, \
                      REDUCTION)                                       \
            ->Unit(benchmark::kMillisecond);                           \
    BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_16, DEVICE, 0.16, \
                      REDUCTION)                                       \
            ->Unit(benchmark::kMillisecond);                           \
    BENCHMARK_CAPTURE(VoxelDownSample, REDUCTION##_0_32, DEVICE, 0.32, \
                      REDUCTION)                                       \
            ->Unit(benchmark::kMillisecond);

// Reduction mode handed to the VoxelDownSample benchmarks. It is a named
// constant rather than a string literal because ENUM_VOXELSIZE token-pastes
// its REDUCTION argument into the benchmark name.
const std::string kReductionMean = "mean";
// NOTE(review): this span is a rendered diff without +/- markers. The
// ENUM_VOXELDOWNSAMPLE_BACKEND definitions look like the pre-change lines and
// ENUM_VOXELDOWNSAMPLE_REDUCTION like their replacements -- confirm against
// the repository before relying on both.
#ifdef BUILD_CUDA_MODULE
// CUDA build: enumerate the voxel sizes on CPU:0 and on CUDA:0.
#define ENUM_VOXELDOWNSAMPLE_BACKEND() \
ENUM_VOXELSIZE(core::Device("CPU:0"), core::HashBackendType::TBB) \
ENUM_VOXELSIZE(core::Device("CUDA:0"), core::HashBackendType::Slab) \
ENUM_VOXELSIZE(core::Device("CUDA:0"), core::HashBackendType::StdGPU)
#define ENUM_VOXELDOWNSAMPLE_REDUCTION() \
ENUM_VOXELSIZE(core::Device("CPU:0"), kReductionMean) \
ENUM_VOXELSIZE(core::Device("CUDA:0"), kReductionMean)
#else
// CPU-only build: enumerate the voxel sizes on CPU:0 only.
#define ENUM_VOXELDOWNSAMPLE_BACKEND() \
ENUM_VOXELSIZE(core::Device("CPU:0"), core::HashBackendType::TBB)
#define ENUM_VOXELDOWNSAMPLE_REDUCTION() \
ENUM_VOXELSIZE(core::Device("CPU:0"), kReductionMean)
#endif

BENCHMARK_CAPTURE(LegacyVoxelDownSample, Legacy_0_01, 0.01)
Expand All @@ -423,7 +429,7 @@ BENCHMARK_CAPTURE(LegacyVoxelDownSample, Legacy_0_16, 0.16)
->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(LegacyVoxelDownSample, Legacy_0_32, 0.32)
->Unit(benchmark::kMillisecond);
ENUM_VOXELDOWNSAMPLE_BACKEND()
ENUM_VOXELDOWNSAMPLE_REDUCTION()

BENCHMARK_CAPTURE(LegacyUniformDownSample, Legacy_2, 2)
->Unit(benchmark::kMillisecond);
Expand Down
3 changes: 3 additions & 0 deletions cpp/open3d/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ target_sources(core PRIVATE
kernel/BinaryEWCPU.cpp
kernel/IndexGetSet.cpp
kernel/IndexGetSetCPU.cpp
kernel/IndexReduction.cpp
kernel/IndexReductionCPU.cpp
kernel/Kernel.cpp
kernel/NonZero.cpp
kernel/NonZeroCPU.cpp
Expand Down Expand Up @@ -90,6 +92,7 @@ if (BUILD_CUDA_MODULE)
kernel/ArangeCUDA.cu
kernel/BinaryEWCUDA.cu
kernel/IndexGetSetCUDA.cu
kernel/IndexReductionCUDA.cu
kernel/NonZeroCUDA.cu
kernel/ReductionCUDA.cu
kernel/UnaryEWCUDA.cu
Expand Down
38 changes: 38 additions & 0 deletions cpp/open3d/core/Tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "open3d/core/TensorFunction.h"
#include "open3d/core/TensorKey.h"
#include "open3d/core/kernel/Arange.h"
#include "open3d/core/kernel/IndexReduction.h"
#include "open3d/core/kernel/Kernel.h"
#include "open3d/core/linalg/Det.h"
#include "open3d/core/linalg/Inverse.h"
Expand Down Expand Up @@ -955,6 +956,43 @@ void Tensor::IndexSet(const std::vector<Tensor>& index_tensors,
aip.GetIndexedShape(), aip.GetIndexedStrides());
}

void Tensor::IndexAdd_(int64_t dim, const Tensor& index, const Tensor& src) {
if (index.NumDims() != 1) {
utility::LogError("IndexAdd_ only supports 1D index tensors.");
}

// Dim check.
if (dim < 0) {
utility::LogError("IndexAdd_ only supports sum at non-negative dim.");
}
if (NumDims() <= dim) {
utility::LogError("Sum dim {} exceeds tensor dim {}.", dim, NumDims());
}

// shape check
if (src.NumDims() != NumDims()) {
utility::LogError(
"IndexAdd_ only supports src tensor with same dimension as "
"this tensor.");
}
for (int64_t d = 0; d < NumDims(); ++d) {
if (d != dim && src.GetShape(d) != GetShape(d)) {
utility::LogError(
"IndexAdd_ only supports src tensor with same shape as "
"this "
"tensor except dim {}.",
dim);
}
}

// Type check.
AssertTensorDtype(index, core::Int64);
AssertTensorDtype(*this, src.GetDtype());

// Apply kernel.
kernel::IndexAdd_(dim, index, src, *this);
}

Tensor Tensor::Permute(const SizeVector& dims) const {
// Check dimension size
if (static_cast<int64_t>(dims.size()) != NumDims()) {
Expand Down
10 changes: 10 additions & 0 deletions cpp/open3d/core/Tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,16 @@ class Tensor : public IsDevice {
void IndexSet(const std::vector<Tensor>& index_tensors,
const Tensor& src_tensor);

/// \brief In-place sum reduction by index along one dimension.
///
/// Modelled on
/// https://pytorch.org/docs/stable/generated/torch.Tensor.index_add_.html
///
/// For every i, the i-th slice of \p src along \p dim is added to the
/// index[i]-th slice of this tensor along \p dim. For dim == 0 this reads
/// self[index[i]] += src[i].
///
/// \param dim Dimension to index along. Must be non-negative and smaller
/// than the number of dimensions of this tensor.
/// \param index 1D Int64 tensor selecting the destination slices.
/// \param src Source tensor. Must have the same dtype and number of
/// dimensions as this tensor, and the same shape in every dimension except
/// \p dim.
///
/// Note: Only 1D index tensors are supported now.
void IndexAdd_(int64_t dim, const Tensor& index, const Tensor& src);

/// \brief Permute (dimension shuffle) the Tensor, returns a view.
///
/// \param dims The desired ordering of dimensions.
Expand Down
49 changes: 49 additions & 0 deletions cpp/open3d/core/kernel/IndexReduction.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// ----------------------------------------------------------------------------
// - Open3D: www.open3d.org -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#include "open3d/core/kernel/IndexReduction.h"

#include "open3d/utility/Logging.h"

namespace open3d {
namespace core {
namespace kernel {

/// Adds slices of \p src into \p dst along \p dim, selecting the destination
/// slice of the i-th source slice with index[i].
///
/// Both tensors are permuted so that \p dim becomes dimension 0 before the
/// device-specific kernel runs. Permute returns views, so the kernel's writes
/// to the permuted dst land in \p dst itself.
///
/// \param dim Reduction dimension; the caller is expected to have validated
/// 0 <= dim < src.NumDims().
/// \param index 1D tensor of destination slice indices.
/// \param src Source tensor.
/// \param dst Destination tensor, updated in place.
void IndexAdd_(int64_t dim,
               const Tensor& index,
               const Tensor& src,
               Tensor& dst) {
    // Permute the reduction dimension to the first:
    // {dim, 0, ..., dim - 1, dim + 1, ..., NumDims - 1}.
    SizeVector permute = {};
    permute.push_back(dim);
    for (int64_t d = 0; d < src.NumDims(); ++d) {
        if (d != dim) {
            permute.push_back(d);
        }
    }

    auto src_permute = src.Permute(permute);
    auto dst_permute = dst.Permute(permute);

    if (dst.IsCPU()) {
        IndexAddCPU_(dim, index, src_permute, dst_permute);
    } else if (dst.IsCUDA()) {
#ifdef BUILD_CUDA_MODULE
        IndexAddCUDA_(dim, index, src_permute, dst_permute);
#else
        // Fail loudly instead of silently leaving dst untouched.
        utility::LogError("Not compiled with CUDA, but CUDA device is used.");
#endif
    } else {
        utility::LogError("IndexAdd_: Unimplemented device");
    }
}

} // namespace kernel
} // namespace core
} // namespace open3d
36 changes: 36 additions & 0 deletions cpp/open3d/core/kernel/IndexReduction.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// ----------------------------------------------------------------------------
// - Open3D: www.open3d.org -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#pragma once

#include "open3d/core/Tensor.h"
#include "open3d/utility/Logging.h"

namespace open3d {
namespace core {
namespace kernel {

/// Device-dispatching entry point of the index-add reduction.
///
/// Adds the i-th slice of \p src along \p dim to the index[i]-th slice of
/// \p dst along \p dim, modifying \p dst in place.
///
/// \param dim Dimension to index along.
/// \param index 1D tensor of destination slice indices.
/// \param src Source tensor.
/// \param dst Destination tensor; its device selects the implementation.
void IndexAdd_(int64_t dim,
const Tensor& index,
const Tensor& src,
Tensor& dst);

/// CPU implementation of the index-add reduction.
///
/// Expects \p src and \p dst with the reduction dimension already permuted
/// to dimension 0, as IndexAdd_ does before calling it. \p index is read
/// through a raw int64_t pointer, so it is treated as a contiguous 1D Int64
/// tensor.
///
/// \param dim Original reduction dimension, forwarded by IndexAdd_.
/// \param index 1D tensor of destination slice indices.
/// \param src Permuted source tensor.
/// \param dst Permuted destination tensor, updated in place.
void IndexAddCPU_(int64_t dim,
const Tensor& index,
const Tensor& src,
Tensor& dst);

#ifdef BUILD_CUDA_MODULE
/// CUDA implementation of the index-add reduction; only declared when
/// BUILD_CUDA_MODULE is defined.
///
/// IndexAdd_ calls it with \p src and \p dst already permuted so that the
/// reduction dimension is dimension 0.
/// NOTE(review): presumably the same contract as IndexAddCPU_ -- confirm
/// against the CUDA source.
void IndexAddCUDA_(int64_t dim,
const Tensor& index,
const Tensor& src,
Tensor& dst);
#endif

} // namespace kernel
} // namespace core
} // namespace open3d
79 changes: 79 additions & 0 deletions cpp/open3d/core/kernel/IndexReductionCPU.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// ----------------------------------------------------------------------------
// - Open3D: www.open3d.org -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

#include "open3d/core/Dispatch.h"
#include "open3d/core/Indexer.h"
#include "open3d/core/Tensor.h"
#include "open3d/utility/Logging.h"

namespace open3d {
namespace core {
namespace kernel {

template <typename func_t>
void LaunchIndexReductionKernel(int64_t dim,
const Device& device,
const Tensor& index,
const Tensor& src,
Tensor& dst,
const func_t& element_kernel) {
// index: [N,], src: [N, D], dst: [M, D]
// In Indexer, output shape defines the actual master strides.
// However, in IndexAdd_, input dominates the iterations.
// So put dst (output) at indexer's input, and src (input) at output.
Indexer indexer({dst}, src, DtypePolicy::NONE);

// Index is simply a 1D contiguous tensor, with a different stride
// behavior to src. So use raw pointer for simplicity.
auto index_ptr = index.GetDataPtr<int64_t>();

int64_t broadcasting_elems = 1;
for (int64_t d = 1; d < src.NumDims(); ++d) {
broadcasting_elems *= src.GetShape(d);
}
auto element_func = [=](int64_t workload_idx) {
int reduction_idx = workload_idx / broadcasting_elems;
int broadcasting_idx = workload_idx % broadcasting_elems;

const int64_t idx = index_ptr[reduction_idx];
int64_t dst_idx = idx * broadcasting_elems + broadcasting_idx;

void* src_ptr = indexer.GetOutputPtr(0, workload_idx);
void* dst_ptr = indexer.GetInputPtr(0, dst_idx);
// Note input and output is switched here to adapt to the indexer
element_kernel(src_ptr, dst_ptr);
};

// TODO: check in detail
// No OpenMP could be faster, otherwise there would be thousands of atomics.
for (int64_t d = 0; d < indexer.NumWorkloads(); ++d) {
element_func(d);
}
}

/// Accumulates one element: reads a scalar_t from \p src and adds it to the
/// scalar_t stored at \p dst.
template <typename scalar_t>
static OPEN3D_HOST_DEVICE void CPUSumKernel(const void* src, void* dst) {
    const scalar_t& addend = *static_cast<const scalar_t*>(src);
    scalar_t& accumulator = *static_cast<scalar_t*>(dst);
    accumulator += addend;
}

/// CPU index-add: dispatches on the floating-point dtype of \p src and runs
/// the index reduction kernel with element-wise summation.
///
/// \param dim Reduction dimension, forwarded to the kernel launcher.
/// \param index 1D tensor of destination slice indices.
/// \param src Source tensor with the reduction dimension first.
/// \param dst Destination tensor with the reduction dimension first; updated
/// in place.
void IndexAddCPU_(int64_t dim,
                  const Tensor& index,
                  const Tensor& src,
                  Tensor& dst) {
    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() {
        // Element-wise accumulation for the dispatched scalar type.
        const auto accumulate = [](const void* from, void* to) {
            CPUSumKernel<scalar_t>(from, to);
        };
        LaunchIndexReductionKernel(dim, src.GetDevice(), index, src, dst,
                                   accumulate);
    });
}

} // namespace kernel
} // namespace core
} // namespace open3d
Loading

0 comments on commit a9370f2

Please sign in to comment.