From 3107f2bb5f5f22602aea8f7d05411eb746eba409 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 15 May 2024 20:59:49 -0300 Subject: [PATCH] GH-41560: [C++] ChunkResolver: Implement ResolveMany and add unit tests (#41561) ### Rationale for this change I want `ResolveMany` to support me in the implementation of `Take` that doesn't `Concatenate` all the chunks from a `ChunkedArray` `values` parameter. ### What changes are included in this PR? - Implementation of `ChunkResolver::ResolveMany()` - Addition of missing unit tests for `ChunkResolver` ### Are these changes tested? Yes. By new unit tests. ### Are there any user-facing changes? No. `ChunkResolver` is an internal API at the moment (see #34535 for future plans). * GitHub Issue: #41560 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/chunk_resolver.cc | 80 +++++++- cpp/src/arrow/chunk_resolver.h | 128 +++++++++++- cpp/src/arrow/chunked_array_test.cc | 200 +++++++++++++++++++ cpp/src/arrow/compute/kernels/vector_sort.cc | 30 ++- 4 files changed, 407 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index 29bccb52658f8..55eec53ced1c7 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -19,14 +19,14 @@ #include #include +#include #include #include #include "arrow/array.h" #include "arrow/record_batch.h" -namespace arrow { -namespace internal { +namespace arrow::internal { namespace { template @@ -54,6 +54,51 @@ inline std::vector MakeChunksOffsets(const std::vector& chunks) { offsets[chunks.size()] = offset; return offsets; } + +/// \pre all the pre-conditions of ChunkResolver::ResolveMany() +/// \pre num_offsets - 1 <= std::numeric_limits::max() +template +void ResolveManyInline(size_t num_offsets, const int64_t* signed_offsets, + int64_t n_indices, const IndexType* logical_index_vec, + IndexType* out_chunk_index_vec, IndexType chunk_hint, + IndexType* out_index_in_chunk_vec) { + auto* offsets = reinterpret_cast(signed_offsets); + const auto num_chunks = static_cast(num_offsets - 1); + // chunk_hint in [0, num_offsets) per the precondition. + for (int64_t i = 0; i < n_indices; i++) { + const auto index = static_cast(logical_index_vec[i]); + if (index >= offsets[chunk_hint] && + (chunk_hint == num_chunks || index < offsets[chunk_hint + 1])) { + out_chunk_index_vec[i] = chunk_hint; // hint is correct! + continue; + } + // lo < hi is guaranteed by `num_offsets = chunks.size() + 1` + auto chunk_index = + ChunkResolver::Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets); + chunk_hint = static_cast(chunk_index); + out_chunk_index_vec[i] = chunk_hint; + } + if (out_index_in_chunk_vec != NULLPTR) { + for (int64_t i = 0; i < n_indices; i++) { + auto logical_index = logical_index_vec[i]; + auto chunk_index = out_chunk_index_vec[i]; + // chunk_index is in [0, chunks.size()] no matter what the + // value of logical_index is, so it's always safe to dereference + // offset_ as it contains chunks.size()+1 values. + out_index_in_chunk_vec[i] = + logical_index - static_cast(offsets[chunk_index]); +#if defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) + // Make it more likely that Valgrind/ASAN can catch an invalid memory + // access by poisoning out_index_in_chunk_vec[i] when the logical + // index is out-of-bounds. + if (chunk_index == num_chunks) { + out_index_in_chunk_vec[i] = std::numeric_limits::max(); + } +#endif + } + } +} + } // namespace ChunkResolver::ChunkResolver(const ArrayVector& chunks) noexcept @@ -84,5 +129,32 @@ ChunkResolver& ChunkResolver::operator=(const ChunkResolver& other) noexcept { return *this; } -} // namespace internal -} // namespace arrow +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint8_t* logical_index_vec, + uint8_t* out_chunk_index_vec, uint8_t chunk_hint, + uint8_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint32_t* logical_index_vec, + uint32_t* out_chunk_index_vec, uint32_t chunk_hint, + uint32_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint16_t* logical_index_vec, + uint16_t* out_chunk_index_vec, uint16_t chunk_hint, + uint16_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint64_t* logical_index_vec, + uint64_t* out_chunk_index_vec, uint64_t chunk_hint, + uint64_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +} // namespace arrow::internal diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index c5dad1a17b18e..a2a3d5a864243 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include "arrow/type_fwd.h" @@ -27,6 +29,8 @@ namespace arrow::internal { +struct ChunkResolver; + struct ChunkLocation { /// \brief Index of the chunk in the array of chunks /// @@ -36,8 +40,17 @@ struct ChunkLocation { /// \brief Index of the value in the chunk /// - /// The value is undefined if chunk_index >= chunks.size() + /// The value is UNDEFINED if chunk_index >= chunks.size() int64_t index_in_chunk = 0; + + ChunkLocation() = default; + + ChunkLocation(int64_t chunk_index, int64_t index_in_chunk) + : chunk_index(chunk_index), index_in_chunk(index_in_chunk) {} + + bool operator==(ChunkLocation other) const { + return chunk_index == other.chunk_index && index_in_chunk == other.index_in_chunk; + } }; /// \brief An utility that incrementally resolves logical indices into @@ -60,12 +73,35 @@ struct ARROW_EXPORT ChunkResolver { explicit ChunkResolver(const std::vector& chunks) noexcept; explicit ChunkResolver(const RecordBatchVector& batches) noexcept; + /// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 offsets. + /// + /// The first offset must be 0 and the last offset must be the logical length of the + /// chunked array. Each offset before the last represents the starting logical index of + /// the corresponding chunk. + explicit ChunkResolver(std::vector offsets) noexcept + : offsets_(std::move(offsets)), cached_chunk_(0) { +#ifndef NDEBUG + assert(offsets_.size() >= 1); + assert(offsets_[0] == 0); + for (size_t i = 1; i < offsets_.size(); i++) { + assert(offsets_[i] >= offsets_[i - 1]); + } +#endif + } + ChunkResolver(ChunkResolver&& other) noexcept; ChunkResolver& operator=(ChunkResolver&& other) noexcept; ChunkResolver(const ChunkResolver& other) noexcept; ChunkResolver& operator=(const ChunkResolver& other) noexcept; + int64_t logical_array_length() const { return offsets_.back(); } + int64_t num_chunks() const { return static_cast(offsets_.size()) - 1; } + + int64_t chunk_length(int64_t chunk_index) const { + return offsets_[chunk_index + 1] - offsets_[chunk_index]; + } + /// \brief Resolve a logical index to a ChunkLocation. /// /// The returned ChunkLocation contains the chunk index and the within-chunk index @@ -81,7 +117,7 @@ struct ARROW_EXPORT ChunkResolver { const auto cached_chunk = cached_chunk_.load(std::memory_order_relaxed); const auto chunk_index = ResolveChunkIndex(index, cached_chunk); - return {chunk_index, index - offsets_[chunk_index]}; + return ChunkLocation{chunk_index, index - offsets_[chunk_index]}; } /// \brief Resolve a logical index to a ChunkLocation. @@ -97,12 +133,70 @@ struct ARROW_EXPORT ChunkResolver { /// \return ChunkLocation with a valid chunk_index if index is within /// bounds, or with chunk_index == chunks.size() if logical index is /// `>= chunked_array.length()`. - inline ChunkLocation ResolveWithChunkIndexHint(int64_t index, - ChunkLocation hint) const { + inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) const { assert(hint.chunk_index < static_cast(offsets_.size())); const auto chunk_index = ResolveChunkIndex(index, hint.chunk_index); - return {chunk_index, index - offsets_[chunk_index]}; + return ChunkLocation{chunk_index, index - offsets_[chunk_index]}; + } + + /// \brief Resolve `n_indices` logical indices to chunk indices. + /// + /// \pre 0 <= logical_index_vec[i] < logical_array_length() + /// (for well-defined and valid chunk index results) + /// \pre out_chunk_index_vec has space for `n_indices` + /// \pre chunk_hint in [0, chunks.size()] + /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n) + /// \post if logical_index_vec[i] >= chunked_array.length(), then + /// out_chunk_index_vec[i] == chunks.size() + /// and out_index_in_chunk_vec[i] is UNDEFINED (can be out-of-bounds) + /// \post if logical_index_vec[i] < 0, then both out_chunk_index_vec[i] and + /// out_index_in_chunk_vec[i] are UNDEFINED + /// + /// \param n_indices The number of logical indices to resolve + /// \param logical_index_vec The logical indices to resolve + /// \param out_chunk_index_vec The output array where the chunk indices will be written + /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany + /// \param out_index_in_chunk_vec If not NULLPTR, the output array where the + /// within-chunk indices will be written + /// \return false iff chunks.size() > std::numeric_limits::max() + template + [[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType* logical_index_vec, + IndexType* out_chunk_index_vec, IndexType chunk_hint = 0, + IndexType* out_index_in_chunk_vec = NULLPTR) const { + if constexpr (sizeof(IndexType) < sizeof(uint64_t)) { + // The max value returned by Bisect is `offsets.size() - 1` (= chunks.size()). + constexpr uint64_t kMaxIndexTypeValue = std::numeric_limits::max(); + // A ChunkedArray with enough empty chunks can make the index of a chunk + // exceed the logical index and thus the maximum value of IndexType. + const bool chunk_index_fits_on_type = + static_cast(offsets_.size() - 1) <= kMaxIndexTypeValue; + if (ARROW_PREDICT_FALSE(!chunk_index_fits_on_type)) { + return false; + } + // Since an index-in-chunk cannot possibly exceed the logical index being + // queried, we don't have to worry about these values not fitting on IndexType. + } + if constexpr (std::is_signed_v) { + // We interpret signed integers as unsigned and avoid having to generate double + // the amount of binary code to handle each integer width. + // + // Negative logical indices can become large values when cast to unsigned, and + // they are gracefully handled by ResolveManyImpl, but both the chunk index + // and the index in chunk values will be undefined in these cases. This + // happend because int8_t(-1) == uint8_t(255) and 255 could be a valid + // logical index in the chunked array. + using U = std::make_unsigned_t; + ResolveManyImpl(n_indices, reinterpret_cast(logical_index_vec), + reinterpret_cast(out_chunk_index_vec), + static_cast(chunk_hint), + reinterpret_cast(out_index_in_chunk_vec)); + } else { + static_assert(std::is_unsigned_v); + ResolveManyImpl(n_indices, logical_index_vec, out_chunk_index_vec, chunk_hint, + out_index_in_chunk_vec); + } + return true; } private: @@ -130,17 +224,33 @@ struct ARROW_EXPORT ChunkResolver { return chunk_index; } + /// \pre all the pre-conditions of ChunkResolver::ResolveMany() + /// \pre num_offsets - 1 <= std::numeric_limits::max() + void ResolveManyImpl(int64_t, const uint8_t*, uint8_t*, uint8_t, uint8_t*) const; + void ResolveManyImpl(int64_t, const uint16_t*, uint16_t*, uint16_t, uint16_t*) const; + void ResolveManyImpl(int64_t, const uint32_t*, uint32_t*, uint32_t, uint32_t*) const; + void ResolveManyImpl(int64_t, const uint64_t*, uint64_t*, uint64_t, uint64_t*) const; + + public: /// \brief Find the index of the chunk that contains the logical index. /// /// Any non-negative index is accepted. When `hi=num_offsets`, the largest /// possible return value is `num_offsets-1` which is equal to - /// `chunks.size()`. The is returned when the logical index is out-of-bounds. + /// `chunks.size()`. Which is returned when the logical index is greater or + /// equal the logical length of the chunked array. /// - /// \pre index >= 0 + /// \pre index >= 0 (otherwise, when index is negative, hi-1 is returned) /// \pre lo < hi /// \pre lo >= 0 && hi <= offsets_.size() static inline int64_t Bisect(int64_t index, const int64_t* offsets, int64_t lo, int64_t hi) { + return Bisect(static_cast(index), + reinterpret_cast(offsets), static_cast(lo), + static_cast(hi)); + } + + static inline int64_t Bisect(uint64_t index, const uint64_t* offsets, uint64_t lo, + uint64_t hi) { // Similar to std::upper_bound(), but slightly different as our offsets // array always starts with 0. auto n = hi - lo; @@ -148,8 +258,8 @@ struct ARROW_EXPORT ChunkResolver { // (lo < hi is guaranteed by the precondition). assert(n > 1 && "lo < hi is a precondition of Bisect"); do { - const int64_t m = n >> 1; - const int64_t mid = lo + m; + const uint64_t m = n >> 1; + const uint64_t mid = lo + m; if (index >= offsets[mid]) { lo = mid; n -= m; diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index 6ca52ab46ca68..e9cc283b53cd5 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/chunk_resolver.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/testing/builder.h" @@ -34,6 +35,9 @@ namespace arrow { +using internal::ChunkLocation; +using internal::ChunkResolver; + class TestChunkedArray : public ::testing::Test { protected: virtual void Construct() { @@ -310,4 +314,200 @@ TEST_F(TestChunkedArray, GetScalar) { ASSERT_RAISES(IndexError, carr.GetScalar(7)); } +// ChunkResolver tests + +using IndexTypes = ::testing::Types; + +TEST(TestChunkResolver, Resolve) { + ChunkResolver empty(std::vector({0})); // [] + // ChunkLocation::index_in_chunk is undefined when chunk_index==chunks.size(), + // so only chunk_index is compared in these cases. + ASSERT_EQ(empty.Resolve(0).chunk_index, 0); + ASSERT_EQ(empty.Resolve(0).chunk_index, 0); + + ChunkResolver one(std::vector({0, 1})); // [[0]] + ASSERT_EQ(one.Resolve(1).chunk_index, 1); + ASSERT_EQ(one.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(one.Resolve(1).chunk_index, 1); + + ChunkResolver one_and_empty(std::vector({0, 1, 1, 1})); // [[0], [], []] + ASSERT_EQ(one_and_empty.Resolve(3).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(2).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(1).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(one_and_empty.Resolve(1).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(2).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(3).chunk_index, 3); + + ChunkResolver one_one_one(std::vector({0, 1, 2, 3})); // [[0], [1], [2]] + ASSERT_EQ(one_one_one.Resolve(3).chunk_index, 3); + ASSERT_EQ(one_one_one.Resolve(2), (ChunkLocation(2, 0))); + ASSERT_EQ(one_one_one.Resolve(1), (ChunkLocation(1, 0))); + ASSERT_EQ(one_one_one.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(one_one_one.Resolve(1), (ChunkLocation(1, 0))); + ASSERT_EQ(one_one_one.Resolve(2), (ChunkLocation(2, 0))); + ASSERT_EQ(one_one_one.Resolve(3).chunk_index, 3); + + ChunkResolver resolver(std::vector({0, 2, 3, 10})); // [[0, 1], [2], [3..9]] + ASSERT_EQ(resolver.Resolve(10).chunk_index, 3); + ASSERT_EQ(resolver.Resolve(9), (ChunkLocation(2, 6))); + ASSERT_EQ(resolver.Resolve(8), (ChunkLocation(2, 5))); + ASSERT_EQ(resolver.Resolve(4), (ChunkLocation(2, 1))); + ASSERT_EQ(resolver.Resolve(3), (ChunkLocation(2, 0))); + ASSERT_EQ(resolver.Resolve(2), (ChunkLocation(1, 0))); + ASSERT_EQ(resolver.Resolve(1), (ChunkLocation(0, 1))); + ASSERT_EQ(resolver.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(resolver.Resolve(1), (ChunkLocation(0, 1))); + ASSERT_EQ(resolver.Resolve(2), (ChunkLocation(1, 0))); + ASSERT_EQ(resolver.Resolve(3), (ChunkLocation(2, 0))); + ASSERT_EQ(resolver.Resolve(4), (ChunkLocation(2, 1))); + ASSERT_EQ(resolver.Resolve(8), (ChunkLocation(2, 5))); + ASSERT_EQ(resolver.Resolve(9), (ChunkLocation(2, 6))); + ASSERT_EQ(resolver.Resolve(10).chunk_index, 3); +} + +template +class TestChunkResolverMany : public ::testing::Test { + public: + using IndexType = T; + + Result> ResolveMany( + const ChunkResolver& resolver, const std::vector& logical_index_vec) { + const size_t n = logical_index_vec.size(); + std::vector chunk_index_vec; + chunk_index_vec.resize(n); + std::vector index_in_chunk_vec; + index_in_chunk_vec.resize(n); + bool valid = resolver.ResolveMany( + static_cast(n), logical_index_vec.data(), chunk_index_vec.data(), 0, + index_in_chunk_vec.data()); + if (ARROW_PREDICT_FALSE(!valid)) { + return Status::Invalid("index type doesn't fit possible chunk indexes"); + } + std::vector locations; + locations.reserve(n); + for (size_t i = 0; i < n; i++) { + auto chunk_index = static_cast(chunk_index_vec[i]); + auto index_in_chunk = static_cast(index_in_chunk_vec[i]); + locations.emplace_back(chunk_index, index_in_chunk); + } + return locations; + } + + void CheckResolveMany(const ChunkResolver& resolver, + const std::vector& logical_index_vec) { + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + IndexType logical_index = logical_index_vec[i]; + const auto expected = resolver.Resolve(logical_index); + ASSERT_LE(expected.chunk_index, resolver.num_chunks()); + if (expected.chunk_index == resolver.num_chunks()) { + // index_in_chunk is undefined in this case + ASSERT_EQ(locations[i].chunk_index, expected.chunk_index); + } else { + ASSERT_EQ(locations[i], expected); + } + } + } + + void TestBasics() { + std::vector logical_index_vec; + + ChunkResolver empty(std::vector({0})); // [] + logical_index_vec = {0, 0}; + CheckResolveMany(empty, logical_index_vec); + + ChunkResolver one(std::vector({0, 1})); // [[0]] + logical_index_vec = {1, 0, 1}; + CheckResolveMany(one, logical_index_vec); + + ChunkResolver one_and_empty(std::vector({0, 1, 1, 1})); // [[0], [], []] + logical_index_vec = {3, 2, 1, 0, 1, 2, 3}; + CheckResolveMany(one_and_empty, logical_index_vec); + + ChunkResolver one_one_one(std::vector({0, 1, 2, 3})); // [[0], [1], [2]] + logical_index_vec = {3, 2, 1, 0, 1, 2, 3}; + CheckResolveMany(one_one_one, logical_index_vec); + + ChunkResolver resolver(std::vector({0, 2, 3, 10})); // [[0, 1], [2], [3..9]] + logical_index_vec = {10, 9, 8, 4, 3, 2, 1, 0, 1, 2, 3, 4, 8, 9, 10}; + CheckResolveMany(resolver, logical_index_vec); + } + + void TestOutOfBounds() { + ChunkResolver resolver(std::vector({0, 2, 3, 10})); // [[0, 1], [2], [3..9]] + + std::vector logical_index_vec = {10, 11, 12, 13, 14, 13, 11, 10}; + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks()); + } + + if constexpr (std::is_signed_v) { + std::vector logical_index_vec = {-1, -2, -3, -4, INT8_MIN}; + + ChunkResolver resolver(std::vector({0, 2, 128})); // [[0, 1], [2..127]] + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + // All the negative indices are greater than resolver.logical_array_length()-1 + // when cast to uint8_t. + ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks()); + } + + if constexpr (sizeof(IndexType) == 1) { + ChunkResolver resolver(std::vector( + {0, 2, 128, 129, 256})); // [[0, 1], [2..127], [128], [129, 255]] + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + if constexpr (sizeof(IndexType) == 1) { + // All the negative 8-bit indices are SMALLER than + // resolver.logical_array_length()=256 when cast to 8-bit unsigned integers. + // So the resolved locations might look valid, but they should not be trusted. + ASSERT_LT(locations[i].chunk_index, resolver.num_chunks()); + } else { + // All the negative indices are greater than resolver.logical_array_length() + // when cast to 16/32/64-bit unsigned integers. + ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks()); + } + } + } + } + } + + void TestOverflow() { + const int64_t kMaxIndex = std::is_signed_v ? 127 : 255; + std::vector logical_index_vec = {0, 1, 2, + static_cast(kMaxIndex)}; + + // Overflows are rare because to make them possible, we need more chunks + // than logical elements in the ChunkedArray. That requires at least one + // empty chunk. + std::vector offsets; + for (int64_t i = 0; i <= kMaxIndex; i++) { + offsets.push_back(i); + } + ChunkResolver resolver{offsets}; + ASSERT_OK(ResolveMany(resolver, logical_index_vec)); + + offsets.push_back(kMaxIndex); // adding an empty chunk + ChunkResolver resolver_with_empty{offsets}; + if (sizeof(IndexType) == 1) { + ASSERT_NOT_OK(ResolveMany(resolver_with_empty, logical_index_vec)); + } else { + ASSERT_OK(ResolveMany(resolver_with_empty, logical_index_vec)); + } + } +}; + +TYPED_TEST_SUITE(TestChunkResolverMany, IndexTypes); + +TYPED_TEST(TestChunkResolverMany, Basics) { this->TestBasics(); } +TYPED_TEST(TestChunkResolverMany, OutOfBounds) { this->TestOutOfBounds(); } +TYPED_TEST(TestChunkResolverMany, Overflow) { this->TestOverflow(); } + } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index db2023ef04cad..ad22fa8d365c4 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -747,15 +747,13 @@ class TableSorter { auto& comparator = comparator_; const auto& first_sort_key = sort_keys_[0]; - ChunkLocation left_loc{0, 0}; - ChunkLocation right_loc{0, 0}; + ChunkLocation left_loc; + ChunkLocation right_loc; std::merge(nulls_begin, nulls_middle, nulls_middle, nulls_end, temp_indices, [&](uint64_t left, uint64_t right) { // First column is either null or nan - left_loc = - left_resolver_.ResolveWithChunkIndexHint(left, /*hint=*/left_loc); - right_loc = - right_resolver_.ResolveWithChunkIndexHint(right, /*hint=*/right_loc); + left_loc = left_resolver_.ResolveWithHint(left, /*hint=*/left_loc); + right_loc = right_resolver_.ResolveWithHint(right, /*hint=*/right_loc); auto chunk_left = first_sort_key.GetChunk(left_loc); auto chunk_right = first_sort_key.GetChunk(right_loc); const auto left_is_null = chunk_left.IsNull(); @@ -786,15 +784,13 @@ class TableSorter { // Untyped implementation auto& comparator = comparator_; - ChunkLocation left_loc{0, 0}; - ChunkLocation right_loc{0, 0}; + ChunkLocation left_loc; + ChunkLocation right_loc; std::merge(nulls_begin, nulls_middle, nulls_middle, nulls_end, temp_indices, [&](uint64_t left, uint64_t right) { // First column is always null - left_loc = - left_resolver_.ResolveWithChunkIndexHint(left, /*hint=*/left_loc); - right_loc = - right_resolver_.ResolveWithChunkIndexHint(right, /*hint=*/right_loc); + left_loc = left_resolver_.ResolveWithHint(left, /*hint=*/left_loc); + right_loc = right_resolver_.ResolveWithHint(right, /*hint=*/right_loc); return comparator.Compare(left_loc, right_loc, 1); }); // Copy back temp area into main buffer @@ -812,15 +808,13 @@ class TableSorter { auto& comparator = comparator_; const auto& first_sort_key = sort_keys_[0]; - ChunkLocation left_loc{0, 0}; - ChunkLocation right_loc{0, 0}; + ChunkLocation left_loc; + ChunkLocation right_loc; std::merge(range_begin, range_middle, range_middle, range_end, temp_indices, [&](uint64_t left, uint64_t right) { // Both values are never null nor NaN. - left_loc = - left_resolver_.ResolveWithChunkIndexHint(left, /*hint=*/left_loc); - right_loc = - right_resolver_.ResolveWithChunkIndexHint(right, /*hint=*/right_loc); + left_loc = left_resolver_.ResolveWithHint(left, /*hint=*/left_loc); + right_loc = right_resolver_.ResolveWithHint(right, /*hint=*/right_loc); auto chunk_left = first_sort_key.GetChunk(left_loc); auto chunk_right = first_sort_key.GetChunk(right_loc); DCHECK(!chunk_left.IsNull());