From 1387e1f31706877960a4b2047c5fa21e3f7347a0 Mon Sep 17 00:00:00 2001 From: Jimmy Lu Date: Mon, 16 Sep 2024 14:21:30 -0700 Subject: [PATCH] Add check for overlapping ranges of ARRAY and MAP vectors (#10960) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/10960 We don't allow overlapping ranges in ARRAY and MAP vectors. However this is not clear in the code, so we add a method for user to check that. In the future we will also add the check to `BaseVector::validate()` and `ArrayVectorBase` constructor to make it clear it is not allowed. Reviewed By: xiaoxmeng Differential Revision: D62212238 --- .../hive/tests/HivePartitionFunctionTest.cpp | 34 ++++------ velox/docs/develop/vectors.rst | 10 +-- velox/expression/tests/CastExprTest.cpp | 6 +- velox/expression/tests/ExprTest.cpp | 10 +-- velox/functions/lib/tests/SliceTestBase.h | 2 +- .../prestosql/tests/MapEntriesTest.cpp | 12 ++-- velox/functions/prestosql/tests/ZipTest.cpp | 2 +- velox/vector/ComplexVector.cpp | 66 +++++++++++++------ velox/vector/ComplexVector.h | 13 +++- velox/vector/tests/VectorTest.cpp | 38 ++++++++++- 10 files changed, 125 insertions(+), 68 deletions(-) diff --git a/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp b/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp index 5884d1fc0fd42..eee2eb8139c49 100644 --- a/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp +++ b/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp @@ -319,23 +319,17 @@ TEST_F(HivePartitionFunctionTest, arrayElementsEncoded) { auto rawSizes = sizesBuffer->asMutable(); auto rawNulls = nullsBuffer->asMutable(); - // Make the elements overlap and have gaps. + // Make the elements have gaps. // Set the values in position 2 to be invalid since that Array should be null. std::vector offsets{ - 0, 2, std::numeric_limits().max(), 1, 8}; + 0, 2, std::numeric_limits().max(), 4, 8}; std::vector sizes{ - 4, 3, std::numeric_limits().max(), 5, 2}; + 2, 1, std::numeric_limits().max(), 3, 2}; memcpy(rawOffsets, offsets.data(), size * sizeof(vector_size_t)); memcpy(rawSizes, sizes.data(), size * sizeof(vector_size_t)); bits::setNull(rawNulls, 2); - // Produces arrays that look like: - // [9, NULL, 7, 6] - // [7, 6, 5] - // NULL - // [NULL, 7, 6, 5, NULL] - // [1, NULL] auto values = std::make_shared( pool_.get(), ARRAY(elements->type()), @@ -346,9 +340,9 @@ TEST_F(HivePartitionFunctionTest, arrayElementsEncoded) { encodedElements); assertPartitions(values, 1, {0, 0, 0, 0, 0}); - assertPartitions(values, 2, {0, 0, 0, 0, 1}); - assertPartitions(values, 500, {342, 418, 0, 458, 31}); - assertPartitions(values, 997, {149, 936, 0, 103, 31}); + assertPartitions(values, 2, {1, 1, 0, 0, 1}); + assertPartitions(values, 500, {279, 7, 0, 308, 31}); + assertPartitions(values, 997, {279, 7, 0, 820, 31}); assertPartitionsWithConstChannel(values, 1); assertPartitionsWithConstChannel(values, 2); @@ -428,23 +422,17 @@ TEST_F(HivePartitionFunctionTest, mapEntriesEncoded) { auto rawSizes = sizesBuffer->asMutable(); auto rawNulls = nullsBuffer->asMutable(); - // Make the elements overlap and have gaps. + // Make the elements have gaps. // Set the values in position 2 to be invalid since that Map should be null. std::vector offsets{ - 0, 2, std::numeric_limits().max(), 1, 8}; + 0, 2, std::numeric_limits().max(), 4, 8}; std::vector sizes{ - 4, 3, std::numeric_limits().max(), 5, 2}; + 2, 1, std::numeric_limits().max(), 3, 2}; memcpy(rawOffsets, offsets.data(), size * sizeof(vector_size_t)); memcpy(rawSizes, sizes.data(), size * sizeof(vector_size_t)); bits::setNull(rawNulls, 2); - // Produces maps that look like: - // {key_0: 9, key_3: NULL, key_6: 7, key_9: 6} - // {key_6: 7, key_9: 6, key_2: 5} - // NULL - // {key_3: NULL, key_6: 7, key_9: 6, key_2: 5, key_5: NULL} - // {key_4: 1, key_7: NULL} auto values = std::make_shared( pool_.get(), MAP(mapKeys->type(), mapValues->type()), @@ -457,8 +445,8 @@ TEST_F(HivePartitionFunctionTest, mapEntriesEncoded) { assertPartitions(values, 1, {0, 0, 0, 0, 0}); assertPartitions(values, 2, {0, 1, 0, 1, 0}); - assertPartitions(values, 500, {176, 259, 0, 91, 336}); - assertPartitions(values, 997, {694, 24, 0, 365, 345}); + assertPartitions(values, 500, {336, 413, 0, 259, 336}); + assertPartitions(values, 997, {345, 666, 0, 24, 345}); assertPartitionsWithConstChannel(values, 1); assertPartitionsWithConstChannel(values, 2); diff --git a/velox/docs/develop/vectors.rst b/velox/docs/develop/vectors.rst index 70515ec9f4304..1a3cb0b0b6a41 100644 --- a/velox/docs/develop/vectors.rst +++ b/velox/docs/develop/vectors.rst @@ -386,8 +386,9 @@ ArrayVector ~~~~~~~~~~~ ArrayVector stores values of type ARRAY. In addition to nulls buffer, it -contains offsets and sizes buffers and an elements vector. Offsets and sizes -are 32-bit integers. +contains offsets and sizes buffers and an elements vector. Offsets and sizes are +32-bit integers. The non-null non-empty ranges formed by offsets and sizes in a +vector is not allowed to overlap with each other. .. code-block:: c++ @@ -443,8 +444,9 @@ MapVector ~~~~~~~~~ MapVector stores values of type MAP. In addition to nulls buffer, it contains -offsets and sizes buffers, keys and values vectors. Offsets and sizes are -32-bit integers. +offsets and sizes buffers, keys and values vectors. Offsets and sizes are 32-bit +integers. The non-null non-empty ranges formed by offsets and sizes in a vector +is not allowed to overlap with each other. .. code-block:: c++ diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp index 3ad743e31d21c..67317cc1758e6 100644 --- a/velox/expression/tests/CastExprTest.cpp +++ b/velox/expression/tests/CastExprTest.cpp @@ -1279,7 +1279,8 @@ TEST_F(CastExprTest, mapCast) { SelectivityVector rows(5); rows.setValid(2, false); - mapVector->setOffsetAndSize(2, 100, 100); + mapVector->setOffsetAndSize(2, 100, 1); + mapVector->setOffsetAndSize(51, 2, 1); std::vector results(1); auto rowVector = makeRowVector({mapVector}); @@ -1369,7 +1370,8 @@ TEST_F(CastExprTest, arrayCast) { SelectivityVector rows(5); rows.setValid(2, false); - arrayVector->setOffsetAndSize(2, 100, 10); + arrayVector->setOffsetAndSize(2, 100, 5); + arrayVector->setOffsetAndSize(20, 10, 5); std::vector results(1); auto rowVector = makeRowVector({arrayVector}); diff --git a/velox/expression/tests/ExprTest.cpp b/velox/expression/tests/ExprTest.cpp index 100d0acb71b1b..38738a315c6bf 100644 --- a/velox/expression/tests/ExprTest.cpp +++ b/velox/expression/tests/ExprTest.cpp @@ -3568,10 +3568,10 @@ TEST_P(ParameterizedExprTest, mapKeysAndValues) { MAP(BIGINT(), BIGINT()), makeNulls(vectorSize, nullEvery(3)), vectorSize, - makeIndices(vectorSize, [](auto /* row */) { return 0; }), + makeIndices(vectorSize, folly::identity), makeIndices(vectorSize, [](auto /* row */) { return 1; }), - makeFlatVector({1, 2, 3}), - makeFlatVector({10, 20, 30})); + makeFlatVector(vectorSize, folly::identity), + makeFlatVector(vectorSize, [](auto i) { return 10 * i; })); auto input = makeRowVector({mapVector}); auto exprSet = compileMultiple( {"map_keys(c0)", "map_values(c0)"}, asRowType(input->type())); @@ -4805,7 +4805,7 @@ TEST_F(ExprTest, disablePeeling) { // Verify that peeling is disabled when the config is set by checking whether // the number of rows processed is equal to the alphabet size (when enabled) // or the dictionary size (when disabled). - // Also, ensure that single arg function recieves a flat vector even when + // Also, ensure that single arg function receives a flat vector even when // peeling is disabled. // This throws if input is not flat or constant. @@ -4847,7 +4847,7 @@ TEST_F(ExprTest, disablePeeling) { ASSERT_TRUE(stats.find("plus") != stats.end()); ASSERT_EQ(stats["plus"].numProcessedRows, dictSize); - // Ensure single arg function recieves a flat vector. + // Ensure single arg function receives a flat vector. // When top level column is dictionary wrapped. ASSERT_NO_THROW(evaluateMultiple( {"testing_single_arg_deterministic((c0))"}, diff --git a/velox/functions/lib/tests/SliceTestBase.h b/velox/functions/lib/tests/SliceTestBase.h index 8ea998ca888c2..bdb54550fbe68 100644 --- a/velox/functions/lib/tests/SliceTestBase.h +++ b/velox/functions/lib/tests/SliceTestBase.h @@ -35,7 +35,7 @@ class SliceTestBase : public FunctionBaseTest { const ArrayVectorPtr& expectedArrayVector) { auto result = evaluate(expression, makeRowVector(parameters)); ::facebook::velox::test::assertEqualVectors(expectedArrayVector, result); - EXPECT_NO_THROW(expectedArrayVector->checkRanges()); + EXPECT_FALSE(expectedArrayVector->hasOverlappingRanges()); } void basicTestCases() { diff --git a/velox/functions/prestosql/tests/MapEntriesTest.cpp b/velox/functions/prestosql/tests/MapEntriesTest.cpp index f94ae8b3e3ef9..a152a0a7e7007 100644 --- a/velox/functions/prestosql/tests/MapEntriesTest.cpp +++ b/velox/functions/prestosql/tests/MapEntriesTest.cpp @@ -168,14 +168,14 @@ TEST_F(MapEntriesTest, differentSizedValueKeyVectors) { makeNullableFlatVector({1, 2, 3, 4, std::nullopt, std::nullopt}); auto valueVector = makeFlatVector({1, 2, 3, 4}); - auto offsetBuffer = makeIndices({3, 2, 1, 0, 0, 0}); - auto sizeBuffer = makeIndices({1, 1, 1, 1, 1, 1}); + auto offsetBuffer = makeIndices({3, 2, 1, 0}); + auto sizeBuffer = makeIndices({1, 1, 1, 1}); auto mapVector = std::make_shared( pool(), MAP(BIGINT(), BIGINT()), nullptr, - 6, + 4, offsetBuffer, sizeBuffer, keyVector, @@ -188,9 +188,9 @@ TEST_F(MapEntriesTest, differentSizedValueKeyVectors) { EXPECT_NE(result, nullptr); auto elementVector = makeRowVector( - {makeFlatVector({4, 3, 2, 1, 1, 1}), - makeFlatVector({4, 3, 2, 1, 1, 1})}); - auto arrayVector = makeArrayVector({0, 1, 2, 3, 4, 5}, elementVector); + {makeFlatVector({4, 3, 2, 1}), + makeFlatVector({4, 3, 2, 1})}); + auto arrayVector = makeArrayVector({0, 1, 2, 3}, elementVector); test::assertEqualVectors(arrayVector, result); } diff --git a/velox/functions/prestosql/tests/ZipTest.cpp b/velox/functions/prestosql/tests/ZipTest.cpp index 9ed34c9df322e..a74f951205386 100644 --- a/velox/functions/prestosql/tests/ZipTest.cpp +++ b/velox/functions/prestosql/tests/ZipTest.cpp @@ -326,7 +326,7 @@ TEST_F(ZipTest, flatArraysWithDifferentOffsets) { /// Test if we can zip two flat vectors of arrays with overlapping ranges of /// elements. -TEST_F(ZipTest, flatArraysWithOverlappingRanges) { +TEST_F(ZipTest, DISABLED_flatArraysWithOverlappingRanges) { const auto size = 3; BufferPtr offsetsBuffer = allocateOffsets(size, pool_.get()); BufferPtr sizesBuffer = allocateSizes(size, pool_.get()); diff --git a/velox/vector/ComplexVector.cpp b/velox/vector/ComplexVector.cpp index f13a9d602a753..fc23071fb7ee7 100644 --- a/velox/vector/ComplexVector.cpp +++ b/velox/vector/ComplexVector.cpp @@ -809,31 +809,55 @@ VectorPtr RowVector::pushDictionaryToRowVectorLeaves(const VectorPtr& input) { wrappers, input->size(), input, input->pool()); } -void ArrayVectorBase::checkRanges() const { - std::unordered_map seenElements; - seenElements.reserve(size()); +template +vector_size_t ArrayVectorBase::nextNonEmpty(vector_size_t i) const { + while (i < size() && + ((kHasNulls && bits::isBitNull(rawNulls(), i)) || rawSizes_[i] <= 0)) { + ++i; + } + return i; +} + +template +bool ArrayVectorBase::maybeHaveOverlappingRanges() const { + vector_size_t curr = -1; + for (;;) { + auto next = nextNonEmpty(curr + 1); + if (next >= size()) { + return false; + } + // This also implicitly ensures rawOffsets_[curr] <= rawOffsets_[next]. + if (rawOffsets_[curr] + rawSizes_[curr] > rawOffsets_[next]) { + return true; + } + curr = next; + } +} +bool ArrayVectorBase::hasOverlappingRanges() const { + if (!(rawNulls() ? maybeHaveOverlappingRanges() + : maybeHaveOverlappingRanges())) { + return false; + } + std::vector indices; + indices.reserve(size()); for (vector_size_t i = 0; i < size(); ++i) { - auto size = sizeAt(i); - auto offset = offsetAt(i); - - for (vector_size_t j = 0; j < size; ++j) { - auto it = seenElements.find(offset + j); - if (it != seenElements.end()) { - VELOX_FAIL( - "checkRanges() found overlap at idx {}: element {} has offset {} " - "and size {}, and element {} has offset {} and size {}.", - offset + j, - it->second, - offsetAt(it->second), - sizeAt(it->second), - i, - offset, - size); - } - seenElements.emplace(offset + j, i); + const bool isNull = rawNulls() && bits::isBitNull(rawNulls(), i); + if (!isNull && rawSizes_[i] > 0) { + indices.push_back(i); + } + } + std::sort(indices.begin(), indices.end(), [&](auto i, auto j) { + return rawOffsets_[i] < rawOffsets_[j]; + }); + for (vector_size_t i = 1; i < indices.size(); ++i) { + auto j = indices[i - 1]; + auto k = indices[i]; + if (rawOffsets_[j] + rawSizes_[j] > rawOffsets_[k]) { + return true; } } + return false; } void ArrayVectorBase::validateArrayVectorBase( diff --git a/velox/vector/ComplexVector.h b/velox/vector/ComplexVector.h index dcda5d6842694..a12c347918b8c 100644 --- a/velox/vector/ComplexVector.h +++ b/velox/vector/ComplexVector.h @@ -364,9 +364,8 @@ struct ArrayVectorBase : BaseVector { sizes_->asMutable()[i] = size; } - /// Verify that an ArrayVector/MapVector does not contain overlapping [offset, - /// size] ranges. Throws in case overlaps are found. - void checkRanges() const; + /// Check if there is any overlapping [offset, size] ranges. + bool hasOverlappingRanges() const; protected: ArrayVectorBase( @@ -411,6 +410,14 @@ struct ArrayVectorBase : BaseVector { const vector_size_t* rawOffsets_; BufferPtr sizes_; const vector_size_t* rawSizes_; + + private: + template + bool maybeHaveOverlappingRanges() const; + + // Return the next non-null non-empty array/map after `index'. + template + vector_size_t nextNonEmpty(vector_size_t index) const; }; class ArrayVector : public ArrayVectorBase { diff --git a/velox/vector/tests/VectorTest.cpp b/velox/vector/tests/VectorTest.cpp index 88cbe5b842c05..3a5525412b025 100644 --- a/velox/vector/tests/VectorTest.cpp +++ b/velox/vector/tests/VectorTest.cpp @@ -2882,7 +2882,7 @@ TEST_F(VectorTest, resizeArrayAndMapResetOffsets) { // Test array. { - auto offsets = makeIndices({1, 1, 1, 1}); + auto offsets = makeIndices({0, 1, 2, 3}); auto sizes = makeIndices({1, 1, 1, 1}); auto* rawSizes = sizes->as(); @@ -2907,7 +2907,7 @@ TEST_F(VectorTest, resizeArrayAndMapResetOffsets) { // Test map. { - auto offsets = makeIndices({1, 1, 1, 1}); + auto offsets = makeIndices({0, 1, 2, 3}); auto sizes = makeIndices({1, 1, 1, 1}); auto* rawSizes = sizes->as(); @@ -3899,5 +3899,39 @@ TEST_F(VectorTest, testOverSizedArray) { EXPECT_THROW(BaseVector::flattenVector(constArray), VeloxUserError); } +TEST_F(VectorTest, hasOverlappingRanges) { + auto test = [this]( + vector_size_t length, + const std::vector& nulls, + const std::vector& offsets, + const std::vector& sizes, + bool overlap) { + auto makeArray = [&] { + return std::make_shared( + pool(), + ARRAY(BIGINT()), + makeNulls(nulls), + length, + makeIndices(length, [&](auto i) { return offsets[i]; }), + makeIndices(length, [&](auto i) { return sizes[i]; }), + makeNullConstant( + TypeKind::BIGINT, std::numeric_limits::max())); + }; + if (!overlap) { + ASSERT_FALSE(makeArray()->hasOverlappingRanges()); + } else { + ASSERT_TRUE(makeArray()->hasOverlappingRanges()); + } + }; + test(3, {false, false, false}, {0, 1, 2}, {1, 1, 1}, false); + test(3, {false, true, false}, {0, 0, 2}, {1, 1, 1}, false); + test(3, {false, false, false}, {0, 0, 2}, {1, 0, 1}, false); + test(3, {false, false, false}, {0, 0, 2}, {1, 1, 1}, true); + test(3, {false, false, false}, {2, 1, 0}, {1, 1, 1}, false); + test(3, {false, false, false}, {2, 1, 0}, {1, 2, 1}, true); + test(2, {false, false}, {0, 1}, {3, 1}, true); + test(2, {false, false}, {1, 0}, {1, 3}, true); +} + } // namespace } // namespace facebook::velox