Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add check for overlapping ranges of ARRAY and MAP vectors #10960

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 11 additions & 23 deletions velox/connectors/hive/tests/HivePartitionFunctionTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,23 +319,17 @@ TEST_F(HivePartitionFunctionTest, arrayElementsEncoded) {
auto rawSizes = sizesBuffer->asMutable<vector_size_t>();
auto rawNulls = nullsBuffer->asMutable<uint64_t>();

// Make the elements overlap and have gaps.
// Make the elements have gaps.
// Set the values in position 2 to be invalid since that Array should be null.
std::vector<vector_size_t> offsets{
0, 2, std::numeric_limits<int32_t>().max(), 1, 8};
0, 2, std::numeric_limits<int32_t>().max(), 4, 8};
std::vector<vector_size_t> sizes{
4, 3, std::numeric_limits<int32_t>().max(), 5, 2};
2, 1, std::numeric_limits<int32_t>().max(), 3, 2};
memcpy(rawOffsets, offsets.data(), size * sizeof(vector_size_t));
memcpy(rawSizes, sizes.data(), size * sizeof(vector_size_t));

bits::setNull(rawNulls, 2);

// Produces arrays that look like:
// [9, NULL, 7, 6]
// [7, 6, 5]
// NULL
// [NULL, 7, 6, 5, NULL]
// [1, NULL]
auto values = std::make_shared<ArrayVector>(
pool_.get(),
ARRAY(elements->type()),
Expand All @@ -346,9 +340,9 @@ TEST_F(HivePartitionFunctionTest, arrayElementsEncoded) {
encodedElements);

assertPartitions(values, 1, {0, 0, 0, 0, 0});
assertPartitions(values, 2, {0, 0, 0, 0, 1});
assertPartitions(values, 500, {342, 418, 0, 458, 31});
assertPartitions(values, 997, {149, 936, 0, 103, 31});
assertPartitions(values, 2, {1, 1, 0, 0, 1});
assertPartitions(values, 500, {279, 7, 0, 308, 31});
assertPartitions(values, 997, {279, 7, 0, 820, 31});

assertPartitionsWithConstChannel(values, 1);
assertPartitionsWithConstChannel(values, 2);
Expand Down Expand Up @@ -428,23 +422,17 @@ TEST_F(HivePartitionFunctionTest, mapEntriesEncoded) {
auto rawSizes = sizesBuffer->asMutable<vector_size_t>();
auto rawNulls = nullsBuffer->asMutable<uint64_t>();

// Make the elements overlap and have gaps.
// Make the elements have gaps.
// Set the values in position 2 to be invalid since that Map should be null.
std::vector<vector_size_t> offsets{
0, 2, std::numeric_limits<int32_t>().max(), 1, 8};
0, 2, std::numeric_limits<int32_t>().max(), 4, 8};
std::vector<vector_size_t> sizes{
4, 3, std::numeric_limits<int32_t>().max(), 5, 2};
2, 1, std::numeric_limits<int32_t>().max(), 3, 2};
memcpy(rawOffsets, offsets.data(), size * sizeof(vector_size_t));
memcpy(rawSizes, sizes.data(), size * sizeof(vector_size_t));

bits::setNull(rawNulls, 2);

// Produces maps that look like:
// {key_0: 9, key_3: NULL, key_6: 7, key_9: 6}
// {key_6: 7, key_9: 6, key_2: 5}
// NULL
// {key_3: NULL, key_6: 7, key_9: 6, key_2: 5, key_5: NULL}
// {key_4: 1, key_7: NULL}
auto values = std::make_shared<MapVector>(
pool_.get(),
MAP(mapKeys->type(), mapValues->type()),
Expand All @@ -457,8 +445,8 @@ TEST_F(HivePartitionFunctionTest, mapEntriesEncoded) {

assertPartitions(values, 1, {0, 0, 0, 0, 0});
assertPartitions(values, 2, {0, 1, 0, 1, 0});
assertPartitions(values, 500, {176, 259, 0, 91, 336});
assertPartitions(values, 997, {694, 24, 0, 365, 345});
assertPartitions(values, 500, {336, 413, 0, 259, 336});
assertPartitions(values, 997, {345, 666, 0, 24, 345});

assertPartitionsWithConstChannel(values, 1);
assertPartitionsWithConstChannel(values, 2);
Expand Down
10 changes: 6 additions & 4 deletions velox/docs/develop/vectors.rst
Original file line number Diff line number Diff line change
Expand Up @@ -386,8 +386,9 @@ ArrayVector
~~~~~~~~~~~

ArrayVector stores values of type ARRAY. In addition to nulls buffer, it
contains offsets and sizes buffers and an elements vector. Offsets and sizes
are 32-bit integers.
contains offsets and sizes buffers and an elements vector. Offsets and sizes are
32-bit integers. The non-null non-empty ranges formed by offsets and sizes in a
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does offsets need to in order in elements vector? Thanks!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No it does not. But by default this is the case for most vectors, so we leverage that in the fast path.

vector is not allowed to overlap with each other.

.. code-block:: c++

Expand Down Expand Up @@ -443,8 +444,9 @@ MapVector
~~~~~~~~~

MapVector stores values of type MAP. In addition to nulls buffer, it contains
offsets and sizes buffers, keys and values vectors. Offsets and sizes are
32-bit integers.
offsets and sizes buffers, keys and values vectors. Offsets and sizes are 32-bit
integers. The non-null non-empty ranges formed by offsets and sizes in a vector
is not allowed to overlap with each other.

.. code-block:: c++

Expand Down
6 changes: 4 additions & 2 deletions velox/expression/tests/CastExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1279,7 +1279,8 @@ TEST_F(CastExprTest, mapCast) {

SelectivityVector rows(5);
rows.setValid(2, false);
mapVector->setOffsetAndSize(2, 100, 100);
mapVector->setOffsetAndSize(2, 100, 1);
mapVector->setOffsetAndSize(51, 2, 1);
std::vector<VectorPtr> results(1);

auto rowVector = makeRowVector({mapVector});
Expand Down Expand Up @@ -1369,7 +1370,8 @@ TEST_F(CastExprTest, arrayCast) {

SelectivityVector rows(5);
rows.setValid(2, false);
arrayVector->setOffsetAndSize(2, 100, 10);
arrayVector->setOffsetAndSize(2, 100, 5);
arrayVector->setOffsetAndSize(20, 10, 5);
std::vector<VectorPtr> results(1);

auto rowVector = makeRowVector({arrayVector});
Expand Down
10 changes: 5 additions & 5 deletions velox/expression/tests/ExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3568,10 +3568,10 @@ TEST_P(ParameterizedExprTest, mapKeysAndValues) {
MAP(BIGINT(), BIGINT()),
makeNulls(vectorSize, nullEvery(3)),
vectorSize,
makeIndices(vectorSize, [](auto /* row */) { return 0; }),
makeIndices(vectorSize, folly::identity),
makeIndices(vectorSize, [](auto /* row */) { return 1; }),
makeFlatVector<int64_t>({1, 2, 3}),
makeFlatVector<int64_t>({10, 20, 30}));
makeFlatVector<int64_t>(vectorSize, folly::identity),
makeFlatVector<int64_t>(vectorSize, [](auto i) { return 10 * i; }));
auto input = makeRowVector({mapVector});
auto exprSet = compileMultiple(
{"map_keys(c0)", "map_values(c0)"}, asRowType(input->type()));
Expand Down Expand Up @@ -4805,7 +4805,7 @@ TEST_F(ExprTest, disablePeeling) {
// Verify that peeling is disabled when the config is set by checking whether
// the number of rows processed is equal to the alphabet size (when enabled)
// or the dictionary size (when disabled).
// Also, ensure that single arg function recieves a flat vector even when
// Also, ensure that single arg function receives a flat vector even when
// peeling is disabled.

// This throws if input is not flat or constant.
Expand Down Expand Up @@ -4847,7 +4847,7 @@ TEST_F(ExprTest, disablePeeling) {
ASSERT_TRUE(stats.find("plus") != stats.end());
ASSERT_EQ(stats["plus"].numProcessedRows, dictSize);

// Ensure single arg function recieves a flat vector.
// Ensure single arg function receives a flat vector.
// When top level column is dictionary wrapped.
ASSERT_NO_THROW(evaluateMultiple(
{"testing_single_arg_deterministic((c0))"},
Expand Down
2 changes: 1 addition & 1 deletion velox/functions/lib/tests/SliceTestBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class SliceTestBase : public FunctionBaseTest {
const ArrayVectorPtr& expectedArrayVector) {
auto result = evaluate<ArrayVector>(expression, makeRowVector(parameters));
::facebook::velox::test::assertEqualVectors(expectedArrayVector, result);
EXPECT_NO_THROW(expectedArrayVector->checkRanges());
EXPECT_FALSE(expectedArrayVector->hasOverlappingRanges());
}

void basicTestCases() {
Expand Down
12 changes: 6 additions & 6 deletions velox/functions/prestosql/tests/MapEntriesTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,14 +168,14 @@ TEST_F(MapEntriesTest, differentSizedValueKeyVectors) {
makeNullableFlatVector<int64_t>({1, 2, 3, 4, std::nullopt, std::nullopt});
auto valueVector = makeFlatVector<int64_t>({1, 2, 3, 4});

auto offsetBuffer = makeIndices({3, 2, 1, 0, 0, 0});
auto sizeBuffer = makeIndices({1, 1, 1, 1, 1, 1});
auto offsetBuffer = makeIndices({3, 2, 1, 0});
auto sizeBuffer = makeIndices({1, 1, 1, 1});

auto mapVector = std::make_shared<MapVector>(
pool(),
MAP(BIGINT(), BIGINT()),
nullptr,
6,
4,
offsetBuffer,
sizeBuffer,
keyVector,
Expand All @@ -188,9 +188,9 @@ TEST_F(MapEntriesTest, differentSizedValueKeyVectors) {

EXPECT_NE(result, nullptr);
auto elementVector = makeRowVector(
{makeFlatVector<int64_t>({4, 3, 2, 1, 1, 1}),
makeFlatVector<int64_t>({4, 3, 2, 1, 1, 1})});
auto arrayVector = makeArrayVector({0, 1, 2, 3, 4, 5}, elementVector);
{makeFlatVector<int64_t>({4, 3, 2, 1}),
makeFlatVector<int64_t>({4, 3, 2, 1})});
auto arrayVector = makeArrayVector({0, 1, 2, 3}, elementVector);

test::assertEqualVectors(arrayVector, result);
}
2 changes: 1 addition & 1 deletion velox/functions/prestosql/tests/ZipTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ TEST_F(ZipTest, flatArraysWithDifferentOffsets) {

/// Test if we can zip two flat vectors of arrays with overlapping ranges of
/// elements.
TEST_F(ZipTest, flatArraysWithOverlappingRanges) {
TEST_F(ZipTest, DISABLED_flatArraysWithOverlappingRanges) {
const auto size = 3;
BufferPtr offsetsBuffer = allocateOffsets(size, pool_.get());
BufferPtr sizesBuffer = allocateSizes(size, pool_.get());
Expand Down
70 changes: 49 additions & 21 deletions velox/vector/ComplexVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -809,31 +809,59 @@ VectorPtr RowVector::pushDictionaryToRowVectorLeaves(const VectorPtr& input) {
wrappers, input->size(), input, input->pool());
}

void ArrayVectorBase::checkRanges() const {
std::unordered_map<vector_size_t, vector_size_t> seenElements;
seenElements.reserve(size());
template <bool kHasNulls>
vector_size_t ArrayVectorBase::nextNonEmpty(vector_size_t i) const {
while (i < size() &&
((kHasNulls && bits::isBitNull(rawNulls(), i)) || rawSizes_[i] <= 0)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rawSizes_ could be < 0? Thanks!

Copy link
Contributor Author

@Yuhta Yuhta Sep 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should not, but some application may generate such a thing as intermediate state. This is checked in ArrayVectorBase::validateArrayVectorBase before calling this check.

++i;
}
return i;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we return std::optional if not found?

Copy link
Contributor Author

@Yuhta Yuhta Sep 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably no need, we will need to check i < size() once again anyway and this won't save us anything in terms of efficiency (it's probably even one more registry used) or code readability.

}

template <bool kHasNulls>
bool ArrayVectorBase::maybeHaveOverlappingRanges() const {
vector_size_t curr = 0;
curr = nextNonEmpty<kHasNulls>(curr);
if (curr >= size()) {
return false;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about

vector_size_t prev{-1};
for (;;) {
  const auto next = ...;
  ...
  if (prev != -1 && ...) {
    return false;
  }
  prev = next;
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated with curr and next. Feels a little bit weird since these are usually used for pointers.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you merge the code in l824 into the loop? curr{-1} can indicate the initial case? Thanks!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is call to rawOffsets_[curr] so curr = -1 is not valid here

for (;;) {
auto next = nextNonEmpty<kHasNulls>(curr + 1);
if (next >= size()) {
return false;
}
// This also implicitly ensures rawOffsets_[curr] <= rawOffsets_[next].
if (rawOffsets_[curr] + rawSizes_[curr] > rawOffsets_[next]) {
return true;
}
curr = next;
}
}

bool ArrayVectorBase::hasOverlappingRanges() const {
if (!(rawNulls() ? maybeHaveOverlappingRanges<true>()
: maybeHaveOverlappingRanges<false>())) {
return false;
}
std::vector<vector_size_t> indices;
indices.reserve(size());
for (vector_size_t i = 0; i < size(); ++i) {
auto size = sizeAt(i);
auto offset = offsetAt(i);

for (vector_size_t j = 0; j < size; ++j) {
auto it = seenElements.find(offset + j);
if (it != seenElements.end()) {
VELOX_FAIL(
"checkRanges() found overlap at idx {}: element {} has offset {} "
"and size {}, and element {} has offset {} and size {}.",
offset + j,
it->second,
offsetAt(it->second),
sizeAt(it->second),
i,
offset,
size);
}
seenElements.emplace(offset + j, i);
const bool isNull = rawNulls() && bits::isBitNull(rawNulls(), i);
if (!isNull && rawSizes_[i] > 0) {
indices.push_back(i);
}
}
std::sort(indices.begin(), indices.end(), [&](auto i, auto j) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rawSizes_ could be < 0? If not, why maybeHaveOverlappingRanges check is not sufficient? Thanks!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rawOffsets_ could be out of order

return rawOffsets_[i] < rawOffsets_[j];
});
for (vector_size_t i = 1; i < indices.size(); ++i) {
auto j = indices[i - 1];
auto k = indices[i];
if (rawOffsets_[j] + rawSizes_[j] > rawOffsets_[k]) {
return true;
}
}
return false;
}

void ArrayVectorBase::validateArrayVectorBase(
Expand Down
13 changes: 10 additions & 3 deletions velox/vector/ComplexVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -364,9 +364,8 @@ struct ArrayVectorBase : BaseVector {
sizes_->asMutable<vector_size_t>()[i] = size;
}

/// Verify that an ArrayVector/MapVector does not contain overlapping [offset,
/// size] ranges. Throws in case overlaps are found.
void checkRanges() const;
/// Check if there is any overlapping [offset, size] ranges.
bool hasOverlappingRanges() const;

protected:
ArrayVectorBase(
Expand Down Expand Up @@ -411,6 +410,14 @@ struct ArrayVectorBase : BaseVector {
const vector_size_t* rawOffsets_;
BufferPtr sizes_;
const vector_size_t* rawSizes_;

private:
template <bool kHasNulls>
bool maybeHaveOverlappingRanges() const;

// Returns the next non-null non-empty array/map on or after `index'.
template <bool kHasNulls>
Yuhta marked this conversation as resolved.
Show resolved Hide resolved
vector_size_t nextNonEmpty(vector_size_t index) const;
};

class ArrayVector : public ArrayVectorBase {
Expand Down
38 changes: 36 additions & 2 deletions velox/vector/tests/VectorTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2882,7 +2882,7 @@ TEST_F(VectorTest, resizeArrayAndMapResetOffsets) {

// Test array.
{
auto offsets = makeIndices({1, 1, 1, 1});
auto offsets = makeIndices({0, 1, 2, 3});
auto sizes = makeIndices({1, 1, 1, 1});

auto* rawSizes = sizes->as<vector_size_t>();
Expand All @@ -2907,7 +2907,7 @@ TEST_F(VectorTest, resizeArrayAndMapResetOffsets) {

// Test map.
{
auto offsets = makeIndices({1, 1, 1, 1});
auto offsets = makeIndices({0, 1, 2, 3});
auto sizes = makeIndices({1, 1, 1, 1});

auto* rawSizes = sizes->as<vector_size_t>();
Expand Down Expand Up @@ -3899,5 +3899,39 @@ TEST_F(VectorTest, testOverSizedArray) {
EXPECT_THROW(BaseVector::flattenVector(constArray), VeloxUserError);
}

TEST_F(VectorTest, hasOverlappingRanges) {
auto test = [this](
vector_size_t length,
const std::vector<bool>& nulls,
const std::vector<vector_size_t>& offsets,
const std::vector<vector_size_t>& sizes,
bool overlap) {
auto makeArray = [&] {
return std::make_shared<ArrayVector>(
pool(),
ARRAY(BIGINT()),
makeNulls(nulls),
length,
makeIndices(length, [&](auto i) { return offsets[i]; }),
makeIndices(length, [&](auto i) { return sizes[i]; }),
makeNullConstant(
TypeKind::BIGINT, std::numeric_limits<vector_size_t>::max()));
};
if (!overlap) {
ASSERT_FALSE(makeArray()->hasOverlappingRanges());
} else {
ASSERT_TRUE(makeArray()->hasOverlappingRanges());
}
};
test(3, {false, false, false}, {0, 1, 2}, {1, 1, 1}, false);
test(3, {false, true, false}, {0, 0, 2}, {1, 1, 1}, false);
test(3, {false, false, false}, {0, 0, 2}, {1, 0, 1}, false);
test(3, {false, false, false}, {0, 0, 2}, {1, 1, 1}, true);
test(3, {false, false, false}, {2, 1, 0}, {1, 1, 1}, false);
test(3, {false, false, false}, {2, 1, 0}, {1, 2, 1}, true);
test(2, {false, false}, {0, 1}, {3, 1}, true);
test(2, {false, false}, {1, 0}, {1, 3}, true);
}

} // namespace
} // namespace facebook::velox
Loading