Skip to content

Commit

Permalink
Storages: Returns RSResult::Some in MinMaxIndex::CheckIn when has null.
Browse files Browse the repository at this point in the history
  • Loading branch information
JinheLin committed Jun 20, 2024
1 parent 7e70abd commit c045024
Show file tree
Hide file tree
Showing 2 changed files with 164 additions and 14 deletions.
25 changes: 21 additions & 4 deletions dbms/src/Storages/DeltaMerge/Index/MinMaxIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,23 @@ ALWAYS_INLINE bool minIsNull(const DB::ColumnUInt8 & null_map, size_t i)
{
return null_map.getElement(i * 2);
}

ALWAYS_INLINE bool isLegacyOrHasNull(
const DB::ColumnUInt8 & null_map,
const PaddedPODArray<UInt8> & has_nulls,
size_t i)
{
// Null is a special value. The result of comparing null with any value and the logical
// operation result of null value are unknown. So, if a pack has null, always return RSResult::Some.

// Why need return RSResult::Some when a pack has null?
// Take `MinMaxIndex::checkIn` for example.
// There is pack has data [1, 2, 3, 4, null] and a query `... not in (100)`.
// RoughCheck::CheckIn(100) will return `RSResult::None`, because it never consider the null value.
// So the filter result of `not in (100)` is `not RSResult::None` and it is `RSResult::All`.
// If we skip the filter calculation, the null rows will not be filtered out.
return has_nulls[i] || minIsNull(null_map, i);
}
} // namespace details

void MinMaxIndex::addPack(const IColumn & column, const ColumnVector<UInt8> * del_mark)
Expand Down Expand Up @@ -238,7 +255,7 @@ RSResults MinMaxIndex::checkNullableInImpl(
const auto & minmaxes_data = toColumnVectorData<T>(column_nullable.getNestedColumnPtr());
for (size_t i = start_pack; i < start_pack + pack_count; ++i)
{
if (details::minIsNull(null_map, i))
if (details::isLegacyOrHasNull(null_map, has_null_marks, i))
continue;
auto min = minmaxes_data[i * 2];
auto max = minmaxes_data[i * 2 + 1];
Expand Down Expand Up @@ -283,7 +300,7 @@ RSResults MinMaxIndex::checkNullableIn(
const auto & offsets = string_column->getOffsets();
for (size_t i = start_pack; i < start_pack + pack_count; ++i)
{
if (details::minIsNull(null_map, i))
if (details::isLegacyOrHasNull(null_map, has_null_marks, i))
continue;
size_t pos = i * 2;
size_t prev_offset = pos == 0 ? 0 : offsets[pos - 1];
Expand Down Expand Up @@ -493,7 +510,7 @@ RSResults MinMaxIndex::checkNullableCmpImpl(
const auto & minmaxes_data = toColumnVectorData<T>(column_nullable.getNestedColumnPtr());
for (size_t i = start_pack; i < start_pack + pack_count; ++i)
{
if (details::minIsNull(null_map, i))
if (details::isLegacyOrHasNull(null_map, has_null_marks, i))
continue;
auto min = minmaxes_data[i * 2];
auto max = minmaxes_data[i * 2 + 1];
Expand Down Expand Up @@ -539,7 +556,7 @@ RSResults MinMaxIndex::checkNullableCmp(
const auto & offsets = string_column->getOffsets();
for (size_t i = start_pack; i < start_pack + pack_count; ++i)
{
if (details::minIsNull(null_map, i))
if (details::isLegacyOrHasNull(null_map, has_null_marks, i))
continue;
size_t pos = i * 2;
size_t prev_offset = pos == 0 ? 0 : offsets[pos - 1];
Expand Down
153 changes: 143 additions & 10 deletions dbms/src/Storages/DeltaMerge/tests/gtest_dm_minmax_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1354,7 +1354,7 @@ try
}
auto type_value_pair = generateTypeValue(static_cast<MinMaxTestDatatype>(datatype), true);
ASSERT_EQ(
false,
operater_type == Test_In,
checkMatch(
case_name,
*context,
Expand Down Expand Up @@ -2259,6 +2259,24 @@ try
}
CATCH

namespace
{
// Only support Int64.
template <typename T>
MinMaxIndexPtr createMinMaxIndex(const IDataType & col_type, const std::vector<T> & cases)
{
auto minmax_index = std::make_shared<MinMaxIndex>(col_type);
for (const auto & c : cases)
{
RUNTIME_CHECK(c.column_data.size(), c.del_mark.size());
auto col_data = createColumn<Nullable<Int64>>(c.column_data).column;
auto del_mark_col = createColumn<UInt8>(c.del_mark).column;
minmax_index->addPack(*col_data, static_cast<const ColumnVector<UInt8> *>(del_mark_col.get()));
}
return minmax_index;
}
} // namespace

TEST_F(MinMaxIndexTest, CheckIsNull)
try
{
Expand All @@ -2281,15 +2299,7 @@ try
};

auto col_type = makeNullable(std::make_shared<DataTypeInt64>());
auto minmax_index = std::make_shared<MinMaxIndex>(*col_type);
for (const auto & c : cases)
{
ASSERT_EQ(c.column_data.size(), c.del_mark.size());
auto col_data = createColumn<Nullable<Int64>>(c.column_data).column;
auto del_mark_col = createColumn<UInt8>(c.del_mark).column;
minmax_index->addPack(*col_data, static_cast<const ColumnVector<UInt8> *>(del_mark_col.get()));
}

auto minmax_index = createMinMaxIndex(*col_type, cases);
auto actual_results = minmax_index->checkIsNull(0, cases.size());
for (size_t i = 0; i < cases.size(); ++i)
{
Expand All @@ -2303,4 +2313,127 @@ try
}
CATCH

TEST_F(MinMaxIndexTest, CheckIn)
try
{
struct CheckInTestCase
{
std::vector<std::optional<Int64>> column_data;
std::vector<UInt64> del_mark;
};
struct ValuesAndResults
{
std::vector<Int64> values;
RSResults results;
};
std::vector<CheckInTestCase> cases = {
{
.column_data = {1, 2, 3, 4, std::nullopt},
.del_mark = {0, 0, 0, 0, 0},
},
{
.column_data = {6, 7, 8, 9, 10},
.del_mark = {0, 0, 0, 0, 0},
},
{
.column_data = {std::nullopt, std::nullopt},
.del_mark = {0, 0},
},
{
.column_data = {1, 2, 3, 4, std::nullopt},
.del_mark = {0, 0, 0, 0, 1},
},
{
.column_data = {6, 7, 8, 9, 10},
.del_mark = {0, 0, 0, 1, 0},
},
{
.column_data = {std::nullopt, std::nullopt},
.del_mark = {1, 0},
},
{
.column_data = {std::nullopt, std::nullopt},
.del_mark = {1, 1},
},
{
.column_data = {1, 2, 3, 4},
.del_mark = {1, 1, 1, 1},
},
{
.column_data = {1, 1},
.del_mark = {0, 0},
},
{
.column_data = {1, 1, std::nullopt},
.del_mark = {0, 0, 0},
},
};

std::vector<ValuesAndResults> params = {
{
.values = {1, 2, 3, 4, 5, 6},
.results = {
RSResult::Some,
RSResult::Some,
RSResult::Some,
RSResult::Some, // Should be All, but not support now
RSResult::Some,
RSResult::Some,
RSResult::Some,
RSResult::Some,
RSResult::All, // minimum value equals to maximum value
RSResult::Some,
},
},
{
.values = {100},
.results = {
RSResult::Some,
RSResult::None,
RSResult::Some,
RSResult::None,
RSResult::None,
RSResult::Some,
RSResult::Some,
RSResult::Some,
RSResult::None,
RSResult::Some,
},
},
{
.values = {0},
.results = {
RSResult::Some,
RSResult::None,
RSResult::Some,
RSResult::None,
RSResult::None,
RSResult::Some,
RSResult::Some,
RSResult::Some,
RSResult::None,
RSResult::Some,
},
},
};

auto col_type = makeNullable(std::make_shared<DataTypeInt64>());
auto minmax_index = createMinMaxIndex(*col_type, cases);
for (const auto & [values, expected_results] : params)
{
ASSERT_EQ(expected_results.size(), cases.size());
auto actual_results = minmax_index->checkIn(0, cases.size(), std::vector<Field>(values.cbegin(), values.cend()), col_type);
for (size_t j = 0; j < cases.size(); ++j)
{
ASSERT_EQ(actual_results[j], expected_results[j]) << fmt::format(
"column_data={}, del_mark={}, values={}, actual={} expected={}",
cases[j].column_data,
cases[j].del_mark,
values,
magic_enum::enum_name(actual_results[j]),
magic_enum::enum_name(expected_results[j]));
}
}
}
CATCH
} // namespace DB::DM::tests

0 comments on commit c045024

Please sign in to comment.