Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize comparison for collation UTF8_BIN and UTF8MB4_BIN #5299

Merged
merged 19 commits into from
Jul 7, 2022
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion dbms/src/Columns/ColumnConst.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,8 @@ class ColumnConst final : public COWPtrHelper<IColumn, ColumnConst>
template <typename T>
T getValue() const
{
return getField().safeGet<typename NearestFieldType<T>::Type>();
auto && tmp = getField();
return std::move(tmp.safeGet<typename NearestFieldType<T>::Type>());
}
};

Expand Down
259 changes: 259 additions & 0 deletions dbms/src/Functions/CollationOperatorOptimized.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
// Copyright 2022 PingCAP, Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <Columns/ColumnString.h>
#include <Core/AccurateComparison.h>
#include <Functions/StringUtil.h>
#include <common/StringRef.h>
#include <common/defines.h>

#include <cstddef>
#include <string_view>


namespace DB
{

// Map `val` to its sign: 1 when it is greater than zero, -1 when it is less
// than zero, and 0 otherwise.
template <typename T>
ALWAYS_INLINE inline int signum(T val)
{
    const int above_zero = (0 < val) ? 1 : 0;
    const int below_zero = (val < 0) ? 1 : 0;
    return above_zero - below_zero;
}

// Equality-only check, used where a full three-way comparison is not needed.
// - delegates to `StringRef` equality (presumably sizes are compared before
//   contents -- confirm in StringRef)
// - returns 0 when `lhs` and `rhs` are equal, 1 otherwise
__attribute__((flatten, always_inline, pure)) inline uint8_t RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs)
{
    const bool is_same = (StringRef(lhs) == StringRef(rhs));
    if (is_same)
        return 0;
    return 1;
}

// Three-way comparison of two string views using std::string_view::compare.
// The result is normalized by `signum` to -1 (v1 < v2), 0 (equal) or 1 (v1 > v2).
__attribute__((flatten, always_inline, pure)) inline int RawStrCompare(const std::string_view & v1, const std::string_view & v2)
{
    const int order = v1.compare(v2);
    return signum(order);
}

// Remove trailing space characters (' ') from `v`.
// Returns the prefix of `v` that ends at its last non-space character, or an
// empty view when `v` is empty or consists only of spaces.
// Only ' ' is trimmed; other whitespace characters are left untouched.
__attribute__((flatten, always_inline, pure)) inline std::string_view RightTrim(const std::string_view & v)
{
// Index of the last character that is not ' '; npos when there is no such character.
size_t end = v.find_last_not_of(' ');
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if other space characters need to be handled here.

Copy link
Contributor Author

@solotzg solotzg Jul 6, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to MySQL::Doc::string-comparison-functions.

  • In particular, trailing spaces are always significant. This differs from comparisons performed with the = operator, for which the significance of trailing spaces in nonbinary strings (CHAR, VARCHAR, and TEXT values) depends on the pad attribute of the collation used for the comparison. For more information, see Trailing Space Handling in Comparisons.
mysql> SET NAMES utf8mb4 COLLATE utf8mb4_bin;
mysql> SELECT 'a ' = 'a';
+------------+
| 'a ' = 'a' |
+------------+
|          1 |
+------------+
mysql> SET NAMES utf8mb4 COLLATE utf8mb4_0900_bin;
mysql> SELECT 'a ' = 'a';
+------------+
| 'a ' = 'a' |
+------------+
|          0 |
+------------+

return end == std::string_view::npos ? "" : v.substr(0, end + 1);
}

// Three-way comparison that ignores trailing spaces: both operands are
// right-trimmed first and the trimmed views are then compared.
__attribute__((flatten, always_inline, pure)) inline int RtrimeStrCompare(const std::string_view & va, const std::string_view & vb)
{
    const std::string_view lhs_trimmed = RightTrim(va);
    const std::string_view rhs_trimmed = RightTrim(vb);
    return RawStrCompare(lhs_trimmed, rhs_trimmed);
}

// Trait: `value` is true when the comparison operator only needs to know
// whether its operands are equal. Any operator without a specialization
// below gets `false`.
template <typename T>
struct IsEqualRelated
{
    static constexpr bool value = false;
};

// `EqualsOp` depends only on equality.
template <typename... Args>
struct IsEqualRelated<DB::EqualsOp<Args...>>
{
    static constexpr bool value = true;
};

// `NotEqualsOp` depends only on equality.
template <typename... Args>
struct IsEqualRelated<DB::NotEqualsOp<Args...>>
{
    static constexpr bool value = true;
};

// Return true if any of the first `size` strings of the column ends with a space character (' ').
// - `a_data` / `a_offsets`: character buffer and offsets of the column being inspected.
// - `size`: number of strings to inspect.
__attribute__((always_inline, pure)) inline bool HasTailSpace(
const ColumnString::Chars_t & a_data,
const ColumnString::Offsets & a_offsets,
size_t size)
{
bool has_tail_space = false;

// The flag is accumulated with `|=` and the loop never exits early.
// #pragma clang loop vectorize(enable)
for (size_t i = 0; i < size; ++i)
{
const auto * ptr = reinterpret_cast<const char *>(&a_data[StringUtil::offsetAt(a_offsets, i)]);
// NOTE(review): this local `size` shadows the `size` parameter inside the loop body; consider renaming it.
// Length of the string without its last stored byte (presumably the trailing '\0' -- confirm against StringUtil::sizeAt).
auto size = StringUtil::sizeAt(a_offsets, i) - 1;
auto pos = size > 0 ? size - 1 : 0; // if size is 0, use the last pos which always contains a '\0'
has_tail_space |= ptr[pos] == ' ';
solotzg marked this conversation as resolved.
Show resolved Hide resolved
}
return has_tail_space;
}


// Walk two string columns in lockstep and call `func(a_view, b_view, row)`
// for each of the first `size` rows.
// Each view covers the stored entry minus its last byte (presumably the
// trailing '\0' -- confirm against StringUtil::sizeAt).
template <typename F>
__attribute__((flatten, always_inline)) inline void LoopTwoColumns(
    const ColumnString::Chars_t & a_data,
    const ColumnString::Offsets & a_offsets,
    const ColumnString::Chars_t & b_data,
    const ColumnString::Offsets & b_offsets,
    size_t size,
    F && func)
{
    for (size_t row = 0; row != size; ++row)
    {
        const auto * lhs_begin = reinterpret_cast<const char *>(&a_data[StringUtil::offsetAt(a_offsets, row)]);
        size_t lhs_len = StringUtil::sizeAt(a_offsets, row) - 1;

        const auto * rhs_begin = reinterpret_cast<const char *>(&b_data[StringUtil::offsetAt(b_offsets, row)]);
        size_t rhs_len = StringUtil::sizeAt(b_offsets, row) - 1;

        func({lhs_begin, lhs_len}, {rhs_begin, rhs_len}, row);
    }
}

// Walk one string column and call `func(view, row)` for each of the first
// `size` rows.
// Each view covers the stored entry minus its last byte (presumably the
// trailing '\0' -- confirm against StringUtil::sizeAt).
template <typename F>
__attribute__((flatten, always_inline)) inline void LoopOneColumn(
    const ColumnString::Chars_t & a_data,
    const ColumnString::Offsets & a_offsets,
    size_t size,
    F && func)
{
    for (size_t row = 0; row != size; ++row)
    {
        const auto * str_begin = reinterpret_cast<const char *>(&a_data[StringUtil::offsetAt(a_offsets, row)]);
        size_t str_len = StringUtil::sizeAt(a_offsets, row) - 1;

        func({str_begin, str_len}, row);
    }
}

// Handle str-column compare str-column.
// - Optimize UTF8_BIN and UTF8MB4_BIN
// - Check if columns do NOT contain tail space
solotzg marked this conversation as resolved.
Show resolved Hide resolved
// - If Op is `EqualsOp` or `NotEqualsOp`, optimize comparison by faster way
// Returns true when the collator is UTF8_BIN / UTF8MB4_BIN and `c[i]` has been
// written for every row; returns false without touching `c` for any other
// collator.
template <typename Op, typename Result>
ALWAYS_INLINE inline bool StringVectorStringVector(
    const ColumnString::Chars_t & a_data,
    const ColumnString::Offsets & a_offsets,
    const ColumnString::Chars_t & b_data,
    const ColumnString::Offsets & b_offsets,
    const TiDB::TiDBCollatorPtr & collator,
    Result & c)
{
    // Only the two binary UTF8 collations are handled here.
    switch (collator->getCollatorId())
    {
    case TiDB::ITiDBCollator::UTF8MB4_BIN:
    case TiDB::ITiDBCollator::UTF8_BIN:
        break;
    default:
        return false;
    }

    const size_t rows = a_offsets.size();
    const bool any_tail_space = HasTailSpace(a_data, a_offsets, rows) || HasTailSpace(b_data, b_offsets, rows);

    if (likely(!any_tail_space))
    {
        // Common case: no string ends with a space, so compare the views as they are.
        LoopTwoColumns(a_data, a_offsets, b_data, b_offsets, rows, [&c](const std::string_view & lhs, const std::string_view & rhs, size_t row) {
            if constexpr (IsEqualRelated<Op>::value)
                c[row] = Op::apply(RawStrEqualCompare(lhs, rhs), 0);
            else
                c[row] = Op::apply(RawStrCompare(lhs, rhs), 0);
        });
    }
    else
    {
        // At least one string ends with a space: right-trim both sides before comparing.
        LoopTwoColumns(a_data, a_offsets, b_data, b_offsets, rows, [&c](const std::string_view & lhs, const std::string_view & rhs, size_t row) {
            if constexpr (IsEqualRelated<Op>::value)
                c[row] = Op::apply(RawStrEqualCompare(RightTrim(lhs), RightTrim(rhs)), 0);
            else
                c[row] = Op::apply(RawStrCompare(RightTrim(lhs), RightTrim(rhs)), 0);
        });
    }

    return true;
}

// Handle str-column compare const-str.
// - Optimize UTF8_BIN and UTF8MB4_BIN
// - Right trim const-str first
// - Check if column does NOT contain tail space
solotzg marked this conversation as resolved.
Show resolved Hide resolved
// - If Op is `EqualsOp` or `NotEqualsOp`, optimize comparison by faster way
// Returns true when the collator is UTF8_BIN / UTF8MB4_BIN and `c[i]` has been
// written for every row; returns false without touching `c` for any other
// collator.
template <typename Op, typename Result>
ALWAYS_INLINE inline bool StringVectorConstant(
    const ColumnString::Chars_t & a_data,
    const ColumnString::Offsets & a_offsets,
    const std::string_view & b,
    const TiDB::TiDBCollatorPtr & collator,
    Result & c)
{
    // Only the two binary UTF8 collations are handled here.
    switch (collator->getCollatorId())
    {
    case TiDB::ITiDBCollator::UTF8MB4_BIN:
    case TiDB::ITiDBCollator::UTF8_BIN:
        break;
    default:
        return false;
    }

    const size_t rows = a_offsets.size();

    // The constant is trimmed once up front and reused for every row.
    std::string_view trimmed_const = RightTrim(b);

    if (unlikely(HasTailSpace(a_data, a_offsets, rows)))
    {
        // Some column value ends with a space: right-trim each value before comparing.
        LoopOneColumn(a_data, a_offsets, rows, [&c, &trimmed_const](const std::string_view & value, size_t row) {
            if constexpr (IsEqualRelated<Op>::value)
                c[row] = Op::apply(RawStrEqualCompare(RightTrim(value), trimmed_const), 0);
            else
                c[row] = Op::apply(RawStrCompare(RightTrim(value), trimmed_const), 0);
        });
    }
    else
    {
        // Common case: no column value ends with a space, so compare the views as they are.
        LoopOneColumn(a_data, a_offsets, rows, [&c, &trimmed_const](const std::string_view & value, size_t row) {
            if constexpr (IsEqualRelated<Op>::value)
                c[row] = Op::apply(RawStrEqualCompare(value, trimmed_const), 0);
            else
                c[row] = Op::apply(RawStrCompare(value, trimmed_const), 0);
        });
    }

    return true;
}

} // namespace DB
63 changes: 54 additions & 9 deletions dbms/src/Functions/FunctionsComparison.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/CollationOperatorOptimized.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/FunctionsLogical.h>
#include <Functions/IFunction.h>
Expand Down Expand Up @@ -301,6 +302,12 @@ struct StringComparisonWithCollatorImpl
const TiDB::TiDBCollatorPtr & collator,
PaddedPODArray<ResultType> & c)
{
bool optimized_path = StringVectorStringVector<Op>(a_data, a_offsets, b_data, b_offsets, collator, c);
if (optimized_path)
{
return;
}

size_t size = a_offsets.size();

for (size_t i = 0; i < size; ++i)
Expand All @@ -317,10 +324,17 @@ struct StringComparisonWithCollatorImpl
static void NO_INLINE stringVectorConstant(
const ColumnString::Chars_t & a_data,
const ColumnString::Offsets & a_offsets,
const std::string & b,
const std::string_view & b,
const TiDB::TiDBCollatorPtr & collator,
PaddedPODArray<ResultType> & c)
{
bool optimized_path = StringVectorConstant<Op>(a_data, a_offsets, b, collator, c);

if (optimized_path)
{
return;
}

size_t size = a_offsets.size();
ColumnString::Offset b_size = b.size();
const char * b_data = reinterpret_cast<const char *>(b.data());
Expand All @@ -332,7 +346,7 @@ struct StringComparisonWithCollatorImpl
}

static void constantStringVector(
const std::string & a,
const std::string_view & a,
const ColumnString::Chars_t & b_data,
const ColumnString::Offsets & b_offsets,
const TiDB::TiDBCollatorPtr & collator,
Expand All @@ -342,8 +356,8 @@ struct StringComparisonWithCollatorImpl
}

static void constantConstant(
const std::string & a,
const std::string & b,
std::string_view a,
solotzg marked this conversation as resolved.
Show resolved Hide resolved
std::string_view b,
const TiDB::TiDBCollatorPtr & collator,
ResultType & c)
{
Expand Down Expand Up @@ -720,10 +734,41 @@ class FunctionComparison : public IFunction
using ResultType = typename ResultColumnType::value_type;
using StringImpl = StringComparisonWithCollatorImpl<Op<int, int>, ResultType>;

std::string_view c0_const_str_ref{};
std::string_view c1_const_str_ref{};

if (c0_const)
{
if (const auto * c0_const_string = checkAndGetColumn<ColumnString>(&c0_const->getDataColumn()); c0_const_string)
{
c0_const_str_ref = std::string_view(c0_const_string->getDataAt(0));
}
else if (const auto * c0_const_fixed_string = checkAndGetColumn<ColumnFixedString>(&c0_const->getDataColumn()); c0_const_fixed_string)
{
c0_const_str_ref = std::string_view(c0_const_fixed_string->getDataAt(0));
}
else
throw Exception("Logical error: ColumnConst contains not String nor FixedString column", ErrorCodes::ILLEGAL_COLUMN);
}

if (c1_const)
{
if (const auto * c1_const_string = checkAndGetColumn<ColumnString>(&c1_const->getDataColumn()); c1_const_string)
{
c1_const_str_ref = std::string_view(c1_const_string->getDataAt(0));
}
else if (const auto * c1_const_fixed_string = checkAndGetColumn<ColumnFixedString>(&c0_const->getDataColumn()); c1_const_fixed_string)
solotzg marked this conversation as resolved.
Show resolved Hide resolved
{
c1_const_str_ref = std::string_view(c1_const_fixed_string->getDataAt(0));
}
else
throw Exception("Logical error: ColumnConst contains not String nor FixedString column", ErrorCodes::ILLEGAL_COLUMN);
}

if (c0_const && c1_const)
{
ResultType res = 0;
StringImpl::constantConstant(c0_const->getValue<String>(), c1_const->getValue<String>(), collator, res);
StringImpl::constantConstant(c0_const_str_ref, c1_const_str_ref, collator, res);
block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(c0_const->size(), toField(res));
return true;
}
Expand All @@ -745,12 +790,12 @@ class FunctionComparison : public IFunction
StringImpl::stringVectorConstant(
c0_string->getChars(),
c0_string->getOffsets(),
c1_const->getValue<String>(),
c1_const_str_ref,
collator,
c_res->getData());
else if (c0_const && c1_string)
StringImpl::constantStringVector(
c0_const->getValue<String>(),
c0_const_str_ref,
c1_string->getChars(),
c1_string->getOffsets(),
collator,
Expand All @@ -770,8 +815,8 @@ class FunctionComparison : public IFunction
template <typename ReturnColumnType = ColumnUInt8>
bool executeString(Block & block, size_t result, const IColumn * c0, const IColumn * c1) const
{
const ColumnString * c0_string = checkAndGetColumn<ColumnString>(c0);
const ColumnString * c1_string = checkAndGetColumn<ColumnString>(c1);
const auto * c0_string = checkAndGetColumn<ColumnString>(c0);
const auto * c1_string = checkAndGetColumn<ColumnString>(c1);
const ColumnConst * c0_const = checkAndGetColumnConstStringOrFixedString(c0);
const ColumnConst * c1_const = checkAndGetColumnConstStringOrFixedString(c1);

Expand Down
Loading