-
Notifications
You must be signed in to change notification settings - Fork 411
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimize comparison for collation UTF8_BIN
and UTF8MB4_BIN
#5299
Merged
Merged
Changes from 7 commits
Commits
Show all changes
19 commits
Select commit
Hold shift + click to select a range
b35b586
Add options to support `Profile Guided Optimization`
solotzg 4e906f2
Fix name in script
solotzg 0c83b5a
Optimize comparision for UTF8_BIN and UTF8MB4_BIN
solotzg a270dee
Revert "Fix name in script"
solotzg bf761ed
Revert "Add options to support `Profile Guided Optimization`"
solotzg 9e405da
Format code
solotzg a2103ac
Add more test cases
solotzg d5dc1b0
optimize code
solotzg bd4e49a
Fix typo
solotzg b8ed71a
Merge remote-tracking branch 'pingcap/master' into optimize-collation…
solotzg a8e0ed2
Fix bug
solotzg 429e566
Merge branch 'master' into optimize-collation-comp
ti-chi-bot b85741c
Address comment
solotzg 1608bab
Update dbms/src/Functions/CollationOperatorOptimized.h
solotzg 9ad2e69
Update dbms/src/Functions/CollationOperatorOptimized.h
solotzg 5b3d8e0
Address comment
solotzg 6a1f4dd
Merge branch 'master' into optimize-collation-comp
solotzg 682b12a
Merge branch 'master' into optimize-collation-comp
ti-chi-bot f409f67
Merge branch 'master' into optimize-collation-comp
ti-chi-bot File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,259 @@ | ||
// Copyright 2022 PingCAP, Ltd. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#pragma once | ||
|
||
#include <Columns/ColumnString.h> | ||
#include <Core/AccurateComparison.h> | ||
#include <Functions/StringUtil.h> | ||
#include <common/StringRef.h> | ||
#include <common/defines.h> | ||
|
||
#include <cstddef> | ||
#include <string_view> | ||
|
||
|
||
namespace DB | ||
{ | ||
|
||
template <typename T> | ||
ALWAYS_INLINE inline int signum(T val) | ||
{ | ||
return (0 < val) - (val < 0); | ||
} | ||
|
||
// Check equality is much faster than other comparison. | ||
// - check size first | ||
// - return 0 if equal else 1 | ||
__attribute__((flatten, always_inline, pure)) inline uint8_t RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs) | ||
{ | ||
return StringRef(lhs) == StringRef(rhs) ? 0 : 1; | ||
} | ||
|
||
// Compare str view by memcmp | ||
__attribute__((flatten, always_inline, pure)) inline int RawStrCompare(const std::string_view & v1, const std::string_view & v2) | ||
{ | ||
return signum(v1.compare(v2)); | ||
} | ||
|
||
// Strip trailing space characters (' ' only) from a view.
// Returns an empty view when the input is empty or consists solely of spaces;
// otherwise returns a prefix of `v` ending at its last non-space character.
__attribute__((flatten, always_inline, pure)) inline std::string_view RightTrim(const std::string_view & v)
{
    const size_t last_kept = v.find_last_not_of(' ');
    if (last_kept == std::string_view::npos)
        return "";
    return v.substr(0, last_kept + 1);
}
|
||
__attribute__((flatten, always_inline, pure)) inline int RtrimeStrCompare(const std::string_view & va, const std::string_view & vb) | ||
{ | ||
return RawStrCompare(RightTrim(va), RightTrim(vb)); | ||
} | ||
|
||
// If true, only need to check equal or not. | ||
template <typename T> | ||
struct IsEqualRelated | ||
{ | ||
static constexpr const bool value = false; | ||
}; | ||
|
||
// For `EqualsOp` and `NotEqualsOp`, value is true. | ||
template <typename... A> | ||
struct IsEqualRelated<DB::EqualsOp<A...>> | ||
{ | ||
static constexpr const bool value = true; | ||
}; | ||
template <typename... A> | ||
struct IsEqualRelated<DB::NotEqualsOp<A...>> | ||
{ | ||
static constexpr const bool value = true; | ||
}; | ||
|
||
// Return true if any str has tail space | ||
__attribute__((always_inline, pure)) inline bool HasTailSpace( | ||
const ColumnString::Chars_t & a_data, | ||
const ColumnString::Offsets & a_offsets, | ||
size_t size) | ||
{ | ||
bool has_tail_space = false; | ||
|
||
// #pragma clang loop vectorize(enable) | ||
for (size_t i = 0; i < size; ++i) | ||
{ | ||
const auto * ptr = reinterpret_cast<const char *>(&a_data[StringUtil::offsetAt(a_offsets, i)]); | ||
auto size = StringUtil::sizeAt(a_offsets, i) - 1; | ||
auto pos = size > 0 ? size - 1 : 0; // if size is 0, use the last pos which aways contains a '\0' | ||
has_tail_space |= ptr[pos] == ' '; | ||
solotzg marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
return has_tail_space; | ||
} | ||
|
||
|
||
// Loop columns and invoke callback for each pair. | ||
template <typename F> | ||
__attribute__((flatten, always_inline)) inline void LoopTwoColumns( | ||
const ColumnString::Chars_t & a_data, | ||
const ColumnString::Offsets & a_offsets, | ||
const ColumnString::Chars_t & b_data, | ||
const ColumnString::Offsets & b_offsets, | ||
size_t size, | ||
F && func) | ||
{ | ||
for (size_t i = 0; i < size; ++i) | ||
{ | ||
size_t a_size = StringUtil::sizeAt(a_offsets, i) - 1; | ||
size_t b_size = StringUtil::sizeAt(b_offsets, i) - 1; | ||
const auto * a_ptr = reinterpret_cast<const char *>(&a_data[StringUtil::offsetAt(a_offsets, i)]); | ||
const auto * b_ptr = reinterpret_cast<const char *>(&b_data[StringUtil::offsetAt(b_offsets, i)]); | ||
|
||
func({a_ptr, a_size}, {b_ptr, b_size}, i); | ||
} | ||
} | ||
|
||
// Loop one column and invoke callback for each pair. | ||
template <typename F> | ||
__attribute__((flatten, always_inline)) inline void LoopOneColumn( | ||
const ColumnString::Chars_t & a_data, | ||
const ColumnString::Offsets & a_offsets, | ||
size_t size, | ||
F && func) | ||
{ | ||
for (size_t i = 0; i < size; ++i) | ||
{ | ||
size_t a_size = StringUtil::sizeAt(a_offsets, i) - 1; | ||
const auto * a_ptr = reinterpret_cast<const char *>(&a_data[StringUtil::offsetAt(a_offsets, i)]); | ||
|
||
func({a_ptr, a_size}, i); | ||
} | ||
} | ||
|
||
// Handle str-column compare str-column. | ||
// - Optimize UTF8_BIN and UTF8MB4_BIN | ||
// - Check if columns don NOT contains tail space | ||
solotzg marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// - If Op is `EqualsOp` or `NotEqualsOp`, optimize comparison by faster way | ||
template <typename Op, typename Result> | ||
ALWAYS_INLINE inline bool StringVectorStringVector( | ||
const ColumnString::Chars_t & a_data, | ||
const ColumnString::Offsets & a_offsets, | ||
const ColumnString::Chars_t & b_data, | ||
const ColumnString::Offsets & b_offsets, | ||
const TiDB::TiDBCollatorPtr & collator, | ||
Result & c) | ||
{ | ||
bool use_optimized_path = false; | ||
|
||
switch (collator->getCollatorId()) | ||
{ | ||
case TiDB::ITiDBCollator::UTF8MB4_BIN: | ||
case TiDB::ITiDBCollator::UTF8_BIN: | ||
{ | ||
size_t size = a_offsets.size(); | ||
|
||
if (unlikely(HasTailSpace(a_data, a_offsets, size) || HasTailSpace(b_data, b_offsets, size))) | ||
{ | ||
// if any col has any str with tail space, trim it ans compare | ||
LoopTwoColumns(a_data, a_offsets, b_data, b_offsets, size, [&c](const std::string_view & va, const std::string_view & vb, size_t i) { | ||
if constexpr (IsEqualRelated<Op>::value) | ||
{ | ||
c[i] = Op::apply(RawStrEqualCompare(RightTrim(va), RightTrim(vb)), 0); | ||
} | ||
else | ||
{ | ||
c[i] = Op::apply(RtrimeStrCompare(va, vb), 0); | ||
} | ||
}); | ||
} | ||
else | ||
{ | ||
// in most case, string will not contain tail space | ||
LoopTwoColumns(a_data, a_offsets, b_data, b_offsets, size, [&c](const std::string_view & va, const std::string_view & vb, size_t i) { | ||
if constexpr (IsEqualRelated<Op>::value) | ||
{ | ||
c[i] = Op::apply(RawStrEqualCompare(va, vb), 0); | ||
} | ||
else | ||
{ | ||
c[i] = Op::apply(RawStrCompare(va, vb), 0); | ||
} | ||
}); | ||
} | ||
|
||
use_optimized_path = true; | ||
|
||
break; | ||
} | ||
default: | ||
break; | ||
} | ||
return use_optimized_path; | ||
} | ||
|
||
// Handle str-column compare const-str. | ||
// - Optimize UTF8_BIN and UTF8MB4_BIN | ||
// - Right trim const-str first | ||
// - Check if column don NOT contains tail space | ||
solotzg marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// - If Op is `EqualsOp` or `NotEqualsOp`, optimize comparison by faster way | ||
template <typename Op, typename Result> | ||
ALWAYS_INLINE inline bool StringVectorConstant( | ||
const ColumnString::Chars_t & a_data, | ||
const ColumnString::Offsets & a_offsets, | ||
const std::string_view & b, | ||
const TiDB::TiDBCollatorPtr & collator, | ||
Result & c) | ||
{ | ||
bool use_optimized_path = false; | ||
|
||
switch (collator->getCollatorId()) | ||
{ | ||
case TiDB::ITiDBCollator::UTF8MB4_BIN: | ||
case TiDB::ITiDBCollator::UTF8_BIN: | ||
{ | ||
size_t size = a_offsets.size(); | ||
|
||
std::string_view tar_str_view = RightTrim(b); // right trim const-str first | ||
|
||
if (likely(!HasTailSpace(a_data, a_offsets, size))) | ||
{ | ||
LoopOneColumn(a_data, a_offsets, size, [&c, &tar_str_view](const std::string_view & view, size_t i) { | ||
if constexpr (IsEqualRelated<Op>::value) | ||
{ | ||
c[i] = Op::apply(RawStrEqualCompare(view, tar_str_view), 0); | ||
} | ||
else | ||
{ | ||
c[i] = Op::apply(RawStrCompare(view, tar_str_view), 0); | ||
} | ||
}); | ||
} | ||
else | ||
{ | ||
LoopOneColumn(a_data, a_offsets, size, [&c, &tar_str_view](const std::string_view & view, size_t i) { | ||
if constexpr (IsEqualRelated<Op>::value) | ||
{ | ||
c[i] = Op::apply(RawStrEqualCompare(RightTrim(view), tar_str_view), 0); | ||
} | ||
else | ||
{ | ||
c[i] = Op::apply(RawStrCompare(RightTrim(view), tar_str_view), 0); | ||
} | ||
}); | ||
} | ||
use_optimized_path = true; | ||
break; | ||
} | ||
default: | ||
break; | ||
} | ||
return use_optimized_path; | ||
} | ||
|
||
} // namespace DB |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if other space characters need to be handled here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
According to MySQL::Doc::string-comparison-functions.