From 4ab1564a17c2ed8bd78b5255f721def878fdd296 Mon Sep 17 00:00:00 2001 From: Zhigao Tong Date: Tue, 16 Aug 2022 12:24:51 +0800 Subject: [PATCH] Optimize equality comparison for small str with fixed size (#5569) ref pingcap/tiflash#5294 --- .../Functions/CollationOperatorOptimized.h | 114 ++++++++-- dbms/src/Storages/Transaction/CollatorUtils.h | 2 - libs/libcommon/include/common/defines.h | 4 + libs/libcommon/include/common/fixed_mem_eq.h | 203 ++++++++++++++++++ 4 files changed, 306 insertions(+), 17 deletions(-) create mode 100644 libs/libcommon/include/common/fixed_mem_eq.h diff --git a/dbms/src/Functions/CollationOperatorOptimized.h b/dbms/src/Functions/CollationOperatorOptimized.h index 4651e8981bc..3eed2ec2965 100644 --- a/dbms/src/Functions/CollationOperatorOptimized.h +++ b/dbms/src/Functions/CollationOperatorOptimized.h @@ -20,11 +20,11 @@ #include #include #include +#include #include #include - namespace DB { @@ -50,7 +50,7 @@ struct IsEqualRelated> // Loop columns and invoke callback for each pair. // Remove last zero byte. template -__attribute__((flatten, always_inline)) inline void LoopTwoColumns( +FLATTEN_INLINE inline void LoopTwoColumns( const ColumnString::Chars_t & a_data, const ColumnString::Offsets & a_offsets, const ColumnString::Chars_t & b_data, @@ -79,7 +79,7 @@ __attribute__((flatten, always_inline)) inline void LoopTwoColumns( // Loop one column and invoke callback for each pair. // Remove last zero byte. 
template -__attribute__((flatten, always_inline)) inline void LoopOneColumn( +FLATTEN_INLINE inline void LoopOneColumn( const ColumnString::Chars_t & a_data, const ColumnString::Offsets & a_offsets, size_t size, @@ -97,6 +97,23 @@ __attribute__((flatten, always_inline)) inline void LoopOneColumn( } } +template +FLATTEN_INLINE inline void LoopOneColumnCmpEqFixedStr( + const ColumnString::Chars_t & a_data, + const ColumnString::Offsets & a_offsets, + const char * src, + Result & c) +{ + LoopOneColumn(a_data, a_offsets, a_offsets.size(), [&](std::string_view view, size_t i) { + if constexpr (trim) + view = RightTrim(view); + auto res = 1; + if (view.size() == n) + res = mem_utils::memcmp_eq_fixed_size(view.data(), src) ? 0 : 1; + c[i] = Op::apply(res, 0); + }); +} + // Handle str-column compare str-column. // - Optimize bin collator // - Check if columns do NOT contain tail space @@ -175,8 +192,6 @@ ALWAYS_INLINE inline bool CompareStringVectorConstant( const TiDB::TiDBCollatorPtr & collator, Result & c) { - bool use_optimized_path = false; - switch (collator->getCollatorType()) { case TiDB::ITiDBCollator::CollatorType::UTF8MB4_BIN: @@ -184,11 +199,46 @@ ALWAYS_INLINE inline bool CompareStringVectorConstant( case TiDB::ITiDBCollator::CollatorType::LATIN1_BIN: case TiDB::ITiDBCollator::CollatorType::ASCII_BIN: { - size_t size = a_offsets.size(); - std::string_view tar_str_view = RightTrim(b); // right trim const-str first - LoopOneColumn(a_data, a_offsets, size, [&c, &tar_str_view](const std::string_view & view, size_t i) { + if constexpr (IsEqualRelated::value) + { +#ifdef M + static_assert(false, "`M` is defined"); +#endif +#define M(k) \ + case k: \ + { \ + LoopOneColumnCmpEqFixedStr(a_data, a_offsets, tar_str_view.data(), c); \ + return true; \ + } + + switch (tar_str_view.size()) + { + M(0); + M(1); + M(2); + M(3); + M(4); + M(5); + M(6); + M(7); + M(8); + M(9); + M(10); + M(11); + M(12); + M(13); + M(14); + M(15); + M(16); + default: + break; + } +#undef M + } + 
+ LoopOneColumn(a_data, a_offsets, a_offsets.size(), [&c, &tar_str_view](const std::string_view & view, size_t i) { if constexpr (IsEqualRelated::value) { c[i] = Op::apply(RawStrEqualCompare(RightTrim(view), tar_str_view), 0); @@ -199,13 +249,48 @@ ALWAYS_INLINE inline bool CompareStringVectorConstant( } }); - use_optimized_path = true; - break; + return true; } case TiDB::ITiDBCollator::CollatorType::BINARY: { - size_t size = a_offsets.size(); - LoopOneColumn(a_data, a_offsets, size, [&c, &b](const std::string_view & view, size_t i) { + if constexpr (IsEqualRelated::value) + { +#ifdef M + static_assert(false, "`M` is defined"); +#endif +#define M(k) \ + case k: \ + { \ + LoopOneColumnCmpEqFixedStr(a_data, a_offsets, b.data(), c); \ + return true; \ + } + + switch (b.size()) + { + M(0); + M(1); + M(2); + M(3); + M(4); + M(5); + M(6); + M(7); + M(8); + M(9); + M(10); + M(11); + M(12); + M(13); + M(14); + M(15); + M(16); + default: + break; + } +#undef M + } + + LoopOneColumn(a_data, a_offsets, a_offsets.size(), [&c, &b](const std::string_view & view, size_t i) { if constexpr (IsEqualRelated::value) { c[i] = Op::apply(RawStrEqualCompare((view), b), 0); @@ -216,13 +301,12 @@ ALWAYS_INLINE inline bool CompareStringVectorConstant( } }); - use_optimized_path = true; - break; + return true; } default: break; } - return use_optimized_path; + return false; } } // namespace DB diff --git a/dbms/src/Storages/Transaction/CollatorUtils.h b/dbms/src/Storages/Transaction/CollatorUtils.h index 3f318a5b700..c757ade8043 100644 --- a/dbms/src/Storages/Transaction/CollatorUtils.h +++ b/dbms/src/Storages/Transaction/CollatorUtils.h @@ -19,8 +19,6 @@ #include -#define FLATTEN_INLINE_PURE __attribute__((flatten, always_inline, pure)) - namespace DB { diff --git a/libs/libcommon/include/common/defines.h b/libs/libcommon/include/common/defines.h index ff79a4d2077..7996bb7a579 100644 --- a/libs/libcommon/include/common/defines.h +++ b/libs/libcommon/include/common/defines.h @@ -50,10 +50,14 
@@ # define ALWAYS_INLINE __forceinline # define NO_INLINE static __declspec(noinline) # define MAY_ALIAS +# define FLATTEN_INLINE_PURE +# define FLATTEN_INLINE #else # define ALWAYS_INLINE __attribute__((__always_inline__)) # define NO_INLINE __attribute__((__noinline__)) # define MAY_ALIAS __attribute__((__may_alias__)) +# define FLATTEN_INLINE_PURE __attribute__((flatten, always_inline, pure)) +# define FLATTEN_INLINE __attribute__((flatten, always_inline)) #endif #if !defined(__x86_64__) && !defined(__aarch64__) && !defined(__PPC__) diff --git a/libs/libcommon/include/common/fixed_mem_eq.h b/libs/libcommon/include/common/fixed_mem_eq.h new file mode 100644 index 00000000000..85691008c6d --- /dev/null +++ b/libs/libcommon/include/common/fixed_mem_eq.h @@ -0,0 +1,203 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include + +namespace mem_utils +{ + +FLATTEN_INLINE_PURE +constexpr inline bool memcmp_eq0(const char *, const char *) +{ + return true; +} + +FLATTEN_INLINE_PURE +inline bool memcmp_eq1(const char * a, const char * b) +{ + return a[0] == b[0]; +} + +FLATTEN_INLINE_PURE +inline bool memcmp_eq2(const char * a, const char * b) +{ + const uint16_t ax = *reinterpret_cast(a); + const uint16_t bx = *reinterpret_cast(b); + return ax == bx; +} + +/* +If use `&&` and cpu failed to predict `cmp -> jne`, the pipeline will be broken. 
+ + movzx eax, word ptr [rdi] + cmp ax, word ptr [rsi] + jne ... + movzx eax, word ptr [rdi + 1] + cmp ax, word ptr [rsi + 1] + sete al + ret + +Use `&` to avoid the unnecessary branch. Instructions (1) and (2) are independent of each other (as are (3) and (4)), which is friendly to parallelism. + + movzx eax, word ptr [rdi] // (1) + movzx ecx, word ptr [rdi + 1] // (2) + xor ax, word ptr [rsi] // (3) + xor cx, word ptr [rsi + 1] // (4) + or cx, ax + sete al + ret +*/ +FLATTEN_INLINE_PURE +inline bool memcmp_eq3(const char * a, const char * b) +{ + return memcmp_eq2(a, b) & memcmp_eq2(a + 1, b + 1); +} + +FLATTEN_INLINE_PURE +inline bool memcmp_eq4(const char * a, const char * b) +{ + const uint32_t ax = *reinterpret_cast(a); + const uint32_t bx = *reinterpret_cast(b); + return ax == bx; +} + +FLATTEN_INLINE_PURE +inline bool memcmp_eq8(const char * a, const char * b) +{ + const uint64_t ax = *reinterpret_cast(a); + const uint64_t bx = *reinterpret_cast(b); + return ax == bx; +} + +// Check whether the memory at two pointers is equal over a fixed size +template +ALWAYS_INLINE inline bool memcmp_eq_fixed_size(const char * a, const char * b) +{ +#ifdef M + static_assert(false, "`M` is defined"); +#else +#define M(s) \ + if constexpr (k == (s)) \ + { \ + return memcmp_eq##s(a, b); \ + } +#endif + + static_assert(k >= 0); + + if constexpr (k >= 16) + { + /* + For x86-64 clang 13.0.0 with options `-O3 -msse4.2`, `std::memcmp(.. , .. , 16)` will be translated to + + movdqu xmm0, xmmword ptr [rdi] + movdqu xmm1, xmmword ptr [rsi] + pxor xmm1, xmm0 + ptest xmm1, xmm1 + sete al + ret + + with options `-O3 -mavx2`, it will be + + vmovdqu xmm0, xmmword ptr [rdi] + vpxor xmm0, xmm0, xmmword ptr [rsi] + vptest xmm0, xmm0 + sete al + ret + + */ + return std::memcmp(a, b, k) == 0; + } + else if constexpr (k > 8) + { + /* + if use `std::memcmp(.. , .. 
, 9)`, it will be + x86-64 + mov rax, qword ptr [rdi] + xor rax, qword ptr [rsi] + mov cl, byte ptr [rdi + 8] + xor cl, byte ptr [rsi + 8] + movzx ecx, cl + or rcx, rax + sete al + ret + + arm-64 v8 + ldr x8, [x0] + ldr x9, [x1] + ldrb w10, [x0, #8] + ldrb w11, [x1, #8] + eor x8, x8, x9 + eor w9, w10, w11 + and x9, x9, #0xff + orr x8, x8, x9 + cmp x8, #0 + cset w0, eq + ret + + + Make every operation fetch the same size of memory to reduce the number of instructions and get better parallelism. + + x86-64 + mov rax, qword ptr [rdi] + mov rcx, qword ptr [rdi + 1] + xor rax, qword ptr [rsi] + xor rcx, qword ptr [rsi + 1] + or rcx, rax + sete al + ret + + arm-64 v8 + ldr x8, [x0] + ldr x9, [x1] + ldur x10, [x0, #1] + ldur x11, [x1, #1] + cmp x8, x9 + cset w8, eq + cmp x10, x11 + cset w9, eq + and w0, w8, w9 + ret + + */ + return memcmp_eq8(a, b) & memcmp_eq8(a + k - 8, b + k - 8); + } + else if constexpr (k > 4) + { + if constexpr (k == 8) + return memcmp_eq8(a, b); + else + return memcmp_eq4(a, b) & memcmp_eq4(a + k - 4, b + k - 4); + } + else if constexpr (k > 2) + { + M(3); + M(4); + } + else + { + M(1); + M(2); + M(0); + } +#undef M +} + +} // namespace mem_utils