diff --git a/dbms/src/Functions/CollationOperatorOptimized.h b/dbms/src/Functions/CollationOperatorOptimized.h index 3eed2ec2965..4651e8981bc 100644 --- a/dbms/src/Functions/CollationOperatorOptimized.h +++ b/dbms/src/Functions/CollationOperatorOptimized.h @@ -20,11 +20,11 @@ #include #include #include -#include #include #include + namespace DB { @@ -50,7 +50,7 @@ struct IsEqualRelated> // Loop columns and invoke callback for each pair. // Remove last zero byte. template -FLATTEN_INLINE inline void LoopTwoColumns( +__attribute__((flatten, always_inline)) inline void LoopTwoColumns( const ColumnString::Chars_t & a_data, const ColumnString::Offsets & a_offsets, const ColumnString::Chars_t & b_data, @@ -79,7 +79,7 @@ FLATTEN_INLINE inline void LoopTwoColumns( // Loop one column and invoke callback for each pair. // Remove last zero byte. template -FLATTEN_INLINE inline void LoopOneColumn( +__attribute__((flatten, always_inline)) inline void LoopOneColumn( const ColumnString::Chars_t & a_data, const ColumnString::Offsets & a_offsets, size_t size, @@ -97,23 +97,6 @@ FLATTEN_INLINE inline void LoopOneColumn( } } -template -FLATTEN_INLINE inline void LoopOneColumnCmpEqFixedStr( - const ColumnString::Chars_t & a_data, - const ColumnString::Offsets & a_offsets, - const char * src, - Result & c) -{ - LoopOneColumn(a_data, a_offsets, a_offsets.size(), [&](std::string_view view, size_t i) { - if constexpr (trim) - view = RightTrim(view); - auto res = 1; - if (view.size() == n) - res = mem_utils::memcmp_eq_fixed_size(view.data(), src) ? 0 : 1; - c[i] = Op::apply(res, 0); - }); -} - // Handle str-column compare str-column. // - Optimize bin collator // - Check if columns do NOT contain tail space @@ -192,6 +175,8 @@ ALWAYS_INLINE inline bool CompareStringVectorConstant( const TiDB::TiDBCollatorPtr & collator, Result & c) { + bool use_optimized_path = false; + switch (collator->getCollatorType()) { case TiDB::ITiDBCollator::CollatorType::UTF8MB4_BIN: @@ -199,46 +184,11 @@ ALWAYS_INLINE inline bool CompareStringVectorConstant( case TiDB::ITiDBCollator::CollatorType::LATIN1_BIN: case TiDB::ITiDBCollator::CollatorType::ASCII_BIN: { - std::string_view tar_str_view = RightTrim(b); // right trim const-str first - - if constexpr (IsEqualRelated::value) - { -#ifdef M - static_assert(false, "`M` is defined"); -#endif -#define M(k) \ - case k: \ - { \ - LoopOneColumnCmpEqFixedStr(a_data, a_offsets, tar_str_view.data(), c); \ - return true; \ - } + size_t size = a_offsets.size(); - switch (tar_str_view.size()) - { - M(0); - M(1); - M(2); - M(3); - M(4); - M(5); - M(6); - M(7); - M(8); - M(9); - M(10); - M(11); - M(12); - M(13); - M(14); - M(15); - M(16); - default: - break; - } -#undef M - } + std::string_view tar_str_view = RightTrim(b); // right trim const-str first - LoopOneColumn(a_data, a_offsets, a_offsets.size(), [&c, &tar_str_view](const std::string_view & view, size_t i) { + LoopOneColumn(a_data, a_offsets, size, [&c, &tar_str_view](const std::string_view & view, size_t i) { if constexpr (IsEqualRelated::value) { c[i] = Op::apply(RawStrEqualCompare(RightTrim(view), tar_str_view), 0); @@ -249,48 +199,13 @@ ALWAYS_INLINE inline bool CompareStringVectorConstant( } }); - return true; + use_optimized_path = true; + break; } case TiDB::ITiDBCollator::CollatorType::BINARY: { - if constexpr (IsEqualRelated::value) - { -#ifdef M - static_assert(false, "`M` is defined"); -#endif -#define M(k) \ - case k: \ - { \ - LoopOneColumnCmpEqFixedStr(a_data, a_offsets, b.data(), c); \ - return true; \ - } - - switch (b.size()) - { - M(0); - M(1); - M(2); - M(3); - M(4); - M(5); - M(6); - M(7); - M(8); - M(9); - M(10); - M(11); - M(12); - M(13); - M(14); - M(15); - M(16); - default: - break; - } -#undef M - } - - LoopOneColumn(a_data, a_offsets, a_offsets.size(), [&c, &b](const std::string_view & view, size_t i) { + size_t size = a_offsets.size(); + LoopOneColumn(a_data, a_offsets, size, [&c, &b](const std::string_view & view, size_t i) { if constexpr (IsEqualRelated::value) { c[i] = Op::apply(RawStrEqualCompare((view), b), 0); @@ -301,12 +216,13 @@ ALWAYS_INLINE inline bool CompareStringVectorConstant( } }); - return true; + use_optimized_path = true; + break; } default: break; } - return false; + return use_optimized_path; } } // namespace DB diff --git a/dbms/src/Storages/Transaction/CollatorUtils.h b/dbms/src/Storages/Transaction/CollatorUtils.h index c757ade8043..3f318a5b700 100644 --- a/dbms/src/Storages/Transaction/CollatorUtils.h +++ b/dbms/src/Storages/Transaction/CollatorUtils.h @@ -19,6 +19,8 @@ #include +#define FLATTEN_INLINE_PURE __attribute__((flatten, always_inline, pure)) + namespace DB { diff --git a/libs/libcommon/include/common/defines.h b/libs/libcommon/include/common/defines.h index 7996bb7a579..ff79a4d2077 100644 --- a/libs/libcommon/include/common/defines.h +++ b/libs/libcommon/include/common/defines.h @@ -50,14 +50,10 @@ # define ALWAYS_INLINE __forceinline # define NO_INLINE static __declspec(noinline) # define MAY_ALIAS -# define FLATTEN_INLINE_PURE -# define FLATTEN_INLINE #else # define ALWAYS_INLINE __attribute__((__always_inline__)) # define NO_INLINE __attribute__((__noinline__)) # define MAY_ALIAS __attribute__((__may_alias__)) -# define FLATTEN_INLINE_PURE __attribute__((flatten, always_inline, pure)) -# define FLATTEN_INLINE __attribute__((flatten, always_inline)) #endif #if !defined(__x86_64__) && !defined(__aarch64__) && !defined(__PPC__) diff --git a/libs/libcommon/include/common/fixed_mem_eq.h b/libs/libcommon/include/common/fixed_mem_eq.h deleted file mode 100644 index 85691008c6d..00000000000 --- a/libs/libcommon/include/common/fixed_mem_eq.h +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright 2022 PingCAP, Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include -#include -#include - -namespace mem_utils -{ - -FLATTEN_INLINE_PURE -constexpr inline bool memcmp_eq0(const char *, const char *) -{ - return true; -} - -FLATTEN_INLINE_PURE -inline bool memcmp_eq1(const char * a, const char * b) -{ - return a[0] == b[0]; -} - -FLATTEN_INLINE_PURE -inline bool memcmp_eq2(const char * a, const char * b) -{ - const uint16_t ax = *reinterpret_cast(a); - const uint16_t bx = *reinterpret_cast(b); - return ax == bx; -} - -/* -If use `&&` and cpu failed to predict `cmp -> jne`, the pipeline will be broken. - - movzx eax, word ptr [rdi] - cmp ax, word ptr [rsi] - jne ... - movzx eax, word ptr [rdi + 1] - cmp ax, word ptr [rsi + 1] - sete al - ret - -Use `&` to reduce unnecessary branch. Instructions like (1) and (2) are independent(same for (3) and (4)), it's friendly for parallelism. - - movzx eax, word ptr [rdi] // (1) - movzx ecx, word ptr [rdi + 1] // (2) - xor ax, word ptr [rsi] // (3) - xor cx, word ptr [rsi + 1] // (4) - or cx, ax - sete al - ret -*/ -FLATTEN_INLINE_PURE -inline bool memcmp_eq3(const char * a, const char * b) -{ - return memcmp_eq2(a, b) & memcmp_eq2(a + 1, b + 1); -} - -FLATTEN_INLINE_PURE -inline bool memcmp_eq4(const char * a, const char * b) -{ - const uint32_t ax = *reinterpret_cast(a); - const uint32_t bx = *reinterpret_cast(b); - return ax == bx; -} - -FLATTEN_INLINE_PURE -inline bool memcmp_eq8(const char * a, const char * b) -{ - const uint64_t ax = *reinterpret_cast(a); - const uint64_t bx = *reinterpret_cast(b); - return ax == bx; -} - -// check memory equal of two pointers in fixed size -template -ALWAYS_INLINE inline bool memcmp_eq_fixed_size(const char * a, const char * b) -{ -#ifdef M - static_assert(false, "`M` is defined"); -#else -#define M(s) \ - if constexpr (k == (s)) \ - { \ - return memcmp_eq##s(a, b); \ - } -#endif - - static_assert(k >= 0); - - if constexpr (k >= 16) - { - /* - For x86-64 clang 13.0.0 with options `-O3 -msse4.2`, `std::memcmp(.. , .. , 16)` will be translated to - - movdqu xmm0, xmmword ptr [rdi] - movdqu xmm1, xmmword ptr [rsi] - pxor xmm1, xmm0 - ptest xmm1, xmm1 - sete al - ret - - with options `-O3 -mavx2`, it will be - - vmovdqu xmm0, xmmword ptr [rdi] - vpxor xmm0, xmm0, xmmword ptr [rsi] - vptest xmm0, xmm0 - sete al - ret - - */ - return std::memcmp(a, b, k) == 0; - } - else if constexpr (k > 8) - { - /* - if use `std::memcmp(.. , .. , 9)`, it will be - x86-64 - mov rax, qword ptr [rdi] - xor rax, qword ptr [rsi] - mov cl, byte ptr [rdi + 8] - xor cl, byte ptr [rsi + 8] - movzx ecx, cl - or rcx, rax - sete al - ret - - arm-64 v8 - ldr x8, [x0] - ldr x9, [x1] - ldrb w10, [x0, #8] - ldrb w11, [x1, #8] - eor x8, x8, x9 - eor w9, w10, w11 - and x9, x9, #0xff - orr x8, x8, x9 - cmp x8, #0 - cset w0, eq - ret - - - Make operator fetch same size of memory to reduce instructions and get better parallelism. - - x86-64 - mov rax, qword ptr [rdi] - mov rcx, qword ptr [rdi + 1] - xor rax, qword ptr [rsi] - xor rcx, qword ptr [rsi + 1] - or rcx, rax - sete al - ret - - arm-64 v8 - ldr x8, [x0] - ldr x9, [x1] - ldur x10, [x0, #1] - ldur x11, [x1, #1] - cmp x8, x9 - cset w8, eq - cmp x10, x11 - cset w9, eq - and w0, w8, w9 - ret - - */ - return memcmp_eq8(a, b) & memcmp_eq8(a + k - 8, b + k - 8); - } - else if constexpr (k > 4) - { - if constexpr (k == 8) - return memcmp_eq8(a, b); - else - return memcmp_eq4(a, b) & memcmp_eq4(a + k - 4, b + k - 4); - } - else if constexpr (k > 2) - { - M(3); - M(4); - } - else - { - M(1); - M(2); - M(0); - } -#undef M -} - -} // namespace mem_utils