From 2133f4fac5e00851a62aeaaa12659a96f8ddd0cb Mon Sep 17 00:00:00 2001 From: xzhangxian1008 Date: Fri, 17 Feb 2023 13:42:01 +0800 Subject: [PATCH] Support `ilike` function (#6741) close pingcap/tiflash#6740 --- .gitignore | 2 + contrib/tipb | 2 +- dbms/src/Common/StringUtils/StringUtils.h | 5 + dbms/src/Flash/Coprocessor/DAGUtils.cpp | 1 + dbms/src/Functions/FunctionsStringSearch.cpp | 5 +- dbms/src/Functions/FunctionsStringSearch.h | 94 ++++- .../Functions/tests/bench_function_ilike.cpp | 170 ++++++++ .../Functions/tests/gtest_strings_search.cpp | 373 ++++++++++++++++-- 8 files changed, 624 insertions(+), 28 deletions(-) create mode 100644 dbms/src/Functions/tests/bench_function_ilike.cpp diff --git a/.gitignore b/.gitignore index 4d7cd638b6f..8f99a2cb33c 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,8 @@ compile_commands.json *.logrt /build +/build-debug +/build-release /docs/en_single_page/ /docs/ru_single_page/ /docs/venv/ diff --git a/contrib/tipb b/contrib/tipb index f8511db2f07..cf83fcb61a3 160000 --- a/contrib/tipb +++ b/contrib/tipb @@ -1 +1 @@ -Subproject commit f8511db2f07273a60d3bbe3e202c28bbdf69ce50 +Subproject commit cf83fcb61a3b749f38dcfe8a31e157fa33ca206e diff --git a/dbms/src/Common/StringUtils/StringUtils.h b/dbms/src/Common/StringUtils/StringUtils.h index cd5e56a82dc..689b940a9e5 100644 --- a/dbms/src/Common/StringUtils/StringUtils.h +++ b/dbms/src/Common/StringUtils/StringUtils.h @@ -88,6 +88,11 @@ inline bool isASCII(char c) return static_cast(c) < 0x80; } +inline bool isUpperAlphaASCII(char c) +{ + return (c >= 'A' && c <= 'Z'); +} + inline bool isAlphaASCII(char c) { return (c >= 'a' && c <= 'z') diff --git a/dbms/src/Flash/Coprocessor/DAGUtils.cpp b/dbms/src/Flash/Coprocessor/DAGUtils.cpp index a2165f3159b..83563c47338 100755 --- a/dbms/src/Flash/Coprocessor/DAGUtils.cpp +++ b/dbms/src/Flash/Coprocessor/DAGUtils.cpp @@ -430,6 +430,7 @@ const std::unordered_map scalar_func_map({ {tipb::ScalarFuncSig::IsIPv6, "tiDBIsIPv6"}, //{tipb::ScalarFuncSig::UUID, "cast"}, + {tipb::ScalarFuncSig::IlikeSig, "ilike3Args"}, {tipb::ScalarFuncSig::LikeSig, "like3Args"}, {tipb::ScalarFuncSig::RegexpSig, "regexp"}, {tipb::ScalarFuncSig::RegexpUTF8Sig, "regexp"}, diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index 0e6d78e48c8..a69a19039ec 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -341,7 +341,7 @@ struct PositionImpl }; /// Is the LIKE expression reduced to finding a substring in a string? -inline bool likePatternIsStrstr(const String & pattern, String & res) +bool likePatternIsStrstr(const String & pattern, String & res) { res = ""; @@ -1514,7 +1514,6 @@ struct NameMatch { static constexpr auto name = "match"; }; - struct NameLike3Args { static constexpr auto name = "like3Args"; @@ -1545,6 +1544,7 @@ using FunctionPositionCaseInsensitiveUTF8 using FunctionMatch = FunctionsStringSearch, NameMatch>; using FunctionLike = FunctionsStringSearch, NameLike>; using FunctionLike3Args = FunctionsStringSearch, NameLike3Args>; +using FunctionIlike3Args = FunctionsStringSearch, NameIlike3Args>; using FunctionNotLike = FunctionsStringSearch, NameNotLike>; using FunctionExtract = FunctionsStringSearchToString; using FunctionReplaceOne = FunctionStringReplace, NameReplaceOne>; @@ -1561,6 +1561,7 @@ void registerFunctionsStringSearch(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); } diff --git a/dbms/src/Functions/FunctionsStringSearch.h b/dbms/src/Functions/FunctionsStringSearch.h index d8db1b7c356..dc9bccd10f6 100644 --- a/dbms/src/Functions/FunctionsStringSearch.h +++ b/dbms/src/Functions/FunctionsStringSearch.h @@ -16,13 +16,84 @@ #include #include +#include +#include +#include +#include +#include #include #include #include #include +#include +#include + +#include +#include namespace DB { +using Chars_t = ColumnString::Chars_t; +using Offsets = ColumnString::Offsets; + +struct IlikeHelper +{ + static void lowerStrings(Chars_t & chars) + { + size_t size = chars.size(); + size_t i = 0; + while (i < size) + { + if (isUpperAlphaASCII(chars[i])) + { + chars[i] = toLowerIfAlphaASCII(chars[i]); + ++i; + } + else + { + size_t utf8_len = UTF8::seqLength(chars[i]); + i += utf8_len; + } + } + } + + static void lowerColumnConst(ColumnConst * lowered_col_const) + { + auto * col_data = typeid_cast(&lowered_col_const->getDataColumn()); + RUNTIME_ASSERT(col_data != nullptr, "Invalid column type, should be ColumnString"); + + lowerStrings(col_data->getChars()); + } + + static void lowerColumnString(MutableColumnPtr & col) + { + auto * col_vector = typeid_cast(&*col); + RUNTIME_ASSERT(col_vector != nullptr, "Invalid column type, should be ColumnString"); + + lowerStrings(col_vector->getChars()); + } + + // Only lower the 'A', 'B', 'C'... + static void lowerAlphaASCII(Block & block, const ColumnNumbers & arguments) + { + MutableColumnPtr column_haystack = block.getByPosition(arguments[0]).column->assumeMutable(); + MutableColumnPtr column_needle = block.getByPosition(arguments[1]).column->assumeMutable(); + + auto * col_haystack_const = typeid_cast(&*column_haystack); + auto * col_needle_const = typeid_cast(&*column_needle); + + if (col_haystack_const != nullptr) + lowerColumnConst(col_haystack_const); + else + lowerColumnString(column_haystack); + + if (col_needle_const != nullptr) + lowerColumnConst(col_needle_const); + else + lowerColumnString(column_needle); + } +}; + /** Search and replace functions in strings: * * position(haystack, needle) - the normal search for a substring in a string, returns the position (in bytes) of the found substring starting with 1, or 0 if no substring is found. @@ -57,6 +128,11 @@ extern const int ILLEGAL_COLUMN; static const UInt8 CH_ESCAPE_CHAR = '\\'; +struct NameIlike3Args +{ + static constexpr auto name = "ilike3Args"; +}; + template class FunctionsStringSearch : public IFunction { @@ -116,8 +192,20 @@ class FunctionsStringSearch : public IFunction return std::make_shared>(); } - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override + void executeImpl(Block & result_block, const ColumnNumbers & arguments, size_t result) const override { + auto block = result_block; + if constexpr (name == std::string_view(NameIlike3Args::name)) + { + if (!collator->isCI()) + { + block.getByPosition(arguments[0]).column = (*std::move(result_block.getByPosition(arguments[0]).column)).mutate(); + block.getByPosition(arguments[1]).column = (*std::move(result_block.getByPosition(arguments[1]).column)).mutate(); + + IlikeHelper::lowerAlphaASCII(block, arguments); + } + } + using ResultType = typename Impl::ResultType; const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column; @@ -170,7 +258,7 @@ class FunctionsStringSearch : public IFunction ResultType res{}; auto needle_string = col_needle_const->getValue(); Impl::constantConstant(col_haystack_const->getValue(), needle_string, escape_char, match_type, collator, res); - block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res)); + result_block.getByPosition(result).column = result_block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res)); return; } @@ -210,7 +298,7 @@ class FunctionsStringSearch : public IFunction + getName(), ErrorCodes::ILLEGAL_COLUMN); - block.getByPosition(result).column = std::move(col_res); + result_block.getByPosition(result).column = std::move(col_res); } private: diff --git a/dbms/src/Functions/tests/bench_function_ilike.cpp b/dbms/src/Functions/tests/bench_function_ilike.cpp new file mode 100644 index 00000000000..f6837649eab --- /dev/null +++ b/dbms/src/Functions/tests/bench_function_ilike.cpp @@ -0,0 +1,170 @@ +// Copyright 2023 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +/// this is a hack, include the cpp file so we can test MatchImpl directly +#include +#include // NOLINT + +namespace DB +{ +namespace tests +{ + +constexpr size_t data_num = 500000; + +class IlikeBench : public benchmark::Fixture +{ +public: + using ColStringType = typename TypeTraits::FieldType; + using ColUInt8Type = typename TypeTraits::FieldType; + + ColumnWithTypeAndName escape = createConstColumn(1, static_cast('\\')); + + ColumnsWithTypeAndName data1{ + toVec("col0", std::vector(data_num, "aaaaaaaaaaaaaaaaa")), + toVec("col1", std::vector(data_num, "aaaaaaaaaaaaaaaaa")), + escape}; + ColumnsWithTypeAndName data2{ + toVec("col0", std::vector(data_num, "AAAAAAAAAAAAAAAAA")), + toVec("col1", std::vector(data_num, "AAAAAAAAAAAAAAAAA")), + escape}; + ColumnsWithTypeAndName data3{ + toVec("col0", std::vector(data_num, "aAaAaAaAaAaAaAaAa")), + toVec("col1", std::vector(data_num, "aAaAaAaAaAaAaAaAa")), + escape}; + ColumnsWithTypeAndName data4{ + toVec("col0", std::vector(data_num, "嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯")), + toVec("col1", std::vector(data_num, "嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯")), + escape}; + ColumnsWithTypeAndName data5{ + toVec("col0", std::vector(data_num, "a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯")), + toVec("col1", std::vector(data_num, "a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯")), + escape}; + + void SetUp(const benchmark::State &) override {} +}; + +class LikeBench : public benchmark::Fixture +{ +public: + using ColStringType = typename TypeTraits::FieldType; + using ColUInt8Type = typename TypeTraits::FieldType; + + ColumnWithTypeAndName escape = createConstColumn(1, static_cast('\\')); + + ColumnsWithTypeAndName lower_data11{toVec("col0", std::vector(data_num, "aaaaaaaaaaaaaaaaa"))}; + ColumnsWithTypeAndName lower_data12{toVec("col1", std::vector(data_num, "aaaaaaaaaaaaaaaaa"))}; + + ColumnsWithTypeAndName lower_data21{toVec("col0", std::vector(data_num, "AAAAAAAAAAAAAAAAA"))}; + ColumnsWithTypeAndName lower_data22{toVec("col1", std::vector(data_num, "AAAAAAAAAAAAAAAAA"))}; + + ColumnsWithTypeAndName lower_data31{toVec("col0", std::vector(data_num, "aAaAaAaAaAaAaAaAa"))}; + ColumnsWithTypeAndName lower_data32{toVec("col1", std::vector(data_num, "aAaAaAaAaAaAaAaAa"))}; + + ColumnsWithTypeAndName lower_data41{toVec("col0", std::vector(data_num, "嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯"))}; + ColumnsWithTypeAndName lower_data42{toVec("col1", std::vector(data_num, "嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯"))}; + + ColumnsWithTypeAndName lower_data51{toVec("col0", std::vector(data_num, "a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯"))}; + ColumnsWithTypeAndName lower_data52{toVec("col1", std::vector(data_num, "a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯"))}; + + ColumnsWithTypeAndName like_data1{ + toVec("col0", std::vector(data_num, "aaaaaaaaaaaaaaaaa")), + toVec("col1", std::vector(data_num, "aaaaaaaaaaaaaaaaa")), + escape}; + ColumnsWithTypeAndName like_data2{ + toVec("col0", std::vector(data_num, "aaaaaaaaaaaaaaaaa")), + toVec("col1", std::vector(data_num, "aaaaaaaaaaaaaaaaa")), + escape}; + ColumnsWithTypeAndName like_data3{ + toVec("col0", std::vector(data_num, "aaaaaaaaaaaaaaaaa")), + toVec("col1", std::vector(data_num, "aaaaaaaaaaaaaaaaa")), + escape}; + ColumnsWithTypeAndName like_data4{ + toVec("col0", std::vector(data_num, "嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯")), + toVec("col1", std::vector(data_num, "嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯")), + escape}; + ColumnsWithTypeAndName like_data5{ + toVec("col0", std::vector(data_num, "a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯")), + toVec("col1", std::vector(data_num, "a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯")), + escape}; + + void SetUp(const benchmark::State &) override {} +}; + +BENCHMARK_DEFINE_F(IlikeBench, ilike) +(benchmark::State & state) +try +{ + FunctionIlike3Args function_ilike; + TiDB::TiDBCollatorPtr collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8_BIN); + function_ilike.setCollator(collator); + std::vector blocks{Block(data1), Block(data2), Block(data3), Block(data4), Block(data5)}; + for (auto & block : blocks) + block.insert({nullptr, std::make_shared>(), "res"}); + ColumnNumbers arguments{0, 1, 2}; + for (auto _ : state) + { + for (auto & block : blocks) + function_ilike.executeImpl(block, arguments, 3); + } +} +CATCH +BENCHMARK_REGISTER_F(IlikeBench, ilike)->Iterations(10); + +BENCHMARK_DEFINE_F(LikeBench, like) +(benchmark::State & state) +try +{ + FunctionLowerUTF8 function_lower; + FunctionLike function_like; + TiDB::TiDBCollatorPtr collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8_BIN); + function_like.setCollator(collator); + std::vector lower_blocks{ + Block(lower_data11), + Block(lower_data21), + Block(lower_data31), + Block(lower_data41), + Block(lower_data51), + Block(lower_data12), + Block(lower_data22), + Block(lower_data32), + Block(lower_data42), + Block(lower_data52)}; + std::vector like_blocks{Block(like_data1), Block(like_data2), Block(like_data3), Block(like_data4), Block(like_data5)}; + + for (auto & block : lower_blocks) + block.insert({nullptr, std::make_shared(), "res"}); + for (auto & block : like_blocks) + block.insert({nullptr, std::make_shared>(), "res"}); + + ColumnNumbers lower_arguments{0, 1}; + ColumnNumbers like_arguments{0, 1, 2}; + for (auto _ : state) + { + for (auto & block : lower_blocks) + function_lower.executeImpl(block, lower_arguments, 1); + for (auto & block : like_blocks) + function_like.executeImpl(block, like_arguments, 3); + } +} +CATCH +BENCHMARK_REGISTER_F(LikeBench, like)->Iterations(10); + +} // namespace tests +} // namespace DB diff --git a/dbms/src/Functions/tests/gtest_strings_search.cpp b/dbms/src/Functions/tests/gtest_strings_search.cpp index 544ebc34df6..51560cf701a 100644 --- a/dbms/src/Functions/tests/gtest_strings_search.cpp +++ b/dbms/src/Functions/tests/gtest_strings_search.cpp @@ -13,8 +13,8 @@ // limitations under the License. #include -#include #include +#include #include namespace DB @@ -24,7 +24,8 @@ namespace tests class StringMatch : public FunctionTest { protected: - const String func_name = "like3Args"; + const String func_like_name = "like3Args"; + const String func_ilike_name = "ilike3Args"; const String long_str = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzab" "cdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdef" "ghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijkl" @@ -34,6 +35,14 @@ class StringMatch : public FunctionTest const String long_pattern = "abcdefghijklmnopqrstuvwxyz_bcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz%abcdefghijklmnopqrstuvwxyz"; + std::vector collators{ + TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8_GENERAL_CI), + TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_GENERAL_CI), + TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8_UNICODE_CI), + TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_UNICODE_CI), + TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8MB4_BIN), + TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8_BIN)}; + ColumnWithTypeAndName escape = createConstColumn(1, static_cast('\\')); static ColumnWithTypeAndName toNullableVec(const std::vector> & v) @@ -127,7 +136,7 @@ try auto needle = createColumn(needle_raw, "needle"); auto expected = createColumn(result_raw, "result"); - auto result = executeFunction(func_name, {haystack, needle, escape}); + auto result = executeFunction(func_like_name, {haystack, needle, escape}); ASSERT_COLUMN_EQ(expected, result); @@ -159,7 +168,7 @@ try auto nullable_needle = createColumn>(nullable_needle_raw, "needle"); auto nullable_expected = createColumn>(nullable_result_raw, "result"); - auto nullable_result = executeFunction(func_name, {nullable_haystack, nullable_needle, escape}); + auto nullable_result = executeFunction(func_like_name, {nullable_haystack, nullable_needle, escape}); ASSERT_COLUMN_EQ(nullable_expected, nullable_result); } @@ -199,7 +208,7 @@ try auto needle = createColumn>(needle_raw); auto expected = createColumn>(result_raw); - auto result = executeFunction(func_name, {haystack, needle, escape}); + auto result = executeFunction(func_like_name, {haystack, needle, escape}); ASSERT_COLUMN_EQ(expected, result); } } @@ -213,7 +222,7 @@ TEST_F(StringMatch, LikeVectorWithVector) ASSERT_COLUMN_EQ( toNullableVec(expect), executeFunction( - func_name, + func_like_name, toNullableVec(haystack), toNullableVec(needle), escape)); @@ -221,7 +230,7 @@ TEST_F(StringMatch, LikeVectorWithVector) ASSERT_COLUMN_EQ( toVec(expect), executeFunction( - func_name, + func_like_name, toVec(haystack), toVec(needle), escape)); @@ -232,7 +241,7 @@ TEST_F(StringMatch, LikeVectorWithVector) ASSERT_COLUMN_EQ( toNullableVec(expect_null), executeFunction( - func_name, + func_like_name, toNullableVec(haystack_null), toNullableVec(needle_null), escape)); @@ -246,7 +255,7 @@ TEST_F(StringMatch, LikeConstWithVector) ASSERT_COLUMN_EQ( toNullableVec(expect), executeFunction( - func_name, + func_like_name, toConst("abcaba"), toNullableVec(needle), escape)); @@ -254,7 +263,7 @@ TEST_F(StringMatch, LikeConstWithVector) ASSERT_COLUMN_EQ( toVec(expect), executeFunction( - func_name, + func_like_name, toConst("abcaba"), toVec(needle), escape)); @@ -262,7 +271,7 @@ TEST_F(StringMatch, LikeConstWithVector) ASSERT_COLUMN_EQ( toVec(expect1), executeFunction( - func_name, + func_like_name, toConst(long_str), toVec(needle), escape)); @@ -272,7 +281,7 @@ TEST_F(StringMatch, LikeConstWithVector) ASSERT_COLUMN_EQ( toNullableVec(expect_null), executeFunction( - func_name, + func_like_name, toConst("abc"), toNullableVec(needle_null), escape)); @@ -288,7 +297,7 @@ TEST_F(StringMatch, LikeVectorWithConst) ASSERT_COLUMN_EQ( toNullableVec(expect), executeFunction( - func_name, + func_like_name, toNullableVec(haystack), toConst("%aa%"), escape)); @@ -296,7 +305,7 @@ TEST_F(StringMatch, LikeVectorWithConst) ASSERT_COLUMN_EQ( toVec(expect), executeFunction( - func_name, + func_like_name, toVec(haystack), toConst("%aa%"), escape)); @@ -304,7 +313,7 @@ TEST_F(StringMatch, LikeVectorWithConst) ASSERT_COLUMN_EQ( toVec(expect1), executeFunction( - func_name, + func_like_name, toVec(haystack), toConst("%爱tif%"), escape)); @@ -312,7 +321,7 @@ TEST_F(StringMatch, LikeVectorWithConst) ASSERT_COLUMN_EQ( toVec(expect2), executeFunction( - func_name, + func_like_name, toVec(haystack), toConst("%不爱tif%"), escape)); @@ -320,7 +329,7 @@ TEST_F(StringMatch, LikeVectorWithConst) ASSERT_COLUMN_EQ( toVec(expect3), executeFunction( - func_name, + func_like_name, toVec(haystack), toConst(long_pattern), escape)); @@ -330,7 +339,7 @@ TEST_F(StringMatch, LikeVectorWithConst) ASSERT_COLUMN_EQ( toNullableVec(expect_null), executeFunction( - func_name, + func_like_name, toNullableVec(haystack_null), toConst("abc"), escape)); @@ -341,7 +350,7 @@ TEST_F(StringMatch, LikeConstWithConst) ASSERT_COLUMN_EQ( toConst(1), executeFunction( - func_name, + func_like_name, toConst("resaasfe"), toConst("%aa%"), escape)); @@ -349,7 +358,7 @@ TEST_F(StringMatch, LikeConstWithConst) ASSERT_COLUMN_EQ( toConst(0), executeFunction( - func_name, + func_like_name, toConst("abcde"), toConst("%aa%"), escape)); @@ -357,7 +366,7 @@ TEST_F(StringMatch, LikeConstWithConst) ASSERT_COLUMN_EQ( toConst(1), executeFunction( - func_name, + func_like_name, toConst("我爱tiflash"), toConst("%爱tif%"), escape)); @@ -365,11 +374,331 @@ TEST_F(StringMatch, LikeConstWithConst) ASSERT_COLUMN_EQ( toConst(0), executeFunction( - func_name, + func_like_name, toConst("我爱tiflash"), toConst("%不爱tif%"), escape)); } +TEST_F(StringMatch, Ilike3ArgsVectorWithVector) +try +{ + struct Case + { + int match; + std::string a; + std::string b; + }; + std::vector cases = { + {1, "", ""}, + {1, "a", "A"}, + {1, "", ""}, + {1, "a", "%"}, + {1, "A", "a%"}, + {1, "a", "%a"}, + {1, "ab", "a%"}, + {1, "ab", "ab"}, + // pattern can only be used as the second argument + {0, "a%", "ab"}, + {1, "aaaa", "a%"}, + {0, "aaaa", "aaab%"}, + {1, "aabAAbabaabbaB", "aab%a%aAB%b"}, + {1, "a", "_"}, + {1, "Abab", "_b__"}, + {0, "abab", "_b_"}, + }; + + InferredDataVector haystack_raw = {}; + InferredDataVector needle_raw = {}; + InferredDataVector result_raw = {}; + + for (auto & cas : cases) + { + haystack_raw.push_back(cas.a); + needle_raw.push_back(cas.b); + result_raw.push_back(cas.match); + } + + auto haystack = createColumn(haystack_raw, "haystack"); + auto needle = createColumn(needle_raw, "needle"); + auto expected = createColumn(result_raw, "result"); + + for (const auto * collator : collators) + { + auto result = executeFunction(func_ilike_name, {haystack, needle, escape}, collator); + ASSERT_COLUMN_EQ(expected, result); + } + + struct NullableCase + { + std::optional match; + std::optional a; + std::optional b; + }; + std::vector nullable_cases = { + {std::nullopt, std::nullopt, ""}, + {std::nullopt, "a", std::nullopt}, + {std::nullopt, std::nullopt, std::nullopt}, + {1, "a", "%"}, + }; + + InferredDataVector> nullable_haystack_raw = {}; + InferredDataVector> nullable_needle_raw = {}; + InferredDataVector> nullable_result_raw = {}; + + for (auto & cas : nullable_cases) + { + nullable_haystack_raw.push_back(cas.a); + nullable_needle_raw.push_back(cas.b); + nullable_result_raw.push_back(cas.match); + } + + auto nullable_haystack = createColumn>(nullable_haystack_raw, "haystack"); + auto nullable_needle = createColumn>(nullable_needle_raw, "needle"); + auto nullable_expected = createColumn>(nullable_result_raw, "result"); + for (const auto * collator : collators) + { + auto nullable_result = executeFunction(func_ilike_name, {nullable_haystack, nullable_needle, escape}, collator); + ASSERT_COLUMN_EQ(nullable_expected, nullable_result); + } +} +CATCH + +TEST_F(StringMatch, Ilike3ArgsConstantWithVector) +try +{ + struct Case + { + std::string src; + std::vector> pat; + }; + std::vector cases = { + // {"a", {{"B", 0}, {"A", 1}, {"_", 1}, {"%", 1}}}, + {"aaB", {{"aAb", 1}, {"aB_", 0}, {"A_A", 0}, {"a__", 1}}}, + }; + + for (const auto * collator : collators) + { + for (auto & cas : cases) + { + InferredDataVector> needle_raw = {}; + InferredDataVector> result_raw = {}; + + for (auto & pat : cas.pat) + { + needle_raw.push_back(pat.first); + result_raw.push_back(pat.second); + } + + auto haystack = createConstColumn>(1, cas.src); + auto needle = createColumn>(needle_raw); + auto expected = createColumn>(result_raw); + + auto result = executeFunction(func_ilike_name, {haystack, needle, escape}, collator); + ASSERT_COLUMN_EQ(expected, result); + } + } +} +CATCH + +TEST_F(StringMatch, ilikeVectorWithVector) +{ + std::vector> haystack = {"我爱TiflaSH", "我爱TifLash", "", "A", "", "a", "a", "A", "ab", "aB", "a%", "aaAa", "aaaa", "aabaabABaabbab", "a", "abab", "abAB", "abcdefGHijklmn", "a", long_str}; + std::vector> needle = {"我_Tif%", "%爱tI%", "", "a", "", "%", "a%", "%a", "a%", "ab", "Ab", "a%", "aAab%", "aab%a%aab%b", "_", "_b__", "_b_", "a%", "abcDefghIjklmn%", long_pattern}; + std::vector> expect = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1}; + + std::vector> haystack_null = {{}, "a"}; + std::vector> needle_null = {"我_tif%", {}}; + std::vector> expect_null = {{}, {}}; + + for (const auto * collator : collators) + { + ASSERT_COLUMN_EQ( + toNullableVec(expect), + executeFunction( + "ilike3Args", + {toNullableVec(haystack), toNullableVec(needle), escape}, + collator)); + + ASSERT_COLUMN_EQ( + toVec(expect), + executeFunction( + "ilike3Args", + {toVec(haystack), toVec(needle), escape}, + collator)); + + ASSERT_COLUMN_EQ( + toNullableVec(expect_null), + executeFunction( + "ilike3Args", + {toNullableVec(haystack_null), toNullableVec(needle_null), escape}, + collator)); + } +} + +TEST_F(StringMatch, IlikeConstWithVector) +{ + std::vector> needle = {"", "a", "", "%", "a%", "%a", "a%", "ab", "ab", "a%", "aaab%", "aab%a%aab%b", "_", "_b__", "_b_", long_pattern}; + std::vector> expect = {0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0}; + std::vector> expect1 = {0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1}; + + std::vector> needle_null = {{}}; + std::vector> expect_null = {{}}; + + for (const auto * collator : collators) + { + ASSERT_COLUMN_EQ( + toNullableVec(expect), + executeFunction( + func_ilike_name, + {toConst("abcAba"), toNullableVec(needle), escape}, + collator)); + + ASSERT_COLUMN_EQ( + toVec(expect), + executeFunction( + func_ilike_name, + {toConst("ABCaba"), toVec(needle), escape}, + collator)); + + ASSERT_COLUMN_EQ( + toVec(expect1), + executeFunction( + func_ilike_name, + {toConst(long_str), toVec(needle), escape}, + collator)); + + ASSERT_COLUMN_EQ( + toNullableVec(expect_null), + executeFunction( + func_ilike_name, + {toConst("ABC"), toNullableVec(needle_null), escape}, + collator)); + } +} + +TEST_F(StringMatch, IlikeVectorWithConst) +{ + std::vector> haystack = {"我爱tiflash", "", "a", "", "a", "a", "A", "ab", "ab", "a%", "aaaa", "aaaa", "aabaABAbaabbaB", "a", "abab", "Abab", long_str}; + std::vector> expect = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0}; + std::vector> expect1 = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector> expect2 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector> expect3 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + + std::vector> haystack_null = {{}}; + std::vector> expect_null = {{}}; + + for (const auto * collator : collators) + { + ASSERT_COLUMN_EQ( + toNullableVec(expect), + executeFunction( + func_ilike_name, + {toNullableVec(haystack), toConst("%aA%"), escape}, + collator)); + + ASSERT_COLUMN_EQ( + toVec(expect), + executeFunction( + func_ilike_name, + {toVec(haystack), toConst("%aa%"), escape}, + collator)); + + ASSERT_COLUMN_EQ( + toVec(expect1), + executeFunction( + func_ilike_name, + {toVec(haystack), toConst("%爱tIf%"), escape}, + collator)); + + ASSERT_COLUMN_EQ( + toVec(expect2), + executeFunction( + func_ilike_name, + {toVec(haystack), toConst("%不爱tiF%"), escape}, + collator)); + + ASSERT_COLUMN_EQ( + toVec(expect3), + executeFunction( + func_ilike_name, + {toVec(haystack), toConst(long_pattern), escape}, + collator)); + + ASSERT_COLUMN_EQ( + toNullableVec(expect_null), + executeFunction( + func_ilike_name, + {toNullableVec(haystack_null), toConst("Abc"), escape}, + collator)); + } +} + +TEST_F(StringMatch, IlikeConstWithConst) +{ + for (const auto * collator : collators) + { + ASSERT_COLUMN_EQ( + toConst(1), + executeFunction( + func_ilike_name, + {toConst("resaAsfe"), toConst("%aa%"), escape}, + collator)); + + ASSERT_COLUMN_EQ( + toConst(0), + executeFunction( + func_ilike_name, + {toConst("Abcde"), toConst("%aa%"), escape}, + collator)); + + ASSERT_COLUMN_EQ( + toConst(1), + executeFunction( + func_ilike_name, + {toConst("我爱Tiflash"), toConst("%爱tiF%"), escape}, + collator)); + + ASSERT_COLUMN_EQ( + toConst(0), + executeFunction( + func_ilike_name, + {toConst("我爱tiflAsh"), toConst("%不爱tIf%"), escape}, + collator)); + } +} + +// ilike function will modify the column's content in-place, in order to +// ensure the column's content is not modified after function finishes the work, +// we need to replace the modified columns with other columns which clone the +// original columns at the beginning. +TEST_F(StringMatch, CheckInvariance) +{ + ColumnWithTypeAndName escape = createConstColumn(1, static_cast('\\')); + TiDB::TiDBCollatorPtr collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8_BIN); + + std::vector> vec_vec_vec_col0{"aAa", "", "123", "a嗯A"}; + std::vector> vec_vec_vec_col1{"aaA", "123", "", "嗯嗯a嗯"}; + String const_const_col0("aSd"); + String const_const_col1("a嗯A嗯"); + + auto vec_vec_vec_col0_col = toVec(vec_vec_vec_col0); + auto vec_vec_vec_col0_expect_col = toVec(vec_vec_vec_col0); + auto vec_vec_vec_col1_col = toVec(vec_vec_vec_col1); + auto vec_vec_vec_col1_expect_col = toVec(vec_vec_vec_col1); + auto const_const_col0_col = toConst(const_const_col0); + auto const_const_col0_expect_col = toConst(const_const_col0); + auto const_const_col1_col = toConst(const_const_col1); + auto const_const_col1_expect_col = toConst(const_const_col1); + + executeFunction(func_ilike_name, {vec_vec_vec_col0_col, vec_vec_vec_col1_col, escape}, collator); + ASSERT_COLUMN_EQ(vec_vec_vec_col0_col, vec_vec_vec_col0_expect_col); + ASSERT_COLUMN_EQ(vec_vec_vec_col1_col, vec_vec_vec_col1_expect_col); + + executeFunction(func_ilike_name, {const_const_col0_col, const_const_col1_col, escape}, collator); + ASSERT_COLUMN_EQ(const_const_col0_col, const_const_col0_expect_col); + ASSERT_COLUMN_EQ(const_const_col1_col, const_const_col1_expect_col); +} + } // namespace tests } // namespace DB