Skip to content

Commit

Permalink
Support ilike function (#6741)
Browse files Browse the repository at this point in the history
close #6740
  • Loading branch information
xzhangxian1008 authored Feb 17, 2023
1 parent 47d4c8f commit 2133f4f
Show file tree
Hide file tree
Showing 8 changed files with 624 additions and 28 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ compile_commands.json
*.logrt

/build
/build-debug
/build-release
/docs/en_single_page/
/docs/ru_single_page/
/docs/venv/
Expand Down
2 changes: 1 addition & 1 deletion contrib/tipb
5 changes: 5 additions & 0 deletions dbms/src/Common/StringUtils/StringUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@ inline bool isASCII(char c)
return static_cast<unsigned char>(c) < 0x80;
}

/// Returns true iff `c` is one of the ASCII upper-case letters 'A'..'Z'.
inline bool isUpperAlphaASCII(char c)
{
    // One unsigned comparison checks both bounds: anything below 'A' wraps around to a huge value.
    return static_cast<unsigned>(c - 'A') < 26u;
}

inline bool isAlphaASCII(char c)
{
return (c >= 'a' && c <= 'z')
Expand Down
1 change: 1 addition & 0 deletions dbms/src/Flash/Coprocessor/DAGUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,7 @@ const std::unordered_map<tipb::ScalarFuncSig, String> scalar_func_map({
{tipb::ScalarFuncSig::IsIPv6, "tiDBIsIPv6"},
//{tipb::ScalarFuncSig::UUID, "cast"},

{tipb::ScalarFuncSig::IlikeSig, "ilike3Args"},
{tipb::ScalarFuncSig::LikeSig, "like3Args"},
{tipb::ScalarFuncSig::RegexpSig, "regexp"},
{tipb::ScalarFuncSig::RegexpUTF8Sig, "regexp"},
Expand Down
5 changes: 3 additions & 2 deletions dbms/src/Functions/FunctionsStringSearch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ struct PositionImpl
};

/// Is the LIKE expression reduced to finding a substring in a string?
inline bool likePatternIsStrstr(const String & pattern, String & res)
bool likePatternIsStrstr(const String & pattern, String & res)
{
res = "";

Expand Down Expand Up @@ -1514,7 +1514,6 @@ struct NameMatch
{
static constexpr auto name = "match";
};

struct NameLike3Args
{
static constexpr auto name = "like3Args";
Expand Down Expand Up @@ -1545,6 +1544,7 @@ using FunctionPositionCaseInsensitiveUTF8
/// Concrete string-search functions: each alias pairs a matching implementation with the
/// Name tag that supplies the registered function name.
using FunctionMatch = FunctionsStringSearch<MatchImpl<false>, NameMatch>;
using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>;
using FunctionLike3Args = FunctionsStringSearch<MatchImpl<true, false, true>, NameLike3Args>;
/// Same MatchImpl instantiation as FunctionLike3Args; only the Name tag differs.
/// FunctionsStringSearch::executeImpl compares its name against NameIlike3Args::name and, when the
/// collator is not case-insensitive, lower-cases ASCII letters of both arguments before matching.
using FunctionIlike3Args = FunctionsStringSearch<MatchImpl<true, false, true>, NameIlike3Args>;
using FunctionNotLike = FunctionsStringSearch<MatchImpl<true, true>, NameNotLike>;
using FunctionExtract = FunctionsStringSearchToString<ExtractImpl, NameExtract>;
using FunctionReplaceOne = FunctionStringReplace<ReplaceStringImpl<true>, NameReplaceOne>;
Expand All @@ -1561,6 +1561,7 @@ void registerFunctionsStringSearch(FunctionFactory & factory)
factory.registerFunction<FunctionMatch>();
factory.registerFunction<FunctionLike>();
factory.registerFunction<FunctionLike3Args>();
factory.registerFunction<FunctionIlike3Args>();
factory.registerFunction<FunctionNotLike>();
factory.registerFunction<FunctionExtract>();
}
Expand Down
94 changes: 91 additions & 3 deletions dbms/src/Functions/FunctionsStringSearch.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,84 @@

#include <Columns/ColumnConst.h>
#include <Columns/ColumnString.h>
#include <Columns/IColumn.h>
#include <Common/Exception.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/UTF8Helpers.h>
#include <Common/typeid_cast.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Functions/StringUtil.h>
#include <common/defines.h>

#include <cstring>
#include <string_view>

namespace DB
{
using Chars_t = ColumnString::Chars_t;
using Offsets = ColumnString::Offsets;

/// Helpers for ilike3Args: fold ASCII upper-case letters ('A'..'Z') to lower case, in place,
/// in the haystack and needle columns so that an ordinary LIKE match can then be run on them.
struct IlikeHelper
{
    /// Lower-cases every ASCII 'A'..'Z' byte of `chars` in place; all other bytes are untouched.
    ///
    /// The buffer is scanned byte by byte on purpose. In valid UTF-8 every byte of a multi-byte
    /// sequence is >= 0x80, so it can never be mistaken for an ASCII letter and there is nothing
    /// to gain from hopping over whole sequences. Hopping by the length announced by a lead byte
    /// is actively harmful on malformed or truncated input: the scan would jump over the bytes
    /// that follow the bad lead byte, and those may be genuine upper-case letters (possibly
    /// belonging to the next row when `chars` is a whole column's buffer) that would then be
    /// left un-lowered.
    static void lowerStrings(Chars_t & chars)
    {
        const size_t size = chars.size();
        for (size_t i = 0; i < size; ++i)
        {
            if (isUpperAlphaASCII(chars[i]))
                chars[i] = toLowerIfAlphaASCII(chars[i]);
        }
    }

    /// Lower-cases the string data held by the data column wrapped in `lowered_col_const`.
    /// Fails the runtime assertion if that data column is not a ColumnString.
    static void lowerColumnConst(ColumnConst * lowered_col_const)
    {
        auto * col_data = typeid_cast<ColumnString *>(&lowered_col_const->getDataColumn());
        RUNTIME_ASSERT(col_data != nullptr, "Invalid column type, should be ColumnString");

        lowerStrings(col_data->getChars());
    }

    /// Lower-cases the character buffer of `col`, which must be a ColumnString
    /// (checked by a runtime assertion).
    static void lowerColumnString(MutableColumnPtr & col)
    {
        auto * col_vector = typeid_cast<ColumnString *>(&*col);
        RUNTIME_ASSERT(col_vector != nullptr, "Invalid column type, should be ColumnString");

        lowerStrings(col_vector->getChars());
    }

    /// Lower-cases, in place, the ASCII upper-case letters of the haystack (`arguments[0]`) and
    /// the needle (`arguments[1]`) columns of `block`. Each column may be either a ColumnConst
    /// wrapping a ColumnString or a plain ColumnString.
    ///
    /// The columns are obtained through assumeMutable(), i.e. they are modified without making
    /// a copy; the caller is expected to have installed columns it is allowed to modify.
    static void lowerAlphaASCII(Block & block, const ColumnNumbers & arguments)
    {
        MutableColumnPtr column_haystack = block.getByPosition(arguments[0]).column->assumeMutable();
        MutableColumnPtr column_needle = block.getByPosition(arguments[1]).column->assumeMutable();

        auto * col_haystack_const = typeid_cast<ColumnConst *>(&*column_haystack);
        auto * col_needle_const = typeid_cast<ColumnConst *>(&*column_needle);

        if (col_haystack_const != nullptr)
            lowerColumnConst(col_haystack_const);
        else
            lowerColumnString(column_haystack);

        if (col_needle_const != nullptr)
            lowerColumnConst(col_needle_const);
        else
            lowerColumnString(column_needle);
    }
};

/** Search and replace functions in strings:
*
* position(haystack, needle) - the normal search for a substring in a string, returns the position (in bytes) of the found substring starting with 1, or 0 if no substring is found.
Expand Down Expand Up @@ -57,6 +128,11 @@ extern const int ILLEGAL_COLUMN;

static const UInt8 CH_ESCAPE_CHAR = '\\';

/// Name tag carrying the registered function name "ilike3Args".
struct NameIlike3Args
{
    static constexpr const char * name = "ilike3Args";
};

template <typename Impl, typename Name>
class FunctionsStringSearch : public IFunction
{
Expand Down Expand Up @@ -116,8 +192,20 @@ class FunctionsStringSearch : public IFunction
return std::make_shared<DataTypeNumber<typename Impl::ResultType>>();
}

void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override
void executeImpl(Block & result_block, const ColumnNumbers & arguments, size_t result) const override
{
auto block = result_block;
if constexpr (name == std::string_view(NameIlike3Args::name))
{
if (!collator->isCI())
{
block.getByPosition(arguments[0]).column = (*std::move(result_block.getByPosition(arguments[0]).column)).mutate();
block.getByPosition(arguments[1]).column = (*std::move(result_block.getByPosition(arguments[1]).column)).mutate();

IlikeHelper::lowerAlphaASCII(block, arguments);
}
}

using ResultType = typename Impl::ResultType;

const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column;
Expand Down Expand Up @@ -170,7 +258,7 @@ class FunctionsStringSearch : public IFunction
ResultType res{};
auto needle_string = col_needle_const->getValue<String>();
Impl::constantConstant(col_haystack_const->getValue<String>(), needle_string, escape_char, match_type, collator, res);
block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res));
result_block.getByPosition(result).column = result_block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res));
return;
}

Expand Down Expand Up @@ -210,7 +298,7 @@ class FunctionsStringSearch : public IFunction
+ getName(),
ErrorCodes::ILLEGAL_COLUMN);

block.getByPosition(result).column = std::move(col_res);
result_block.getByPosition(result).column = std::move(col_res);
}

private:
Expand Down
170 changes: 170 additions & 0 deletions dbms/src/Functions/tests/bench_function_ilike.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
// Copyright 2023 PingCAP, Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <Functions/FunctionsComparison.h>
#include <Functions/FunctionsStringSearch.h>
#include <TestUtils/FunctionTestUtils.h>
#include <benchmark/benchmark.h>

/// this is a hack, include the cpp file so we can test MatchImpl directly
#include <Functions/FunctionsString.cpp>
#include <Functions/FunctionsStringSearch.cpp> // NOLINT

namespace DB
{
namespace tests
{

/// Number of rows in every string column built by the benchmark fixtures in this file.
constexpr size_t data_num = 500000;

class IlikeBench : public benchmark::Fixture
{
public:
using ColStringType = typename TypeTraits<String>::FieldType;
using ColUInt8Type = typename TypeTraits<UInt8>::FieldType;

ColumnWithTypeAndName escape = createConstColumn<Int32>(1, static_cast<Int32>('\\'));

ColumnsWithTypeAndName data1{
toVec<String>("col0", std::vector<ColStringType>(data_num, "aaaaaaaaaaaaaaaaa")),
toVec<String>("col1", std::vector<ColStringType>(data_num, "aaaaaaaaaaaaaaaaa")),
escape};
ColumnsWithTypeAndName data2{
toVec<String>("col0", std::vector<ColStringType>(data_num, "AAAAAAAAAAAAAAAAA")),
toVec<String>("col1", std::vector<ColStringType>(data_num, "AAAAAAAAAAAAAAAAA")),
escape};
ColumnsWithTypeAndName data3{
toVec<String>("col0", std::vector<ColStringType>(data_num, "aAaAaAaAaAaAaAaAa")),
toVec<String>("col1", std::vector<ColStringType>(data_num, "aAaAaAaAaAaAaAaAa")),
escape};
ColumnsWithTypeAndName data4{
toVec<String>("col0", std::vector<ColStringType>(data_num, "嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯")),
toVec<String>("col1", std::vector<ColStringType>(data_num, "嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯")),
escape};
ColumnsWithTypeAndName data5{
toVec<String>("col0", std::vector<ColStringType>(data_num, "a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯")),
toVec<String>("col1", std::vector<ColStringType>(data_num, "a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯")),
escape};

void SetUp(const benchmark::State &) override {}
};

class LikeBench : public benchmark::Fixture
{
public:
using ColStringType = typename TypeTraits<String>::FieldType;
using ColUInt8Type = typename TypeTraits<UInt8>::FieldType;

ColumnWithTypeAndName escape = createConstColumn<Int32>(1, static_cast<Int32>('\\'));

ColumnsWithTypeAndName lower_data11{toVec<String>("col0", std::vector<ColStringType>(data_num, "aaaaaaaaaaaaaaaaa"))};
ColumnsWithTypeAndName lower_data12{toVec<String>("col1", std::vector<ColStringType>(data_num, "aaaaaaaaaaaaaaaaa"))};

ColumnsWithTypeAndName lower_data21{toVec<String>("col0", std::vector<ColStringType>(data_num, "AAAAAAAAAAAAAAAAA"))};
ColumnsWithTypeAndName lower_data22{toVec<String>("col1", std::vector<ColStringType>(data_num, "AAAAAAAAAAAAAAAAA"))};

ColumnsWithTypeAndName lower_data31{toVec<String>("col0", std::vector<ColStringType>(data_num, "aAaAaAaAaAaAaAaAa"))};
ColumnsWithTypeAndName lower_data32{toVec<String>("col1", std::vector<ColStringType>(data_num, "aAaAaAaAaAaAaAaAa"))};

ColumnsWithTypeAndName lower_data41{toVec<String>("col0", std::vector<ColStringType>(data_num, "嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯"))};
ColumnsWithTypeAndName lower_data42{toVec<String>("col1", std::vector<ColStringType>(data_num, "嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯"))};

ColumnsWithTypeAndName lower_data51{toVec<String>("col0", std::vector<ColStringType>(data_num, "a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯"))};
ColumnsWithTypeAndName lower_data52{toVec<String>("col1", std::vector<ColStringType>(data_num, "a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯"))};

ColumnsWithTypeAndName like_data1{
toVec<String>("col0", std::vector<ColStringType>(data_num, "aaaaaaaaaaaaaaaaa")),
toVec<String>("col1", std::vector<ColStringType>(data_num, "aaaaaaaaaaaaaaaaa")),
escape};
ColumnsWithTypeAndName like_data2{
toVec<String>("col0", std::vector<ColStringType>(data_num, "aaaaaaaaaaaaaaaaa")),
toVec<String>("col1", std::vector<ColStringType>(data_num, "aaaaaaaaaaaaaaaaa")),
escape};
ColumnsWithTypeAndName like_data3{
toVec<String>("col0", std::vector<ColStringType>(data_num, "aaaaaaaaaaaaaaaaa")),
toVec<String>("col1", std::vector<ColStringType>(data_num, "aaaaaaaaaaaaaaaaa")),
escape};
ColumnsWithTypeAndName like_data4{
toVec<String>("col0", std::vector<ColStringType>(data_num, "嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯")),
toVec<String>("col1", std::vector<ColStringType>(data_num, "嗯嗯嗯嗯嗯嗯嗯嗯嗯嗯")),
escape};
ColumnsWithTypeAndName like_data5{
toVec<String>("col0", std::vector<ColStringType>(data_num, "a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯")),
toVec<String>("col1", std::vector<ColStringType>(data_num, "a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯a嗯")),
escape};

void SetUp(const benchmark::State &) override {}
};

/// Measures ilike3Args with the UTF8_BIN collator over the five fixture data sets.
BENCHMARK_DEFINE_F(IlikeBench, ilike)
(benchmark::State & state)
try
{
    FunctionIlike3Args ilike;
    const TiDB::TiDBCollatorPtr bin_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8_BIN);
    ilike.setCollator(bin_collator);

    // Positions inside each block: 0 = haystack, 1 = needle, 2 = escape, 3 = result.
    const ColumnNumbers argument_positions{0, 1, 2};
    constexpr size_t result_position = 3;

    // One block per data set, each with an empty UInt8 "res" column appended for the output.
    std::vector<Block> inputs;
    for (const auto * data : {&data1, &data2, &data3, &data4, &data5})
    {
        inputs.emplace_back(*data);
        inputs.back().insert({nullptr, std::make_shared<DataTypeNumber<UInt8>>(), "res"});
    }

    for (auto _ : state)
    {
        for (auto & input : inputs)
            ilike.executeImpl(input, argument_positions, result_position);
    }
}
CATCH
BENCHMARK_REGISTER_F(IlikeBench, ilike)->Iterations(10);

/// Baseline for the ilike benchmark: runs the lower function over every single-column input and
/// then like (UTF8_BIN collator) over the three-argument data sets, in each iteration.
BENCHMARK_DEFINE_F(LikeBench, like)
(benchmark::State & state)
try
{
    FunctionLowerUTF8 lower;
    FunctionLike like;
    const TiDB::TiDBCollatorPtr bin_collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::UTF8_BIN);
    like.setCollator(bin_collator);

    // Inputs for lower: one string column plus an empty String "res" column at position 1.
    std::vector<Block> lower_inputs;
    for (const auto * data : {&lower_data11,
                              &lower_data21,
                              &lower_data31,
                              &lower_data41,
                              &lower_data51,
                              &lower_data12,
                              &lower_data22,
                              &lower_data32,
                              &lower_data42,
                              &lower_data52})
    {
        lower_inputs.emplace_back(*data);
        lower_inputs.back().insert({nullptr, std::make_shared<DataTypeString>(), "res"});
    }

    // Inputs for like: haystack, needle, escape plus an empty UInt8 "res" column at position 3.
    std::vector<Block> like_inputs;
    for (const auto * data : {&like_data1, &like_data2, &like_data3, &like_data4, &like_data5})
    {
        like_inputs.emplace_back(*data);
        like_inputs.back().insert({nullptr, std::make_shared<DataTypeNumber<UInt8>>(), "res"});
    }

    // NOTE(review): position 1 of a lower block is the "res" column inserted above, so it is listed
    // here both as an argument and as the result; presumably only position 0 is read - confirm.
    const ColumnNumbers lower_positions{0, 1};
    const ColumnNumbers like_positions{0, 1, 2};
    constexpr size_t lower_result_position = 1;
    constexpr size_t like_result_position = 3;

    for (auto _ : state)
    {
        for (auto & input : lower_inputs)
            lower.executeImpl(input, lower_positions, lower_result_position);
        for (auto & input : like_inputs)
            like.executeImpl(input, like_positions, like_result_position);
    }
}
CATCH
BENCHMARK_REGISTER_F(LikeBench, like)->Iterations(10);

} // namespace tests
} // namespace DB
Loading

0 comments on commit 2133f4f

Please sign in to comment.