From 0618c04d350fec5d1ba3f6cca3cfca72007d1ab3 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Fri, 19 Jan 2024 12:42:17 +0800 Subject: [PATCH] lowerUTF8/upperUTF8: allow lower/uppercase characters to occupy a different number of bytes (#8622) (#8669) close pingcap/tiflash#8484 --- dbms/src/Common/UTF8Helpers.h | 2 +- dbms/src/Functions/FunctionsString.cpp | 183 +++++++++++------- dbms/src/Functions/FunctionsString.h | 12 +- .../Functions/tests/gtest_strings_lower.cpp | 15 +- .../Functions/tests/gtest_strings_upper.cpp | 122 +++++------- 5 files changed, 179 insertions(+), 155 deletions(-) diff --git a/dbms/src/Common/UTF8Helpers.h b/dbms/src/Common/UTF8Helpers.h index b2c5d8415c3..54c849fb14a 100644 --- a/dbms/src/Common/UTF8Helpers.h +++ b/dbms/src/Common/UTF8Helpers.h @@ -66,7 +66,7 @@ inline void syncForward(const UInt8 *& s, const UInt8 * const end) /// returns UTF-8 code point sequence length judging by it's first octet inline size_t seqLength(const UInt8 first_octet) { - if (first_octet < 0x80u) + if (first_octet < 0x80 || first_octet >= 0xF8) /// ASCII bytes are one-byte sequences; bytes >= 0xF8 can never start a valid UTF-8 sequence (RFC 3629), so each is treated as a single byte too.
return 1; const size_t bits = 8; diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index 3cccf5dcb08..1d02f52502c 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -437,25 +437,62 @@ template < char flip_case_mask, int to_case(int)> __attribute__((always_inline)) inline void toCaseImplTiDB( - ConstPtr & src, - const ConstPtr src_end, - Ptr & dst) + const UInt8 *& src, + const UInt8 * src_end, + size_t offsets_pos, + ColumnString::Chars_t & dst_data, + IColumn::Offsets & dst_offsets, + bool & is_diff_offsets) { - if (src[0] <= ascii_upper_bound) + if (*src <= ascii_upper_bound) { + size_t dst_size = dst_data.size(); + dst_data.resize(dst_size + 1); if (*src >= not_case_lower_bound && *src <= not_case_upper_bound) - *dst++ = *src++ ^ flip_case_mask; + dst_data[dst_size] = *src++ ^ flip_case_mask; else - *dst++ = *src++; + dst_data[dst_size] = *src++; } else { static const Poco::UTF8Encoding utf8; - if (const auto chars = utf8.convert(to_case(utf8.convert(src)), dst, src_end - src)) - src += chars, dst += chars; - else - ++src, ++dst; + int src_sequence_length = utf8.sequenceLength(src, 1); + assert(src_sequence_length > 0); + if unlikely (src + src_sequence_length > src_end) + { + /// If this row has invalid utf-8 characters, just copy it to dst string and do not influence others + size_t dst_size = dst_data.size(); + dst_data.resize(src_end - src + dst_size); + memcpy(&dst_data[dst_size], src, src_end - src); + src = src_end; + return; + } + + int src_ch = utf8.convert(src); + if unlikely (src_ch == -1) + { + /// If this row has invalid utf-8 characters, just copy it to dst string and do not influence others + size_t dst_size = dst_data.size(); + dst_data.resize(dst_size + src_sequence_length); + memcpy(&dst_data[dst_size], src, src_sequence_length); + src += src_sequence_length; + return; + } + int dst_ch = to_case(src_ch); + int dst_sequence_length = utf8.convert(dst_ch, 
nullptr, 0); + size_t dst_size = dst_data.size(); + dst_data.resize(dst_size + dst_sequence_length); + utf8.convert(dst_ch, &dst_data[dst_size], dst_sequence_length); + + if (dst_sequence_length != src_sequence_length) + { + assert((Int64)dst_offsets[offsets_pos] + dst_sequence_length - src_sequence_length >= 0); + dst_offsets[offsets_pos] += dst_sequence_length - src_sequence_length; + is_diff_offsets = true; + } + + src += src_sequence_length; } } @@ -548,10 +585,19 @@ TIFLASH_DECLARE_MULTITARGET_FUNCTION_TP( (not_case_lower_bound, not_case_upper_bound, ascii_upper_bound, flip_case_mask, to_case), void, lowerUpperUTF8ArrayImplTiDB, - (src, src_end, dst), - (ConstPtr & src, const ConstPtr src_end, Ptr & dst), - { + (src_data, src_offsets, dst_data, dst_offsets), + (const ColumnString::Chars_t & src_data, + const IColumn::Offsets & src_offsets, + ColumnString::Chars_t & dst_data, + IColumn::Offsets & dst_offsets), + { + dst_data.reserve(src_data.size()); + dst_offsets.assign(src_offsets); static const auto flip_mask = SimdWord::template fromSingle(flip_case_mask); + const UInt8 *src = src_data.data(), *src_end = src_data.data() + src_data.size(); + auto * begin = src; + bool is_diff_offsets = false; + size_t offsets_pos = 0; while (src + WORD_SIZE < src_end) { auto word = SimdWord::fromUnaligned(src); @@ -566,29 +612,71 @@ TIFLASH_DECLARE_MULTITARGET_FUNCTION_TP( range_check.as_int8 = (word.as_int8 >= lower_bounds.as_int8) & (word.as_int8 <= upper_bounds.as_int8); selected.as_int8 = range_check.as_int8 & flip_mask.as_int8; word.as_int8 ^= selected.as_int8; - word.toUnaligned(dst); + size_t dst_size = dst_data.size(); + dst_data.resize(dst_size + WORD_SIZE); + word.toUnaligned(&dst_data[dst_size]); src += WORD_SIZE; - dst += WORD_SIZE; } else { + size_t offset_from_begin = src - begin; + while (offset_from_begin >= src_offsets[offsets_pos]) + ++offsets_pos; auto expected_end = src + WORD_SIZE; - while (src < expected_end) + while (true) + { + const UInt8 * row_end = 
begin + src_offsets[offsets_pos]; + assert(row_end >= src); + auto end = std::min(expected_end, row_end); + while (src < end) + { + toCaseImplTiDB< + not_case_lower_bound, + not_case_upper_bound, + ascii_upper_bound, + flip_case_mask, + to_case>(src, row_end, offsets_pos, dst_data, dst_offsets, is_diff_offsets); + } + if (src >= expected_end) + break; + ++offsets_pos; + } + } + } + + if (src < src_end) + { + size_t offset_from_begin = src - begin; + while (offset_from_begin >= src_offsets[offsets_pos]) + ++offsets_pos; + + while (src < src_end) + { + const UInt8 * row_end = begin + src_offsets[offsets_pos]; + assert(row_end >= src); + while (src < row_end) { toCaseImplTiDB< not_case_lower_bound, not_case_upper_bound, ascii_upper_bound, flip_case_mask, - to_case>(src, src_end, dst); + to_case>(src, row_end, offsets_pos, dst_data, dst_offsets, is_diff_offsets); } + ++offsets_pos; + } + } + + if unlikely (is_diff_offsets) + { + Int64 diff = 0; + for (size_t i = 0; i < dst_offsets.size(); ++i) + { + /// diff is the cumulative offset difference from 0 to the i position + diff += (Int64)dst_offsets[i] - (Int64)src_offsets[i]; + dst_offsets[i] = src_offsets[i] + diff; } } - while (src < src_end) - toCaseImplTiDB( - src, - src_end, - dst); }) } // namespace @@ -618,55 +706,20 @@ void TiDBLowerUpperUTF8Impl ColumnString::Chars_t & res_data, IColumn::Offsets & res_offsets) { - res_data.resize(data.size()); - res_offsets.assign(offsets); - array(data.data(), data.data() + data.size(), res_data.data()); + lowerUpperUTF8ArrayImplTiDB( + data, + offsets, + res_data, + res_offsets); } template void TiDBLowerUpperUTF8Impl::vectorFixed( - const ColumnString::Chars_t & data, + const ColumnString::Chars_t & /*data*/, size_t /*n*/, - ColumnString::Chars_t & res_data) -{ - res_data.resize(data.size()); - array(data.data(), data.data() + data.size(), res_data.data()); -} - -template -void TiDBLowerUpperUTF8Impl::constant( - const std::string & data, - std::string & res_data) + 
ColumnString::Chars_t & /*res_data*/) { - res_data.resize(data.size()); - array( - reinterpret_cast(data.data()), - reinterpret_cast(data.data() + data.size()), - reinterpret_cast(&res_data[0])); -} - -template -void TiDBLowerUpperUTF8Impl::toCase( - const UInt8 *& src, - const UInt8 * src_end, - UInt8 *& dst) -{ - toCaseImplTiDB( - src, - src_end, - dst); -} - -template -void TiDBLowerUpperUTF8Impl::array( - const UInt8 * src, - const UInt8 * src_end, - UInt8 * dst) -{ - lowerUpperUTF8ArrayImplTiDB( - src, - src_end, - dst); + throw Exception("Cannot apply function TiDBLowerUpperUTF8 to fixed string.", ErrorCodes::ILLEGAL_COLUMN); } /** If the string is encoded in UTF-8, then it selects a substring of code points in it. diff --git a/dbms/src/Functions/FunctionsString.h b/dbms/src/Functions/FunctionsString.h index 12f63c96f7b..e9abf90bf9a 100644 --- a/dbms/src/Functions/FunctionsString.h +++ b/dbms/src/Functions/FunctionsString.h @@ -179,13 +179,13 @@ class FunctionStringToString : public IFunction void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override { const ColumnPtr column = block.getByPosition(arguments[0]).column; - if (const ColumnString * col = checkAndGetColumn(column.get())) + if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = ColumnString::create(); Impl::vector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets()); block.getByPosition(result).column = std::move(col_res); } - else if (const ColumnFixedString * col = checkAndGetColumn(column.get())) + else if (const auto * col = checkAndGetColumn(column.get())) { auto col_res = ColumnFixedString::create(col->getN()); Impl::vectorFixed(col->getChars(), col->getN(), col_res->getChars()); @@ -212,17 +212,9 @@ struct TiDBLowerUpperUTF8Impl static void vectorFixed(const ColumnString::Chars_t & data, size_t n, ColumnString::Chars_t & res_data); - static void constant(const std::string & data, std::string & res_data); - - /** 
Converts a single code point starting at `src` to desired case, storing result starting at `dst`. - * `src` and `dst` are incremented by corresponding sequence lengths. */ - static void toCase(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst); - private: static constexpr auto ascii_upper_bound = '\x7f'; static constexpr auto flip_case_mask = 'A' ^ 'a'; - - static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst); }; struct TiDBLowerUpperBinaryImpl diff --git a/dbms/src/Functions/tests/gtest_strings_lower.cpp b/dbms/src/Functions/tests/gtest_strings_lower.cpp index 77c7579038d..7d49fa2a44a 100644 --- a/dbms/src/Functions/tests/gtest_strings_lower.cpp +++ b/dbms/src/Functions/tests/gtest_strings_lower.cpp @@ -60,22 +60,33 @@ class StringLower : public DB::tests::FunctionTest TEST_F(StringLower, lowerAll) { std::vector> candidate_strings - = {"one WEEK’S time TEST", + = {"one WEEK'S time TEST", "abc测试def", "ABCテストabc", "ЀЁЂѓЄЅІїЈЉЊЋЌѝЎЏ", "+Ѐ-ё*Ђ/ѓ!Є@Ѕ#І$@Ї%Ј……љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", + "İaSdİİİİdDS", "ΑΒΓΔΕΖΗΘικΛΜΝΞΟΠΡΣτΥΦΧΨωΣ", + "ȺDȺİȺaȺȾOİȺ", + "TEST_WRONG_UTF8_1\x80\xe0\x21", "▲Α▼Βγ➨ΔΕ☎ΖΗ✂ΘΙ€ΚΛ♫ΜΝ✓ΞΟ✚ΠΡ℉ΣΤ♥ΥΦ♖ΧΨ♘Ω★Σ✕", + "ⱮⱭȺΩABCDEFGHIJKLMNOPꞍaȾ", + "TEST_WRONG_UTF8_2\xf1\x22", "թՓՁՋՐՉՃԺԾՔՈԵՌՏԸՒԻՕՊԱՍԴՖԳՀՅԿԼԽԶՂՑՎԲՆմՇ"}; + std::vector> lower_case_strings - = {"one week’s time test", + = {"one week's time test", "abc测试def", "abcテストabc", "ѐёђѓєѕіїјљњћќѝўџ", "+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……љ&њ(ћ)ќ¥ѝ#ў@џ!^", + "iasdiiiidds", "αβγδεζηθικλμνξοπρστυφχψωσ", + "ⱥdⱥiⱥaⱥⱦoiⱥ", + "test_wrong_utf8_1\x80\xe0\x21", "▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★σ✕", + "ɱɑⱥωabcdefghijklmnopɥaⱦ", + "test_wrong_utf8_2\xf1\x22", "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}; diff --git a/dbms/src/Functions/tests/gtest_strings_upper.cpp b/dbms/src/Functions/tests/gtest_strings_upper.cpp index a89dcf032a1..60086abd962 100644 --- a/dbms/src/Functions/tests/gtest_strings_upper.cpp +++ b/dbms/src/Functions/tests/gtest_strings_upper.cpp @@ -42,92 +42,60 @@ class StringUpper : public 
DB::tests::FunctionTest return createColumn>(v); } - static ColumnWithTypeAndName toVec(const std::vector & v) { return createColumn(v); } + static ColumnWithTypeAndName toVec(const std::vector> & v) + { + std::vector strings; + strings.reserve(v.size()); + for (std::optional s : v) + { + strings.push_back(s.value()); + } + + return createColumn(strings); + } static ColumnWithTypeAndName toConst(const String & s) { return createConstColumn(1, s); } }; TEST_F(StringUpper, upperAll) { - ASSERT_COLUMN_EQ( - toNullableVec( - {"ONE WEEK’S TIME TEST", - "ABC测试DEF", - "ABCテストABC", - "ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏ", - "+Ѐ-Ё*Ђ/Ѓ!Є@Ѕ#І$@Ї%Ј……Љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", - "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΣ", - "▲Α▼ΒΓ➨ΔΕ☎ΖΗ✂ΘΙ€ΚΛ♫ΜΝ✓ΞΟ✚ΠΡ℉ΣΤ♥ΥΦ♖ΧΨ♘Ω★Σ✕", - "ԹՓՁՋՐՉՃԺԾՔՈԵՌՏԸՒԻՕՊԱՍԴՖԳՀՅԿԼԽԶՂՑՎԲՆՄՇ"}), - executeFunction( - "upperUTF8", - toNullableVec( - {"one week’s time TEST", - "abc测试DeF", - "AbCテストAbC", - "ѐёђѓєѕіїјЉЊЋЌЍЎЏ", - "+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……Љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", - "αβγδεζηθικλμνξοπρστυφχψως", - "▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★ς✕", - "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}))); + std::vector> candidate_strings + = {"one week's time TEST", + "abc测试DeF", + "AbCテストAbC", + "ѐёђѓєѕіїјЉЊЋЌЍЎЏ", + "+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……Љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", + "ſⱥⱦⱥaſfɫoomɑɱɒ", + "αβγδεζηθικλμνξοπρστυφχψως", + "test_wrong_utf8_1\x80\xe0\x21", + "▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★ς✕", + "ȿɀabcdefghijklmnopɥı", + "test_wrong_utf8_2\xf1\x22", + "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}; + + std::vector> upper_case_strings + = {"ONE WEEK'S TIME TEST", + "ABC测试DEF", + "ABCテストABC", + "ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏ", + "+Ѐ-Ё*Ђ/Ѓ!Є@Ѕ#І$@Ї%Ј……Љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", + "SȺȾȺASFⱢOOMⱭⱮⱰ", + "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΣ", + "TEST_WRONG_UTF8_1\x80\xe0\x21", + "▲Α▼ΒΓ➨ΔΕ☎ΖΗ✂ΘΙ€ΚΛ♫ΜΝ✓ΞΟ✚ΠΡ℉ΣΤ♥ΥΦ♖ΧΨ♘Ω★Σ✕", + "ⱾⱿABCDEFGHIJKLMNOPꞍI", + "TEST_WRONG_UTF8_2\xf1\x22", + "ԹՓՁՋՐՉՃԺԾՔՈԵՌՏԸՒԻՕՊԱՍԴՖԳՀՅԿԼԽԶՂՑՎԲՆՄՇ"}; + + ASSERT_COLUMN_EQ(toNullableVec(upper_case_strings), executeFunction("upperUTF8", toNullableVec(candidate_strings))); + + 
ASSERT_COLUMN_EQ(toVec(upper_case_strings), executeFunction("upperUTF8", toVec(candidate_strings))); ASSERT_COLUMN_EQ( - toVec( - {"ONE WEEK’S TIME TEST", - "ABC测试DEF", - "ABCテストABC", - "ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏ", - "+Ѐ-Ё*Ђ/Ѓ!Є@Ѕ#І$@Ї%Ј……Љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", - "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΣ", - "▲Α▼ΒΓ➨ΔΕ☎ΖΗ✂ΘΙ€ΚΛ♫ΜΝ✓ΞΟ✚ΠΡ℉ΣΤ♥ΥΦ♖ΧΨ♘Ω★Σ✕", - "ԹՓՁՋՐՉՃԺԾՔՈԵՌՏԸՒԻՕՊԱՍԴՖԳՀՅԿԼԽԶՂՑՎԲՆՄՇ"}), - executeFunction( - "upperUTF8", - toVec( - {"one week’s time TEST", - "abc测试DeF", - "AbCテストAbC", - "ѐёђѓєѕіїјЉЊЋЌЍЎЏ", - "+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……Љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", - "αβγδεζηθικλμνξοπρστυφχψως", - "▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★ς✕", - "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}))); + toNullableVec(candidate_strings), + executeFunction("upperBinary", toNullableVec(candidate_strings))); - ASSERT_COLUMN_EQ( - toNullableVec( - {"one week’s time TEST", - "abc测试DeF", - "AbCテストAbC", - "ѐёђѓєѕіїјЉЊЋЌЍЎЏ", - "αβγδεζηθικλμνξοπρστυφχψως", - "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}), - executeFunction( - "upperBinary", - toNullableVec( - {"one week’s time TEST", - "abc测试DeF", - "AbCテストAbC", - "ѐёђѓєѕіїјЉЊЋЌЍЎЏ", - "αβγδεζηθικλμνξοπρστυφχψως", - "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}))); - - ASSERT_COLUMN_EQ( - toVec( - {"one week’s time TEST", - "abc测试DeF", - "AbCテストAbC", - "ѐёђѓєѕіїјЉЊЋЌЍЎЏ", - "αβγδεζηθικλμνξοπρστυφχψως", - "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}), - executeFunction( - "upperBinary", - toVec( - {"one week’s time TEST", - "abc测试DeF", - "AbCテストAbC", - "ѐёђѓєѕіїјЉЊЋЌЍЎЏ", - "αβγδεζηθικλμνξοπρστυφχψως", - "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}))); + ASSERT_COLUMN_EQ(toVec(candidate_strings), executeFunction("upperBinary", toVec(candidate_strings))); ASSERT_COLUMN_EQ(toConst("ONE WEEK’S TIME TEST"), executeFunction("upperUTF8", toConst("one week’s time TEST")));