Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lowerUTF8/upperUTF8: allow lower/uppercase characters to occupy a different number of bytes (#8622) #8667

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dbms/src/Common/UTF8Helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ inline void syncForward(const UInt8 *& s, const UInt8 * const end)
/// Returns the UTF-8 code point sequence length, judging by its first octet
inline size_t seqLength(const UInt8 first_octet)
{
if (first_octet < 0x80u)
if (first_octet < 0x80 || first_octet >= 0xF8) /// The specs of UTF-8.
return 1;

const size_t bits = 8;
Expand Down
196 changes: 118 additions & 78 deletions dbms/src/Functions/FunctionsString.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,25 +422,62 @@ template <char not_case_lower_bound,
char flip_case_mask,
int to_case(int)>
__attribute__((always_inline)) inline void toCaseImplTiDB(
ConstPtr<UInt8> & src,
const ConstPtr<UInt8> src_end,
Ptr<UInt8> & dst)
const UInt8 *& src,
const UInt8 * src_end,
size_t offsets_pos,
ColumnString::Chars_t & dst_data,
IColumn::Offsets & dst_offsets,
bool & is_diff_offsets)
{
if (src[0] <= ascii_upper_bound)
if (*src <= ascii_upper_bound)
{
size_t dst_size = dst_data.size();
dst_data.resize(dst_size + 1);
if (*src >= not_case_lower_bound && *src <= not_case_upper_bound)
*dst++ = *src++ ^ flip_case_mask;
dst_data[dst_size] = *src++ ^ flip_case_mask;
else
*dst++ = *src++;
dst_data[dst_size] = *src++;
}
else
{
static const Poco::UTF8Encoding utf8;

if (const auto chars = utf8.convert(to_case(utf8.convert(src)), dst, src_end - src))
src += chars, dst += chars;
else
++src, ++dst;
int src_sequence_length = utf8.sequenceLength(src, 1);
assert(src_sequence_length > 0);
if unlikely (src + src_sequence_length > src_end)
{
/// If this row has invalid utf-8 characters, just copy it to dst string and do not influence others
size_t dst_size = dst_data.size();
dst_data.resize(src_end - src + dst_size);
memcpy(&dst_data[dst_size], src, src_end - src);
src = src_end;
return;
}

int src_ch = utf8.convert(src);
if unlikely (src_ch == -1)
{
/// If this row has invalid utf-8 characters, just copy it to dst string and do not influence others
size_t dst_size = dst_data.size();
dst_data.resize(dst_size + src_sequence_length);
memcpy(&dst_data[dst_size], src, src_sequence_length);
src += src_sequence_length;
return;
}
int dst_ch = to_case(src_ch);
int dst_sequence_length = utf8.convert(dst_ch, nullptr, 0);
size_t dst_size = dst_data.size();
dst_data.resize(dst_size + dst_sequence_length);
utf8.convert(dst_ch, &dst_data[dst_size], dst_sequence_length);

if (dst_sequence_length != src_sequence_length)
{
assert((Int64)dst_offsets[offsets_pos] + dst_sequence_length - src_sequence_length >= 0);
dst_offsets[offsets_pos] += dst_sequence_length - src_sequence_length;
is_diff_offsets = true;
}

src += src_sequence_length;
}
}

Expand Down Expand Up @@ -543,12 +580,19 @@ TIFLASH_DECLARE_MULTITARGET_FUNCTION_TP(
to_case),
void,
lowerUpperUTF8ArrayImplTiDB,
(src, src_end, dst),
(ConstPtr<UInt8> & src,
const ConstPtr<UInt8> src_end,
Ptr<UInt8> & dst),
{
(src_data, src_offsets, dst_data, dst_offsets),
(const ColumnString::Chars_t & src_data,
const IColumn::Offsets & src_offsets,
ColumnString::Chars_t & dst_data,
IColumn::Offsets & dst_offsets),
{
dst_data.reserve(src_data.size());
dst_offsets.assign(src_offsets);
static const auto flip_mask = SimdWord::template fromSingle<int8_t>(flip_case_mask);
const UInt8 *src = src_data.data(), *src_end = src_data.data() + src_data.size();
auto * begin = src;
bool is_diff_offsets = false;
size_t offsets_pos = 0;
while (src + WORD_SIZE < src_end)
{
auto word = SimdWord::fromUnaligned(src);
Expand All @@ -563,31 +607,71 @@ TIFLASH_DECLARE_MULTITARGET_FUNCTION_TP(
range_check.as_int8 = (word.as_int8 >= lower_bounds.as_int8) & (word.as_int8 <= upper_bounds.as_int8);
selected.as_int8 = range_check.as_int8 & flip_mask.as_int8;
word.as_int8 ^= selected.as_int8;
word.toUnaligned(dst);
size_t dst_size = dst_data.size();
dst_data.resize(dst_size + WORD_SIZE);
word.toUnaligned(&dst_data[dst_size]);
src += WORD_SIZE;
dst += WORD_SIZE;
}
else
{
size_t offset_from_begin = src - begin;
while (offset_from_begin >= src_offsets[offsets_pos])
++offsets_pos;
auto expected_end = src + WORD_SIZE;
while (src < expected_end)
while (true)
{
const UInt8 * row_end = begin + src_offsets[offsets_pos];
assert(row_end >= src);
auto end = std::min(expected_end, row_end);
while (src < end)
{
toCaseImplTiDB<
not_case_lower_bound,
not_case_upper_bound,
ascii_upper_bound,
flip_case_mask,
to_case>(src, row_end, offsets_pos, dst_data, dst_offsets, is_diff_offsets);
}
if (src >= expected_end)
break;
++offsets_pos;
}
}
}

if (src < src_end)
{
size_t offset_from_begin = src - begin;
while (offset_from_begin >= src_offsets[offsets_pos])
++offsets_pos;

while (src < src_end)
{
const UInt8 * row_end = begin + src_offsets[offsets_pos];
assert(row_end >= src);
while (src < row_end)
{
toCaseImplTiDB<
not_case_lower_bound,
not_case_upper_bound,
ascii_upper_bound,
flip_case_mask,
to_case>(src, src_end, dst);
to_case>(src, row_end, offsets_pos, dst_data, dst_offsets, is_diff_offsets);
}
++offsets_pos;
}
}

if unlikely (is_diff_offsets)
{
Int64 diff = 0;
for (size_t i = 0; i < dst_offsets.size(); ++i)
{
/// diff is the cumulative offset difference from 0 to the i position
diff += (Int64)dst_offsets[i] - (Int64)src_offsets[i];
dst_offsets[i] = src_offsets[i] + diff;
}
}
while (src < src_end)
toCaseImplTiDB<
not_case_lower_bound,
not_case_upper_bound,
ascii_upper_bound,
flip_case_mask,
to_case>(src, src_end, dst);
})
} // namespace

Expand Down Expand Up @@ -618,66 +702,22 @@ void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>
ColumnString::Chars_t & res_data,
IColumn::Offsets & res_offsets)
{
res_data.resize(data.size());
res_offsets.assign(offsets);
array(data.data(), data.data() + data.size(), res_data.data());
lowerUpperUTF8ArrayImplTiDB<not_case_lower_bound, not_case_upper_bound, ascii_upper_bound, flip_case_mask, to_case>(
data,
offsets,
res_data,
res_offsets);
}

template <char not_case_lower_bound,
char not_case_upper_bound,
int to_case(int)>
void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>::vectorFixed(
const ColumnString::Chars_t & data,
const ColumnString::Chars_t & /*data*/,
size_t /*n*/,
ColumnString::Chars_t & res_data)
{
res_data.resize(data.size());
array(data.data(), data.data() + data.size(), res_data.data());
}

template <char not_case_lower_bound,
char not_case_upper_bound,
int to_case(int)>
void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>::constant(
const std::string & data,
std::string & res_data)
ColumnString::Chars_t & /*res_data*/)
{
res_data.resize(data.size());
array(reinterpret_cast<const UInt8 *>(data.data()),
reinterpret_cast<const UInt8 *>(data.data() + data.size()),
reinterpret_cast<UInt8 *>(&res_data[0]));
}

template <char not_case_lower_bound,
char not_case_upper_bound,
int to_case(int)>
void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>::toCase(
const UInt8 *& src,
const UInt8 * src_end,
UInt8 *& dst)
{
toCaseImplTiDB<
not_case_lower_bound,
not_case_upper_bound,
ascii_upper_bound,
flip_case_mask,
to_case>(src, src_end, dst);
}

template <char not_case_lower_bound,
char not_case_upper_bound,
int to_case(int)>
void TiDBLowerUpperUTF8Impl<not_case_lower_bound, not_case_upper_bound, to_case>::array(
const UInt8 * src,
const UInt8 * src_end,
UInt8 * dst)
{
lowerUpperUTF8ArrayImplTiDB<
not_case_lower_bound,
not_case_upper_bound,
ascii_upper_bound,
flip_case_mask,
to_case>(src, src_end, dst);
throw Exception("Cannot apply function TiDBLowerUpperUTF8 to fixed string.", ErrorCodes::ILLEGAL_COLUMN);
}

/** If the string is encoded in UTF-8, then it selects a substring of code points in it.
Expand Down
12 changes: 2 additions & 10 deletions dbms/src/Functions/FunctionsString.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,13 +189,13 @@ class FunctionStringToString : public IFunction
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override
{
const ColumnPtr column = block.getByPosition(arguments[0]).column;
if (const ColumnString * col = checkAndGetColumn<ColumnString>(column.get()))
if (const auto * col = checkAndGetColumn<ColumnString>(column.get()))
{
auto col_res = ColumnString::create();
Impl::vector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets());
block.getByPosition(result).column = std::move(col_res);
}
else if (const ColumnFixedString * col = checkAndGetColumn<ColumnFixedString>(column.get()))
else if (const auto * col = checkAndGetColumn<ColumnFixedString>(column.get()))
{
auto col_res = ColumnFixedString::create(col->getN());
Impl::vectorFixed(col->getChars(), col->getN(), col_res->getChars());
Expand All @@ -220,17 +220,9 @@ struct TiDBLowerUpperUTF8Impl

static void vectorFixed(const ColumnString::Chars_t & data, size_t n, ColumnString::Chars_t & res_data);

static void constant(const std::string & data, std::string & res_data);

/** Converts a single code point starting at `src` to desired case, storing result starting at `dst`.
* `src` and `dst` are incremented by corresponding sequence lengths. */
static void toCase(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst);

private:
static constexpr auto ascii_upper_bound = '\x7f';
static constexpr auto flip_case_mask = 'A' ^ 'a';

static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst);
};

struct TiDBLowerUpperBinaryImpl
Expand Down
32 changes: 29 additions & 3 deletions dbms/src/Functions/tests/gtest_strings_lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,35 @@ class StringLower : public DB::tests::FunctionTest

TEST_F(StringLower, lowerAll)
{
std::vector<std::optional<String>> candidate_strings = {"one WEEK’S time TEST", "abc测试def", "ABCテストabc", "ЀЁЂѓЄЅІїЈЉЊЋЌѝЎЏ", "+Ѐ-ё*Ђ/ѓ!Є@Ѕ#І$@Ї%Ј……љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", "ΑΒΓΔΕΖΗΘικΛΜΝΞΟΠΡΣτΥΦΧΨωΣ", "▲Α▼Βγ➨ΔΕ☎ΖΗ✂ΘΙ€ΚΛ♫ΜΝ✓ΞΟ✚ΠΡ℉ΣΤ♥ΥΦ♖ΧΨ♘Ω★Σ✕", "թՓՁՋՐՉՃԺԾՔՈԵՌՏԸՒԻՕՊԱՍԴՖԳՀՅԿԼԽԶՂՑՎԲՆմՇ"};
std::vector<std::optional<String>> lower_case_strings = {"one week’s time test", "abc测试def", "abcテストabc", "ѐёђѓєѕіїјљњћќѝўџ", "+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……љ&њ(ћ)ќ¥ѝ#ў@џ!^", "αβγδεζηθικλμνξοπρστυφχψωσ", "▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★σ✕", "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"};

std::vector<std::optional<String>> candidate_strings
= {"one WEEK'S time TEST",
"abc测试def",
"ABCテストabc",
"ЀЁЂѓЄЅІїЈЉЊЋЌѝЎЏ",
"+Ѐ-ё*Ђ/ѓ!Є@Ѕ#І$@Ї%Ј……љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^",
"İaSdİİİİdDS",
"ΑΒΓΔΕΖΗΘικΛΜΝΞΟΠΡΣτΥΦΧΨωΣ",
"ȺDȺİȺaȺȾOİȺ",
"TEST_WRONG_UTF8_1\x80\xe0\x21",
"▲Α▼Βγ➨ΔΕ☎ΖΗ✂ΘΙ€ΚΛ♫ΜΝ✓ΞΟ✚ΠΡ℉ΣΤ♥ΥΦ♖ΧΨ♘Ω★Σ✕",
"ⱮⱭȺΩABCDEFGHIJKLMNOPꞍaȾ",
"TEST_WRONG_UTF8_2\xf1\x22",
"թՓՁՋՐՉՃԺԾՔՈԵՌՏԸՒԻՕՊԱՍԴՖԳՀՅԿԼԽԶՂՑՎԲՆմՇ"};

std::vector<std::optional<String>> lower_case_strings
= {"one week's time test",
"abc测试def",
"abcテストabc",
"ѐёђѓєѕіїјљњћќѝўџ",
"+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……љ&њ(ћ)ќ¥ѝ#ў@џ!^",
"iasdiiiidds",
"αβγδεζηθικλμνξοπρστυφχψωσ",
"ⱥdⱥiⱥaⱥⱦoiⱥ",
"test_wrong_utf8_1\x80\xe0\x21",
"▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★σ✕",
"ɱɑⱥωabcdefghijklmnopɥaⱦ",
"test_wrong_utf8_2\xf1\x22",
"թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"};

ASSERT_COLUMN_EQ(
toNullableVec(lower_case_strings),
Expand Down
Loading