Skip to content

Commit

Permalink
lowerUTF8/upperUTF8: allow lower/uppercase characters to occupy different…
Browse files Browse the repository at this point in the history
… number of bytes (#8622) (#8667)

close #8484
  • Loading branch information
ti-chi-bot authored Jan 24, 2024
1 parent 3131f5a commit ddea3a4
Show file tree
Hide file tree
Showing 6 changed files with 328 additions and 114 deletions.
2 changes: 1 addition & 1 deletion dbms/src/Common/UTF8Helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ inline void syncForward(const UInt8 *& s, const UInt8 * const end)
/// Returns the UTF-8 code point sequence length, judging by its first octet.
inline size_t seqLength(const UInt8 first_octet)
{
if (first_octet < 0x80u)
if (first_octet < 0x80 || first_octet >= 0xF8) /// The specs of UTF-8.
return 1;

const size_t bits = 8;
Expand Down
135 changes: 135 additions & 0 deletions dbms/src/Functions/CharUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,141 @@ const CaseRange caseRange[]{
{0x1F00, 0x1F07, {8, 0, 8}},
{0x1F08, 0x1F0F, {0, -8, 0}},
{0x1F10, 0x1F15, {8, 0, 8}},
{0x1F18, 0x1F1D, {0, -8, 0}},
{0x1F20, 0x1F27, {8, 0, 8}},
{0x1F28, 0x1F2F, {0, -8, 0}},
{0x1F30, 0x1F37, {8, 0, 8}},
{0x1F38, 0x1F3F, {0, -8, 0}},
{0x1F40, 0x1F45, {8, 0, 8}},
{0x1F48, 0x1F4D, {0, -8, 0}},
{0x1F51, 0x1F51, {8, 0, 8}},
{0x1F53, 0x1F53, {8, 0, 8}},
{0x1F55, 0x1F55, {8, 0, 8}},
{0x1F57, 0x1F57, {8, 0, 8}},
{0x1F59, 0x1F59, {0, -8, 0}},
{0x1F5B, 0x1F5B, {0, -8, 0}},
{0x1F5D, 0x1F5D, {0, -8, 0}},
{0x1F5F, 0x1F5F, {0, -8, 0}},
{0x1F60, 0x1F67, {8, 0, 8}},
{0x1F68, 0x1F6F, {0, -8, 0}},
{0x1F70, 0x1F71, {74, 0, 74}},
{0x1F72, 0x1F75, {86, 0, 86}},
{0x1F76, 0x1F77, {100, 0, 100}},
{0x1F78, 0x1F79, {128, 0, 128}},
{0x1F7A, 0x1F7B, {112, 0, 112}},
{0x1F7C, 0x1F7D, {126, 0, 126}},
{0x1F80, 0x1F87, {8, 0, 8}},
{0x1F88, 0x1F8F, {0, -8, 0}},
{0x1F90, 0x1F97, {8, 0, 8}},
{0x1F98, 0x1F9F, {0, -8, 0}},
{0x1FA0, 0x1FA7, {8, 0, 8}},
{0x1FA8, 0x1FAF, {0, -8, 0}},
{0x1FB0, 0x1FB1, {8, 0, 8}},
{0x1FB3, 0x1FB3, {9, 0, 9}},
{0x1FB8, 0x1FB9, {0, -8, 0}},
{0x1FBA, 0x1FBB, {0, -74, 0}},
{0x1FBC, 0x1FBC, {0, -9, 0}},
{0x1FBE, 0x1FBE, {-7205, 0, -7205}},
{0x1FC3, 0x1FC3, {9, 0, 9}},
{0x1FC8, 0x1FCB, {0, -86, 0}},
{0x1FCC, 0x1FCC, {0, -9, 0}},
{0x1FD0, 0x1FD1, {8, 0, 8}},
{0x1FD8, 0x1FD9, {0, -8, 0}},
{0x1FDA, 0x1FDB, {0, -100, 0}},
{0x1FE0, 0x1FE1, {8, 0, 8}},
{0x1FE5, 0x1FE5, {7, 0, 7}},
{0x1FE8, 0x1FE9, {0, -8, 0}},
{0x1FEA, 0x1FEB, {0, -112, 0}},
{0x1FEC, 0x1FEC, {0, -7, 0}},
{0x1FF3, 0x1FF3, {9, 0, 9}},
{0x1FF8, 0x1FF9, {0, -128, 0}},
{0x1FFA, 0x1FFB, {0, -126, 0}},
{0x1FFC, 0x1FFC, {0, -9, 0}},
{0x2126, 0x2126, {0, -7517, 0}},
{0x212A, 0x212A, {0, -8383, 0}},
{0x212B, 0x212B, {0, -8262, 0}},
{0x2132, 0x2132, {0, 28, 0}},
{0x214E, 0x214E, {-28, 0, -28}},
{0x2160, 0x216F, {0, 16, 0}},
{0x2170, 0x217F, {-16, 0, -16}},
{0x2183, 0x2184, {UpperLower, UpperLower, UpperLower}},
{0x24B6, 0x24CF, {0, 26, 0}},
{0x24D0, 0x24E9, {-26, 0, -26}},
{0x2C00, 0x2C2F, {0, 48, 0}},
{0x2C30, 0x2C5F, {-48, 0, -48}},
{0x2C60, 0x2C61, {UpperLower, UpperLower, UpperLower}},
{0x2C62, 0x2C62, {0, -10743, 0}},
{0x2C63, 0x2C63, {0, -3814, 0}},
{0x2C64, 0x2C64, {0, -10727, 0}},
{0x2C65, 0x2C65, {-10795, 0, -10795}},
{0x2C66, 0x2C66, {-10792, 0, -10792}},
{0x2C67, 0x2C6C, {UpperLower, UpperLower, UpperLower}},
{0x2C6D, 0x2C6D, {0, -10780, 0}},
{0x2C6E, 0x2C6E, {0, -10749, 0}},
{0x2C6F, 0x2C6F, {0, -10783, 0}},
{0x2C70, 0x2C70, {0, -10782, 0}},
{0x2C72, 0x2C73, {UpperLower, UpperLower, UpperLower}},
{0x2C75, 0x2C76, {UpperLower, UpperLower, UpperLower}},
{0x2C7E, 0x2C7F, {0, -10815, 0}},
{0x2C80, 0x2CE3, {UpperLower, UpperLower, UpperLower}},
{0x2CEB, 0x2CEE, {UpperLower, UpperLower, UpperLower}},
{0x2CF2, 0x2CF3, {UpperLower, UpperLower, UpperLower}},
{0x2D00, 0x2D25, {-7264, 0, -7264}},
{0x2D27, 0x2D27, {-7264, 0, -7264}},
{0x2D2D, 0x2D2D, {-7264, 0, -7264}},
{0xA640, 0xA66D, {UpperLower, UpperLower, UpperLower}},
{0xA680, 0xA69B, {UpperLower, UpperLower, UpperLower}},
{0xA722, 0xA72F, {UpperLower, UpperLower, UpperLower}},
{0xA732, 0xA76F, {UpperLower, UpperLower, UpperLower}},
{0xA779, 0xA77C, {UpperLower, UpperLower, UpperLower}},
{0xA77D, 0xA77D, {0, -35332, 0}},
{0xA77E, 0xA787, {UpperLower, UpperLower, UpperLower}},
{0xA78B, 0xA78C, {UpperLower, UpperLower, UpperLower}},
{0xA78D, 0xA78D, {0, -42280, 0}},
{0xA790, 0xA793, {UpperLower, UpperLower, UpperLower}},
{0xA794, 0xA794, {48, 0, 48}},
{0xA796, 0xA7A9, {UpperLower, UpperLower, UpperLower}},
{0xA7AA, 0xA7AA, {0, -42308, 0}},
{0xA7AB, 0xA7AB, {0, -42319, 0}},
{0xA7AC, 0xA7AC, {0, -42315, 0}},
{0xA7AD, 0xA7AD, {0, -42305, 0}},
{0xA7AE, 0xA7AE, {0, -42308, 0}},
{0xA7B0, 0xA7B0, {0, -42258, 0}},
{0xA7B1, 0xA7B1, {0, -42282, 0}},
{0xA7B2, 0xA7B2, {0, -42261, 0}},
{0xA7B3, 0xA7B3, {0, 928, 0}},
{0xA7B4, 0xA7C3, {UpperLower, UpperLower, UpperLower}},
{0xA7C4, 0xA7C4, {0, -48, 0}},
{0xA7C5, 0xA7C5, {0, -42307, 0}},
{0xA7C6, 0xA7C6, {0, -35384, 0}},
{0xA7C7, 0xA7CA, {UpperLower, UpperLower, UpperLower}},
{0xA7D0, 0xA7D1, {UpperLower, UpperLower, UpperLower}},
{0xA7D6, 0xA7D9, {UpperLower, UpperLower, UpperLower}},
{0xA7F5, 0xA7F6, {UpperLower, UpperLower, UpperLower}},
{0xAB53, 0xAB53, {-928, 0, -928}},
{0xAB70, 0xABBF, {-38864, 0, -38864}},
{0xFF21, 0xFF3A, {0, 32, 0}},
{0xFF41, 0xFF5A, {-32, 0, -32}},
{0x10400, 0x10427, {0, 40, 0}},
{0x10428, 0x1044F, {-40, 0, -40}},
{0x104B0, 0x104D3, {0, 40, 0}},
{0x104D8, 0x104FB, {-40, 0, -40}},
{0x10570, 0x1057A, {0, 39, 0}},
{0x1057C, 0x1058A, {0, 39, 0}},
{0x1058C, 0x10592, {0, 39, 0}},
{0x10594, 0x10595, {0, 39, 0}},
{0x10597, 0x105A1, {-39, 0, -39}},
{0x105A3, 0x105B1, {-39, 0, -39}},
{0x105B3, 0x105B9, {-39, 0, -39}},
{0x105BB, 0x105BC, {-39, 0, -39}},
{0x10C80, 0x10CB2, {0, 64, 0}},
{0x10CC0, 0x10CF2, {-64, 0, -64}},
{0x118A0, 0x118BF, {0, 32, 0}},
{0x118C0, 0x118DF, {-32, 0, -32}},
{0x16E40, 0x16E5F, {0, 32, 0}},
{0x16E60, 0x16E7F, {-32, 0, -32}},
{0x1E900, 0x1E921, {0, 34, 0}},
{0x1E922, 0x1E943, {-34, 0, -34}},
};

inline int toCase(int _case, int ch)
Expand Down
Loading

0 comments on commit ddea3a4

Please sign in to comment.