Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(c++): Support the UTF-8 to UTF-16 with SIMD #1990

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 208 additions & 1 deletion cpp/fury/util/string_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,129 @@ inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) {
*utf8++ = static_cast<char>((code_point & 0x3F) | 0x80);
}

std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) {
std::u16string utf16;
utf16.reserve(utf8.size()); // Reserve space to avoid frequent reallocations

char buffer[64]; // Buffer to hold temporary UTF-16 results
char16_t *output =
reinterpret_cast<char16_t *>(buffer); // Use char16_t for output

size_t i = 0;
size_t n = utf8.size();

while (i + 32 <= n) {

for (int j = 0; j < 32; ++j) {
uint8_t byte = utf8[i + j];

if (byte < 0x80) {
// 1-byte character (ASCII)
*output++ = static_cast<char16_t>(byte);
} else if (byte < 0xE0) {
// 2-byte character
uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + j + 1] & 0x3F);
if (!is_little_endian) {
utf16_char = (utf16_char >> 8) |
(utf16_char << 8); // Swap bytes for big-endian
}
*output++ = utf16_char;
++j;
} else if (byte < 0xF0) {
// 3-byte character
uint16_t utf16_char = ((byte & 0x0F) << 12) |
((utf8[i + j + 1] & 0x3F) << 6) |
(utf8[i + j + 2] & 0x3F);
if (!is_little_endian) {
utf16_char = (utf16_char >> 8) |
(utf16_char << 8); // Swap bytes for big-endian
}
*output++ = utf16_char;
j += 2;
} else {
// 4-byte character (surrogate pair handling required)
uint32_t code_point =
((byte & 0x07) << 18) | ((utf8[i + j + 1] & 0x3F) << 12) |
((utf8[i + j + 2] & 0x3F) << 6) | (utf8[i + j + 3] & 0x3F);

// Convert the code point to a surrogate pair
uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10);
uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF);

if (!is_little_endian) {
high_surrogate = (high_surrogate >> 8) |
(high_surrogate << 8); // Swap bytes for big-endian
low_surrogate = (low_surrogate >> 8) |
(low_surrogate << 8); // Swap bytes for big-endian
}

*output++ = high_surrogate;
*output++ = low_surrogate;

j += 3;
}
}

// Append the processed buffer to the final utf16 string
utf16.append(reinterpret_cast<char16_t *>(buffer),
output - reinterpret_cast<char16_t *>(buffer));
output =
reinterpret_cast<char16_t *>(buffer); // Reset output buffer pointer
i += 32;
}

// Handle remaining characters
while (i < n) {
uint8_t byte = utf8[i];

if (byte < 0x80) {
*output++ = static_cast<char16_t>(byte);
} else if (byte < 0xE0) {
uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + 1] & 0x3F);
if (!is_little_endian) {
utf16_char =
(utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian
}
*output++ = utf16_char;
++i;
} else if (byte < 0xF0) {
uint16_t utf16_char = ((byte & 0x0F) << 12) |
((utf8[i + 1] & 0x3F) << 6) | (utf8[i + 2] & 0x3F);
if (!is_little_endian) {
utf16_char =
(utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian
}
*output++ = utf16_char;
i += 2;
} else {
uint32_t code_point = ((byte & 0x07) << 18) |
((utf8[i + 1] & 0x3F) << 12) |
((utf8[i + 2] & 0x3F) << 6) | (utf8[i + 3] & 0x3F);

uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10);
uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF);

if (!is_little_endian) {
high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8);
low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8);
}

*output++ = high_surrogate;
*output++ = low_surrogate;

i += 3;
}

++i;
}

// Append the last part of the buffer to the utf16 string
utf16.append(reinterpret_cast<char16_t *>(buffer),
output - reinterpret_cast<char16_t *>(buffer));

return utf16;
}

#if defined(__x86_64__) || defined(_M_X64)

bool isLatin(const std::string &str) {
Expand Down Expand Up @@ -168,6 +291,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
return utf8;
}

std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) {
return utf8ToUtf16SIMD(utf8, is_little_endian);
}

#elif defined(__ARM_NEON) || defined(__ARM_NEON__)

bool isLatin(const std::string &str) {
Expand Down Expand Up @@ -264,6 +391,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
return utf8;
}

std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) {
return utf8ToUtf16SIMD(utf8, is_little_endian);
}

#elif defined(__riscv) && __riscv_vector

bool isLatin(const std::string &str) {
Expand Down Expand Up @@ -365,6 +496,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
return utf8;
}

std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) {
return utf8ToUtf16SIMD(utf8, is_little_endian);
}

#else

bool isLatin(const std::string &str) {
Expand Down Expand Up @@ -414,6 +549,78 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
return utf8;
}

// Fallback implementation without SIMD acceleration
std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) {
std::u16string utf16; // Resulting UTF-16 string
size_t i = 0; // Index for traversing the UTF-8 string
size_t n = utf8.size(); // Total length of the UTF-8 string

// Loop through each byte of the UTF-8 string
while (i < n) {
uint32_t code_point = 0; // The Unicode code point
unsigned char c = utf8[i]; // Current byte of the UTF-8 string

// Determine the number of bytes for this character based on its first byte
if ((c & 0x80) == 0) {
// 1-byte character (ASCII)
code_point = c;
++i;
} else if ((c & 0xE0) == 0xC0) {
// 2-byte character
code_point = c & 0x1F;
code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
i += 2;
} else if ((c & 0xF0) == 0xE0) {
// 3-byte character
code_point = c & 0x0F;
code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
code_point = (code_point << 6) | (utf8[i + 2] & 0x3F);
i += 3;
} else if ((c & 0xF8) == 0xF0) {
// 4-byte character
code_point = c & 0x07;
code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
code_point = (code_point << 6) | (utf8[i + 2] & 0x3F);
code_point = (code_point << 6) | (utf8[i + 3] & 0x3F);
i += 4;
} else {
// Invalid UTF-8 byte sequence
throw std::invalid_argument("Invalid UTF-8 encoding.");
}

// If the code point is beyond the BMP range, use surrogate pairs
if (code_point >= 0x10000) {
code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair
uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate
uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate

// If not little-endian, swap bytes of the surrogates
if (!is_little_endian) {
high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8);
low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8);
}

// Add both high and low surrogates to the UTF-16 string
utf16.push_back(high_surrogate);
utf16.push_back(low_surrogate);
} else {
// For code points within the BMP range, directly store as a 16-bit value
uint16_t utf16_char = static_cast<uint16_t>(code_point);

// If not little-endian, swap the bytes of the 16-bit character
if (!is_little_endian) {
utf16_char = (utf16_char >> 8) | (utf16_char << 8);
}

// Add the UTF-16 character to the string
utf16.push_back(utf16_char);
}
}

// Return the resulting UTF-16 string
return utf16;
}

#endif

} // namespace fury
} // namespace fury
4 changes: 3 additions & 1 deletion cpp/fury/util/string_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,6 @@ bool isLatin(const std::string &str);

std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian);

} // namespace fury
std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian);

} // namespace fury
Loading
Loading