From 4b4eaf0b88c305e2f42d46d90e511f1e6fe6d033 Mon Sep 17 00:00:00 2001 From: pandalee99 <1162953505@qq.com> Date: Tue, 24 Dec 2024 20:03:52 +0800 Subject: [PATCH 1/6] make utf8 to utf16 --- cpp/fury/util/string_util.cc | 194 ++++++++++++++++++++++++++ cpp/fury/util/string_util.h | 8 +- cpp/fury/util/string_util_test.cc | 218 ++++++++++++++++++++++++++++++ 3 files changed, 417 insertions(+), 3 deletions(-) diff --git a/cpp/fury/util/string_util.cc b/cpp/fury/util/string_util.cc index 5413c72af8..ab4b0985ec 100644 --- a/cpp/fury/util/string_util.cc +++ b/cpp/fury/util/string_util.cc @@ -58,6 +58,116 @@ inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) { *utf8++ = static_cast((code_point & 0x3F) | 0x80); } +std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) { + std::u16string utf16; + utf16.reserve(utf8.size()); // Reserve space to avoid frequent reallocations + + char buffer[64]; // Buffer to hold temporary UTF-16 results + char16_t* output = reinterpret_cast(buffer); // Use char16_t for output + + size_t i = 0; + size_t n = utf8.size(); + + while (i + 32 <= n) { + + for (int j = 0; j < 32; ++j) { + uint8_t byte = utf8[i + j]; + + if (byte < 0x80) { + // 1-byte character (ASCII) + *output++ = static_cast(byte); + } else if (byte < 0xE0) { + // 2-byte character + uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + j + 1] & 0x3F); + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + ++j; + } else if (byte < 0xF0) { + // 3-byte character + uint16_t utf16_char = ((byte & 0x0F) << 12) | ((utf8[i + j + 1] & 0x3F) << 6) | + (utf8[i + j + 2] & 0x3F); + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + j += 2; + } else { + // 4-byte character (surrogate pair handling required) + uint32_t code_point = ((byte & 0x07) << 18) | ((utf8[i + j + 1] & 0x3F) << 12) | + ((utf8[i + j + 2] & 0x3F) << 6) | (utf8[i + j + 3] & 0x3F); + + // Convert the code point to a surrogate pair + uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10); + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); + + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); // Swap bytes for big-endian + low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); // Swap bytes for big-endian + } + + *output++ = high_surrogate; + *output++ = low_surrogate; + + j += 3; + } + } + + // Append the processed buffer to the final utf16 string + utf16.append(reinterpret_cast(buffer), output - reinterpret_cast(buffer)); + output = reinterpret_cast(buffer); // Reset output buffer pointer + i += 32; + } + + // Handle remaining characters + while (i < n) { + uint8_t byte = utf8[i]; + + if (byte < 0x80) { + *output++ = static_cast(byte); + } else if (byte < 0xE0) { + uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + 1] & 0x3F); + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + ++i; + } else if (byte < 0xF0) { + uint16_t utf16_char = ((byte & 0x0F) << 12) | ((utf8[i + 1] & 0x3F) << 6) | + (utf8[i + 2] & 0x3F); + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + i += 2; + } else { + uint32_t code_point = ((byte & 0x07) << 18) | ((utf8[i + 1] & 0x3F) << 12) | + ((utf8[i + 2] & 0x3F) << 6) | (utf8[i + 3] & 0x3F); + + uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10); + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); + + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); + low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); + } + + *output++ = high_surrogate; + *output++ = low_surrogate; + + i += 3; + } + + ++i; + } + + // Append the last part of the buffer to the utf16 string + utf16.append(reinterpret_cast(buffer), output - reinterpret_cast(buffer)); + + return utf16; + } + #if defined(__x86_64__) || defined(_M_X64) bool isLatin(const std::string &str) { @@ -168,6 +278,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { return utf8; } +std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { + return utf8ToUtf16SIMD(utf8,is_little_endian); +} + #elif defined(__ARM_NEON) || defined(__ARM_NEON__) bool isLatin(const std::string &str) { @@ -264,6 +378,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { return utf8; } +std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { + return utf8ToUtf16SIMD(utf8,is_little_endian); +} + #elif defined(__riscv) && __riscv_vector bool isLatin(const std::string &str) { @@ -365,6 +483,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { return utf8; } +std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { + return utf8ToUtf16SIMD(utf8,is_little_endian); +} + #else bool isLatin(const std::string &str) { @@ -414,6 +536,78 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { return utf8; } +// Fallback implementation without SIMD acceleration +std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { + std::u16string utf16; // Resulting UTF-16 string + size_t i = 0; // Index for traversing the UTF-8 string + size_t n = utf8.size(); // Total length of the UTF-8 string + + // Loop through each byte of the UTF-8 string + while (i < n) { + uint32_t code_point = 0; // The Unicode code point + unsigned char c = utf8[i]; // Current byte of the UTF-8 string + + // Determine the number of bytes for this character based on its first byte + if ((c & 0x80) == 0) { + // 1-byte character (ASCII) + code_point = c; + ++i; + } else if ((c & 0xE0) == 0xC0) { + // 2-byte character + code_point = c & 0x1F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + i += 2; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte character + code_point = c & 0x0F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + i += 3; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character + code_point = c & 0x07; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 3] & 0x3F); + i += 4; + } else { + // Invalid UTF-8 byte sequence + throw std::invalid_argument("Invalid UTF-8 encoding."); + } + + // If the code point is beyond the BMP range, use surrogate pairs + if (code_point >= 0x10000) { + code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair + uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate + + // If not little-endian, swap bytes of the surrogates + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); + low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); + } + + // Add both high and low surrogates to the UTF-16 string + utf16.push_back(high_surrogate); + utf16.push_back(low_surrogate); + } else { + // For code points within the BMP range, directly store as a 16-bit value + uint16_t utf16_char = static_cast(code_point); + + // If not little-endian, swap the bytes of the 16-bit character + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | (utf16_char << 8); + } + + // Add the UTF-16 character to the string + utf16.push_back(utf16_char); + } + } + + // Return the resulting UTF-16 string + return utf16; +} + #endif } // namespace fury diff --git a/cpp/fury/util/string_util.h b/cpp/fury/util/string_util.h index 9cb4cc7e83..8a200afe38 100644 --- a/cpp/fury/util/string_util.h +++ b/cpp/fury/util/string_util.h @@ -23,8 +23,10 @@ namespace fury { -bool isLatin(const std::string &str); + bool isLatin(const std::string &str); -std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian); + std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian); -} // namespace fury + std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian); + +} // namespace fury \ No newline at end of file diff --git a/cpp/fury/util/string_util_test.cc b/cpp/fury/util/string_util_test.cc index 9b2213b9b0..b9a7dc0781 100644 --- a/cpp/fury/util/string_util_test.cc +++ b/cpp/fury/util/string_util_test.cc @@ -298,6 +298,224 @@ TEST(UTF16ToUTF8Test, PerformanceTest) { } } + +// Generate random UTF-8 string +std::string generateRandomUTF8String(size_t length) { + std::string str; + std::mt19937 generator(std::random_device{}()); + std::uniform_int_distribution distribution(0, 0x10FFFF); + + while (str.size() < length) { + uint32_t code_point = distribution(generator); + + if (code_point <= 0x7F) { + str.push_back(static_cast(code_point)); + } else if (code_point <= 0x7FF) { + str.push_back(0xC0 | (code_point >> 6)); + str.push_back(0x80 | (code_point & 0x3F)); + } else if (code_point <= 0xFFFF) { + str.push_back(0xE0 | (code_point >> 12)); + str.push_back(0x80 | ((code_point >> 6) & 0x3F)); + str.push_back(0x80 | (code_point & 0x3F)); + } else if (code_point <= 0x10FFFF) { + str.push_back(0xF0 | (code_point >> 18)); + str.push_back(0x80 | ((code_point >> 12) & 0x3F)); + str.push_back(0x80 | ((code_point >> 6) & 0x3F)); + str.push_back(0x80 | (code_point & 0x3F)); + } + } + + return str; +} + +std::u16string utf8ToUtf16BaseLine(const std::string &utf8, bool is_little_endian) { + std::u16string utf16; // Resulting UTF-16 string + size_t i = 0; // Index for traversing the UTF-8 string + size_t n = utf8.size(); // Total length of the UTF-8 string + + // Loop through each byte of the UTF-8 string + while (i < n) { + uint32_t code_point = 0; // The Unicode code point + unsigned char c = utf8[i]; // Current byte of the UTF-8 string + + // Determine the number of bytes for this character based on its first byte + if ((c & 0x80) == 0) { + // 1-byte character (ASCII) + code_point = c; + ++i; + } else if ((c & 0xE0) == 0xC0) { + // 2-byte character + code_point = c & 0x1F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + i += 2; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte character + code_point = c & 0x0F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + i += 3; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character + code_point = c & 0x07; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 3] & 0x3F); + i += 4; + } else { + // Invalid UTF-8 byte sequence + throw std::invalid_argument("Invalid UTF-8 encoding."); + } + + // If the code point is beyond the BMP range, use surrogate pairs + if (code_point >= 0x10000) { + code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair + uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate + + // If not little-endian, swap bytes of the surrogates + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); + low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); + } + + // Add both high and low surrogates to the UTF-16 string + utf16.push_back(high_surrogate); + utf16.push_back(low_surrogate); + } else { + // For code points within the BMP range, directly store as a 16-bit value + uint16_t utf16_char = static_cast(code_point); + + // If not little-endian, swap the bytes of the 16-bit character + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | (utf16_char << 8); + } + + // Add the UTF-16 character to the string + utf16.push_back(utf16_char); + } + } + + // Return the resulting UTF-16 string + return utf16; +} + +// Testing Basic Logic +TEST(UTF8ToUTF16Test, BasicConversion) { +std::string utf8 = u8"Hello, 世界!"; +std::u16string utf16 = fury::utf8ToUtf16(utf8, true); +ASSERT_EQ(utf16, u"Hello, 世界!"); +} + +// Testing Empty String +TEST(UTF8ToUTF16Test, EmptyString) { +std::string utf8 = ""; +std::u16string utf16 = fury::utf8ToUtf16(utf8, true); +ASSERT_EQ(utf16, u""); +} + +// Testing emoji +TEST(UTF8ToUTF16Test, SurrogatePairs) { +std::string utf8 = "\xF0\x9F\x98\x80"; // 😀 emoji +std::u16string utf16 = fury::utf8ToUtf16(utf8, true); +std::u16string expected_utf16 = {0xD83D, 0xDE00}; // Surrogate pair for emoji +ASSERT_EQ(utf16, expected_utf16); +} + +// Correct Boundary testing for U+FFFD (replacement character) +TEST(UTF8ToUTF16Test, BoundaryValues) { +// "\xEF\xBF\xBD" is the UTF-8 encoding for U+FFFD (replacement character) +std::string utf8 = "\xEF\xBF\xBD"; // U+FFFD in UTF-8 +std::u16string utf16 = fury::utf8ToUtf16(utf8, true); +std::u16string expected_utf16 = {0xFFFD}; // Expected UTF-16 representation of U+FFFD +ASSERT_EQ(utf16, expected_utf16); +} + + +// Testing Special Characters +TEST(UTF8ToUTF16Test, SpecialCharacters) { +std::string utf8 = " \n\t"; +std::u16string utf16 = fury::utf8ToUtf16(utf8, true); +ASSERT_EQ(utf16, u" \n\t"); +} + +// Testing LittleEndian +TEST(UTF8ToUTF16Test, LittleEndian) { +std::string utf8 = "ab"; +std::u16string utf16 = fury::utf8ToUtf16(utf8, true); +std::u16string expected_utf16 = {0x61, 0x62}; // Little-endian UTF-16 representation of "ab" +ASSERT_EQ(utf16, expected_utf16); +} + +// Correct BigEndian testing for BOM (Byte Order Mark) +TEST(UTF8ToUTF16Test, BigEndian) { +std::string utf8 = "\xEF\xBB\xBF"; // BOM in UTF-8 (0xFEFF) +std::u16string utf16 = fury::utf8ToUtf16(utf8, false); // Big-endian +std::u16string expected_utf16 = {0xFFFE}; // Expected BOM in UTF-16 +ASSERT_EQ(utf16, expected_utf16); +} + +// Testing round-trip conversion (UTF-8 -> UTF-16 -> UTF-8) +TEST(UTF8ToUTF16Test, RoundTripConversion) { +std::string original_utf8 = u8"Hello, 世界!"; +std::u16string utf16 = fury::utf8ToUtf16(original_utf8, true); +std::string utf8_converted_back = fury::utf16ToUtf8(utf16, true); +ASSERT_EQ(original_utf8, utf8_converted_back); +} + +// Testing Performance +TEST(UTF8ToUTF16Test, PerformanceTest) { +const size_t num_tests = 1000; +const size_t string_length = 1000; +// Default little_endian +bool is_little_endian = true; + +// Random UTF-8 +std::vector test_strings; +for (size_t i = 0; i < num_tests; ++i) { +test_strings.push_back(generateRandomUTF8String(string_length)); +} + +// Standard Library +try { +auto start_time = std::chrono::high_resolution_clock::now(); +for (const auto &str : test_strings) { +std::wstring_convert, char16_t> convert; +std::u16string utf16 = convert.from_bytes(str); +} +auto end_time = std::chrono::high_resolution_clock::now(); +auto duration = std::chrono::duration_cast(end_time - start_time).count(); +FURY_LOG(INFO) << "Standard Library Running Time: " << duration << " ns"; +} catch (const std::exception &e) { +FURY_LOG(FATAL) << "Caught exception in standard library conversion: " << e.what(); +} + +// BaseLine +try { +auto start_time = std::chrono::high_resolution_clock::now(); +for (const auto &str : test_strings) { +std::u16string utf16 = utf8ToUtf16BaseLine(str, is_little_endian); +} +auto end_time = std::chrono::high_resolution_clock::now(); +auto duration = std::chrono::duration_cast(end_time - start_time).count(); +FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns"; +} catch (const std::exception &e) { +FURY_LOG(FATAL) << "Caught exception in baseline conversion: " << e.what(); +} + +// Optimized (SIMD) +try { +auto start_time = std::chrono::high_resolution_clock::now(); +for (const auto &str : test_strings) { +std::u16string utf16 = fury::utf8ToUtf16(str, is_little_endian); +} +auto end_time = std::chrono::high_resolution_clock::now(); +auto duration = std::chrono::duration_cast(end_time - start_time).count(); +FURY_LOG(INFO) << "SIMD Optimized Running Time: " << duration << " ns"; +} catch (const std::exception &e) { +FURY_LOG(FATAL) << "Caught exception in SIMD optimized conversion: " << e.what(); +} +} + } // namespace fury int main(int argc, char **argv) { From 38baad7831e9339f65be07cb7c45af29f72337d0 Mon Sep 17 00:00:00 2001 From: pandalee99 <1162953505@qq.com> Date: Tue, 24 Dec 2024 20:04:40 +0800 Subject: [PATCH 2/6] code style --- cpp/fury/util/string_util.cc | 413 ++++++++++++++------------- cpp/fury/util/string_util_test.cc | 459 +++++++++++++++--------------- 2 files changed, 448 insertions(+), 424 deletions(-) diff --git a/cpp/fury/util/string_util.cc b/cpp/fury/util/string_util.cc index ab4b0985ec..ff4dbb573a 100644 --- a/cpp/fury/util/string_util.cc +++ b/cpp/fury/util/string_util.cc @@ -28,48 +28,51 @@ #endif #include +#include #include namespace fury { // Swap bytes to convert from big endian to little endian -inline uint16_t swapBytes(uint16_t value) { - return (value >> 8) | (value << 8); -} + inline uint16_t swapBytes(uint16_t value) { + return (value >> 8) | (value << 8); + } -inline void utf16ToUtf8(uint16_t code_unit, char *&output) { - if (code_unit < 0x80) { - *output++ = static_cast(code_unit); - } else if (code_unit < 0x800) { - *output++ = static_cast(0xC0 | (code_unit >> 6)); - *output++ = static_cast(0x80 | (code_unit & 0x3F)); - } else { - *output++ = static_cast(0xE0 | (code_unit >> 12)); - *output++ = static_cast(0x80 | ((code_unit >> 6) & 0x3F)); - *output++ = static_cast(0x80 | (code_unit & 0x3F)); - } -} + inline void utf16ToUtf8(uint16_t code_unit, char *&output) { + if (code_unit < 0x80) { + *output++ = static_cast(code_unit); + } else if (code_unit < 0x800) { + *output++ = static_cast(0xC0 | (code_unit >> 6)); + *output++ = static_cast(0x80 | (code_unit & 0x3F)); + } else { + *output++ = static_cast(0xE0 | (code_unit >> 12)); + *output++ = static_cast(0x80 | ((code_unit >> 6) & 0x3F)); + *output++ = static_cast(0x80 | (code_unit & 0x3F)); + } + } -inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) { - uint32_t code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00); - *utf8++ = static_cast((code_point >> 18) | 0xF0); - *utf8++ = static_cast(((code_point >> 12) & 0x3F) | 0x80); - *utf8++ = static_cast(((code_point >> 6) & 0x3F) | 0x80); - *utf8++ = static_cast((code_point & 0x3F) | 0x80); -} + inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) { + uint32_t code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00); + *utf8++ = static_cast((code_point >> 18) | 0xF0); + *utf8++ = static_cast(((code_point >> 12) & 0x3F) | 0x80); + *utf8++ = static_cast(((code_point >> 6) & 0x3F) | 0x80); + *utf8++ = static_cast((code_point & 0x3F) | 0x80); + } -std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) { + std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) { std::u16string utf16; utf16.reserve(utf8.size()); // Reserve space to avoid frequent reallocations char buffer[64]; // Buffer to hold temporary UTF-16 results - char16_t* output = reinterpret_cast(buffer); // Use char16_t for output + char16_t *output = + reinterpret_cast(buffer); // Use char16_t for output size_t i = 0; size_t n = utf8.size(); while (i + 32 <= n) { + // Now process the characters in 'in' SIMD register for (int j = 0; j < 32; ++j) { uint8_t byte = utf8[i + j]; @@ -80,31 +83,37 @@ std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) { // 2-byte character uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + j + 1] & 0x3F); if (!is_little_endian) { - utf16_char = (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian + utf16_char = (utf16_char >> 8) | + (utf16_char << 8); // Swap bytes for big-endian } *output++ = utf16_char; ++j; } else if (byte < 0xF0) { // 3-byte character - uint16_t utf16_char = ((byte & 0x0F) << 12) | ((utf8[i + j + 1] & 0x3F) << 6) | + uint16_t utf16_char = ((byte & 0x0F) << 12) | + ((utf8[i + j + 1] & 0x3F) << 6) | (utf8[i + j + 2] & 0x3F); if (!is_little_endian) { - utf16_char = (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian + utf16_char = (utf16_char >> 8) | + (utf16_char << 8); // Swap bytes for big-endian } *output++ = utf16_char; j += 2; } else { // 4-byte character (surrogate pair handling required) - uint32_t code_point = ((byte & 0x07) << 18) | ((utf8[i + j + 1] & 0x3F) << 12) | - ((utf8[i + j + 2] & 0x3F) << 6) | (utf8[i + j + 3] & 0x3F); + uint32_t code_point = + ((byte & 0x07) << 18) | ((utf8[i + j + 1] & 0x3F) << 12) | + ((utf8[i + j + 2] & 0x3F) << 6) | (utf8[i + j + 3] & 0x3F); // Convert the code point to a surrogate pair uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10); uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); if (!is_little_endian) { - high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); // Swap bytes for big-endian - low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); // Swap bytes for big-endian + high_surrogate = (high_surrogate >> 8) | + (high_surrogate << 8); // Swap bytes for big-endian + low_surrogate = (low_surrogate >> 8) | + (low_surrogate << 8); // Swap bytes for big-endian } *output++ = high_surrogate; @@ -115,8 +124,10 @@ std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) { } // Append the processed buffer to the final utf16 string - utf16.append(reinterpret_cast(buffer), output - reinterpret_cast(buffer)); - output = reinterpret_cast(buffer); // Reset output buffer pointer + utf16.append(reinterpret_cast(buffer), + output - reinterpret_cast(buffer)); + output = + reinterpret_cast(buffer); // Reset output buffer pointer i += 32; } @@ -129,20 +140,23 @@ std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) { } else if (byte < 0xE0) { uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + 1] & 0x3F); if (!is_little_endian) { - utf16_char = (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian + utf16_char = + (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian } *output++ = utf16_char; ++i; } else if (byte < 0xF0) { - uint16_t utf16_char = ((byte & 0x0F) << 12) | ((utf8[i + 1] & 0x3F) << 6) | - (utf8[i + 2] & 0x3F); + uint16_t utf16_char = ((byte & 0x0F) << 12) | + ((utf8[i + 1] & 0x3F) << 6) | (utf8[i + 2] & 0x3F); if (!is_little_endian) { - utf16_char = (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian + utf16_char = + (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian } *output++ = utf16_char; i += 2; } else { - uint32_t code_point = ((byte & 0x07) << 18) | ((utf8[i + 1] & 0x3F) << 12) | + uint32_t code_point = ((byte & 0x07) << 18) | + ((utf8[i + 1] & 0x3F) << 12) | ((utf8[i + 2] & 0x3F) << 6) | (utf8[i + 3] & 0x3F); uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10); @@ -163,128 +177,129 @@ std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) { } // Append the last part of the buffer to the utf16 string - utf16.append(reinterpret_cast(buffer), output - reinterpret_cast(buffer)); + utf16.append(reinterpret_cast(buffer), + output - reinterpret_cast(buffer)); return utf16; } #if defined(__x86_64__) || defined(_M_X64) -bool isLatin(const std::string &str) { - const char *data = str.data(); - size_t len = str.size(); + bool isLatin(const std::string &str) { + const char *data = str.data(); + size_t len = str.size(); - size_t i = 0; - __m256i latin_mask = _mm256_set1_epi8(0x80); - for (; i + 32 <= len; i += 32) { - __m256i chars = - _mm256_loadu_si256(reinterpret_cast(data + i)); - __m256i result = _mm256_and_si256(chars, latin_mask); - if (!_mm256_testz_si256(result, result)) { - return false; - } - } + size_t i = 0; + __m256i latin_mask = _mm256_set1_epi8(0x80); + for (; i + 32 <= len; i += 32) { + __m256i chars = + _mm256_loadu_si256(reinterpret_cast(data + i)); + __m256i result = _mm256_and_si256(chars, latin_mask); + if (!_mm256_testz_si256(result, result)) { + return false; + } + } - for (; i < len; ++i) { - if (static_cast(data[i]) >= 128) { - return false; + for (; i < len; ++i) { + if (static_cast(data[i]) >= 128) { + return false; + } + } + + return true; } - } - return true; -} + std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { + std::string utf8; + utf8.reserve(utf16.size() * + 3); // Reserve enough space to avoid frequent reallocations -std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { - std::string utf8; - utf8.reserve(utf16.size() * - 3); // Reserve enough space to avoid frequent reallocations + const __m256i limit1 = _mm256_set1_epi16(0x80); + const __m256i limit2 = _mm256_set1_epi16(0x800); + const __m256i surrogate_high_start = _mm256_set1_epi16(0xD800); + const __m256i surrogate_high_end = _mm256_set1_epi16(0xDBFF); + const __m256i surrogate_low_start = _mm256_set1_epi16(0xDC00); + const __m256i surrogate_low_end = _mm256_set1_epi16(0xDFFF); - const __m256i limit1 = _mm256_set1_epi16(0x80); - const __m256i limit2 = _mm256_set1_epi16(0x800); - const __m256i surrogate_high_start = _mm256_set1_epi16(0xD800); - const __m256i surrogate_high_end = _mm256_set1_epi16(0xDBFF); - const __m256i surrogate_low_start = _mm256_set1_epi16(0xDC00); - const __m256i surrogate_low_end = _mm256_set1_epi16(0xDFFF); + char buffer[64]; // Buffer to hold temporary UTF-8 bytes + char *output = buffer; - char buffer[64]; // Buffer to hold temporary UTF-8 bytes - char *output = buffer; + size_t i = 0; + size_t n = utf16.size(); - size_t i = 0; - size_t n = utf16.size(); + while (i + 16 <= n) { + __m256i in = + _mm256_loadu_si256(reinterpret_cast(utf16.data() + i)); - while (i + 16 <= n) { - __m256i in = - _mm256_loadu_si256(reinterpret_cast(utf16.data() + i)); + if (!is_little_endian) { + in = _mm256_or_si256( + _mm256_slli_epi16(in, 8), + _mm256_srli_epi16(in, 8)); // Swap bytes for big-endian + } - if (!is_little_endian) { - in = _mm256_or_si256( - _mm256_slli_epi16(in, 8), - _mm256_srli_epi16(in, 8)); // Swap bytes for big-endian - } + __m256i mask1 = _mm256_cmpgt_epi16(in, limit1); + __m256i mask2 = _mm256_cmpgt_epi16(in, limit2); + __m256i high_surrogate_mask = + _mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_high_start), + _mm256_cmpgt_epi16(in, surrogate_high_end)); + __m256i low_surrogate_mask = + _mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_low_start), + _mm256_cmpgt_epi16(in, surrogate_low_end)); + + if (_mm256_testz_si256(mask1, mask1)) { + // All values < 0x80, 1 byte per character + for (int j = 0; j < 16; ++j) { + *output++ = static_cast(utf16[i + j]); + } + } else if (_mm256_testz_si256(mask2, mask2)) { + // All values < 0x800, 2 bytes per character + for (int j = 0; j < 16; ++j) { + utf16ToUtf8(utf16[i + j], output); + } + } else { + // Mix of 1, 2, and 3 byte characters + for (int j = 0; j < 16; ++j) { + if (_mm256_testz_si256(high_surrogate_mask, high_surrogate_mask) && + j + 1 < 16 && + !_mm256_testz_si256(low_surrogate_mask, low_surrogate_mask)) { + // Surrogate pair + utf16SurrogatePairToUtf8(utf16[i + j], utf16[i + j + 1], output); + ++j; + } else { + utf16ToUtf8(utf16[i + j], output); + } + } + } - __m256i mask1 = _mm256_cmpgt_epi16(in, limit1); - __m256i mask2 = _mm256_cmpgt_epi16(in, limit2); - __m256i high_surrogate_mask = - _mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_high_start), - _mm256_cmpgt_epi16(in, surrogate_high_end)); - __m256i low_surrogate_mask = - _mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_low_start), - _mm256_cmpgt_epi16(in, surrogate_low_end)); - - if (_mm256_testz_si256(mask1, mask1)) { - // All values < 0x80, 1 byte per character - for (int j = 0; j < 16; ++j) { - *output++ = static_cast(utf16[i + j]); - } - } else if (_mm256_testz_si256(mask2, mask2)) { - // All values < 0x800, 2 bytes per character - for (int j = 0; j < 16; ++j) { - utf16ToUtf8(utf16[i + j], output); - } - } else { - // Mix of 1, 2, and 3 byte characters - for (int j = 0; j < 16; ++j) { - if (_mm256_testz_si256(high_surrogate_mask, high_surrogate_mask) && - j + 1 < 16 && - !_mm256_testz_si256(low_surrogate_mask, low_surrogate_mask)) { - // Surrogate pair - utf16SurrogatePairToUtf8(utf16[i + j], utf16[i + j + 1], output); - ++j; - } else { - utf16ToUtf8(utf16[i + j], output); + utf8.append(buffer, output - buffer); + output = buffer; // Reset output buffer pointer + i += 16; } - } - } - utf8.append(buffer, output - buffer); - output = buffer; // Reset output buffer pointer - i += 16; - } + // Handle remaining characters + while (i < n) { + if (i + 1 < n && utf16[i] >= 0xD800 && utf16[i] <= 0xDBFF && + utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) { + // Surrogate pair + utf16SurrogatePairToUtf8(utf16[i], utf16[i + 1], output); + ++i; + } else { + utf16ToUtf8(utf16[i], output); + } + ++i; + } + utf8.append(buffer, output - buffer); - // Handle remaining characters - while (i < n) { - if (i + 1 < n && utf16[i] >= 0xD800 && utf16[i] <= 0xDBFF && - utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) { - // Surrogate pair - utf16SurrogatePairToUtf8(utf16[i], utf16[i + 1], output); - ++i; - } else { - utf16ToUtf8(utf16[i], output); + return utf8; } - ++i; - } - utf8.append(buffer, output - buffer); - - return utf8; -} -std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { - return utf8ToUtf16SIMD(utf8,is_little_endian); -} + std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { + return utf8ToUtf16SIMD(utf8, is_little_endian); + } #elif defined(__ARM_NEON) || defined(__ARM_NEON__) -bool isLatin(const std::string &str) { + bool isLatin(const std::string &str) { const char *data = str.data(); size_t len = str.size(); @@ -379,7 +394,7 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { } std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { - return utf8ToUtf16SIMD(utf8,is_little_endian); + return utf8ToUtf16SIMD(utf8, is_little_endian); } #elif defined(__riscv) && __riscv_vector @@ -484,7 +499,7 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { } std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { - return utf8ToUtf16SIMD(utf8,is_little_endian); + return utf8ToUtf16SIMD(utf8, is_little_endian); } #else @@ -538,76 +553,76 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { // Fallback implementation without SIMD acceleration std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { - std::u16string utf16; // Resulting UTF-16 string - size_t i = 0; // Index for traversing the UTF-8 string - size_t n = utf8.size(); // Total length of the UTF-8 string - - // Loop through each byte of the UTF-8 string - while (i < n) { - uint32_t code_point = 0; // The Unicode code point - unsigned char c = utf8[i]; // Current byte of the UTF-8 string - - // Determine the number of bytes for this character based on its first byte - if ((c & 0x80) == 0) { - // 1-byte character (ASCII) - code_point = c; - ++i; - } else if ((c & 0xE0) == 0xC0) { - // 2-byte character - code_point = c & 0x1F; - code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); - i += 2; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character - code_point = c & 0x0F; - code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); - code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); - i += 3; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character - code_point = c & 0x07; - code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); - code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); - code_point = (code_point << 6) | (utf8[i + 3] & 0x3F); - i += 4; - } else { - // Invalid UTF-8 byte sequence - throw std::invalid_argument("Invalid UTF-8 encoding."); - } + std::u16string utf16; // Resulting UTF-16 string + size_t i = 0; // Index for traversing the UTF-8 string + size_t n = utf8.size(); // Total length of the UTF-8 string - // If the code point is beyond the BMP range, use surrogate pairs - if (code_point >= 0x10000) { - code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair - uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate - uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate + // Loop through each byte of the UTF-8 string + while (i < n) { + uint32_t code_point = 0; // The Unicode code point + unsigned char c = utf8[i]; // Current byte of the UTF-8 string - // If not little-endian, swap bytes of the surrogates - if (!is_little_endian) { - high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); - low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); - } + // Determine the number of bytes for this character based on its first byte + if ((c & 0x80) == 0) { + // 1-byte character (ASCII) + code_point = c; + ++i; + } else if ((c & 0xE0) == 0xC0) { + // 2-byte character + code_point = c & 0x1F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + i += 2; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte character + code_point = c & 0x0F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + i += 3; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character + code_point = c & 0x07; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 3] & 0x3F); + i += 4; + } else { + // Invalid UTF-8 byte sequence + throw std::invalid_argument("Invalid UTF-8 encoding."); + } - // Add both high and low surrogates to the UTF-16 string - utf16.push_back(high_surrogate); - utf16.push_back(low_surrogate); - } else { - // For code points within the BMP range, directly store as a 16-bit value - uint16_t utf16_char = static_cast(code_point); + // If the code point is beyond the BMP range, use surrogate pairs + if (code_point >= 0x10000) { + code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair + uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate - // If not little-endian, swap the bytes of the 16-bit character - if (!is_little_endian) { - utf16_char = (utf16_char >> 8) | (utf16_char << 8); - } + // If not little-endian, swap bytes of the surrogates + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); + low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); + } - // Add the UTF-16 character to the string - utf16.push_back(utf16_char); - } + // Add both high and low surrogates to the UTF-16 string + utf16.push_back(high_surrogate); + utf16.push_back(low_surrogate); + } else { + // For code points within the BMP range, directly store as a 16-bit value + uint16_t utf16_char = static_cast(code_point); + + // If not little-endian, swap the bytes of the 16-bit character + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | (utf16_char << 8); + } + + // Add the UTF-16 character to the string + utf16.push_back(utf16_char); } + } - // Return the resulting UTF-16 string - return utf16; + // Return the resulting UTF-16 string + return utf16; } #endif -} // namespace fury +} // namespace fury \ No newline at end of file diff --git a/cpp/fury/util/string_util_test.cc b/cpp/fury/util/string_util_test.cc index b9a7dc0781..9328093f7e 100644 --- a/cpp/fury/util/string_util_test.cc +++ b/cpp/fury/util/string_util_test.cc @@ -30,274 +30,273 @@ namespace fury { // Function to generate a random string -std::string generateRandomString(size_t length) { - const char charset[] = - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; - std::default_random_engine rng(std::random_device{}()); - std::uniform_int_distribution<> dist(0, sizeof(charset) - 2); - - std::string result; - result.reserve(length); - for (size_t i = 0; i < length; ++i) { - result += charset[dist(rng)]; - } + std::string generateRandomString(size_t length) { + const char charset[] = + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + std::default_random_engine rng(std::random_device{}()); + std::uniform_int_distribution<> dist(0, sizeof(charset) - 2); + + std::string result; + result.reserve(length); + for (size_t i = 0; i < length; ++i) { + result += charset[dist(rng)]; + } - return result; -} + return result; + } -bool isLatin_BaseLine(const std::string &str) { - for (char c : str) { - if (static_cast(c) >= 128) { - return false; + bool isLatin_BaseLine(const std::string &str) { + for (char c : str) { + if (static_cast(c) >= 128) { + return false; + } + } + return true; } - } - return true; -} -TEST(StringUtilTest, TestIsLatinFunctions) { - std::string testStr = generateRandomString(100000); - auto start_time = std::chrono::high_resolution_clock::now(); - bool result = isLatin_BaseLine(testStr); - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); - FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns."; + TEST(StringUtilTest, TestIsLatinFunctions) { + std::string testStr = generateRandomString(100000); + auto start_time = std::chrono::high_resolution_clock::now(); + bool result = isLatin_BaseLine(testStr); + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns."; - start_time = std::chrono::high_resolution_clock::now(); - result = isLatin(testStr); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - - start_time) - .count(); - FURY_LOG(INFO) << "Optimized Running Time: " << duration << " ns."; + start_time = std::chrono::high_resolution_clock::now(); + result = isLatin(testStr); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - + start_time) + .count(); + FURY_LOG(INFO) << "Optimized Running Time: " << duration << " ns."; - EXPECT_TRUE(result); + EXPECT_TRUE(result); } TEST(StringUtilTest, TestIsLatinLogic) { - // Test strings with only Latin characters - EXPECT_TRUE(isLatin("Fury")); - EXPECT_TRUE(isLatin(generateRandomString(80))); - - // Test unaligned strings with only Latin characters - EXPECT_TRUE(isLatin(generateRandomString(80) + "1")); - EXPECT_TRUE(isLatin(generateRandomString(80) + "12")); - EXPECT_TRUE(isLatin(generateRandomString(80) + "123")); - - // Test strings with non-Latin characters - EXPECT_FALSE(isLatin("你好, Fury")); - EXPECT_FALSE(isLatin(generateRandomString(80) + "你好")); - EXPECT_FALSE(isLatin(generateRandomString(80) + "1你好")); - EXPECT_FALSE(isLatin(generateRandomString(11) + "你")); - EXPECT_FALSE(isLatin(generateRandomString(10) + "你好")); - EXPECT_FALSE(isLatin(generateRandomString(9) + "性能好")); - EXPECT_FALSE(isLatin("\u1234")); - EXPECT_FALSE(isLatin("a\u1234")); - EXPECT_FALSE(isLatin("ab\u1234")); - EXPECT_FALSE(isLatin("abc\u1234")); - EXPECT_FALSE(isLatin("abcd\u1234")); - EXPECT_FALSE(isLatin("Javaone Keynote\u1234")); +// Test strings with only Latin characters +EXPECT_TRUE(isLatin("Fury")); +EXPECT_TRUE(isLatin(generateRandomString(80))); + +// Test unaligned strings with only Latin characters +EXPECT_TRUE(isLatin(generateRandomString(80) + "1")); +EXPECT_TRUE(isLatin(generateRandomString(80) + "12")); +EXPECT_TRUE(isLatin(generateRandomString(80) + "123")); + +// Test strings with non-Latin characters +EXPECT_FALSE(isLatin("你好, Fury")); +EXPECT_FALSE(isLatin(generateRandomString(80) + "你好")); +EXPECT_FALSE(isLatin(generateRandomString(80) + "1你好")); +EXPECT_FALSE(isLatin(generateRandomString(11) + "你")); +EXPECT_FALSE(isLatin(generateRandomString(10) + "你好")); +EXPECT_FALSE(isLatin(generateRandomString(9) + "性能好")); +EXPECT_FALSE(isLatin("\u1234")); +EXPECT_FALSE(isLatin("a\u1234")); +EXPECT_FALSE(isLatin("ab\u1234")); +EXPECT_FALSE(isLatin("abc\u1234")); +EXPECT_FALSE(isLatin("abcd\u1234")); +EXPECT_FALSE(isLatin("Javaone Keynote\u1234")); } // Generate random UTF-16 string ensuring valid surrogate pairs std::u16string generateRandomUTF16String(size_t length) { - std::u16string str; - std::mt19937 generator(std::random_device{}()); - std::uniform_int_distribution distribution(0, 0x10FFFF); - - while (str.size() < length) { - uint32_t code_point = distribution(generator); - - if (code_point <= 0xD7FF || - (code_point >= 0xE000 && code_point <= 0xFFFF)) { - str.push_back(static_cast(code_point)); - } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { - code_point -= 0x10000; - str.push_back(static_cast((code_point >> 10) + 0xD800)); - str.push_back(static_cast((code_point & 0x3FF) + 0xDC00)); + std::u16string str; + std::mt19937 generator(std::random_device{}()); + std::uniform_int_distribution distribution(0, 0x10FFFF); + + while (str.size() < length) { + uint32_t code_point = distribution(generator); + + if (code_point <= 0xD7FF || + (code_point >= 0xE000 && code_point <= 0xFFFF)) { + str.push_back(static_cast(code_point)); + } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { + code_point -= 0x10000; + str.push_back(static_cast((code_point >> 10) + 0xD800)); + str.push_back(static_cast((code_point & 0x3FF) + 0xDC00)); + } } - } - return str; + return str; } // Basic implementation // Swap bytes to convert from big endian to little endian inline uint16_t swapBytes(uint16_t value) { - return (value >> 8) | (value << 8); + return (value >> 8) | (value << 8); } inline void utf16ToUtf8(uint16_t code_unit, char *&output) { - if (code_unit < 0x80) { - *output++ = static_cast(code_unit); - } else if (code_unit < 0x800) { - *output++ = static_cast(0xC0 | (code_unit >> 6)); - *output++ = static_cast(0x80 | (code_unit & 0x3F)); - } else { - *output++ = static_cast(0xE0 | (code_unit >> 12)); - *output++ = static_cast(0x80 | ((code_unit >> 6) & 0x3F)); - *output++ = static_cast(0x80 | (code_unit & 0x3F)); - } + if (code_unit < 0x80) { + *output++ = static_cast(code_unit); + } else if (code_unit < 0x800) { + *output++ = static_cast(0xC0 | (code_unit >> 6)); + *output++ = static_cast(0x80 | (code_unit & 0x3F)); + } else { + *output++ = static_cast(0xE0 | (code_unit >> 12)); + *output++ = static_cast(0x80 | ((code_unit >> 6) & 0x3F)); + *output++ = static_cast(0x80 | (code_unit & 0x3F)); + } } inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) { - uint32_t code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00); - *utf8++ = static_cast((code_point >> 18) | 0xF0); - *utf8++ = static_cast(((code_point >> 12) & 0x3F) | 0x80); - *utf8++ = static_cast(((code_point >> 6) & 0x3F) | 0x80); - *utf8++ = static_cast((code_point & 0x3F) | 0x80); + uint32_t code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00); + *utf8++ = static_cast((code_point >> 18) | 0xF0); + *utf8++ = static_cast(((code_point >> 12) & 0x3F) | 0x80); + *utf8++ = static_cast(((code_point >> 6) & 0x3F) | 0x80); + *utf8++ = static_cast((code_point & 0x3F) | 0x80); } std::string utf16ToUtf8BaseLine(const std::u16string &utf16, bool is_little_endian) { - std::string utf8; - utf8.reserve(utf16.size() * - 3); // Reserve enough space to avoid frequent reallocations - - size_t i = 0; - size_t n = utf16.size(); - char buffer[4]; // Buffer to hold temporary UTF-8 bytes - char *output = buffer; - - while (i < n) { - uint16_t code_unit = utf16[i]; - if (!is_little_endian) { - code_unit = swapBytes(code_unit); - } - if (i + 1 < n && code_unit >= 0xD800 && code_unit <= 0xDBFF && - utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) { - // Surrogate pair - uint16_t high = code_unit; - uint16_t low = utf16[i + 1]; - if (!is_little_endian) { - low = swapBytes(low); - } - utf16SurrogatePairToUtf8(high, low, output); - utf8.append(buffer, output - buffer); - output = buffer; - ++i; - } else { - utf16ToUtf8(code_unit, output); - utf8.append(buffer, output - buffer); - output = buffer; + std::string utf8; + utf8.reserve(utf16.size() * + 3); // Reserve enough space to avoid frequent reallocations + + size_t i = 0; + size_t n = utf16.size(); + char buffer[4]; // Buffer to hold temporary UTF-8 bytes + char *output = buffer; + + while (i < n) { + uint16_t code_unit = utf16[i]; + if (!is_little_endian) { + code_unit = swapBytes(code_unit); + } + if (i + 1 < n && code_unit >= 0xD800 && code_unit <= 0xDBFF && + utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) { + // Surrogate pair + uint16_t high = code_unit; + uint16_t low = utf16[i + 1]; + if (!is_little_endian) { + low = swapBytes(low); + } + utf16SurrogatePairToUtf8(high, low, output); + utf8.append(buffer, output - buffer); + output = buffer; + ++i; + } else { + utf16ToUtf8(code_unit, output); + utf8.append(buffer, output - buffer); + output = buffer; + } + ++i; } - ++i; - } - return utf8; + return utf8; } // Testing Basic Logic TEST(UTF16ToUTF8Test, BasicConversion) { - std::u16string utf16 = u"Hello, 世界!"; - std::string utf8 = fury::utf16ToUtf8(utf16, true); - ASSERT_EQ(utf8, u8"Hello, 世界!"); +std::u16string utf16 = u"Hello, 世界!"; +std::string utf8 = fury::utf16ToUtf8(utf16, true); +ASSERT_EQ(utf8, u8"Hello, 世界!"); } // Testing Empty String TEST(UTF16ToUTF8Test, EmptyString) { - std::u16string utf16 = u""; - std::string utf8 = fury::utf16ToUtf8(utf16, true); - ASSERT_EQ(utf8, ""); +std::u16string utf16 = u""; +std::string utf8 = fury::utf16ToUtf8(utf16, true); +ASSERT_EQ(utf8, ""); } // Testing emoji TEST(UTF16ToUTF8Test, SurrogatePairs) { - std::u16string utf16 = {0xD83D, 0xDE00}; // 😀 emoji - std::string utf8 = fury::utf16ToUtf8(utf16, true); - ASSERT_EQ(utf8, "\xF0\x9F\x98\x80"); +std::u16string utf16 = {0xD83D, 0xDE00}; // 😀 emoji +std::string utf8 = fury::utf16ToUtf8(utf16, true); +ASSERT_EQ(utf8, "\xF0\x9F\x98\x80"); } // Testing Boundary TEST(UTF16ToUTF8Test, BoundaryValues) { - std::u16string utf16 = {0x0000, 0xFFFF}; - std::string utf8 = fury::utf16ToUtf8(utf16, true); - std::string expected_utf8 = std::string("\x00", 1) + "\xEF\xBF\xBF"; - ASSERT_EQ(utf8, expected_utf8); +std::u16string utf16 = {0x0000, 0xFFFF}; +std::string utf8 = fury::utf16ToUtf8(utf16, true); +std::string expected_utf8 = std::string("\x00", 1) + "\xEF\xBF\xBF"; +ASSERT_EQ(utf8, expected_utf8); } // Testing Special Characters TEST(UTF16ToUTF8Test, SpecialCharacters) { - std::u16string utf16 = u" \n\t"; - std::string utf8 = fury::utf16ToUtf8(utf16, true); - ASSERT_EQ(utf8, " \n\t"); +std::u16string utf16 = u" \n\t"; +std::string utf8 = fury::utf16ToUtf8(utf16, true); +ASSERT_EQ(utf8, " \n\t"); } // Testing LittleEndian TEST(UTF16ToUTF8Test, LittleEndian) { - std::u16string utf16 = {0x61, 0x62}; // "ab" - std::string utf8 = fury::utf16ToUtf8(utf16, true); - ASSERT_EQ(utf8, "ab"); +std::u16string utf16 = {0x61, 0x62}; // "ab" +std::string utf8 = fury::utf16ToUtf8(utf16, true); +ASSERT_EQ(utf8, "ab"); } // Testing BigEndian TEST(UTF16ToUTF8Test, BigEndian) { - std::u16string utf16 = {0xFFFE, 0xFFFE}; - std::string utf8 = fury::utf16ToUtf8(utf16, false); - ASSERT_EQ(utf8, "\xEF\xBF\xBE\xEF\xBF\xBE"); +std::u16string utf16 = {0xFFFE, 0xFFFE}; +std::string utf8 = fury::utf16ToUtf8(utf16, false); +ASSERT_EQ(utf8, "\xEF\xBF\xBE\xEF\xBF\xBE"); } // Testing Performance TEST(UTF16ToUTF8Test, PerformanceTest) { - const size_t num_tests = 1000; - const size_t string_length = 1000; - // Default little_endian - bool is_little_endian = true; - - // Random UTF-16 - std::vector test_strings; - for (size_t i = 0; i < num_tests; ++i) { - test_strings.push_back(generateRandomUTF16String(string_length)); - } - - // Lib - try { - auto start_time = std::chrono::high_resolution_clock::now(); - for (const auto &str : test_strings) { - std::wstring_convert, char16_t> convert; - std::string utf8 = convert.to_bytes(str); - } - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); - FURY_LOG(INFO) << "Standard library Running Time: " << duration << " ns"; - } catch (const std::exception &e) { - FURY_LOG(FATAL) << "Caught exception: " << e.what(); - } - - // BaseLine - try { - auto start_time = std::chrono::high_resolution_clock::now(); - for (const auto &str : test_strings) { - std::string utf8 = utf16ToUtf8BaseLine(str, is_little_endian); - } - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); - FURY_LOG(INFO) << "Baseline Running Time: " << duration << " ns"; - } catch (const std::exception &e) { - FURY_LOG(FATAL) << "Caught exception: " << e.what(); - } - - // SIMD - try { - auto start_time = std::chrono::high_resolution_clock::now(); - for (const auto &str : test_strings) { - std::string utf8 = fury::utf16ToUtf8(str, is_little_endian); - } - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); - FURY_LOG(INFO) << "SIMD Running Time: " << duration << " ns"; - } catch (const std::exception &e) { - FURY_LOG(FATAL) << "Caught exception: " << e.what(); - } +const size_t num_tests = 1000; +const size_t string_length = 1000; +// Default little_endian +bool is_little_endian = true; + +// Random UTF-16 +std::vector test_strings; +for (size_t i = 0; i < num_tests; ++i) { +test_strings.push_back(generateRandomUTF16String(string_length)); +} + +// Lib +try { +auto start_time = std::chrono::high_resolution_clock::now(); +for (const auto &str : test_strings) { +std::wstring_convert, char16_t> convert; +std::string utf8 = convert.to_bytes(str); +} +auto end_time = std::chrono::high_resolution_clock::now(); +auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); +FURY_LOG(INFO) << "Standard library Running Time: " << duration << " ns"; +} catch (const std::exception &e) { +FURY_LOG(FATAL) << "Caught exception: " << e.what(); } +// BaseLine +try { +auto start_time = std::chrono::high_resolution_clock::now(); +for (const auto &str : test_strings) { +std::string utf8 = utf16ToUtf8BaseLine(str, is_little_endian); +} +auto end_time = std::chrono::high_resolution_clock::now(); +auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); +FURY_LOG(INFO) << "Baseline Running Time: " << duration << " ns"; +} catch (const std::exception &e) { +FURY_LOG(FATAL) << "Caught exception: " << e.what(); +} + +// SIMD +try { +auto start_time = std::chrono::high_resolution_clock::now(); +for (const auto &str : test_strings) { +std::string utf8 = fury::utf16ToUtf8(str, is_little_endian); +} +auto end_time = std::chrono::high_resolution_clock::now(); +auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); +FURY_LOG(INFO) << "SIMD Running Time: " << duration << " ns"; +} catch (const std::exception &e) { +FURY_LOG(FATAL) << "Caught exception: " << e.what(); +} +} // Generate random UTF-8 string std::string generateRandomUTF8String(size_t length) { @@ -328,15 +327,16 @@ std::string generateRandomUTF8String(size_t length) { return str; } -std::u16string utf8ToUtf16BaseLine(const std::string &utf8, bool is_little_endian) { - std::u16string utf16; // Resulting UTF-16 string - size_t i = 0; // Index for traversing the UTF-8 string - size_t n = utf8.size(); // Total length of the UTF-8 string +std::u16string utf8ToUtf16BaseLine(const std::string &utf8, + bool is_little_endian) { + std::u16string utf16; // Resulting UTF-16 string + size_t i = 0; // Index for traversing the UTF-8 string + size_t n = utf8.size(); // Total length of the UTF-8 string // Loop through each byte of the UTF-8 string while (i < n) { - uint32_t code_point = 0; // The Unicode code point - unsigned char c = utf8[i]; // Current byte of the UTF-8 string + uint32_t code_point = 0; // The Unicode code point + unsigned char c = utf8[i]; // Current byte of the UTF-8 string // Determine the number of bytes for this character based on its first byte if ((c & 0x80) == 0) { @@ -368,9 +368,9 @@ std::u16string utf8ToUtf16BaseLine(const std::string &utf8, bool is_little_endia // If the code point is beyond the BMP range, use surrogate pairs if (code_point >= 0x10000) { - code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair + code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate - uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate // If not little-endian, swap bytes of the surrogates if (!is_little_endian) { @@ -424,13 +424,13 @@ ASSERT_EQ(utf16, expected_utf16); // Correct Boundary testing for U+FFFD (replacement character) TEST(UTF8ToUTF16Test, BoundaryValues) { // "\xEF\xBF\xBD" is the UTF-8 encoding for U+FFFD (replacement character) -std::string utf8 = "\xEF\xBF\xBD"; // U+FFFD in UTF-8 +std::string utf8 = "\xEF\xBF\xBD"; // U+FFFD in UTF-8 std::u16string utf16 = fury::utf8ToUtf16(utf8, true); -std::u16string expected_utf16 = {0xFFFD}; // Expected UTF-16 representation of U+FFFD +std::u16string expected_utf16 = { + 0xFFFD}; // Expected UTF-16 representation of U+FFFD ASSERT_EQ(utf16, expected_utf16); } - // Testing Special Characters TEST(UTF8ToUTF16Test, SpecialCharacters) { std::string utf8 = " \n\t"; @@ -442,15 +442,16 @@ ASSERT_EQ(utf16, u" \n\t"); TEST(UTF8ToUTF16Test, LittleEndian) { std::string utf8 = "ab"; std::u16string utf16 = fury::utf8ToUtf16(utf8, true); -std::u16string expected_utf16 = {0x61, 0x62}; // Little-endian UTF-16 representation of "ab" +std::u16string expected_utf16 = { + 0x61, 0x62}; // Little-endian UTF-16 representation of "ab" ASSERT_EQ(utf16, expected_utf16); } // Correct BigEndian testing for BOM (Byte Order Mark) TEST(UTF8ToUTF16Test, BigEndian) { -std::string utf8 = "\xEF\xBB\xBF"; // BOM in UTF-8 (0xFEFF) -std::u16string utf16 = fury::utf8ToUtf16(utf8, false); // Big-endian -std::u16string expected_utf16 = {0xFFFE}; // Expected BOM in UTF-16 +std::string utf8 = "\xEF\xBB\xBF"; // BOM in UTF-8 (0xFEFF) +std::u16string utf16 = fury::utf8ToUtf16(utf8, false); // Big-endian +std::u16string expected_utf16 = {0xFFFE}; // Expected BOM in UTF-16 ASSERT_EQ(utf16, expected_utf16); } @@ -483,10 +484,13 @@ std::wstring_convert, char16_t> convert; std::u16string utf16 = convert.from_bytes(str); } auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast(end_time - start_time).count(); +auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); FURY_LOG(INFO) << "Standard Library Running Time: " << duration << " ns"; } catch (const std::exception &e) { -FURY_LOG(FATAL) << "Caught exception in standard library conversion: " << e.what(); +FURY_LOG(FATAL) << "Caught exception in standard library conversion: " +<< e.what(); } // BaseLine @@ -496,7 +500,9 @@ for (const auto &str : test_strings) { std::u16string utf16 = utf8ToUtf16BaseLine(str, is_little_endian); } auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast(end_time - start_time).count(); +auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns"; } catch (const std::exception &e) { FURY_LOG(FATAL) << "Caught exception in baseline conversion: " << e.what(); @@ -509,16 +515,19 @@ for (const auto &str : test_strings) { std::u16string utf16 = fury::utf8ToUtf16(str, is_little_endian); } auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast(end_time - start_time).count(); +auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); FURY_LOG(INFO) << "SIMD Optimized Running Time: " << duration << " ns"; } catch (const std::exception &e) { -FURY_LOG(FATAL) << "Caught exception in SIMD optimized conversion: " << e.what(); +FURY_LOG(FATAL) << "Caught exception in SIMD optimized conversion: " +<< e.what(); } } } // namespace fury int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file From d63097ac92a7af97245c794ed98c3745993ac214 Mon Sep 17 00:00:00 2001 From: pandalee99 <1162953505@qq.com> Date: Tue, 24 Dec 2024 20:51:18 +0800 Subject: [PATCH 3/6] fix --- cpp/fury/util/string_util.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/fury/util/string_util.cc b/cpp/fury/util/string_util.cc index ff4dbb573a..866de3b4bf 100644 --- a/cpp/fury/util/string_util.cc +++ b/cpp/fury/util/string_util.cc @@ -28,7 +28,6 @@ #endif #include -#include #include namespace fury { From 672868e0aacd0742006c6a83651529d822e6ab8c Mon Sep 17 00:00:00 2001 From: pandalee99 <1162953505@qq.com> Date: Wed, 25 Dec 2024 23:06:04 +0800 Subject: [PATCH 4/6] fix bug --- cpp/fury/util/string_util_test.cc | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/cpp/fury/util/string_util_test.cc b/cpp/fury/util/string_util_test.cc index 9328093f7e..69a6c5c457 100644 --- a/cpp/fury/util/string_util_test.cc +++ b/cpp/fury/util/string_util_test.cc @@ -307,6 +307,13 @@ std::string generateRandomUTF8String(size_t length) { while (str.size() < length) { uint32_t code_point = distribution(generator); + // Skip surrogate pairs (0xD800 to 0xDFFF) and other invalid Unicode code + // points + if ((code_point >= 0xD800 && code_point <= 0xDFFF) || + code_point > 0x10FFFF) { + continue; + } + if (code_point <= 0x7F) { str.push_back(static_cast(code_point)); } else if (code_point <= 0x7FF) { @@ -479,9 +486,14 @@ test_strings.push_back(generateRandomUTF8String(string_length)); // Standard Library try { auto start_time = std::chrono::high_resolution_clock::now(); +std::wstring_convert, wchar_t> convert; +// Loop through test strings and convert each UTF-8 string to UTF-16 for (const auto &str : test_strings) { -std::wstring_convert, char16_t> convert; -std::u16string utf16 = convert.from_bytes(str); +std::wstring wide_str = convert.from_bytes(str); +std::u16string utf16; +for (wchar_t wc : wide_str) { +utf16.push_back(static_cast(wc)); +} } auto end_time = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast( From cd2df8d13c0369ad6c8ccfe73662a6a306fcacaa Mon Sep 17 00:00:00 2001 From: pandalee99 <1162953505@qq.com> Date: Wed, 25 Dec 2024 23:46:31 +0800 Subject: [PATCH 5/6] clang-format --- cpp/fury/util/string_util.cc | 484 +++++++++--------- cpp/fury/util/string_util.h | 6 +- cpp/fury/util/string_util_test.cc | 796 +++++++++++++++--------------- 3 files changed, 643 insertions(+), 643 deletions(-) diff --git a/cpp/fury/util/string_util.cc b/cpp/fury/util/string_util.cc index 866de3b4bf..d6ae9475d9 100644 --- a/cpp/fury/util/string_util.cc +++ b/cpp/fury/util/string_util.cc @@ -33,272 +33,272 @@ namespace fury { // Swap bytes to convert from big endian to little endian - inline uint16_t swapBytes(uint16_t value) { - return (value >> 8) | (value << 8); - } +inline uint16_t swapBytes(uint16_t value) { + return (value >> 8) | (value << 8); +} - inline void utf16ToUtf8(uint16_t code_unit, char *&output) { - if (code_unit < 0x80) { - *output++ = static_cast(code_unit); - } else if (code_unit < 0x800) { - *output++ = static_cast(0xC0 | (code_unit >> 6)); - *output++ = static_cast(0x80 | (code_unit & 0x3F)); - } else { - *output++ = static_cast(0xE0 | (code_unit >> 12)); - *output++ = static_cast(0x80 | ((code_unit >> 6) & 0x3F)); - *output++ = static_cast(0x80 | (code_unit & 0x3F)); +inline void utf16ToUtf8(uint16_t code_unit, char *&output) { + if (code_unit < 0x80) { + *output++ = static_cast(code_unit); + } else if (code_unit < 0x800) { + *output++ = static_cast(0xC0 | (code_unit >> 6)); + *output++ = static_cast(0x80 | (code_unit & 0x3F)); + } else { + *output++ = static_cast(0xE0 | (code_unit >> 12)); + *output++ = static_cast(0x80 | ((code_unit >> 6) & 0x3F)); + *output++ = static_cast(0x80 | (code_unit & 0x3F)); + } +} + +inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) { + uint32_t code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00); + *utf8++ = static_cast((code_point >> 18) | 0xF0); + *utf8++ = static_cast(((code_point >> 12) & 0x3F) | 0x80); + *utf8++ = static_cast(((code_point >> 6) & 0x3F) | 0x80); + *utf8++ = static_cast((code_point & 0x3F) | 0x80); +} + +std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) { + std::u16string utf16; + utf16.reserve(utf8.size()); // Reserve space to avoid frequent reallocations + + char buffer[64]; // Buffer to hold temporary UTF-16 results + char16_t *output = + reinterpret_cast(buffer); // Use char16_t for output + + size_t i = 0; + size_t n = utf8.size(); + + while (i + 32 <= n) { + + // Now process the characters in 'in' SIMD register + for (int j = 0; j < 32; ++j) { + uint8_t byte = utf8[i + j]; + + if (byte < 0x80) { + // 1-byte character (ASCII) + *output++ = static_cast(byte); + } else if (byte < 0xE0) { + // 2-byte character + uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + j + 1] & 0x3F); + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | + (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + ++j; + } else if (byte < 0xF0) { + // 3-byte character + uint16_t utf16_char = ((byte & 0x0F) << 12) | + ((utf8[i + j + 1] & 0x3F) << 6) | + (utf8[i + j + 2] & 0x3F); + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | + (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + j += 2; + } else { + // 4-byte character (surrogate pair handling required) + uint32_t code_point = + ((byte & 0x07) << 18) | ((utf8[i + j + 1] & 0x3F) << 12) | + ((utf8[i + j + 2] & 0x3F) << 6) | (utf8[i + j + 3] & 0x3F); + + // Convert the code point to a surrogate pair + uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10); + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); + + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | + (high_surrogate << 8); // Swap bytes for big-endian + low_surrogate = (low_surrogate >> 8) | + (low_surrogate << 8); // Swap bytes for big-endian } - } - inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) { - uint32_t code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00); - *utf8++ = static_cast((code_point >> 18) | 0xF0); - *utf8++ = static_cast(((code_point >> 12) & 0x3F) | 0x80); - *utf8++ = static_cast(((code_point >> 6) & 0x3F) | 0x80); - *utf8++ = static_cast((code_point & 0x3F) | 0x80); + *output++ = high_surrogate; + *output++ = low_surrogate; + + j += 3; + } } - std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) { - std::u16string utf16; - utf16.reserve(utf8.size()); // Reserve space to avoid frequent reallocations - - char buffer[64]; // Buffer to hold temporary UTF-16 results - char16_t *output = - reinterpret_cast(buffer); // Use char16_t for output - - size_t i = 0; - size_t n = utf8.size(); - - while (i + 32 <= n) { - - // Now process the characters in 'in' SIMD register - for (int j = 0; j < 32; ++j) { - uint8_t byte = utf8[i + j]; - - if (byte < 0x80) { - // 1-byte character (ASCII) - *output++ = static_cast(byte); - } else if (byte < 0xE0) { - // 2-byte character - uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + j + 1] & 0x3F); - if (!is_little_endian) { - utf16_char = (utf16_char >> 8) | - (utf16_char << 8); // Swap bytes for big-endian - } - *output++ = utf16_char; - ++j; - } else if (byte < 0xF0) { - // 3-byte character - uint16_t utf16_char = ((byte & 0x0F) << 12) | - ((utf8[i + j + 1] & 0x3F) << 6) | - (utf8[i + j + 2] & 0x3F); - if (!is_little_endian) { - utf16_char = (utf16_char >> 8) | - (utf16_char << 8); // Swap bytes for big-endian - } - *output++ = utf16_char; - j += 2; - } else { - // 4-byte character (surrogate pair handling required) - uint32_t code_point = - ((byte & 0x07) << 18) | ((utf8[i + j + 1] & 0x3F) << 12) | - ((utf8[i + j + 2] & 0x3F) << 6) | (utf8[i + j + 3] & 0x3F); - - // Convert the code point to a surrogate pair - uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10); - uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); - - if (!is_little_endian) { - high_surrogate = (high_surrogate >> 8) | - (high_surrogate << 8); // Swap bytes for big-endian - low_surrogate = (low_surrogate >> 8) | - (low_surrogate << 8); // Swap bytes for big-endian - } - - *output++ = high_surrogate; - *output++ = low_surrogate; - - j += 3; - } - } - - // Append the processed buffer to the final utf16 string - utf16.append(reinterpret_cast(buffer), - output - reinterpret_cast(buffer)); - output = - reinterpret_cast(buffer); // Reset output buffer pointer - i += 32; - } + // Append the processed buffer to the final utf16 string + utf16.append(reinterpret_cast(buffer), + output - reinterpret_cast(buffer)); + output = + reinterpret_cast(buffer); // Reset output buffer pointer + i += 32; + } - // Handle remaining characters - while (i < n) { - uint8_t byte = utf8[i]; - - if (byte < 0x80) { - *output++ = static_cast(byte); - } else if (byte < 0xE0) { - uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + 1] & 0x3F); - if (!is_little_endian) { - utf16_char = - (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian - } - *output++ = utf16_char; - ++i; - } else if (byte < 0xF0) { - uint16_t utf16_char = ((byte & 0x0F) << 12) | - ((utf8[i + 1] & 0x3F) << 6) | (utf8[i + 2] & 0x3F); - if (!is_little_endian) { - utf16_char = - (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian - } - *output++ = utf16_char; - i += 2; - } else { - uint32_t code_point = ((byte & 0x07) << 18) | - ((utf8[i + 1] & 0x3F) << 12) | - ((utf8[i + 2] & 0x3F) << 6) | (utf8[i + 3] & 0x3F); - - uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10); - uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); - - if (!is_little_endian) { - high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); - low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); - } - - *output++ = high_surrogate; - *output++ = low_surrogate; - - i += 3; - } - - ++i; - } + // Handle remaining characters + while (i < n) { + uint8_t byte = utf8[i]; - // Append the last part of the buffer to the utf16 string - utf16.append(reinterpret_cast(buffer), - output - reinterpret_cast(buffer)); + if (byte < 0x80) { + *output++ = static_cast(byte); + } else if (byte < 0xE0) { + uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + 1] & 0x3F); + if (!is_little_endian) { + utf16_char = + (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + ++i; + } else if (byte < 0xF0) { + uint16_t utf16_char = ((byte & 0x0F) << 12) | + ((utf8[i + 1] & 0x3F) << 6) | (utf8[i + 2] & 0x3F); + if (!is_little_endian) { + utf16_char = + (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + i += 2; + } else { + uint32_t code_point = ((byte & 0x07) << 18) | + ((utf8[i + 1] & 0x3F) << 12) | + ((utf8[i + 2] & 0x3F) << 6) | (utf8[i + 3] & 0x3F); - return utf16; + uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10); + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); + + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); + low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); + } + + *output++ = high_surrogate; + *output++ = low_surrogate; + + i += 3; } + ++i; + } + + // Append the last part of the buffer to the utf16 string + utf16.append(reinterpret_cast(buffer), + output - reinterpret_cast(buffer)); + + return utf16; +} + #if defined(__x86_64__) || defined(_M_X64) - bool isLatin(const std::string &str) { - const char *data = str.data(); - size_t len = str.size(); - - size_t i = 0; - __m256i latin_mask = _mm256_set1_epi8(0x80); - for (; i + 32 <= len; i += 32) { - __m256i chars = - _mm256_loadu_si256(reinterpret_cast(data + i)); - __m256i result = _mm256_and_si256(chars, latin_mask); - if (!_mm256_testz_si256(result, result)) { - return false; - } - } +bool isLatin(const std::string &str) { + const char *data = str.data(); + size_t len = str.size(); - for (; i < len; ++i) { - if (static_cast(data[i]) >= 128) { - return false; - } - } + size_t i = 0; + __m256i latin_mask = _mm256_set1_epi8(0x80); + for (; i + 32 <= len; i += 32) { + __m256i chars = + _mm256_loadu_si256(reinterpret_cast(data + i)); + __m256i result = _mm256_and_si256(chars, latin_mask); + if (!_mm256_testz_si256(result, result)) { + return false; + } + } + + for (; i < len; ++i) { + if (static_cast(data[i]) >= 128) { + return false; + } + } + + return true; +} - return true; +std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { + std::string utf8; + utf8.reserve(utf16.size() * + 3); // Reserve enough space to avoid frequent reallocations + + const __m256i limit1 = _mm256_set1_epi16(0x80); + const __m256i limit2 = _mm256_set1_epi16(0x800); + const __m256i surrogate_high_start = _mm256_set1_epi16(0xD800); + const __m256i surrogate_high_end = _mm256_set1_epi16(0xDBFF); + const __m256i surrogate_low_start = _mm256_set1_epi16(0xDC00); + const __m256i surrogate_low_end = _mm256_set1_epi16(0xDFFF); + + char buffer[64]; // Buffer to hold temporary UTF-8 bytes + char *output = buffer; + + size_t i = 0; + size_t n = utf16.size(); + + while (i + 16 <= n) { + __m256i in = + _mm256_loadu_si256(reinterpret_cast(utf16.data() + i)); + + if (!is_little_endian) { + in = _mm256_or_si256( + _mm256_slli_epi16(in, 8), + _mm256_srli_epi16(in, 8)); // Swap bytes for big-endian } - std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { - std::string utf8; - utf8.reserve(utf16.size() * - 3); // Reserve enough space to avoid frequent reallocations - - const __m256i limit1 = _mm256_set1_epi16(0x80); - const __m256i limit2 = _mm256_set1_epi16(0x800); - const __m256i surrogate_high_start = _mm256_set1_epi16(0xD800); - const __m256i surrogate_high_end = _mm256_set1_epi16(0xDBFF); - const __m256i surrogate_low_start = _mm256_set1_epi16(0xDC00); - const __m256i surrogate_low_end = _mm256_set1_epi16(0xDFFF); - - char buffer[64]; // Buffer to hold temporary UTF-8 bytes - char *output = buffer; - - size_t i = 0; - size_t n = utf16.size(); - - while (i + 16 <= n) { - __m256i in = - _mm256_loadu_si256(reinterpret_cast(utf16.data() + i)); - - if (!is_little_endian) { - in = _mm256_or_si256( - _mm256_slli_epi16(in, 8), - _mm256_srli_epi16(in, 8)); // Swap bytes for big-endian - } - - __m256i mask1 = _mm256_cmpgt_epi16(in, limit1); - __m256i mask2 = _mm256_cmpgt_epi16(in, limit2); - __m256i high_surrogate_mask = - _mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_high_start), - _mm256_cmpgt_epi16(in, surrogate_high_end)); - __m256i low_surrogate_mask = - _mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_low_start), - _mm256_cmpgt_epi16(in, surrogate_low_end)); - - if (_mm256_testz_si256(mask1, mask1)) { - // All values < 0x80, 1 byte per character - for (int j = 0; j < 16; ++j) { - *output++ = static_cast(utf16[i + j]); - } - } else if (_mm256_testz_si256(mask2, mask2)) { - // All values < 0x800, 2 bytes per character - for (int j = 0; j < 16; ++j) { - utf16ToUtf8(utf16[i + j], output); - } - } else { - // Mix of 1, 2, and 3 byte characters - for (int j = 0; j < 16; ++j) { - if (_mm256_testz_si256(high_surrogate_mask, high_surrogate_mask) && - j + 1 < 16 && - !_mm256_testz_si256(low_surrogate_mask, low_surrogate_mask)) { - // Surrogate pair - utf16SurrogatePairToUtf8(utf16[i + j], utf16[i + j + 1], output); - ++j; - } else { - utf16ToUtf8(utf16[i + j], output); - } - } - } - - utf8.append(buffer, output - buffer); - output = buffer; // Reset output buffer pointer - i += 16; + __m256i mask1 = _mm256_cmpgt_epi16(in, limit1); + __m256i mask2 = _mm256_cmpgt_epi16(in, limit2); + __m256i high_surrogate_mask = + _mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_high_start), + _mm256_cmpgt_epi16(in, surrogate_high_end)); + __m256i low_surrogate_mask = + _mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_low_start), + _mm256_cmpgt_epi16(in, surrogate_low_end)); + + if (_mm256_testz_si256(mask1, mask1)) { + // All values < 0x80, 1 byte per character + for (int j = 0; j < 16; ++j) { + *output++ = static_cast(utf16[i + j]); + } + } else if (_mm256_testz_si256(mask2, mask2)) { + // All values < 0x800, 2 bytes per character + for (int j = 0; j < 16; ++j) { + utf16ToUtf8(utf16[i + j], output); + } + } else { + // Mix of 1, 2, and 3 byte characters + for (int j = 0; j < 16; ++j) { + if (_mm256_testz_si256(high_surrogate_mask, high_surrogate_mask) && + j + 1 < 16 && + !_mm256_testz_si256(low_surrogate_mask, low_surrogate_mask)) { + // Surrogate pair + utf16SurrogatePairToUtf8(utf16[i + j], utf16[i + j + 1], output); + ++j; + } else { + utf16ToUtf8(utf16[i + j], output); } + } + } - // Handle remaining characters - while (i < n) { - if (i + 1 < n && utf16[i] >= 0xD800 && utf16[i] <= 0xDBFF && - utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) { - // Surrogate pair - utf16SurrogatePairToUtf8(utf16[i], utf16[i + 1], output); - ++i; - } else { - utf16ToUtf8(utf16[i], output); - } - ++i; - } - utf8.append(buffer, output - buffer); + utf8.append(buffer, output - buffer); + output = buffer; // Reset output buffer pointer + i += 16; + } - return utf8; + // Handle remaining characters + while (i < n) { + if (i + 1 < n && utf16[i] >= 0xD800 && utf16[i] <= 0xDBFF && + utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) { + // Surrogate pair + utf16SurrogatePairToUtf8(utf16[i], utf16[i + 1], output); + ++i; + } else { + utf16ToUtf8(utf16[i], output); } + ++i; + } + utf8.append(buffer, output - buffer); - std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { - return utf8ToUtf16SIMD(utf8, is_little_endian); - } + return utf8; +} + +std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { + return utf8ToUtf16SIMD(utf8, is_little_endian); +} #elif defined(__ARM_NEON) || defined(__ARM_NEON__) - bool isLatin(const std::string &str) { +bool isLatin(const std::string &str) { const char *data = str.data(); size_t len = str.size(); diff --git a/cpp/fury/util/string_util.h b/cpp/fury/util/string_util.h index 8a200afe38..d5bdde543a 100644 --- a/cpp/fury/util/string_util.h +++ b/cpp/fury/util/string_util.h @@ -23,10 +23,10 @@ namespace fury { - bool isLatin(const std::string &str); +bool isLatin(const std::string &str); - std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian); +std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian); - std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian); +std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian); } // namespace fury \ No newline at end of file diff --git a/cpp/fury/util/string_util_test.cc b/cpp/fury/util/string_util_test.cc index 69a6c5c457..60c72c80b3 100644 --- a/cpp/fury/util/string_util_test.cc +++ b/cpp/fury/util/string_util_test.cc @@ -30,516 +30,516 @@ namespace fury { // Function to generate a random string - std::string generateRandomString(size_t length) { - const char charset[] = - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; - std::default_random_engine rng(std::random_device{}()); - std::uniform_int_distribution<> dist(0, sizeof(charset) - 2); - - std::string result; - result.reserve(length); - for (size_t i = 0; i < length; ++i) { - result += charset[dist(rng)]; - } - - return result; - } +std::string generateRandomString(size_t length) { + const char charset[] = + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + std::default_random_engine rng(std::random_device{}()); + std::uniform_int_distribution<> dist(0, sizeof(charset) - 2); + + std::string result; + result.reserve(length); + for (size_t i = 0; i < length; ++i) { + result += charset[dist(rng)]; + } - bool isLatin_BaseLine(const std::string &str) { - for (char c : str) { - if (static_cast(c) >= 128) { - return false; - } - } - return true; + return result; +} + +bool isLatin_BaseLine(const std::string &str) { + for (char c : str) { + if (static_cast(c) >= 128) { + return false; } + } + return true; +} - TEST(StringUtilTest, TestIsLatinFunctions) { - std::string testStr = generateRandomString(100000); - auto start_time = std::chrono::high_resolution_clock::now(); - bool result = isLatin_BaseLine(testStr); - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); - FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns."; - - start_time = std::chrono::high_resolution_clock::now(); - result = isLatin(testStr); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - - start_time) - .count(); - FURY_LOG(INFO) << "Optimized Running Time: " << duration << " ns."; - - EXPECT_TRUE(result); +TEST(StringUtilTest, TestIsLatinFunctions) { + std::string testStr = generateRandomString(100000); + auto start_time = std::chrono::high_resolution_clock::now(); + bool result = isLatin_BaseLine(testStr); + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns."; + + start_time = std::chrono::high_resolution_clock::now(); + result = isLatin(testStr); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - + start_time) + .count(); + FURY_LOG(INFO) << "Optimized Running Time: " << duration << " ns."; + + EXPECT_TRUE(result); } TEST(StringUtilTest, TestIsLatinLogic) { -// Test strings with only Latin characters -EXPECT_TRUE(isLatin("Fury")); -EXPECT_TRUE(isLatin(generateRandomString(80))); - -// Test unaligned strings with only Latin characters -EXPECT_TRUE(isLatin(generateRandomString(80) + "1")); -EXPECT_TRUE(isLatin(generateRandomString(80) + "12")); -EXPECT_TRUE(isLatin(generateRandomString(80) + "123")); - -// Test strings with non-Latin characters -EXPECT_FALSE(isLatin("你好, Fury")); -EXPECT_FALSE(isLatin(generateRandomString(80) + "你好")); -EXPECT_FALSE(isLatin(generateRandomString(80) + "1你好")); -EXPECT_FALSE(isLatin(generateRandomString(11) + "你")); -EXPECT_FALSE(isLatin(generateRandomString(10) + "你好")); -EXPECT_FALSE(isLatin(generateRandomString(9) + "性能好")); -EXPECT_FALSE(isLatin("\u1234")); -EXPECT_FALSE(isLatin("a\u1234")); -EXPECT_FALSE(isLatin("ab\u1234")); -EXPECT_FALSE(isLatin("abc\u1234")); -EXPECT_FALSE(isLatin("abcd\u1234")); -EXPECT_FALSE(isLatin("Javaone Keynote\u1234")); + // Test strings with only Latin characters + EXPECT_TRUE(isLatin("Fury")); + EXPECT_TRUE(isLatin(generateRandomString(80))); + + // Test unaligned strings with only Latin characters + EXPECT_TRUE(isLatin(generateRandomString(80) + "1")); + EXPECT_TRUE(isLatin(generateRandomString(80) + "12")); + EXPECT_TRUE(isLatin(generateRandomString(80) + "123")); + + // Test strings with non-Latin characters + EXPECT_FALSE(isLatin("你好, Fury")); + EXPECT_FALSE(isLatin(generateRandomString(80) + "你好")); + EXPECT_FALSE(isLatin(generateRandomString(80) + "1你好")); + EXPECT_FALSE(isLatin(generateRandomString(11) + "你")); + EXPECT_FALSE(isLatin(generateRandomString(10) + "你好")); + EXPECT_FALSE(isLatin(generateRandomString(9) + "性能好")); + EXPECT_FALSE(isLatin("\u1234")); + EXPECT_FALSE(isLatin("a\u1234")); + EXPECT_FALSE(isLatin("ab\u1234")); + EXPECT_FALSE(isLatin("abc\u1234")); + EXPECT_FALSE(isLatin("abcd\u1234")); + EXPECT_FALSE(isLatin("Javaone Keynote\u1234")); } // Generate random UTF-16 string ensuring valid surrogate pairs std::u16string generateRandomUTF16String(size_t length) { - std::u16string str; - std::mt19937 generator(std::random_device{}()); - std::uniform_int_distribution distribution(0, 0x10FFFF); - - while (str.size() < length) { - uint32_t code_point = distribution(generator); - - if (code_point <= 0xD7FF || - (code_point >= 0xE000 && code_point <= 0xFFFF)) { - str.push_back(static_cast(code_point)); - } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { - code_point -= 0x10000; - str.push_back(static_cast((code_point >> 10) + 0xD800)); - str.push_back(static_cast((code_point & 0x3FF) + 0xDC00)); - } + std::u16string str; + std::mt19937 generator(std::random_device{}()); + std::uniform_int_distribution distribution(0, 0x10FFFF); + + while (str.size() < length) { + uint32_t code_point = distribution(generator); + + if (code_point <= 0xD7FF || + (code_point >= 0xE000 && code_point <= 0xFFFF)) { + str.push_back(static_cast(code_point)); + } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { + code_point -= 0x10000; + str.push_back(static_cast((code_point >> 10) + 0xD800)); + str.push_back(static_cast((code_point & 0x3FF) + 0xDC00)); } + } - return str; + return str; } // Basic implementation // Swap bytes to convert from big endian to little endian inline uint16_t swapBytes(uint16_t value) { - return (value >> 8) | (value << 8); + return (value >> 8) | (value << 8); } inline void utf16ToUtf8(uint16_t code_unit, char *&output) { - if (code_unit < 0x80) { - *output++ = static_cast(code_unit); - } else if (code_unit < 0x800) { - *output++ = static_cast(0xC0 | (code_unit >> 6)); - *output++ = static_cast(0x80 | (code_unit & 0x3F)); - } else { - *output++ = static_cast(0xE0 | (code_unit >> 12)); - *output++ = static_cast(0x80 | ((code_unit >> 6) & 0x3F)); - *output++ = static_cast(0x80 | (code_unit & 0x3F)); - } + if (code_unit < 0x80) { + *output++ = static_cast(code_unit); + } else if (code_unit < 0x800) { + *output++ = static_cast(0xC0 | (code_unit >> 6)); + *output++ = static_cast(0x80 | (code_unit & 0x3F)); + } else { + *output++ = static_cast(0xE0 | (code_unit >> 12)); + *output++ = static_cast(0x80 | ((code_unit >> 6) & 0x3F)); + *output++ = static_cast(0x80 | (code_unit & 0x3F)); + } } inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) { - uint32_t code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00); - *utf8++ = static_cast((code_point >> 18) | 0xF0); - *utf8++ = static_cast(((code_point >> 12) & 0x3F) | 0x80); - *utf8++ = static_cast(((code_point >> 6) & 0x3F) | 0x80); - *utf8++ = static_cast((code_point & 0x3F) | 0x80); + uint32_t code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00); + *utf8++ = static_cast((code_point >> 18) | 0xF0); + *utf8++ = static_cast(((code_point >> 12) & 0x3F) | 0x80); + *utf8++ = static_cast(((code_point >> 6) & 0x3F) | 0x80); + *utf8++ = static_cast((code_point & 0x3F) | 0x80); } std::string utf16ToUtf8BaseLine(const std::u16string &utf16, bool is_little_endian) { - std::string utf8; - utf8.reserve(utf16.size() * - 3); // Reserve enough space to avoid frequent reallocations - - size_t i = 0; - size_t n = utf16.size(); - char buffer[4]; // Buffer to hold temporary UTF-8 bytes - char *output = buffer; - - while (i < n) { - uint16_t code_unit = utf16[i]; - if (!is_little_endian) { - code_unit = swapBytes(code_unit); - } - if (i + 1 < n && code_unit >= 0xD800 && code_unit <= 0xDBFF && - utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) { - // Surrogate pair - uint16_t high = code_unit; - uint16_t low = utf16[i + 1]; - if (!is_little_endian) { - low = swapBytes(low); - } - utf16SurrogatePairToUtf8(high, low, output); - utf8.append(buffer, output - buffer); - output = buffer; - ++i; - } else { - utf16ToUtf8(code_unit, output); - utf8.append(buffer, output - buffer); - output = buffer; - } - ++i; + std::string utf8; + utf8.reserve(utf16.size() * + 3); // Reserve enough space to avoid frequent reallocations + + size_t i = 0; + size_t n = utf16.size(); + char buffer[4]; // Buffer to hold temporary UTF-8 bytes + char *output = buffer; + + while (i < n) { + uint16_t code_unit = utf16[i]; + if (!is_little_endian) { + code_unit = swapBytes(code_unit); + } + if (i + 1 < n && code_unit >= 0xD800 && code_unit <= 0xDBFF && + utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) { + // Surrogate pair + uint16_t high = code_unit; + uint16_t low = utf16[i + 1]; + if (!is_little_endian) { + low = swapBytes(low); + } + utf16SurrogatePairToUtf8(high, low, output); + utf8.append(buffer, output - buffer); + output = buffer; + ++i; + } else { + utf16ToUtf8(code_unit, output); + utf8.append(buffer, output - buffer); + output = buffer; } - return utf8; + ++i; + } + return utf8; } // Testing Basic Logic TEST(UTF16ToUTF8Test, BasicConversion) { -std::u16string utf16 = u"Hello, 世界!"; -std::string utf8 = fury::utf16ToUtf8(utf16, true); -ASSERT_EQ(utf8, u8"Hello, 世界!"); + std::u16string utf16 = u"Hello, 世界!"; + std::string utf8 = fury::utf16ToUtf8(utf16, true); + ASSERT_EQ(utf8, u8"Hello, 世界!"); } // Testing Empty String TEST(UTF16ToUTF8Test, EmptyString) { -std::u16string utf16 = u""; -std::string utf8 = fury::utf16ToUtf8(utf16, true); -ASSERT_EQ(utf8, ""); + std::u16string utf16 = u""; + std::string utf8 = fury::utf16ToUtf8(utf16, true); + ASSERT_EQ(utf8, ""); } // Testing emoji TEST(UTF16ToUTF8Test, SurrogatePairs) { -std::u16string utf16 = {0xD83D, 0xDE00}; // 😀 emoji -std::string utf8 = fury::utf16ToUtf8(utf16, true); -ASSERT_EQ(utf8, "\xF0\x9F\x98\x80"); + std::u16string utf16 = {0xD83D, 0xDE00}; // 😀 emoji + std::string utf8 = fury::utf16ToUtf8(utf16, true); + ASSERT_EQ(utf8, "\xF0\x9F\x98\x80"); } // Testing Boundary TEST(UTF16ToUTF8Test, BoundaryValues) { -std::u16string utf16 = {0x0000, 0xFFFF}; -std::string utf8 = fury::utf16ToUtf8(utf16, true); -std::string expected_utf8 = std::string("\x00", 1) + "\xEF\xBF\xBF"; -ASSERT_EQ(utf8, expected_utf8); + std::u16string utf16 = {0x0000, 0xFFFF}; + std::string utf8 = fury::utf16ToUtf8(utf16, true); + std::string expected_utf8 = std::string("\x00", 1) + "\xEF\xBF\xBF"; + ASSERT_EQ(utf8, expected_utf8); } // Testing Special Characters TEST(UTF16ToUTF8Test, SpecialCharacters) { -std::u16string utf16 = u" \n\t"; -std::string utf8 = fury::utf16ToUtf8(utf16, true); -ASSERT_EQ(utf8, " \n\t"); + std::u16string utf16 = u" \n\t"; + std::string utf8 = fury::utf16ToUtf8(utf16, true); + ASSERT_EQ(utf8, " \n\t"); } // Testing LittleEndian TEST(UTF16ToUTF8Test, LittleEndian) { -std::u16string utf16 = {0x61, 0x62}; // "ab" -std::string utf8 = fury::utf16ToUtf8(utf16, true); -ASSERT_EQ(utf8, "ab"); + std::u16string utf16 = {0x61, 0x62}; // "ab" + std::string utf8 = fury::utf16ToUtf8(utf16, true); + ASSERT_EQ(utf8, "ab"); } // Testing BigEndian TEST(UTF16ToUTF8Test, BigEndian) { -std::u16string utf16 = {0xFFFE, 0xFFFE}; -std::string utf8 = fury::utf16ToUtf8(utf16, false); -ASSERT_EQ(utf8, "\xEF\xBF\xBE\xEF\xBF\xBE"); + std::u16string utf16 = {0xFFFE, 0xFFFE}; + std::string utf8 = fury::utf16ToUtf8(utf16, false); + ASSERT_EQ(utf8, "\xEF\xBF\xBE\xEF\xBF\xBE"); } // Testing Performance TEST(UTF16ToUTF8Test, PerformanceTest) { -const size_t num_tests = 1000; -const size_t string_length = 1000; -// Default little_endian -bool is_little_endian = true; - -// Random UTF-16 -std::vector test_strings; -for (size_t i = 0; i < num_tests; ++i) { -test_strings.push_back(generateRandomUTF16String(string_length)); -} - -// Lib -try { -auto start_time = std::chrono::high_resolution_clock::now(); -for (const auto &str : test_strings) { -std::wstring_convert, char16_t> convert; -std::string utf8 = convert.to_bytes(str); -} -auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); -FURY_LOG(INFO) << "Standard library Running Time: " << duration << " ns"; -} catch (const std::exception &e) { -FURY_LOG(FATAL) << "Caught exception: " << e.what(); -} - -// BaseLine -try { -auto start_time = std::chrono::high_resolution_clock::now(); -for (const auto &str : test_strings) { -std::string utf8 = utf16ToUtf8BaseLine(str, is_little_endian); -} -auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); -FURY_LOG(INFO) << "Baseline Running Time: " << duration << " ns"; -} catch (const std::exception &e) { -FURY_LOG(FATAL) << "Caught exception: " << e.what(); -} - -// SIMD -try { -auto start_time = std::chrono::high_resolution_clock::now(); -for (const auto &str : test_strings) { -std::string utf8 = fury::utf16ToUtf8(str, is_little_endian); -} -auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); -FURY_LOG(INFO) << "SIMD Running Time: " << duration << " ns"; -} catch (const std::exception &e) { -FURY_LOG(FATAL) << "Caught exception: " << e.what(); -} + const size_t num_tests = 1000; + const size_t string_length = 1000; + // Default little_endian + bool is_little_endian = true; + + // Random UTF-16 + std::vector test_strings; + for (size_t i = 0; i < num_tests; ++i) { + test_strings.push_back(generateRandomUTF16String(string_length)); + } + + // Lib + try { + auto start_time = std::chrono::high_resolution_clock::now(); + for (const auto &str : test_strings) { + std::wstring_convert, char16_t> convert; + std::string utf8 = convert.to_bytes(str); + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "Standard library Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception: " << e.what(); + } + + // BaseLine + try { + auto start_time = std::chrono::high_resolution_clock::now(); + for (const auto &str : test_strings) { + std::string utf8 = utf16ToUtf8BaseLine(str, is_little_endian); + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "Baseline Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception: " << e.what(); + } + + // SIMD + try { + auto start_time = std::chrono::high_resolution_clock::now(); + for (const auto &str : test_strings) { + std::string utf8 = fury::utf16ToUtf8(str, is_little_endian); + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "SIMD Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception: " << e.what(); + } } // Generate random UTF-8 string std::string generateRandomUTF8String(size_t length) { - std::string str; - std::mt19937 generator(std::random_device{}()); - std::uniform_int_distribution distribution(0, 0x10FFFF); - - while (str.size() < length) { - uint32_t code_point = distribution(generator); - - // Skip surrogate pairs (0xD800 to 0xDFFF) and other invalid Unicode code - // points - if ((code_point >= 0xD800 && code_point <= 0xDFFF) || - code_point > 0x10FFFF) { - continue; - } - - if (code_point <= 0x7F) { - str.push_back(static_cast(code_point)); - } else if (code_point <= 0x7FF) { - str.push_back(0xC0 | (code_point >> 6)); - str.push_back(0x80 | (code_point & 0x3F)); - } else if (code_point <= 0xFFFF) { - str.push_back(0xE0 | (code_point >> 12)); - str.push_back(0x80 | ((code_point >> 6) & 0x3F)); - str.push_back(0x80 | (code_point & 0x3F)); - } else if (code_point <= 0x10FFFF) { - str.push_back(0xF0 | (code_point >> 18)); - str.push_back(0x80 | ((code_point >> 12) & 0x3F)); - str.push_back(0x80 | ((code_point >> 6) & 0x3F)); - str.push_back(0x80 | (code_point & 0x3F)); - } + std::string str; + std::mt19937 generator(std::random_device{}()); + std::uniform_int_distribution distribution(0, 0x10FFFF); + + while (str.size() < length) { + uint32_t code_point = distribution(generator); + + // Skip surrogate pairs (0xD800 to 0xDFFF) and other invalid Unicode code + // points + if ((code_point >= 0xD800 && code_point <= 0xDFFF) || + code_point > 0x10FFFF) { + continue; } - return str; + if (code_point <= 0x7F) { + str.push_back(static_cast(code_point)); + } else if (code_point <= 0x7FF) { + str.push_back(0xC0 | (code_point >> 6)); + str.push_back(0x80 | (code_point & 0x3F)); + } else if (code_point <= 0xFFFF) { + str.push_back(0xE0 | (code_point >> 12)); + str.push_back(0x80 | ((code_point >> 6) & 0x3F)); + str.push_back(0x80 | (code_point & 0x3F)); + } else if (code_point <= 0x10FFFF) { + str.push_back(0xF0 | (code_point >> 18)); + str.push_back(0x80 | ((code_point >> 12) & 0x3F)); + str.push_back(0x80 | ((code_point >> 6) & 0x3F)); + str.push_back(0x80 | (code_point & 0x3F)); + } + } + + return str; } std::u16string utf8ToUtf16BaseLine(const std::string &utf8, bool is_little_endian) { - std::u16string utf16; // Resulting UTF-16 string - size_t i = 0; // Index for traversing the UTF-8 string - size_t n = utf8.size(); // Total length of the UTF-8 string - - // Loop through each byte of the UTF-8 string - while (i < n) { - uint32_t code_point = 0; // The Unicode code point - unsigned char c = utf8[i]; // Current byte of the UTF-8 string - - // Determine the number of bytes for this character based on its first byte - if ((c & 0x80) == 0) { - // 1-byte character (ASCII) - code_point = c; - ++i; - } else if ((c & 0xE0) == 0xC0) { - // 2-byte character - code_point = c & 0x1F; - code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); - i += 2; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character - code_point = c & 0x0F; - code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); - code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); - i += 3; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character - code_point = c & 0x07; - code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); - code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); - code_point = (code_point << 6) | (utf8[i + 3] & 0x3F); - i += 4; - } else { - // Invalid UTF-8 byte sequence - throw std::invalid_argument("Invalid UTF-8 encoding."); - } - - // If the code point is beyond the BMP range, use surrogate pairs - if (code_point >= 0x10000) { - code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair - uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate - uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate - - // If not little-endian, swap bytes of the surrogates - if (!is_little_endian) { - high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); - low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); - } - - // Add both high and low surrogates to the UTF-16 string - utf16.push_back(high_surrogate); - utf16.push_back(low_surrogate); - } else { - // For code points within the BMP range, directly store as a 16-bit value - uint16_t utf16_char = static_cast(code_point); - - // If not little-endian, swap the bytes of the 16-bit character - if (!is_little_endian) { - utf16_char = (utf16_char >> 8) | (utf16_char << 8); - } - - // Add the UTF-16 character to the string - utf16.push_back(utf16_char); - } + std::u16string utf16; // Resulting UTF-16 string + size_t i = 0; // Index for traversing the UTF-8 string + size_t n = utf8.size(); // Total length of the UTF-8 string + + // Loop through each byte of the UTF-8 string + while (i < n) { + uint32_t code_point = 0; // The Unicode code point + unsigned char c = utf8[i]; // Current byte of the UTF-8 string + + // Determine the number of bytes for this character based on its first byte + if ((c & 0x80) == 0) { + // 1-byte character (ASCII) + code_point = c; + ++i; + } else if ((c & 0xE0) == 0xC0) { + // 2-byte character + code_point = c & 0x1F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + i += 2; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte character + code_point = c & 0x0F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + i += 3; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character + code_point = c & 0x07; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 3] & 0x3F); + i += 4; + } else { + // Invalid UTF-8 byte sequence + throw std::invalid_argument("Invalid UTF-8 encoding."); } - // Return the resulting UTF-16 string - return utf16; + // If the code point is beyond the BMP range, use surrogate pairs + if (code_point >= 0x10000) { + code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair + uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate + + // If not little-endian, swap bytes of the surrogates + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); + low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); + } + + // Add both high and low surrogates to the UTF-16 string + utf16.push_back(high_surrogate); + utf16.push_back(low_surrogate); + } else { + // For code points within the BMP range, directly store as a 16-bit value + uint16_t utf16_char = static_cast(code_point); + + // If not little-endian, swap the bytes of the 16-bit character + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | (utf16_char << 8); + } + + // Add the UTF-16 character to the string + utf16.push_back(utf16_char); + } + } + + // Return the resulting UTF-16 string + return utf16; } // Testing Basic Logic TEST(UTF8ToUTF16Test, BasicConversion) { -std::string utf8 = u8"Hello, 世界!"; -std::u16string utf16 = fury::utf8ToUtf16(utf8, true); -ASSERT_EQ(utf16, u"Hello, 世界!"); + std::string utf8 = u8"Hello, 世界!"; + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + ASSERT_EQ(utf16, u"Hello, 世界!"); } // Testing Empty String TEST(UTF8ToUTF16Test, EmptyString) { -std::string utf8 = ""; -std::u16string utf16 = fury::utf8ToUtf16(utf8, true); -ASSERT_EQ(utf16, u""); + std::string utf8 = ""; + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + ASSERT_EQ(utf16, u""); } // Testing emoji TEST(UTF8ToUTF16Test, SurrogatePairs) { -std::string utf8 = "\xF0\x9F\x98\x80"; // 😀 emoji -std::u16string utf16 = fury::utf8ToUtf16(utf8, true); -std::u16string expected_utf16 = {0xD83D, 0xDE00}; // Surrogate pair for emoji -ASSERT_EQ(utf16, expected_utf16); + std::string utf8 = "\xF0\x9F\x98\x80"; // 😀 emoji + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + std::u16string expected_utf16 = {0xD83D, 0xDE00}; // Surrogate pair for emoji + ASSERT_EQ(utf16, expected_utf16); } // Correct Boundary testing for U+FFFD (replacement character) TEST(UTF8ToUTF16Test, BoundaryValues) { -// "\xEF\xBF\xBD" is the UTF-8 encoding for U+FFFD (replacement character) -std::string utf8 = "\xEF\xBF\xBD"; // U+FFFD in UTF-8 -std::u16string utf16 = fury::utf8ToUtf16(utf8, true); -std::u16string expected_utf16 = { - 0xFFFD}; // Expected UTF-16 representation of U+FFFD -ASSERT_EQ(utf16, expected_utf16); + // "\xEF\xBF\xBD" is the UTF-8 encoding for U+FFFD (replacement character) + std::string utf8 = "\xEF\xBF\xBD"; // U+FFFD in UTF-8 + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + std::u16string expected_utf16 = { + 0xFFFD}; // Expected UTF-16 representation of U+FFFD + ASSERT_EQ(utf16, expected_utf16); } // Testing Special Characters TEST(UTF8ToUTF16Test, SpecialCharacters) { -std::string utf8 = " \n\t"; -std::u16string utf16 = fury::utf8ToUtf16(utf8, true); -ASSERT_EQ(utf16, u" \n\t"); + std::string utf8 = " \n\t"; + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + ASSERT_EQ(utf16, u" \n\t"); } // Testing LittleEndian TEST(UTF8ToUTF16Test, LittleEndian) { -std::string utf8 = "ab"; -std::u16string utf16 = fury::utf8ToUtf16(utf8, true); -std::u16string expected_utf16 = { - 0x61, 0x62}; // Little-endian UTF-16 representation of "ab" -ASSERT_EQ(utf16, expected_utf16); + std::string utf8 = "ab"; + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + std::u16string expected_utf16 = { + 0x61, 0x62}; // Little-endian UTF-16 representation of "ab" + ASSERT_EQ(utf16, expected_utf16); } // Correct BigEndian testing for BOM (Byte Order Mark) TEST(UTF8ToUTF16Test, BigEndian) { -std::string utf8 = "\xEF\xBB\xBF"; // BOM in UTF-8 (0xFEFF) -std::u16string utf16 = fury::utf8ToUtf16(utf8, false); // Big-endian -std::u16string expected_utf16 = {0xFFFE}; // Expected BOM in UTF-16 -ASSERT_EQ(utf16, expected_utf16); + std::string utf8 = "\xEF\xBB\xBF"; // BOM in UTF-8 (0xFEFF) + std::u16string utf16 = fury::utf8ToUtf16(utf8, false); // Big-endian + std::u16string expected_utf16 = {0xFFFE}; // Expected BOM in UTF-16 + ASSERT_EQ(utf16, expected_utf16); } // Testing round-trip conversion (UTF-8 -> UTF-16 -> UTF-8) TEST(UTF8ToUTF16Test, RoundTripConversion) { -std::string original_utf8 = u8"Hello, 世界!"; -std::u16string utf16 = fury::utf8ToUtf16(original_utf8, true); -std::string utf8_converted_back = fury::utf16ToUtf8(utf16, true); -ASSERT_EQ(original_utf8, utf8_converted_back); + std::string original_utf8 = u8"Hello, 世界!"; + std::u16string utf16 = fury::utf8ToUtf16(original_utf8, true); + std::string utf8_converted_back = fury::utf16ToUtf8(utf16, true); + ASSERT_EQ(original_utf8, utf8_converted_back); } // Testing Performance TEST(UTF8ToUTF16Test, PerformanceTest) { -const size_t num_tests = 1000; -const size_t string_length = 1000; -// Default little_endian -bool is_little_endian = true; - -// Random UTF-8 -std::vector test_strings; -for (size_t i = 0; i < num_tests; ++i) { -test_strings.push_back(generateRandomUTF8String(string_length)); -} - -// Standard Library -try { -auto start_time = std::chrono::high_resolution_clock::now(); -std::wstring_convert, wchar_t> convert; -// Loop through test strings and convert each UTF-8 string to UTF-16 -for (const auto &str : test_strings) { -std::wstring wide_str = convert.from_bytes(str); -std::u16string utf16; -for (wchar_t wc : wide_str) { -utf16.push_back(static_cast(wc)); -} -} -auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); -FURY_LOG(INFO) << "Standard Library Running Time: " << duration << " ns"; -} catch (const std::exception &e) { -FURY_LOG(FATAL) << "Caught exception in standard library conversion: " -<< e.what(); -} - -// BaseLine -try { -auto start_time = std::chrono::high_resolution_clock::now(); -for (const auto &str : test_strings) { -std::u16string utf16 = utf8ToUtf16BaseLine(str, is_little_endian); -} -auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); -FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns"; -} catch (const std::exception &e) { -FURY_LOG(FATAL) << "Caught exception in baseline conversion: " << e.what(); -} - -// Optimized (SIMD) -try { -auto start_time = std::chrono::high_resolution_clock::now(); -for (const auto &str : test_strings) { -std::u16string utf16 = fury::utf8ToUtf16(str, is_little_endian); -} -auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); -FURY_LOG(INFO) << "SIMD Optimized Running Time: " << duration << " ns"; -} catch (const std::exception &e) { -FURY_LOG(FATAL) << "Caught exception in SIMD optimized conversion: " -<< e.what(); -} + const size_t num_tests = 1000; + const size_t string_length = 1000; + // Default little_endian + bool is_little_endian = true; + + // Random UTF-8 + std::vector test_strings; + for (size_t i = 0; i < num_tests; ++i) { + test_strings.push_back(generateRandomUTF8String(string_length)); + } + + // Standard Library + try { + auto start_time = std::chrono::high_resolution_clock::now(); + std::wstring_convert, wchar_t> convert; + // Loop through test strings and convert each UTF-8 string to UTF-16 + for (const auto &str : test_strings) { + std::wstring wide_str = convert.from_bytes(str); + std::u16string utf16; + for (wchar_t wc : wide_str) { + utf16.push_back(static_cast(wc)); + } + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "Standard Library Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception in standard library conversion: " + << e.what(); + } + + // BaseLine + try { + auto start_time = std::chrono::high_resolution_clock::now(); + for (const auto &str : test_strings) { + std::u16string utf16 = utf8ToUtf16BaseLine(str, is_little_endian); + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception in baseline conversion: " << e.what(); + } + + // Optimized (SIMD) + try { + auto start_time = std::chrono::high_resolution_clock::now(); + for (const auto &str : test_strings) { + std::u16string utf16 = fury::utf8ToUtf16(str, is_little_endian); + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "SIMD Optimized Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception in SIMD optimized conversion: " + << e.what(); + } } } // namespace fury int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); } \ No newline at end of file From c60dff86607bff1c5ccb3d2b07d6b54a2e0e96d7 Mon Sep 17 00:00:00 2001 From: pandalee99 <1162953505@qq.com> Date: Wed, 25 Dec 2024 23:59:56 +0800 Subject: [PATCH 6/6] fix --- cpp/fury/util/string_util.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/fury/util/string_util.cc b/cpp/fury/util/string_util.cc index d6ae9475d9..3c0543c913 100644 --- a/cpp/fury/util/string_util.cc +++ b/cpp/fury/util/string_util.cc @@ -71,7 +71,6 @@ std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) { while (i + 32 <= n) { - // Now process the characters in 'in' SIMD register for (int j = 0; j < 32; ++j) { uint8_t byte = utf8[i + j];