From cd2df8d13c0369ad6c8ccfe73662a6a306fcacaa Mon Sep 17 00:00:00 2001 From: pandalee99 <1162953505@qq.com> Date: Wed, 25 Dec 2024 23:46:31 +0800 Subject: [PATCH] clang-format --- cpp/fury/util/string_util.cc | 484 +++++++++--------- cpp/fury/util/string_util.h | 6 +- cpp/fury/util/string_util_test.cc | 796 +++++++++++++++--------------- 3 files changed, 643 insertions(+), 643 deletions(-) diff --git a/cpp/fury/util/string_util.cc b/cpp/fury/util/string_util.cc index 866de3b4bf..d6ae9475d9 100644 --- a/cpp/fury/util/string_util.cc +++ b/cpp/fury/util/string_util.cc @@ -33,272 +33,272 @@ namespace fury { // Swap bytes to convert from big endian to little endian - inline uint16_t swapBytes(uint16_t value) { - return (value >> 8) | (value << 8); - } +inline uint16_t swapBytes(uint16_t value) { + return (value >> 8) | (value << 8); +} - inline void utf16ToUtf8(uint16_t code_unit, char *&output) { - if (code_unit < 0x80) { - *output++ = static_cast(code_unit); - } else if (code_unit < 0x800) { - *output++ = static_cast(0xC0 | (code_unit >> 6)); - *output++ = static_cast(0x80 | (code_unit & 0x3F)); - } else { - *output++ = static_cast(0xE0 | (code_unit >> 12)); - *output++ = static_cast(0x80 | ((code_unit >> 6) & 0x3F)); - *output++ = static_cast(0x80 | (code_unit & 0x3F)); +inline void utf16ToUtf8(uint16_t code_unit, char *&output) { + if (code_unit < 0x80) { + *output++ = static_cast(code_unit); + } else if (code_unit < 0x800) { + *output++ = static_cast(0xC0 | (code_unit >> 6)); + *output++ = static_cast(0x80 | (code_unit & 0x3F)); + } else { + *output++ = static_cast(0xE0 | (code_unit >> 12)); + *output++ = static_cast(0x80 | ((code_unit >> 6) & 0x3F)); + *output++ = static_cast(0x80 | (code_unit & 0x3F)); + } +} + +inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) { + uint32_t code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00); + *utf8++ = static_cast((code_point >> 18) | 0xF0); + *utf8++ = static_cast(((code_point >> 12) & 0x3F) | 0x80); + *utf8++ = static_cast(((code_point >> 6) & 0x3F) | 0x80); + *utf8++ = static_cast((code_point & 0x3F) | 0x80); +} + +std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) { + std::u16string utf16; + utf16.reserve(utf8.size()); // Reserve space to avoid frequent reallocations + + char buffer[64]; // Buffer to hold temporary UTF-16 results + char16_t *output = + reinterpret_cast(buffer); // Use char16_t for output + + size_t i = 0; + size_t n = utf8.size(); + + while (i + 32 <= n) { + + // Now process the characters in 'in' SIMD register + for (int j = 0; j < 32; ++j) { + uint8_t byte = utf8[i + j]; + + if (byte < 0x80) { + // 1-byte character (ASCII) + *output++ = static_cast(byte); + } else if (byte < 0xE0) { + // 2-byte character + uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + j + 1] & 0x3F); + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | + (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + ++j; + } else if (byte < 0xF0) { + // 3-byte character + uint16_t utf16_char = ((byte & 0x0F) << 12) | + ((utf8[i + j + 1] & 0x3F) << 6) | + (utf8[i + j + 2] & 0x3F); + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | + (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + j += 2; + } else { + // 4-byte character (surrogate pair handling required) + uint32_t code_point = + ((byte & 0x07) << 18) | ((utf8[i + j + 1] & 0x3F) << 12) | + ((utf8[i + j + 2] & 0x3F) << 6) | (utf8[i + j + 3] & 0x3F); + + // Convert the code point to a surrogate pair + uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10); + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); + + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | + (high_surrogate << 8); // Swap bytes for big-endian + low_surrogate = (low_surrogate >> 8) | + (low_surrogate << 8); // Swap bytes for big-endian } - } - inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) { - uint32_t code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00); - *utf8++ = static_cast((code_point >> 18) | 0xF0); - *utf8++ = static_cast(((code_point >> 12) & 0x3F) | 0x80); - *utf8++ = static_cast(((code_point >> 6) & 0x3F) | 0x80); - *utf8++ = static_cast((code_point & 0x3F) | 0x80); + *output++ = high_surrogate; + *output++ = low_surrogate; + + j += 3; + } } - std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) { - std::u16string utf16; - utf16.reserve(utf8.size()); // Reserve space to avoid frequent reallocations - - char buffer[64]; // Buffer to hold temporary UTF-16 results - char16_t *output = - reinterpret_cast(buffer); // Use char16_t for output - - size_t i = 0; - size_t n = utf8.size(); - - while (i + 32 <= n) { - - // Now process the characters in 'in' SIMD register - for (int j = 0; j < 32; ++j) { - uint8_t byte = utf8[i + j]; - - if (byte < 0x80) { - // 1-byte character (ASCII) - *output++ = static_cast(byte); - } else if (byte < 0xE0) { - // 2-byte character - uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + j + 1] & 0x3F); - if (!is_little_endian) { - utf16_char = (utf16_char >> 8) | - (utf16_char << 8); // Swap bytes for big-endian - } - *output++ = utf16_char; - ++j; - } else if (byte < 0xF0) { - // 3-byte character - uint16_t utf16_char = ((byte & 0x0F) << 12) | - ((utf8[i + j + 1] & 0x3F) << 6) | - (utf8[i + j + 2] & 0x3F); - if (!is_little_endian) { - utf16_char = (utf16_char >> 8) | - (utf16_char << 8); // Swap bytes for big-endian - } - *output++ = utf16_char; - j += 2; - } else { - // 4-byte character (surrogate pair handling required) - uint32_t code_point = - ((byte & 0x07) << 18) | ((utf8[i + j + 1] & 0x3F) << 12) | - ((utf8[i + j + 2] & 0x3F) << 6) | (utf8[i + j + 3] & 0x3F); - - // Convert the code point to a surrogate pair - uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10); - uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); - - if (!is_little_endian) { - high_surrogate = (high_surrogate >> 8) | - (high_surrogate << 8); // Swap bytes for big-endian - low_surrogate = (low_surrogate >> 8) | - (low_surrogate << 8); // Swap bytes for big-endian - } - - *output++ = high_surrogate; - *output++ = low_surrogate; - - j += 3; - } - } - - // Append the processed buffer to the final utf16 string - utf16.append(reinterpret_cast(buffer), - output - reinterpret_cast(buffer)); - output = - reinterpret_cast(buffer); // Reset output buffer pointer - i += 32; - } + // Append the processed buffer to the final utf16 string + utf16.append(reinterpret_cast(buffer), + output - reinterpret_cast(buffer)); + output = + reinterpret_cast(buffer); // Reset output buffer pointer + i += 32; + } - // Handle remaining characters - while (i < n) { - uint8_t byte = utf8[i]; - - if (byte < 0x80) { - *output++ = static_cast(byte); - } else if (byte < 0xE0) { - uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + 1] & 0x3F); - if (!is_little_endian) { - utf16_char = - (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian - } - *output++ = utf16_char; - ++i; - } else if (byte < 0xF0) { - uint16_t utf16_char = ((byte & 0x0F) << 12) | - ((utf8[i + 1] & 0x3F) << 6) | (utf8[i + 2] & 0x3F); - if (!is_little_endian) { - utf16_char = - (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian - } - *output++ = utf16_char; - i += 2; - } else { - uint32_t code_point = ((byte & 0x07) << 18) | - ((utf8[i + 1] & 0x3F) << 12) | - ((utf8[i + 2] & 0x3F) << 6) | (utf8[i + 3] & 0x3F); - - uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10); - uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); - - if (!is_little_endian) { - high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); - low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); - } - - *output++ = high_surrogate; - *output++ = low_surrogate; - - i += 3; - } - - ++i; - } + // Handle remaining characters + while (i < n) { + uint8_t byte = utf8[i]; - // Append the last part of the buffer to the utf16 string - utf16.append(reinterpret_cast(buffer), - output - reinterpret_cast(buffer)); + if (byte < 0x80) { + *output++ = static_cast(byte); + } else if (byte < 0xE0) { + uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + 1] & 0x3F); + if (!is_little_endian) { + utf16_char = + (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + ++i; + } else if (byte < 0xF0) { + uint16_t utf16_char = ((byte & 0x0F) << 12) | + ((utf8[i + 1] & 0x3F) << 6) | (utf8[i + 2] & 0x3F); + if (!is_little_endian) { + utf16_char = + (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + i += 2; + } else { + uint32_t code_point = ((byte & 0x07) << 18) | + ((utf8[i + 1] & 0x3F) << 12) | + ((utf8[i + 2] & 0x3F) << 6) | (utf8[i + 3] & 0x3F); - return utf16; + uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10); + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); + + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); + low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); + } + + *output++ = high_surrogate; + *output++ = low_surrogate; + + i += 3; } + ++i; + } + + // Append the last part of the buffer to the utf16 string + utf16.append(reinterpret_cast(buffer), + output - reinterpret_cast(buffer)); + + return utf16; +} + #if defined(__x86_64__) || defined(_M_X64) - bool isLatin(const std::string &str) { - const char *data = str.data(); - size_t len = str.size(); - - size_t i = 0; - __m256i latin_mask = _mm256_set1_epi8(0x80); - for (; i + 32 <= len; i += 32) { - __m256i chars = - _mm256_loadu_si256(reinterpret_cast(data + i)); - __m256i result = _mm256_and_si256(chars, latin_mask); - if (!_mm256_testz_si256(result, result)) { - return false; - } - } +bool isLatin(const std::string &str) { + const char *data = str.data(); + size_t len = str.size(); - for (; i < len; ++i) { - if (static_cast(data[i]) >= 128) { - return false; - } - } + size_t i = 0; + __m256i latin_mask = _mm256_set1_epi8(0x80); + for (; i + 32 <= len; i += 32) { + __m256i chars = + _mm256_loadu_si256(reinterpret_cast(data + i)); + __m256i result = _mm256_and_si256(chars, latin_mask); + if (!_mm256_testz_si256(result, result)) { + return false; + } + } + + for (; i < len; ++i) { + if (static_cast(data[i]) >= 128) { + return false; + } + } + + return true; +} - return true; +std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { + std::string utf8; + utf8.reserve(utf16.size() * + 3); // Reserve enough space to avoid frequent reallocations + + const __m256i limit1 = _mm256_set1_epi16(0x80); + const __m256i limit2 = _mm256_set1_epi16(0x800); + const __m256i surrogate_high_start = _mm256_set1_epi16(0xD800); + const __m256i surrogate_high_end = _mm256_set1_epi16(0xDBFF); + const __m256i surrogate_low_start = _mm256_set1_epi16(0xDC00); + const __m256i surrogate_low_end = _mm256_set1_epi16(0xDFFF); + + char buffer[64]; // Buffer to hold temporary UTF-8 bytes + char *output = buffer; + + size_t i = 0; + size_t n = utf16.size(); + + while (i + 16 <= n) { + __m256i in = + _mm256_loadu_si256(reinterpret_cast(utf16.data() + i)); + + if (!is_little_endian) { + in = _mm256_or_si256( + _mm256_slli_epi16(in, 8), + _mm256_srli_epi16(in, 8)); // Swap bytes for big-endian } - std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { - std::string utf8; - utf8.reserve(utf16.size() * - 3); // Reserve enough space to avoid frequent reallocations - - const __m256i limit1 = _mm256_set1_epi16(0x80); - const __m256i limit2 = _mm256_set1_epi16(0x800); - const __m256i surrogate_high_start = _mm256_set1_epi16(0xD800); - const __m256i surrogate_high_end = _mm256_set1_epi16(0xDBFF); - const __m256i surrogate_low_start = _mm256_set1_epi16(0xDC00); - const __m256i surrogate_low_end = _mm256_set1_epi16(0xDFFF); - - char buffer[64]; // Buffer to hold temporary UTF-8 bytes - char *output = buffer; - - size_t i = 0; - size_t n = utf16.size(); - - while (i + 16 <= n) { - __m256i in = - _mm256_loadu_si256(reinterpret_cast(utf16.data() + i)); - - if (!is_little_endian) { - in = _mm256_or_si256( - _mm256_slli_epi16(in, 8), - _mm256_srli_epi16(in, 8)); // Swap bytes for big-endian - } - - __m256i mask1 = _mm256_cmpgt_epi16(in, limit1); - __m256i mask2 = _mm256_cmpgt_epi16(in, limit2); - __m256i high_surrogate_mask = - _mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_high_start), - _mm256_cmpgt_epi16(in, surrogate_high_end)); - __m256i low_surrogate_mask = - _mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_low_start), - _mm256_cmpgt_epi16(in, surrogate_low_end)); - - if (_mm256_testz_si256(mask1, mask1)) { - // All values < 0x80, 1 byte per character - for (int j = 0; j < 16; ++j) { - *output++ = static_cast(utf16[i + j]); - } - } else if (_mm256_testz_si256(mask2, mask2)) { - // All values < 0x800, 2 bytes per character - for (int j = 0; j < 16; ++j) { - utf16ToUtf8(utf16[i + j], output); - } - } else { - // Mix of 1, 2, and 3 byte characters - for (int j = 0; j < 16; ++j) { - if (_mm256_testz_si256(high_surrogate_mask, high_surrogate_mask) && - j + 1 < 16 && - !_mm256_testz_si256(low_surrogate_mask, low_surrogate_mask)) { - // Surrogate pair - utf16SurrogatePairToUtf8(utf16[i + j], utf16[i + j + 1], output); - ++j; - } else { - utf16ToUtf8(utf16[i + j], output); - } - } - } - - utf8.append(buffer, output - buffer); - output = buffer; // Reset output buffer pointer - i += 16; + __m256i mask1 = _mm256_cmpgt_epi16(in, limit1); + __m256i mask2 = _mm256_cmpgt_epi16(in, limit2); + __m256i high_surrogate_mask = + _mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_high_start), + _mm256_cmpgt_epi16(in, surrogate_high_end)); + __m256i low_surrogate_mask = + _mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_low_start), + _mm256_cmpgt_epi16(in, surrogate_low_end)); + + if (_mm256_testz_si256(mask1, mask1)) { + // All values < 0x80, 1 byte per character + for (int j = 0; j < 16; ++j) { + *output++ = static_cast(utf16[i + j]); + } + } else if (_mm256_testz_si256(mask2, mask2)) { + // All values < 0x800, 2 bytes per character + for (int j = 0; j < 16; ++j) { + utf16ToUtf8(utf16[i + j], output); + } + } else { + // Mix of 1, 2, and 3 byte characters + for (int j = 0; j < 16; ++j) { + if (_mm256_testz_si256(high_surrogate_mask, high_surrogate_mask) && + j + 1 < 16 && + !_mm256_testz_si256(low_surrogate_mask, low_surrogate_mask)) { + // Surrogate pair + utf16SurrogatePairToUtf8(utf16[i + j], utf16[i + j + 1], output); + ++j; + } else { + utf16ToUtf8(utf16[i + j], output); } + } + } - // Handle remaining characters - while (i < n) { - if (i + 1 < n && utf16[i] >= 0xD800 && utf16[i] <= 0xDBFF && - utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) { - // Surrogate pair - utf16SurrogatePairToUtf8(utf16[i], utf16[i + 1], output); - ++i; - } else { - utf16ToUtf8(utf16[i], output); - } - ++i; - } - utf8.append(buffer, output - buffer); + utf8.append(buffer, output - buffer); + output = buffer; // Reset output buffer pointer + i += 16; + } - return utf8; + // Handle remaining characters + while (i < n) { + if (i + 1 < n && utf16[i] >= 0xD800 && utf16[i] <= 0xDBFF && + utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) { + // Surrogate pair + utf16SurrogatePairToUtf8(utf16[i], utf16[i + 1], output); + ++i; + } else { + utf16ToUtf8(utf16[i], output); } + ++i; + } + utf8.append(buffer, output - buffer); - std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { - return utf8ToUtf16SIMD(utf8, is_little_endian); - } + return utf8; +} + +std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { + return utf8ToUtf16SIMD(utf8, is_little_endian); +} #elif defined(__ARM_NEON) || defined(__ARM_NEON__) - bool isLatin(const std::string &str) { +bool isLatin(const std::string &str) { const char *data = str.data(); size_t len = str.size(); diff --git a/cpp/fury/util/string_util.h b/cpp/fury/util/string_util.h index 8a200afe38..d5bdde543a 100644 --- a/cpp/fury/util/string_util.h +++ b/cpp/fury/util/string_util.h @@ -23,10 +23,10 @@ namespace fury { - bool isLatin(const std::string &str); +bool isLatin(const std::string &str); - std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian); +std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian); - std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian); +std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian); } // namespace fury \ No newline at end of file diff --git a/cpp/fury/util/string_util_test.cc b/cpp/fury/util/string_util_test.cc index 69a6c5c457..60c72c80b3 100644 --- a/cpp/fury/util/string_util_test.cc +++ b/cpp/fury/util/string_util_test.cc @@ -30,516 +30,516 @@ namespace fury { // Function to generate a random string - std::string generateRandomString(size_t length) { - const char charset[] = - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; - std::default_random_engine rng(std::random_device{}()); - std::uniform_int_distribution<> dist(0, sizeof(charset) - 2); - - std::string result; - result.reserve(length); - for (size_t i = 0; i < length; ++i) { - result += charset[dist(rng)]; - } - - return result; - } +std::string generateRandomString(size_t length) { + const char charset[] = + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + std::default_random_engine rng(std::random_device{}()); + std::uniform_int_distribution<> dist(0, sizeof(charset) - 2); + + std::string result; + result.reserve(length); + for (size_t i = 0; i < length; ++i) { + result += charset[dist(rng)]; + } - bool isLatin_BaseLine(const std::string &str) { - for (char c : str) { - if (static_cast(c) >= 128) { - return false; - } - } - return true; + return result; +} + +bool isLatin_BaseLine(const std::string &str) { + for (char c : str) { + if (static_cast(c) >= 128) { + return false; } + } + return true; +} - TEST(StringUtilTest, TestIsLatinFunctions) { - std::string testStr = generateRandomString(100000); - auto start_time = std::chrono::high_resolution_clock::now(); - bool result = isLatin_BaseLine(testStr); - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); - FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns."; - - start_time = std::chrono::high_resolution_clock::now(); - result = isLatin(testStr); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - - start_time) - .count(); - FURY_LOG(INFO) << "Optimized Running Time: " << duration << " ns."; - - EXPECT_TRUE(result); +TEST(StringUtilTest, TestIsLatinFunctions) { + std::string testStr = generateRandomString(100000); + auto start_time = std::chrono::high_resolution_clock::now(); + bool result = isLatin_BaseLine(testStr); + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns."; + + start_time = std::chrono::high_resolution_clock::now(); + result = isLatin(testStr); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - + start_time) + .count(); + FURY_LOG(INFO) << "Optimized Running Time: " << duration << " ns."; + + EXPECT_TRUE(result); } TEST(StringUtilTest, TestIsLatinLogic) { -// Test strings with only Latin characters -EXPECT_TRUE(isLatin("Fury")); -EXPECT_TRUE(isLatin(generateRandomString(80))); - -// Test unaligned strings with only Latin characters -EXPECT_TRUE(isLatin(generateRandomString(80) + "1")); -EXPECT_TRUE(isLatin(generateRandomString(80) + "12")); -EXPECT_TRUE(isLatin(generateRandomString(80) + "123")); - -// Test strings with non-Latin characters -EXPECT_FALSE(isLatin("你好, Fury")); -EXPECT_FALSE(isLatin(generateRandomString(80) + "你好")); -EXPECT_FALSE(isLatin(generateRandomString(80) + "1你好")); -EXPECT_FALSE(isLatin(generateRandomString(11) + "你")); -EXPECT_FALSE(isLatin(generateRandomString(10) + "你好")); -EXPECT_FALSE(isLatin(generateRandomString(9) + "性能好")); -EXPECT_FALSE(isLatin("\u1234")); -EXPECT_FALSE(isLatin("a\u1234")); -EXPECT_FALSE(isLatin("ab\u1234")); -EXPECT_FALSE(isLatin("abc\u1234")); -EXPECT_FALSE(isLatin("abcd\u1234")); -EXPECT_FALSE(isLatin("Javaone Keynote\u1234")); + // Test strings with only Latin characters + EXPECT_TRUE(isLatin("Fury")); + EXPECT_TRUE(isLatin(generateRandomString(80))); + + // Test unaligned strings with only Latin characters + EXPECT_TRUE(isLatin(generateRandomString(80) + "1")); + EXPECT_TRUE(isLatin(generateRandomString(80) + "12")); + EXPECT_TRUE(isLatin(generateRandomString(80) + "123")); + + // Test strings with non-Latin characters + EXPECT_FALSE(isLatin("你好, Fury")); + EXPECT_FALSE(isLatin(generateRandomString(80) + "你好")); + EXPECT_FALSE(isLatin(generateRandomString(80) + "1你好")); + EXPECT_FALSE(isLatin(generateRandomString(11) + "你")); + EXPECT_FALSE(isLatin(generateRandomString(10) + "你好")); + EXPECT_FALSE(isLatin(generateRandomString(9) + "性能好")); + EXPECT_FALSE(isLatin("\u1234")); + EXPECT_FALSE(isLatin("a\u1234")); + EXPECT_FALSE(isLatin("ab\u1234")); + EXPECT_FALSE(isLatin("abc\u1234")); + EXPECT_FALSE(isLatin("abcd\u1234")); + EXPECT_FALSE(isLatin("Javaone Keynote\u1234")); } // Generate random UTF-16 string ensuring valid surrogate pairs std::u16string generateRandomUTF16String(size_t length) { - std::u16string str; - std::mt19937 generator(std::random_device{}()); - std::uniform_int_distribution distribution(0, 0x10FFFF); - - while (str.size() < length) { - uint32_t code_point = distribution(generator); - - if (code_point <= 0xD7FF || - (code_point >= 0xE000 && code_point <= 0xFFFF)) { - str.push_back(static_cast(code_point)); - } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { - code_point -= 0x10000; - str.push_back(static_cast((code_point >> 10) + 0xD800)); - str.push_back(static_cast((code_point & 0x3FF) + 0xDC00)); - } + std::u16string str; + std::mt19937 generator(std::random_device{}()); + std::uniform_int_distribution distribution(0, 0x10FFFF); + + while (str.size() < length) { + uint32_t code_point = distribution(generator); + + if (code_point <= 0xD7FF || + (code_point >= 0xE000 && code_point <= 0xFFFF)) { + str.push_back(static_cast(code_point)); + } else if (code_point >= 0x10000 && code_point <= 0x10FFFF) { + code_point -= 0x10000; + str.push_back(static_cast((code_point >> 10) + 0xD800)); + str.push_back(static_cast((code_point & 0x3FF) + 0xDC00)); } + } - return str; + return str; } // Basic implementation // Swap bytes to convert from big endian to little endian inline uint16_t swapBytes(uint16_t value) { - return (value >> 8) | (value << 8); + return (value >> 8) | (value << 8); } inline void utf16ToUtf8(uint16_t code_unit, char *&output) { - if (code_unit < 0x80) { - *output++ = static_cast(code_unit); - } else if (code_unit < 0x800) { - *output++ = static_cast(0xC0 | (code_unit >> 6)); - *output++ = static_cast(0x80 | (code_unit & 0x3F)); - } else { - *output++ = static_cast(0xE0 | (code_unit >> 12)); - *output++ = static_cast(0x80 | ((code_unit >> 6) & 0x3F)); - *output++ = static_cast(0x80 | (code_unit & 0x3F)); - } + if (code_unit < 0x80) { + *output++ = static_cast(code_unit); + } else if (code_unit < 0x800) { + *output++ = static_cast(0xC0 | (code_unit >> 6)); + *output++ = static_cast(0x80 | (code_unit & 0x3F)); + } else { + *output++ = static_cast(0xE0 | (code_unit >> 12)); + *output++ = static_cast(0x80 | ((code_unit >> 6) & 0x3F)); + *output++ = static_cast(0x80 | (code_unit & 0x3F)); + } } inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) { - uint32_t code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00); - *utf8++ = static_cast((code_point >> 18) | 0xF0); - *utf8++ = static_cast(((code_point >> 12) & 0x3F) | 0x80); - *utf8++ = static_cast(((code_point >> 6) & 0x3F) | 0x80); - *utf8++ = static_cast((code_point & 0x3F) | 0x80); + uint32_t code_point = 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00); + *utf8++ = static_cast((code_point >> 18) | 0xF0); + *utf8++ = static_cast(((code_point >> 12) & 0x3F) | 0x80); + *utf8++ = static_cast(((code_point >> 6) & 0x3F) | 0x80); + *utf8++ = static_cast((code_point & 0x3F) | 0x80); } std::string utf16ToUtf8BaseLine(const std::u16string &utf16, bool is_little_endian) { - std::string utf8; - utf8.reserve(utf16.size() * - 3); // Reserve enough space to avoid frequent reallocations - - size_t i = 0; - size_t n = utf16.size(); - char buffer[4]; // Buffer to hold temporary UTF-8 bytes - char *output = buffer; - - while (i < n) { - uint16_t code_unit = utf16[i]; - if (!is_little_endian) { - code_unit = swapBytes(code_unit); - } - if (i + 1 < n && code_unit >= 0xD800 && code_unit <= 0xDBFF && - utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) { - // Surrogate pair - uint16_t high = code_unit; - uint16_t low = utf16[i + 1]; - if (!is_little_endian) { - low = swapBytes(low); - } - utf16SurrogatePairToUtf8(high, low, output); - utf8.append(buffer, output - buffer); - output = buffer; - ++i; - } else { - utf16ToUtf8(code_unit, output); - utf8.append(buffer, output - buffer); - output = buffer; - } - ++i; + std::string utf8; + utf8.reserve(utf16.size() * + 3); // Reserve enough space to avoid frequent reallocations + + size_t i = 0; + size_t n = utf16.size(); + char buffer[4]; // Buffer to hold temporary UTF-8 bytes + char *output = buffer; + + while (i < n) { + uint16_t code_unit = utf16[i]; + if (!is_little_endian) { + code_unit = swapBytes(code_unit); + } + if (i + 1 < n && code_unit >= 0xD800 && code_unit <= 0xDBFF && + utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) { + // Surrogate pair + uint16_t high = code_unit; + uint16_t low = utf16[i + 1]; + if (!is_little_endian) { + low = swapBytes(low); + } + utf16SurrogatePairToUtf8(high, low, output); + utf8.append(buffer, output - buffer); + output = buffer; + ++i; + } else { + utf16ToUtf8(code_unit, output); + utf8.append(buffer, output - buffer); + output = buffer; } - return utf8; + ++i; + } + return utf8; } // Testing Basic Logic TEST(UTF16ToUTF8Test, BasicConversion) { -std::u16string utf16 = u"Hello, 世界!"; -std::string utf8 = fury::utf16ToUtf8(utf16, true); -ASSERT_EQ(utf8, u8"Hello, 世界!"); + std::u16string utf16 = u"Hello, 世界!"; + std::string utf8 = fury::utf16ToUtf8(utf16, true); + ASSERT_EQ(utf8, u8"Hello, 世界!"); } // Testing Empty String TEST(UTF16ToUTF8Test, EmptyString) { -std::u16string utf16 = u""; -std::string utf8 = fury::utf16ToUtf8(utf16, true); -ASSERT_EQ(utf8, ""); + std::u16string utf16 = u""; + std::string utf8 = fury::utf16ToUtf8(utf16, true); + ASSERT_EQ(utf8, ""); } // Testing emoji TEST(UTF16ToUTF8Test, SurrogatePairs) { -std::u16string utf16 = {0xD83D, 0xDE00}; // 😀 emoji -std::string utf8 = fury::utf16ToUtf8(utf16, true); -ASSERT_EQ(utf8, "\xF0\x9F\x98\x80"); + std::u16string utf16 = {0xD83D, 0xDE00}; // 😀 emoji + std::string utf8 = fury::utf16ToUtf8(utf16, true); + ASSERT_EQ(utf8, "\xF0\x9F\x98\x80"); } // Testing Boundary TEST(UTF16ToUTF8Test, BoundaryValues) { -std::u16string utf16 = {0x0000, 0xFFFF}; -std::string utf8 = fury::utf16ToUtf8(utf16, true); -std::string expected_utf8 = std::string("\x00", 1) + "\xEF\xBF\xBF"; -ASSERT_EQ(utf8, expected_utf8); + std::u16string utf16 = {0x0000, 0xFFFF}; + std::string utf8 = fury::utf16ToUtf8(utf16, true); + std::string expected_utf8 = std::string("\x00", 1) + "\xEF\xBF\xBF"; + ASSERT_EQ(utf8, expected_utf8); } // Testing Special Characters TEST(UTF16ToUTF8Test, SpecialCharacters) { -std::u16string utf16 = u" \n\t"; -std::string utf8 = fury::utf16ToUtf8(utf16, true); -ASSERT_EQ(utf8, " \n\t"); + std::u16string utf16 = u" \n\t"; + std::string utf8 = fury::utf16ToUtf8(utf16, true); + ASSERT_EQ(utf8, " \n\t"); } // Testing LittleEndian TEST(UTF16ToUTF8Test, LittleEndian) { -std::u16string utf16 = {0x61, 0x62}; // "ab" -std::string utf8 = fury::utf16ToUtf8(utf16, true); -ASSERT_EQ(utf8, "ab"); + std::u16string utf16 = {0x61, 0x62}; // "ab" + std::string utf8 = fury::utf16ToUtf8(utf16, true); + ASSERT_EQ(utf8, "ab"); } // Testing BigEndian TEST(UTF16ToUTF8Test, BigEndian) { -std::u16string utf16 = {0xFFFE, 0xFFFE}; -std::string utf8 = fury::utf16ToUtf8(utf16, false); -ASSERT_EQ(utf8, "\xEF\xBF\xBE\xEF\xBF\xBE"); + std::u16string utf16 = {0xFFFE, 0xFFFE}; + std::string utf8 = fury::utf16ToUtf8(utf16, false); + ASSERT_EQ(utf8, "\xEF\xBF\xBE\xEF\xBF\xBE"); } // Testing Performance TEST(UTF16ToUTF8Test, PerformanceTest) { -const size_t num_tests = 1000; -const size_t string_length = 1000; -// Default little_endian -bool is_little_endian = true; - -// Random UTF-16 -std::vector test_strings; -for (size_t i = 0; i < num_tests; ++i) { -test_strings.push_back(generateRandomUTF16String(string_length)); -} - -// Lib -try { -auto start_time = std::chrono::high_resolution_clock::now(); -for (const auto &str : test_strings) { -std::wstring_convert, char16_t> convert; -std::string utf8 = convert.to_bytes(str); -} -auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); -FURY_LOG(INFO) << "Standard library Running Time: " << duration << " ns"; -} catch (const std::exception &e) { -FURY_LOG(FATAL) << "Caught exception: " << e.what(); -} - -// BaseLine -try { -auto start_time = std::chrono::high_resolution_clock::now(); -for (const auto &str : test_strings) { -std::string utf8 = utf16ToUtf8BaseLine(str, is_little_endian); -} -auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); -FURY_LOG(INFO) << "Baseline Running Time: " << duration << " ns"; -} catch (const std::exception &e) { -FURY_LOG(FATAL) << "Caught exception: " << e.what(); -} - -// SIMD -try { -auto start_time = std::chrono::high_resolution_clock::now(); -for (const auto &str : test_strings) { -std::string utf8 = fury::utf16ToUtf8(str, is_little_endian); -} -auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); -FURY_LOG(INFO) << "SIMD Running Time: " << duration << " ns"; -} catch (const std::exception &e) { -FURY_LOG(FATAL) << "Caught exception: " << e.what(); -} + const size_t num_tests = 1000; + const size_t string_length = 1000; + // Default little_endian + bool is_little_endian = true; + + // Random UTF-16 + std::vector test_strings; + for (size_t i = 0; i < num_tests; ++i) { + test_strings.push_back(generateRandomUTF16String(string_length)); + } + + // Lib + try { + auto start_time = std::chrono::high_resolution_clock::now(); + for (const auto &str : test_strings) { + std::wstring_convert, char16_t> convert; + std::string utf8 = convert.to_bytes(str); + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "Standard library Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception: " << e.what(); + } + + // BaseLine + try { + auto start_time = std::chrono::high_resolution_clock::now(); + for (const auto &str : test_strings) { + std::string utf8 = utf16ToUtf8BaseLine(str, is_little_endian); + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "Baseline Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception: " << e.what(); + } + + // SIMD + try { + auto start_time = std::chrono::high_resolution_clock::now(); + for (const auto &str : test_strings) { + std::string utf8 = fury::utf16ToUtf8(str, is_little_endian); + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "SIMD Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception: " << e.what(); + } } // Generate random UTF-8 string std::string generateRandomUTF8String(size_t length) { - std::string str; - std::mt19937 generator(std::random_device{}()); - std::uniform_int_distribution distribution(0, 0x10FFFF); - - while (str.size() < length) { - uint32_t code_point = distribution(generator); - - // Skip surrogate pairs (0xD800 to 0xDFFF) and other invalid Unicode code - // points - if ((code_point >= 0xD800 && code_point <= 0xDFFF) || - code_point > 0x10FFFF) { - continue; - } - - if (code_point <= 0x7F) { - str.push_back(static_cast(code_point)); - } else if (code_point <= 0x7FF) { - str.push_back(0xC0 | (code_point >> 6)); - str.push_back(0x80 | (code_point & 0x3F)); - } else if (code_point <= 0xFFFF) { - str.push_back(0xE0 | (code_point >> 12)); - str.push_back(0x80 | ((code_point >> 6) & 0x3F)); - str.push_back(0x80 | (code_point & 0x3F)); - } else if (code_point <= 0x10FFFF) { - str.push_back(0xF0 | (code_point >> 18)); - str.push_back(0x80 | ((code_point >> 12) & 0x3F)); - str.push_back(0x80 | ((code_point >> 6) & 0x3F)); - str.push_back(0x80 | (code_point & 0x3F)); - } + std::string str; + std::mt19937 generator(std::random_device{}()); + std::uniform_int_distribution distribution(0, 0x10FFFF); + + while (str.size() < length) { + uint32_t code_point = distribution(generator); + + // Skip surrogate pairs (0xD800 to 0xDFFF) and other invalid Unicode code + // points + if ((code_point >= 0xD800 && code_point <= 0xDFFF) || + code_point > 0x10FFFF) { + continue; } - return str; + if (code_point <= 0x7F) { + str.push_back(static_cast(code_point)); + } else if (code_point <= 0x7FF) { + str.push_back(0xC0 | (code_point >> 6)); + str.push_back(0x80 | (code_point & 0x3F)); + } else if (code_point <= 0xFFFF) { + str.push_back(0xE0 | (code_point >> 12)); + str.push_back(0x80 | ((code_point >> 6) & 0x3F)); + str.push_back(0x80 | (code_point & 0x3F)); + } else if (code_point <= 0x10FFFF) { + str.push_back(0xF0 | (code_point >> 18)); + str.push_back(0x80 | ((code_point >> 12) & 0x3F)); + str.push_back(0x80 | ((code_point >> 6) & 0x3F)); + str.push_back(0x80 | (code_point & 0x3F)); + } + } + + return str; } std::u16string utf8ToUtf16BaseLine(const std::string &utf8, bool is_little_endian) { - std::u16string utf16; // Resulting UTF-16 string - size_t i = 0; // Index for traversing the UTF-8 string - size_t n = utf8.size(); // Total length of the UTF-8 string - - // Loop through each byte of the UTF-8 string - while (i < n) { - uint32_t code_point = 0; // The Unicode code point - unsigned char c = utf8[i]; // Current byte of the UTF-8 string - - // Determine the number of bytes for this character based on its first byte - if ((c & 0x80) == 0) { - // 1-byte character (ASCII) - code_point = c; - ++i; - } else if ((c & 0xE0) == 0xC0) { - // 2-byte character - code_point = c & 0x1F; - code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); - i += 2; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character - code_point = c & 0x0F; - code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); - code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); - i += 3; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character - code_point = c & 0x07; - code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); - code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); - code_point = (code_point << 6) | (utf8[i + 3] & 0x3F); - i += 4; - } else { - // Invalid UTF-8 byte sequence - throw std::invalid_argument("Invalid UTF-8 encoding."); - } - - // If the code point is beyond the BMP range, use surrogate pairs - if (code_point >= 0x10000) { - code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair - uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate - uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate - - // If not little-endian, swap bytes of the surrogates - if (!is_little_endian) { - high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); - low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); - } - - // Add both high and low surrogates to the UTF-16 string - utf16.push_back(high_surrogate); - utf16.push_back(low_surrogate); - } else { - // For code points within the BMP range, directly store as a 16-bit value - uint16_t utf16_char = static_cast(code_point); - - // If not little-endian, swap the bytes of the 16-bit character - if (!is_little_endian) { - utf16_char = (utf16_char >> 8) | (utf16_char << 8); - } - - // Add the UTF-16 character to the string - utf16.push_back(utf16_char); - } + std::u16string utf16; // Resulting UTF-16 string + size_t i = 0; // Index for traversing the UTF-8 string + size_t n = utf8.size(); // Total length of the UTF-8 string + + // Loop through each byte of the UTF-8 string + while (i < n) { + uint32_t code_point = 0; // The Unicode code point + unsigned char c = utf8[i]; // Current byte of the UTF-8 string + + // Determine the number of bytes for this character based on its first byte + if ((c & 0x80) == 0) { + // 1-byte character (ASCII) + code_point = c; + ++i; + } else if ((c & 0xE0) == 0xC0) { + // 2-byte character + code_point = c & 0x1F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + i += 2; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte character + code_point = c & 0x0F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + i += 3; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character + code_point = c & 0x07; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 3] & 0x3F); + i += 4; + } else { + // Invalid UTF-8 byte sequence + throw std::invalid_argument("Invalid UTF-8 encoding."); } - // Return the resulting UTF-16 string - return utf16; + // If the code point is beyond the BMP range, use surrogate pairs + if (code_point >= 0x10000) { + code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair + uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate + + // If not little-endian, swap bytes of the surrogates + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); + low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); + } + + // Add both high and low surrogates to the UTF-16 string + utf16.push_back(high_surrogate); + utf16.push_back(low_surrogate); + } else { + // For code points within the BMP range, directly store as a 16-bit value + uint16_t utf16_char = static_cast(code_point); + + // If not little-endian, swap the bytes of the 16-bit character + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | (utf16_char << 8); + } + + // Add the UTF-16 character to the string + utf16.push_back(utf16_char); + } + } + + // Return the resulting UTF-16 string + return utf16; } // Testing Basic Logic TEST(UTF8ToUTF16Test, BasicConversion) { -std::string utf8 = u8"Hello, 世界!"; -std::u16string utf16 = fury::utf8ToUtf16(utf8, true); -ASSERT_EQ(utf16, u"Hello, 世界!"); + std::string utf8 = u8"Hello, 世界!"; + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + ASSERT_EQ(utf16, u"Hello, 世界!"); } // Testing Empty String TEST(UTF8ToUTF16Test, EmptyString) { -std::string utf8 = ""; -std::u16string utf16 = fury::utf8ToUtf16(utf8, true); -ASSERT_EQ(utf16, u""); + std::string utf8 = ""; + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + ASSERT_EQ(utf16, u""); } // Testing emoji TEST(UTF8ToUTF16Test, SurrogatePairs) { -std::string utf8 = "\xF0\x9F\x98\x80"; // 😀 emoji -std::u16string utf16 = fury::utf8ToUtf16(utf8, true); -std::u16string expected_utf16 = {0xD83D, 0xDE00}; // Surrogate pair for emoji -ASSERT_EQ(utf16, expected_utf16); + std::string utf8 = "\xF0\x9F\x98\x80"; // 😀 emoji + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + std::u16string expected_utf16 = {0xD83D, 0xDE00}; // Surrogate pair for emoji + ASSERT_EQ(utf16, expected_utf16); } // Correct Boundary testing for U+FFFD (replacement character) TEST(UTF8ToUTF16Test, BoundaryValues) { -// "\xEF\xBF\xBD" is the UTF-8 encoding for U+FFFD (replacement character) -std::string utf8 = "\xEF\xBF\xBD"; // U+FFFD in UTF-8 -std::u16string utf16 = fury::utf8ToUtf16(utf8, true); -std::u16string expected_utf16 = { - 0xFFFD}; // Expected UTF-16 representation of U+FFFD -ASSERT_EQ(utf16, expected_utf16); + // "\xEF\xBF\xBD" is the UTF-8 encoding for U+FFFD (replacement character) + std::string utf8 = "\xEF\xBF\xBD"; // U+FFFD in UTF-8 + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + std::u16string expected_utf16 = { + 0xFFFD}; // Expected UTF-16 representation of U+FFFD + ASSERT_EQ(utf16, expected_utf16); } // Testing Special Characters TEST(UTF8ToUTF16Test, SpecialCharacters) { -std::string utf8 = " \n\t"; -std::u16string utf16 = fury::utf8ToUtf16(utf8, true); -ASSERT_EQ(utf16, u" \n\t"); + std::string utf8 = " \n\t"; + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + ASSERT_EQ(utf16, u" \n\t"); } // Testing LittleEndian TEST(UTF8ToUTF16Test, LittleEndian) { -std::string utf8 = "ab"; -std::u16string utf16 = fury::utf8ToUtf16(utf8, true); -std::u16string expected_utf16 = { - 0x61, 0x62}; // Little-endian UTF-16 representation of "ab" -ASSERT_EQ(utf16, expected_utf16); + std::string utf8 = "ab"; + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + std::u16string expected_utf16 = { + 0x61, 0x62}; // Little-endian UTF-16 representation of "ab" + ASSERT_EQ(utf16, expected_utf16); } // Correct BigEndian testing for BOM (Byte Order Mark) TEST(UTF8ToUTF16Test, BigEndian) { -std::string utf8 = "\xEF\xBB\xBF"; // BOM in UTF-8 (0xFEFF) -std::u16string utf16 = fury::utf8ToUtf16(utf8, false); // Big-endian -std::u16string expected_utf16 = {0xFFFE}; // Expected BOM in UTF-16 -ASSERT_EQ(utf16, expected_utf16); + std::string utf8 = "\xEF\xBB\xBF"; // BOM in UTF-8 (0xFEFF) + std::u16string utf16 = fury::utf8ToUtf16(utf8, false); // Big-endian + std::u16string expected_utf16 = {0xFFFE}; // Expected BOM in UTF-16 + ASSERT_EQ(utf16, expected_utf16); } // Testing round-trip conversion (UTF-8 -> UTF-16 -> UTF-8) TEST(UTF8ToUTF16Test, RoundTripConversion) { -std::string original_utf8 = u8"Hello, 世界!"; -std::u16string utf16 = fury::utf8ToUtf16(original_utf8, true); -std::string utf8_converted_back = fury::utf16ToUtf8(utf16, true); -ASSERT_EQ(original_utf8, utf8_converted_back); + std::string original_utf8 = u8"Hello, 世界!"; + std::u16string utf16 = fury::utf8ToUtf16(original_utf8, true); + std::string utf8_converted_back = fury::utf16ToUtf8(utf16, true); + ASSERT_EQ(original_utf8, utf8_converted_back); } // Testing Performance TEST(UTF8ToUTF16Test, PerformanceTest) { -const size_t num_tests = 1000; -const size_t string_length = 1000; -// Default little_endian -bool is_little_endian = true; - -// Random UTF-8 -std::vector test_strings; -for (size_t i = 0; i < num_tests; ++i) { -test_strings.push_back(generateRandomUTF8String(string_length)); -} - -// Standard Library -try { -auto start_time = std::chrono::high_resolution_clock::now(); -std::wstring_convert, wchar_t> convert; -// Loop through test strings and convert each UTF-8 string to UTF-16 -for (const auto &str : test_strings) { -std::wstring wide_str = convert.from_bytes(str); -std::u16string utf16; -for (wchar_t wc : wide_str) { -utf16.push_back(static_cast(wc)); -} -} -auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); -FURY_LOG(INFO) << "Standard Library Running Time: " << duration << " ns"; -} catch (const std::exception &e) { -FURY_LOG(FATAL) << "Caught exception in standard library conversion: " -<< e.what(); -} - -// BaseLine -try { -auto start_time = std::chrono::high_resolution_clock::now(); -for (const auto &str : test_strings) { -std::u16string utf16 = utf8ToUtf16BaseLine(str, is_little_endian); -} -auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); -FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns"; -} catch (const std::exception &e) { -FURY_LOG(FATAL) << "Caught exception in baseline conversion: " << e.what(); -} - -// Optimized (SIMD) -try { -auto start_time = std::chrono::high_resolution_clock::now(); -for (const auto &str : test_strings) { -std::u16string utf16 = fury::utf8ToUtf16(str, is_little_endian); -} -auto end_time = std::chrono::high_resolution_clock::now(); -auto duration = std::chrono::duration_cast( - end_time - start_time) - .count(); -FURY_LOG(INFO) << "SIMD Optimized Running Time: " << duration << " ns"; -} catch (const std::exception &e) { -FURY_LOG(FATAL) << "Caught exception in SIMD optimized conversion: " -<< e.what(); -} + const size_t num_tests = 1000; + const size_t string_length = 1000; + // Default little_endian + bool is_little_endian = true; + + // Random UTF-8 + std::vector test_strings; + for (size_t i = 0; i < num_tests; ++i) { + test_strings.push_back(generateRandomUTF8String(string_length)); + } + + // Standard Library + try { + auto start_time = std::chrono::high_resolution_clock::now(); + std::wstring_convert, wchar_t> convert; + // Loop through test strings and convert each UTF-8 string to UTF-16 + for (const auto &str : test_strings) { + std::wstring wide_str = convert.from_bytes(str); + std::u16string utf16; + for (wchar_t wc : wide_str) { + utf16.push_back(static_cast(wc)); + } + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "Standard Library Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception in standard library conversion: " + << e.what(); + } + + // BaseLine + try { + auto start_time = std::chrono::high_resolution_clock::now(); + for (const auto &str : test_strings) { + std::u16string utf16 = utf8ToUtf16BaseLine(str, is_little_endian); + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception in baseline conversion: " << e.what(); + } + + // Optimized (SIMD) + try { + auto start_time = std::chrono::high_resolution_clock::now(); + for (const auto &str : test_strings) { + std::u16string utf16 = fury::utf8ToUtf16(str, is_little_endian); + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "SIMD Optimized Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception in SIMD optimized conversion: " + << e.what(); + } } } // namespace fury int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); } \ No newline at end of file