apache · pandalee99 · Dec 24, 2024 · Dec 24, 2024 · Dec 24, 2024 · Dec 25, 2024
@@ -58,6 +58,129 @@ inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) {
   *utf8++ = static_cast<char>((code_point & 0x3F) | 0x80);
 }
 
+std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) {
+  std::u16string utf16;
+  utf16.reserve(utf8.size()); // Reserve space to avoid frequent reallocations
+
+  char buffer[64]; // Buffer to hold temporary UTF-16 results
+  char16_t *output =
+      reinterpret_cast<char16_t *>(buffer); // Use char16_t for output
+
+  size_t i = 0;
+  size_t n = utf8.size();
+
+  while (i + 32 <= n) {
+
+    for (int j = 0; j < 32; ++j) {
+      uint8_t byte = utf8[i + j];
+
+      if (byte < 0x80) {
+        // 1-byte character (ASCII)
+        *output++ = static_cast<char16_t>(byte);
+      } else if (byte < 0xE0) {
+        // 2-byte character
+        uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + j + 1] & 0x3F);
+        if (!is_little_endian) {
+          utf16_char = (utf16_char >> 8) |
+                       (utf16_char << 8); // Swap bytes for big-endian
+        }
+        *output++ = utf16_char;
+        ++j;
+      } else if (byte < 0xF0) {
+        // 3-byte character
+        uint16_t utf16_char = ((byte & 0x0F) << 12) |
+                              ((utf8[i + j + 1] & 0x3F) << 6) |
+                              (utf8[i + j + 2] & 0x3F);
+        if (!is_little_endian) {
+          utf16_char = (utf16_char >> 8) |
+                       (utf16_char << 8); // Swap bytes for big-endian
+        }
+        *output++ = utf16_char;
+        j += 2;
+      } else {
+        // 4-byte character (surrogate pair handling required)
+        uint32_t code_point =
+            ((byte & 0x07) << 18) | ((utf8[i + j + 1] & 0x3F) << 12) |
+            ((utf8[i + j + 2] & 0x3F) << 6) | (utf8[i + j + 3] & 0x3F);
+
+        // Convert the code point to a surrogate pair
+        uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10);
+        uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF);
+
+        if (!is_little_endian) {
+          high_surrogate = (high_surrogate >> 8) |
+                           (high_surrogate << 8); // Swap bytes for big-endian
+          low_surrogate = (low_surrogate >> 8) |
+                          (low_surrogate << 8); // Swap bytes for big-endian
+        }
+
+        *output++ = high_surrogate;
+        *output++ = low_surrogate;
+
+        j += 3;
+      }
+    }
+
+    // Append the processed buffer to the final utf16 string
+    utf16.append(reinterpret_cast<char16_t *>(buffer),
+                 output - reinterpret_cast<char16_t *>(buffer));
+    output =
+        reinterpret_cast<char16_t *>(buffer); // Reset output buffer pointer
+    i += 32;
+  }
+
+  // Handle remaining characters
+  while (i < n) {
+    uint8_t byte = utf8[i];
+
+    if (byte < 0x80) {
+      *output++ = static_cast<char16_t>(byte);
+    } else if (byte < 0xE0) {
+      uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + 1] & 0x3F);
+      if (!is_little_endian) {
+        utf16_char =
+            (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian
+      }
+      *output++ = utf16_char;
+      ++i;
+    } else if (byte < 0xF0) {
+      uint16_t utf16_char = ((byte & 0x0F) << 12) |
+                            ((utf8[i + 1] & 0x3F) << 6) | (utf8[i + 2] & 0x3F);
+      if (!is_little_endian) {
+        utf16_char =
+            (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian
+      }
+      *output++ = utf16_char;
+      i += 2;
+    } else {
+      uint32_t code_point = ((byte & 0x07) << 18) |
+                            ((utf8[i + 1] & 0x3F) << 12) |
+                            ((utf8[i + 2] & 0x3F) << 6) | (utf8[i + 3] & 0x3F);
+
+      uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10);
+      uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF);
+
+      if (!is_little_endian) {
+        high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8);
+        low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8);
+      }
+
+      *output++ = high_surrogate;
+      *output++ = low_surrogate;
+
+      i += 3;
+    }
+
+    ++i;
+  }
+
+  // Append the last part of the buffer to the utf16 string
+  utf16.append(reinterpret_cast<char16_t *>(buffer),
+               output - reinterpret_cast<char16_t *>(buffer));
+
+  return utf16;
+}
+
 #if defined(__x86_64__) || defined(_M_X64)
 
 bool isLatin(const std::string &str) {
@@ -168,6 +291,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
   return utf8;
 }
 
+std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) {
+  return utf8ToUtf16SIMD(utf8, is_little_endian);
+}
+
 #elif defined(__ARM_NEON) || defined(__ARM_NEON__)
 
 bool isLatin(const std::string &str) {
@@ -264,6 +391,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
   return utf8;
 }
 
+std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) {
+  return utf8ToUtf16SIMD(utf8, is_little_endian);
+}
+
 #elif defined(__riscv) && __riscv_vector
 
 bool isLatin(const std::string &str) {
@@ -365,6 +496,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
   return utf8;
 }
 
+std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) {
+  return utf8ToUtf16SIMD(utf8, is_little_endian);
+}
+
 #else
 
 bool isLatin(const std::string &str) {
@@ -414,6 +549,78 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
   return utf8;
 }
 
+// Fallback implementation without SIMD acceleration
+std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) {
+  std::u16string utf16;   // Resulting UTF-16 string
+  size_t i = 0;           // Index for traversing the UTF-8 string
+  size_t n = utf8.size(); // Total length of the UTF-8 string
+
+  // Loop through each byte of the UTF-8 string
+  while (i < n) {
+    uint32_t code_point = 0;   // The Unicode code point
+    unsigned char c = utf8[i]; // Current byte of the UTF-8 string
+
+    // Determine the number of bytes for this character based on its first byte
+    if ((c & 0x80) == 0) {
+      // 1-byte character (ASCII)
+      code_point = c;
+      ++i;
+    } else if ((c & 0xE0) == 0xC0) {
+      // 2-byte character
+      code_point = c & 0x1F;
+      code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
+      i += 2;
+    } else if ((c & 0xF0) == 0xE0) {
+      // 3-byte character
+      code_point = c & 0x0F;
+      code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
+      code_point = (code_point << 6) | (utf8[i + 2] & 0x3F);
+      i += 3;
+    } else if ((c & 0xF8) == 0xF0) {
+      // 4-byte character
+      code_point = c & 0x07;
+      code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
+      code_point = (code_point << 6) | (utf8[i + 2] & 0x3F);
+      code_point = (code_point << 6) | (utf8[i + 3] & 0x3F);
+      i += 4;
+    } else {
+      // Invalid UTF-8 byte sequence
+      throw std::invalid_argument("Invalid UTF-8 encoding.");
+    }
+
+    // If the code point is beyond the BMP range, use surrogate pairs
+    if (code_point >= 0x10000) {
+      code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair
+      uint16_t high_surrogate = 0xD800 + (code_point >> 10);  // High surrogate
+      uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate
+
+      // If not little-endian, swap bytes of the surrogates
+      if (!is_little_endian) {
+        high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8);
+        low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8);
+      }
+
+      // Add both high and low surrogates to the UTF-16 string
+      utf16.push_back(high_surrogate);
+      utf16.push_back(low_surrogate);
+    } else {
+      // For code points within the BMP range, directly store as a 16-bit value
+      uint16_t utf16_char = static_cast<uint16_t>(code_point);
+
+      // If not little-endian, swap the bytes of the 16-bit character
+      if (!is_little_endian) {
+        utf16_char = (utf16_char >> 8) | (utf16_char << 8);
+      }
+
+      // Add the UTF-16 character to the string
+      utf16.push_back(utf16_char);
+    }
+  }
+
+  // Return the resulting UTF-16 string
+  return utf16;
+}
+
 #endif
 
-} // namespace fury
+} // namespace fury
@@ -27,4 +27,6 @@ bool isLatin(const std::string &str);
 
 std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian);
 
-} // namespace fury
+std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian);
+
+} // namespace fury