Skip to content

Commit

Permalink
feat(C++): The use of SIMD accelerated to implement and optimize utf1…
Browse files Browse the repository at this point in the history
…6 utf8 (#1732)

<!--
**Thanks for contributing to Fury.**

**If this is your first time opening a PR on fury, you can refer to
[CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).**

Contribution Checklist

- The **Apache Fury (incubating)** community has restrictions on the
naming of pr titles. You can also find instructions in
[CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).

- Fury has a strong focus on performance. If the PR you submit will have
an impact on performance, please benchmark it first and provide the
benchmark result here.
-->

## What does this PR do?
Use SIMD acceleration to implement and optimize UTF-16 to UTF-8 conversion.
Supports x86, ARM, and RISC-V.
Add 8 tests to verify the function:
<img width="261" alt="fury_cpp_simd_utf_1"
src="https://github.com/user-attachments/assets/029fe6ea-b4be-4e26-85d2-3c5e02e64899">
Efficiency has also improved:
<img width="401" alt="fury_cpp_simd_utf_2"
src="https://github.com/user-attachments/assets/6e86c125-f5a2-46df-b3bd-3d12496e9238">
done.


## Related issues
Closes #1546 
<!--
Is there any related issue? Please attach here.

- #xxxx0
- #xxxx1
- #xxxx2
-->


## Does this PR introduce any user-facing change?

<!--
If any user-facing interface changes, please [open an
issue](https://github.com/apache/fury/issues/new/choose) describing the
need to do so and update the document if necessary.
-->

- [x] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?
  • Loading branch information
pandalee99 authored Jul 15, 2024
1 parent b32f3f9 commit caf0e48
Show file tree
Hide file tree
Showing 3 changed files with 503 additions and 3 deletions.
304 changes: 301 additions & 3 deletions cpp/fury/util/string_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,37 @@
#include <riscv_vector.h>
#endif

#include <chrono>
#include <string>

namespace fury {

// Exchanges the two bytes of a 16-bit value, converting a code unit between
// big-endian and little-endian byte order.
inline uint16_t swapBytes(uint16_t value) {
  const unsigned low_byte = value & 0xFFu;
  const unsigned high_byte = value >> 8;
  // The old low byte becomes the new high byte and vice versa.
  return static_cast<uint16_t>((low_byte << 8) | high_byte);
}

// Encodes a single UTF-16 code unit as one, two or three UTF-8 bytes.
// The bytes are written through `output`, which is advanced past them.
// Surrogate code units are not combined here; they are encoded like any other
// value >= 0x800 (three bytes).
inline void utf16ToUtf8(uint16_t code_unit, char *&output) {
  const unsigned value = code_unit;

  if (value >= 0x800) {
    // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx
    output[0] = static_cast<char>(0xE0 | (value >> 12));
    output[1] = static_cast<char>(0x80 | ((value >> 6) & 0x3F));
    output[2] = static_cast<char>(0x80 | (value & 0x3F));
    output += 3;
    return;
  }

  if (value >= 0x80) {
    // Two-byte form: 110xxxxx 10xxxxxx
    output[0] = static_cast<char>(0xC0 | (value >> 6));
    output[1] = static_cast<char>(0x80 | (value & 0x3F));
    output += 2;
    return;
  }

  // Plain ASCII: copied through unchanged.
  *output++ = static_cast<char>(value);
}

// Combines a UTF-16 surrogate pair into its supplementary-plane code point and
// writes the four-byte UTF-8 encoding through `utf8`, advancing it by four.
// `high` is expected to be in 0xD800..0xDBFF and `low` in 0xDC00..0xDFFF; the
// caller is responsible for checking those ranges.
inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) {
  // Each surrogate carries 10 payload bits; the result is offset by 0x10000.
  const uint32_t lead_bits = static_cast<uint32_t>(high - 0xD800) << 10;
  const uint32_t trail_bits = static_cast<uint32_t>(low - 0xDC00);
  const uint32_t scalar = 0x10000 + lead_bits + trail_bits;

  // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  const char encoded[4] = {
      static_cast<char>(0xF0 | (scalar >> 18)),
      static_cast<char>(0x80 | ((scalar >> 12) & 0x3F)),
      static_cast<char>(0x80 | ((scalar >> 6) & 0x3F)),
      static_cast<char>(0x80 | (scalar & 0x3F)),
  };
  for (const char byte : encoded) {
    *utf8++ = byte;
  }
}

#if defined(__x86_64__) || defined(_M_X64)

bool isLatin(const std::string &str) {
Expand All @@ -55,6 +84,90 @@ bool isLatin(const std::string &str) {
return true;
}

std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
std::string utf8;
utf8.reserve(utf16.size() *
3); // Reserve enough space to avoid frequent reallocations

const __m256i limit1 = _mm256_set1_epi16(0x80);
const __m256i limit2 = _mm256_set1_epi16(0x800);
const __m256i surrogate_high_start = _mm256_set1_epi16(0xD800);
const __m256i surrogate_high_end = _mm256_set1_epi16(0xDBFF);
const __m256i surrogate_low_start = _mm256_set1_epi16(0xDC00);
const __m256i surrogate_low_end = _mm256_set1_epi16(0xDFFF);

char buffer[64]; // Buffer to hold temporary UTF-8 bytes
char *output = buffer;

size_t i = 0;
size_t n = utf16.size();

while (i + 16 <= n) {
__m256i in =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(utf16.data() + i));

if (!is_little_endian) {
in = _mm256_or_si256(
_mm256_slli_epi16(in, 8),
_mm256_srli_epi16(in, 8)); // Swap bytes for big-endian
}

__m256i mask1 = _mm256_cmpgt_epi16(in, limit1);
__m256i mask2 = _mm256_cmpgt_epi16(in, limit2);
__m256i high_surrogate_mask =
_mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_high_start),
_mm256_cmpgt_epi16(in, surrogate_high_end));
__m256i low_surrogate_mask =
_mm256_and_si256(_mm256_cmpgt_epi16(in, surrogate_low_start),
_mm256_cmpgt_epi16(in, surrogate_low_end));

if (_mm256_testz_si256(mask1, mask1)) {
// All values < 0x80, 1 byte per character
for (int j = 0; j < 16; ++j) {
*output++ = static_cast<char>(utf16[i + j]);
}
} else if (_mm256_testz_si256(mask2, mask2)) {
// All values < 0x800, 2 bytes per character
for (int j = 0; j < 16; ++j) {
utf16ToUtf8(utf16[i + j], output);
}
} else {
// Mix of 1, 2, and 3 byte characters
for (int j = 0; j < 16; ++j) {
if (_mm256_testz_si256(high_surrogate_mask, high_surrogate_mask) &&
j + 1 < 16 &&
!_mm256_testz_si256(low_surrogate_mask, low_surrogate_mask)) {
// Surrogate pair
utf16SurrogatePairToUtf8(utf16[i + j], utf16[i + j + 1], output);
++j;
} else {
utf16ToUtf8(utf16[i + j], output);
}
}
}

utf8.append(buffer, output - buffer);
output = buffer; // Reset output buffer pointer
i += 16;
}

// Handle remaining characters
while (i < n) {
if (i + 1 < n && utf16[i] >= 0xD800 && utf16[i] <= 0xDBFF &&
utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) {
// Surrogate pair
utf16SurrogatePairToUtf8(utf16[i], utf16[i + 1], output);
++i;
} else {
utf16ToUtf8(utf16[i], output);
}
++i;
}
utf8.append(buffer, output - buffer);

return utf8;
}

#elif defined(__ARM_NEON) || defined(__ARM_NEON__)

bool isLatin(const std::string &str) {
Expand All @@ -80,18 +193,89 @@ bool isLatin(const std::string &str) {
return true;
}

/// Converts a UTF-16 string to UTF-8, classifying 8 code units at a time with
/// NEON.
///
/// @param utf16 UTF-16 code units to convert.
/// @param is_little_endian When false, every code unit is byte-swapped before
///        it is interpreted (NOTE(review): this presumes a little-endian host).
/// @return The UTF-8 encoding. Valid surrogate pairs become 4-byte sequences;
///         an unpaired surrogate is encoded as a 3-byte sequence like any
///         other code unit >= 0x800.
std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
  std::string utf8;
  utf8.reserve(utf16.size() * 3);

  constexpr size_t kLanes = 8;

  const auto isHighSurrogate = [](uint16_t u) {
    return u >= 0xD800 && u <= 0xDBFF;
  };
  const auto isLowSurrogate = [](uint16_t u) {
    return u >= 0xDC00 && u <= 0xDFFF;
  };
  // Reads one code unit in the byte order the caller asked for.
  const auto unitAt = [&](size_t idx) -> uint16_t {
    const uint16_t raw = static_cast<uint16_t>(utf16[idx]);
    return is_little_endian ? raw : swapBytes(raw);
  };

  // 8 code units expand to at most 24 UTF-8 bytes; the tail is shorter still.
  char buffer[64];
  uint16_t units[kLanes];

  const size_t n = utf16.size();
  size_t i = 0;

  while (i + kLanes <= n) {
    uint16x8_t in =
        vld1q_u16(reinterpret_cast<const uint16_t *>(utf16.data() + i));
    if (!is_little_endian) {
      in = vorrq_u16(vshlq_n_u16(in, 8),
                     vshrq_n_u16(in, 8)); // Swap bytes for big-endian
    }
    // Spill the (possibly swapped) lanes so the encoders below see the same
    // values that were classified.
    vst1q_u16(units, in);

    // The largest lane decides which fast path (if any) applies.
    const uint16_t largest = vmaxvq_u16(in);

    char *output = buffer;
    size_t consumed = kLanes;

    if (largest < 0x80) {
      // All ASCII: one byte per code unit.
      for (size_t j = 0; j < kLanes; ++j) {
        *output++ = static_cast<char>(units[j]);
      }
    } else if (largest < 0x800) {
      // One or two bytes per code unit, no surrogates possible.
      for (size_t j = 0; j < kLanes; ++j) {
        utf16ToUtf8(units[j], output);
      }
    } else {
      // General case: check each unit for a surrogate pair.
      size_t j = 0;
      while (j < kLanes) {
        const uint16_t unit = units[j];
        if (isHighSurrogate(unit)) {
          if (j + 1 == kLanes) {
            // Its partner (if any) lives in the next block; leave this unit
            // for the next iteration so the pair is not split.
            break;
          }
          if (isLowSurrogate(units[j + 1])) {
            utf16SurrogatePairToUtf8(unit, units[j + 1], output);
            j += 2;
            continue;
          }
        }
        utf16ToUtf8(unit, output);
        ++j;
      }
      consumed = j; // 7 when a trailing high surrogate was deferred
    }

    utf8.append(buffer, static_cast<size_t>(output - buffer));
    i += consumed;
  }

  // Scalar tail: fewer than 8 code units remain.
  char *output = buffer;
  while (i < n) {
    const uint16_t unit = unitAt(i);
    if (isHighSurrogate(unit) && i + 1 < n) {
      const uint16_t next = unitAt(i + 1);
      if (isLowSurrogate(next)) {
        utf16SurrogatePairToUtf8(unit, next, output);
        i += 2;
        continue;
      }
    }
    utf16ToUtf8(unit, output);
    ++i;
  }
  utf8.append(buffer, static_cast<size_t>(output - buffer));

  return utf8;
}

#elif defined(__riscv) && __riscv_vector

bool isLatin(const std::string &str) {
const char *data = str.data();
size_t len = str.size();

size_t i = 0;
auto latin_mask = vmv_v_x_u8m1(0x80, 16);
for (; i + 16 <= len; i += 16) {
auto chars = vle8_v_u8m1(reinterpret_cast<const uint8_t *>(data + i), 16);
auto mask = vmv_v_x_u8m1(0x80, 16);
auto result = vand_vv_u8m1(chars, mask, 16);
if (vmax_v_u8m1(result, 16) != 0) {
auto result = vand_vv_u8m1(chars, latin_mask, 16);
if (vfirst_m_b8(vmsne_vx_u8m1_b8(result, 0, 16))) {
return false;
}
}
Expand All @@ -105,6 +289,82 @@ bool isLatin(const std::string &str) {
return true;
}

std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
std::string utf8;
utf8.reserve(utf16.size() * 3);

auto limit1 = vmv_v_x_u16m1(0x80, 8);
auto limit2 = vmv_v_x_u16m1(0x800, 8);
auto surrogate_high_start = vmv_v_x_u16m1(0xD800, 8);
auto surrogate_high_end = vmv_v_x_u16m1(0xDBFF, 8);
auto surrogate_low_start = vmv_v_x_u16m1(0xDC00, 8);
auto surrogate_low_end = vmv_v_x_u16m1(0xDFFF, 8);

char buffer[48];
char *output = buffer;
size_t i = 0;
size_t n = utf16.size();

while (i + 8 <= n) {
auto in =
vle16_v_u16m1(reinterpret_cast<const uint16_t *>(utf16.data() + i), 8);
if (!is_little_endian) {
in = vor_vv_u16m1(vsrl_vx_u16m1(in, 8, 8), vsll_vx_u16m1(in, 8, 8), 8);
}

auto mask1 = vmsgt_vx_u16m1(in, 0x80, 8);
auto mask2 = vmsgt_vx_u16m1(in, 0x800, 8);
auto high_surrogate_mask = vmand_vv_u16m1(vmsgt_vx_u16m1(in, 0xD800, 8),
vmslt_vx_u16m1(in, 0xDBFF, 8), 8);
auto low_surrogate_mask = vmand_vv_u16m1(vmsgt_vx_u16m1(in, 0xDC00, 8),
vmslt_vx_u16m1(in, 0xDFFF, 8), 8);

if (vmslt_vx_u16m1(mask1, 0, 8)) {
for (int j = 0; j < 8; ++j) {
*output++ = static_cast<char>(vget_vx_u16m1(in, j));
}
} else if (vmslt_vx_u16m1(mask2, 0, 8)) {
for (int j = 0; j < 8; ++j) {
utf16ToUtf8(vget_vx_u16m1(in, j), output);
}
} else {
for (int j = 0; j < 8; ++j) {
if (vfirst_m_b8(
vmand_vv_b8(high_surrogate_mask,
vmsne_vx_u8m1_b8(vmv_v_x_u8m1(0, 8), 0, 8))) &&
j + 1 < 8 &&
vfirst_m_b8(
vmand_vv_b8(low_surrogate_mask,
vmsne_vx_u8m1_b8(vmv_v_x_u8m1(0, 8), 0, 8)))) {
utf16SurrogatePairToUtf8(vget_vx_u16m1(in, j),
vget_vx_u16m1(in, j + 1), output);
++j;
} else {
utf16ToUtf8(vget_vx_u16m1(in, j), output);
}
}
}

utf8.append(buffer, output - buffer);
output = buffer;
i += 8;
}

while (i < n) {
if (i + 1 < n && utf16[i] >= 0xD800 && utf16[i] <= 0xDBFF &&
utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) {
utf16SurrogatePairToUtf8(utf16[i], utf16[i + 1], output);
++i;
} else {
utf16ToUtf8(utf16[i], output);
}
++i;
}
utf8.append(buffer, output - buffer);

return utf8;
}

#else

bool isLatin(const std::string &str) {
Expand All @@ -116,6 +376,44 @@ bool isLatin(const std::string &str) {
return true;
}

// Fallback implementation without SIMD acceleration
std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
std::string utf8;
utf8.reserve(utf16.size() *
3); // Reserve enough space to avoid frequent reallocations

size_t i = 0;
size_t n = utf16.size();
char buffer[4]; // Buffer to hold temporary UTF-8 bytes
char *output = buffer;

while (i < n) {
uint16_t code_unit = utf16[i];
if (!is_little_endian) {
code_unit = swapBytes(code_unit);
}
if (i + 1 < n && code_unit >= 0xD800 && code_unit <= 0xDBFF &&
utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) {
// Surrogate pair
uint16_t high = code_unit;
uint16_t low = utf16[i + 1];
if (!is_little_endian) {
low = swapBytes(low);
}
utf16SurrogatePairToUtf8(high, low, output);
utf8.append(buffer, output - buffer);
output = buffer;
++i;
} else {
utf16ToUtf8(code_unit, output);
utf8.append(buffer, output - buffer);
output = buffer;
}
++i;
}
return utf8;
}

#endif

} // namespace fury
2 changes: 2 additions & 0 deletions cpp/fury/util/string_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,6 @@ namespace fury {

bool isLatin(const std::string &str);

/// Converts a UTF-16 string to a UTF-8 encoded std::string.
///
/// @param utf16 The UTF-16 code units to convert.
/// @param is_little_endian When false, code units are byte-swapped before
///        being interpreted (as done by the non-SIMD fallback implementation).
/// @return The UTF-8 bytes; surrogate pairs are emitted as 4-byte sequences.
std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian);

} // namespace fury
Loading

0 comments on commit caf0e48

Please sign in to comment.