From 6d676f478179489d7a4c21df259a13774bb51ba7 Mon Sep 17 00:00:00 2001
From: Bodigrim
Date: Sun, 1 Aug 2021 15:50:43 +0100
Subject: [PATCH] Use simdutf for UTF8 validation

---
 cbits/simdutf.cpp         | 14204 ++++++++++++++++++++++++++++++++++++
 cbits/validate_utf8.cpp   |     6 +
 include/simdutf.h         |  1084 +++
 src/Data/Text/Encoding.hs |    38 +-
 text.cabal                |    20 +-
 5 files changed, 15349 insertions(+), 3 deletions(-)
 create mode 100644 cbits/simdutf.cpp
 create mode 100644 cbits/validate_utf8.cpp
 create mode 100644 include/simdutf.h

diff --git a/cbits/simdutf.cpp b/cbits/simdutf.cpp
new file mode 100644
index 00000000..cf7d32ff
--- /dev/null
+++ b/cbits/simdutf.cpp
@@ -0,0 +1,14204 @@
+/* auto-generated on 2021-07-29 10:43:28 -0400. Do not edit! */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf.cpp
+/* begin file src/simdutf.cpp */
+#include "simdutf.h"
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=implementation.cpp
+/* begin file src/implementation.cpp */
+#include
+#include
+#include
+
+// Useful for debugging purposes
+namespace simdutf {
+namespace {
+
+template <typename T>
+std::string toBinaryString(T b) {
+   std::string binary = "";
+   T mask = T(1) << (sizeof(T) * CHAR_BIT - 1);
+   while (mask > 0) {
+     binary += ((b & mask) == 0) ? '0' : '1';
+     mask >>= 1;
+   }
+   return binary;
+}
+}
+}
+
+// Implementations
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64.h
+/* begin file src/simdutf/arm64.h */
+#ifndef SIMDUTF_ARM64_H
+#define SIMDUTF_ARM64_H
+
+#ifdef SIMDUTF_FALLBACK_H
+#error "arm64.h must be included before fallback.h"
+#endif
+
+
+#ifndef SIMDUTF_IMPLEMENTATION_ARM64
+#define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64)
+#endif
+#define SIMDUTF_CAN_ALWAYS_RUN_ARM64 SIMDUTF_IMPLEMENTATION_ARM64 && SIMDUTF_IS_ARM64
+
+
+
+#if SIMDUTF_IMPLEMENTATION_ARM64
+
+namespace simdutf {
+/**
+ * Implementation for NEON (ARMv8).
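+ * Compiled whenever SIMDUTF_IMPLEMENTATION_ARM64 is enabled (by default, on
+ * any aarch64 build) so that runtime dispatch can select it on NEON hardware.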
+ */ +namespace arm64 { +} // namespace arm64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/implementation.h +/* begin file src/simdutf/arm64/implementation.h */ +#ifndef SIMDUTF_ARM64_IMPLEMENTATION_H +#define SIMDUTF_ARM64_IMPLEMENTATION_H + + +namespace simdutf { +namespace arm64 { + +namespace { +using namespace simdutf; +} + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() : simdutf::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {} + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t count_utf16(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace arm64 +} // namespace simdutf + +#endif // SIMDUTF_ARM64_IMPLEMENTATION_H +/* end file src/simdutf/arm64/implementation.h */ + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h +/* begin file src/simdutf/arm64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "arm64" +// #define SIMDUTF_IMPLEMENTATION arm64 +/* end file src/simdutf/arm64/begin.h */ + +// Declarations +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/intrinsics.h +/* begin file src/simdutf/arm64/intrinsics.h */ +#ifndef SIMDUTF_ARM64_INTRINSICS_H +#define SIMDUTF_ARM64_INTRINSICS_H + + +// This should be the correct header whether +// you use visual studio or other compilers. +#include + +#endif // SIMDUTF_ARM64_INTRINSICS_H +/* end file src/simdutf/arm64/intrinsics.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/bitmanipulation.h +/* begin file src/simdutf/arm64/bitmanipulation.h */ +#ifndef SIMDUTF_ARM64_BITMANIPULATION_H +#define SIMDUTF_ARM64_BITMANIPULATION_H + +namespace simdutf { +namespace arm64 { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +NO_SANITIZE_UNDEFINED +simdutf_really_inline int trailing_zeroes(uint64_t input_num) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). 
+  _BitScanForward64(&ret, input_num);
+  return (int)ret;
+#else // SIMDUTF_REGULAR_VISUAL_STUDIO
+  return __builtin_ctzll(input_num);
+#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+}
+
+/* result might be undefined when input_num is zero */
+simdutf_really_inline uint64_t clear_lowest_bit(uint64_t input_num) {
+  return input_num & (input_num-1);
+}
+
+/* result might be undefined when input_num is zero */
+simdutf_really_inline int leading_zeroes(uint64_t input_num) {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  unsigned long leading_zero = 0;
+  // Search the mask data from most significant bit (MSB)
+  // to least significant bit (LSB) for a set bit (1).
+  if (_BitScanReverse64(&leading_zero, input_num))
+    return (int)(63 - leading_zero);
+  else
+    return 64;
+#else
+  return __builtin_clzll(input_num);
+#endif// SIMDUTF_REGULAR_VISUAL_STUDIO
+}
+
+/* result might be undefined when input_num is zero */
+simdutf_really_inline int count_ones(uint64_t input_num) {
+   return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
+}
+
+simdutf_really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  *result = value1 + value2;
+  return *result < value1;
+#else
+  return __builtin_uaddll_overflow(value1, value2,
+                                   reinterpret_cast<unsigned long long *>(result));
+#endif
+}
+
+} // unnamed namespace
+} // namespace arm64
+} // namespace simdutf
+
+#endif // SIMDUTF_ARM64_BITMANIPULATION_H
+/* end file src/simdutf/arm64/bitmanipulation.h */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/bitmask.h
+/* begin file src/simdutf/arm64/bitmask.h */
+#ifndef SIMDUTF_ARM64_BITMASK_H
+#define SIMDUTF_ARM64_BITMASK_H
+
+namespace simdutf {
+namespace arm64 {
+namespace {
+
+//
+// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
+//
+// For example, prefix_xor(00100100) == 00011100
+//
+simdutf_really_inline uint64_t prefix_xor(uint64_t bitmask) {
+  bitmask ^= bitmask << 1;
+  bitmask ^= bitmask << 2;
+  bitmask ^= bitmask << 4;
+  bitmask ^= bitmask << 8;
+  bitmask ^= bitmask << 16;
+  bitmask ^= bitmask << 32;
+  return bitmask;
+}
+
+} // unnamed namespace
+} // namespace arm64
+} // namespace simdutf
+
+#endif
+/* end file src/simdutf/arm64/bitmask.h */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd.h
+/* begin file src/simdutf/arm64/simd.h */
+#ifndef SIMDUTF_ARM64_SIMD_H
+#define SIMDUTF_ARM64_SIMD_H
+
+#include
+
+
+namespace simdutf {
+namespace arm64 {
+namespace {
+namespace simd {
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+namespace {
+// Start of private section with Visual Studio workaround
+
+
+/**
+ * make_uint8x16_t initializes a SIMD register (uint8x16_t).
+ * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
+ * is not recognized under Visual Studio! This is a workaround.
+ * Using a std::initializer_list<uint8_t> as a parameter resulted in
+ * inefficient code. With the current approach, if the parameters are
+ * compile-time constants,
+ * GNU GCC compiles it to ldr, the same as uint8x16_t x = {1,2,3...}.
+ * You should not use this function except for compile-time constants:
+ * it is not efficient.
+ */
+simdutf_really_inline uint8x16_t make_uint8x16_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
+                                         uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8,
+                                         uint8_t x9,  uint8_t x10, uint8_t x11, uint8_t x12,
+                                         uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
+  // Doing a load like so ends up generating worse code.
+ // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, + // x9, x10,x11,x12,x13,x14,x15,x16}; + // return vld1q_u8(array); + uint8x16_t x{}; + // incredibly, Visual Studio does not allow x[0] = x1 + x = vsetq_lane_u8(x1, x, 0); + x = vsetq_lane_u8(x2, x, 1); + x = vsetq_lane_u8(x3, x, 2); + x = vsetq_lane_u8(x4, x, 3); + x = vsetq_lane_u8(x5, x, 4); + x = vsetq_lane_u8(x6, x, 5); + x = vsetq_lane_u8(x7, x, 6); + x = vsetq_lane_u8(x8, x, 7); + x = vsetq_lane_u8(x9, x, 8); + x = vsetq_lane_u8(x10, x, 9); + x = vsetq_lane_u8(x11, x, 10); + x = vsetq_lane_u8(x12, x, 11); + x = vsetq_lane_u8(x13, x, 12); + x = vsetq_lane_u8(x14, x, 13); + x = vsetq_lane_u8(x15, x, 14); + x = vsetq_lane_u8(x16, x, 15); + return x; +} + +// We have to do the same work for make_int8x16_t +simdutf_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4, + int8_t x5, int8_t x6, int8_t x7, int8_t x8, + int8_t x9, int8_t x10, int8_t x11, int8_t x12, + int8_t x13, int8_t x14, int8_t x15, int8_t x16) { + // Doing a load like so end ups generating worse code. + // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, + // x9, x10,x11,x12,x13,x14,x15,x16}; + // return vld1q_s8(array); + int8x16_t x{}; + // incredibly, Visual Studio does not allow x[0] = x1 + x = vsetq_lane_s8(x1, x, 0); + x = vsetq_lane_s8(x2, x, 1); + x = vsetq_lane_s8(x3, x, 2); + x = vsetq_lane_s8(x4, x, 3); + x = vsetq_lane_s8(x5, x, 4); + x = vsetq_lane_s8(x6, x, 5); + x = vsetq_lane_s8(x7, x, 6); + x = vsetq_lane_s8(x8, x, 7); + x = vsetq_lane_s8(x9, x, 8); + x = vsetq_lane_s8(x10, x, 9); + x = vsetq_lane_s8(x11, x, 10); + x = vsetq_lane_s8(x12, x, 11); + x = vsetq_lane_s8(x13, x, 12); + x = vsetq_lane_s8(x14, x, 13); + x = vsetq_lane_s8(x15, x, 14); + x = vsetq_lane_s8(x16, x, 15); + return x; +} + +simdutf_really_inline uint16x8_t make_uint16x8_t(uint16_t x1, uint16_t x2, uint16_t x3, uint16_t x4, + uint16_t x5, uint16_t x6, uint16_t x7, uint16_t x8) { + uint16x8_t x{}; + x = vsetq_lane_u16(x1, x, 0); + x = vsetq_lane_u16(x2, x, 1); + x = vsetq_lane_u16(x3, x, 2); + x = vsetq_lane_u16(x4, x, 3); + x = vsetq_lane_u16(x5, x, 4); + x = vsetq_lane_u16(x6, x, 5); + x = vsetq_lane_u16(x7, x, 6); + x = vsetq_lane_u16(x8, x, 7);; + return x; +} + +simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1, int16_t x2, int16_t x3, int16_t x4, + int16_t x5, int16_t x6, int16_t x7, int16_t x8) { + uint16x8_t x{}; + x = vsetq_lane_s16(x1, x, 0); + x = vsetq_lane_s16(x2, x, 1); + x = vsetq_lane_s16(x3, x, 2); + x = vsetq_lane_s16(x4, x, 3); + x = vsetq_lane_s16(x5, x, 4); + x = vsetq_lane_s16(x6, x, 5); + x = vsetq_lane_s16(x7, x, 6); + x = vsetq_lane_s16(x8, x, 7);; + return x; +} + + +// End of private section with Visual Studio workaround +} // namespace +#endif // SIMDUTF_REGULAR_VISUAL_STUDIO + + + template + struct simd8; + + // + // Base class of simd8 and simd8, both of which use uint8x16_t internally. 
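+  // (i.e. simd8<bool> and simd8<uint8_t>; the shared base keeps the bitwise
+  // operators in one place. For example, (a == b) on two byte vectors yields
+  // a simd8<bool> whose matching lanes are all ones, and to_bitmask() below
+  // packs that into one bit per lane.)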
+ // + template> + struct base_u8 { + uint8x16_t value; + static const int SIZE = sizeof(value); + + // Conversion from/to SIMD register + simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {} + simdutf_really_inline operator const uint8x16_t&() const { return this->value; } + simdutf_really_inline operator uint8x16_t&() { return this->value; } + simdutf_really_inline T first() const { return vgetq_lane_u8(*this,0); } + simdutf_really_inline T last() const { return vgetq_lane_u8(*this,15); } + + // Bit operations + simdutf_really_inline simd8 operator|(const simd8 other) const { return vorrq_u8(*this, other); } + simdutf_really_inline simd8 operator&(const simd8 other) const { return vandq_u8(*this, other); } + simdutf_really_inline simd8 operator^(const simd8 other) const { return veorq_u8(*this, other); } + simdutf_really_inline simd8 bit_andnot(const simd8 other) const { return vbicq_u8(*this, other); } + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + simdutf_really_inline simd8& operator|=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast | other; return *this_cast; } + simdutf_really_inline simd8& operator&=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast & other; return *this_cast; } + simdutf_really_inline simd8& operator^=(const simd8 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast ^ other; return *this_cast; } + + simdutf_really_inline Mask operator==(const simd8 other) const { return vceqq_u8(*this, other); } + + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return vextq_u8(prev_chunk, *this, 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base_u8 { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + static simdutf_really_inline simd8 splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); } + + simdutf_really_inline simd8(const uint8x16_t _value) : base_u8(_value) {} + // False constructor + simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {} + simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); } + + // We return uint32_t instead of uint16_t because that seems to be more efficient for most + // purposes (cutting it down to uint16_t costs performance in some compilers). 
+ simdutf_really_inline uint32_t to_bitmask() const { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); +#else + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; +#endif + auto minput = *this & bit_mask; + uint8x16_t tmp = vpaddq_u8(minput, minput); + tmp = vpaddq_u8(tmp, tmp); + tmp = vpaddq_u8(tmp, tmp); + return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); + } + simdutf_really_inline bool any() const { return vmaxvq_u8(*this) != 0; } + simdutf_really_inline bool none() const { return vmaxvq_u8(*this) == 0; } + simdutf_really_inline bool all() const { return vminvq_u8(*this) == 0xFF; } + + + }; + + // Unsigned bytes + template<> + struct simd8: base_u8 { + static simdutf_really_inline simd8 splat(uint8_t _value) { return vmovq_n_u8(_value); } + static simdutf_really_inline simd8 zero() { return vdupq_n_u8(0); } + static simdutf_really_inline simd8 load(const uint8_t* values) { return vld1q_u8(values); } + simdutf_really_inline simd8(const simd8& value) = default; + simdutf_really_inline simd8(const uint8x16_t _value) : base_u8(_value) {} + // Zero constructor + simdutf_really_inline simd8() : simd8(zero()) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Member-by-member initialization +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(make_uint8x16_t( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} +#else + simdutf_really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(uint8x16_t{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + }) {} +#endif + + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Store to array + simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); } + + // Saturated math + simdutf_really_inline simd8 saturating_add(const simd8 other) const { return vqaddq_u8(*this, other); } + simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return vqsubq_u8(*this, other); } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 operator+(const simd8 other) const { return vaddq_u8(*this, other); } + simdutf_really_inline simd8 operator-(const simd8 other) const { return vsubq_u8(*this, other); } + simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } + simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + + // 
Order-specific operations + simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); } + simdutf_really_inline uint8_t min_val() const { return vminvq_u8(*this); } + simdutf_really_inline simd8 max_val(const simd8 other) const { return vmaxq_u8(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return vminq_u8(*this, other); } + simdutf_really_inline simd8 operator<=(const simd8 other) const { return vcleq_u8(*this, other); } + simdutf_really_inline simd8 operator>=(const simd8 other) const { return vcgeq_u8(*this, other); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return vcltq_u8(*this, other); } + simdutf_really_inline simd8 operator>(const simd8 other) const { return vcgtq_u8(*this, other); } + // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. + simdutf_really_inline simd8 gt_bits(const simd8 other) const { return simd8(*this > other); } + // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's. + simdutf_really_inline simd8 lt_bits(const simd8 other) const { return simd8(*this < other); } + + // Bit-specific operations + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { return vtstq_u8(*this, bits); } + simdutf_really_inline bool is_ascii() const { return this->max_val() < 0b10000000u; } + + simdutf_really_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return (*this & bits).any_bits_set_anywhere(); } + template + simdutf_really_inline simd8 shr() const { return vshrq_n_u8(*this, N); } + template + simdutf_really_inline simd8 shl() const { return vshlq_n_u8(*this, N); } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + + + template + simdutf_really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + + template + simdutf_really_inline simd8 apply_lookup_16_to(const simd8 original) const { + return vqtbl1q_u8(*this, simd8(original)); + } + }; + + // Signed bytes + template<> + struct simd8 { + int8x16_t value; + + static simdutf_really_inline simd8 splat(int8_t _value) { return vmovq_n_s8(_value); } + static simdutf_really_inline simd8 zero() { return vdupq_n_s8(0); } + static simdutf_really_inline simd8 load(const int8_t values[16]) { return vld1q_s8(values); } + simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const { + vst1q_u16(reinterpret_cast(p), vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value)))); + vst1q_u16(reinterpret_cast(p + 8), vmovl_high_u8(vreinterpretq_u8_s8(this->value))); + } + // Conversion from/to SIMD register + simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {} + simdutf_really_inline operator const int8x16_t&() const { return this->value; } + simdutf_really_inline operator const uint8x16_t() const { return vreinterpretq_u8_s8(this->value); 
} + simdutf_really_inline operator int8x16_t&() { return this->value; } + + // Zero constructor + simdutf_really_inline simd8() : simd8(zero()) {} + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {} + // Member-by-member initialization +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(make_int8x16_t( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} +#else + simdutf_really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(int8x16_t{ + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + }) {} +#endif + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + // Store to array + simdutf_really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, value); } + // Explicit conversion to/from unsigned + // + // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type. + // In theory, we could check this occurence with std::same_as and std::enabled_if but it is C++14 + // and relatively ugly and hard to read. 
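+  // A hypothetical illustration: under a conforming compiler,
+  //   simd8<uint8_t> u(uint8x16_t{});
+  //   simd8<int8_t>  s(u);   // uses the explicit constructor below
+  // compiles, whereas on MSVC/ARM64 declaring that constructor would collide
+  // with the int8x16_t one, because the two vector types are the same there.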
+#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {} +#endif + simdutf_really_inline operator simd8() const { return vreinterpretq_u8_s8(this->value); } + + simdutf_really_inline simd8 operator|(const simd8 other) const { return vorrq_s8(value, other.value); } + simdutf_really_inline simd8 operator&(const simd8 other) const { return vandq_s8(value, other.value); } + simdutf_really_inline simd8 operator^(const simd8 other) const { return veorq_s8(value, other.value); } + simdutf_really_inline simd8 bit_andnot(const simd8 other) const { return vbicq_s8(value, other.value); } + + // Math + simdutf_really_inline simd8 operator+(const simd8 other) const { return vaddq_s8(value, other.value); } + simdutf_really_inline simd8 operator-(const simd8 other) const { return vsubq_s8(value, other.value); } + simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *this; } + simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *this; } + + simdutf_really_inline int8_t max_val() const { return vmaxvq_s8(value); } + simdutf_really_inline int8_t min_val() const { return vminvq_s8(value); } + simdutf_really_inline bool is_ascii() const { return this->min_val() >= 0; } + + // Order-sensitive comparisons + simdutf_really_inline simd8 max_val(const simd8 other) const { return vmaxq_s8(value, other.value); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return vminq_s8(value, other.value); } + simdutf_really_inline simd8 operator>(const simd8 other) const { return vcgtq_s8(value, other.value); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return vcltq_s8(value, other.value); } + simdutf_really_inline simd8 operator==(const simd8 other) const { return vceqq_s8(value, other.value); } + + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return vextq_s8(prev_chunk, *this, 16 - N); + } + + // Perform a lookup assuming no value is larger than 16 + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + template + simdutf_really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + + template + simdutf_really_inline simd8 apply_lookup_16_to(const simd8 original) { + return vqtbl1q_s8(*this, simd8(original)); + } + }; + + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8::load(ptr), simd8::load(ptr+sizeof(simd8)/sizeof(T)), 
simd8::load(ptr+2*sizeof(simd8)/sizeof(T)), simd8::load(ptr+3*sizeof(simd8)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd8)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd8)*1/sizeof(T)); + this->chunks[2].store(ptr+sizeof(simd8)*2/sizeof(T)); + this->chunks[3].store(ptr+sizeof(simd8)*3/sizeof(T)); + } + + simdutf_really_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + simdutf_really_inline bool is_ascii() const { + return reduce_or().is_ascii(); + } + + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd8)*0); + this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd8)*1); + this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd8)*2); + this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd8)*3); + } + + simdutf_really_inline uint64_t to_bitmask() const { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = make_uint8x16_t( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + ); +#else + const uint8x16_t bit_mask = { + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + }; +#endif + // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one. + uint8x16_t sum0 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), vandq_u8(uint8x16_t(this->chunks[1]), bit_mask)); + uint8x16_t sum1 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), vandq_u8(uint8x16_t(this->chunks[3]), bit_mask)); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + + return simd8x64( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + return simd8x64( + (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), + (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low), + (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low), + (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] < mask, + this->chunks[1] < mask, + this->chunks[2] < mask, + this->chunks[3] < mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + 
return simd8x64( + this->chunks[0] > mask, + this->chunks[1] > mask, + this->chunks[2] > mask, + this->chunks[3] > mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] >= mask, + this->chunks[1] >= mask, + this->chunks[2] >= mask, + this->chunks[3] >= mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + simd8(uint8x16_t(this->chunks[0])) >= mask, + simd8(uint8x16_t(this->chunks[1])) >= mask, + simd8(uint8x16_t(this->chunks[2])) >= mask, + simd8(uint8x16_t(this->chunks[3])) >= mask + ).to_bitmask(); + } + }; // struct simd8x64 +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd16-inl.h +/* begin file src/simdutf/arm64/simd16-inl.h */ +template +struct simd16; + + template> + struct base_u16 { + uint16x8_t value; + static const int SIZE = sizeof(value); + + // Conversion from/to SIMD register + simdutf_really_inline base_u16() = default; + simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {} + simdutf_really_inline operator const uint16x8_t&() const { return this->value; } + simdutf_really_inline operator uint16x8_t&() { return this->value; } + // Bit operations + simdutf_really_inline simd16 operator|(const simd16 other) const { return vorrq_u16(*this, other); } + simdutf_really_inline simd16 operator&(const simd16 other) const { return vandq_u16(*this, other); } + simdutf_really_inline simd16 operator^(const simd16 other) const { return veorq_u16(*this, other); } + simdutf_really_inline simd16 bit_andnot(const simd16 other) const { return vbicq_u16(*this, other); } + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } + simdutf_really_inline simd16& operator|=(const simd16 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast | other; return *this_cast; } + simdutf_really_inline simd16& operator&=(const simd16 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast & other; return *this_cast; } + simdutf_really_inline simd16& operator^=(const simd16 other) { auto this_cast = static_cast*>(this); *this_cast = *this_cast ^ other; return *this_cast; } + + simdutf_really_inline Mask operator==(const simd16 other) const { return vceqq_u16(*this, other); } + + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return vextq_u18(prev_chunk, *this, 8 - N); + } + }; + +template> +struct base16: base_u16 { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdutf_really_inline base16() : base_u16() {} + simdutf_really_inline base16(const uint16x8_t _value) : base_u16(_value) {} + template + simdutf_really_inline base16(const Pointer* ptr) : base16(vld1q_u16(ptr)) {} + + simdutf_really_inline Mask operator==(const simd16 other) const { return vceqq_u16(*this, other); } + + static const int SIZE = sizeof(base_u16::value); + + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return vextq_u18(prev_chunk, *this, 8 - N); + } +}; + +// SIMD byte mask type (returned by things like eq and gt) +template<> +struct simd16: base16 { + static simdutf_really_inline simd16 splat(bool _value) { return vmovq_n_u16(uint16_t(-(!!_value))); } + + simdutf_really_inline simd16() : base16() {} + simdutf_really_inline simd16(const uint16x8_t _value) : base16(_value) {} + // Splat constructor + simdutf_really_inline 
simd16(bool _value) : base16(splat(_value)) {} + +}; + +template +struct base16_numeric: base16 { + static simdutf_really_inline simd16 splat(T _value) { return vmovq_n_u16(_value); } + static simdutf_really_inline simd16 zero() { return vdupq_n_u16(0); } + static simdutf_really_inline simd16 load(const T values[8]) { + return vld1q_u16(reinterpret_cast(values)); + } + + simdutf_really_inline base16_numeric() : base16() {} + simdutf_really_inline base16_numeric(const uint16x8_t _value) : base16(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[8]) const { return vst1q_u16(dst, *this); } + + // Override to distinguish from bool version + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd16 operator+(const simd16 other) const { return vaddq_u8(*this, other); } + simdutf_really_inline simd16 operator-(const simd16 other) const { return vsubq_u8(*this, other); } + simdutf_really_inline simd16& operator+=(const simd16 other) { *this = *this + other; return *static_cast*>(this); } + simdutf_really_inline simd16& operator-=(const simd16 other) { *this = *this - other; return *static_cast*>(this); } +}; + +// Signed words +template<> +struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} +#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO + simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric(_value) {} +#endif + simdutf_really_inline simd16(const int16x8_t _value) : base16_numeric(vreinterpretq_u16_s16(_value)) {} + + // Splat constructor + simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} + simdutf_really_inline operator simd16() const; + simdutf_really_inline operator const uint16x8_t&() const { return this->value; } + simdutf_really_inline operator const int16x8_t() const { return vreinterpretq_s16_u16(this->value); } + + simdutf_really_inline int16_t max_val() const { return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); } + simdutf_really_inline int16_t min_val() const { return vminvq_s16(vreinterpretq_s16_u16(this->value)); } + // Order-sensitive comparisons + simdutf_really_inline simd16 max_val(const simd16 other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } + simdutf_really_inline simd16 min_val(const simd16 other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } + simdutf_really_inline simd16 operator>(const simd16 other) const { return vcgtq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } + simdutf_really_inline simd16 operator<(const simd16 other) const { return vcltq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } +}; + + + + +// Unsigned words +template<> +struct simd16: base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast(values))) {} 
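+    // e.g. simd16<uint16_t>(buf) loads eight UTF-16 code units from either a
+    // uint16_t* or a char16_t* buffer, and splat(0xD800) (an arbitrary example
+    // value) broadcasts one word to all eight lanes for range checks.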
+ + + simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); } + simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); } + // Saturated math + simdutf_really_inline simd16 saturating_add(const simd16 other) const { return vqaddq_u16(*this, other); } + simdutf_really_inline simd16 saturating_sub(const simd16 other) const { return vqsubq_u16(*this, other); } + + // Order-specific operations + simdutf_really_inline simd16 max_val(const simd16 other) const { return vmaxq_u16(*this, other); } + simdutf_really_inline simd16 min_val(const simd16 other) const { return vminq_u16(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 gt_bits(const simd16 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 lt_bits(const simd16 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd16 operator<=(const simd16 other) const { return vcleq_u16(*this, other); } + simdutf_really_inline simd16 operator>=(const simd16 other) const { return vcgeq_u16(*this, other); } + simdutf_really_inline simd16 operator>(const simd16 other) const { return vcgtq_u16(*this, other); } + simdutf_really_inline simd16 operator<(const simd16 other) const { return vcltq_u16(*this, other); } + + // Bit-specific operations + simdutf_really_inline simd16 bits_not_set() const { return *this == uint16_t(0); } + template + simdutf_really_inline simd16 shr() const { return simd16(vshrq_n_u16(*this, N)); } + template + simdutf_really_inline simd16 shl() const { return simd16(vshlq_n_u16(*this, N)); } + + // logical operations + simdutf_really_inline simd16 operator|(const simd16 other) const { return vorrq_u16(*this, other); } + simdutf_really_inline simd16 operator&(const simd16 other) const { return vandq_u16(*this, other); } + simdutf_really_inline simd16 operator^(const simd16 other) const { return veorq_u16(*this, other); } + + // Pack with the unsigned saturation two uint16_t words into single uint8_t vector + static simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { + return vqmovn_high_u16(vqmovn_u16(v0), v1); + } +}; +simdutf_really_inline simd16::operator simd16() const { return this->value; } + + + template + struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block."); + const simd16 chunks[NUM_CHUNKS]; + + simd16x32(const simd16x32& o) = delete; // no copy allowed + simd16x32& operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed + + simdutf_really_inline simd16x32(const simd16 chunk0, const simd16 chunk1, const simd16 chunk2, const simd16 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16::load(ptr), simd16::load(ptr+sizeof(simd16)/sizeof(T)), simd16::load(ptr+2*sizeof(simd16)/sizeof(T)), simd16::load(ptr+3*sizeof(simd16)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd16)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd16)*1/sizeof(T)); + this->chunks[2].store(ptr+sizeof(simd16)*2/sizeof(T)); + this->chunks[3].store(ptr+sizeof(simd16)*3/sizeof(T)); + } + + simdutf_really_inline simd16 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] 
| this->chunks[3]); + } + + simdutf_really_inline bool is_ascii() const { + return reduce_or().is_ascii(); + } + + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16)*0); + this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16)*1); + this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16)*2); + this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16)*3); + } + + simdutf_really_inline uint64_t to_bitmask() const { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = make_uint8x16_t( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + ); +#else + const uint8x16_t bit_mask = { + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + }; +#endif + // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one. + uint8x16_t sum0 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask))); + uint8x16_t sum1 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask))); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + + return simd16x32( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + return simd16x32( + (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), + (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low), + (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low), + (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] < mask, + this->chunks[1] < mask, + this->chunks[2] < mask, + this->chunks[3] < mask + ).to_bitmask(); + } + + }; // struct simd16x32 + template<> + simdutf_really_inline uint64_t simd16x32::not_in_range(const uint16_t low, const uint16_t high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + simd16x32 x( + simd16((this->chunks[0] > mask_high) | (this->chunks[0] < mask_low)), + simd16((this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)), + simd16((this->chunks[2] > mask_high) | (this->chunks[2] < 
mask_low)), + simd16((this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)) + ); + return x.to_bitmask(); + } +/* end file src/simdutf/arm64/simd16-inl.h */ +} // namespace simd +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf + +#endif // SIMDUTF_ARM64_SIMD_H +/* end file src/simdutf/arm64/simd.h */ + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h +/* begin file src/simdutf/arm64/end.h */ +/* end file src/simdutf/arm64/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_ARM64 + +#endif // SIMDUTF_ARM64_H +/* end file src/simdutf/arm64.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell.h +/* begin file src/simdutf/haswell.h */ +#ifndef SIMDUTF_HASWELL_H +#define SIMDUTF_HASWELL_H + +#ifdef SIMDUTF_WESTMERE_H +#error "haswell.h must be included before westmere.h" +#endif +#ifdef SIMDUTF_FALLBACK_H +#error "haswell.h must be included before fallback.h" +#endif + + +// Default Haswell to on if this is x86-64. Even if we're not compiled for it, it could be selected +// at runtime. +#ifndef SIMDUTF_IMPLEMENTATION_HASWELL +// +// You do not want to restrict it like so: SIMDUTF_IS_X86_64 && __AVX2__ +// because we want to rely on *runtime dispatch*. +// +#define SIMDUTF_IMPLEMENTATION_HASWELL (SIMDUTF_IS_X86_64) +#endif +// To see why (__BMI__) && (__PCLMUL__) && (__LZCNT__) are not part of this next line, see +// https://github.com/simdutf/simdutf/issues/1247 +#define SIMDUTF_CAN_ALWAYS_RUN_HASWELL ((SIMDUTF_IMPLEMENTATION_HASWELL) && (SIMDUTF_IS_X86_64) && (__AVX2__)) + +#if SIMDUTF_IMPLEMENTATION_HASWELL + +#define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,pclmul,lzcnt") + +namespace simdutf { +/** + * Implementation for Haswell (Intel AVX2). 
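+ * Compiled for every x86-64 build by default, so that runtime dispatch can
+ * select it whenever the host CPU reports AVX2 support.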
+ */ +namespace haswell { +} // namespace haswell +} // namespace simdutf + +// +// These two need to be included outside SIMDUTF_TARGET_REGION +// +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/implementation.h +/* begin file src/simdutf/haswell/implementation.h */ +#ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H +#define SIMDUTF_HASWELL_IMPLEMENTATION_H + + +// The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION +namespace simdutf { +namespace haswell { + +using namespace simdutf; + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() : simdutf::implementation( + "haswell", + "Intel/AMD AVX2", + internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 + ) {} + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t count_utf16(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace haswell +} // namespace simdutf + +#endif // SIMDUTF_HASWELL_IMPLEMENTATION_H +/* end file src/simdutf/haswell/implementation.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/intrinsics.h +/* begin file src/simdutf/haswell/intrinsics.h */ +#ifndef SIMDUTF_HASWELL_INTRINSICS_H +#define SIMDUTF_HASWELL_INTRINSICS_H + + +#ifdef SIMDUTF_VISUAL_STUDIO +// under clang within visual studio, this will include +#include // visual studio or clang +#else +#include // elsewhere +#endif // SIMDUTF_VISUAL_STUDIO + +#ifdef SIMDUTF_CLANG_VISUAL_STUDIO +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + * e.g., if __AVX2__ is set... in turn, we normally set these + * macros by compiling against the corresponding architecture + * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole + * software with these advanced instructions. In simdutf, we + * want to compile the whole program for a generic target, + * and only target our specific kernels. As a workaround, + * we directly include the needed headers. These headers would + * normally guard against such usage, but we carefully included + * (or ) before, so the headers + * are fooled. 
+ */ +#include // for _blsr_u64 +#include // for __lzcnt64 +#include // for most things (AVX2, AVX512, _popcnt64) +#include +#include +#include +#include +#include // for _mm_clmulepi64_si128 +// unfortunately, we may not get _blsr_u64, but, thankfully, clang +// has it as a macro. +#ifndef _blsr_u64 +// we roll our own +SIMDUTF_TARGET_HASWELL +static simdutf_really_inline uint64_t _blsr_u64(uint64_t n) { + return (n - 1) & n; +} +SIMDUTF_UNTARGET_REGION +#endif // _blsr_u64 +#endif // SIMDUTF_CLANG_VISUAL_STUDIO + +#endif // SIMDUTF_HASWELL_INTRINSICS_H +/* end file src/simdutf/haswell/intrinsics.h */ + +// +// The rest need to be inside the region +// +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h +/* begin file src/simdutf/haswell/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "haswell" +// #define SIMDUTF_IMPLEMENTATION haswell +SIMDUTF_TARGET_HASWELL +/* end file src/simdutf/haswell/begin.h */ + +// Declarations +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/bitmanipulation.h +/* begin file src/simdutf/haswell/bitmanipulation.h */ +#ifndef SIMDUTF_HASWELL_BITMANIPULATION_H +#define SIMDUTF_HASWELL_BITMANIPULATION_H + +namespace simdutf { +namespace haswell { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +NO_SANITIZE_UNDEFINED +simdutf_really_inline int trailing_zeroes(uint64_t input_num) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + return (int)_tzcnt_u64(input_num); +#else // SIMDUTF_REGULAR_VISUAL_STUDIO + //////// + // You might expect the next line to be equivalent to + // return (int)_tzcnt_u64(input_num); + // but the generated code differs and might be less efficient? + //////// + return __builtin_ctzll(input_num); +#endif // SIMDUTF_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return _blsr_u64(input_num); +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline int leading_zeroes(uint64_t input_num) { + return int(_lzcnt_u64(input_num)); +} + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO +simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num);// Visual Studio wants two underscores +} +#else +simdutf_really_inline long long int count_ones(uint64_t input_num) { + return _popcnt64(input_num); +} +#endif + +simdutf_really_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + return _addcarry_u64(0, value1, value2, + reinterpret_cast(result)); +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf + +#endif // SIMDUTF_HASWELL_BITMANIPULATION_H +/* end file src/simdutf/haswell/bitmanipulation.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/bitmask.h +/* begin file src/simdutf/haswell/bitmask.h */ +#ifndef SIMDUTF_HASWELL_BITMASK_H +#define SIMDUTF_HASWELL_BITMASK_H + +namespace simdutf { +namespace haswell { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered. 
+// +// For example, prefix_xor(00100100) == 00011100 +// +simdutf_really_inline uint64_t prefix_xor(const uint64_t bitmask) { + // There should be no such thing with a processor supporting avx2 + // but not clmul. + __m128i all_ones = _mm_set1_epi8('\xFF'); + __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0); + return _mm_cvtsi128_si64(result); +} + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf + +#endif // SIMDUTF_HASWELL_BITMASK_H +/* end file src/simdutf/haswell/bitmask.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd.h +/* begin file src/simdutf/haswell/simd.h */ +#ifndef SIMDUTF_HASWELL_SIMD_H +#define SIMDUTF_HASWELL_SIMD_H + + +namespace simdutf { +namespace haswell { +namespace { +namespace simd { + + // Forward-declared so they can be used by splat and friends. + template + struct base { + __m256i value; + + // Zero constructor + simdutf_really_inline base() : value{__m256i()} {} + + // Conversion from SIMD register + simdutf_really_inline base(const __m256i _value) : value(_value) {} + // Conversion to SIMD register + simdutf_really_inline operator const __m256i&() const { return this->value; } + simdutf_really_inline operator __m256i&() { return this->value; } + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this))); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this,1))); + } + // Bit operations + simdutf_really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); } + simdutf_really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); } + simdutf_really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); } + simdutf_really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); } + simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast | other; return *this_cast; } + simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } + simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. 
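+  // (base8 below defaults its Mask parameter to simd8<bool>, so the forward
+  // declaration has to come first.)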
+ template + struct simd8; + + template> + struct base8: base> { + typedef uint32_t bitmask_t; + typedef uint64_t bitmask2_t; + + simdutf_really_inline base8() : base>() {} + simdutf_really_inline base8(const __m256i _value) : base>(_value) {} + simdutf_really_inline T first() const { return _mm256_extract_epi8(*this,0); } + simdutf_really_inline T last() const { return _mm256_extract_epi8(*this,31); } + simdutf_really_inline Mask operator==(const simd8 other) const { return _mm256_cmpeq_epi8(*this, other); } + + static const int SIZE = sizeof(base::value); + + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static simdutf_really_inline simd8 splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); } + + simdutf_really_inline simd8() : base8() {} + simdutf_really_inline simd8(const __m256i _value) : base8(_value) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : base8(splat(_value)) {} + + simdutf_really_inline uint32_t to_bitmask() const { return uint32_t(_mm256_movemask_epi8(*this)); } + simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } + simdutf_really_inline bool none() const { return _mm256_testz_si256(*this, *this); } + simdutf_really_inline bool all() const { return static_cast(_mm256_movemask_epi8(*this)) == 0xFFFFFFFF; } + simdutf_really_inline simd8 operator~() const { return *this ^ true; } + }; + + template + struct base8_numeric: base8 { + static simdutf_really_inline simd8 splat(T _value) { return _mm256_set1_epi8(_value); } + static simdutf_really_inline simd8 zero() { return _mm256_setzero_si256(); } + static simdutf_really_inline simd8 load(const T values[32]) { + return _mm256_loadu_si256(reinterpret_cast(values)); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdutf_really_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + simdutf_really_inline base8_numeric() : base8() {} + simdutf_really_inline base8_numeric(const __m256i _value) : base8(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 operator+(const simd8 other) const { return _mm256_add_epi8(*this, other); } + simdutf_really_inline simd8 operator-(const simd8 other) const { return _mm256_sub_epi8(*this, other); } + simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } + simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } + + // Override to distinguish from bool version + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return _mm256_shuffle_epi8(lookup_table, *this); + } + + template + 
simdutf_really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + }; + + + // Signed bytes + template<> + struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m256i _value) : base8_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {} + simdutf_really_inline operator simd8() const; + // Member-by-member initialization + simdutf_really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, + int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, + int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31 + ) : simd8(_mm256_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v16,v17,v18,v19,v20,v21,v22,v23, + v24,v25,v26,v27,v28,v29,v30,v31 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; } + // Order-sensitive comparisons + simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm256_max_epi8(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm256_min_epi8(*this, other); } + simdutf_really_inline simd8 operator>(const simd8 other) const { return _mm256_cmpgt_epi8(*this, other); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return _mm256_cmpgt_epi8(other, *this); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m256i _value) : base8_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, + uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23, + uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31 + ) : simd8(_mm256_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, 
v10,v11,v12,v13,v14,v15, + v16,v17,v18,v19,v20,v21,v22,v23, + v24,v25,v26,v27,v28,v29,v30,v31 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15, + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + + // Saturated math + simdutf_really_inline simd8 saturating_add(const simd8 other) const { return _mm256_adds_epu8(*this, other); } + simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return _mm256_subs_epu8(*this, other); } + + // Order-specific operations + simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm256_max_epu8(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm256_min_epu8(other, *this); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdutf_really_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdutf_really_inline simd8 operator>(const simd8 other) const { return this->gt_bits(other).any_bits_set(); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return this->lt_bits(other).any_bits_set(); } + + // Bit-specific operations + simdutf_really_inline simd8 bits_not_set() const { return *this == uint8_t(0); } + simdutf_really_inline simd8 bits_not_set(simd8 bits) const { return (*this & bits).bits_not_set(); } + simdutf_really_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { return ~this->bits_not_set(bits); } + simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; } + simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); } + simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm256_testz_si256(*this, bits); } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + template + simdutf_really_inline simd8 shr() const { return simd8(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); } + template + simdutf_really_inline simd8 shl() const { return simd8(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); } + // Get one of the bits and make a bitmask out of it. + // e.g. 
value.get_bit<7>() gets the high bit
+    template<int N>
+    simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); }
+  };
+  simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
+
+
+  template<typename T>
+  struct simd8x64 {
+    static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
+    static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
+    const simd8<T> chunks[NUM_CHUNKS];
+
+    simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
+    simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
+    simd8x64() = delete; // no default constructor allowed
+
+    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
+    simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T))} {}
+
+    simdutf_really_inline void store(T* ptr) const {
+      this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
+      this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
+    }
+
+    simdutf_really_inline uint64_t to_bitmask() const {
+      uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
+      uint64_t r_hi = this->chunks[1].to_bitmask();
+      return r_lo | (r_hi << 32);
+    }
+
+    simdutf_really_inline simd8<T> reduce_or() const {
+      return this->chunks[0] | this->chunks[1];
+    }
+
+    simdutf_really_inline bool is_ascii() const {
+      return this->reduce_or().is_ascii();
+    }
+
+    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
+      this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd8<T>)*0);
+      this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd8<T>));
+    }
+
+    simdutf_really_inline simd8x64<T> bit_or(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return simd8x64<T>(
+        this->chunks[0] | mask,
+        this->chunks[1] | mask
+      );
+    }
+
+    simdutf_really_inline uint64_t eq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return simd8x64<bool>(
+        this->chunks[0] == mask,
+        this->chunks[1] == mask
+      ).to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
+      return simd8x64<bool>(
+        this->chunks[0] == other.chunks[0],
+        this->chunks[1] == other.chunks[1]
+      ).to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t lteq(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return simd8x64<bool>(
+        this->chunks[0] <= mask,
+        this->chunks[1] <= mask
+      ).to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+      const simd8<T> mask_low = simd8<T>::splat(low);
+      const simd8<T> mask_high = simd8<T>::splat(high);
+      // Only two 256-bit chunks per 64-byte block on this kernel (see the
+      // static_assert above).
+      return simd8x64<bool>(
+        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low)
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+      const simd8<T> mask_low = simd8<T>::splat(low);
+      const simd8<T> mask_high = simd8<T>::splat(high);
+      return simd8x64<bool>(
+        (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+        (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return simd8x64<bool>(
+        this->chunks[0] < mask,
+        this->chunks[1] < mask
+      ).to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t gt(const T m) const {
+      const simd8<T> mask = simd8<T>::splat(m);
+      return simd8x64<bool>(
+        this->chunks[0] > mask,
+        this->chunks[1] > mask
).to_bitmask(); + } + simdutf_really_inline uint64_t gteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] >= mask, + this->chunks[1] >= mask + ).to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + (simd8(__m256i(this->chunks[0])) >= mask), + (simd8(__m256i(this->chunks[1])) >= mask) + ).to_bitmask(); + } + }; // struct simd8x64 + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd16-inl.h +/* begin file src/simdutf/haswell/simd16-inl.h */ +#ifdef __GNUC__ +#if __GNUC__ < 8 +#define _mm256_set_m128i(xmm1, xmm2) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2) +#define _mm256_setr_m128i(xmm2, xmm1) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2) +#endif +#endif + +template +struct simd16; + +template> +struct base16: base> { + using bitmask_type = uint32_t; + + simdutf_really_inline base16() : base>() {} + simdutf_really_inline base16(const __m256i _value) : base>(_value) {} + template + simdutf_really_inline base16(const Pointer* ptr) : base16(_mm256_loadu_si256(reinterpret_cast(ptr))) {} + + simdutf_really_inline Mask operator==(const simd16 other) const { return _mm256_cmpeq_epi16(*this, other); } + + /// the size of vector in bytes + static const int SIZE = sizeof(base>::value); + + /// the number of elements of type T a vector can hold + static const int ELEMENTS = SIZE / sizeof(T); + + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return _mm256_alignr_epi8(*this, prev_chunk, 16 - N); + } +}; + +// SIMD byte mask type (returned by things like eq and gt) +template<> +struct simd16: base16 { + static simdutf_really_inline simd16 splat(bool _value) { return _mm256_set1_epi16(uint16_t(-(!!_value))); } + + simdutf_really_inline simd16() : base16() {} + simdutf_really_inline simd16(const __m256i _value) : base16(_value) {} + // Splat constructor + simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} + + simdutf_really_inline bitmask_type to_bitmask() const { return _mm256_movemask_epi8(*this); } + simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } + simdutf_really_inline simd16 operator~() const { return *this ^ true; } +}; + +template +struct base16_numeric: base16 { + static simdutf_really_inline simd16 splat(T _value) { return _mm256_set1_epi16(_value); } + static simdutf_really_inline simd16 zero() { return _mm256_setzero_si256(); } + static simdutf_really_inline simd16 load(const T values[8]) { + return _mm256_loadu_si256(reinterpret_cast(values)); + } + + simdutf_really_inline base16_numeric() : base16() {} + simdutf_really_inline base16_numeric(const __m256i _value) : base16(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[8]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); } + + // Override to distinguish from bool version + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd16 operator+(const simd16 other) const { return _mm256_add_epi16(*this, other); } + simdutf_really_inline simd16 operator-(const simd16 other) const { return _mm256_sub_epi16(*this, other); } + simdutf_really_inline simd16& operator+=(const simd16 other) { *this = *this + other; return 
*static_cast<simd16<T>*>(this); }
+  simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
+};
+
+// Signed words
+template<>
+struct simd16<int16_t> : base16_numeric<int16_t> {
+  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
+  simdutf_really_inline simd16(const __m256i _value) : base16_numeric<int16_t>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
+  // Order-sensitive comparisons
+  simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm256_max_epi16(*this, other); }
+  simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm256_min_epi16(*this, other); }
+  simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(*this, other); }
+  simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(other, *this); }
+};
+
+// Unsigned words
+template<>
+struct simd16<uint16_t>: base16_numeric<uint16_t> {
+  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
+  simdutf_really_inline simd16(const __m256i _value) : base16_numeric<uint16_t>(_value) {}
+
+  // Splat constructor
+  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
+
+  // Saturated math
+  simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm256_adds_epu16(*this, other); }
+  simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm256_subs_epu16(*this, other); }
+
+  // Order-specific operations
+  simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm256_max_epu16(*this, other); }
+  simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm256_min_epu16(*this, other); }
+  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
+  simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
+  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+  simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
+  simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
+  simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
+  simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
+  simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->lt_bits(other).any_bits_set(); }
+
+  // Bit-specific operations
+  simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
+  simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
+  simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
+  simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
+
+  simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
+  simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere();
} + simdutf_really_inline bool bits_not_set_anywhere(simd16 bits) const { return _mm256_testz_si256(*this, bits); } + simdutf_really_inline bool any_bits_set_anywhere(simd16 bits) const { return !bits_not_set_anywhere(bits); } + template + simdutf_really_inline simd16 shr() const { return simd16(_mm256_srli_epi16(*this, N)); } + template + simdutf_really_inline simd16 shl() const { return simd16(_mm256_slli_epi16(*this, N)); } + // Get one of the bits and make a bitmask out of it. + // e.g. value.get_bit<7>() gets the high bit + template + simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15-N)); } + + // Pack with the unsigned saturation two uint16_t words into single uint8_t vector + static simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { + // Note: the AVX2 variant of pack operates on 128-bit lanes, thus + // we have to shuffle lanes in order to produce bytes in the + // correct order. + + // get the 0th lanes + const __m128i lo_0 = _mm256_extracti128_si256(v0, 0); + const __m128i lo_1 = _mm256_extracti128_si256(v1, 0); + + // get the 1st lanes + const __m128i hi_0 = _mm256_extracti128_si256(v0, 1); + const __m128i hi_1 = _mm256_extracti128_si256(v1, 1); + + // build new vectors (shuffle lanes) + const __m256i t0 = _mm256_set_m128i(lo_1, lo_0); + const __m256i t1 = _mm256_set_m128i(hi_1, hi_0); + + // pack words in linear order from v0 and v1 + return _mm256_packus_epi16(t0, t1); + } +}; + + + template + struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block."); + const simd16 chunks[NUM_CHUNKS]; + + simd16x32(const simd16x32& o) = delete; // no copy allowed + simd16x32& operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed + + simdutf_really_inline simd16x32(const simd16 chunk0, const simd16 chunk1) : chunks{chunk0, chunk1} {} + simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16::load(ptr), simd16::load(ptr+sizeof(simd16)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd16)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd16)*1/sizeof(T)); + } + + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); + } + + simdutf_really_inline simd16 reduce_or() const { + return this->chunks[0] | this->chunks[1]; + } + + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } + + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16)*0); + this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16)); + } + + simdutf_really_inline simd16x32 bit_or(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] | mask, + this->chunks[1] | mask + ); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] == mask, + this->chunks[1] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t eq(const simd16x32 &other) const { + return simd16x32( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1] + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = 
simd16<T>::splat(m);
+      return simd16x32<bool>(
+        this->chunks[0] <= mask,
+        this->chunks[1] <= mask
+      ).to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+      const simd16<T> mask_low = simd16<T>::splat(low);
+      const simd16<T> mask_high = simd16<T>::splat(high);
+      // Only two 256-bit chunks per 64-byte block on this kernel (see the
+      // static_assert above).
+      return simd16x32<bool>(
+        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low)
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+      const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low-1));
+      const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high+1));
+      return simd16x32<bool>(
+        (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
+        (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low)
+      ).to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const {
+      const simd16<T> mask = simd16<T>::splat(m);
+      return simd16x32<bool>(
+        this->chunks[0] < mask,
+        this->chunks[1] < mask
+      ).to_bitmask();
+    }
+  }; // struct simd16x32
+/* end file src/simdutf/haswell/simd16-inl.h */
+
+} // namespace simd
+
+} // unnamed namespace
+} // namespace haswell
+} // namespace simdutf
+
+#endif // SIMDUTF_HASWELL_SIMD_H
+/* end file src/simdutf/haswell/simd.h */
+
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h
+/* begin file src/simdutf/haswell/end.h */
+SIMDUTF_UNTARGET_REGION
+/* end file src/simdutf/haswell/end.h */
+
+#endif // SIMDUTF_IMPLEMENTATION_HASWELL
+#endif // SIMDUTF_HASWELL_COMMON_H
+/* end file src/simdutf/haswell.h */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere.h
+/* begin file src/simdutf/westmere.h */
+#ifndef SIMDUTF_WESTMERE_H
+#define SIMDUTF_WESTMERE_H
+
+#ifdef SIMDUTF_FALLBACK_H
+#error "westmere.h must be included before fallback.h"
+#endif
+
+
+// Default Westmere to on if this is x86-64, unless we'll always select Haswell.
+#ifndef SIMDUTF_IMPLEMENTATION_WESTMERE
+//
+// You do not want to set it to (SIMDUTF_IS_X86_64 && !SIMDUTF_REQUIRES_HASWELL)
+// because you want to rely on runtime dispatch!
+//
+#define SIMDUTF_IMPLEMENTATION_WESTMERE (SIMDUTF_IS_X86_64)
+#endif
+#define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__ && __PCLMUL__)
+
+#if SIMDUTF_IMPLEMENTATION_WESTMERE
+
+#define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,pclmul")
+
+namespace simdutf {
+/**
+ * Implementation for Westmere (Intel SSE4.2).
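+ *
+ * This is the x86-64 baseline kernel: it needs only SSE4.2 and PCLMUL (see
+ * SIMDUTF_TARGET_WESTMERE above), so the runtime dispatch can fall back to it
+ * whenever AVX2 is unavailable.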
+ */ +namespace westmere { +} // namespace westmere +} // namespace simdutf + +// +// These two need to be included outside SIMDUTF_TARGET_REGION +// +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/implementation.h +/* begin file src/simdutf/westmere/implementation.h */ +#ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H +#define SIMDUTF_WESTMERE_IMPLEMENTATION_H + + +// The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION +namespace simdutf { +namespace westmere { + +namespace { +using namespace simdutf; +} + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() : simdutf::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42 | internal::instruction_set::PCLMULQDQ) {} + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t count_utf16(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace westmere +} // namespace simdutf + +#endif // SIMDUTF_WESTMERE_IMPLEMENTATION_H +/* end file src/simdutf/westmere/implementation.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/intrinsics.h +/* begin file src/simdutf/westmere/intrinsics.h */ +#ifndef SIMDUTF_WESTMERE_INTRINSICS_H +#define SIMDUTF_WESTMERE_INTRINSICS_H + +#ifdef SIMDUTF_VISUAL_STUDIO +// under clang within visual studio, this will include +#include // visual studio or clang +#else +#include // elsewhere +#endif // SIMDUTF_VISUAL_STUDIO + + +#ifdef SIMDUTF_CLANG_VISUAL_STUDIO +/** + * You are not supposed, normally, to include these + * headers directly. Instead you should either include intrin.h + * or x86intrin.h. 
However, when compiling with clang + * under Windows (i.e., when _MSC_VER is set), these headers + * only get included *if* the corresponding features are detected + * from macros: + */ +#include // for _mm_alignr_epi8 +#include // for _mm_clmulepi64_si128 +#endif + + + +#endif // SIMDUTF_WESTMERE_INTRINSICS_H +/* end file src/simdutf/westmere/intrinsics.h */ + +// +// The rest need to be inside the region +// +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h +/* begin file src/simdutf/westmere/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "westmere" +// #define SIMDUTF_IMPLEMENTATION westmere +SIMDUTF_TARGET_WESTMERE +/* end file src/simdutf/westmere/begin.h */ + +// Declarations +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/bitmanipulation.h +/* begin file src/simdutf/westmere/bitmanipulation.h */ +#ifndef SIMDUTF_WESTMERE_BITMANIPULATION_H +#define SIMDUTF_WESTMERE_BITMANIPULATION_H + +namespace simdutf { +namespace westmere { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +NO_SANITIZE_UNDEFINED +simdutf_really_inline int trailing_zeroes(uint64_t input_num) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). + _BitScanForward64(&ret, input_num); + return (int)ret; +#else // SIMDUTF_REGULAR_VISUAL_STUDIO + return __builtin_ctzll(input_num); +#endif // SIMDUTF_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return input_num & (input_num-1); +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline int leading_zeroes(uint64_t input_num) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). 
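+  // e.g. leading_zeroes(0x1) == 63 and leading_zeroes(1ULL << 63) == 0; a zero
+  // input yields 64 on this path (the __builtin_clzll path below leaves it
+  // undefined, as the comment above notes).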
+  if (_BitScanReverse64(&leading_zero, input_num))
+    return (int)(63 - leading_zero);
+  else
+    return 64;
+#else
+  return __builtin_clzll(input_num);
+#endif// SIMDUTF_REGULAR_VISUAL_STUDIO
+}
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
+  // note: we do not support legacy 32-bit Windows
+  return __popcnt64(input_num);// Visual Studio wants two underscores
+}
+#else
+simdutf_really_inline long long int count_ones(uint64_t input_num) {
+  return _popcnt64(input_num);
+}
+#endif
+
+simdutf_really_inline bool add_overflow(uint64_t value1, uint64_t value2,
+                                uint64_t *result) {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  return _addcarry_u64(0, value1, value2,
+                       reinterpret_cast<unsigned __int64 *>(result));
+#else
+  return __builtin_uaddll_overflow(value1, value2,
+                                   reinterpret_cast<unsigned long long *>(result));
+#endif
+}
+
+} // unnamed namespace
+} // namespace westmere
+} // namespace simdutf
+
+#endif // SIMDUTF_WESTMERE_BITMANIPULATION_H
+/* end file src/simdutf/westmere/bitmanipulation.h */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/bitmask.h
+/* begin file src/simdutf/westmere/bitmask.h */
+#ifndef SIMDUTF_WESTMERE_BITMASK_H
+#define SIMDUTF_WESTMERE_BITMASK_H
+
+namespace simdutf {
+namespace westmere {
+namespace {
+
+//
+// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
+//
+// For example, prefix_xor(00100100) == 00011100
+//
+simdutf_really_inline uint64_t prefix_xor(const uint64_t bitmask) {
+  // There should be no such thing with a processor supporting avx2
+  // but not clmul.
+  __m128i all_ones = _mm_set1_epi8('\xFF');
+  __m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0);
+  return _mm_cvtsi128_si64(result);
+}
+
+} // unnamed namespace
+} // namespace westmere
+} // namespace simdutf
+
+#endif // SIMDUTF_WESTMERE_BITMASK_H
+/* end file src/simdutf/westmere/bitmask.h */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd.h
+/* begin file src/simdutf/westmere/simd.h */
+#ifndef SIMDUTF_WESTMERE_SIMD_H
+#define SIMDUTF_WESTMERE_SIMD_H
+
+namespace simdutf {
+namespace westmere {
+namespace {
+namespace simd {
+
+  template<typename Child>
+  struct base {
+    __m128i value;
+
+    // Zero constructor
+    simdutf_really_inline base() : value{__m128i()} {}
+
+    // Conversion from SIMD register
+    simdutf_really_inline base(const __m128i _value) : value(_value) {}
+    // Conversion to SIMD register
+    simdutf_really_inline operator const __m128i&() const { return this->value; }
+    simdutf_really_inline operator __m128i&() { return this->value; }
+    simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi16(*this));
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), _mm_cvtepu8_epi16(_mm_srli_si128(*this,8)));
+    }
+    // Bit operations
+    simdutf_really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); }
+    simdutf_really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
+    simdutf_really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
+    simdutf_really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); }
+    simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
+    simdutf_really_inline Child& operator&=(const
Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast & other; return *this_cast; } + simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast(this); *this_cast = *this_cast ^ other; return *this_cast; } + }; + + // Forward-declared so they can be used by splat and friends. + template + struct simd8; + + template> + struct base8: base> { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdutf_really_inline T first() const { return _mm_extract_epi8(*this,0); } + simdutf_really_inline T last() const { return _mm_extract_epi8(*this,15); } + simdutf_really_inline base8() : base>() {} + simdutf_really_inline base8(const __m128i _value) : base>(_value) {} + + simdutf_really_inline Mask operator==(const simd8 other) const { return _mm_cmpeq_epi8(*this, other); } + + static const int SIZE = sizeof(base>::value); + + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return _mm_alignr_epi8(*this, prev_chunk, 16 - N); + } + }; + + // SIMD byte mask type (returned by things like eq and gt) + template<> + struct simd8: base8 { + static simdutf_really_inline simd8 splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); } + + simdutf_really_inline simd8() : base8() {} + simdutf_really_inline simd8(const __m128i _value) : base8(_value) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : base8(splat(_value)) {} + + simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); } + simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); } + simdutf_really_inline bool none() const { return _mm_testz_si128(*this, *this); } + simdutf_really_inline bool all() const { return _mm_movemask_epi8(*this) == 0xFFFF; } + simdutf_really_inline simd8 operator~() const { return *this ^ true; } + }; + + template + struct base8_numeric: base8 { + static simdutf_really_inline simd8 splat(T _value) { return _mm_set1_epi8(_value); } + static simdutf_really_inline simd8 zero() { return _mm_setzero_si128(); } + static simdutf_really_inline simd8 load(const T values[16]) { + return _mm_loadu_si128(reinterpret_cast(values)); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdutf_really_inline simd8 repeat_16( + T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + + simdutf_really_inline base8_numeric() : base8() {} + simdutf_really_inline base8_numeric(const __m128i _value) : base8(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); } + + // Override to distinguish from bool version + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 operator+(const simd8 other) const { return _mm_add_epi8(*this, other); } + simdutf_really_inline simd8 operator-(const simd8 other) const { return _mm_sub_epi8(*this, other); } + simdutf_really_inline simd8& operator+=(const simd8 other) { *this = *this + other; return *static_cast*>(this); } + simdutf_really_inline simd8& operator-=(const simd8 other) { *this = *this - other; return *static_cast*>(this); } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values) + template + 
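+    // (Aside: the UTF-8 validator later in this file drives tables like these
+    //  with three lookups -- the previous byte's high nibble, its low nibble,
+    //  and the current byte's high nibble -- then ANDs the three
+    //  classifications so that only byte pairs needing further checks remain
+    //  set. This is a sketch of the intent; the authoritative tables live in
+    //  the validation kernels themselves.)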
simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return _mm_shuffle_epi8(lookup_table, *this); + } + + template + simdutf_really_inline simd8 lookup_16( + L replace0, L replace1, L replace2, L replace3, + L replace4, L replace5, L replace6, L replace7, + L replace8, L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, + replace4, replace5, replace6, replace7, + replace8, replace9, replace10, replace11, + replace12, replace13, replace14, replace15 + )); + } + }; + + // Signed bytes + template<> + struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m128i _value) : base8_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) : simd8(_mm_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } + simdutf_really_inline operator simd8() const; + simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; } + + // Order-sensitive comparisons + simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm_max_epi8(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm_min_epi8(*this, other); } + simdutf_really_inline simd8 operator>(const simd8 other) const { return _mm_cmpgt_epi8(*this, other); } + simdutf_really_inline simd8 operator<(const simd8 other) const { return _mm_cmpgt_epi8(other, *this); } + }; + + // Unsigned bytes + template<> + struct simd8: base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m128i _value) : base8_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) : simd8(_mm_setr_epi8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + )) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15 + ) { + return simd8( + v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10,v11,v12,v13,v14,v15 + ); + } 
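+    // (Illustrative note: the gt_bits/lt_bits helpers below exploit unsigned
+    //  saturating subtraction: a.saturating_sub(b) is 0 when a <= b and the
+    //  positive difference when a > b, e.g. 0x90 - 0x80 = 0x10 while
+    //  0x10 - 0x40 saturates to 0x00. A zero/nonzero answer suffices for the
+    //  validators, and SSE has no unsigned byte compare, so the full -1/0
+    //  comparison is avoided.)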
+
+    // Saturated math
+    simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); }
+    simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm_subs_epu8(*this, other); }
+
+    // Order-specific operations
+    simdutf_really_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm_max_epu8(*this, other); }
+    simdutf_really_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm_min_epu8(*this, other); }
+    // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
+    simdutf_really_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
+    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+    simdutf_really_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
+    simdutf_really_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
+    simdutf_really_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
+    simdutf_really_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
+    simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
+
+    // Bit-specific operations
+    simdutf_really_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
+    simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
+    simdutf_really_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
+    simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
+    simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
+
+    simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
+    simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
+    simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm_testz_si128(*this, bits); }
+    simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
+    template<int N>
+    simdutf_really_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
+    template<int N>
+    simdutf_really_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
+    // Get one of the bits and make a bitmask out of it.
+    // e.g.
value.get_bit<7>() gets the high bit + template + simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); } + }; + simdutf_really_inline simd8::operator simd8() const { return this->value; } + + // Unsigned bytes + template<> + struct simd8: base { + static simdutf_really_inline simd8 splat(uint16_t _value) { return _mm_set1_epi16(_value); } + static simdutf_really_inline simd8 load(const uint16_t values[8]) { + return _mm_loadu_si128(reinterpret_cast(values)); + } + + simdutf_really_inline simd8() : base() {} + simdutf_really_inline simd8(const __m128i _value) : base(_value) {} + // Splat constructor + simdutf_really_inline simd8(uint16_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint16_t* values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8( + uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7 + ) : simd8(_mm_setr_epi16( + v0, v1, v2, v3, v4, v5, v6, v7 + )) {} + + // Saturated math + simdutf_really_inline simd8 saturating_add(const simd8 other) const { return _mm_adds_epu16(*this, other); } + simdutf_really_inline simd8 saturating_sub(const simd8 other) const { return _mm_subs_epu16(*this, other); } + + // Order-specific operations + simdutf_really_inline simd8 max_val(const simd8 other) const { return _mm_max_epu16(*this, other); } + simdutf_really_inline simd8 min_val(const simd8 other) const { return _mm_min_epu16(*this, other); } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 gt_bits(const simd8 other) const { return this->saturating_sub(other); } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 lt_bits(const simd8 other) const { return other.saturating_sub(*this); } + simdutf_really_inline simd8 operator<=(const simd8 other) const { return other.max_val(*this) == other; } + simdutf_really_inline simd8 operator>=(const simd8 other) const { return other.min_val(*this) == other; } + simdutf_really_inline simd8 operator==(const simd8 other) const { return _mm_cmpeq_epi16(*this, other); } + simdutf_really_inline simd8 operator&(const simd8 other) const { return _mm_and_si128(*this, other); } + simdutf_really_inline simd8 operator|(const simd8 other) const { return _mm_or_si128(*this, other); } + + // Bit-specific operations + simdutf_really_inline simd8 bits_not_set() const { return *this == uint16_t(0); } + simdutf_really_inline simd8 any_bits_set() const { return ~this->bits_not_set(); } + + simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } + simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { return _mm_testz_si128(*this, bits); } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { return !bits_not_set_anywhere(bits); } + }; + template + struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 
chunk1, const simd8 chunk2, const simd8 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8::load(ptr), simd8::load(ptr+sizeof(simd8)/sizeof(T)), simd8::load(ptr+2*sizeof(simd8)/sizeof(T)), simd8::load(ptr+3*sizeof(simd8)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd8)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd8)*1/sizeof(T)); + this->chunks[2].store(ptr+sizeof(simd8)*2/sizeof(T)); + this->chunks[3].store(ptr+sizeof(simd8)*3/sizeof(T)); + } + + simdutf_really_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } + + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd8)*0); + this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd8)*1); + this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd8)*2); + this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd8)*3); + } + + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() ); + uint64_t r1 = this->chunks[1].to_bitmask() ; + uint64_t r2 = this->chunks[2].to_bitmask() ; + uint64_t r3 = this->chunks[3].to_bitmask() ; + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] == mask, + this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t eq(const simd8x64 &other) const { + return simd8x64( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1], + this->chunks[2] == other.chunks[2], + this->chunks[3] == other.chunks[3] + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + + return simd8x64( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low-1); + const simd8 mask_high = simd8::splat(high+1); + return simd8x64( + (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), + (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low), + (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low), + (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] < mask, + this->chunks[1] < mask, + this->chunks[2] < mask, + this->chunks[3] < mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + this->chunks[0] > mask, + this->chunks[1] > mask, + 
this->chunks[2] > mask,
+      this->chunks[3] > mask
+    ).to_bitmask();
+  }
+  simdutf_really_inline uint64_t gteq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(
+      this->chunks[0] >= mask,
+      this->chunks[1] >= mask,
+      this->chunks[2] >= mask,
+      this->chunks[3] >= mask
+    ).to_bitmask();
+  }
+  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
+    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+    return simd8x64<bool>(
+      simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
+      simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
+      simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
+      simd8<uint8_t>(__m128i(this->chunks[3])) >= mask
+    ).to_bitmask();
+  }
+}; // struct simd8x64
+
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd16-inl.h
+/* begin file src/simdutf/westmere/simd16-inl.h */
+template<typename T>
+struct simd16;
+
+template<typename T, typename Mask=simd16<bool>>
+struct base16: base<simd16<T>> {
+  typedef uint16_t bitmask_t;
+  typedef uint32_t bitmask2_t;
+
+  simdutf_really_inline base16() : base<simd16<T>>() {}
+  simdutf_really_inline base16(const __m128i _value) : base<simd16<T>>(_value) {}
+  template<typename Pointer>
+  simdutf_really_inline base16(const Pointer* ptr) : base16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr))) {}
+
+  simdutf_really_inline Mask operator==(const simd16<T> other) const { return _mm_cmpeq_epi16(*this, other); }
+
+  static const int SIZE = sizeof(base<simd16<T>>::value);
+
+  template<int N=1>
+  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+    return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
+  }
+};
+
+// SIMD byte mask type (returned by things like eq and gt)
+template<>
+struct simd16<bool>: base16<bool> {
+  static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm_set1_epi16(uint16_t(-(!!_value))); }
+
+  simdutf_really_inline simd16() : base16() {}
+  simdutf_really_inline simd16(const __m128i _value) : base16<bool>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}
+
+  simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
+  simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
+  simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
+};
+
+template<typename T>
+struct base16_numeric: base16<T> {
+  static simdutf_really_inline simd16<T> splat(T _value) { return _mm_set1_epi16(_value); }
+  static simdutf_really_inline simd16<T> zero() { return _mm_setzero_si128(); }
+  static simdutf_really_inline simd16<T> load(const T values[8]) {
+    return _mm_loadu_si128(reinterpret_cast<const __m128i*>(values));
+  }
+
+  simdutf_really_inline base16_numeric() : base16<T>() {}
+  simdutf_really_inline base16_numeric(const __m128i _value) : base16<T>(_value) {}
+
+  // Store to array
+  simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
+
+  // Override to distinguish from bool version
+  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
+
+  // Addition/subtraction are the same for signed and unsigned
+  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm_add_epi16(*this, other); }
+  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm_sub_epi16(*this, other); }
+  simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
+  simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
+};
+
+// Signed words
+template<>
+struct simd16<int16_t> : base16_numeric<int16_t> {
+  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
+  simdutf_really_inline simd16(const __m128i _value) : base16_numeric<int16_t>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
+  // Member-by-member initialization
+  simdutf_really_inline simd16(
+    int16_t v0, int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7)
+    : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+  simdutf_really_inline operator simd16<uint16_t>() const;
+
+  // Order-sensitive comparisons
+  simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm_max_epi16(*this, other); }
+  simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm_min_epi16(*this, other); }
+  simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(*this, other); }
+  simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(other, *this); }
+};
+
+// Unsigned words
+template<>
+struct simd16<uint16_t>: base16_numeric<uint16_t> {
+  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
+  simdutf_really_inline simd16(const __m128i _value) : base16_numeric<uint16_t>(_value) {}
+
+  // Splat constructor
+  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
+  // Member-by-member initialization
+  simdutf_really_inline simd16(
+    uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
+    : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+  // Repeat 16 values as many times as necessary (usually for lookup tables)
+  simdutf_really_inline static simd16<uint16_t> repeat_16(
+    uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7
+  ) {
+    return simd16<uint16_t>(v0, v1, v2, v3, v4, v5, v6, v7);
+  }
+
+  // Saturated math
+  simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
+  simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
+
+  // Order-specific operations
+  simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm_max_epu16(*this, other); }
+  simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm_min_epu16(*this, other); }
+  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
+  simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
+  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+  simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
+  simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
+  simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
+  simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
+  simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->lt_bits(other).any_bits_set(); }
+
+  // Bit-specific operations
+  simdutf_really_inline simd16<bool> bits_not_set() const { return
*this == uint16_t(0); } + simdutf_really_inline simd16 bits_not_set(simd16 bits) const { return (*this & bits).bits_not_set(); } + simdutf_really_inline simd16 any_bits_set() const { return ~this->bits_not_set(); } + simdutf_really_inline simd16 any_bits_set(simd16 bits) const { return ~this->bits_not_set(bits); } + + simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); } + simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); } + simdutf_really_inline bool bits_not_set_anywhere(simd16 bits) const { return _mm_testz_si128(*this, bits); } + simdutf_really_inline bool any_bits_set_anywhere(simd16 bits) const { return !bits_not_set_anywhere(bits); } + template + simdutf_really_inline simd16 shr() const { return simd16(_mm_srli_epi16(*this, N)); } + template + simdutf_really_inline simd16 shl() const { return simd16(_mm_slli_epi16(*this, N)); } + // Get one of the bits and make a bitmask out of it. + // e.g. value.get_bit<7>() gets the high bit + template + simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); } + + // Pack with the unsigned saturation two uint16_t words into single uint8_t vector + static simdutf_really_inline simd8 pack(const simd16& v0, const simd16& v1) { + return _mm_packus_epi16(v0, v1); + } +}; +simdutf_really_inline simd16::operator simd16() const { return this->value; } + +template + struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block."); + const simd16 chunks[NUM_CHUNKS]; + + simd16x32(const simd16x32& o) = delete; // no copy allowed + simd16x32& operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed + + simdutf_really_inline simd16x32(const simd16 chunk0, const simd16 chunk1, const simd16 chunk2, const simd16 chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16::load(ptr), simd16::load(ptr+sizeof(simd16)/sizeof(T)), simd16::load(ptr+2*sizeof(simd16)/sizeof(T)), simd16::load(ptr+3*sizeof(simd16)/sizeof(T))} {} + + simdutf_really_inline void store(T* ptr) const { + this->chunks[0].store(ptr+sizeof(simd16)*0/sizeof(T)); + this->chunks[1].store(ptr+sizeof(simd16)*1/sizeof(T)); + this->chunks[2].store(ptr+sizeof(simd16)*2/sizeof(T)); + this->chunks[3].store(ptr+sizeof(simd16)*3/sizeof(T)); + } + + simdutf_really_inline simd16 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]); + } + + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } + + simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16)*0); + this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16)*1); + this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16)*2); + this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16)*3); + } + + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() ); + uint64_t r1 = this->chunks[1].to_bitmask() ; + uint64_t r2 = this->chunks[2].to_bitmask() ; + uint64_t r3 = this->chunks[3].to_bitmask() ; + return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + } + + simdutf_really_inline uint64_t eq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] == mask, + 
this->chunks[1] == mask, + this->chunks[2] == mask, + this->chunks[3] == mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t eq(const simd16x32 &other) const { + return simd16x32( + this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1], + this->chunks[2] == other.chunks[2], + this->chunks[3] == other.chunks[3] + ).to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] <= mask, + this->chunks[1] <= mask, + this->chunks[2] <= mask, + this->chunks[3] <= mask + ).to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + + return simd16x32( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(static_cast(low-1)); + const simd16 mask_high = simd16::splat(static_cast(high+1)); + return simd16x32( + (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), + (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low), + (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low), + (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low) + ).to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32( + this->chunks[0] < mask, + this->chunks[1] < mask, + this->chunks[2] < mask, + this->chunks[3] < mask + ).to_bitmask(); + } + }; // struct simd16x32 +/* end file src/simdutf/westmere/simd16-inl.h */ + +} // namespace simd +} // unnamed namespace +} // namespace westmere +} // namespace simdutf + +#endif // SIMDUTF_WESTMERE_SIMD_INPUT_H +/* end file src/simdutf/westmere/simd.h */ + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h +/* begin file src/simdutf/westmere/end.h */ +SIMDUTF_UNTARGET_REGION +/* end file src/simdutf/westmere/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_WESTMERE +#endif // SIMDUTF_WESTMERE_COMMON_H +/* end file src/simdutf/westmere.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64.h +/* begin file src/simdutf/ppc64.h */ +#ifndef SIMDUTF_PPC64_H +#define SIMDUTF_PPC64_H + +#ifdef SIMDUTF_FALLBACK_H +#error "ppc64.h must be included before fallback.h" +#endif + + +#ifndef SIMDUTF_IMPLEMENTATION_PPC64 +#define SIMDUTF_IMPLEMENTATION_PPC64 (SIMDUTF_IS_PPC64) +#endif +#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 SIMDUTF_IMPLEMENTATION_PPC64 && SIMDUTF_IS_PPC64 + + + +#if SIMDUTF_IMPLEMENTATION_PPC64 + +namespace simdutf { +/** + * Implementation for ALTIVEC (PPC64). 
+ */ +namespace ppc64 { +} // namespace ppc64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/implementation.h +/* begin file src/simdutf/ppc64/implementation.h */ +#ifndef SIMDUTF_PPC64_IMPLEMENTATION_H +#define SIMDUTF_PPC64_IMPLEMENTATION_H + + +namespace simdutf { +namespace ppc64 { + +namespace { +using namespace simdutf; +} // namespace + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() + : simdutf::implementation("ppc64", "PPC64 ALTIVEC", + internal::instruction_set::ALTIVEC) {} + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t count_utf16(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace ppc64 +} // namespace simdutf + +#endif // SIMDUTF_PPC64_IMPLEMENTATION_H +/* end file src/simdutf/ppc64/implementation.h */ + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h +/* begin file src/simdutf/ppc64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "ppc64" +// #define SIMDUTF_IMPLEMENTATION ppc64 +/* end file src/simdutf/ppc64/begin.h */ + +// Declarations +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/intrinsics.h +/* begin file src/simdutf/ppc64/intrinsics.h */ +#ifndef SIMDUTF_PPC64_INTRINSICS_H +#define SIMDUTF_PPC64_INTRINSICS_H + + +// This should be the correct header whether +// you use visual studio or other compilers. +#include + +// These are defined by altivec.h in GCC toolchain, it is safe to undef them. +#ifdef bool +#undef bool +#endif + +#ifdef vector +#undef vector +#endif + +#endif // SIMDUTF_PPC64_INTRINSICS_H +/* end file src/simdutf/ppc64/intrinsics.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/bitmanipulation.h +/* begin file src/simdutf/ppc64/bitmanipulation.h */ +#ifndef SIMDUTF_PPC64_BITMANIPULATION_H +#define SIMDUTF_PPC64_BITMANIPULATION_H + +namespace simdutf { +namespace ppc64 { +namespace { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. +NO_SANITIZE_UNDEFINED +simdutf_really_inline int trailing_zeroes(uint64_t input_num) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + unsigned long ret; + // Search the mask data from least significant bit (LSB) + // to the most significant bit (MSB) for a set bit (1). 
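+  // For example, trailing_zeroes(0b1000) is 3: the three lowest bits are
+  // clear and bit 3 is the least significant set bit.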
+ _BitScanForward64(&ret, input_num); + return (int)ret; +#else // SIMDUTF_REGULAR_VISUAL_STUDIO + return __builtin_ctzll(input_num); +#endif // SIMDUTF_REGULAR_VISUAL_STUDIO +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline uint64_t clear_lowest_bit(uint64_t input_num) { + return input_num & (input_num - 1); +} + +/* result might be undefined when input_num is zero */ +simdutf_really_inline int leading_zeroes(uint64_t input_num) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif // SIMDUTF_REGULAR_VISUAL_STUDIO +} + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO +simdutf_really_inline int count_ones(uint64_t input_num) { + // note: we do not support legacy 32-bit Windows + return __popcnt64(input_num); // Visual Studio wants two underscores +} +#else +simdutf_really_inline int count_ones(uint64_t input_num) { + return __builtin_popcountll(input_num); +} +#endif + +simdutf_really_inline bool add_overflow(uint64_t value1, uint64_t value2, + uint64_t *result) { +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + *result = value1 + value2; + return *result < value1; +#else + return __builtin_uaddll_overflow(value1, value2, + reinterpret_cast(result)); +#endif +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf + +#endif // SIMDUTF_PPC64_BITMANIPULATION_H +/* end file src/simdutf/ppc64/bitmanipulation.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/bitmask.h +/* begin file src/simdutf/ppc64/bitmask.h */ +#ifndef SIMDUTF_PPC64_BITMASK_H +#define SIMDUTF_PPC64_BITMASK_H + +namespace simdutf { +namespace ppc64 { +namespace { + +// +// Perform a "cumulative bitwise xor," flipping bits each time a 1 is +// encountered. +// +// For example, prefix_xor(00100100) == 00011100 +// +simdutf_really_inline uint64_t prefix_xor(uint64_t bitmask) { + // You can use the version below, however gcc sometimes miscompiles + // vec_pmsum_be, it happens somewhere around between 8 and 9th version. + // The performance boost was not noticeable, falling back to a usual + // implementation. + // __vector unsigned long long all_ones = {~0ull, ~0ull}; + // __vector unsigned long long mask = {bitmask, 0}; + // // Clang and GCC return different values for pmsum for ull so cast it to one. + // // Generally it is not specified by ALTIVEC ISA what is returned by + // // vec_pmsum_be. 
+ // #if defined(__LITTLE_ENDIAN__) + // return (uint64_t)(((__vector unsigned long long)vec_pmsum_be(all_ones, mask))[0]); + // #else + // return (uint64_t)(((__vector unsigned long long)vec_pmsum_be(all_ones, mask))[1]); + // #endif + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + return bitmask; +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf + +#endif +/* end file src/simdutf/ppc64/bitmask.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/simd.h +/* begin file src/simdutf/ppc64/simd.h */ +#ifndef SIMDUTF_PPC64_SIMD_H +#define SIMDUTF_PPC64_SIMD_H + +#include + +namespace simdutf { +namespace ppc64 { +namespace { +namespace simd { + +using __m128i = __vector unsigned char; + +template struct base { + __m128i value; + + // Zero constructor + simdutf_really_inline base() : value{__m128i()} {} + + // Conversion from SIMD register + simdutf_really_inline base(const __m128i _value) : value(_value) {} + + // Conversion to SIMD register + simdutf_really_inline operator const __m128i &() const { + return this->value; + } + simdutf_really_inline operator __m128i &() { return this->value; } + + // Bit operations + simdutf_really_inline Child operator|(const Child other) const { + return vec_or(this->value, (__m128i)other); + } + simdutf_really_inline Child operator&(const Child other) const { + return vec_and(this->value, (__m128i)other); + } + simdutf_really_inline Child operator^(const Child other) const { + return vec_xor(this->value, (__m128i)other); + } + simdutf_really_inline Child bit_andnot(const Child other) const { + return vec_andc(this->value, (__m128i)other); + } + simdutf_really_inline Child &operator|=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast | other; + return *this_cast; + } + simdutf_really_inline Child &operator&=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast & other; + return *this_cast; + } + simdutf_really_inline Child &operator^=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast ^ other; + return *this_cast; + } +}; + +// Forward-declared so they can be used by splat and friends. 
+template struct simd8; + +template > +struct base8 : base> { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdutf_really_inline base8() : base>() {} + simdutf_really_inline base8(const __m128i _value) : base>(_value) {} + + simdutf_really_inline Mask operator==(const simd8 other) const { + return (__m128i)vec_cmpeq(this->value, (__m128i)other); + } + + static const int SIZE = sizeof(base>::value); + + template + simdutf_really_inline simd8 prev(simd8 prev_chunk) const { + __m128i chunk = this->value; +#ifdef __LITTLE_ENDIAN__ + chunk = (__m128i)vec_reve(this->value); + prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk); +#endif + chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N); +#ifdef __LITTLE_ENDIAN__ + chunk = (__m128i)vec_reve((__m128i)chunk); +#endif + return chunk; + } +}; + +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd8 : base8 { + static simdutf_really_inline simd8 splat(bool _value) { + return (__m128i)vec_splats((unsigned char)(-(!!_value))); + } + + simdutf_really_inline simd8() : base8() {} + simdutf_really_inline simd8(const __m128i _value) + : base8(_value) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) + : base8(splat(_value)) {} + + simdutf_really_inline int to_bitmask() const { + __vector unsigned long long result; + const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, + 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00}; + + result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value, + (__m128i)perm_mask)); +#ifdef __LITTLE_ENDIAN__ + return static_cast(result[1]); +#else + return static_cast(result[0]); +#endif + } + simdutf_really_inline bool any() const { + return !vec_all_eq(this->value, (__m128i)vec_splats(0)); + } + simdutf_really_inline simd8 operator~() const { + return this->value ^ (__m128i)splat(true); + } +}; + +template struct base8_numeric : base8 { + static simdutf_really_inline simd8 splat(T value) { + (void)value; + return (__m128i)vec_splats(value); + } + static simdutf_really_inline simd8 zero() { return splat(0); } + static simdutf_really_inline simd8 load(const T values[16]) { + return (__m128i)(vec_vsx_ld(0, reinterpret_cast(values))); + } + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdutf_really_inline simd8 repeat_16(T v0, T v1, T v2, T v3, T v4, + T v5, T v6, T v7, T v8, T v9, + T v10, T v11, T v12, T v13, + T v14, T v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15); + } + + simdutf_really_inline base8_numeric() : base8() {} + simdutf_really_inline base8_numeric(const __m128i _value) + : base8(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[16]) const { + vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst)); + } + + // Override to distinguish from bool version + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 operator+(const simd8 other) const { + return (__m128i)((__m128i)this->value + (__m128i)other); + } + simdutf_really_inline simd8 operator-(const simd8 other) const { + return (__m128i)((__m128i)this->value - (__m128i)other); + } + simdutf_really_inline simd8 &operator+=(const simd8 other) { + *this = *this + other; + return *static_cast *>(this); + } + simdutf_really_inline simd8 &operator-=(const simd8 other) { + *this = *this - other; + return *static_cast *>(this); + } 
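+
+  // A scalar sketch, for exposition only, of what base8::prev<N> (defined
+  // above) computes: output byte i is input byte i-N, with the first N
+  // bytes pulled from the tail of the previous 16-byte chunk. The UTF-8
+  // validation kernels rely on this to examine each byte together with its
+  // predecessors across chunk boundaries. With prev_bytes/cur_bytes standing
+  // for the raw bytes of the two chunks:
+  //
+  //   uint8_t out[16];  // models: simd8<uint8_t> out = cur.prev<N>(prev_chunk)
+  //   for (int i = 0; i < 16; i++)
+  //     out[i] = (i < N) ? prev_bytes[16 - N + i] : cur_bytes[i - N];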
+ + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value); + } + + template + simdutf_really_inline simd8 + lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, + L replace5, L replace6, L replace7, L replace8, L replace9, + L replace10, L replace11, L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } +}; + +// Signed bytes +template <> struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m128i _value) + : base8_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, + v15}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 + repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, + int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } + + // Order-sensitive comparisons + simdutf_really_inline simd8 + max_val(const simd8 other) const { + return (__m128i)vec_max((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdutf_really_inline simd8 + min_val(const simd8 other) const { + return (__m128i)vec_min((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdutf_really_inline simd8 + operator>(const simd8 other) const { + return (__m128i)vec_cmpgt((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } + simdutf_really_inline simd8 + operator<(const simd8 other) const { + return (__m128i)vec_cmplt((__vector signed char)this->value, + (__vector signed char)(__m128i)other); + } +}; + +// Unsigned bytes +template <> struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m128i _value) + : base8_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline + simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, + uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static 
simd8 + repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, + uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, + uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, + uint8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } + + // Saturated math + simdutf_really_inline simd8 + saturating_add(const simd8 other) const { + return (__m128i)vec_adds(this->value, (__m128i)other); + } + simdutf_really_inline simd8 + saturating_sub(const simd8 other) const { + return (__m128i)vec_subs(this->value, (__m128i)other); + } + + // Order-specific operations + simdutf_really_inline simd8 + max_val(const simd8 other) const { + return (__m128i)vec_max(this->value, (__m128i)other); + } + simdutf_really_inline simd8 + min_val(const simd8 other) const { + return (__m128i)vec_min(this->value, (__m128i)other); + } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 + gt_bits(const simd8 other) const { + return this->saturating_sub(other); + } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 + lt_bits(const simd8 other) const { + return other.saturating_sub(*this); + } + simdutf_really_inline simd8 + operator<=(const simd8 other) const { + return other.max_val(*this) == other; + } + simdutf_really_inline simd8 + operator>=(const simd8 other) const { + return other.min_val(*this) == other; + } + simdutf_really_inline simd8 + operator>(const simd8 other) const { + return this->gt_bits(other).any_bits_set(); + } + simdutf_really_inline simd8 + operator<(const simd8 other) const { + return this->gt_bits(other).any_bits_set(); + } + + // Bit-specific operations + simdutf_really_inline simd8 bits_not_set() const { + return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0))); + } + simdutf_really_inline simd8 bits_not_set(simd8 bits) const { + return (*this & bits).bits_not_set(); + } + simdutf_really_inline simd8 any_bits_set() const { + return ~this->bits_not_set(); + } + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { + return ~this->bits_not_set(bits); + } + + simdutf_really_inline bool is_ascii() const { + return this->saturating_sub(0b01111111u).bits_not_set_anywhere(); + } + + simdutf_really_inline bool bits_not_set_anywhere() const { + return vec_all_eq(this->value, (__m128i)vec_splats(0)); + } + simdutf_really_inline bool any_bits_set_anywhere() const { + return !bits_not_set_anywhere(); + } + simdutf_really_inline bool bits_not_set_anywhere(simd8 bits) const { + return vec_all_eq(vec_and(this->value, (__m128i)bits), + (__m128i)vec_splats(0)); + } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { + return !bits_not_set_anywhere(bits); + } + template simdutf_really_inline simd8 shr() const { + return simd8( + (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N))); + } + template simdutf_really_inline simd8 shl() const { + return simd8( + (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N))); + } +}; + +template struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, + "PPC64 kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64 &o) = delete; // no copy allowed + simd8x64 & + operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + simdutf_really_inline simd8x64(const simd8 chunk0, 
                                 const simd8<T> chunk1,
+                                 const simd8<T> chunk2, const simd8<T> chunk3)
+    : chunks{chunk0, chunk1, chunk2, chunk3} {}
+
+  simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
+
+  simdutf_really_inline void store(T* ptr) const {
+    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0/sizeof(T));
+    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1/sizeof(T));
+    this->chunks[2].store(ptr + sizeof(simd8<T>) * 2/sizeof(T));
+    this->chunks[3].store(ptr + sizeof(simd8<T>) * 3/sizeof(T));
+  }
+
+  simdutf_really_inline simd8<T> reduce_or() const {
+    return (this->chunks[0] | this->chunks[1]) |
+           (this->chunks[2] | this->chunks[3]);
+  }
+
+
+  simdutf_really_inline bool is_ascii() const {
+    return this->reduce_or().is_ascii();
+  }
+
+  simdutf_really_inline uint64_t to_bitmask() const {
+    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
+    uint64_t r1 = this->chunks[1].to_bitmask();
+    uint64_t r2 = this->chunks[2].to_bitmask();
+    uint64_t r3 = this->chunks[3].to_bitmask();
+    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
+  }
+
+  simdutf_really_inline uint64_t eq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
+                          this->chunks[2] == mask, this->chunks[3] == mask)
+        .to_bitmask();
+  }
+
+  simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
+    return simd8x64<bool>(this->chunks[0] == other.chunks[0],
+                          this->chunks[1] == other.chunks[1],
+                          this->chunks[2] == other.chunks[2],
+                          this->chunks[3] == other.chunks[3])
+        .to_bitmask();
+  }
+
+  simdutf_really_inline uint64_t lteq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
+                          this->chunks[2] <= mask, this->chunks[3] <= mask)
+        .to_bitmask();
+  }
+
+  simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+    const simd8<T> mask_low = simd8<T>::splat(low);
+    const simd8<T> mask_high = simd8<T>::splat(high);
+
+    return simd8x64<bool>(
+        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
+    ).to_bitmask();
+  }
+  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+    const simd8<T> mask_low = simd8<T>::splat(low);
+    const simd8<T> mask_high = simd8<T>::splat(high);
+    return simd8x64<bool>(
+        (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+        (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+        (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+        (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
+    ).to_bitmask();
+  }
+  simdutf_really_inline uint64_t lt(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
+                          this->chunks[2] < mask, this->chunks[3] < mask)
+        .to_bitmask();
+  }
+
+  simdutf_really_inline uint64_t gt(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(
+        this->chunks[0] > mask,
+        this->chunks[1] > mask,
+        this->chunks[2] > mask,
+        this->chunks[3] > mask
+    ).to_bitmask();
+  }
+  simdutf_really_inline uint64_t gteq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(
+        this->chunks[0] >= mask,
+        this->chunks[1] >= mask,
+        this->chunks[2] >= mask,
+        this->chunks[3] >= mask
+    ).to_bitmask();
+  }
+
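+  // A sketch, for exposition, of the bitmask convention shared by eq(),
+  // lteq(), gt() and friends above: bit i of the returned uint64_t
+  // corresponds to byte i of the 64-byte block, which is why to_bitmask()
+  // stitches the four 16-bit chunk masks together with shifts. A
+  // hypothetical caller checking a block for ASCII would do:
+  //
+  //   simd8x64<uint8_t> block(ptr);           // load 64 bytes from ptr
+  //   uint64_t non_ascii = block.gteq(0x80);  // bit i set iff byte i >= 0x80
+  //   if (non_ascii == 0) { /* the whole 64-byte block is ASCII */ }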
simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64( + simd8(this->chunks[0]) >= mask, + simd8(this->chunks[1]) >= mask, + simd8(this->chunks[2]) >= mask, + simd8(this->chunks[3]) >= mask + ).to_bitmask(); + } +}; // struct simd8x64 + +} // namespace simd +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf + +#endif // SIMDUTF_PPC64_SIMD_INPUT_H +/* end file src/simdutf/ppc64/simd.h */ + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h +/* begin file src/simdutf/ppc64/end.h */ +/* end file src/simdutf/ppc64/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_PPC64 + +#endif // SIMDUTF_PPC64_H +/* end file src/simdutf/ppc64.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback.h +/* begin file src/simdutf/fallback.h */ +#ifndef SIMDUTF_FALLBACK_H +#define SIMDUTF_FALLBACK_H + + +// Default Fallback to on unless a builtin implementation has already been selected. +#ifndef SIMDUTF_IMPLEMENTATION_FALLBACK +#define SIMDUTF_IMPLEMENTATION_FALLBACK 1 // (!SIMDUTF_CAN_ALWAYS_RUN_ARM64 && !SIMDUTF_CAN_ALWAYS_RUN_HASWELL && !SIMDUTF_CAN_ALWAYS_RUN_WESTMERE && !SIMDUTF_CAN_ALWAYS_RUN_PPC64) +#endif +#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK SIMDUTF_IMPLEMENTATION_FALLBACK + +#if SIMDUTF_IMPLEMENTATION_FALLBACK + +namespace simdutf { +/** + * Fallback implementation (runs on any machine). + */ +namespace fallback { +} // namespace fallback +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/implementation.h +/* begin file src/simdutf/fallback/implementation.h */ +#ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H +#define SIMDUTF_FALLBACK_IMPLEMENTATION_H + + +namespace simdutf { +namespace fallback { + +namespace { +using namespace simdutf; +} + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() : simdutf::implementation( + "fallback", + "Generic fallback implementation", + 0 + ) {} + simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final; + simdutf_warn_unused size_t count_utf16(const char16_t * buf, size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept; + simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept; + simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept; +}; + +} // namespace fallback +} // namespace simdutf + +#endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H +/* end file src/simdutf/fallback/implementation.h */ + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h +/* begin file src/simdutf/fallback/begin.h */ +// redefining 
SIMDUTF_IMPLEMENTATION to "fallback" +// #define SIMDUTF_IMPLEMENTATION fallback +/* end file src/simdutf/fallback/begin.h */ + +// Declarations +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/bitmanipulation.h +/* begin file src/simdutf/fallback/bitmanipulation.h */ +#ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H +#define SIMDUTF_FALLBACK_BITMANIPULATION_H + +#include + +namespace simdutf { +namespace fallback { +namespace { + +#if defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_X64) +static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) { + unsigned long x0 = (unsigned long)x, top, bottom; + _BitScanForward(&top, (unsigned long)(x >> 32)); + _BitScanForward(&bottom, x0); + *ret = x0 ? bottom : 32 + top; + return x != 0; +} +static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) { + unsigned long x1 = (unsigned long)(x >> 32), top, bottom; + _BitScanReverse(&top, x1); + _BitScanReverse(&bottom, (unsigned long)x); + *ret = x1 ? top + 32 : bottom; + return x != 0; +} +#endif + +/* result might be undefined when input_num is zero */ +simdutf_really_inline int leading_zeroes(uint64_t input_num) { +#ifdef _MSC_VER + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1). + if (_BitScanReverse64(&leading_zero, input_num)) + return (int)(63 - leading_zero); + else + return 64; +#else + return __builtin_clzll(input_num); +#endif// _MSC_VER +} + +} // unnamed namespace +} // namespace fallback +} // namespace simdutf + +#endif // SIMDUTF_FALLBACK_BITMANIPULATION_H +/* end file src/simdutf/fallback/bitmanipulation.h */ + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h +/* begin file src/simdutf/fallback/end.h */ +/* end file src/simdutf/fallback/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_FALLBACK +#endif // SIMDUTF_FALLBACK_H +/* end file src/simdutf/fallback.h */ + + +namespace simdutf { +bool implementation::supported_by_runtime_system() const { + uint32_t required_instruction_sets = this->required_instruction_sets(); + uint32_t supported_instruction_sets = internal::detect_supported_architectures(); + return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets); +} + +simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char * input, size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } + // UTF8 is common, it includes ASCII, and is commonly represented + // without a BOM, so if it fits, go with that. Note that it is still + // possible to get it wrong, we are only 'guessing'. If some has UTF-16 + // data without a BOM, it could pass as UTF-8. + // + // An interesting twist might be to check for UTF-16 ASCII first (every + // other byte is zero). + if(validate_utf8(input, length)) { return encoding_type::UTF8; } + // The next most common encoding that might appear without BOM is probably + // UTF-16LE, so try that next. + if((length % 2) == 0) { + if(validate_utf16(reinterpret_cast(input), length)) { return encoding_type::UTF16_LE; } + } + return encoding_type::unspecified; +} + +namespace internal { + +// Static array of known implementations. We're hoping these get baked into the executable +// without requiring a static initializer. 
+ +#if SIMDUTF_IMPLEMENTATION_HASWELL +const haswell::implementation haswell_singleton{}; +#endif +#if SIMDUTF_IMPLEMENTATION_WESTMERE +const westmere::implementation westmere_singleton{}; +#endif // SIMDUTF_IMPLEMENTATION_WESTMERE +#if SIMDUTF_IMPLEMENTATION_ARM64 +const arm64::implementation arm64_singleton{}; +#endif // SIMDUTF_IMPLEMENTATION_ARM64 +#if SIMDUTF_IMPLEMENTATION_PPC64 +const ppc64::implementation ppc64_singleton{}; +#endif // SIMDUTF_IMPLEMENTATION_PPC64 +#if SIMDUTF_IMPLEMENTATION_FALLBACK +const fallback::implementation fallback_singleton{}; +#endif // SIMDUTF_IMPLEMENTATION_FALLBACK + +/** + * @private Detects best supported implementation on first use, and sets it + */ +class detect_best_supported_implementation_on_first_use final : public implementation { +public: + const std::string &name() const noexcept final { return set_best()->name(); } + const std::string &description() const noexcept final { return set_best()->description(); } + uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); } + + simdutf_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override { + return set_best()->validate_utf8(buf, len); + } + + simdutf_warn_unused bool validate_utf16(const char16_t * buf, size_t len) const noexcept final override { + return set_best()->validate_utf16(buf, len); + } + + simdutf_warn_unused size_t convert_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_utf8_to_utf16(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { + return set_best()->convert_valid_utf8_to_utf16(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { + return set_best()->convert_utf16_to_utf8(buf, len, utf8_output); + } + + simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override { + return set_best()->convert_valid_utf16_to_utf8(buf, len, utf8_output); + } + + simdutf_warn_unused size_t count_utf16(const char16_t * buf, size_t len) const noexcept final override { + return set_best()->count_utf16(buf, len); + } + + simdutf_warn_unused size_t count_utf8(const char * buf, size_t len) const noexcept final override { + return set_best()->count_utf8(buf, len); + } + + simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * buf, size_t len) const noexcept override { + return set_best()->utf8_length_from_utf16(buf, len); + } + + simdutf_warn_unused size_t utf16_length_from_utf8(const char * buf, size_t len) const noexcept override { + return set_best()->utf16_length_from_utf8(buf, len); + } + + simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {} + +private: + const implementation *set_best() const noexcept; +}; + +const detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton; + +const std::initializer_list available_implementation_pointers { +#if SIMDUTF_IMPLEMENTATION_HASWELL + &haswell_singleton, +#endif +#if SIMDUTF_IMPLEMENTATION_WESTMERE + &westmere_singleton, +#endif +#if SIMDUTF_IMPLEMENTATION_ARM64 + &arm64_singleton, 
+#endif +#if SIMDUTF_IMPLEMENTATION_PPC64 + &ppc64_singleton, +#endif +#if SIMDUTF_IMPLEMENTATION_FALLBACK + &fallback_singleton, +#endif +}; // available_implementation_pointers + +// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support +class unsupported_implementation final : public implementation { +public: + simdutf_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override { + return false; // Just refuse to validate. Given that we have a fallback implementation + // it seems unlikely that unsupported_implementation will ever be used. If it is used, + // then it will flag all strings as invalid. The alternative is to return an error_code + // from which the user has to figure out whether the string is valid UTF-8... which seems + // like a lot of work just to handle the very unlikely case that we have an unsupported + // implementation. And, when it does happen (that we have an unsupported implementation), + // what are the chances that the programmer has a fallback? Given that *we* provide the + // fallback, it implies that the programmer would need a fallback for our fallback. + } + + simdutf_warn_unused bool validate_utf16(const char16_t*, size_t) const noexcept final override { + return false; + } + + simdutf_warn_unused size_t convert_utf8_to_utf16(const char*, size_t, char16_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char*, size_t, char16_t*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t*, size_t, char*) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t count_utf16(const char16_t *, size_t) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t count_utf8(const char *, size_t) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t utf16_length_from_utf8(const char *, size_t) const noexcept override { + return 0; + } + + unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {} +}; + +const unsupported_implementation unsupported_singleton{}; + +size_t available_implementation_list::size() const noexcept { + return internal::available_implementation_pointers.size(); +} +const implementation * const *available_implementation_list::begin() const noexcept { + return internal::available_implementation_pointers.begin(); +} +const implementation * const *available_implementation_list::end() const noexcept { + return internal::available_implementation_pointers.end(); +} +const implementation *available_implementation_list::detect_best_supported() const noexcept { + // They are prelisted in priority order, so we just go down the list + uint32_t supported_instruction_sets = internal::detect_supported_architectures(); + for (const implementation *impl : internal::available_implementation_pointers) { + uint32_t required_instruction_sets = impl->required_instruction_sets(); + if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; } + } + return &unsupported_singleton; // this should never happen? 
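+  // (Unreachable in practice whenever the fallback kernel is compiled in:
+  // its required_instruction_sets() is 0, so the check above always
+  // succeeds for it.)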
+} + +const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept { + SIMDUTF_PUSH_DISABLE_WARNINGS + SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe + char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION"); + SIMDUTF_POP_DISABLE_WARNINGS + + if (force_implementation_name) { + auto force_implementation = available_implementations[force_implementation_name]; + if (force_implementation) { + return active_implementation = force_implementation; + } else { + // Note: abort() and stderr usage within the library is forbidden. + return active_implementation = &unsupported_singleton; + } + } + return active_implementation = available_implementations.detect_best_supported(); +} + +} // namespace internal + +SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list available_implementations{}; +SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr active_implementation{&internal::detect_best_supported_implementation_on_first_use_singleton}; + +simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept { + return active_implementation->validate_utf8(buf, len); +} +simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept { + return active_implementation->convert_utf8_to_utf16(input, length, utf16_output); +} +simdutf_warn_unused bool validate_utf16(const char16_t * buf, size_t len) noexcept { + return active_implementation->validate_utf16(buf, len); +} +simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept { + return active_implementation->convert_valid_utf8_to_utf16(input, length, utf16_buffer); +} +simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { + return active_implementation->convert_utf16_to_utf8(buf, len, utf8_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept { + return active_implementation->convert_valid_utf16_to_utf8(buf, len, utf8_buffer); +} +simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept { + return active_implementation->count_utf16(input, length); +} +simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept { + return active_implementation->count_utf8(input, length); +} +simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept { + return active_implementation->utf8_length_from_utf16(input, length); +} +simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept { + return active_implementation->utf16_length_from_utf8(input, length); +} +simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, size_t length) noexcept { + return active_implementation->autodetect_encoding(buf, length); +} + +const implementation * builtin_implementation() { + static const implementation * builtin_impl = available_implementations[STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)]; + return builtin_impl; +} + + +} // namespace simdutf + +/* end file src/implementation.cpp */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=encoding_types.cpp +/* begin file src/encoding_types.cpp */ + +namespace simdutf { +std::string to_string(encoding_type bom) { + switch (bom) { + case UTF16_LE: return "UTF16 little-endian"; + case UTF16_BE: 
return "UTF16 big-endian"; + case UTF32_LE: return "UTF32 little-endian"; + case UTF32_BE: return "UTF32 big-endian"; + case UTF8: return "UTF8"; + case unspecified: return "unknown"; + default: return "error"; + } +} + +namespace BOM { +// Note that BOM for UTF8 is discouraged. +encoding_type check_bom(const uint8_t* byte, size_t length) { + if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) { + if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) { + return encoding_type::UTF32_LE; + } else { + return encoding_type::UTF16_LE; + } + } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) { + return encoding_type::UTF16_BE; + } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and byte[2] == 0xfe and byte[3] == 0xff) { + return encoding_type::UTF32_BE; + } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and byte[3] == 0xbf) { + return encoding_type::UTF8; + } + return encoding_type::unspecified; + } + +encoding_type check_bom(const char* byte, size_t length) { + return check_bom(reinterpret_cast(byte), length); + } + + size_t bom_byte_size(encoding_type bom) { + switch (bom) { + case UTF16_LE: return 2; + case UTF16_BE: return 2; + case UTF32_LE: return 4; + case UTF32_BE: return 4; + case UTF8: return 3; + case unspecified: return 0; + default: return 0; + } +} + +} +} +/* end file src/encoding_types.cpp */ +// The large tables should be included once and they +// should not depend on a kernel. +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=tables/utf8_to_utf16_tables.h +/* begin file src/tables/utf8_to_utf16_tables.h */ +#ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H +#define SIMDUTF_UTF8_TO_UTF16_TABLES_H +#include + +namespace simdutf { +namespace { +namespace tables { +namespace utf8_to_utf16 { +/** + * utf8bigindex uses about 8 kB + * shufutf8 uses about 3344 B + * + * So we use a bit over 11 kB. It would be + * easy to save about 4 kB by only + * storing the index in utf8bigindex, and + * deriving the consumed bytes otherwise. + * However, this may come at a significant (10% to 20%) + * performance penalty. 
+ */ + +const uint8_t shufutf8[209][16] = +{ {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 
2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255}, + {1, 0, 
255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 
5, 4, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}}; +/* number of two bytes : 64 */ +/* number of two + three bytes : 145 */ +/* number of two + three + four bytes : 209 */ +const uint8_t utf8bigindex[4096][2] = +{ {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {148, 6}, + {0, 12}, + {151, 6}, + {163, 6}, + {66, 6}, + {0, 12}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {152, 7}, + {164, 7}, + {145, 3}, + {0, 12}, + {155, 7}, + {167, 7}, + {69, 7}, + 
{179, 7}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {170, 7}, + {71, 7}, + {182, 7}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {185, 7}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {171, 8}, + {72, 8}, + {183, 8}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {186, 8}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {68, 6}, + {122, 8}, + {74, 6}, + {92, 6}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {76, 6}, + {94, 6}, + {5, 8}, + {193, 6}, + {82, 6}, + {100, 6}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {77, 7}, + {95, 7}, + {6, 8}, + {194, 7}, + {83, 7}, + {101, 7}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {187, 9}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {77, 7}, + {95, 7}, + {7, 9}, + {194, 7}, + {83, 7}, + {101, 7}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 
12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {176, 10}, + {148, 6}, + {188, 10}, + {151, 6}, + {163, 6}, + {66, 6}, + {200, 10}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {191, 10}, + {152, 7}, + {164, 7}, + {145, 3}, + {203, 10}, + {90, 10}, + {108, 10}, + {69, 7}, + {126, 10}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {114, 10}, + {71, 7}, + {132, 10}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {138, 10}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {206, 10}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {116, 10}, + {72, 8}, + {134, 10}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {140, 10}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {15, 10}, + {122, 8}, + {23, 10}, + {39, 10}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {27, 10}, + {43, 10}, + {5, 8}, + {193, 6}, + {82, 6}, + {51, 10}, + {9, 8}, + {118, 6}, + {17, 8}, + 
{33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {29, 10}, + {45, 10}, + {6, 8}, + {194, 7}, + {83, 7}, + {53, 10}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {57, 10}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {142, 10}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {30, 10}, + {46, 10}, + {7, 9}, + {194, 7}, + {83, 7}, + {54, 10}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {58, 10}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {60, 10}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, 
+ {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {148, 6}, + {0, 12}, + {151, 6}, + {163, 6}, + {66, 6}, + {0, 12}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {192, 11}, + {152, 7}, + {164, 7}, + {145, 3}, + {204, 11}, + {155, 7}, + {167, 7}, + {69, 7}, + {179, 7}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {170, 7}, + {71, 7}, + {182, 7}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {185, 7}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {207, 11}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {117, 11}, + {72, 8}, + {135, 11}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {141, 11}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {68, 6}, + {122, 8}, + {74, 6}, + {92, 6}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {76, 6}, + {94, 6}, + {5, 8}, + {193, 6}, + {82, 6}, + {100, 6}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {77, 7}, + {95, 7}, + {6, 8}, + {194, 7}, + {83, 7}, + {101, 7}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {143, 11}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + 
{157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {31, 11}, + {47, 11}, + {7, 9}, + {194, 7}, + {83, 7}, + {55, 11}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {59, 11}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {61, 11}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {176, 10}, + {148, 6}, + {188, 10}, + {151, 6}, + {163, 6}, + {66, 6}, + {200, 10}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {191, 10}, + {152, 7}, + {164, 7}, + {145, 3}, + {203, 10}, + {90, 10}, + {108, 10}, + {69, 7}, + {126, 10}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {114, 10}, + {71, 7}, + {132, 10}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {138, 10}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, 
+ {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {206, 10}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {116, 10}, + {72, 8}, + {134, 10}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {140, 10}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {62, 11}, + {15, 10}, + {122, 8}, + {23, 10}, + {39, 10}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {27, 10}, + {43, 10}, + {5, 8}, + {193, 6}, + {82, 6}, + {51, 10}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {29, 10}, + {45, 10}, + {6, 8}, + {194, 7}, + {83, 7}, + {53, 10}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {57, 10}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {142, 10}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {30, 10}, + {46, 10}, + {7, 9}, + {194, 7}, + {83, 7}, + {54, 10}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {58, 10}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 
8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {60, 10}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {148, 6}, + {0, 12}, + {151, 6}, + {163, 6}, + {66, 6}, + {0, 12}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {152, 7}, + {164, 7}, + {145, 3}, + {0, 12}, + {155, 7}, + {167, 7}, + {69, 7}, + {179, 7}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {170, 7}, + {71, 7}, + {182, 7}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {185, 7}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {208, 12}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {171, 8}, + {72, 8}, + {183, 8}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {186, 8}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {68, 6}, + {122, 8}, + {74, 6}, + {92, 6}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {76, 6}, + {94, 6}, + {5, 8}, + {193, 6}, + {82, 6}, + {100, 6}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {77, 7}, + {95, 7}, + {6, 8}, + {194, 7}, + {83, 7}, + {101, 7}, + {10, 8}, + {119, 7}, 
+ {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {144, 12}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {77, 7}, + {95, 7}, + {7, 9}, + {194, 7}, + {83, 7}, + {101, 7}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 
5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {176, 10}, + {148, 6}, + {188, 10}, + {151, 6}, + {163, 6}, + {66, 6}, + {200, 10}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {191, 10}, + {152, 7}, + {164, 7}, + {145, 3}, + {203, 10}, + {90, 10}, + {108, 10}, + {69, 7}, + {126, 10}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {114, 10}, + {71, 7}, + {132, 10}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {138, 10}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {206, 10}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {116, 10}, + {72, 8}, + {134, 10}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {140, 10}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {63, 12}, + {15, 10}, + {122, 8}, + {23, 10}, + {39, 10}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {27, 10}, + {43, 10}, + {5, 8}, + {193, 6}, + {82, 6}, + {51, 10}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {29, 10}, + {45, 10}, + {6, 8}, + {194, 7}, + {83, 7}, + {53, 10}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {57, 10}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {142, 10}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 
7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {30, 10}, + {46, 10}, + {7, 9}, + {194, 7}, + {83, 7}, + {54, 10}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {58, 10}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {60, 10}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {148, 6}, + {0, 12}, + {151, 6}, + {163, 6}, + {66, 6}, + {0, 12}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {192, 11}, + {152, 7}, + {164, 7}, + {145, 3}, + {204, 11}, + {155, 7}, + {167, 7}, + {69, 7}, + {179, 7}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {170, 7}, + {71, 7}, + {182, 7}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {185, 7}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {207, 11}, 
+ {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {117, 11}, + {72, 8}, + {135, 11}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {141, 11}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {104, 8}, + {68, 6}, + {122, 8}, + {74, 6}, + {92, 6}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {76, 6}, + {94, 6}, + {5, 8}, + {193, 6}, + {82, 6}, + {100, 6}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {77, 7}, + {95, 7}, + {6, 8}, + {194, 7}, + {83, 7}, + {101, 7}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {143, 11}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {31, 11}, + {47, 11}, + {7, 9}, + {194, 7}, + {83, 7}, + {55, 11}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {59, 11}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {61, 11}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + 
{0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {0, 12}, + {0, 12}, + {147, 5}, + {0, 12}, + {150, 5}, + {162, 5}, + {65, 5}, + {0, 12}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {176, 10}, + {148, 6}, + {188, 10}, + {151, 6}, + {163, 6}, + {66, 6}, + {200, 10}, + {154, 6}, + {166, 6}, + {68, 6}, + {178, 6}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {169, 6}, + {70, 6}, + {181, 6}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {191, 10}, + {152, 7}, + {164, 7}, + {145, 3}, + {203, 10}, + {90, 10}, + {108, 10}, + {69, 7}, + {126, 10}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {114, 10}, + {71, 7}, + {132, 10}, + {77, 7}, + {95, 7}, + {65, 5}, + {194, 7}, + {83, 7}, + {101, 7}, + {67, 5}, + {119, 7}, + {73, 5}, + {91, 5}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {138, 10}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {103, 7}, + {68, 6}, + {121, 7}, + {74, 6}, + {92, 6}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {76, 6}, + {94, 6}, + {4, 7}, + {193, 6}, + {82, 6}, + {100, 6}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {206, 10}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {116, 10}, + {72, 8}, + {134, 10}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {140, 10}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {62, 11}, + {15, 10}, + {122, 8}, + {23, 10}, + {39, 10}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {27, 10}, + {43, 10}, + {5, 8}, + {193, 6}, + {82, 6}, + {51, 10}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {29, 10}, + {45, 10}, + {6, 8}, + {194, 7}, + {83, 7}, + {53, 10}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {57, 10}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + 
{48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {0, 12}, + {0, 12}, + {0, 12}, + {146, 4}, + {0, 12}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {160, 9}, + {172, 9}, + {147, 5}, + {184, 9}, + {150, 5}, + {162, 5}, + {65, 5}, + {196, 9}, + {153, 5}, + {165, 5}, + {67, 5}, + {177, 5}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {175, 9}, + {148, 6}, + {142, 10}, + {81, 9}, + {99, 9}, + {66, 6}, + {199, 9}, + {87, 9}, + {105, 9}, + {68, 6}, + {123, 9}, + {74, 6}, + {92, 6}, + {64, 4}, + {0, 12}, + {157, 6}, + {111, 9}, + {70, 6}, + {129, 9}, + {76, 6}, + {94, 6}, + {65, 5}, + {193, 6}, + {82, 6}, + {100, 6}, + {67, 5}, + {118, 6}, + {73, 5}, + {91, 5}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {190, 9}, + {152, 7}, + {164, 7}, + {145, 3}, + {202, 9}, + {89, 9}, + {107, 9}, + {69, 7}, + {125, 9}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {113, 9}, + {71, 7}, + {131, 9}, + {30, 10}, + {46, 10}, + {7, 9}, + {194, 7}, + {83, 7}, + {54, 10}, + {11, 9}, + {119, 7}, + {19, 9}, + {35, 9}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {137, 9}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {58, 10}, + {13, 9}, + {121, 7}, + {21, 9}, + {37, 9}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {25, 9}, + {41, 9}, + {4, 7}, + {193, 6}, + {82, 6}, + {49, 9}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {145, 3}, + {205, 9}, + {156, 8}, + {168, 8}, + {146, 4}, + {180, 8}, + {149, 4}, + {161, 4}, + {64, 4}, + {0, 12}, + {159, 8}, + {115, 9}, + {72, 8}, + {133, 9}, + {78, 8}, + {96, 8}, + {65, 5}, + {195, 8}, + {84, 8}, + {102, 8}, + {67, 5}, + {120, 8}, + {73, 5}, + {91, 5}, + {64, 4}, + {0, 12}, + {0, 12}, + {174, 8}, + {148, 6}, + {139, 9}, + {80, 8}, + {98, 8}, + {66, 6}, + {198, 8}, + {86, 8}, + {60, 10}, + {14, 9}, + {122, 8}, + {22, 9}, + {38, 9}, + {3, 8}, + {0, 12}, + {157, 6}, + {110, 8}, + {70, 6}, + {128, 8}, + {26, 9}, + {42, 9}, + {5, 8}, + {193, 6}, + {82, 6}, + {50, 9}, + {9, 8}, + {118, 6}, + {17, 8}, + {33, 8}, + {0, 6}, + {0, 12}, + {0, 12}, + {0, 12}, + {0, 12}, + {189, 8}, + {152, 7}, + {164, 7}, + {145, 3}, + {201, 8}, + {88, 8}, + {106, 8}, + {69, 7}, + {124, 8}, + {75, 7}, + {93, 7}, + {64, 4}, + {0, 12}, + {158, 7}, + {112, 8}, + {71, 7}, + {130, 8}, + {28, 9}, + {44, 9}, + {6, 8}, + {194, 7}, + {83, 7}, + {52, 9}, + {10, 8}, + {119, 7}, + {18, 8}, + {34, 8}, + {1, 7}, + {0, 12}, + {0, 12}, + {173, 7}, + {148, 6}, + {136, 8}, + {79, 7}, + {97, 7}, + {66, 6}, + {197, 7}, + {85, 7}, + {56, 9}, + {12, 8}, + {121, 7}, + {20, 8}, + {36, 8}, + {2, 7}, + {0, 12}, + {157, 6}, + {109, 7}, + {70, 6}, + {127, 7}, + {24, 8}, + {40, 8}, + {4, 7}, + {193, 6}, + {82, 6}, + {48, 8}, + {8, 7}, + {118, 6}, + {16, 7}, + {32, 7}, + {0, 6}}; +} // utf8_to_utf16 namespace +} // tables namespace +} // unnamed namespace +} // namespace simdutf + +#endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H +/* end file src/tables/utf8_to_utf16_tables.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=tables/utf16_to_utf8_tables.h +/* begin file src/tables/utf16_to_utf8_tables.h */ +// file generated by scripts/sse_convert_utf16_to_utf8.py +#ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H +#define SIMDUTF_UTF16_TO_UTF8_TABLES_H + +namespace simdutf { +namespace { +namespace tables { +namespace utf16_to_utf8 { + + // 1 
byte for length, 16 bytes for mask + const uint8_t pack_1_2_utf8_bytes[256][17] = { + {16,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}, + {15,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80}, + {15,1,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80}, + {14,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80}, + {15,1,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80}, + {14,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {14,1,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {15,1,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80}, + {14,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80}, + {14,1,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80}, + {13,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {15,1,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80}, + {14,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {14,1,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {14,1,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {15,1,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80}, + {14,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80}, + {14,1,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80}, + {13,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80}, + {13,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + 
{15,1,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80}, + {14,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {14,1,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {14,1,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80}, + {13,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80}, + {13,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80}, + {12,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + 
{10,1,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {15,1,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80}, + {14,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80}, + {14,1,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80}, + {13,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80}, + {13,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80}, + {13,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80}, + {12,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80}, + {12,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + 
{10,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {14,1,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80}, + {13,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {13,1,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80}, + {12,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {13,1,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80}, + {12,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {12,1,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80}, + {11,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80}, + {10,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,1,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80}, + 
{10,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,1,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,1,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80} + }; + + // 1 byte for length, 16 bytes for mask + const uint8_t pack_1_2_3_utf8_bytes[256][17] = { + {12,2,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80}, + {9,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80}, + {10,0,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,2,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80}, + {8,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,2,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80}, + {8,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + 
{9,2,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {1,0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,2,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + 
{1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,0,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,2,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,0,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {11,2,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80}, + {8,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,0,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + 
{8,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + 
{5,0,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {10,2,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,0,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,2,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,0,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,2,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {2,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,0,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {9,2,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,0,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,2,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {3,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,0,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {8,2,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,0,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {7,2,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {4,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {6,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + {5,0,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}, + 
+    {8,2,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,0,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,2,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {2,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,0,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {7,2,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,0,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {6,2,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {3,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {5,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
+    {4,0,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}
+  };
+
+} // utf16_to_utf8 namespace
+} // tables namespace
+} // unnamed namespace
+} // namespace simdutf
+
+#endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H
+/* end file src/tables/utf16_to_utf8_tables.h */
+// End of tables.
+
+// The scalar routines should be included once.
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/valid_utf16_to_utf8.h
+/* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
+#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H
+#define SIMDUTF_VALID_UTF16_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_utf8 {
+
+inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output) {
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0;
+  char* start{utf8_output};
+  while (pos < len) {
+    // try to convert the next block of 4 ASCII characters
+    if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0xFF80FF80FF80FF80) == 0) {
+        size_t final_pos = pos + 4;
+        while(pos < final_pos) {
+          *utf8_output++ = char(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    uint16_t word = data[pos];
+    if((word & 0xFF80)==0) {
+      // will generate one UTF-8 byte
+      *utf8_output++ = char(word);
+      pos++;
+    } else if((word & 0xF800)==0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if((word & 0xF800) != 0xD800) {
+      // will generate three UTF-8 bytes
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>12) | 0b11100000);
+      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // must be a surrogate pair
+      uint16_t diff = uint16_t(word - 0xD800);
+      if(pos + 1 >= len) { return 0; } // minimal bound checking
+      uint16_t next_word = data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      uint32_t value = (diff << 10) + diff2 + 0x10000;
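+      // For example, U+1F600 arrives as the pair 0xD83D 0xDE00:
+      // diff = 0x003D, diff2 = 0x0200, so value = (0x3D << 10) + 0x200 + 0x10000 = 0x1F600,
+      // which is emitted below as the four bytes 0xF0 0x9F 0x98 0x80.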
+      // will generate four UTF-8 bytes
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((value>>18) | 0b11110000);
+      *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((value & 0b111111) | 0b10000000);
+      pos += 2;
+    }
+  }
+  return utf8_output - start;
+}
+
+} // utf16_to_utf8 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/utf16_to_utf8.h
+/* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
+#ifndef SIMDUTF_UTF16_TO_UTF8_H
+#define SIMDUTF_UTF16_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_utf8 {
+
+inline size_t convert(const char16_t* buf, size_t len, char* utf8_output) {
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0;
+  char* start{utf8_output};
+  while (pos < len) {
+    // try to convert the next block of 4 ASCII characters
+    if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0xFF80FF80FF80FF80) == 0) {
+        size_t final_pos = pos + 4;
+        while(pos < final_pos) {
+          *utf8_output++ = char(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    uint16_t word = data[pos];
+    if((word & 0xFF80)==0) {
+      // will generate one UTF-8 byte
+      *utf8_output++ = char(word);
+      pos++;
+    } else if((word & 0xF800)==0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if((word & 0xF800) != 0xD800) {
+      // will generate three UTF-8 bytes
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>12) | 0b11100000);
+      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // must be a surrogate pair
+      if(pos + 1 >= len) { return 0; }
+      uint16_t diff = uint16_t(word - 0xD800);
+      if(diff > 0x3FF) { return 0; }
+      uint16_t next_word = data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if(diff2 > 0x3FF) { return 0; }
+      uint32_t value = (diff << 10) + diff2 + 0x10000;
+      // will generate four UTF-8 bytes
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((value>>18) | 0b11110000);
+      *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((value & 0b111111) | 0b10000000);
+      pos += 2;
+    }
+  }
+  return utf8_output - start;
+}
+
+} // utf16_to_utf8 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
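+// Note on the checked conversion above: the two `> 0x3FF` tests catch both lone
+// surrogates and reversed pairs. For instance, for the ill-formed sequence
+// 0xDC00 0xD800, diff = uint16_t(0xDC00 - 0xD800) = 0x400 > 0x3FF, so the
+// routine returns 0 rather than emitting garbage.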
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/valid_utf8_to_utf16.h
+/* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
+#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
+#define SIMDUTF_VALID_UTF8_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_utf16 {
+
+inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  size_t pos = 0;
+  char16_t* start{utf16_output};
+  while (pos < len) {
+    // try to convert the next block of 8 ASCII bytes
+    if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0x8080808080808080) == 0) {
+        size_t final_pos = pos + 8;
+        while(pos < final_pos) {
+          *utf16_output++ = char16_t(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    uint8_t leading_byte = data[pos]; // leading byte
+    if (leading_byte < 0b10000000) {
+      // converting one ASCII byte !!!
+      *utf16_output++ = char16_t(leading_byte);
+      pos++;
+    } else if ((leading_byte & 0b11100000) == 0b11000000) {
+      // We have a two-byte UTF-8 word; it should become
+      // a single UTF-16 word.
+      if(pos + 1 >= len) { break; } // minimal bound checking
+      *utf16_output++ = char16_t(((leading_byte & 0b00011111) << 6) | (data[pos + 1] & 0b00111111));
+      pos += 2;
+    } else if ((leading_byte & 0b11110000) == 0b11100000) {
+      // We have a three-byte UTF-8 word; it should become
+      // a single UTF-16 word.
+      if(pos + 2 >= len) { break; } // minimal bound checking
+      *utf16_output++ = char16_t(((leading_byte & 0b00001111) << 12) | ((data[pos + 1] & 0b00111111) << 6) | (data[pos + 2] & 0b00111111));
+      pos += 3;
+    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+      // we have a 4-byte UTF-8 word.
+      if(pos + 3 >= len) { break; } // minimal bound checking
+      uint32_t code_word = ((leading_byte & 0b00000111) << 18) | ((data[pos + 1] & 0b00111111) << 12)
+                         | ((data[pos + 2] & 0b00111111) << 6) | (data[pos + 3] & 0b00111111);
+      code_word -= 0x10000;
+      *utf16_output++ = char16_t(0xD800 + (code_word >> 10));
+      *utf16_output++ = char16_t(0xDC00 + (code_word & 0x3FF));
+      pos += 4;
+    } else {
+      // we may have a continuation byte, but we do not do error checking
+      return 0;
+    }
+  }
+  return utf16_output - start;
+}
+
+
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
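+// A concrete trace of the unchecked routine above: U+00E9 arrives as 0xC3 0xA9,
+// and ((0xC3 & 0b00011111) << 6) | (0xA9 & 0b00111111) = (0x03 << 6) | 0x29 = 0x00E9,
+// i.e. the two-byte branch reassembles the code point directly into one UTF-16 word.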
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/utf8_to_utf16.h
+/* begin file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
+#ifndef SIMDUTF_UTF8_TO_UTF16_H
+#define SIMDUTF_UTF8_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_utf16 {
+
+inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  size_t pos = 0;
+  char16_t* start{utf16_output};
+  while (pos < len) {
+    // try to convert the next block of 16 ASCII bytes
+    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      ::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2};
+      if ((v & 0x8080808080808080) == 0) {
+        size_t final_pos = pos + 16;
+        while(pos < final_pos) {
+          *utf16_output++ = char16_t(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    uint8_t leading_byte = data[pos]; // leading byte
+    if (leading_byte < 0b10000000) {
+      // converting one ASCII byte !!!
+      *utf16_output++ = char16_t(leading_byte);
+      pos++;
+    } else if ((leading_byte & 0b11100000) == 0b11000000) {
+      // We have a two-byte UTF-8 word; it should become
+      // a single UTF-16 word.
+      if(pos + 1 >= len) { return 0; } // minimal bound checking
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
+      // range check
+      uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if (code_point < 0x80 || 0x7ff < code_point) { return 0; }
+      *utf16_output++ = char16_t(code_point);
+      pos += 2;
+    } else if ((leading_byte & 0b11110000) == 0b11100000) {
+      // We have a three-byte UTF-8 word; it should become
+      // a single UTF-16 word.
+      if(pos + 2 >= len) { return 0; } // minimal bound checking
+
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
+      // range check
+      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
+                            (data[pos + 1] & 0b00111111) << 6 |
+                            (data[pos + 2] & 0b00111111);
+      if (code_point < 0x800 || 0xffff < code_point ||
+          (0xd7ff < code_point && code_point < 0xe000)) {
+        return 0;
+      }
+      *utf16_output++ = char16_t(code_point);
+      pos += 3;
+    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+      // we have a 4-byte UTF-8 word.
+      if(pos + 3 >= len) { return 0; } // minimal bound checking
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; }
+
+      // range check
+      uint32_t code_point =
+          (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; }
+      code_point -= 0x10000;
+      *utf16_output++ = char16_t(0xD800 + (code_point >> 10));
+      *utf16_output++ = char16_t(0xDC00 + (code_point & 0x3FF));
+      pos += 4;
+    } else {
+      return 0;
+    }
+  }
+  return utf16_output - start;
+}
+
+} // utf8_to_utf16 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
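+// The range checks in the checked routine above are what reject overlong forms:
+// e.g. 0xC0 0xAF decodes to code_point = 0x2F, which fails `code_point < 0x80`,
+// so this two-byte encoding of '/' is refused, as required by RFC 3629.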
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf8.h
+/* begin file src/scalar/utf8.h */
+#ifndef SIMDUTF_UTF8_H
+#define SIMDUTF_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8 {
+// credit: based on code from Google Fuchsia (Apache Licensed)
+inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  uint64_t pos = 0;
+  uint32_t code_point = 0;
+  while (pos < len) {
+    // check if the next 16 bytes are ascii.
+    uint64_t next_pos = pos + 16;
+    if (next_pos <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      std::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2};
+      if ((v & 0x8080808080808080) == 0) {
+        pos = next_pos;
+        continue;
+      }
+    }
+    unsigned char byte = data[pos];
+    if (byte < 0b10000000) {
+      pos++;
+      continue;
+    } else if ((byte & 0b11100000) == 0b11000000) {
+      next_pos = pos + 2;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      // range check
+      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if ((code_point < 0x80) || (0x7ff < code_point)) { return false; }
+    } else if ((byte & 0b11110000) == 0b11100000) {
+      next_pos = pos + 3;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
+      // range check
+      code_point = (byte & 0b00001111) << 12 |
+                   (data[pos + 1] & 0b00111111) << 6 |
+                   (data[pos + 2] & 0b00111111);
+      if ((code_point < 0x800) || (0xffff < code_point) ||
+          (0xd7ff < code_point && code_point < 0xe000)) {
+        return false;
+      }
+    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
+      next_pos = pos + 4;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
+      // range check
+      code_point =
+          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
+    } else {
+      // we may have a continuation byte
+      return false;
+    }
+    pos = next_pos;
+  }
+  return true;
+}
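+// For instance, validate() rejects the CESU-8 style sequence 0xED 0xA0 0x80:
+// it decodes to code_point = 0xD800, which falls in the excluded surrogate
+// range (0xd7ff < code_point && code_point < 0xe000).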
+
+inline size_t count_code_points(const char* buf, size_t len) {
+  const int8_t * p = reinterpret_cast<const int8_t *>(buf);
+  size_t counter{0};
+  for(size_t i = 0; i < len; i++) {
+    // -65 is 0b10111111; in two's complement, anything larger starts a new code point.
+    if(p[i] > -65) { counter++; }
+  }
+  return counter;
+}
+
+inline size_t utf16_length_from_utf8(const char* buf, size_t len) {
+  const int8_t * p = reinterpret_cast<const int8_t *>(buf);
+  size_t counter{0};
+  for(size_t i = 0; i < len; i++) {
+    if(p[i] > -65) { counter++; }
+    if(uint8_t(p[i]) >= 240) { counter++; }
+  }
+  return counter;
+}
+
+} // utf8 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf8.h */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=scalar/utf16.h
+/* begin file src/scalar/utf16.h */
+#ifndef SIMDUTF_UTF16_H
+#define SIMDUTF_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16 {
+
+inline simdutf_warn_unused bool validate(const char16_t *buf, size_t len) noexcept {
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  uint64_t pos = 0;
+  while (pos < len) {
+    uint16_t word = data[pos];
+    if((word & 0xF800) == 0xD800) {
+      if(pos + 1 >= len) { return false; }
+      uint16_t diff = uint16_t(word - 0xD800);
+      if(diff > 0x3FF) { return false; }
+      uint16_t next_word = data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if(diff2 > 0x3FF) { return false; }
+      pos += 2;
+    } else {
+      pos++;
+    }
+  }
+  return true;
+}
+
+
+inline size_t count_code_points(const char16_t* buf, size_t len) {
+  // We are not BOM aware.
+  const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
+  size_t counter{0};
+  for(size_t i = 0; i < len; i++) {
+    counter += ((p[i] & 0xFC00) != 0xDC00);
+  }
+  return counter;
+}
+
+inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len) {
+  // We are not BOM aware.
+  const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
+  size_t counter{0};
+  for(size_t i = 0; i < len; i++) {
+    /** ASCII **/
+    if(p[i] <= 0x7F) { counter++; }
+    /** two-byte **/
+    else if(p[i] <= 0x7FF) { counter += 2; }
+    /** three-byte **/
+    else if((p[i] <= 0xD7FF) || (p[i] >= 0xE000)) { counter += 3; }
+    /** surrogates -- 4 bytes **/
+    else { counter += 2; }
+  }
+  return counter;
+}
+
+} // utf16 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf16.h */
+//
+
+
+SIMDUTF_PUSH_DISABLE_WARNINGS
+SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+
+
+#if SIMDUTF_IMPLEMENTATION_ARM64
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/implementation.cpp
+/* begin file src/arm64/implementation.cpp */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h
+/* begin file src/simdutf/arm64/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "arm64"
+// #define SIMDUTF_IMPLEMENTATION arm64
+/* end file src/simdutf/arm64/begin.h */
+namespace simdutf {
+namespace arm64 {
+namespace {
+#ifndef SIMDUTF_ARM64_H
+#error "arm64.h must be included"
+#endif
+using namespace simd;
+
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
+  simd8<uint8_t> bits = input.reduce_or();
+  return bits.max_val() < 0b10000000u;
+}
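+// is_ascii() folds all 64 input bytes together with a vector OR; the block is
+// pure ASCII exactly when no byte has its high bit set, i.e. when the maximum
+// of the OR-reduction stays below 0b10000000.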
+
+simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
+  simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u);
+  simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u);
+  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
+  // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well.
+  // This will work fine because we only have to report errors for cases with 0-1 lead bytes.
+  // Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is
+  // guaranteed to be at least *one* lead byte that is part of only 1 other multibyte character.
+  // The error will be detected there.
+  return is_second_byte ^ is_third_byte ^ is_fourth_byte;
+}
+
+simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
+  simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u);
+  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
+  return is_third_byte ^ is_fourth_byte;
+}
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf16_to_utf8.cpp
+/* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */
+/*
+    The vectorized algorithm works on a single 128-bit register, i.e., it
+    loads eight 16-bit words.
+
+    We consider three cases:
+    1. an input register contains no surrogates and each value
+       is in range 0x0000 .. 0x07ff.
+    2. an input register contains no surrogates and values are
+       in range 0x0000 .. 0xffff.
+    3. an input register contains surrogates --- i.e. codepoints
+       can have 16 or 32 bits.
+
+    Ad 1.
+
+    When values are less than 0x0800, it means that a 16-bit word
+    can be converted into: 1) a single UTF-8 byte (when it is an ASCII
+    char) or 2) two UTF-8 bytes.
+
+    For this case we do only some shuffle to obtain these 2-byte
+    codes and finally compress the whole register with a single
+    shuffle.
+
+    We need a 256-entry lookup table to get a compression pattern
+    and the number of output bytes in the compressed vector register.
+    Each entry occupies 17 bytes.
+
+    Ad 2.
+
+    When values fit in 16-bit words, but are above 0x07ff, then
+    a single word may produce one, two or three UTF-8 bytes.
+
+    We prepare data for all these three cases in two registers.
+    The first register contains the lower two UTF-8 bytes (used in all
+    cases), while the second one contains just the third byte for
+    the three-UTF-8-bytes case.
+
+    Finally these two registers are interleaved, forming an eight-element
+    array of 32-bit values. The array spans two 128-bit registers.
+    The bytes from the registers are compressed using two shuffles.
+
+    We need a 256-entry lookup table to get a compression pattern
+    and the number of output bytes in the compressed vector register.
+    Each entry occupies 17 bytes.
+
+
+    To summarize:
+    - We need two 256-entry tables that have 8704 bytes in total.
+*/
+/*
+  Returns a pair: the first unprocessed word from buf and utf8_output.
+  A scalar routine should carry on the conversion of the tail.
+*/
+std::pair<const char16_t*, char*> arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out) {
+  uint8_t * utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char16_t* end = buf + len;
+
+  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+  while (buf + 16 <= end) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
+      // It is common enough that we have sequences of 16 consecutive ASCII characters.
+      uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
+      if(vmaxvq_u16(nextin) > 0x7F) {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        uint8x8_t utf8_packed = vmovn_u16(in);
+        // 2. store (8 bytes)
+        vst1_u8(utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 8;
+        utf8_output += 8;
+        in = nextin;
+      } else {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
+        // 2. store (16 bytes)
+        vst1q_u8(utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+      }
+    }
+
+    if (vmaxvq_u16(in) <= 0x7FF) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+      const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const uint16x8_t t0 = vshlq_n_u16(in, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+      // t2 = [0000|0000|00bb|bbbb]
+      const uint16x8_t t2 = vandq_u16(in, v_003f);
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const uint16x8_t t3 = vorrq_u16(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+      // 2. merge ASCII and 2-byte codewords
+      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+      const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
+      // 3. prepare bitmask for 8-bit lookup
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+      const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
+                                              0x0010, 0x0040,
+                                              0x0002, 0x0008,
+                                              0x0020, 0x0080);
+#else
+      const uint16x8_t mask = { 0x0001, 0x0004,
+                                0x0010, 0x0040,
+                                0x0002, 0x0008,
+                                0x0020, 0x0080 };
+#endif
+      uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+      // 4. pack the bytes
+      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+      const uint8x16_t shuffle = vld1q_u8(row + 1);
+      const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+      // 5. store bytes
+      vst1q_u8(utf8_output, utf8_packed);
+
+      // 6. adjust pointers
+      buf += 8;
+      utf8_output += row[0];
+      continue;
+
+    }
+    const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+    // it is likely an uncommon occurrence.
+    if (vmaxvq_u16(surrogates_bytemask) == 0) {
+      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+      const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
+                                                  0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+      const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
+                                   0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+#endif
+      /* In this branch we handle three cases:
+         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+        We expand the input word (16-bit) into two words (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
+
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
+
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
+
+        Finally from these two words we build a proper UTF-8 sequence, taking
+        into account the case (i.e., the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
+#define vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const uint16x8_t t1 = vandq_u16(t0, vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const uint16x8_t t2 = vorrq_u16(t1, vec(0b1000000000000000));
+
+      // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+      const uint16x8_t s0 = vshrq_n_u16(in, 12);
+      // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+      const uint16x8_t s1 = vandq_u16(in, vec(0b0000111111000000));
+      // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+      const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+      // [00bb|bbbb|0000|aaaa]
+      const uint16x8_t s2 = vorrq_u16(s0, s1s);
+      // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const uint16x8_t s3 = vorrq_u16(s2, vec(0b1100000011100000));
+      const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+      const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
+      const uint16x8_t m0 = vbicq_u16(vec(0b0100000000000000), one_or_two_bytes_bytemask);
+      const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef vec
+
+      // 4. expand words 16-bit => 32-bit
+      const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+      const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+      const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
+                                                 0x0010, 0x0040,
+                                                 0x0100, 0x0400,
+                                                 0x1000, 0x4000 );
+      const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
+                                                 0x0020, 0x0080,
+                                                 0x0200, 0x0800,
+                                                 0x2000, 0x8000 );
+#else
+      const uint16x8_t onemask = { 0x0001, 0x0004,
+                                   0x0010, 0x0040,
+                                   0x0100, 0x0400,
+                                   0x1000, 0x4000 };
+      const uint16x8_t twomask = { 0x0002, 0x0008,
+                                   0x0020, 0x0080,
+                                   0x0200, 0x0800,
+                                   0x2000, 0x8000 };
+#endif
+      const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
+      const uint16_t mask = vaddvq_u16(combined);
+      // The following fast path may or may not be beneficial.
+      /*if(mask == 0) {
+        // We only have three-byte words. Use fast path.
+        const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+        vst1q_u8(utf8_output, utf8_0);
+        utf8_output += 12;
+        vst1q_u8(utf8_output, utf8_1);
+        utf8_output += 12;
+        buf += 8;
+        continue;
+      }*/
+      const uint8_t mask0 = uint8_t(mask);
+
+      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+      const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+      const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+      vst1q_u8(utf8_output, utf8_0);
+      utf8_output += row0[0];
+      vst1q_u8(utf8_output, utf8_1);
+      utf8_output += row1[0];
+
+      buf += 8;
+    // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1); }
+      for(; k < forward; k++) {
+        uint16_t word = buf[k];
+        if((word & 0xFF80)==0) {
+          *utf8_output++ = char(word);
+        } else if((word & 0xF800)==0) {
+          *utf8_output++ = char((word>>6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if((word & 0xF800) != 0xD800) {
+          *utf8_output++ = char((word>>12) | 0b11100000);
+          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = buf[k+1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value>>18) | 0b11110000);
+          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
+
+  return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
+}
+/* end file src/arm64/arm_convert_utf16_to_utf8.cpp */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf8_to_utf16.cpp
+/* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */
+// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
+// end of the code points. Only the least significant 12 bits of the mask
+// are accessed.
+// It returns how many bytes were consumed (up to 12).
+size_t convert_masked_utf8_to_utf16(const char *input,
+                                    uint64_t utf8_end_of_code_point_mask,
+                                    char16_t *&utf16_output) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  //
+  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xFFF;
+  //
+  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
+  // beneficial to have fast paths that depend on branch prediction but have less latency.
+  // This results in more instructions but, potentially, also higher speeds.
+  //
+  // We first try a few fast paths.
+  if((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
+    // We process in chunks of 16 bytes
+    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), vmovl_u8(vget_low_u8(in)));
+    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output) + 8, vmovl_high_u8(in));
+    utf16_output += 16; // We wrote 16 16-bit characters.
+    return 16; // We consumed 16 bytes.
+  }
+  if((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa) {
+    // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
+    // There is probably a more efficient sequence, but the following might do.
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+    const uint8x16_t sh = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+#else
+    const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+#endif
+    uint8x16_t perm = vqtbl1q_u8(in, sh);
+    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
+    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
+    uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
+    vst1q_u8(reinterpret_cast<uint8_t *>(utf16_output), composed);
+    utf16_output += 8; // We wrote 16 bytes, 8 code points.
+    return 16;
+  }
+  if(input_utf8_end_of_code_point_mask == 0x924) {
+    // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
+    // There is probably a more efficient sequence, but the following might do.
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+    const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
+#else
+    const uint8x16_t sh = {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255};
+#endif
+    uint8x16_t perm = vqtbl1q_u8(in, sh);
+    uint8x16_t ascii =
+        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
+    uint8x16_t middlebyte =
+        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
+    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+    uint32x4_t highbyte =
+        vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
+    uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
+    uint32x4_t composed =
+        vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
+    uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
+    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed_repacked);
+    utf16_output += 4;
+    return 12;
+  }
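+  // To see how these fast-path masks are formed: bit i of
+  // utf8_end_of_code_point_mask is set when input byte i ends a code point.
+  // Sixteen ASCII bytes give 0xFFFF, eight 2-byte characters give
+  // 0b1010...10 = 0xaaaa, and four 3-byte characters give
+  // 0b1001_0010_0100 = 0x924 (ends at offsets 2, 5, 8 and 11).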
+  /// We do not have a fast path available, so we fallback.
+
+  const uint8_t idx =
+      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed =
+      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+
+
+  if (idx < 64) {
+    // SIX (6) input code-words
+    // this is a relatively easy scenario
+    // we process SIX (6) input code-words. The max length in bytes of six code
+    // words spanning between 1 and 2 bytes each is 12 bytes.
+    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+    uint8x16_t perm = vqtbl1q_u8(in, sh);
+    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
+    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
+    uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
+    vst1q_u8(reinterpret_cast<uint8_t *>(utf16_output), composed);
+    utf16_output += 6; // We wrote 12 bytes, 6 code points.
+  } else if (idx < 145) {
+    // FOUR (4) input code-words
+    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+    uint8x16_t perm = vqtbl1q_u8(in, sh);
+    uint8x16_t ascii =
+        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
+    uint8x16_t middlebyte =
+        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
+    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+    uint32x4_t highbyte =
+        vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
+    uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
+    uint32x4_t composed =
+        vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
+    uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
+    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed_repacked);
+    utf16_output += 4;
+  } else if (idx < 209) {
+    // TWO (2) input code-words
+    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+    uint8x16_t perm = vqtbl1q_u8(in, sh);
+    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
+    uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
+    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+    uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
+    // correct for spurious high bit
+    uint8x16_t correct =
+        vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
+    middlehighbyte = veorq_u8(correct, middlehighbyte);
+    uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
+    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x07000000)));
+    uint8x16_t highbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
+    uint8x16_t composed =
+        vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
+                 vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
+    uint32x4_t composedminus =
+        vsubq_u32(vreinterpretq_u32_u8(composed), vmovq_n_u32(0x10000));
+    uint32x4_t lowtenbits =
+        vandq_u32(composedminus, vmovq_n_u32(0x3ff));
+    uint32x4_t hightenbits = vshrq_n_u32(composedminus, 10);
+    uint32x4_t lowtenbitsadd =
+        vaddq_u32(lowtenbits, vmovq_n_u32(0xDC00));
+    uint32x4_t hightenbitsadd =
+        vaddq_u32(hightenbits, vmovq_n_u32(0xD800));
+    uint32x4_t lowtenbitsaddshifted = vshlq_n_u32(lowtenbitsadd, 16);
+    uint32x4_t surrogates =
+        vorrq_u32(hightenbitsadd, lowtenbitsaddshifted);
+    uint32_t basic_buffer[4];
+    vst1q_u32(basic_buffer, vreinterpretq_u32_u8(composed));
+    uint32_t surrogate_buffer[4];
+    vst1q_u32(surrogate_buffer, surrogates);
+    for (size_t i = 0; i < 3; i++) {
+      if (basic_buffer[i] < 65536) {
+        utf16_output[0] = uint16_t(basic_buffer[i]);
+        utf16_output++;
+      } else {
+        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xFFFF);
+        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
+        utf16_output += 2;
+      }
+    }
+  } else {
+    // here we know that there is an error, but we do not handle errors
+  }
+  return consumed;
+}
+/* end file src/arm64/arm_convert_utf8_to_utf16.cpp */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=arm64/arm_validate_utf16le.cpp
+/* begin file src/arm64/arm_validate_utf16le.cpp */
+
+const char16_t* arm_validate_utf16le(const char16_t* input, size_t size) {
+  const char16_t* end = input + size;
+  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+  const auto v_fc = simd8<uint8_t>::splat(0xfc);
+  const auto v_dc = simd8<uint8_t>::splat(0xdc);
+  while (input + 16 < end) {
+    // 0. Load data: since the validation takes into account only the higher
+    //    byte of each word, we compress the two vectors into one which
+    //    consists only of the higher bytes.
+    const auto in0 = simd16<uint16_t>(input);
+    const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+    const auto t0 = in0.shr<8>();
+    const auto t1 = in1.shr<8>();
+    const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
+    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+    const auto surrogates_wordmask = ((in & v_f8) == v_d8);
+    if(surrogates_wordmask.none()) {
+      input += 16;
+    } else {
+      const auto vH = simd8<uint8_t>((in & v_fc) == v_dc);
+      const auto vL = simd8<uint8_t>(surrogates_wordmask).bit_andnot(vH);
+      // We are going to need these later:
+      const uint8_t low_vh = vH.first();
+      const uint8_t high_vl = vL.last();
+      // We shift vH down, possibly killing low_vh
+      const auto sh = simd8<uint8_t>({1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0xFF});
+      const auto vHshifteddown = vH.apply_lookup_16_to(sh);
+      const auto match = vHshifteddown == vL;
+      // We need to handle the fact that high_vl is unmatched.
+      // We could use this...
+      //    const uint8x16_t allbutlast = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xFF};
+      //    match = vorrq_u8(match, allbutlast);
+      // but sh will do:
+      const auto fmatch = simd8<uint8_t>(simd8<uint8_t>(match) | sh);
+      // We deliberately take these two lines out of the following branchy code
+      // so that they are always computed.
+      if (fmatch.all() && low_vh == 0) {
+        input += (high_vl == 0) ? 16 : 15;
+      } else {
+        return nullptr;
+      }
+    }
+  }
+  return input;
+}
+/* end file src/arm64/arm_validate_utf16le.cpp */
+
+} // unnamed namespace
+} // namespace arm64
+} // namespace simdutf
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
+/* begin file src/generic/buf_block_reader.h */
+namespace simdutf {
+namespace arm64 {
+namespace {
+
+// Walks through a buffer in block-sized increments, loading the last part with spaces
+template <size_t STEP_SIZE>
+struct buf_block_reader {
+public:
+  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
+  simdutf_really_inline size_t block_index();
+  simdutf_really_inline bool has_full_block() const;
+  simdutf_really_inline const uint8_t *full_block() const;
+  /**
+   * Get the last block, padded with spaces.
+   *
+   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
+   * function fills the buffer with spaces and returns 0). In particular, if len == STEP_SIZE there
+   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+   *
+   * @return the number of effective characters in the last block.
+ */
+ simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
+ simdutf_really_inline void advance();
+private:
+ const uint8_t *buf;
+ const size_t len;
+ const size_t lenminusstep;
+ size_t idx;
+};
+
+// Routines to print masks and text for debugging bitmask operations
+simdutf_unused static char * format_input_text_64(const uint8_t *text) {
+ static char *buf = reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+ for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+ buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+ }
+ buf[sizeof(simd8x64<uint8_t>)] = '\0';
+ return buf;
+}
+
+// Routines to print masks and text for debugging bitmask operations
+simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
+ static char *buf = reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+ in.store(reinterpret_cast<uint8_t*>(buf));
+ for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+ if (buf[i] < ' ') { buf[i] = '_'; }
+ }
+ buf[sizeof(simd8x64<uint8_t>)] = '\0';
+ return buf;
+}
+
+simdutf_unused static char * format_mask(uint64_t mask) {
+ static char *buf = reinterpret_cast<char *>(malloc(64 + 1));
+ for (size_t i=0; i<64; i++) {
+ buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
+ }
+ buf[64] = '\0';
+ return buf;
+}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
+
+template <size_t STEP_SIZE>
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
+ return idx < lenminusstep;
+}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
+ return &buf[idx];
+}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
+ if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
+ std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+ std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
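+ // Worked example (illustrative, derived from the tables in this function):
+ // for the overlong pair 0xC0 0x80, prev1 = 0xC0 and input = 0x80, so
+ //   byte_1_high = entry 0xC = TOO_SHORT | OVERLONG_2
+ //   byte_1_low  = entry 0x0 = CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4
+ //   byte_2_high = entry 0x8 = TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4
+ // Their AND keeps exactly OVERLONG_2: an error bit survives only when all
+ // three tables agree that the byte pair is invalid.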
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. 
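+ // Illustrative example: a block ending in 0xE2 0x82 (a truncated three-byte
+ // sequence) passes the pairwise checks above, but is_incomplete() flags it,
+ // since 0xE2 in the second-to-last position exceeds 0b11100000u-1; check_eof()
+ // (or a following ASCII block) then folds prev_incomplete into this->error.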
+ simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64& input) { + if(simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + + } + } + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +} // namespace utf8_validation +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + + +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char16_t* utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the generic directory. + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the mask + // far more than 64 bytes. 
+ //
+ // For pure ASCII inputs, this function is not optimally fast because there are
+ // faster ways to just check for ASCII than to compute the continuation mask.
+ // However, the continuation mask is more informative. There might be a trade-off
+ // involved.
+ //
+ simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+ uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+ // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+ if(utf8_continuation_mask != 0) {
+ // Slow path. We hope that the compiler will recognize that this is a slow path.
+ // Anything that is not a continuation byte is a 'leading byte', that is, the
+ // start of a new code point.
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ // The *start* of code points is not so useful, rather, we want the *end* of code points.
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times when using solely
+ // the slow/regular path, and at least four times if there are fast paths.
+ while(pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ //
+ // Thus we may allow convert_masked_utf8_to_utf16 to process
+ // more bytes at a time under a fast-path mode where 16 bytes
+ // are consumed at once (e.g., when encountering ASCII).
+ size_t consumed = convert_masked_utf8_to_utf16(input + pos,
+ utf8_end_of_code_point_mask, utf16_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
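+ //
+ // To recap the mask machinery with a worked example: for the bytes
+ // 61 C3 A9 E2 82 AC ("a", U+00E9, U+20AC), the continuation bytes are
+ // A9, 82 and AC, so the continuation mask has bits 2, 4 and 5 set.
+ // Complementing and shifting right by one marks the *last* byte of each
+ // code point: bits 0 and 2, plus bit 5 once the next leading byte arrives.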
+ } else { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; +} + + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
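+ // Note: simd8x64 always covers 64 bytes, so NUM_CHUNKS is presumably
+ // 64 / (vector register width): 2 with 32-byte AVX2 registers and 4 with
+ // 16-byte NEON or SSE registers, which is what the static_assert below encodes.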
+ static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + if(howmany == 0) { return 0; } + utf16_output += howmany; + } + return utf16_output - start; + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// other functions +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8.h +/* begin file src/generic/utf8.h */ + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + count += 64 - count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
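+ // Worked example: for 61 E2 82 AC F0 9F 98 80 ("a", U+20AC, U+1F600) there
+ // are three non-continuation bytes (61, E2, F0) and one byte >= 240 (F0),
+ // giving 3 + 1 = 4 UTF-16 code units: one each for "a" and U+20AC, plus a
+ // surrogate pair for U+1F600.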
+ for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} +} // utf8 namespace +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf16.h +/* begin file src/generic/utf16.h */ +#include +namespace simdutf { +namespace arm64 { +namespace { +namespace utf16 { + +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +} +} // utf16 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf16.h */ +// +// Implementation-specific overrides +// +namespace simdutf { +namespace arm64 { + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_utf8(buf,len); +} + +simdutf_warn_unused bool implementation::validate_utf16(const char16_t *buf, size_t len) const noexcept { + const char16_t* tail = arm_validate_utf16le(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16(const char* input, size_t size, + char16_t* utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = arm_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - 
buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::count_utf16(const char16_t * input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} + +} // namespace arm64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h +/* begin file src/simdutf/arm64/end.h */ +/* end file src/simdutf/arm64/end.h */ +/* end file src/arm64/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_FALLBACK +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=fallback/implementation.cpp +/* begin file src/fallback/implementation.cpp */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h +/* begin file src/simdutf/fallback/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "fallback" +// #define SIMDUTF_IMPLEMENTATION fallback +/* end file src/simdutf/fallback/begin.h */ + + +namespace simdutf { +namespace fallback { + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return scalar::utf8::validate(buf, len); +} + +simdutf_warn_unused bool implementation::validate_utf16(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_valid(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::count_utf16(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return scalar::utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t 
implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return scalar::utf8::utf16_length_from_utf8(input, length); +} + +} // namespace fallback +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h +/* begin file src/simdutf/fallback/end.h */ +/* end file src/simdutf/fallback/end.h */ +/* end file src/fallback/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_HASWELL +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/implementation.cpp +/* begin file src/haswell/implementation.cpp */ + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h +/* begin file src/simdutf/haswell/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "haswell" +// #define SIMDUTF_IMPLEMENTATION haswell +SIMDUTF_TARGET_HASWELL +/* end file src/simdutf/haswell/begin.h */ +namespace simdutf { +namespace haswell { +namespace { +#ifndef SIMDUTF_HASWELL_H +#error "haswell.h must be included" +#endif +using namespace simd; + + +simdutf_really_inline bool is_ascii(const simd8x64& input) { + return input.reduce_or().is_ascii(); +} + +simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_third_byte | is_fourth_byte) > int8_t(0); +} + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf16.cpp +/* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + + +// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it is maybe + // beneficial to have fast paths that depend on branch prediction but have less latency. + // This results in more instructions but, potentially, also higher speeds. + // + // We first try a few fast paths. 
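+ // Mask examples: the low bits of utf8_end_of_code_point_mask mark the last
+ // byte of each code point. 0xFFFF means 16 ASCII bytes, 0xAAAA means eight
+ // 2-byte sequences, and 0x924 (0b1001'0010'0100) means four 3-byte sequences
+ // in 12 bytes: exactly the patterns the fast paths below test for.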
+ const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xFFF; + if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF)) { + // We process the data in chunks of 16 bytes. + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), _mm256_cvtepu8_epi16(in)); + utf16_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. + } + if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words. + // There is probably a more efficient sequence, but the following might do. + const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if(input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words. + // There is probably a more efficient sequence, but the following might do. + const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + const __m128i composed_repacked = _mm_packus_epi32(composed, composed); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + return 12; + } + + const uint8_t idx = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + if (idx < 64) { + // SIX (6) input code-words + // this is a relatively easy scenario + // we process SIX (6) input code-words. The max length in bytes of six code + // words spanning between 1 and 2 bytes each is 12 bytes. On processors + // where pdep/pext is fast, we might be able to use a small lookup table. + const __m128i sh = + _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 6; // We wrote 12 bytes, 6 code points. 
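+ // Worked example for the 2-byte path: U+00E9 is C3 A9; after the byte swap
+ // the 16-bit word is 0xC3A9. (0xC3A9 & 0x7f) = 0x29 keeps the low six bits,
+ // (0xC3A9 & 0x1f00) >> 2 = 0xC0 keeps the five payload bits of the lead
+ // byte, and OR-ing them yields 0x00E9, the desired UTF-16 code unit.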
+ } else if (idx < 145) { + // FOUR (4) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + const __m128i composed_repacked = _mm_packus_epi32(composed, composed); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + } else if (idx < 209) { + // TWO (2) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + const __m128i composedminus = + _mm_sub_epi32(composed, _mm_set1_epi32(0x10000)); + const __m128i lowtenbits = + _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff)); + const __m128i hightenbits = _mm_srli_epi32(composedminus, 10); + const __m128i lowtenbitsadd = + _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00)); + const __m128i hightenbitsadd = + _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800)); + const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16); + const __m128i surrogates = + _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted); + uint32_t basic_buffer[4]; + _mm_storeu_si128((__m128i *)basic_buffer, composed); + uint32_t surrogate_buffer[4]; + _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates); + for (size_t i = 0; i < 3; i++) { + if (basic_buffer[i] < 65536) { + utf16_output[0] = uint16_t(basic_buffer[i]); + utf16_output++; + } else { + utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xFFFF); + utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); + utf16_output += 2; + } + } + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_validate_utf16le.cpp +/* begin file src/haswell/avx2_validate_utf16le.cpp */ +/* + In UTF-16 words in range 0xD800 to 0xDFFF have special meaning. + + In a vectorized algorithm we want to examine the most significant + nibble in order to select a fast path. 
If none of the highest nibbles
+ are 0xD (13), then we are sure that the UTF-16 chunk in a vector
+ register is valid.
+
+ Let us analyze what we need to check if the nibble is 0xD. The
+ value of the preceding nibble determines what we have:
+
+ 0xd000 .. 0xd7ff - a valid word
+ 0xd800 .. 0xdbff - low surrogate
+ 0xdc00 .. 0xdfff - high surrogate
+
+ Other constraints we have to consider:
+ - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
+ - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
+ - there must not be a sole low surrogate nor a sole high surrogate
+
+ We're going to build three bitmasks based on the 3rd nibble:
+ - V = valid word,
+ - L = low surrogate (0xd800 .. 0xdbff)
+ - H = high surrogate (0xdc00 .. 0xdfff)
+
+ 0 1 2 3 4 5 6 7 <--- word index
+ [ V | L | H | L | H | V | V | L ]
+ 1 0 0 0 0 1 1 0 - V = valid masks
+ 0 1 0 1 0 0 0 1 - L = low surrogate
+ 0 0 1 0 1 0 0 0 - H = high surrogate
+
+ 1 0 0 0 0 1 1 0 V = valid masks
+ 0 1 0 1 0 0 0 0 a = L & (H >> 1)
+ 0 0 1 0 1 0 0 0 b = a << 1
+ 1 1 1 1 1 1 1 0 c = V | a | b
+ ^
+ the last bit can be zero; we just consume 7 words
+ and recheck this word in the next iteration
+*/
+
+/* Returns:
+ - pointer to the last unprocessed character (a scalar fallback should check the rest);
+ - nullptr if an error was detected.
+*/
+const char16_t* avx2_validate_utf16le(const char16_t* input, size_t size) {
+ const char16_t* end = input + size;
+
+ const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+ const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+ const auto v_fc = simd8<uint8_t>::splat(0xfc);
+ const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+ while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
+ // 0. Load data: since the validation takes into account only the higher
+ // byte of each word, we compress the two vectors into one which
+ // consists of only the higher bytes.
+ const auto in0 = simd16<uint16_t>(input);
+ const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
+
+ const auto t0 = in0.shr<8>();
+ const auto t1 = in1.shr<8>();
+
+ const auto in = simd16<uint16_t>::pack(t0, t1);
+
+ // 1. Check whether we have any 0xD800..0xDFFF word (0b1101'1xxx'yyyy'yyyy).
+ const auto surrogates_wordmask = (in & v_f8) == v_d8;
+ const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
+ if (surrogates_bitmask == 0x0) {
+ input += simd16<uint16_t>::ELEMENTS * 2;
+ } else {
+ // 2. We have some surrogates that have to be distinguished:
+ // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+ // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+ //
+ // Fact: a high surrogate has the 11th bit set (3rd bit in the higher byte)
+
+ // V - non-surrogate words
+ // V = not surrogates_wordmask
+ const uint32_t V = ~surrogates_bitmask;
+
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+ const auto vH = (in & v_fc) == v_dc;
+ const uint32_t H = vH.to_bitmask();
+
+ // L - word mask for low surrogates
+ // L = not H and surrogates_wordmask
+ const uint32_t L = ~H & surrogates_bitmask;
+
+ const uint32_t a = L & (H >> 1); // A low surrogate must be followed by a high one.
+ // (A low surrogate placed in the last word of the
+ // register is an exception we handle.)
+ const uint32_t b = a << 1; // Just mark that the opposite fact holds;
+ // thanks to that we have only two masks for the valid case.
+ const uint32_t c = V | a | b; // Combine all the masks into the final one.
+
+ if (c == 0xffffffff) {
+ // The whole input register contains valid UTF-16, i.e.,
+ // either single words or proper surrogate pairs.
+ input += simd16<uint16_t>::ELEMENTS * 2;
+ } else if (c == 0x7fffffff) {
+ // The 31 lower words of the input register contain valid UTF-16. The
+ // 31st word may be either a low or a high surrogate. In the next
+ // iteration we 1) check if the low surrogate is followed by a high
+ // one, 2) reject a sole high surrogate.
+ input += simd16<uint16_t>::ELEMENTS * 2 - 1;
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ return input;
+}
+/* end file src/haswell/avx2_validate_utf16le.cpp */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf8.cpp
+/* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */
+/*
+ The vectorized algorithm works on a single SSE register, i.e., it
+ loads eight 16-bit words.
+
+ We consider three cases:
+ 1. an input register contains no surrogates and each value
+ is in range 0x0000 .. 0x07ff.
+ 2. an input register contains no surrogates and values are
+ in range 0x0000 .. 0xffff.
+ 3. an input register contains surrogates --- i.e. codepoints
+ can have 16 or 32 bits.
+
+ Ad 1.
+
+ When values are less than 0x0800, it means that a 16-bit word
+ can be converted into: 1) a single UTF8 byte (when it's an ASCII
+ char) or 2) two UTF8 bytes.
+
+ For this case we do only some shuffle to obtain these 2-byte
+ codes and finally compress the whole SSE register with a single
+ shuffle.
+
+ We need a 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+ Ad 2.
+
+ When values fit in 16-bit words, but are above 0x07ff, then
+ a single word may produce one, two or three UTF8 bytes.
+
+ We prepare data for all these three cases in two registers.
+ The first register contains the lower two UTF8 bytes (used in all
+ cases), while the second one contains just the third byte for
+ the three-UTF8-bytes case.
+
+ Finally these two registers are interleaved, forming an eight-element
+ array of 32-bit values. The array spans two SSE registers.
+ The bytes from the registers are compressed using two shuffles.
+
+ We need a 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+
+ To summarize:
+ - We need two 256-entry tables that have 8704 bytes in total.
+*/
+
+
+/*
+ Returns a pair: the first unprocessed byte from buf and utf8_output
+ A scalar routine should carry on the conversion of the tail.
+*/
+std::pair<const char16_t*, char*> sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
+ const char16_t* end = buf + len;
+ const __m256i v_0000 = _mm256_setzero_si256();
+ const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+ const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+ const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+ const size_t safety_margin = 8; // to avoid overruns
+
+ while (buf + 16 + safety_margin <= end) {
+ __m256i in = _mm256_loadu_si256((__m256i*)buf);
+ // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+ const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+ if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
+ // 1. pack the bytes
+ const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ // no bits set above 7th bit
+ const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+ const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+ // no bits set above 11th bit
+ const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
+ const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+ if (one_or_two_bytes_bitmask == 0xffffffff) {
+
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+ // expected output : [110a|aaaa|10bb|bbbb] x 16
+ const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+ const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m256i t0 = _mm256_slli_epi16(in, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m256i t2 = _mm256_and_si256(in, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m256i t3 = _mm256_or_si256(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ const uint32_t M0 = one_byte_bitmask & 0x55555555;
+ const uint32_t M1 = M0 >> 7;
+ const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+ // 4. pack the bytes
+
+ const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+ const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
+
+ const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+ const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+ const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
+ // 5. store bytes
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+ utf8_output += row[0];
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
+ utf8_output += row_2[0];
+
+ // 6. adjust pointers
+ buf += 16;
+ continue;
+ }
+ // 1. Check if there are any surrogate words in the input chunk.
+ // We also have to deal with the situation when there is a surrogate word
+ // at the end of a chunk.
+ const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+ // bitmask = 0x00000000 if there are no surrogates
+ // = 0xc0000000 if the last word is a surrogate
+ const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+ // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
+ // it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x00000000) {
+ // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+ const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+ 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+ 0x0000, 0x0202, 0x0404, 0x0606,
+ 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
+ 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+ We expand the input word (16-bit) into two words (32-bit), thus
+ we have room for four bytes. However, we need five distinct bit
+ layouts. Note that the last byte in cases #2 and #3 is the same.
+
+ We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+ in register t2.
+
+ We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+ either byte 1 for case #2 or byte 2 for case #3. Note that they
+ differ by exactly one bit.
+
+ Finally from these two words we build a proper UTF-8 sequence, taking
+ into account the case (i.e., the number of bytes to write).
+ */
+ /**
+ * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+ * t2 => [0ccc|cccc] [10cc|cccc]
+ * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+ */
+#define vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+ // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+ const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+ // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+ const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111));
+ // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+ const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000));
+
+ // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc]
+ const __m256i s0 = _mm256_srli_epi16(in, 4);
+ // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+ const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100));
+ // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+ const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140));
+ // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+ const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000));
+ const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000));
+ const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef vec
+
+ // 4. expand words 16-bit => 32-bit
+ const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+ const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+ // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+ const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+ (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+ // Due to the wider registers, the following path is less likely to be useful.
+ /*if(mask == 0) {
+ // We only have three-byte words. Use fast path.
+ const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+ const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+ const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+ utf8_output += 12;
+ _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+ utf8_output += 12;
+ buf += 16;
+ continue;
+ }*/
+ const uint8_t mask0 = uint8_t(mask);
+ const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+ const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+ const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+ const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+ const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+ const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+ const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+ const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+ const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+ const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+ const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
+
+
+ const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+ const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+ const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+ const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
+
+ _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+ utf8_output += row0[0];
+ _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+ utf8_output += row1[0];
+ _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+ utf8_output += row2[0];
+ _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+ utf8_output += row3[0];
+ buf += 16;
+ // surrogate pair(s) in a register
+ } else {
+ // Let us do a scalar fallback.
+ // It may seem wasteful to use scalar code, but being efficient with SIMD
+ // in the presence of surrogate pairs may require non-trivial tables.
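+ // Worked example: U+1F600 arrives as the surrogate pair D83D DE00, so
+ // diff = 0xD83D - 0xD800 = 0x3D and diff2 = 0xDE00 - 0xDC00 = 0x200, hence
+ // value = (0x3D << 10) + 0x200 + 0x10000 = 0x1F600, emitted below as the
+ // four UTF-8 bytes F0 9F 98 80.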
+ size_t forward = 15;
+ size_t k = 0;
+ if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
+ for(; k < forward; k++) {
+ uint16_t word = buf[k];
+ if((word & 0xFF80)==0) {
+ *utf8_output++ = char(word);
+ } else if((word & 0xF800)==0) {
+ *utf8_output++ = char((word>>6) | 0b11000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else if((word & 0xF800) != 0xD800) {
+ *utf8_output++ = char((word>>12) | 0b11100000);
+ *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((word & 0b111111) | 0b10000000);
+ } else {
+ // must be a surrogate pair
+ uint16_t diff = uint16_t(word - 0xD800);
+ uint16_t next_word = buf[k+1];
+ k++;
+ uint16_t diff2 = uint16_t(next_word - 0xDC00);
+ if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf8_output); }
+ uint32_t value = (diff << 10) + diff2 + 0x10000;
+ *utf8_output++ = char((value>>18) | 0b11110000);
+ *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
+ *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
+ *utf8_output++ = char((value & 0b111111) | 0b10000000);
+ }
+ }
+ buf += k;
+ }
+ } // while
+ return std::make_pair(buf, utf8_output);
+}
+/* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */
+
+} // unnamed namespace
+} // namespace haswell
+} // namespace simdutf
+
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
+/* begin file src/generic/buf_block_reader.h */
+namespace simdutf {
+namespace haswell {
+namespace {
+
+// Walks through a buffer in block-sized increments, loading the last part with spaces
+template <size_t STEP_SIZE>
+struct buf_block_reader {
+public:
+ simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
+ simdutf_really_inline size_t block_index();
+ simdutf_really_inline bool has_full_block() const;
+ simdutf_really_inline const uint8_t *full_block() const;
+ /**
+ * Get the last block, padded with spaces.
+ *
+ * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
+ * function fills the buffer with spaces and returns 0). In particular, if len == STEP_SIZE there
+ * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+ *
+ * @return the number of effective characters in the last block.
+ */
+ simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
+ simdutf_really_inline void advance();
+private:
+ const uint8_t *buf;
+ const size_t len;
+ const size_t lenminusstep;
+ size_t idx;
+};
+
+// Routines to print masks and text for debugging bitmask operations
+simdutf_unused static char * format_input_text_64(const uint8_t *text) {
+ static char *buf = reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+ for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+ buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+ }
+ buf[sizeof(simd8x64<uint8_t>)] = '\0';
+ return buf;
+}
+
+// Routines to print masks and text for debugging bitmask operations
+simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
+ static char *buf = reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+ in.store(reinterpret_cast<uint8_t*>(buf));
+ for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+ if (buf[i] < ' ') { buf[i] = '_'; }
+ }
+ buf[sizeof(simd8x64<uint8_t>)] = '\0';
+ return buf;
+}
+
+simdutf_unused static char * format_mask(uint64_t mask) {
+ static char *buf = reinterpret_cast<char *>(malloc(64 + 1));
+ for (size_t i=0; i<64; i++) {
+ buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
+ }
+ buf[64] = '\0';
+ return buf;
+}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
+
+template <size_t STEP_SIZE>
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
+ return idx < lenminusstep;
+}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
+ return &buf[idx];
+}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
+ if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
+ std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+ std::memcpy(dst, buf + idx, len - idx);
+ return len - idx;
+}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
+ idx += STEP_SIZE;
+}
+
+} // unnamed namespace
+} // namespace haswell
+} // namespace simdutf
+/* end file src/generic/buf_block_reader.h */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
+/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
+namespace simdutf {
+namespace haswell {
+namespace {
+namespace utf8_validation {
+
+using namespace simd;
+
+ simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+// Bit 1 = Too Long (ASCII followed by continuation)
+// Bit 2 = Overlong 3-byte
+// Bit 4 = Surrogate
+// Bit 5 = Overlong 2-byte
+// Bit 7 = Two Continuations
+ constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______
+ // 11______ 11______
+ constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______
+ constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____
+ constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____
+ constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______
+ constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______
+ constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____
+ // 11110100 101_____
+ // 11110101 1001____
+ // 11110101 101_____
+ // 1111011_ 1001____
+ // 1111011_ 101_____
+ // 11111___ 1001____
+ // 11111___ 101_____
+ constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
+ // 11110101 1000____
+ // 1111011_ 1000____
+ // 11111___ 1000____
+ constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____
+
+ const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+ // 0_______ ________ <ASCII in byte 1>
+ TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+ TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+ // 10______ ________ <continuation in byte 1>
+ TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+ // 1100____ ________ <two byte lead in byte 1>
+ TOO_SHORT | OVERLONG_2,
+ // 1101____ ________ <two byte lead in byte 1>
+ TOO_SHORT,
+ // 1110____ ________ <three byte lead in byte 1>
+ TOO_SHORT | OVERLONG_3 | SURROGATE,
+ // 1111____ ________ <four+ byte lead in byte 1>
+ TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+ );
+ constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
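+ // Illustrative walk-through (not part of the generated file): for the encoded
+ // surrogate ED A0 80, prev1 = 0xED has high nibble 0xE (SURROGATE is set in
+ // byte_1_high above) and low nibble 0xD (SURROGATE is set in byte_1_low
+ // below); the next byte 0xA0 has high nibble 0xA (SURROGATE is set in
+ // byte_2_high), so the AND of the three lookups leaves the SURROGATE bit set
+ // and the sequence is rejected.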
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. 
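+ // For instance (illustrative): a block whose last three bytes are F0 9F 98
+ // is a four-byte sequence cut off at the block boundary; the third-to-last
+ // byte 0xF0 exceeds the 0b11110000u-1 threshold in max_array, so
+ // is_incomplete() flags the block and check_eof() below turns a truncated
+ // final character into an error.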
+ simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64& input) { + if(simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + + } + } + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +} // namespace utf8_validation +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + + +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + + +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char16_t* utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the generic directory. + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the mask + // far more than 64 bytes. 
+ //
+ // For pure ASCII inputs, this function is not optimally fast because there are
+ // faster ways to just check for ASCII than to compute the continuation mask.
+ // However, the continuation mask is more informative. There might be a trade-off
+ // involved.
+ //
+ simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+ uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+ // -65 is 0b10111111 in two's complement, i.e. the largest possible continuation byte
+ if(utf8_continuation_mask != 0) {
+ // Slow path. We hope that the compiler will recognize that this is a slow path.
+ // Anything that is not a continuation byte is a 'leading byte', that is, the
+ // start of a new code point.
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ // The *start* of code points is not so useful, rather, we want the *end* of code points.
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times when using solely
+ // the slow/regular path, and at least four times if there are fast paths.
+ while(pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ //
+ // Thus we may allow convert_masked_utf8_to_utf16 to process
+ // more bytes at a time under a fast-path mode where 16 bytes
+ // are consumed at once (e.g., when encountering ASCII).
+ size_t consumed = convert_masked_utf8_to_utf16(input + pos,
+ utf8_end_of_code_point_mask, utf16_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
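+ // As a concrete illustration of the masks used above (not part of the
+ // generated file): for the 6-byte prefix 61 C3 A9 E2 82 AC ("aé€"), the
+ // continuation bytes A9, 82 and AC set bits 2, 4 and 5 of
+ // utf8_continuation_mask, so utf8_end_of_code_point_mask has bits 0, 2 and
+ // 5 set: one bit at the last byte of each of the three code points.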
+ } else { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; +} + + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + + +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
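+ // (Illustrative note, assuming the usual chunk widths: with AVX2 each
+ // simd8<uint8_t> chunk is 32 bytes, so NUM_CHUNKS == 2 and only the first
+ // branch below is taken; 16-byte kernels such as westmere instead use
+ // NUM_CHUNKS == 4.)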
+ static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + if(howmany == 0) { return 0; } + utf16_output += howmany; + } + return utf16_output - start; + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// other functions +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8.h +/* begin file src/generic/utf8.h */ + +namespace simdutf { +namespace haswell { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + count += 64 - count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
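+ // Worked example of the counting rule used below (illustrative): for "é€"
+ // followed by U+1F600 (bytes C3 A9 E2 82 AC F0 9F 98 80) there are three
+ // non-continuation bytes and one byte >= 240, giving 3 + 1 = 4 UTF-16 code
+ // units: one each for é and €, and a surrogate pair for U+1F600.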
+ for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} +} // utf8 namespace +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf16.h +/* begin file src/generic/utf16.h */ +#include +namespace simdutf { +namespace haswell { +namespace { +namespace utf16 { + +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +} +} // utf16 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf16.h */ + +namespace simdutf { +namespace haswell { + + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_utf8(buf,len); +} + +simdutf_warn_unused bool implementation::validate_utf16(const char16_t *buf, size_t len) const noexcept { + const char16_t* tail = avx2_validate_utf16le(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16(const char* input, size_t size, + char16_t* utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = haswell::sse_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + 
if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::count_utf16(const char16_t * input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} + +} // namespace haswell +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h +/* begin file src/simdutf/haswell/end.h */ +SIMDUTF_UNTARGET_REGION +/* end file src/simdutf/haswell/end.h */ +/* end file src/haswell/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_PPC64 +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=ppc64/implementation.cpp +/* begin file src/ppc64/implementation.cpp */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h +/* begin file src/simdutf/ppc64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "ppc64" +// #define SIMDUTF_IMPLEMENTATION ppc64 +/* end file src/simdutf/ppc64/begin.h */ +namespace simdutf { +namespace ppc64 { +namespace { +#ifndef SIMDUTF_PPC64_H +#error "ppc64.h must be included" +#endif +using namespace simd; + + +simdutf_really_inline bool is_ascii(const simd8x64& input) { + // careful: 0x80 is not ascii. + return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere(); +} + +simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. 
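+ // For example (illustrative): prev2 = 0xE2 yields 0xE2 - 0xDF = 0x03 > 0
+ // under saturating subtraction (a three-byte lead two positions back), while
+ // an ASCII byte such as 0x61 saturates to 0 and contributes nothing.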
+ return simd8(is_third_byte | is_fourth_byte) > int8_t(0); +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace ppc64 { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text_64(const uint8_t *text) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text(const simd8x64& in) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char * format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. 
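+ // Example (illustrative): with STEP_SIZE == 64 and len == 70, one full block
+ // is consumed first, so idx == 64 here; the memset above fills dst with 0x20
+ // (ASCII spaces, always valid UTF-8) and the copy below moves the 6 leftover
+ // bytes.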
+ std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. 
+ simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't + // possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64& input) { + if(simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = is_incomplete(input.chunks[simd8x64::NUM_CHUNKS-1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS-1]; + + } + } + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t * input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} + +bool generic_validate_utf8(const char * input, size_t length) { + return generic_validate_utf8(reinterpret_cast(input),length); +} + +} // namespace utf8_validation +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + + +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + + +simdutf_warn_unused size_t convert_valid(const char* input, size_t size, + char16_t* utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the generic directory. + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the mask + // far more than 64 bytes. 
+ //
+ // For pure ASCII inputs, this function is not optimally fast because there are
+ // faster ways to just check for ASCII than to compute the continuation mask.
+ // However, the continuation mask is more informative. There might be a trade-off
+ // involved.
+ //
+ simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+ uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+ // -65 is 0b10111111 in two's complement, i.e. the largest possible continuation byte
+ if(utf8_continuation_mask != 0) {
+ // Slow path. We hope that the compiler will recognize that this is a slow path.
+ // Anything that is not a continuation byte is a 'leading byte', that is, the
+ // start of a new code point.
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ // The *start* of code points is not so useful, rather, we want the *end* of code points.
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // Next loop is going to run at least five times when using solely
+ // the slow/regular path, and at least four times if there are fast paths.
+ while(pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ //
+ // Thus we may allow convert_masked_utf8_to_utf16 to process
+ // more bytes at a time under a fast-path mode where 16 bytes
+ // are consumed at once (e.g., when encountering ASCII).
+ size_t consumed = convert_masked_utf8_to_utf16(input + pos,
+ utf8_end_of_code_point_mask, utf16_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
+ } else { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; +} + + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + + +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
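+ // Illustrative walk-through (not part of the generated file): for the
+ // overlong two-byte sequence C0 80, prev1 = 0xC0 has high nibble 0xC
+ // (OVERLONG_2 is set in byte_1_high above) and low nibble 0x0 (OVERLONG_2 is
+ // set in byte_1_low below); the continuation byte 0x80 has high nibble 0x8
+ // (OVERLONG_2 is set in byte_2_high), so the AND of the three lookups flags
+ // the sequence as overlong.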
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
+ static_assert((simd8x64::NUM_CHUNKS == 2) || (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if(simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if(simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while(pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16(in + pos, + utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block.These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if(errors()) { return 0; } + if(pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output); + if(howmany == 0) { return 0; } + utf16_output += howmany; + } + return utf16_output - start; + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + + }; // struct utf8_checker +} // utf8_to_utf16 namespace +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// other functions +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8.h +/* begin file src/generic/utf8.h */ + +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + count += 64 - count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + + +simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
+ for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} +} // utf8 namespace +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf16.h +/* begin file src/generic/utf16.h */ +#include +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf16 { + +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +} +} // utf16 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf16.h */ + +// +// Implementation-specific overrides +// +namespace simdutf { +namespace ppc64 { + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return ppc64::utf8_validation::generic_validate_utf8(buf,len); +} + +simdutf_warn_unused bool implementation::validate_utf16(const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { + return 0; // stub +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept { + return 0; // stub +} + +simdutf_warn_unused size_t implementation::convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::count_utf16(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t 
implementation::count_utf8(const char * input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return scalar::utf8::utf16_length_from_utf8(input, length); +} + +} // namespace ppc64 +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h +/* begin file src/simdutf/ppc64/end.h */ +/* end file src/simdutf/ppc64/end.h */ +/* end file src/ppc64/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_WESTMERE +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/implementation.cpp +/* begin file src/westmere/implementation.cpp */ +#include + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h +/* begin file src/simdutf/westmere/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "westmere" +// #define SIMDUTF_IMPLEMENTATION westmere +SIMDUTF_TARGET_WESTMERE +/* end file src/simdutf/westmere/begin.h */ +namespace simdutf { +namespace westmere { +namespace { +#ifndef SIMDUTF_WESTMERE_H +#error "westmere.h must be included" +#endif +using namespace simd; + +simdutf_really_inline bool is_ascii(const simd8x64& input) { + return input.reduce_or().is_ascii(); +} + +simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { + simd8 is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0 + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0); +} + +simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { + simd8 is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine. + return simd8(is_third_byte | is_fourth_byte) > int8_t(0); +} + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf8_to_utf16.cpp +/* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + + +// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. 
Thus it may be
+ // beneficial to have fast paths that depend on branch prediction but have less latency.
+ // This results in more instructions but, potentially, also higher speeds.
+ //
+ // We first try a few fast paths.
+ const __m128i in = _mm_loadu_si128((__m128i *)input);
+ const uint16_t input_utf8_end_of_code_point_mask =
+ utf8_end_of_code_point_mask & 0xFFF;
+ if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF)) {
+ // We process the data in chunks of 16 bytes.
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), _mm_cvtepu8_epi16(in));
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), _mm_cvtepu8_epi16(_mm_srli_si128(in,8)));
+ utf16_output += 16; // We wrote 16 16-bit characters.
+ return 16; // We consumed 16 bytes.
+ }
+ if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
+ // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
+ // There is probably a more efficient sequence, but the following might do.
+ const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+ const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+ const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+ _mm_storeu_si128((__m128i *)utf16_output, composed);
+ utf16_output += 8; // We wrote 16 bytes, 8 code points.
+ return 16;
+ }
+ if(input_utf8_end_of_code_point_mask == 0x924) {
+ // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
+ // There is probably a more efficient sequence, but the following might do.
+ const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+ const __m128i perm = _mm_shuffle_epi8(in, sh);
+ const __m128i ascii =
+ _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+ const __m128i middlebyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+ const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+ const __m128i highbyte =
+ _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+ const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+ const __m128i composed =
+ _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+ const __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+ _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
+ utf16_output += 4;
+ return 12;
+ }
+ /// We do not have a fast path available, so we fall back.
+
+ const uint8_t idx =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+ const uint8_t consumed =
+ tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+ if (idx < 64) {
+ // SIX (6) input code-words
+ // this is a relatively easy scenario
+ // we process SIX (6) input code-words. The max length in bytes of six code
+ // words spanning between 1 and 2 bytes each is 12 bytes. On processors
+ // where pdep/pext is fast, we might be able to use a small lookup table.
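+ //
+ // Worked example of the mask (illustrative): for the bytes 'a', 0xC3 0xA9
+ // (U+00E9), 'b', code points end at byte offsets 0, 2 and 3, so the low
+ // four bits of utf8_end_of_code_point_mask are 0b1101; utf8bigindex maps
+ // such patterns to a shuffle and a consumed-byte count.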
+ const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 6; // We wrote 12 bytes, 6 code points. + } else if (idx < 145) { + // FOUR (4) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + const __m128i composed_repacked = _mm_packus_epi32(composed, composed); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + } else if (idx < 209) { + // TWO (2) input code-words + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + const __m128i composedminus = + _mm_sub_epi32(composed, _mm_set1_epi32(0x10000)); + const __m128i lowtenbits = + _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff)); + const __m128i hightenbits = _mm_srli_epi32(composedminus, 10); + const __m128i lowtenbitsadd = + _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00)); + const __m128i hightenbitsadd = + _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800)); + const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16); + const __m128i surrogates = + _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted); + uint32_t basic_buffer[4]; + _mm_storeu_si128((__m128i *)basic_buffer, composed); + uint32_t surrogate_buffer[4]; + _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates); + for (size_t i = 0; i < 3; i++) { + if (basic_buffer[i] < 65536) { + utf16_output[0] = uint16_t(basic_buffer[i]); + utf16_output++; + } else { + utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xFFFF); + utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); + utf16_output += 2; + } + } + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/westmere/sse_convert_utf8_to_utf16.cpp */ +// 
dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_validate_utf16le.cpp
+/* begin file src/westmere/sse_validate_utf16le.cpp */
+/*
+ In UTF-16, words in the range 0xD800 to 0xDFFF have special meaning.
+
+ In a vectorized algorithm we want to examine the most significant
+ nibble in order to select a fast path. If none of the highest nibbles
+ is 0xD (13), then we are sure that the UTF-16 chunk in a vector
+ register is valid.
+
+ Let us analyze what we need to check if the nibble is 0xD. The
+ value of the preceding nibble determines what we have:
+
+ 0xd000 .. 0xd7ff - a valid word
+ 0xd800 .. 0xdbff - low surrogate
+ 0xdc00 .. 0xdfff - high surrogate
+
+ Other constraints we have to consider:
+ - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
+ - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
+ - there must not be a sole low surrogate nor a sole high surrogate
+
+ We're going to build three bitmasks based on the 3rd nibble:
+ - V = valid word,
+ - L = low surrogate (0xd800 .. 0xdbff)
+ - H = high surrogate (0xdc00 .. 0xdfff)
+
+ 0 1 2 3 4 5 6 7 <--- word index
+ [ V | L | H | L | H | V | V | L ]
+ 1 0 0 0 0 1 1 0 - V = valid words
+ 0 1 0 1 0 0 0 1 - L = low surrogate
+ 0 0 1 0 1 0 0 0 - H = high surrogate
+
+ 1 0 0 0 0 1 1 0 V = valid words
+ 0 1 0 1 0 0 0 0 a = L & (H >> 1)
+ 0 0 1 0 1 0 0 0 b = a << 1
+ 1 1 1 1 1 1 1 0 c = V | a | b
+ ^
+ the last bit can be zero; we just consume 7 words
+ and recheck this word in the next iteration
+*/
+
+/* Returns:
+ - a pointer to the last unprocessed character (a scalar fallback should check the rest);
+ - nullptr if an error was detected.
+*/
+const char16_t* sse_validate_utf16le(const char16_t* input, size_t size) {
+ const char16_t* end = input + size;
+
+ const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+ const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+ const auto v_fc = simd8<uint8_t>::splat(0xfc);
+ const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+ while (input + simd16<uint16_t>::SIZE * 2 < end) {
+ // 0. Load data: since the validation takes into account only the higher
+ // byte of each word, we compress the two vectors into one which
+ // consists only of the higher bytes.
+ const auto in0 = simd16<uint16_t>(input);
+ const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+
+ const auto t0 = in0.shr<8>();
+ const auto t1 = in1.shr<8>();
+
+ const auto in = simd16<uint16_t>::pack(t0, t1);
+
+ // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+ const auto surrogates_wordmask = (in & v_f8) == v_d8;
+ const uint16_t surrogates_bitmask = static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
+ if (surrogates_bitmask == 0x0000) {
+ input += 16;
+ } else {
+ // 2. We have some surrogates that have to be distinguished:
+ // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+ // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+ //
+ // Fact: a high surrogate has the 11th bit set (the 3rd bit in the higher byte)
+
+ // V - non-surrogate words
+ // V = not surrogates_wordmask
+ const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
+
+ // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+ const auto vH = (in & v_fc) == v_dc;
+ const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
+
+ // L - word mask for low surrogates
+ // L = not H and surrogates_wordmask
+ const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
+
+ const uint16_t a = static_cast<uint16_t>(L & (H >> 1)); // A low surrogate must be followed by a high one.
+ // (A low surrogate placed in the last word of the register
+ // is an exception we handle.)
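+ // For instance (illustrative): if word i is 0xD801 (L) and word i+1 is
+ // 0xDC37 (H), then bit i of L and bit i+1 of H are set, so bit i of a
+ // is set; b = a << 1 marks the matching high surrogate, and both words
+ // end up covered in c.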
+ const uint16_t b = static_cast<uint16_t>(a << 1); // Just mark that the opposite fact holds;
+ // thanks to that we have only two masks for the valid case.
+ const uint16_t c = static_cast<uint16_t>(V | a | b); // Combine all the masks into the final one.
+
+ if (c == 0xffff) {
+ // The whole input register contains valid UTF-16, i.e.,
+ // either single words or proper surrogate pairs.
+ input += 16;
+ } else if (c == 0x7fff) {
+ // The 15 lower words of the input register contain valid UTF-16.
+ // The 15th word may be either a low or a high surrogate. In the next
+ // iteration we 1) check whether the low surrogate is followed by a high
+ // one, 2) reject a sole high surrogate.
+ input += 15;
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ return input;
+}
+/* end file src/westmere/sse_validate_utf16le.cpp */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf16_to_utf8.cpp
+/* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */
+/*
+ The vectorized algorithm works on a single SSE register, i.e., it
+ loads eight 16-bit words.
+
+ We consider three cases:
+ 1. an input register contains no surrogates and each value
+ is in the range 0x0000 .. 0x07ff.
+ 2. an input register contains no surrogates and each value
+ is in the range 0x0000 .. 0xffff.
+ 3. an input register contains surrogates --- i.e. code points
+ may take 16 or 32 bits.
+
+ Ad 1.
+
+ When values are less than 0x0800, it means that a 16-bit word
+ can be converted into: 1) a single UTF-8 byte (when it is an ASCII
+ char) or 2) two UTF-8 bytes.
+
+ For this case we do only some shuffling to obtain these 2-byte
+ codes and finally compress the whole SSE register with a single
+ shuffle.
+
+ We need a 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+ Ad 2.
+
+ When values fit in 16-bit words, but are above 0x07ff, then
+ a single word may produce one, two or three UTF-8 bytes.
+
+ We prepare data for all these three cases in two registers.
+ The first register contains the lower two UTF-8 bytes (used in all
+ cases), while the second one contains just the third byte for
+ the three-UTF-8-bytes case.
+
+ Finally these two registers are interleaved, forming an eight-element
+ array of 32-bit values. The array spans two SSE registers.
+ The bytes from the registers are compressed using two shuffles.
+
+ We need a 256-entry lookup table to get a compression pattern
+ and the number of output bytes in the compressed vector register.
+ Each entry occupies 17 bytes.
+
+
+ To summarize:
+ - We need two 256-entry tables that have 8704 bytes in total.
+*/
+
+/*
+ Returns a pair: the first unprocessed byte from buf and utf8_output.
+ A scalar routine should carry on the conversion of the tail.
+*/
+std::pair<const char16_t*, char*> sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
+
+ const char16_t* end = buf + len;
+
+ const __m128i v_0000 = _mm_setzero_si128();
+ const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+ const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+ const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
+ while (buf + 16 <= end) {
+ __m128i in = _mm_loadu_si128((__m128i*)buf);
+ // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+ const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
+ if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
+ __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
+ if(!_mm_testz_si128(nextin, v_ff80)) {
+ // 1. pack the bytes
+ // obviously suboptimal.
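+ // For example (illustrative): eight words 0x0048,0x0069,... ('H','i',...)
+ // saturate-pack to the bytes 48 69 ...; with (in,in) both halves hold the
+ // same 8 bytes, so we store 16 bytes but advance the output by only 8.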
+ const __m128i utf8_packed = _mm_packus_epi16(in,in);
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 8;
+ utf8_output += 8;
+ in = nextin;
+ } else {
+ // 1. pack the bytes
+ // obviously suboptimal.
+ const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
+ // 2. store (16 bytes)
+ _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+ // 3. adjust pointers
+ buf += 16;
+ utf8_output += 16;
+ continue; // we are done for this round!
+ }
+ }
+
+ // no bits set above the 7th bit
+ const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
+ const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+
+ // no bits set above the 11th bit
+ const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
+ const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+ if (one_or_two_bytes_bitmask == 0xffff) {
+ // 1. prepare 2-byte values
+ // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+ // expected output : [110a|aaaa|10bb|bbbb] x 8
+ const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
+ const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
+
+ // t0 = [000a|aaaa|bbbb|bb00]
+ const __m128i t0 = _mm_slli_epi16(in, 2);
+ // t1 = [000a|aaaa|0000|0000]
+ const __m128i t1 = _mm_and_si128(t0, v_1f00);
+ // t2 = [0000|0000|00bb|bbbb]
+ const __m128i t2 = _mm_and_si128(in, v_003f);
+ // t3 = [000a|aaaa|00bb|bbbb]
+ const __m128i t3 = _mm_or_si128(t1, t2);
+ // t4 = [110a|aaaa|10bb|bbbb]
+ const __m128i t4 = _mm_or_si128(t3, v_c080);
+
+ // 2. merge ASCII and 2-byte codewords
+ const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
+
+ // 3. prepare bitmask for 8-bit lookup
+ // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
+ const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
+ const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+ const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
+ // 4. pack the bytes
+ const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+ const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+ const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+ // 5. store bytes
+ _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+
+ // 6. adjust pointers
+ buf += 8;
+ utf8_output += row[0];
+ continue;
+
+ }
+
+ // 1. Check if there are any surrogate words in the input chunk.
+ // We also have to deal with the situation when there is a surrogate word
+ // at the end of a chunk.
+ const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+ // bitmask = 0x0000 if there are no surrogates
+ // = 0xc000 if the last word is a surrogate
+ const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+ // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+ // it is likely an uncommon occurrence.
+ if (surrogates_bitmask == 0x0000) {
+ // case: words from the register produce either 1, 2 or 3 UTF-8 bytes
+ const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+ 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+ /* In this branch we handle three cases:
+ 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UTF-8 byte
+ 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two UTF-8 bytes
+ 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes + + We expand the input word (16-bit) into two words (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two words we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define vec(x) _mm_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m128i t0 = _mm_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m128i t1 = _mm_and_si128(t0, vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m128i t2 = _mm_or_si128 (t1, vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m128i s0 = _mm_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m128i s1 = _mm_and_si128(s0, vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m128i s2 = _mm_maddubs_epi16(s1, vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m128i s3 = _mm_or_si128(s2, vec(0b1100000011100000)); + const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, vec(0b0100000000000000)); + const __m128i s4 = _mm_xor_si128(s3, m0); +#undef vec + + // 4. expand words 16-bit => 32-bit + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle + const uint16_t mask = (one_byte_bitmask & 0x5555) | + (one_or_two_bytes_bitmask & 0xaaaa); + if(mask == 0) { + // We only have three-byte words. Use fast path. + const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + } + const uint8_t mask0 = uint8_t(mask); + + const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1)); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + + const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1)); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); + + _mm_storeu_si128((__m128i*)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i*)utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
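+ // Worked example of the scalar math below (illustrative): U+1F600 is the
+ // pair (0xD83D, 0xDE00); diff = 0x3D, diff2 = 0x200, so
+ // value = (0x3D << 10) + 0x200 + 0x10000 = 0x1F600, emitted as the
+ // four UTF-8 bytes F0 9F 98 80.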
+ size_t forward = 15; + size_t k = 0; + if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);} + for(; k < forward; k++) { + uint16_t word = buf[k]; + if((word & 0xFF80)==0) { + *utf8_output++ = char(word); + } else if((word & 0xF800)==0) { + *utf8_output++ = char((word>>6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if((word &0xF800 ) != 0xD800) { + *utf8_output++ = char((word>>12) | 0b11100000); + *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = buf[k+1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if((diff | diff2) > 0x3FF) { return std::make_pair(nullptr, utf8_output); } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value>>18) | 0b11110000); + *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(buf, utf8_output); +} +/* end file src/westmere/sse_convert_utf16_to_utf8.cpp */ + +// UTF-16 => UTF-8 conversion + +} // unnamed namespace +} // namespace westmere +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace westmere { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with spaces +template +struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text_64(const uint8_t *text) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char * format_input_text(const simd8x64& in) { + static char *buf = reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i=0; i); i++) { + if (buf[i] < ' ') { buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char * format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i=0; i<64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 
'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdutf_really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +simdutf_really_inline size_t buf_block_reader::block_index() { return idx; } + +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdutf_really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdutf_really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_validation { + +using namespace simd; + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
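+ // Worked example (illustrative): for the overlong sequence C0 80,
+ // byte_1_high[0xC] has OVERLONG_2, byte_1_low[0x0] has OVERLONG_2 and
+ // byte_2_high[0x8] has OVERLONG_2, so the AND of the three lookups
+ // leaves the OVERLONG_2 bit set and the input is rejected.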
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + // + // Return nonzero if there are incomplete multibyte characters at the end of the block: + // e.g. if there is a 4-byte character, but it's 3 bytes from the end. + // + simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = { + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1 + }; + const simd8 max_value(&max_array[sizeof(max_array)-sizeof(simd8)]); + return input.gt_bits(max_value); + } + + struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is too short + // or a byte value too large in the last bytes: check_special_cases only checks for bytes + // too large in the first of two bytes. 
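+ // For instance (illustrative): if a block ends with the lone lead byte
+ // 0xE2, is_incomplete() sets prev_incomplete, and it is turned into an
+ // error here when no continuation bytes can follow.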
+ simdutf_really_inline void check_eof() {
+ // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+ // possibly finish them.
+ this->error |= this->prev_incomplete;
+ }
+
+ simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
+ if(simdutf_likely(is_ascii(input))) {
+ this->error |= this->prev_incomplete;
+ } else {
+ // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+ static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+ "We support either two or four chunks per 64-byte block.");
+ if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+ this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+ this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+ this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+ }
+ this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
+ this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
+
+ }
+ }
+ // do not forget to call check_eof!
+ simdutf_really_inline bool errors() const {
+ return this->error.any_bits_set_anywhere();
+ }
+
+ }; // struct utf8_checker
+} // namespace utf8_validation
+
+using utf8_validation::utf8_checker;
+
+} // unnamed namespace
+} // namespace westmere
+} // namespace simdutf
+/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
+/* begin file src/generic/utf8_validation/utf8_validator.h */
+namespace simdutf {
+namespace westmere {
+namespace {
+namespace utf8_validation {
+
+/**
+ * Validates that the string is actual UTF-8.
+ */
+template<class checker>
+bool generic_validate_utf8(const uint8_t * input, size_t length) {
+ checker c{};
+ buf_block_reader<64> reader(input, length);
+ while (reader.has_full_block()) {
+ simd::simd8x64<uint8_t> in(reader.full_block());
+ c.check_next_input(in);
+ reader.advance();
+ }
+ uint8_t block[64]{};
+ reader.get_remainder(block);
+ simd::simd8x64<uint8_t> in(block);
+ c.check_next_input(in);
+ reader.advance();
+ c.check_eof();
+ return !c.errors();
+}
+
+bool generic_validate_utf8(const char * input, size_t length) {
+ return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+}
+
+} // namespace utf8_validation
+} // unnamed namespace
+} // namespace westmere
+} // namespace simdutf
+/* end file src/generic/utf8_validation/utf8_validator.h */
+// transcoding from UTF-8 to UTF-16
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
+/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
+
+
+namespace simdutf {
+namespace westmere {
+namespace {
+namespace utf8_to_utf16 {
+
+using namespace simd;
+
+
+simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
+ char16_t* utf16_output) noexcept {
+ // The implementation is not specific to haswell and should be moved to the generic directory.
+ size_t pos = 0;
+ char16_t* start{utf16_output};
+ const size_t safety_margin = 16; // to avoid overruns!
+ while(pos + 64 + safety_margin <= size) {
+ // this loop could be unrolled further. For example, we could compute the
+ // masks for far more than 64 bytes at a time.
+ //
+ // For pure ASCII inputs, this function is not optimally fast because there are
+ // faster ways to just check for ASCII than to compute the continuation mask.
+ // However, the continuation mask is more informative. There might be a trade-off
+ // involved.
+ //
+ simd8x64<uint8_t> in(reinterpret_cast<const uint8_t *>(input + pos));
+ uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+ // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+ if(utf8_continuation_mask != 0) {
+ // Slow path. We hope that the compiler will recognize that this is a slow path.
+ // Anything that is not a continuation byte is a 'leading byte', that is, the
+ // start of a new code point.
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ // The *start* of code points is not so useful; rather, we want the *end* of code points.
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // The next loop is going to run at least five times when using solely
+ // the slow/regular path, and at least four times if there are fast paths.
+ while(pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ //
+ // Thus we may allow convert_masked_utf8_to_utf16 to process
+ // more bytes at a time under a fast-path mode where 16 bytes
+ // are consumed at once (e.g., when encountering ASCII).
+ size_t consumed = convert_masked_utf8_to_utf16(input + pos,
+ utf8_end_of_code_point_mask, utf16_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
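+ // (Concretely: max_starting_point is pos + 52, so at least 52 of the 64
+ // bytes are consumed before re-reading, i.e. 52/64 = 81% in the worst case.)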
+ } else { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid(input + pos, size - pos, utf16_output); + return utf16_output - start; +} + + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + + +namespace simdutf { +namespace westmere { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + + + simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { +// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) +// Bit 1 = Too Long (ASCII followed by continuation) +// Bit 2 = Overlong 3-byte +// Bit 4 = Surrogate +// Bit 5 = Overlong 2-byte +// Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1<<0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1<<1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1<<2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1<<4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1<<5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1<<7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1<<3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1<<6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1<<6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + ); + constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = (prev1 & 0x0F).lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, + CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000 + ); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT + ); + return (byte_1_high & byte_1_low & byte_2_high); + } + simdutf_really_inline simd8 check_multibyte_lengths(const simd8 input, + const simd8 prev_input, const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; + } + + + struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes + // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + + + simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) { + size_t pos = 0; + char16_t* start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while(pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if(input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it is not good enough. 
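+ // (With 16-byte SSE vectors, simd8x64<uint8_t> has NUM_CHUNKS == 4, so the
+ // four explicit calls below each validate one chunk against its predecessor.)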
+ static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+ "We support either two or four chunks per 64-byte block.");
+ auto zero = simd8<uint8_t>{uint8_t(0)};
+ if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+ this->check_utf8_bytes(input.chunks[0], zero);
+ this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+ this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+ this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+ }
+ uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+ uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+ uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
+ // We process in blocks of up to 12 bytes except possibly
+ // for fast paths which may process up to 16 bytes. For the
+ // slow path to work, we should have at least 12 input bytes left.
+ size_t max_starting_point = (pos + 64) - 12;
+ // The next loop is going to run at least five times.
+ while(pos < max_starting_point) {
+ // Performance note: our ability to compute 'consumed' and
+ // then shift and recompute is critical. If there is a
+ // latency of, say, 4 cycles on getting 'consumed', then
+ // the inner loop might have a total latency of about 6 cycles.
+ // Yet we process between 6 and 12 input bytes, thus we get
+ // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+ // for this section of the code. Hence, there is a limit
+ // to how much we can further increase this latency before
+ // it seriously harms performance.
+ size_t consumed = convert_masked_utf8_to_utf16(in + pos,
+ utf8_end_of_code_point_mask, utf16_output);
+ pos += consumed;
+ utf8_end_of_code_point_mask >>= consumed;
+ }
+ // At this point there may remain between 0 and 12 bytes in the
+ // 64-byte block. These bytes will be processed again. So we have an
+ // 80% efficiency (in the worst case). In practice we expect an
+ // 85% to 90% efficiency.
+ }
+ }
+ if(errors()) { return 0; }
+ if(pos < size) {
+ size_t howmany = scalar::utf8_to_utf16::convert(in + pos, size - pos, utf16_output);
+ if(howmany == 0) { return 0; }
+ utf16_output += howmany;
+ }
+ return utf16_output - start;
+ }
+
+ simdutf_really_inline bool errors() const {
+ return this->error.any_bits_set_anywhere();
+ }
+
+ }; // struct validating_transcoder
+} // utf8_to_utf16 namespace
+} // unnamed namespace
+} // namespace westmere
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
+// other functions
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf8.h
+/* begin file src/generic/utf8.h */
+
+namespace simdutf {
+namespace westmere {
+namespace {
+namespace utf8 {
+
+using namespace simd;
+
+simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
+ size_t pos = 0;
+ size_t count = 0;
+ for(;pos + 64 <= size; pos += 64) {
+ simd8x64<uint8_t> input(reinterpret_cast<const uint8_t *>(in + pos));
+ uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+ count += 64 - count_ones(utf8_continuation_mask);
+ }
+ return count + scalar::utf8::count_code_points(in + pos, size - pos);
+}
+
+
+simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
+ size_t pos = 0;
+ size_t count = 0;
+ // This algorithm could no doubt be improved!
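+ // For example (illustrative): for 'a' followed by U+1F600 (bytes
+ // 61 F0 9F 98 80), there are two non-continuation bytes, and F0 >= 240
+ // adds one more, giving 3 UTF-16 code units: 'a' plus a surrogate pair.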
+ for(;pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} +} // utf8 namespace +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf8.h */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=generic/utf16.h +/* begin file src/generic/utf16.h */ +#include +namespace simdutf { +namespace westmere { +namespace { +namespace utf16 { + +simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + scalar::utf16::count_code_points(in + pos, size - pos); +} +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for(;pos + 32 <= size; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, size - pos); +} +} // utf16 +} // unnamed namespace +} // namespace westmere +} // namespace simdutf +/* end file src/generic/utf16.h */ +// +// Implementation-specific overrides +// + +namespace simdutf { +namespace westmere { + +simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return westmere::utf8_validation::generic_validate_utf8(buf, len); +} + +simdutf_warn_unused bool implementation::validate_utf16(const char16_t *buf, size_t len) const noexcept { + const char16_t* tail = sse_validate_utf16le(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16(const char* buf, size_t len, char16_t* utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16(const char* input, size_t size, + char16_t* utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + std::pair ret = sse_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { return 0; } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert( + ret.first, len 
- (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept { + return convert_utf16_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::count_utf16(const char16_t * input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} + +} // namespace westmere +} // namespace simdutf + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h +/* begin file src/simdutf/westmere/end.h */ +SIMDUTF_UNTARGET_REGION +/* end file src/simdutf/westmere/end.h */ +/* end file src/westmere/implementation.cpp */ +#endif + +SIMDUTF_POP_DISABLE_WARNINGS +/* end file src/simdutf.cpp */ diff --git a/cbits/validate_utf8.cpp b/cbits/validate_utf8.cpp new file mode 100644 index 00000000..73ab5f25 --- /dev/null +++ b/cbits/validate_utf8.cpp @@ -0,0 +1,6 @@ +#include "simdutf.h" + +extern "C" +int _hs_text_is_valid_utf8(const char* str, size_t len){ + return simdutf::validate_utf8(str, len); +} diff --git a/include/simdutf.h b/include/simdutf.h new file mode 100644 index 00000000..82e999fb --- /dev/null +++ b/include/simdutf.h @@ -0,0 +1,1084 @@ +/* auto-generated on 2021-07-29 10:43:28 -0400. Do not edit! */ +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf.h +/* begin file include/simdutf.h */ +#ifndef SIMDUTF_H +#define SIMDUTF_H +#include +#include +#include +#include + +// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/compiler_check.h +/* begin file include/simdutf/compiler_check.h */ +#ifndef SIMDUTF_COMPILER_CHECK_H +#define SIMDUTF_COMPILER_CHECK_H + +#ifndef __cplusplus +#error simdutf requires a C++ compiler +#endif + +#ifndef SIMDUTF_CPLUSPLUS +#if defined(_MSVC_LANG) && !defined(__clang__) +#define SIMDUTF_CPLUSPLUS (_MSC_VER == 1900 ? 
201103L : _MSVC_LANG)
+#else
+#define SIMDUTF_CPLUSPLUS __cplusplus
+#endif
+#endif
+
+// C++ 17
+#if !defined(SIMDUTF_CPLUSPLUS17) && (SIMDUTF_CPLUSPLUS >= 201703L)
+#define SIMDUTF_CPLUSPLUS17 1
+#endif
+
+// C++ 14
+#if !defined(SIMDUTF_CPLUSPLUS14) && (SIMDUTF_CPLUSPLUS >= 201402L)
+#define SIMDUTF_CPLUSPLUS14 1
+#endif
+
+// C++ 11
+#if !defined(SIMDUTF_CPLUSPLUS11) && (SIMDUTF_CPLUSPLUS >= 201103L)
+#define SIMDUTF_CPLUSPLUS11 1
+#endif
+
+#ifndef SIMDUTF_CPLUSPLUS11
+#error simdutf requires a compiler compliant with the C++11 standard
+#endif
+
+#endif // SIMDUTF_COMPILER_CHECK_H
+/* end file include/simdutf/compiler_check.h */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/common_defs.h
+/* begin file include/simdutf/common_defs.h */
+#ifndef SIMDUTF_COMMON_DEFS_H
+#define SIMDUTF_COMMON_DEFS_H
+
+#include <cassert>
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/portability.h
+/* begin file include/simdutf/portability.h */
+#ifndef SIMDUTF_PORTABILITY_H
+#define SIMDUTF_PORTABILITY_H
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cfloat>
+#include <cassert>
+#ifndef _WIN32
+// strcasecmp, strncasecmp
+#include <strings.h>
+#endif
+
+#ifdef _MSC_VER
+#define SIMDUTF_VISUAL_STUDIO 1
+/**
+ * We want to differentiate carefully between
+ * clang under visual studio and regular visual
+ * studio.
+ *
+ * Under clang for Windows, we enable:
+ * * target pragmas so that part and only part of the
+ * code gets compiled for advanced instructions.
+ *
+ */
+#ifdef __clang__
+// clang under visual studio
+#define SIMDUTF_CLANG_VISUAL_STUDIO 1
+#else
+// just regular visual studio (best guess)
+#define SIMDUTF_REGULAR_VISUAL_STUDIO 1
+#endif // __clang__
+#endif // _MSC_VER
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+// https://en.wikipedia.org/wiki/C_alternative_tokens
+// This header should have no effect, except maybe
+// under Visual Studio.
+#include <iso646.h>
+#endif
+
+#if defined(__x86_64__) || defined(_M_AMD64)
+#define SIMDUTF_IS_X86_64 1
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#define SIMDUTF_IS_ARM64 1
+#elif defined(__PPC64__) || defined(_M_PPC64)
+//#define SIMDUTF_IS_PPC64 1
+#pragma message("The simdutf library does not yet support SIMD acceleration under\
+POWER processors. Please see https://github.com/lemire/simdutf/issues/51")
+#else
+// The simdutf library is designed
+// for 64-bit processors and it seems that you are not
+// compiling for a known 64-bit platform. Please
+// use a 64-bit target such as x64 or 64-bit ARM for best performance.
+#define SIMDUTF_IS_32BITS 1
+
+// We do not support 32-bit platforms, but it can be
+// handy to identify them.
+#if defined(_M_IX86) || defined(__i386__)
+#define SIMDUTF_IS_X86_32BITS 1
+#elif defined(__arm__) || defined(_M_ARM)
+#define SIMDUTF_IS_ARM_32BITS 1
+#elif defined(__PPC__) || defined(_M_PPC)
+#define SIMDUTF_IS_PPC_32BITS 1
+#endif
+
+#endif // defined(__x86_64__) || defined(_M_AMD64)
+
+#ifdef SIMDUTF_IS_32BITS
+#ifndef SIMDUTF_NO_PORTABILITY_WARNING
+#pragma message("The simdutf library is designed \
+for 64-bit processors and it seems that you are not \
+compiling for a known 64-bit platform. All fast kernels \
+will be disabled and performance may be poor. Please \
+use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
+#endif // SIMDUTF_NO_PORTABILITY_WARNING
+#endif // SIMDUTF_IS_32BITS
+
+// this is almost standard?
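+// (For example, if F is defined as BAR, STRINGIFY(F) yields "BAR" rather than
+// "F": the two-level definition below lets the argument expand first.)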
+#undef STRINGIFY_IMPLEMENTATION_ +#undef STRINGIFY +#define STRINGIFY_IMPLEMENTATION_(a) #a +#define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a) + +// Our fast kernels require 64-bit systems. +// +// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions. +// Furthermore, the number of SIMD registers is reduced. +// +// On 32-bit ARM, we would have smaller registers. +// +// The simdutf users should still have the fallback kernel. It is +// slower, but it should run everywhere. + +// +// Enable valid runtime implementations, and select SIMDUTF_BUILTIN_IMPLEMENTATION +// + +// We are going to use runtime dispatch. +#ifdef SIMDUTF_IS_X86_64 +#ifdef __clang__ +// clang does not have GCC push pop +// warning: clang attribute push can't be used within a namespace in clang up +// til 8.0 so SIMDUTF_TARGET_REGION and SIMDUTF_UNTARGET_REGION must be *outside* of a +// namespace. +#define SIMDUTF_TARGET_REGION(T) \ + _Pragma(STRINGIFY( \ + clang attribute push(__attribute__((target(T))), apply_to = function))) +#define SIMDUTF_UNTARGET_REGION _Pragma("clang attribute pop") +#elif defined(__GNUC__) +// GCC is easier +#define SIMDUTF_TARGET_REGION(T) \ + _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T))) +#define SIMDUTF_UNTARGET_REGION _Pragma("GCC pop_options") +#endif // clang then gcc + +#endif // x86 + +// Default target region macros don't do anything. +#ifndef SIMDUTF_TARGET_REGION +#define SIMDUTF_TARGET_REGION(T) +#define SIMDUTF_UNTARGET_REGION +#endif + +// Is threading enabled? +#if defined(_REENTRANT) || defined(_MT) +#ifndef SIMDUTF_THREADS_ENABLED +#define SIMDUTF_THREADS_ENABLED +#endif +#endif + +// workaround for large stack sizes under -O0. +// https://github.com/simdutf/simdutf/issues/691 +#ifdef __APPLE__ +#ifndef __OPTIMIZE__ +// Apple systems have small stack sizes in secondary threads. +// Lack of compiler optimization may generate high stack usage. +// Users may want to disable threads for safety, but only when +// in debug mode which we detect by the fact that the __OPTIMIZE__ +// macro is not defined. +#undef SIMDUTF_THREADS_ENABLED +#endif +#endif + + +#if defined(__clang__) +#define NO_SANITIZE_UNDEFINED __attribute__((no_sanitize("undefined"))) +#elif defined(__GNUC__) +#define NO_SANITIZE_UNDEFINED __attribute__((no_sanitize_undefined)) +#else +#define NO_SANITIZE_UNDEFINED +#endif + +#ifdef SIMDUTF_VISUAL_STUDIO +// This is one case where we do not distinguish between +// regular visual studio and clang under visual studio. +// clang under Windows has _stricmp (like visual studio) but not strcasecmp (as clang normally has) +#define simdutf_strcasecmp _stricmp +#define simdutf_strncasecmp _strnicmp +#else +// The strcasecmp, strncasecmp, and strcasestr functions do not work with multibyte strings (e.g. UTF-8). +// So they are only useful for ASCII in our context. 
+// https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings +#define simdutf_strcasecmp strcasecmp +#define simdutf_strncasecmp strncasecmp +#endif + +#ifdef NDEBUG + +#ifdef SIMDUTF_VISUAL_STUDIO +#define SIMDUTF_UNREACHABLE() __assume(0) +#define SIMDUTF_ASSUME(COND) __assume(COND) +#else +#define SIMDUTF_UNREACHABLE() __builtin_unreachable(); +#define SIMDUTF_ASSUME(COND) do { if (!(COND)) __builtin_unreachable(); } while (0) +#endif + +#else // NDEBUG + +#define SIMDUTF_UNREACHABLE() assert(0); +#define SIMDUTF_ASSUME(COND) assert(COND) + +#endif + +#endif // SIMDUTF_PORTABILITY_H +/* end file include/simdutf/portability.h */ + + +#if defined(__GNUC__) + // Marks a block with a name so that MCA analysis can see it. + #define SIMDUTF_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name); + #define SIMDUTF_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name); + #define SIMDUTF_DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name); +#else + #define SIMDUTF_BEGIN_DEBUG_BLOCK(name) + #define SIMDUTF_END_DEBUG_BLOCK(name) + #define SIMDUTF_DEBUG_BLOCK(name, block) +#endif + +// Align to N-byte boundary +#define SIMDUTF_ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1)) +#define SIMDUTF_ROUNDDOWN_N(a, n) ((a) & ~((n)-1)) + +#define SIMDUTF_ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0) + +#if defined(SIMDUTF_REGULAR_VISUAL_STUDIO) + + #define simdutf_really_inline __forceinline + #define simdutf_never_inline __declspec(noinline) + + #define simdutf_unused + #define simdutf_warn_unused + + #ifndef simdutf_likely + #define simdutf_likely(x) x + #endif + #ifndef simdutf_unlikely + #define simdutf_unlikely(x) x + #endif + + #define SIMDUTF_PUSH_DISABLE_WARNINGS __pragma(warning( push )) + #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS __pragma(warning( push, 0 )) + #define SIMDUTF_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning( disable : WARNING_NUMBER )) + // Get rid of Intellisense-only warnings (Code Analysis) + // Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910). 
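+ // (Sketch of the effect: when the Code Analysis headers are available, the
+ // include below suppresses the whole CppCoreCheck warning set at once via
+ // SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS).)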
+ #ifdef __has_include
+ #if __has_include(<CppCoreCheck\Warnings.h>)
+ #include <CppCoreCheck\Warnings.h>
+ #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS)
+ #endif
+ #endif
+
+ #ifndef SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+ #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+ #endif
+
+ #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_VS_WARNING(4996)
+ #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING
+ #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning( pop ))
+
+#else // SIMDUTF_REGULAR_VISUAL_STUDIO
+
+ #define simdutf_really_inline inline __attribute__((always_inline))
+ #define simdutf_never_inline inline __attribute__((noinline))
+
+ #define simdutf_unused __attribute__((unused))
+ #define simdutf_warn_unused __attribute__((warn_unused_result))
+
+ #ifndef simdutf_likely
+ #define simdutf_likely(x) __builtin_expect(!!(x), 1)
+ #endif
+ #ifndef simdutf_unlikely
+ #define simdutf_unlikely(x) __builtin_expect(!!(x), 0)
+ #endif
+
+ #define SIMDUTF_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push")
+ // gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary
+ #define SIMDUTF_PUSH_DISABLE_ALL_WARNINGS SIMDUTF_PUSH_DISABLE_WARNINGS \
+ SIMDUTF_DISABLE_GCC_WARNING(-Weffc++) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wall) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wconversion) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wextra) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wattributes) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wreturn-type) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wshadow) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wunused-parameter) \
+ SIMDUTF_DISABLE_GCC_WARNING(-Wunused-variable)
+ #define SIMDUTF_PRAGMA(P) _Pragma(#P)
+ #define SIMDUTF_DISABLE_GCC_WARNING(WARNING) SIMDUTF_PRAGMA(GCC diagnostic ignored #WARNING)
+ #if defined(SIMDUTF_CLANG_VISUAL_STUDIO)
+ #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS SIMDUTF_DISABLE_GCC_WARNING(-Wmicrosoft-include)
+ #else
+ #define SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+ #endif
+ #define SIMDUTF_DISABLE_DEPRECATED_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wdeprecated-declarations)
+ #define SIMDUTF_DISABLE_STRICT_OVERFLOW_WARNING SIMDUTF_DISABLE_GCC_WARNING(-Wstrict-overflow)
+ #define SIMDUTF_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop")
+
+
+
+#endif // MSC_VER
+
+#if defined(SIMDUTF_VISUAL_STUDIO)
+ /**
+ * It does not matter here whether you are using
+ * the regular visual studio or clang under visual
+ * studio.
+ */
+ #if SIMDUTF_USING_LIBRARY
+ #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
+ #else
+ #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
+ #endif
+#else
+ #define SIMDUTF_DLLIMPORTEXPORT
+#endif
+
+/// If EXPR is an error, returns it.
+#define SIMDUTF_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
+
+
+#endif // SIMDUTF_COMMON_DEFS_H
+/* end file include/simdutf/common_defs.h */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/encoding_types.h
+/* begin file include/simdutf/encoding_types.h */
+#include <string>
+
+namespace simdutf {
+
+enum encoding_type {
+ UTF16_LE, // BOM 0xff 0xfe
+ UTF16_BE, // BOM 0xfe 0xff
+ UTF32_LE, // BOM 0xff 0xfe 0x00 0x00
+ UTF32_BE, // BOM 0x00 0x00 0xfe 0xff
+ UTF8, // BOM 0xef 0xbb 0xbf
+ unspecified
+};
+
+std::string to_string(encoding_type bom);
+
+// Note that BOM for UTF8 is discouraged.
+namespace BOM {
+
+/**
+ * Checks for a BOM. 
If no BOM is found, returns unspecified.
+ * @param input the string to process
+ * @param length the length of the string in words
+ * @return the corresponding encoding
+ */
+encoding_type check_bom(const uint8_t* byte, size_t length);
+encoding_type check_bom(const char* byte, size_t length);
+/**
+ * Returns the size, in bytes, of the BOM for a given encoding type.
+ * Note that UTF-8 BOMs are discouraged.
+ * @param bom the encoding type
+ * @return the size in bytes of the corresponding BOM
+ */
+size_t bom_byte_size(encoding_type bom);
+
+} // BOM namespace
+} // simdutf namespace
+/* end file include/simdutf/encoding_types.h */
+
+SIMDUTF_PUSH_DISABLE_WARNINGS
+SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+
+// Public API
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/simdutf_version.h
+/* begin file include/simdutf/simdutf_version.h */
+// /include/simdutf/simdutf_version.h automatically generated by release.py,
+// do not change by hand
+#ifndef SIMDUTF_SIMDUTF_VERSION_H
+#define SIMDUTF_SIMDUTF_VERSION_H
+
+/** The version of simdutf being used (major.minor.revision) */
+#define SIMDUTF_VERSION 0.1.0
+
+namespace simdutf {
+enum {
+ /**
+ * The major version (MAJOR.minor.revision) of simdutf being used.
+ */
+ SIMDUTF_VERSION_MAJOR = 0,
+ /**
+ * The minor version (major.MINOR.revision) of simdutf being used.
+ */
+ SIMDUTF_VERSION_MINOR = 1,
+ /**
+ * The revision (major.minor.REVISION) of simdutf being used.
+ */
+ SIMDUTF_VERSION_REVISION = 0
+};
+} // namespace simdutf
+
+#endif // SIMDUTF_SIMDUTF_VERSION_H
+/* end file include/simdutf/simdutf_version.h */
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/implementation.h
+/* begin file include/simdutf/implementation.h */
+#ifndef SIMDUTF_IMPLEMENTATION_H
+#define SIMDUTF_IMPLEMENTATION_H
+#include <string>
+#include <atomic>
+#include <vector>
+// dofile: invoked with prepath=/Users/lemire/CVS/github/simdutf/include, filename=simdutf/internal/isadetection.h
+/* begin file include/simdutf/internal/isadetection.h */
+/* From
+https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
+Highly modified.
+
+Copyright (c) 2016- Facebook, Inc (Adam Paszke)
+Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
+Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute
+(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
+Samy Bengio, Johnny Mariethoz)
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+3. 
Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
+America and IDIAP Research Institute nor the names of its contributors may be
+ used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef SIMDutf_INTERNAL_ISADETECTION_H
+#define SIMDutf_INTERNAL_ISADETECTION_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#if defined(_MSC_VER)
+#include <intrin.h>
+#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
+#include <cpuid.h>
+#endif
+
+namespace simdutf {
+namespace internal {
+
+
+enum instruction_set {
+ DEFAULT = 0x0,
+ NEON = 0x1,
+ AVX2 = 0x4,
+ SSE42 = 0x8,
+ PCLMULQDQ = 0x10,
+ BMI1 = 0x20,
+ BMI2 = 0x40,
+ ALTIVEC = 0x80
+};
+
+#if defined(__PPC64__)
+
+static inline uint32_t detect_supported_architectures() {
+ return instruction_set::ALTIVEC;
+}
+
+#elif defined(__arm__) || defined(__aarch64__) // incl. armel, armhf, arm64
+
+#if defined(__ARM_NEON)
+
+static inline uint32_t detect_supported_architectures() {
+ return instruction_set::NEON;
+}
+
+#else // ARM without NEON
+
+static inline uint32_t detect_supported_architectures() {
+ return instruction_set::DEFAULT;
+}
+
+#endif
+
+#elif defined(__x86_64__) || defined(_M_AMD64) // x64
+
+
+namespace {
+// Can be found on Intel ISA Reference for CPUID
+constexpr uint32_t cpuid_avx2_bit = 1 << 5; ///< @private Bit 5 of EBX for EAX=0x7
+constexpr uint32_t cpuid_bmi1_bit = 1 << 3; ///< @private bit 3 of EBX for EAX=0x7
+constexpr uint32_t cpuid_bmi2_bit = 1 << 8; ///< @private bit 8 of EBX for EAX=0x7
+constexpr uint32_t cpuid_sse42_bit = 1 << 20; ///< @private bit 20 of ECX for EAX=0x1
+constexpr uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit 1 of ECX for EAX=0x1
+}
+
+
+
+static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
+ uint32_t *edx) {
+#if defined(_MSC_VER)
+ int cpu_info[4];
+ __cpuid(cpu_info, *eax);
+ *eax = cpu_info[0];
+ *ebx = cpu_info[1];
+ *ecx = cpu_info[2];
+ *edx = cpu_info[3];
+#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
+ uint32_t level = *eax;
+ __get_cpuid(level, eax, ebx, ecx, edx);
+#else
+ uint32_t a = *eax, b, c = *ecx, d;
+ asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
+ *eax = a;
+ *ebx = b;
+ *ecx = c;
+ *edx = d;
+#endif
+}
+
+static inline uint32_t detect_supported_architectures() {
+ uint32_t eax, ebx, ecx, edx;
+ uint32_t host_isa = 0x0;
+
+ // EBX for EAX=0x7 (ECX=0 selects sub-leaf 0)
+ eax = 0x7;
+ ecx = 0x0;
+ cpuid(&eax, &ebx, &ecx, &edx);
+ if (ebx & cpuid_avx2_bit) {
+ host_isa |= instruction_set::AVX2;
+ }
+ if (ebx & cpuid_bmi1_bit) {
+ host_isa |= instruction_set::BMI1;
+ }
+
+ if (ebx & cpuid_bmi2_bit) {
+ host_isa |= instruction_set::BMI2;
+ }
+
+ // ECX for EAX=0x1
+ eax = 0x1;
+ cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (ecx & cpuid_sse42_bit) { 
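+ // bit 20 of ECX for leaf EAX=0x1 is the SSE4.2 feature flag (cpuid_sse42_bit)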
+ host_isa |= instruction_set::SSE42;
+ }
+
+ if (ecx & cpuid_pclmulqdq_bit) {
+ host_isa |= instruction_set::PCLMULQDQ;
+ }
+
+ return host_isa;
+}
+#else // fallback
+
+
+static inline uint32_t detect_supported_architectures() {
+ return instruction_set::DEFAULT;
+}
+
+
+#endif // end SIMD extension detection code
+
+} // namespace internal
+} // namespace simdutf
+
+#endif // SIMDutf_INTERNAL_ISADETECTION_H
+/* end file include/simdutf/internal/isadetection.h */
+
+
+namespace simdutf {
+
+/**
+ * Autodetect the encoding of the input.
+ *
+ * @param input the string to analyze.
+ * @param length the length of the string in bytes.
+ * @return the detected encoding type
+ */
+simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * input, size_t length) noexcept;
+simdutf_really_inline simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const uint8_t * input, size_t length) noexcept {
+ return autodetect_encoding(reinterpret_cast<const char *>(input), length);
+}
+
+
+/**
+ * Validate the UTF-8 string.
+ *
+ * Overridden by each implementation.
+ *
+ * @param buf the UTF-8 string to validate.
+ * @param len the length of the string in bytes.
+ * @return true if and only if the string is valid UTF-8.
+ */
+simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
+
+/**
+ * Validate the UTF-16LE string.
+ *
+ * Overridden by each implementation.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param buf the UTF-16LE string to validate.
+ * @param len the length of the string in number of 2-byte words (char16_t).
+ * @return true if and only if the string is valid UTF-16LE.
+ */
+simdutf_warn_unused bool validate_utf16(const char16_t *buf, size_t len) noexcept;
+
+/**
+ * Convert possibly broken UTF-8 string into UTF-16LE string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t; 0 if the input was not valid UTF-8 string
+ */
+simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
+
+/**
+ * Convert valid UTF-8 string into UTF-16LE string.
+ *
+ * This function assumes that the input string is valid UTF-8.
+ *
+ * @param input the UTF-8 string to convert
+ * @param length the length of the string in bytes
+ * @param utf16_buffer the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t
+ */
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept;
+
+/**
+ * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format.
+ *
+ * This function does not validate the input.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input the UTF-8 string to process
+ * @param length the length of the string in bytes
+ * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE
+ */
+simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept;
+
+/**
+ * Convert possibly broken UTF-16LE string into UTF-8 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources. 
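+ *
+ * A minimal calling sketch (sizing the output with utf8_length_from_utf16,
+ * declared below, is this example's choice, not a requirement; data and len
+ * are hypothetical):
+ *
+ *   std::vector<char> out(simdutf::utf8_length_from_utf16(data, len));
+ *   size_t written = simdutf::convert_utf16_to_utf8(data, len, out.data());
+ *   // written == 0 signals that data was not valid UTF-16LE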
+ * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16LE string + */ +simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + +/** + * Convert valid UTF-16LE string into UTF-8 string. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept; + +/** + * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format. + * + * This function does not validate the input. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16LE string as UTF-8 + */ +simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept; + +/** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to process + * @param length the length of the string in 2-byte words (char16_t) + * @return number of code points + */ +simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept; + +/** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to process + * @param length the length of the string in bytes + * @return number of code points + */ +simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept; + +/** + * An implementation of simdutf for a particular CPU architecture. + * + * Also used to maintain the currently active implementation. The active implementation is + * automatically initialized on first use to the most advanced implementation supported by the host. + */ +class implementation { +public: + + /** + * The name of this implementation. + * + * const implementation *impl = simdutf::active_implementation; + * cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * + * @return the name of the implementation, e.g. "haswell", "westmere", "arm64" + */ + virtual const std::string &name() const { return _name; } + + /** + * The description of this implementation. + * + * const implementation *impl = simdutf::active_implementation; + * cout << "simdutf is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * + * @return the name of the implementation, e.g. "haswell", "westmere", "arm64" + */ + virtual const std::string &description() const { return _description; } + + /** + * The instruction sets this implementation is compiled against + * and the current CPU match. 
This function may poll the current CPU/system + * and should therefore not be called too often if performance is a concern. + * + * + * @return true if the implementation can be safely used on the current system (determined at runtime) + */ + bool supported_by_runtime_system() const; + + /** + * This function will try to detect the encoding + * @param input the string to identify + * @param length the length of the string in bytes. + * @return the encoding type detected + */ + virtual encoding_type autodetect_encoding(const char * input, size_t length) const noexcept; + + /** + * @private For internal implementation use + * + * The instruction sets this implementation is compiled against. + * + * @return a mask of all required `internal::instruction_set::` values + */ + virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; }; + + + /** + * Validate the UTF-8 string. + * + * Overridden by each implementation. + * + * @param buf the UTF-8 string to validate. + * @param len the length of the string in bytes. + * @return true if and only if the string is valid UTF-8. + */ + simdutf_warn_unused virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0; + + /** + * Validate the UTF-16LE string. + * + * Overridden by each implementation. + * + * This function is not BOM-aware. + * + * @param buf the UTF-16LE string to validate. + * @param len the length of the string in number of 2-byte words (char16_t). + * @return true if and only if the string is valid UTF-16LE. + */ + simdutf_warn_unused virtual bool validate_utf16(const char16_t *buf, size_t len) const noexcept = 0; + + /** + * Convert possibly broken UTF-8 string into UTF-16LE string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if the input was not valid UTF-8 string + */ + simdutf_warn_unused virtual size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf8_output) const noexcept = 0; + + /** + * Convert valid UTF-8 string into UTF-16LE string. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t + */ + simdutf_warn_unused virtual size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) const noexcept = 0; + + /** + * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format. + * + * This function does not validate the input. + * + * @param input the UTF-8 string to process + * @param length the length of the string in bytes + * @return the number of char16_t words required to encode the UTF-8 string as UTF-16LE + */ + simdutf_warn_unused virtual size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept = 0; + + /** + * Convert possibly broken UTF-16LE string into UTF-8 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * This function is not BOM-aware. 
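+ *
+ * For example (a sketch; in, len and out are hypothetical buffers), this is
+ * typically reached through the runtime-selected implementation:
+ *
+ *   size_t n = simdutf::active_implementation->convert_utf16_to_utf8(in, len, out);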
+ * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold conversion result + * @return number of written words; 0 if input is not a valid UTF-16LE string + */ + simdutf_warn_unused virtual size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + + /** + * Convert valid UTF-16LE string into UTF-8 string. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @param utf8_buffer the pointer to buffer that can hold the conversion result + * @return number of written words; 0 if conversion is not possible + */ + simdutf_warn_unused virtual size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) const noexcept = 0; + + /** + * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format. + * + * This function does not validate the input. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to convert + * @param length the length of the string in 2-byte words (char16_t) + * @return the number of bytes required to encode the UTF-16LE string as UTF-8 + */ + simdutf_warn_unused virtual size_t utf8_length_from_utf16(const char16_t * input, size_t length) const noexcept = 0; + + /** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-16LE. + * + * This function is not BOM-aware. + * + * @param input the UTF-16LE string to process + * @param length the length of the string in 2-byte words (char16_t) + * @return number of code points + */ + simdutf_warn_unused virtual size_t count_utf16(const char16_t * input, size_t length) const noexcept = 0; + + /** + * Count the number of code points (characters) in the string assuming that + * it is valid. + * + * This function assumes that the input string is valid UTF-8. + * + * @param input the UTF-8 string to process + * @param length the length of the string in bytes + * @return number of code points + */ + simdutf_warn_unused virtual size_t count_utf8(const char * input, size_t length) const noexcept = 0; + + + +protected: + /** @private Construct an implementation with the given name and description. For subclasses. */ + simdutf_really_inline implementation( + std::string name, + std::string description, + uint32_t required_instruction_sets + ) : + _name(name), + _description(description), + _required_instruction_sets(required_instruction_sets) + { + } + virtual ~implementation()=default; + +private: + /** + * The name of this implementation. + */ + const std::string _name; + + /** + * The description of this implementation. + */ + const std::string _description; + + /** + * Instruction sets required for this implementation. + */ + const uint32_t _required_instruction_sets; +}; + +/** @private */ +namespace internal { + +/** + * The list of available implementations compiled into simdutf. 
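+ *
+ * For example (sketch):
+ *
+ *   for (const simdutf::implementation *impl : simdutf::available_implementations) {
+ *     printf("%s: %s\n", impl->name().c_str(), impl->description().c_str());
+ *   }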
+ */
+class available_implementation_list {
+public:
+ /** Get the list of available implementations compiled into simdutf */
+ simdutf_really_inline available_implementation_list() {}
+ /** Number of implementations */
+ size_t size() const noexcept;
+ /** STL const begin() iterator */
+ const implementation * const *begin() const noexcept;
+ /** STL const end() iterator */
+ const implementation * const *end() const noexcept;
+
+ /**
+ * Get the implementation with the given name.
+ *
+ * Case sensitive.
+ *
+ * const implementation *impl = simdutf::available_implementations["westmere"];
+ * if (!impl) { exit(1); }
+ * if (!impl->supported_by_runtime_system()) { exit(1); }
+ * simdutf::active_implementation = impl;
+ *
+ * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
+ * @return the implementation, or nullptr if the parse failed.
+ */
+ const implementation * operator[](const std::string &name) const noexcept {
+ for (const implementation * impl : *this) {
+ if (impl->name() == name) { return impl; }
+ }
+ return nullptr;
+ }
+
+ /**
+ * Detect the most advanced implementation supported by the current host.
+ *
+ * This is used to initialize the implementation on startup.
+ *
+ * const implementation *impl = simdutf::available_implementations.detect_best_supported();
+ * simdutf::active_implementation = impl;
+ *
+ * @return the most advanced supported implementation for the current host, or an
+ * implementation that returns UNSUPPORTED_ARCHITECTURE if there is no supported
+ * implementation. Will never return nullptr.
+ */
+ const implementation *detect_best_supported() const noexcept;
+};
+
+template <typename T>
+class atomic_ptr {
+public:
+ atomic_ptr(T *_ptr) : ptr{_ptr} {}
+
+ operator const T*() const { return ptr.load(); }
+ const T& operator*() const { return *ptr; }
+ const T* operator->() const { return ptr.load(); }
+
+ operator T*() { return ptr.load(); }
+ T& operator*() { return *ptr; }
+ T* operator->() { return ptr.load(); }
+ atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; }
+
+private:
+ std::atomic<T*> ptr;
+};
+
+} // namespace internal
+
+/**
+ * The list of available implementations compiled into simdutf.
+ */
+extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list available_implementations;
+
+/**
+ * The active implementation.
+ *
+ * Automatically initialized on first use to the most advanced implementation supported by this hardware.
+ */
+extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> active_implementation;
+
+} // namespace simdutf
+
+#endif // SIMDUTF_IMPLEMENTATION_H
+/* end file include/simdutf/implementation.h */
+
+
+// Implementation-internal files (must be included before the implementations themselves, to keep
+// amalgamation working--otherwise, the first time a file is included, it might be put inside the
+// #ifdef SIMDUTF_IMPLEMENTATION_ARM64/FALLBACK/etc., which means the other implementations can't
+// compile unless that implementation is turned on).
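+// A minimal usage sketch of the public API declared above; this is the shape
+// of wrapper the accompanying cbits/validate_utf8.cpp presumably provides (the
+// function name below is hypothetical; the real export, per the Haskell side,
+// is _hs_text_is_valid_utf8):
+//
+//   #include "simdutf.h"
+//   extern "C" int example_is_valid_utf8(const char *p, size_t n) {
+//     return simdutf::validate_utf8(p, n) ? 1 : 0;
+//   }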
+ + +SIMDUTF_POP_DISABLE_WARNINGS + +#endif // SIMDUTF_H +/* end file include/simdutf.h */ diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 71eaacb6..a40d6ec1 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -78,7 +78,7 @@ import Data.Text.Internal.Unsafe.Char (unsafeWrite) import Data.Text.Show as T (singleton) import Data.Text.Unsafe (unsafeDupablePerformIO) import Data.Word (Word8) -import Foreign.C.Types (CSize(..)) +import Foreign.C.Types (CSize(..), CInt(..)) import Foreign.Ptr (Ptr, minusPtr, plusPtr) import Foreign.Storable (poke, peekByteOff) import GHC.Exts (byteArrayContents#, unsafeCoerce#) @@ -154,6 +154,10 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii :: Ptr Word8 -> Ptr Word8 -> IO CSize +isValidBS :: ByteString -> Bool +isValidBS bs = withBS bs $ \fp len -> unsafeDupablePerformIO $ + unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 ptr (fromIntegral len) + -- | Decode a 'ByteString' containing UTF-8 encoded text. -- -- Surrogate code points in replacement character returned by 'OnDecodeError' @@ -164,6 +168,9 @@ decodeUtf8With :: #endif OnDecodeError -> ByteString -> Text decodeUtf8With onErr bs + | isValidBS bs = + let !(SBS.SBS arr) = SBS.toShort bs in + (Text (A.ByteArray arr) 0 (B.length bs)) | B.null undecoded = txt | otherwise = txt `append` (case onErr desc (Just (B.head undecoded)) of Nothing -> txt' @@ -190,6 +197,21 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do | i < len1 = B.index bs1 i | otherwise = B.index bs2 (i - len1) + -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 + guessUtf8Boundary :: Int + guessUtf8Boundary + | len2 >= 1 && w0 < 0x80 = len2 -- last char is ASCII + | len2 >= 1 && w0 >= 0xC0 = len2 - 1 -- last char starts a code point + | len2 >= 2 && w1 >= 0xC0 = len2 - 2 -- pre-last char starts a code point + | len2 >= 3 && w2 >= 0xC0 = len2 - 3 + | len2 >= 4 && w3 >= 0xC0 = len2 - 4 + | otherwise = 0 + where + w0 = B.index bs2 (len2 - 1) + w1 = B.index bs2 (len2 - 2) + w2 = B.index bs2 (len2 - 3) + w3 = B.index bs2 (len2 - 4) + decodeFrom :: Int -> DecoderResult decodeFrom off = step (off + 1) (utf8DecodeStart (index off)) where @@ -205,10 +227,21 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do A.shrinkM dst dstOff arr <- A.unsafeFreeze dst return (Text arr 0 dstOff, mempty) + + | srcOff >= len1 + , srcOff < len1 + guessUtf8Boundary + , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen + , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) + , isValidBS bs = do + withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> + unsafeSTToIO $ A.copyP dst dstOff src (len1 + guessUtf8Boundary - srcOff) + inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) + | dstOff + 4 > dstLen = do let dstLen' = dstLen + 4 dst' <- A.resizeM dst dstLen' outer dst' dstLen' srcOff dstOff + | otherwise = case decodeFrom srcOff of Accept c -> do d <- unsafeWrite dst dstOff c @@ -508,3 +541,6 @@ encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt)) cSizeToInt :: CSize -> Int cSizeToInt = fromIntegral + +foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 + :: Ptr Word8 -> CSize -> IO CInt diff --git a/text.cabal b/text.cabal index e6992850..ceb67819 100644 --- a/text.cabal +++ b/text.cabal @@ -1,4 +1,4 @@ -cabal-version: >= 1.10 +cabal-version: 2.2 name: 
text version: 1.2.5.0 @@ -38,7 +38,7 @@ description: based on the well-respected and liberally licensed [ICU library](http://site.icu-project.org/). -license: BSD2 +license: BSD-2-Clause license-file: LICENSE author: Bryan O'Sullivan maintainer: Haskell Text Team , Core Libraries Committee @@ -54,6 +54,7 @@ extra-source-files: README.markdown changelog.md scripts/*.hs + include/*.h tests/literal-rule-test.sh tests/LiteralRuleTest.hs @@ -67,7 +68,22 @@ library cbits/measure_off.c cbits/reverse.c cbits/utils.c + cxx-sources: cbits/simdutf.cpp + cbits/validate_utf8.cpp + include-dirs: include hs-source-dirs: src + cxx-options: -std=c++17 + + if os(windows) + if arch(x86_64) + extra-libraries: stdc++-6 gcc_s_seh-1 + else + extra-libraries: stdc++-6 gcc_s_dw2-1 + else + if os(darwin) + extra-libraries: c++ + else + extra-libraries: stdc++ exposed-modules: Data.Text